# Shared imports for the scripts collected below. These were stripped during
# extraction; PorterStemmer is assumed to come from NLTK and Connection from
# pymongo, matching how they are used in the code.
import copy
import json
import multiprocessing
import os
from datetime import datetime

import pymongo
from pymongo import Connection
from nltk.stem.porter import PorterStemmer

import projizz


def main(inputPtnPath, outputPath, pspath, inputPath):
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model", "./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp": [], "tn": [], "fp": [], "fn": []})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath)))
            t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]

    print "start write out to %s" % (outputPath)
    json.dump(properties, open(outputPath, "w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath, outputPath, pspath, inputPath):
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "tn": [], "fp": [], "fn": []})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath)))
            t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]

    print "start write out to %s" % (outputPath)
    json.dump(properties, open(outputPath, "w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath, outputPath, pspath, inputPath, confidence, outputFilename, nbcPath):
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model", "./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": [], "et1": [], "et2": [], "et3": []})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()
    cpuCount = multiprocessing.cpu_count()
    if cpuCount > 8:
        cpuCount = 8
    pool = multiprocessing.Pool(processes=cpuCount)
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            #result.append(filterFunction(t, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath, confidence, classifiers))
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath, confidence, nbcPath)))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]
                expResult[keyname][m]["et1"] += r[keyname][m]["et1"]
                expResult[keyname][m]["et2"] += r[keyname][m]["et2"]
                expResult[keyname][m]["et3"] += r[keyname][m]["et3"]

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for keyname in expResult:
        p = expResult[keyname]
        if not os.path.isdir(os.path.join(outputPath, keyname)):
            os.mkdir(os.path.join(outputPath, keyname))
        projizz.jsonWrite(p, os.path.join(outputPath, keyname, outputFilename))
        print "start write out to %s" % (os.path.join(outputPath, keyname))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath, outputFilename):
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": []})
    domainRange = projizz.getYagoRelationDomainRange()
    idf, docs, lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(psfile))
    vsmData = (idf, docs, lens)
    projizz.checkPath(outputPath)

    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(mapper, (t, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData)))
            #result.append(mapper(t, filename, inputPath, inputPtnPath, table, partAns, domainRange, confidence, vsmData))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]

    for keyname in expResult:
        p = expResult[keyname]
        keydirName = "vsm-%d" % (keyname)
        projizz.checkPath(os.path.join(outputPath, keydirName))
        projizz.jsonWrite(p, os.path.join(outputPath, keydirName, outputFilename))
        print "start write out to %s" % (os.path.join(outputPath, keydirName))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def trainModel(jobid, relation, inputpath, outputPath):
    print "%d -> train %s" % (jobid, relation)

    instances = []
    filepath = os.path.join(inputpath, "%s.pos" % (relation))
    if os.path.exists(filepath):
        posInstances = projizz.jsonRead(filepath)
        for data in posInstances:
            instances.append((data["text"], data["label"]))

    filepath = os.path.join(inputpath, "%s.neg" % (relation))
    if os.path.exists(filepath):
        negInstances = projizz.jsonRead(filepath)
        for data in negInstances:
            instances.append((data["text"], data["label"]))

    if len(instances) == 0:
        print "Cannot build %s.nbc because there is no training data." % (relation)
        return  # without this, an empty classifier would still be built and saved

    classifier = projizz.NaiveBayesClassifier(instances)
    classifier.save(os.path.join(outputPath, "%s.nbc" % (relation)))
    print "%d -> Write to %s %s.nbc" % (jobid, outputPath, relation)
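# trainModel is invoked once per relation. A minimal driver sketch for
# spawning those jobs in parallel, mirroring the pool pattern used by the
# main() functions in this repo. It assumes (hypothetically) that the
# relation names can be taken from the keys of projizz.buildYagoProperties.
def trainAllModels(inputpath, outputPath):
    relations = projizz.buildYagoProperties([]).keys()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    jobid = 0
    for relation in relations:
        pool.apply_async(trainModel, (jobid, relation, inputpath, outputPath))
        jobid += 1
    pool.close()
    pool.join()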
def main(inputPtnPath, outputPath, pspath):
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model", "./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp": [], "tn": [], "fp": [], "fn": []})
    sp = projizz.getSortedStatistic(projizz.jsonRead(pspath))

    # For each relation, keep the top 200 patterns that map to exactly one relation
    validate = []
    for relation in sp:
        count = 0
        for ptnId, ptnS in sp[relation]:
            ptnData = table[ptnId]
            if len(ptnData["relations"]) == 1:
                count += 1
                validate.append(ptnId)
            if count >= 200:
                break

    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, partAns, validate)))
            t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]

    print "start write out to %s" % (outputPath)
    json.dump(properties, open(outputPath, "w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath, outputPath, pspath, inputPath, confidence, outputFilename):
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model", "./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": [], "et1": [], "et2": [], "et3": []})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath, confidence)))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]
                expResult[keyname][m]["et1"] += r[keyname][m]["et1"]
                expResult[keyname][m]["et2"] += r[keyname][m]["et2"]
                expResult[keyname][m]["et3"] += r[keyname][m]["et3"]

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for keyname in expResult:
        p = expResult[keyname]
        if not os.path.isdir(os.path.join(outputPath, keyname)):
            os.mkdir(os.path.join(outputPath, keyname))
        projizz.jsonWrite(p, os.path.join(outputPath, keyname, outputFilename))
        print "start write out to %s" % (os.path.join(outputPath, keyname))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def outputStatistics(jsonPath):
    # Read file in.
    properties = projizz.jsonRead(jsonPath)

    occDocs = []
    for degree in range(1, 18):
        degree = "%d" % (degree)
        if not degree in properties:
            print "%s\t%d" % (degree, 0)
        else:
            print "%s\t%d" % (degree, len(properties[degree]))
            for ptnId in properties[degree]:
                for articleId in properties[degree][ptnId]["occ"]:
                    if not articleId in occDocs:
                        occDocs.append(articleId)
    print len(occDocs)
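# A minimal illustrative fixture for the JSON that outputStatistics expects:
# degree -> pattern id -> {"occ": [article ids]}. The ids below are
# hypothetical. outputStatistics prints one "<degree>\t<#patterns>" line per
# degree from 1 to 17, then the number of distinct articles seen.
exampleStatistics = {
    "1": {"233": {"occ": ["12345.txt", "67890.txt"]}},
    "2": {"87": {"occ": ["12345.txt"]}},
}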
def main(inputPtnPath, outputPath, pspath):
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model", "./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp": [], "tn": [], "fp": [], "fn": []})
    sp = projizz.getSortedStatistic(projizz.jsonRead(pspath))

    # For each relation, keep the top 100 patterns that map to exactly one relation
    validate = []
    for relation in sp:
        count = 0
        for ptnId, ptnS in sp[relation]:
            ptnData = table[ptnId]
            if len(ptnData["relations"]) == 1:
                count += 1
                validate.append(ptnId)
            if count >= 100:
                break

    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, partAns, validate)))
            t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]

    print "start write out to %s" % (outputPath)
    json.dump(properties, open(outputPath, "w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath, confidence, nbcPath):
    # read patterns in articles
    contentPtnJson = json.load(open(os.path.join(inputPtnPath, filename), "r"))
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    classifiers = projizz.getNBClassifiers(nbcPath)
    print "Worker %d : Read %s into filter" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold: .5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1, 6):
        for typ in ["n", "t"]:
            if not deg == 1:
                for amb in ["one", "50", "75", "all"]:
                    keyname = "%d-%s-%s" % (deg, amb, typ)
                    expResult[keyname] = copy.deepcopy(partAns)
            else:
                keyname = "%d-1-%s" % (deg, typ)
                expResult[keyname] = copy.deepcopy(partAns)
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(", "").replace(")", "").split("_")  # get the parts of the entity name
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # origin properties; in theory this should be a superset of `observed`
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:
            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            typ = args[2]

            # Relation extraction
            relaEx = []
            ptnExRela = {}  # rela: ptns

            def recordPtnMakeRela(ptnId, rela, record):
                if not rela in record:
                    record[rela] = []
                if not ptnId in record[rela]:
                    record[rela].append(ptnId)

            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns
                lineText = article[line[0]]
                if lineText[0] == "^":  # It's a wikipedia reference comment, ignore it!
                    continue

                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line
                    ptnId = "%d" % (ptn[0])

                    # validate the pattern
                    if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                        continue

                    # get all possible relations of this pattern
                    rfp = table[ptnId]["relations"]

                    # check degree
                    if len(rfp) > degree:
                        continue

                    #
                    # Decide how to choose the relation
                    #
                    if len(rfp) == 1:  # or degree == 1
                        if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                            if typ == "t":
                                if domainRange[rfp[0]]["domain"] in types:
                                    pr = rfp[0]
                                    if not classifiers[pr] == None and classifiers[pr].classify(lineText) == "pos":
                                        relaEx.append(rfp[0])
                                        # FIXME For error checking
                                        recordPtnMakeRela(ptnId, rfp[0], ptnExRela)
                            else:
                                pr = rfp[0]
                                if not classifiers[pr] == None and classifiers[pr].classify(lineText) == "pos":
                                    relaEx.append(rfp[0])
                                    # FIXME For error checking
                                    recordPtnMakeRela(ptnId, rfp[0], ptnExRela)
                    else:
                        if ambigu == "one":
                            if typ == "t":
                                for ptnst in st[ptnId]:
                                    # ptnst[0] = relation
                                    # ptnst[1] = {"support": , "total": }
                                    if ptnst[1]["support"] > 0 and domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                            relaEx.append(ptnst[0])
                                            # FIXME For error checking
                                            recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                                        break
                            else:
                                if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx and not classifiers[rfp[0]] == None and classifiers[rfp[0]].classify(lineText) == "pos":
                                    relaEx.append(rfp[0])
                                    # FIXME For error checking
                                    recordPtnMakeRela(ptnId, rfp[0], ptnExRela)
                        elif ambigu == "all":
                            for ptnst in st[ptnId]:
                                if typ == "t":
                                    if domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                            relaEx.append(ptnst[0])
                                            # FIXME For error checking
                                            recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                                else:
                                    if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                        relaEx.append(ptnst[0])
                                        # FIXME For error checking
                                        recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                        else:
                            th = 0.75
                            if ambigu == "50":
                                th = 0.5
                            b = st[ptnId][0][1]["support"]
                            if b > 0:
                                for ptnst in st[ptnId]:
                                    if float(ptnst[1]["support"]) / float(b) >= th:
                                        if typ == "t":
                                            if domainRange[ptnst[0]]["domain"] in types and not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                                relaEx.append(ptnst[0])
                                                # FIXME For error checking
                                                recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                                        else:
                                            if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                                relaEx.append(ptnst[0])
                                                # FIXME For error checking
                                                recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)

            # Evaluation
            for attribute in expResult[keyname]:
                # special case, ignore.
                if attribute == "produced":
                    continue

                postive = False
                true = False
                if attribute in relaEx:
                    postive = True
                if attribute in relation:
                    true = True

                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        # False Positive
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                        # TODO - analyze why this false positive happened
                        if attribute in ptnExRela:
                            if attribute in originRela:
                                # type 2 error
                                expResult[keyname][attribute]["et2"].append(ans["revid"])
                            else:
                                found = False
                                ptns = ptnExRela[attribute]  # the patterns that raised this relation
                                for pid in ptns:
                                    for psbR in table[pid]["relations"]:
                                        if psbR == attribute:
                                            continue
                                        # the pattern could have raised a `correct' relation in the answer, whether chosen or not
                                        if domainRange[psbR]["domain"] in types and psbR in relation:
                                            found = True
                                            break
                                if found:
                                    # type 1 error
                                    expResult[keyname][attribute]["et1"].append(ans["revid"])
                                else:
                                    # type 3 error
                                    expResult[keyname][attribute]["et3"].append(ans["revid"])
                        else:
                            # should be unreachable: a false positive without a recorded pattern
                            pass
                    else:
                        # ignore true-negative
                        pass

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return expResult
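# The result files written by the main() functions above store raw revid
# lists under "tp"/"fp"/"fn" per relation. A small sketch (assuming that
# layout, and projizz.jsonRead as used throughout) for turning one such
# file into per-relation precision and recall:
def printScores(resultFile):
    result = projizz.jsonRead(resultFile)
    for relation in result:
        tp = len(result[relation]["tp"])
        fp = len(result[relation]["fp"])
        fn = len(result[relation]["fn"])
        if tp + fp == 0 or tp + fn == 0:
            continue  # skip relations with no decisions or no gold answers
        precision = float(tp) / float(tp + fp)
        recall = float(tp) / float(tp + fn)
        print "%s\tP=%.3f\tR=%.3f" % (relation, precision, recall)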
def mapper(jobid, filename, inputPath, outputPath, model, table):
    # Read article
    article = projizz.jsonRead(os.path.join(inputPath, filename))
    stemmer = PorterStemmer()
    tks = {}
    print "Worker %d : Read %s into filter" % (jobid, filename)

    count = 0
    total = 0
    for line in article:
        count += 1
        tokens = projizz.getTokens(line)
        for token in tokens:
            t = stemmer.stem(token)
            if t not in tks:
                tks[t] = 0
            tks[t] += 1
            total += 1
        if count % 1000 == 0:
            print "worker %d done %d lines" % (jobid, count)

    # Remove stopwords
    for sw in projizz._stopwords:
        _sw = stemmer.stem(sw)
        if _sw in tks:
            total -= tks[_sw]
            tks.pop(_sw)

    needRemove = []
    maxTF = 0
    for t in tks:
        # ignore words that appear only once
        if tks[t] <= 1:
            needRemove.append(t)
            total -= tks[t]
            continue
        # ignore tokens that contain a digit
        if any(c.isdigit() for c in t):
            needRemove.append(t)
            total -= tks[t]
            continue
        if tks[t] > maxTF:
            maxTF = tks[t]

    for rm in needRemove:
        tks.pop(rm)

    projizz.jsonWrite(tks, os.path.join(outputPath, filename.replace(".json", ".tfc")))

    # Calculate tf, normalized by the maximum term frequency
    for t in tks:
        tc = tks[t]
        tks[t] = float(tc) / float(maxTF)

    projizz.jsonWrite(tks, os.path.join(outputPath, filename.replace(".json", ".tf")))
    print "worker %d write out." % (jobid)
    return (filename, tks)
def viewer(path, threshold):
    model = projizz.jsonRead(path)
    sortedModel = sorted(model.items(), key=lambda x: x[1], reverse=True)
    for word, score in sortedModel:
        if score >= threshold:
            print "%s\t%f" % (word.encode("utf-8"), score)
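# viewer() is a plain function; a command-line entry point in the argv style
# of simpleSortedViewer below (the script name here is hypothetical):
if __name__ == "__main__":
    import sys
    if len(sys.argv) <= 2:
        print "$ python ./tfViewer.py [tf json] [threshold]"
    else:
        viewer(sys.argv[1], float(sys.argv[2]))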
def main(inputPath, inputPtnPath, outputPath, outputPtnPath):
    debug = False

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)
    if not os.path.isdir(outputPtnPath):
        os.mkdir(outputPtnPath)

    result = []
    count = 0

    # Update answer
    cpuN = multiprocessing.cpu_count()
    print "CoreNumber = %d" % (cpuN)
    pool = multiprocessing.Pool(processes=12)
    t = 0
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            t += 1
            if debug:
                result.append(updateAnswer(t, inputPath, filename))
            else:
                result.append(pool.apply_async(updateAnswer, (t, inputPath, filename)))
    pool.close()
    pool.join()

    # Rebuild articles and patterns
    tmpArticle = {}
    tmpPtn = {}
    dataSize = 0
    for res in result:
        if debug:
            filename, articles = res
        else:
            filename, articles = res.get()
        print filename, len(articles)

        a = projizz.jsonRead(os.path.join(inputPath, filename))
        p = projizz.jsonRead(os.path.join(inputPtnPath, filename))

        for key in articles:
            dataSize += 1
            tmpArticle[key] = a[key]
            tmpPtn[key] = p[key]
            if len(tmpPtn) == 1000:
                print "write to %05d.json" % (count)
                projizz.jsonWrite(tmpArticle, os.path.join(outputPath, "%05d.json" % (count)))
                projizz.jsonWrite(tmpPtn, os.path.join(outputPtnPath, "%05d.json" % (count)))
                tmpArticle = {}
                tmpPtn = {}
                count += 1

    if len(tmpPtn) > 0:
        print "write to %05d.json" % (count)
        projizz.jsonWrite(tmpArticle, os.path.join(outputPath, "%05d.json" % (count)))
        projizz.jsonWrite(tmpPtn, os.path.join(outputPtnPath, "%05d.json" % (count)))
        tmpArticle = {}
        tmpPtn = {}
        count += 1

    # Split to 5 parts
    splitTo5part("/tmp2/r01922024", "y-all", "/tmp2/r01922024", "y")
    splitTo5part("/tmp2/r01922024", "y-ptn-all", "/tmp2/r01922024", "y-ptn")

    print "write %d files. (%d)" % (count, dataSize)
def mapper(jobid, filename, inputPath, inputPtnPath, model, table):
    # Read article
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # Read ptn
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    print "Worker %d : Read %s into filter" % (jobid, filename)

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    supportInstanceByFile = {}
    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])
        supportInstanceByFile[key] = {}

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])
                if not projizz.isPatternValidate(ptnId, table):
                    continue
                for rela in table[ptnId]["relations"]:
                    # it's a support instance
                    if rela in relation:
                        if not ptnId in supportInstanceByFile[key]:
                            supportInstanceByFile[key][ptnId] = {}
                        if not rela in supportInstanceByFile[key][ptnId]:
                            supportInstanceByFile[key][ptnId][rela] = []
                        if not line[0] in supportInstanceByFile[key][ptnId][rela]:
                            supportInstanceByFile[key][ptnId][rela].append(line[0])

        # replace the recorded line numbers with the actual sentence text
        for ptnId in supportInstanceByFile[key]:
            for rela in supportInstanceByFile[key][ptnId]:
                lines = supportInstanceByFile[key][ptnId][rela]
                supportInstanceByFile[key][ptnId][rela] = []
                for lineN in lines:
                    supportInstanceByFile[key][ptnId][rela].append(article[lineN])

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return supportInstanceByFile
def main(part, revid):
    # Paths (on NLG workstation)
    inputPath = "/tmp2/ccli/y-part-%s/" % (part)
    inputPtnPath = "/tmp2/ccli/y-ptn-part-%s/" % (part)
    spPath = "../yago/yagoPSv2/ps.%s.json" % (part)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    itr = collection.find({"revid": revid})

    # find the file that contains this revid
    a = os.popen("grep -nr \"%s\" %s" % (revid, inputPath)).readline()
    targetFilename = a.split(":")[0].split("/")[-1]
    key = "%s.txt" % (revid)

    pattern = projizz.jsonRead(inputPtnPath + targetFilename)[key]
    article = projizz.articleSimpleSentenceFileter(projizz.jsonRead(inputPath + targetFilename)[key])
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath))
    domainRange = projizz.getYagoRelationDomainRange()
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")

    print "Part %s, RevID=%s, in %s" % (part, revid, targetFilename)

    for ans in itr:
        targetName = ans["_id"].replace("(", "").replace(")", "").split("_")  # get the parts of the entity name
        types = ans["type"]
        answers = ans["observed"]
        print "Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"), targetName)
        print "Type=%s" % (types)
        print "Answer=%s" % (answers)

        for line in pattern:
            lineText = article[line[0]]
            named = False
            for namedToken in targetName:
                if namedToken in lineText:
                    named = True
                    break
            if not named:  # No target name in line text,
                continue   # go to next line.

            for ptn in line[1]:
                ptnId = "%d" % (ptn[0])
                #rfp = table[ptnId]["relations"]
                if not ptnId in st:
                    continue
                for ps in st[ptnId]:
                    if float(ps[1]["support"]) / float(ps[1]["total"]) > 0:
                        if domainRange[ps[0]]["domain"] in types:
                            print "#%d" % (line[0]), lineText.encode("utf-8")
                            isIn = "(X)"
                            if ps[0] in answers:
                                isIn = "(O)"
                            print "%s %s/%s/{%d,%d}/ %s" % (isIn, ptnId, table[ptnId]["pattern"], ps[1]["support"], ps[1]["total"], ps[0])
                    # select top 1
                    break

        # prevent second ans
        break
def generate(inputSPIpath, inputTestPath, outputVSMpath, confidence):
    # Checking output path
    projizz.checkPath(outputVSMpath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)

    # Collect the keys that are not used, because of 5-fold CV
    t = 0
    result = []
    for filename in os.listdir(inputTestPath):
        if ".json" in filename:
            result.append(pool.apply_async(mapper, (t, filename, inputTestPath)))
            t += 1
    pool.close()
    pool.join()

    notUsedKeys = []
    for r in result:
        ks = r.get()
        notUsedKeys += ks

    ### Build Model
    # Pattern Selection
    modelArticles = projizz.buildYagoProperties([])
    words = []
    count = 0
    for filename in os.listdir(inputSPIpath):
        if ".json" in filename:
            ptnId = filename[:-5]
            # ignore invalid patterns
            if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                continue
            count += 1
            print count, ptnId
            ptnInstance = projizz.jsonRead(os.path.join(inputSPIpath, filename))
            for rela in ptnInstance:
                for key in ptnInstance[rela]:
                    # ignore keys that belong to the testing data
                    if key in notUsedKeys:
                        continue
                    for line in ptnInstance[rela][key]:
                        modelArticles[rela].append(line)
            if count % 100 == 0:
                print "Read", count, "files"

    for relation in modelArticles:
        print relation
        projizz.jsonWrite(modelArticles[relation], os.path.join(outputVSMpath, "%s.txt" % (relation)))
def mapper(jobid, filename, inputTestPath):
    contentPtnJson = projizz.jsonRead(os.path.join(inputTestPath, filename))
    keys = list(contentPtnJson)
    print "Worker %d read %s, done." % (jobid, filename)
    return keys
# -*- coding: utf-8 -*-
# qcl
#

import sys
import projizz

if len(sys.argv) <= 1:
    print "$ python ./simpleSortedViewer.py [ps json]"
else:
    filename = sys.argv[1]
    ps = projizz.jsonRead(filename)
    sortedp = projizz.getSortedStatistic(ps)
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model", "./yagoPatternTree.table")
    for relation in sortedp:
        print relation
        for ptnId, ptnS in sortedp[relation]:
            print "%s\t%s %s %s" % (relation, table[ptnId]["pattern"], ptnId, ptnS)
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath, confidence):
    # read patterns in articles
    contentPtnJson = json.load(open(os.path.join(inputPtnPath, filename), "r"))
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    print "Worker %d : Read %s into filter" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold: .5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1, 6):
        for typ in ["n", "t"]:
            if not deg == 1:
                for amb in ["one", "50", "75", "all"]:
                    keyname = "%d-%s-%s" % (deg, amb, typ)
                    expResult[keyname] = copy.deepcopy(partAns)
            else:
                keyname = "%d-1-%s" % (deg, typ)
                expResult[keyname] = copy.deepcopy(partAns)
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(", "").replace(")", "").split("_")  # get the parts of the entity name
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # origin properties; in theory this should be a superset of `observed`
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:
            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            typ = args[2]

            # Relation extraction
            relaEx = []
            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns
                lineText = article[line[0]]
                if lineText[0] == "^":  # It's a wikipedia reference comment, ignore it!
                    continue
                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line
                    ptnId = "%d" % (ptn[0])
                    if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                        continue
                    rfp = table[ptnId]["relations"]
                    # check degree
                    if len(rfp) > degree:
                        continue

                    if len(rfp) == 1:  # or degree == 1
                        if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                            if typ == "t":
                                if domainRange[rfp[0]]["domain"] in types:
                                    relaEx.append(rfp[0])
                            else:
                                relaEx.append(rfp[0])
                    else:
                        if ambigu == "one":
                            if typ == "t":
                                for ptnst in st[ptnId]:
                                    # ptnst[0] = relation
                                    # ptnst[1] = {"support": , "total": }
                                    if ptnst[1]["support"] > 0 and domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                        break
                            else:
                                if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                                    relaEx.append(rfp[0])
                        elif ambigu == "all":
                            for ptnst in st[ptnId]:
                                if typ == "t":
                                    if domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                else:
                                    if not ptnst[0] in relaEx:
                                        relaEx.append(ptnst[0])
                        else:
                            th = 0.75
                            if ambigu == "50":
                                th = 0.5
                            b = st[ptnId][0][1]["support"]
                            if b > 0:
                                for ptnst in st[ptnId]:
                                    if float(ptnst[1]["support"]) / float(b) >= th:
                                        if typ == "t":
                                            if domainRange[ptnst[0]]["domain"] in types and not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])
                                        else:
                                            if not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])

            # Evaluation
            for attribute in expResult[keyname]:
                # special case, ignore.
                if attribute == "produced":
                    continue
                postive = False
                true = False
                if attribute in relaEx:
                    postive = True
                if attribute in relation:
                    true = True
                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                    else:
                        # ignore true-negative
                        pass

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return expResult
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData):
    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    print "Worker %d : Read %s" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    expResult = {}
    relaEx = {}
    # set thresholds
    for th in range(0, 51, 5):
        expResult[th] = copy.deepcopy(partAns)
        relaEx[th] = []
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(", "").replace(")", "").split("_")  # get the parts of the entity name
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # origin properties; in theory this should be a superset of `observed`
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # reset the per-threshold extraction lists for this article,
        # otherwise relations leak from one article into the next
        for th in relaEx:
            relaEx[th] = []

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])
                ptntks = table[ptnId]["pattern"]
                lineText = article[line[0]]

                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue

                rfp = table[ptnId]["relations"]
                # check degree
                if len(rfp) > 5:
                    continue
                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                # TODO - modify the string, remove the pattern text from it?
                cosRlt = projizz.vsmSimilarity(lineText, vsmData, relas=rfp, ptntext=ptntks)

                # NOTE - if cosine value > threshold then there is a relation (?)
                for keyname in expResult:
                    threshold = float(keyname) / 100.0
                    for pr in cosRlt:
                        # Check type
                        if domainRange[pr]["domain"] in types:
                            if cosRlt[pr] > threshold:
                                if pr not in relaEx[keyname]:
                                    relaEx[keyname].append(pr)

        #### Evaluation
        for keyname in expResult:
            for attribute in expResult[keyname]:
                # special case, ignore.
                if attribute == "produced":
                    continue
                postive = False
                true = False
                if attribute in relaEx[keyname]:
                    postive = True
                if attribute in relation:
                    true = True
                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                    else:
                        # ignore true-negative
                        pass

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return expResult
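# projizz.vsmSimilarity is used as a black box above. Under the usual TF-IDF
# reading of vsmData = (idf, docs, lens), a simplified stand-in for one
# relation's score might look like the sketch below -- an assumption for
# illustration, not the projizz implementation.
import math

def cosineSketch(lineTokens, idf, docVector):
    # term frequencies of the query line
    tf = {}
    for t in lineTokens:
        tf[t] = tf.get(t, 0) + 1
    # dot product and query norm over TF-IDF weights
    dot = 0.0
    qnorm = 0.0
    for t in tf:
        w = tf[t] * idf.get(t, 0.0)
        qnorm += w * w
        dot += w * docVector.get(t, 0.0)
    dnorm = math.sqrt(sum(w * w for w in docVector.values()))
    if qnorm == 0.0 or dnorm == 0.0:
        return 0.0
    return dot / (math.sqrt(qnorm) * dnorm)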
def mapper(jobid, filename, inputPath, inputPtnPath, model, table, confidence):
    # Read article
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # Read ptn
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    print "Worker %d : Read %s into filter" % (jobid, filename)

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    supportInstanceByFile = {}
    linesByRelations = {}
    linesNoRelaByRelations = {}
    POS = {}
    NEG = {}
    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])
        supportInstanceByFile[key] = {}

        linesByRela = {}
        linesByNoRela = {}
        pos = {}
        neg = {}

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])
                if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                    continue
                # give up patterns with degree > 5
                if len(table[ptnId]["relations"]) > 5:
                    continue
                for rela in table[ptnId]["relations"]:
                    # it's a support instance
                    if rela in relation:
                        # NOTE - remove pattern text.
                        if not rela in linesByRela:
                            linesByRela[rela] = {}
                        if not line[0] in linesByRela[rela]:
                            linesByRela[rela][line[0]] = []
                        if not ptnId in linesByRela[rela][line[0]]:
                            linesByRela[rela][line[0]].append(ptnId)
                        # For binary classifier
                        if not rela in pos:
                            pos[rela] = []
                        if not lineText[0] == "^" and line[0] not in pos[rela]:
                            pos[rela].append(line[0])
                    else:
                        if not rela in linesByNoRela:
                            linesByNoRela[rela] = {}
                        if not line[0] in linesByNoRela[rela]:
                            linesByNoRela[rela][line[0]] = []
                        if not ptnId in linesByNoRela[rela][line[0]]:
                            linesByNoRela[rela][line[0]].append(ptnId)
                        # For binary classifier
                        if not rela in neg:
                            neg[rela] = []
                        if not lineText[0] == "^" and line[0] not in neg[rela]:
                            neg[rela].append(line[0])

        for rela in linesByRela:
            if not rela in linesByRelations:
                linesByRelations[rela] = []
            for lineN in linesByRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesByRelations[rela].append(l)

        for rela in linesByNoRela:
            if not rela in linesNoRelaByRelations:
                linesNoRelaByRelations[rela] = []
            for lineN in linesByNoRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByNoRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesNoRelaByRelations[rela].append(l)

        # For binary classifier
        for rela in pos:
            if not rela in POS:
                POS[rela] = []
            for lineN in pos[rela]:
                POS[rela].append({"text": article[lineN], "label": "pos"})
        for rela in neg:
            if not rela in NEG:
                NEG[rela] = []
            for lineN in neg[rela]:
                NEG[rela].append({"text": article[lineN], "label": "neg"})

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return linesByRelations, linesNoRelaByRelations, POS, NEG
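# The POS/NEG dictionaries returned by this mapper line up with the
# "%s.pos" / "%s.neg" files that trainModel() reads. A reducer sketch that
# merges worker results and writes those files (assuming projizz.jsonWrite
# as used throughout; `results` is the list of pool.apply_async handles):
def writeTrainingFiles(results, outputPath):
    POS = {}
    NEG = {}
    for res in results:
        linesByRela, linesNoRela, pos, neg = res.get()
        for rela in pos:
            POS.setdefault(rela, []).extend(pos[rela])
        for rela in neg:
            NEG.setdefault(rela, []).extend(neg[rela])
    for rela in POS:
        projizz.jsonWrite(POS[rela], os.path.join(outputPath, "%s.pos" % (rela)))
    for rela in NEG:
        projizz.jsonWrite(NEG[rela], os.path.join(outputPath, "%s.neg" % (rela)))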
def updateAnswer(jobid, inputPath, filename):
    contenJson = projizz.jsonRead(os.path.join(inputPath, filename))
    print "#%d - %s" % (jobid, filename)

    connect = Connection()
    answerCollection = connect.projizz.result.yago.answer
    factCollection = connect.projizz.yago.facts
    queries = map(lambda x: x[:-4], contenJson)
    itr = answerCollection.find({"revid": {"$in": queries}})
    print "#%d - query=%d,result=%d" % (jobid, len(queries), itr.count())

    count = 0
    ty1g = 0
    ty2g = 0
    updateC = 0
    articles = []
    notNeed = []  # properties with no matching fact in the database
    for ans in itr:
        count += 1
        articleID = "%s.txt" % (ans["revid"])
        articleName = ans["_id"]
        properties = ans["properties"]
        # references are not considered.
        #references = ans["references"]

        if len(properties) == 0:
            # give up articles that have no properties
            # print "#%d - give up %s (1)" % (jobid, articleID)
            ty1g += 1
            continue

        needUpdate = len(properties)
        lines = projizz.articleSimpleSentenceFileter(contenJson[articleID])
        text = ""
        for line in lines:
            text += (line + " ")

        observed = []
        for pro in properties:
            pitr = factCollection.find({"property": pro, "subject": articleName})
            if pitr.count() < 1:
                notNeed.append(pro)
                continue
            found = False
            for fact in pitr:
                tokens = projizz.getNamedEntityTokens(fact["object"])
                for token in tokens:
                    if token in text:
                        found = True
                        break
                if found:
                    break
            if found:
                observed.append(pro)

        if len(observed) > 0:
            articles.append(articleID)
            ans["observed"] = observed
            answerCollection.update({"revid": ans["revid"]}, ans, upsert=False)
        else:
            ty2g += 1
            #print "#%d - give up %s (2)" % (jobid, articleID)

    print "#%d -> update %d (give up %d + %d)" % (jobid, len(articles), ty1g, ty2g)
    return (filename, articles)
def mapper(jobid, filename, inputPath, topN, outputPath, model, table):
    # Read article
    article = projizz.jsonRead(os.path.join(inputPath, filename))
    stemmer = PorterStemmer()
    tks = {}
    print "Worker %d : Read %s into filter" % (jobid, filename)

    count = 0
    total = 0
    for line in article:
        count += 1
        tokens = projizz.getTokens(line)
        for token in tokens:
            t = stemmer.stem(token)
            if t not in tks:
                tks[t] = 0
            tks[t] += 1
            total += 1
        if count % 1000 == 0:
            print "worker %d done %d lines" % (jobid, count)

    # Remove stopwords
    for sw in projizz._stopwords:
        _sw = stemmer.stem(sw)
        if _sw in tks:
            total -= tks[_sw]
            tks.pop(_sw)

    needRemove = []
    maxTF = 1
    for t in tks:
        # ignore words that appear only once
        if tks[t] <= 1:
            needRemove.append(t)
            total -= tks[t]
            continue
        # ignore tokens that contain a digit
        if any(c.isdigit() for c in t):
            needRemove.append(t)
            total -= tks[t]
            continue
        #if tks[t] > maxTF:
        #    maxTF = tks[t]

    for rm in needRemove:
        tks.pop(rm)

    projizz.jsonWrite(tks, os.path.join(outputPath, filename.replace(".json", ".tfc")))

    ### select top N words
    # sort by tfc
    sortedTks = sorted(tks.items(), key=lambda x: x[1], reverse=True)
    tks = {}
    maxTF = sortedTks[0][1]

    # Calculate tf, normalized by the maximum term frequency
    top = 0
    for t, c in sortedTks:
        top += 1
        tks[t] = float(c) / float(maxTF)
        if top == topN:
            break

    projizz.jsonWrite(tks, os.path.join(outputPath, filename.replace(".json", ".tf")))
    print "worker %d write out." % (jobid)
    return (filename, tks)
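# The .tfc files above hold raw stem counts per article file. A
# document-frequency pass over them (a sketch assuming the standard
# idf = log(N / df) convention, not necessarily what projizz.getVSMmodels
# expects):
import math

def buildIdf(tfcPath):
    df = {}
    n = 0
    for filename in os.listdir(tfcPath):
        if not filename.endswith(".tfc"):
            continue
        n += 1
        # count in how many files each stem appears
        for t in projizz.jsonRead(os.path.join(tfcPath, filename)):
            df[t] = df.get(t, 0) + 1
    idf = {}
    for t in df:
        idf[t] = math.log(float(n) / float(df[t]))
    return idf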
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, nbcPath):
    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    classifiers = projizz.getNBClassifiers(nbcPath)
    print "Worker %d : Read %s" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    expResult = partAns
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(", "").replace(")", "").split("_")  # get the parts of the entity name
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # origin properties; in theory this should be a superset of `observed`
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # relations extracted from this article (reset per article so they
        # do not leak into the next one)
        relaEx = []

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]
            if lineText[0] == "^":  # It's a wikipedia reference comment, ignore it!
                continue
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])
                ptntks = table[ptnId]["pattern"]
                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue
                rfp = table[ptnId]["relations"]
                # check degree
                if len(rfp) > 5:
                    continue
                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue
                for ptnst in st[ptnId]:
                    # ptnst[0] = relation
                    # ptnst[1] = {"support": , "total": }
                    if domainRange[ptnst[0]]["domain"] not in types:
                        continue
                    if classifiers[ptnst[0]] == None:
                        continue
                    if classifiers[ptnst[0]].classify(lineText) == "pos":
                        if not ptnst[0] in relaEx:
                            relaEx.append(ptnst[0])

        #### Evaluation
        for attribute in expResult:
            # special case, ignore.
            if attribute == "produced":
                continue
            postive = False
            true = False
            if attribute in relaEx:
                postive = True
            if attribute in relation:
                true = True
            if true:
                if postive:
                    expResult[attribute]["tp"].append(ans["revid"])
                else:
                    expResult[attribute]["fn"].append(ans["revid"])
            else:
                if postive:
                    expResult[attribute]["fp"].append(ans["revid"])
                else:
                    # ignore true-negative
                    pass

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return expResult
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath, confidence):
    # read patterns in articles
    contentPtnJson = json.load(open(os.path.join(inputPtnPath, filename), "r"))
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    print "Worker %d : Read %s into filter" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0

    # prepare keys for the multiple-setting experiment
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold: .5 or .75), select all
    # type or not: no type info ("n"), type info ("t")
    expResult = {}
    for deg in range(1, 6):
        for typ in ["n", "t"]:
            if not deg == 1:
                for amb in ["one", "50", "75", "all"]:
                    keyname = "%d-%s-%s" % (deg, amb, typ)
                    expResult[keyname] = copy.deepcopy(partAns)
            else:
                keyname = "%d-1-%s" % (deg, typ)
                expResult[keyname] = copy.deepcopy(partAns)
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's parts
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # original properties; in theory a superset of the observed ones
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:
            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            typ = args[2]

            # Relation extraction
            relaEx = []
            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns
                lineText = article[line[0]]
                if lineText[0] == "^":  # it's a Wikipedia reference comment, ignore it
                    continue
                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line
                    ptnId = "%d" % (ptn[0])
                    if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                        continue
                    rfp = table[ptnId]["relations"]
                    # check degree
                    if len(rfp) > degree:
                        continue
                    if len(rfp) == 1:  # or degree == 1
                        if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                            if typ == "t":
                                if domainRange[rfp[0]]["domain"] in types:
                                    relaEx.append(rfp[0])
                            else:
                                relaEx.append(rfp[0])
                    else:
                        if ambigu == "one":
                            if typ == "t":
                                for ptnst in st[ptnId]:
                                    # ptnst[0] = relation
                                    # ptnst[1] = {"support": , "total": }
                                    if ptnst[1]["support"] > 0 and domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                        break
                            else:
                                if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                                    relaEx.append(rfp[0])
                        elif ambigu == "all":
                            for ptnst in st[ptnId]:
                                if typ == "t":
                                    if domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                else:
                                    if not ptnst[0] in relaEx:
                                        relaEx.append(ptnst[0])
                        else:
                            th = 0.75
                            if ambigu == "50":
                                th = 0.5
                            b = st[ptnId][0][1]["support"]
                            if b > 0:
                                for ptnst in st[ptnId]:
                                    if float(ptnst[1]["support"]) / float(b) >= th:
                                        if typ == "t":
                                            if domainRange[ptnst[0]]["domain"] in types and not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])
                                        else:
                                            if not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])

            # Evaluation
            for attribute in expResult[keyname]:
                # special case, ignore.
                if attribute == "produced":
                    continue
                positive = attribute in relaEx
                true = attribute in relation
                if true:
                    if positive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if positive:
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                    # else: ignore true-negative

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return expResult
def main(part, revid):
    # Paths (on NLG workstation)
    inputPath = "/tmp2/ccli/yago-part-%s/" % (part)
    inputPtnPath = "/tmp2/ccli/yago-ptn-part-%s/" % (part)
    spPath = "../yago/yagoPSv1/ps.%s.json" % (part)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    itr = collection.find({"revid": revid})

    # find the shard file that contains this revid
    a = os.popen("grep -nr \"%s\" %s" % (revid, inputPath)).readline()
    targetFilename = a.split(":")[0].split("/")[-1]

    key = "%s.txt" % (revid)
    pattern = projizz.jsonRead(inputPtnPath + targetFilename)[key]
    article = projizz.articleSimpleSentenceFileter(projizz.jsonRead(inputPath + targetFilename)[key])
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath))
    domainRange = projizz.getYagoRelationDomainRange()
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")

    print "Part %s, RevID=%s, in %s" % (part, revid, targetFilename)

    for ans in itr:
        targetName = ans["_id"].replace("(", "").replace(")", "").split("_")  # get entity name's parts
        types = ans["type"]
        answers = ans["properties"]
        print "Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"), targetName)
        print "Type=%s" % (types)
        print "Answer=%s" % (answers)

        for line in pattern:
            lineText = article[line[0]]
            named = False
            for namedToken in targetName:
                if namedToken in lineText:
                    named = True
                    break
            if not named:
                # no target-name token in this line, go to the next line
                continue
            for ptn in line[1]:
                ptnId = "%d" % (ptn[0])
                #rfp = table[ptnId]["relations"]
                if not ptnId in st:
                    continue
                for ps in st[ptnId]:
                    if float(ps[1]["support"]) / float(ps[1]["total"]) > 0:
                        if domainRange[ps[0]]["domain"] in types:
                            print "#%d" % (line[0]), lineText.encode("utf-8")
                            isIn = "(X)"
                            if ps[0] in answers:
                                isIn = "(O)"
                            print "%s %s/%s/{%d,%d}/ %s" % (isIn, ptnId, table[ptnId]["pattern"], ps[1]["support"], ps[1]["total"], ps[0])
                    # select top 1 only
                    break
        # prevent a second answer
        break
def mapper(jobid, filename, inputPath, inputPtnPath, model, table, confidence):
    # Read article
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # Read ptn
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    print "Worker %d : Read %s into filter" % (jobid, filename)

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    supportInstanceByFile = {}
    linesByRelations = {}
    linesNoRelaByRelations = {}
    POS = {}
    NEG = {}

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])
        supportInstanceByFile[key] = {}
        linesByRela = {}
        linesByNoRela = {}
        pos = {}
        neg = {}

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])
                if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                    continue
                # give up patterns with degree > 5
                if len(table[ptnId]["relations"]) > 5:
                    continue
                for rela in table[ptnId]["relations"]:
                    if rela in relation:
                        # it's a support instance
                        # NOTE - remove pattern text.
                        if not rela in linesByRela:
                            linesByRela[rela] = {}
                        if not line[0] in linesByRela[rela]:
                            linesByRela[rela][line[0]] = []
                        if not ptnId in linesByRela[rela][line[0]]:
                            linesByRela[rela][line[0]].append(ptnId)
                        # For binary classifier
                        if not rela in pos:
                            pos[rela] = []
                        if not lineText[0] == "^" and line[0] not in pos[rela]:
                            pos[rela].append(line[0])
                    else:
                        if not rela in linesByNoRela:
                            linesByNoRela[rela] = {}
                        if not line[0] in linesByNoRela[rela]:
                            linesByNoRela[rela][line[0]] = []
                        if not ptnId in linesByNoRela[rela][line[0]]:
                            linesByNoRela[rela][line[0]].append(ptnId)
                        # For binary classifier
                        if not rela in neg:
                            neg[rela] = []
                        if not lineText[0] == "^" and line[0] not in neg[rela]:
                            neg[rela].append(line[0])

        # strip each pattern's own tokens out of its support lines
        for rela in linesByRela:
            if not rela in linesByRelations:
                linesByRelations[rela] = []
            for lineN in linesByRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesByRelations[rela].append(l)

        for rela in linesByNoRela:
            if not rela in linesNoRelaByRelations:
                linesNoRelaByRelations[rela] = []
            for lineN in linesByNoRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByNoRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesNoRelaByRelations[rela].append(l)

        # For binary classifier
        for rela in pos:
            if not rela in POS:
                POS[rela] = []
            for lineN in pos[rela]:
                POS[rela].append({"text": article[lineN], "label": "pos"})
        for rela in neg:
            if not rela in NEG:
                NEG[rela] = []
            for lineN in neg[rela]:
                NEG[rela].append({"text": article[lineN], "label": "neg"})

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return linesByRelations, linesNoRelaByRelations, POS, NEG
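# NOTE - how projizz.getNBClassifiers trains its per-relation binary
# classifiers is not shown in this file. The sketch below is one plausible
# implementation, assuming NLTK's Naive Bayes with bag-of-words features;
# the wrapper matches the classifiers[rela].classify(lineText) == "pos"
# usage seen in the mappers above. All names here are hypothetical.
import nltk

def bagOfWords(text):
    # unigram-presence features over lowercased tokens
    return dict((tk, True) for tk in text.lower().split())

class TextClassifier(object):
    # wraps an NLTK classifier so .classify() accepts raw line text
    def __init__(self, nbc):
        self.nbc = nbc
    def classify(self, text):
        return self.nbc.classify(bagOfWords(text))

def trainClassifier(posLines, negLines):
    # posLines/negLines: lists of {"text": ..., "label": "pos"/"neg"}
    # as accumulated in POS[rela]/NEG[rela] by the mapper above
    train = [(bagOfWords(d["text"]), d["label"]) for d in posLines + negLines]
    if not train:
        return None
    return TextClassifier(nltk.NaiveBayesClassifier.train(train))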
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData):
    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    print "Worker %d : Read %s" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    expResult = {}
    relaEx = {}
    # set thresholds: keys are percentages 0, 5, ..., 50
    for th in range(0, 51, 5):
        expResult[th] = copy.deepcopy(partAns)
        relaEx[th] = []  # NOTE - accumulated across all answers in this file
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's parts
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # original properties; in theory a superset of the observed ones
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])
                ptntks = table[ptnId]["pattern"]
                lineText = article[line[0]]
                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue
                rfp = table[ptnId]["relations"]
                # check degree
                if len(rfp) > 5:
                    continue
                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue
                # TODO - modify string: remove pattern text from the string?
                cosRlt = projizz.vsmSimilarity(lineText, vsmData, relas=rfp, ptntext=ptntks)
                # NOTE - if cosine value > threshold then there is a relation (?)
                for keyname in expResult:
                    threshold = float(keyname) / 100.0
                    for pr in cosRlt:
                        # Check type
                        if domainRange[pr]["domain"] in types:
                            if cosRlt[pr] > threshold:
                                if pr not in relaEx[keyname]:
                                    relaEx[keyname].append(pr)

        # Evaluation
        for keyname in expResult:
            for attribute in expResult[keyname]:
                # special case, ignore.
                if attribute == "produced":
                    continue
                positive = attribute in relaEx[keyname]
                true = attribute in relation
                if true:
                    if positive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if positive:
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                    # else: ignore true-negative

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return expResult
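# NOTE - projizz.vsmSimilarity is a black box at this call site. The sketch
# below shows the cosine computation it plausibly performs; the vsmData
# layout (relation -> {term: weight}) and the removal of the pattern's own
# tokens are assumptions inferred from the arguments passed above.
import math
from collections import Counter

def cosineSimilarity(lineText, vsmData, relas, ptntext):
    tokens = lineText.lower().split()
    for tk in ptntext.split():
        if tk in tokens:
            tokens.remove(tk)  # drop the pattern's own tokens
    tf = Counter(tokens)
    lineNorm = math.sqrt(sum(v * v for v in tf.values()))
    result = {}
    for rela in relas:
        vec = vsmData.get(rela, {})  # assumed layout: relation -> {term: weight}
        dot = sum(tf[t] * w for t, w in vec.items() if t in tf)
        relaNorm = math.sqrt(sum(w * w for w in vec.values()))
        if lineNorm > 0 and relaNorm > 0:
            result[rela] = dot / (lineNorm * relaNorm)
        else:
            result[rela] = 0.0
    return result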
def mapper(jobid, filename, inputPath, inputPtnPath, model, table):
    # Read article
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # Read ptn
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    print "Worker %d : Read %s into filter" % (jobid, filename)

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    supportInstanceByFile = {}

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])
        supportInstanceByFile[key] = {}

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])
                if not projizz.isPatternValidate(ptnId, table):
                    continue
                for rela in table[ptnId]["relations"]:
                    # it's a support instance
                    if rela in relation:
                        if not ptnId in supportInstanceByFile[key]:
                            supportInstanceByFile[key][ptnId] = {}
                        if not rela in supportInstanceByFile[key][ptnId]:
                            supportInstanceByFile[key][ptnId][rela] = []
                        if not line[0] in supportInstanceByFile[key][ptnId][rela]:
                            supportInstanceByFile[key][ptnId][rela].append(line[0])

        # replace stored line numbers with the actual line text
        for ptnId in supportInstanceByFile[key]:
            for rela in supportInstanceByFile[key][ptnId]:
                lines = supportInstanceByFile[key][ptnId][rela]
                supportInstanceByFile[key][ptnId][rela] = []
                for lineN in lines:
                    supportInstanceByFile[key][ptnId][rela].append(article[lineN])

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return supportInstanceByFile
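# NOTE - illustrative shape of this mapper's return value; the revid,
# pattern ID, relation, and sentence below are made-up examples.
# supportInstanceByFile = {
#     "12345.txt": {
#         "678": {                                   # pattern ID
#             "wasBornIn": [                         # relation the pattern supports
#                 "He was born in Vienna in 1881.",  # matched line text
#             ],
#         },
#     },
# }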
# -*- coding: utf-8 -*-
# qcl
#
import sys
import projizz

if len(sys.argv) <= 1:
    print "$ python ./simpleSortedViewer.py [ps json]"
else:
    filename = sys.argv[1]
    ps = projizz.jsonRead(filename)
    sortedp = projizz.getSortedStatistic(ps)
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model", "./yagoPatternTree.table")
    for relation in sortedp:
        print relation
        for ptnId, ptnS in sortedp[relation]:
            print "%s\t%s %s %s" % (relation, table[ptnId]["pattern"], ptnId, ptnS)
def viewer(path, threshold):
    model = projizz.jsonRead(path)
    # sort words by score, highest first
    sortedModel = sorted(model.items(), key=lambda x: x[1], reverse=True)
    for word, score in sortedModel:
        if score >= threshold:
            print "%s\t%f" % (word.encode("utf-8"), score)
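# NOTE - hedged usage example; the file name is a placeholder:
#   viewer("./keywordScore.json", 0.5)
# prints every word whose score clears the threshold, best first.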
def mapper(jobid, filename, inputTestPath):
    contentPtnJson = projizz.jsonRead(os.path.join(inputTestPath, filename))
    keys = contentPtnJson.keys()  # article keys in this shard
    print "Worker %d read %s, done." % (jobid, filename)
    return keys