def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath):
    """Extract relations for every article of one chunk file and score them.

    Two JSON files with the same ``filename`` are loaded: the pattern
    matches from ``inputPtnPath`` and the article texts from ``inputPath``.
    The answers for those articles are fetched from the
    ``projizz.result.yago.answer`` MongoDB collection. For each answer the
    relations supported by the matched patterns are collected, relations
    whose domain is not among the entity's types are dropped, and the
    remainder is compared with ``ans["properties"]``.

    Args:
        jobid: Worker number, only used in progress messages.
        filename: Name of the chunk file present in both input directories.
        inputPtnPath: Directory of the pattern-match JSON files.
        model: Not used by this function; kept for the caller's signature.
        table: Mapping of pattern-id string to a dict providing
            ``"relations"``, ``"used"`` and optionally ``"eval"``.
        partAns: Mapping of relation name to a dict holding the lists
            ``"tp"``, ``"fn"``, ``"fp"`` and ``"tn"``. Mutated in place.
        st: Mapping of pattern-id string to statistics; only
            ``st[ptnId][0][1]["support"]`` is read.
        domainRange: Mapping of relation name to either ``""`` or a dict
            with a ``"domain"`` entry.
        inputPath: Directory of the article JSON files.

    Returns:
        The same ``partAns`` object, with the ``revid`` of every processed
        answer appended to exactly one of the four lists per relation.
    """
    # read patterns in articles (context managers close the handles promptly)
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)

    # read articles
    with open(os.path.join(inputPath, filename), "r") as articleFile:
        contentJson = json.load(articleFile)

    # A single parenthesised argument prints identically under Python 2's
    # print statement and Python 3's print function.
    print("Worker %d : Read %s into filter" % (jobid, filename))

    politicalPosition = ["Secretary", "Premier", "Mayor", "Captain", "Minister",
                         "Chief", "Governor", "General", "Ambassadors", "Member"]

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer

    # Keys are looked up below as "<revid>.txt", so dropping the last four
    # characters of each key yields the revid to query for.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0
    for ans in itr:
        count += 1

        key = "%s.txt" % (ans["revid"])                          # get revid
        targetName = projizz.getNamedEntityTokens(ans["_id"])    # get entity name's part
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        relaEx = []
        for line in ptnEx:
            # line[0]: line number, line[1]: array of patterns
            lineText = article[line[0]]

            # No target name in line text -> go to next line.
            if not any(namedToken in lineText for namedToken in targetName):
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID, [1]: start, [2]: end
                ptnId = "%d" % (ptn[0])
                rfp = table[ptnId]["relations"]

                # ignore non-used pattern
                if not table[ptnId]["used"]:
                    continue
                if "eval" in table[ptnId] and not table[ptnId]["eval"]:
                    continue

                # never seen pattern
                if ptnId not in st:
                    continue

                # NOTE(review): the original code handled the one-relation and
                # the multi-relation case in two branches with identical
                # bodies, and guarded a `continue` with a `foundPosition` flag
                # that was never set to True. Both are collapsed here without
                # changing which relations get extracted.
                if ("holdsPoliticalPosition" in rfp
                        and "holdsPoliticalPosition" not in relaEx
                        and any(position in lineText for position in politicalPosition)):
                    relaEx.append("holdsPoliticalPosition")

                # using the first relation of the pattern as the answer
                if st[ptnId][0][1]["support"] > 0 and rfp[0] not in relaEx:
                    relaEx.append(rfp[0])

        # Remove impossible relations: keep the special "" entries and those
        # whose domain is one of the entity's types.
        relaEx = [attribute for attribute in relaEx
                  if domainRange[attribute] == ""
                  or domainRange[attribute]["domain"] in types]

        # Evaluation
        for attribute in partAns:
            predicted = attribute in relaEx
            actual = attribute in relation
            if actual:
                bucket = "tp" if predicted else "fn"
            else:
                bucket = "fp" if predicted else "tn"
            partAns[attribute][bucket].append(ans["revid"])

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return partAns
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath):
    """Extract relations for every article of one chunk file and score them.

    Two JSON files with the same ``filename`` are loaded: the pattern
    matches from ``inputPtnPath`` and the article texts from ``inputPath``.
    The answers for those articles are fetched from the
    ``projizz.result.yago.answer`` MongoDB collection. For each answer the
    relations supported by the matched patterns are collected, relations
    whose domain is not among the entity's types are dropped, and the
    remainder is compared with ``ans["properties"]``.

    Args:
        jobid: Worker number, only used in progress messages.
        filename: Name of the chunk file present in both input directories.
        inputPtnPath: Directory of the pattern-match JSON files.
        model: Not used by this function; kept for the caller's signature.
        table: Mapping of pattern-id string to a dict providing
            ``"relations"``, ``"used"`` and optionally ``"eval"``.
        partAns: Mapping of relation name to a dict holding the lists
            ``"tp"``, ``"fn"``, ``"fp"`` and ``"tn"``. Mutated in place.
        st: Mapping of pattern-id string to statistics; only
            ``st[ptnId][0][1]["support"]`` is read.
        domainRange: Mapping of relation name to either ``""`` or a dict
            with a ``"domain"`` entry.
        inputPath: Directory of the article JSON files.

    Returns:
        The same ``partAns`` object, with the ``revid`` of every processed
        answer appended to exactly one of the four lists per relation.
    """
    # read patterns in articles (context managers close the handles promptly)
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)

    # read articles
    with open(os.path.join(inputPath, filename), "r") as articleFile:
        contentJson = json.load(articleFile)

    # A single parenthesised argument prints identically under Python 2's
    # print statement and Python 3's print function.
    print("Worker %d : Read %s into filter" % (jobid, filename))

    politicalPosition = [
        "Secretary", "Premier", "Mayor", "Captain", "Minister", "Chief",
        "Governor", "General", "Ambassadors", "Member"
    ]

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer

    # Keys are looked up below as "<revid>.txt", so dropping the last four
    # characters of each key yields the revid to query for.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0
    for ans in itr:
        count += 1

        key = "%s.txt" % (ans["revid"])  # get revid
        targetName = projizz.getNamedEntityTokens(
            ans["_id"])  # get entity name's part
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        relaEx = []
        for line in ptnEx:
            # line[0]: line number, line[1]: array of patterns
            lineText = article[line[0]]

            # No target name in line text -> go to next line.
            if not any(namedToken in lineText for namedToken in targetName):
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID, [1]: start, [2]: end
                ptnId = "%d" % (ptn[0])
                rfp = table[ptnId]["relations"]

                # ignore non-used pattern
                if not table[ptnId]["used"]:
                    continue
                if "eval" in table[ptnId] and not table[ptnId]["eval"]:
                    continue

                # never seen pattern
                if ptnId not in st:
                    continue

                # NOTE(review): the original code handled the one-relation and
                # the multi-relation case in two branches with identical
                # bodies, and guarded a `continue` with a `foundPosition` flag
                # that was never set to True. Both are collapsed here without
                # changing which relations get extracted.
                if ("holdsPoliticalPosition" in rfp
                        and "holdsPoliticalPosition" not in relaEx
                        and any(position in lineText
                                for position in politicalPosition)):
                    relaEx.append("holdsPoliticalPosition")

                # using the first relation of the pattern as the answer
                if st[ptnId][0][1]["support"] > 0 and rfp[0] not in relaEx:
                    relaEx.append(rfp[0])

        # Remove impossible relations: keep the special "" entries and those
        # whose domain is one of the entity's types.
        relaEx = [
            attribute for attribute in relaEx
            if domainRange[attribute] == ""
            or domainRange[attribute]["domain"] in types
        ]

        # Evaluation
        for attribute in partAns:
            predicted = attribute in relaEx
            actual = attribute in relation
            if actual:
                bucket = "tp" if predicted else "fn"
            else:
                bucket = "fp" if predicted else "tn"
            partAns[attribute][bucket].append(ans["revid"])

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return partAns
def updateAnswer(jobid, inputPath, filename):
    """Record, per article, which of its properties are observable in its text.

    The chunk file ``filename`` under ``inputPath`` is read with
    ``projizz.jsonRead``. For every matching document in the
    ``projizz.result.yago.answer`` collection that has at least one
    property, the facts with the same property and subject are fetched from
    ``projizz.yago.facts``. A property counts as observed when any
    named-entity token of any such fact's object occurs in the article
    text. Documents with at least one observed property get an
    ``"observed"`` list and are written back to the answer collection.

    Args:
        jobid: Worker number, only used in progress messages.
        inputPath: Directory containing the article JSON chunk.
        filename: Name of the chunk file to process.

    Returns:
        A tuple ``(filename, articles)`` where ``articles`` lists the
        ``"<revid>.txt"`` IDs of the documents that were updated.
    """
    contenJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # A single parenthesised argument prints identically under Python 2's
    # print statement and Python 3's print function.
    print("#%d - %s" % (jobid, filename))

    connect = Connection()
    answerCollection = connect.projizz.result.yago.answer
    factCollection = connect.projizz.yago.facts

    # Articles are looked up below as "<revid>.txt", so dropping the last
    # four characters of each key yields the revid to query for.
    queries = [name[:-4] for name in contenJson]
    itr = answerCollection.find({"revid": {"$in": queries}})
    print("#%d - query=%d,result=%d" % (jobid, len(queries), itr.count()))

    ty1g = 0  # articles skipped because they have no properties
    ty2g = 0  # articles skipped because no property was observed
    articles = []

    for ans in itr:
        articleID = "%s.txt" % (ans["revid"])
        articleName = ans["_id"]
        properties = ans["properties"]
        # not consider references.

        if len(properties) == 0:
            # give up those no properties' article
            ty1g += 1
            continue

        lines = projizz.articleSimpleSentenceFileter(contenJson[articleID])
        # Every sentence is followed by one space, exactly as the previous
        # incremental concatenation produced.
        text = "".join(line + " " for line in lines)

        observed = []
        for pro in properties:
            pitr = factCollection.find({
                "property": pro,
                "subject": articleName
            })
            if pitr.count() < 1:
                # The original appended `pro` to an undefined `notNeed` list
                # here, raising NameError; nothing read that list, so the
                # property is simply skipped.
                continue

            for fact in pitr:
                tokens = projizz.getNamedEntityTokens(fact["object"])
                if any(token in text for token in tokens):
                    observed.append(pro)
                    break

        if len(observed) > 0:
            articles.append(articleID)
            ans["observed"] = observed
            answerCollection.update({"revid": ans["revid"]}, ans, upsert=False)
        else:
            ty2g += 1

    print("#%d -> update %d (give up %d + %d)" % (jobid, len(articles), ty1g, ty2g))
    return (filename, articles)
def updateAnswer(jobid, inputPath, filename):
    """Record, per article, which of its properties are observable in its text.

    The chunk file ``filename`` under ``inputPath`` is read with
    ``projizz.jsonRead``. For every matching document in the
    ``projizz.result.yago.answer`` collection that has at least one
    property, the facts with the same property and subject are fetched from
    ``projizz.yago.facts``. A property counts as observed when any
    named-entity token of any such fact's object occurs in the article
    text. Documents with at least one observed property get an
    ``"observed"`` list and are written back to the answer collection.

    Args:
        jobid: Worker number, only used in progress messages.
        inputPath: Directory containing the article JSON chunk.
        filename: Name of the chunk file to process.

    Returns:
        A tuple ``(filename, articles)`` where ``articles`` lists the
        ``"<revid>.txt"`` IDs of the documents that were updated.
    """
    contenJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # A single parenthesised argument prints identically under Python 2's
    # print statement and Python 3's print function.
    print("#%d - %s" % (jobid, filename))

    connect = Connection()
    answerCollection = connect.projizz.result.yago.answer
    factCollection = connect.projizz.yago.facts

    # Articles are looked up below as "<revid>.txt", so dropping the last
    # four characters of each key yields the revid to query for.
    queries = [name[:-4] for name in contenJson]
    itr = answerCollection.find({"revid": {"$in": queries}})
    print("#%d - query=%d,result=%d" % (jobid, len(queries), itr.count()))

    ty1g = 0    # articles skipped because they have no properties
    ty2g = 0    # articles skipped because no property was observed
    articles = []

    for ans in itr:
        articleID = "%s.txt" % (ans["revid"])
        articleName = ans["_id"]
        properties = ans["properties"]
        # not consider references.

        if len(properties) == 0:
            # give up those no properties' article
            ty1g += 1
            continue

        lines = projizz.articleSimpleSentenceFileter(contenJson[articleID])
        # Every sentence is followed by one space, exactly as the previous
        # incremental concatenation produced.
        text = "".join(line + " " for line in lines)

        observed = []
        for pro in properties:
            pitr = factCollection.find({"property": pro, "subject": articleName})
            if pitr.count() < 1:
                # The original appended `pro` to an undefined `notNeed` list
                # here, raising NameError; nothing read that list, so the
                # property is simply skipped.
                continue

            for fact in pitr:
                tokens = projizz.getNamedEntityTokens(fact["object"])
                if any(token in text for token in tokens):
                    observed.append(pro)
                    break

        if len(observed) > 0:
            articles.append(articleID)
            ans["observed"] = observed
            answerCollection.update({"revid": ans["revid"]}, ans, upsert=False)
        else:
            ty2g += 1

    print("#%d -> update %d (give up %d + %d)" % (jobid, len(articles), ty1g, ty2g))
    return (filename, articles)