def tryToFindRela(jobid, filename, dataInputPath, ptnOutputPath, model, table): content = projizz.combinedFileReader(os.path.join(dataInputPath, filename)) print "Worker %d : Read %s into filter" % (jobid, filename) count = 0 dealL = 0 patternEx = {} for articleName in content: pattern = [] article = projizz.articleSimpleSentenceFileter(content[articleName]) lineCount = 0 for line in article: dealL += 1 tokens = projizz._posTagger.tag(line) patternExtracted = projizz.naiveExtractPatterns(tokens, model) if len(patternExtracted) > 0: pattern.append((lineCount, patternExtracted)) if dealL % 10000 == 0: print "Worker %d deal with %d lines." % (jobid, dealL) lineCount += 1 patternEx[articleName] = pattern count += 1 if count % 100 == 0: print "Worker %d deal with %d files" % (jobid, count) gc.collect() projizz.combinedFileWriter(patternEx, os.path.join(ptnOutputPath, filename)) print "Worker %d : Write results out to %s." % (jobid, filename)
def tryToFindRela(jobid, filename, dataInputPath, ptnOutputPath, model, table): content = projizz.combinedFileReader(os.path.join(dataInputPath,filename)) print "Worker %d : Read %s into filter" % (jobid,filename) count = 0 dealL = 0 patternEx = {} for articleName in content: pattern = [] article = projizz.articleSimpleSentenceFileter(content[articleName]) lineCount = 0 for line in article: dealL += 1 tokens = projizz._posTagger.tag(line) patternExtracted = projizz.naiveExtractPatterns(tokens,model) if len(patternExtracted) > 0: pattern.append((lineCount,patternExtracted)) if dealL % 10000 == 0: print "Worker %d deal with %d lines." % (jobid,dealL) lineCount += 1 patternEx[articleName] = pattern count += 1 if count % 100 == 0: print "Worker %d deal with %d files" % (jobid,count) gc.collect() projizz.combinedFileWriter(patternEx,os.path.join(ptnOutputPath,filename)) print "Worker %d : Write results out to %s." % (jobid,filename)
def tryToFindRela(jobid, filename, dataInputPath, resultOutPath, ptnOutputPath, model, tree): content = projizz.combinedFileReader(os.path.join(dataInputPath,filename)) print "Worker %d : Read %s into filter" % (jobid,filename) count = 0 dealL = 0 results = {} patternEx = {} for articleName in content: result = {} pattern = [] article = projizz.articleSimpleSentenceFileter(content[articleName]) for line in article: tokens = projizz._posTagger.tag(line) patternExtracted = projizz.naiveExtractPatterns(tokens,model) for ptnId,start,to in patternExtracted: dealL += 1 rels = tree[ptnId]["relations"] if len(rels) < 2: for r in rels: if not r in result: result[r] = 0 result[r] += 1 if not ptnId in pattern: pattern.append(ptnId) if dealL % 10000 == 0: print "Worker %d deal with %d lines." % (jobid,dealL) results[articleName] = result patternEx[articleName] = pattern count += 1 if count % 100 == 0: print "Worker %d deal with %d files" % (jobid,count) gc.collect() projizz.combinedFileWriter(results,os.path.join(resultOutPath,filename)) projizz.combinedFileWriter(patternEx,os.path.join(ptnOutputPath,filename)) print "Worker %d : Write results out to %s." % (jobid,filename)
def testing(filename): content = projizz.combinedFileReader(filename) model, table = projizz.readPrefixTreeModel("./../prefix_tree_model/patternTree.json") start_time = datetime.now() for articleName in content: print articleName article = projizz.articleSimpleSentenceFileter(content[articleName]) for line in article: tokens = projizz._posTagger.tag(line) patternExtracted = projizz.naiveExtractPatterns(tokens,model) if len(patternExtracted)>0: print line.encode("utf-8") for ptnId,start,to in patternExtracted: print "\t[%d] %s" % (ptnId,table[ptnId]["pattern"]) print "\n----" diff = datetime.now() - start_time print "Spend %d.%d seconds" % (diff.seconds,diff.microseconds)
def testing(filename): content = projizz.combinedFileReader(filename) model, table = projizz.readPrefixTreeModel( "./../prefix_tree_model/patternTree.json") start_time = datetime.now() for articleName in content: print articleName article = projizz.articleSimpleSentenceFileter(content[articleName]) for line in article: tokens = projizz._posTagger.tag(line) patternExtracted = projizz.naiveExtractPatterns(tokens, model) if len(patternExtracted) > 0: print line.encode("utf-8") for ptnId, start, to in patternExtracted: print "\t[%d] %s" % (ptnId, table[ptnId]["pattern"]) print "\n----" diff = datetime.now() - start_time print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def filterFunction(jobid,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath): # read patterns in articles contentPtnJson = json.load(open(os.path.join(inputPtnPath,filename),"r")) # read articles contentJson = json.load(open(os.path.join(inputPath,filename),"r")) print "Worker %d : Read %s into filter" % (jobid,filename) politicalPosition = ["Secretary","Premier","Mayor","Captain","Minister","Chief","Governor","General","Ambassadors","Member"] # connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer queries = map(lambda x: x[:-4], contentPtnJson) itr = collection.find({"revid":{"$in":queries}}) print "worker %d query=%d, result=%d" % (jobid,len(queries),itr.count()) count = 0 for ans in itr: count += 1 key = "%s.txt" % (ans["revid"]) # get revid targetName = projizz.getNamedEntityTokens(ans["_id"]) # get entity name's part types = ans["type"] # Now only consider properties, no references. relation = ans["properties"] ptnEx = contentPtnJson[key] article = projizz.articleSimpleSentenceFileter(contentJson[key]) relaEx = [] for line in ptnEx: # line[0]: line number lineText = article[line[0]] named = False for namedToken in targetName: if namedToken in lineText: named = True break if not named: # No target name in line text continue # go to next line. 
for ptn in line[1]: # line[1]: array of patterns ptnId = "%d" % (ptn[0]) # ptn[0]: pattern ID, [1]: start, [2]: end rfp = table[ptnId]["relations"] # ignore non-used pattern if not table[ptnId]["used"]: continue if "eval" in table[ptnId] and not table[ptnId]["eval"]: continue # never seen pattern if not ptnId in st: continue # if only one relation if len(rfp) < 2: if "holdsPoliticalPosition" in rfp: foundPosition = False for position in politicalPosition: if position in lineText and not "holdsPoliticalPosition" in relaEx: relaEx.append("holdsPoliticalPosition") break if foundPosition: continue if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx: relaEx.append(rfp[0]) # more than one relation else: if "holdsPoliticalPosition" in rfp: foundPosition = False for position in politicalPosition: if position in lineText and not "holdsPoliticalPosition" in relaEx: relaEx.append("holdsPoliticalPosition") break if foundPosition: continue # using the first as the answer if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx: relaEx.append(rfp[0]) # Remove impossible relations toBeRemove = [] for attribute in relaEx: # speical case, produced if domainRange[attribute] == "": continue if not domainRange[attribute]["domain"] in types: if not attribute in toBeRemove: toBeRemove.append(attribute) for attribute in toBeRemove: relaEx.remove(attribute) # Evaluation for attribute in partAns: postive = False true = False if attribute in relaEx: postive = True if attribute in relation: true = True if true: if postive: partAns[attribute]["tp"].append(ans["revid"]) else: partAns[attribute]["fn"].append(ans["revid"]) else: if postive: partAns[attribute]["fp"].append(ans["revid"]) else: partAns[attribute]["tn"].append(ans["revid"]) if count % 100 == 0: print "worker #%d done %d." % (jobid,count) return partAns
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData): # read articles and patterns contentJson = projizz.jsonRead(os.path.join(inputPath, filename)) contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename)) print "Worker %d : Read %s" % (jobid, filename) # connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer queries = map(lambda x: x[:-4], contentPtnJson) itr = collection.find({"revid": {"$in": queries}}) print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()) count = 0 expResult = {} relaEx = {} # set thresholds for th in range(0, 51, 5): expResult[th] = copy.deepcopy(partAns) relaEx[th] = [] print "worker %d build expResult" % (jobid) for ans in itr: count += 1 key = "%s.txt" % (ans["revid"]) # get revid #targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part types = ans["type"] # Now only consider properties, no references. relation = ans["observed"] # origin properties, 理論上應該會比 observed 還要多 originRela = ans["properties"] ptnEx = contentPtnJson[key] article = projizz.articleSimpleSentenceFileter(contentJson[key]) # TODO # Relation extraction for line in ptnEx: # line[0]: line number # line[1]: array of patterns for ptn in line[1]: # ptn[0]: pattern ID # ptn[1]: start position in line # ptn[2]: end position in line ptnId = "%d" % (ptn[0]) ptntks = table[ptnId]["pattern"] lineText = article[line[0]] if not projizz.isPatternValidate( ptnId, table, confidence=confidence, st=st): continue rfp = table[ptnId]["relations"] # check degree if len(rfp) > 5: continue # if no support, ignore this pattern if st[ptnId][0][1]["support"] <= 0: continue # TODO - Modlify string, remove pattern text in string? cosRlt = projizz.vsmSimilarity(lineText, vsmData, relas=rfp, ptntext=ptntks) # NOTE - if cosine value > threshold then there is a relation (?) 
for keyname in expResult: threshold = float(keyname) / 100.0 for pr in cosRlt: # Check type if domainRange[pr]["domain"] in types: if cosRlt[pr] > threshold: if pr not in relaEx[keyname]: relaEx[keyname].append(pr) #### Evaluation for keyname in expResult: for attribute in expResult[keyname]: # special case, ignore. if attribute == "produced": continue postive = False true = False if attribute in relaEx[keyname]: postive = True if attribute in relation: true = True if true: if postive: expResult[keyname][attribute]["tp"].append( ans["revid"]) else: expResult[keyname][attribute]["fn"].append( ans["revid"]) else: if postive: expResult[keyname][attribute]["fp"].append( ans["revid"]) else: # ignore true-negative pass if count % 100 == 0: print "worker #%d done %d." % (jobid, count) return expResult
def mapper(jobid,filename,inputPath,inputPtnPath,model,table): # Read article contentJson = projizz.jsonRead( os.path.join(inputPath,filename) ) # Read ptn contentPtnJson = projizz.jsonRead( os.path.join(inputPtnPath,filename) ) print "Worker %d : Read %s into filter" % (jobid,filename) ### Connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer queries = map(lambda x: x[:-4], contentPtnJson) itr = collection.find({"revid":{"$in":queries}}) print "worker %d query=%d, result=%d" % (jobid,len(queries),itr.count()) count = 0 supportInstanceByFile = {} for ans in itr: count += 1 key = "%s.txt" % (ans["revid"]) relation = ans["observed"] ptnEx = contentPtnJson[key] article = projizz.articleSimpleSentenceFileter(contentJson[key]) supportInstanceByFile[key] = {} for line in ptnEx: # line[0]: line number # line[1]: array of patterns lineText = article[line[0]] for ptn in line[1]: # ptn[0]: pattern ID # ptn[1]: start position in line # ptn[2]: end position in line ptnId = "%d" % (ptn[0]) if not projizz.isPatternValidate(ptnId, table): continue for rela in table[ptnId]["relations"]: # it's a support instance if rela in relation: if not ptnId in supportInstanceByFile[key]: supportInstanceByFile[key][ptnId] = {} if not rela in supportInstanceByFile[key][ptnId]: supportInstanceByFile[key][ptnId][rela] = [] if not line[0] in supportInstanceByFile[key][ptnId][rela]: supportInstanceByFile[key][ptnId][rela].append(line[0]) for ptnId in supportInstanceByFile[key]: for rela in supportInstanceByFile[key][ptnId]: lines = supportInstanceByFile[key][ptnId][rela] supportInstanceByFile[key][ptnId][rela] = [] for lineN in lines: supportInstanceByFile[key][ptnId][rela].append(article[lineN]) if count % 100 == 0: print "worker #%d done %d." % (jobid,count) return supportInstanceByFile
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, nbcPath): # read articles and patterns contentJson = projizz.jsonRead(os.path.join(inputPath, filename)) contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename)) classifiers = projizz.getNBClassifiers(nbcPath) print "Worker %d : Read %s" % (jobid, filename) # connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer queries = map(lambda x: x[:-4], contentPtnJson) itr = collection.find({"revid": {"$in": queries}}) print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()) count = 0 expResult = partAns relaEx = [] print "worker %d build expResult" % (jobid) for ans in itr: count += 1 key = "%s.txt" % (ans["revid"]) # get revid #targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part types = ans["type"] # Now only consider properties, no references. relation = ans["observed"] # origin properties, 理論上應該會比 observed 還要多 originRela = ans["properties"] ptnEx = contentPtnJson[key] article = projizz.articleSimpleSentenceFileter(contentJson[key]) # Relation extraction for line in ptnEx: # line[0]: line number # line[1]: array of patterns lineText = article[line[0]] if lineText[ 0] == "^": # It's a wikipeida reference comments, ignore it! 
continue for ptn in line[1]: # ptn[0]: pattern ID # ptn[1]: start position in line # ptn[2]: end position in line ptnId = "%d" % (ptn[0]) ptntks = table[ptnId]["pattern"] if not projizz.isPatternValidate( ptnId, table, confidence=confidence, st=st): continue rfp = table[ptnId]["relations"] # check degree if len(rfp) > 5: continue # if no support, ignore this pattern if st[ptnId][0][1]["support"] <= 0: continue for ptnst in st[ptnId]: # ptnst[0] = relation # ptnst[1] = {"support":,"total": } if domainRange[ptnst[0]] not in types: continue if classifiers[ptnst[0]] == None: continue if classifiers[ptnst[0]].classify(lineText) == "pos": if not ptnst[0] in relaEx: relaEx.append(ptnst[0]) #### Evaluation for attribute in expResult: # special case, ignore. if attribute == "produced": continue postive = False true = False if attribute in relaEx: postive = True if attribute in relation: true = True if true: if postive: expResult[attribute]["tp"].append(ans["revid"]) else: expResult[attribute]["fn"].append(ans["revid"]) else: if postive: expResult[attribute]["fp"].append(ans["revid"]) else: # ignore true-negative pass if count % 100 == 0: print "worker #%d done %d." % (jobid, count) return expResult
def filterFunction(jobid,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,nbcPath): # read patterns in articles contentPtnJson = json.load(open(os.path.join(inputPtnPath,filename),"r")) contentJson = projizz.jsonRead(os.path.join(inputPath,filename)) classifiers = projizz.getNBClassifiers(nbcPath) print "Worker %d : Read %s into filter" % (jobid,filename) # connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer queries = map(lambda x: x[:-4], contentPtnJson) itr = collection.find({"revid":{"$in":queries}}) print "worker %d query=%d, result=%d" % (jobid,len(queries),itr.count()) count = 0 # prepare keys for multiple-exp # degree: 1 ~ 5 # ambigu: select 1, select n (threshold:.5, .75), select all # type or not: no type info, type info expResult = {} for deg in range(1,6): for typ in ["n","t"]: if not deg == 1: for amb in ["one","50","75","all"]: keyname = "%d-%s-%s" % (deg,amb,typ) expResult[keyname] = copy.deepcopy(partAns) else: keyname = "%d-1-%s" % (deg,typ) expResult[keyname] = copy.deepcopy(partAns) print "worker %d build expResult" % (jobid) for ans in itr: count += 1 key = "%s.txt" % (ans["revid"]) # get revid #targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part types = ans["type"] # Now only consider properties, no references. 
relation = ans["observed"] # origin properties, 理論上應該會比 observed 還要多 originRela = ans["properties"] ptnEx = contentPtnJson[key] article = projizz.articleSimpleSentenceFileter(contentJson[key]) for keyname in expResult: args = keyname.split("-") degree = int(args[0]) ambigu = args[1] typ = args[2] # Relation extraction relaEx = [] ptnExRela = {} # rela: ptns def recordPtnMakeRela(ptnId,rela,record): if not rela in record: record[rela] = [] if not ptnId in record[rela]: record[rela].append(ptnId) for line in ptnEx: # line[0]: line number # line[1]: array of patterns lineText = article[line[0]] if lineText[0] == "^": # It's a wikipeida reference comments, ignore it! continue for ptn in line[1]: # ptn[0]: pattern ID # ptn[1]: start position in line # ptn[2]: end position in line ptnId = "%d" % (ptn[0]) # validate the pattern if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st): continue # get all possible relation of this pattern rfp = table[ptnId]["relations"] # check degree if len(rfp) > degree: continue # # Decide to choice relation # if len(rfp) == 1: # or degree == 1 if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx: if typ == "t": if domainRange[rfp[0]]["domain"] in types: pr = rfp[0] if not classifiers[pr] == None and classifiers[pr].classify(lineText) == "pos": relaEx.append(rfp[0]) # FIXME For error checking recordPtnMakeRela(ptnId, rfp[0], ptnExRela) else: pr = rfp[0] if not classifiers[pr] == None and classifiers[pr].classify(lineText) == "pos": relaEx.append(rfp[0]) # FIXME For error checking recordPtnMakeRela(ptnId, rfp[0], ptnExRela) else: if ambigu == "one": if typ == "t": for ptnst in st[ptnId]: # ptnst[0] = relation # ptnst[1] = {"support": , "total": } if ptnst[1]["support"] > 0 and domainRange[ptnst[0]]["domain"] in types: if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos": relaEx.append(ptnst[0]) # FIXME For error checking recordPtnMakeRela(ptnId, 
ptnst[0], ptnExRela) break else: if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx and not classifiers[rfp[0]] == None and classifiers[rfp[0]].classify(lineText) == "pos": relaEx.append(rfp[0]) # FIXME For error checking recordPtnMakeRela(ptnId, rfp[0], ptnExRela) elif ambigu == "all": for ptnst in st[ptnId]: if typ == "t": if domainRange[ptnst[0]]["domain"] in types: if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos": relaEx.append(ptnst[0]) # FIXME For error checking recordPtnMakeRela(ptnId, ptnst[0], ptnExRela) else: if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos": relaEx.append(ptnst[0]) # FIXME For error checking recordPtnMakeRela(ptnId, ptnst[0], ptnExRela) else: th = 0.75 if ambigu == "50": th = 0.5 b = st[ptnId][0][1]["support"] if b > 0: for ptnst in st[ptnId]: if float(ptnst[1]["support"])/float(b) >= th: if typ == "t": if domainRange[ptnst[0]]["domain"] in types and not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos": relaEx.append(ptnst[0]) # FIXME For error checking recordPtnMakeRela(ptnId, ptnst[0], ptnExRela) else: if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos": relaEx.append(ptnst[0]) # FIXME For error checking recordPtnMakeRela(ptnId, ptnst[0], ptnExRela) # Evaluation for attribute in expResult[keyname]: # special case, ignore. 
if attribute == "produced": continue postive = False true = False if attribute in relaEx: postive = True if attribute in relation: true = True if true: if postive: expResult[keyname][attribute]["tp"].append(ans["revid"]) else: expResult[keyname][attribute]["fn"].append(ans["revid"]) else: if postive: # False Positive expResult[keyname][attribute]["fp"].append(ans["revid"]) # TODO - 分析錯誤原因 if attribute in ptnExRela: if attribute in originRela: # type 2 error expResult[keyname][attribute]["et2"].append(ans["revid"]) else: found = False ptns = ptnExRela[attribute] # get the patterns raise the Relation for pid in ptns: for psbR in table[pid]["relations"]: if psbR == attribute: continue # here means that the pattern can raise a `correct' relation in answer, may it choice or not if domainRange[psbR]["domain"] in types and psbR in relation: found = True break if found: # type 1 error expResult[keyname][attribute]["et1"].append(ans["revid"]) else: # type 3 error expResult[keyname][attribute]["et3"].append(ans["revid"]) else: # 這是什麼情況?@@ 這種狀況基`本不可能發生吧XD pass else: # ignore true-negative pass if count % 100 == 0: print "worker #%d done %d." % (jobid,count) return expResult
def main(part,revid): # Paths (on NLG workstation) inputPath = "/tmp2/ccli/y-part-%s/" % (part) inputPtnPath = "/tmp2/ccli/y-ptn-part-%s/" % (part) spPath = "../yago/yagoPSv2/ps.%s.json" % (part) # connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer itr = collection.find({"revid":revid}) # find filename a = os.popen("grep -nr \"%s\" %s" % (revid,inputPath)).readline() targetFilename = a.split(":")[0].split("/")[-1] key = "%s.txt" % (revid) pattern = projizz.jsonRead(inputPtnPath+targetFilename)[key] article = projizz.articleSimpleSentenceFileter(projizz.jsonRead(inputPath+targetFilename)[key]) st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath)) domainRange = projizz.getYagoRelationDomainRange(); model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../yago/yagoPatternTree.table") print "Part %s, RevID=%s, in %s" % (part,revid,targetFilename) for ans in itr: targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part types = ans["type"] answers = ans["observed"] print "Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"),targetName) print "Type=%s" % (types) print "Answer=%s" % (answers) for line in pattern: lineText = article[line[0]] named = False for namedToken in targetName: if namedToken in lineText: named = True break if not named: # No target name in line text continue # go to next line. for ptn in line[1]: ptnId = "%d" % (ptn[0]) #rfp = table[ptnId]["relations"] if not ptnId in st: continue for ps in st[ptnId]: if float(ps[1]["support"])/float(ps[1]["total"]) > 0: if domainRange[ps[0]]["domain"] in types: print "#%d" % (line[0]),lineText.encode("utf-8") isIn = "(X)" if ps[0] in answers: isIn = "(O)" print "%s %s/%s/{%d,%d}/ %s" % (isIn,ptnId,table[ptnId]["pattern"],ps[1]["support"],ps[1]["total"],ps[0]) pass # select top 1 break # prevent second ans break
def mapper(jobid, filename, inputPath, inputPtnPath, model, table): # Read article contentJson = projizz.jsonRead(os.path.join(inputPath, filename)) # Read ptn contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename)) print "Worker %d : Read %s into filter" % (jobid, filename) ### Connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer queries = map(lambda x: x[:-4], contentPtnJson) itr = collection.find({"revid": {"$in": queries}}) print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()) count = 0 supportInstanceByFile = {} for ans in itr: count += 1 key = "%s.txt" % (ans["revid"]) relation = ans["observed"] ptnEx = contentPtnJson[key] article = projizz.articleSimpleSentenceFileter(contentJson[key]) supportInstanceByFile[key] = {} for line in ptnEx: # line[0]: line number # line[1]: array of patterns lineText = article[line[0]] for ptn in line[1]: # ptn[0]: pattern ID # ptn[1]: start position in line # ptn[2]: end position in line ptnId = "%d" % (ptn[0]) if not projizz.isPatternValidate(ptnId, table): continue for rela in table[ptnId]["relations"]: # it's a support instance if rela in relation: if not ptnId in supportInstanceByFile[key]: supportInstanceByFile[key][ptnId] = {} if not rela in supportInstanceByFile[key][ptnId]: supportInstanceByFile[key][ptnId][rela] = [] if not line[0] in supportInstanceByFile[key][ptnId][ rela]: supportInstanceByFile[key][ptnId][rela].append( line[0]) for ptnId in supportInstanceByFile[key]: for rela in supportInstanceByFile[key][ptnId]: lines = supportInstanceByFile[key][ptnId][rela] supportInstanceByFile[key][ptnId][rela] = [] for lineN in lines: supportInstanceByFile[key][ptnId][rela].append( article[lineN]) if count % 100 == 0: print "worker #%d done %d." % (jobid, count) return supportInstanceByFile
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath, confidence, nbcPath): # read patterns in articles contentPtnJson = json.load(open(os.path.join(inputPtnPath, filename), "r")) contentJson = projizz.jsonRead(os.path.join(inputPath, filename)) classifiers = projizz.getNBClassifiers(nbcPath) print "Worker %d : Read %s into filter" % (jobid, filename) # connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer queries = map(lambda x: x[:-4], contentPtnJson) itr = collection.find({"revid": {"$in": queries}}) print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()) count = 0 # prepare keys for multiple-exp # degree: 1 ~ 5 # ambigu: select 1, select n (threshold:.5, .75), select all # type or not: no type info, type info expResult = {} for deg in range(1, 6): for typ in ["n", "t"]: if not deg == 1: for amb in ["one", "50", "75", "all"]: keyname = "%d-%s-%s" % (deg, amb, typ) expResult[keyname] = copy.deepcopy(partAns) else: keyname = "%d-1-%s" % (deg, typ) expResult[keyname] = copy.deepcopy(partAns) print "worker %d build expResult" % (jobid) for ans in itr: count += 1 key = "%s.txt" % (ans["revid"]) # get revid #targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part types = ans["type"] # Now only consider properties, no references. 
relation = ans["observed"] # origin properties, 理論上應該會比 observed 還要多 originRela = ans["properties"] ptnEx = contentPtnJson[key] article = projizz.articleSimpleSentenceFileter(contentJson[key]) for keyname in expResult: args = keyname.split("-") degree = int(args[0]) ambigu = args[1] typ = args[2] # Relation extraction relaEx = [] ptnExRela = {} # rela: ptns def recordPtnMakeRela(ptnId, rela, record): if not rela in record: record[rela] = [] if not ptnId in record[rela]: record[rela].append(ptnId) for line in ptnEx: # line[0]: line number # line[1]: array of patterns lineText = article[line[0]] if lineText[ 0] == "^": # It's a wikipeida reference comments, ignore it! continue for ptn in line[1]: # ptn[0]: pattern ID # ptn[1]: start position in line # ptn[2]: end position in line ptnId = "%d" % (ptn[0]) # validate the pattern if not projizz.isPatternValidate( ptnId, table, confidence=confidence, st=st): continue # get all possible relation of this pattern rfp = table[ptnId]["relations"] # check degree if len(rfp) > degree: continue # # Decide to choice relation # if len(rfp) == 1: # or degree == 1 if st[ptnId][0][1]["support"] > 0 and not rfp[ 0] in relaEx: if typ == "t": if domainRange[rfp[0]]["domain"] in types: pr = rfp[0] if not classifiers[ pr] == None and classifiers[ pr].classify( lineText) == "pos": relaEx.append(rfp[0]) # FIXME For error checking recordPtnMakeRela( ptnId, rfp[0], ptnExRela) else: pr = rfp[0] if not classifiers[pr] == None and classifiers[ pr].classify(lineText) == "pos": relaEx.append(rfp[0]) # FIXME For error checking recordPtnMakeRela(ptnId, rfp[0], ptnExRela) else: if ambigu == "one": if typ == "t": for ptnst in st[ptnId]: # ptnst[0] = relation # ptnst[1] = {"support": , "total": } if ptnst[1]["support"] > 0 and domainRange[ ptnst[0]]["domain"] in types: if not ptnst[ 0] in relaEx and not classifiers[ ptnst[ 0]] == None and classifiers[ ptnst[0]].classify( lineText ) == "pos": relaEx.append(ptnst[0]) # FIXME For error checking 
recordPtnMakeRela( ptnId, ptnst[0], ptnExRela) break else: if st[ptnId][0][1]["support"] > 0 and not rfp[ 0] in relaEx and not classifiers[ rfp[0]] == None and classifiers[ rfp[0]].classify( lineText) == "pos": relaEx.append(rfp[0]) # FIXME For error checking recordPtnMakeRela(ptnId, rfp[0], ptnExRela) elif ambigu == "all": for ptnst in st[ptnId]: if typ == "t": if domainRange[ ptnst[0]]["domain"] in types: if not ptnst[ 0] in relaEx and not classifiers[ ptnst[ 0]] == None and classifiers[ ptnst[0]].classify( lineText ) == "pos": relaEx.append(ptnst[0]) # FIXME For error checking recordPtnMakeRela( ptnId, ptnst[0], ptnExRela) else: if not ptnst[ 0] in relaEx and not classifiers[ ptnst[ 0]] == None and classifiers[ ptnst[0]].classify( lineText) == "pos": relaEx.append(ptnst[0]) # FIXME For error checking recordPtnMakeRela( ptnId, ptnst[0], ptnExRela) else: th = 0.75 if ambigu == "50": th = 0.5 b = st[ptnId][0][1]["support"] if b > 0: for ptnst in st[ptnId]: if float(ptnst[1]["support"]) / float( b) >= th: if typ == "t": if domainRange[ptnst[0]][ "domain"] in types and not ptnst[ 0] in relaEx and not classifiers[ ptnst[0]] == None and classifiers[ ptnst[ 0]].classify( lineText ) == "pos": relaEx.append(ptnst[0]) # FIXME For error checking recordPtnMakeRela( ptnId, ptnst[0], ptnExRela) else: if not ptnst[0] in relaEx and not classifiers[ ptnst[ 0]] == None and classifiers[ ptnst[0]].classify( lineText ) == "pos": relaEx.append(ptnst[0]) # FIXME For error checking recordPtnMakeRela( ptnId, ptnst[0], ptnExRela) # Evaluation for attribute in expResult[keyname]: # special case, ignore. 
if attribute == "produced": continue postive = False true = False if attribute in relaEx: postive = True if attribute in relation: true = True if true: if postive: expResult[keyname][attribute]["tp"].append( ans["revid"]) else: expResult[keyname][attribute]["fn"].append( ans["revid"]) else: if postive: # False Positive expResult[keyname][attribute]["fp"].append( ans["revid"]) # TODO - 分析錯誤原因 if attribute in ptnExRela: if attribute in originRela: # type 2 error expResult[keyname][attribute]["et2"].append( ans["revid"]) else: found = False ptns = ptnExRela[ attribute] # get the patterns raise the Relation for pid in ptns: for psbR in table[pid]["relations"]: if psbR == attribute: continue # here means that the pattern can raise a `correct' relation in answer, may it choice or not if domainRange[psbR][ "domain"] in types and psbR in relation: found = True break if found: # type 1 error expResult[keyname][attribute][ "et1"].append(ans["revid"]) else: # type 3 error expResult[keyname][attribute][ "et3"].append(ans["revid"]) else: # 這是什麼情況?@@ 這種狀況基`本不可能發生吧XD pass else: # ignore true-negative pass if count % 100 == 0: print "worker #%d done %d." % (jobid, count) return expResult
def mapper(jobid, filename, inputPath, inputPtnPath, model, table, confidence):
    """Collect training lines for per-relation binary classifiers.

    For every article in `filename`, every extracted pattern occurrence is
    sorted into positive lines (pattern relation observed for the entity)
    and negative lines.  Returns four dicts keyed by relation name:
    (linesByRelations, linesNoRelaByRelations, POS, NEG).
    NOTE(review): `model` is accepted but unused here — kept for interface
    parity with the sibling workers; confirm before removing.
    """
    # Read article
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # Read ptn
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    print "Worker %d : Read %s into filter" % (jobid, filename)

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # pattern-file keys look like "<revid>.txt"; strip ".txt" for the query
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    supportInstanceByFile = {}
    linesByRelations = {}        # relation -> [line text with pattern tokens removed]
    linesNoRelaByRelations = {}  # same, for lines whose relation was NOT observed
    POS = {}                     # relation -> [{"text":..., "label":"pos"}]
    NEG = {}                     # relation -> [{"text":..., "label":"neg"}]
    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])
        supportInstanceByFile[key] = {}
        linesByRela = {}    # per-article: relation -> lineNo -> [ptnId]
        linesByNoRela = {}
        pos = {}            # per-article: relation -> [lineNo]
        neg = {}
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])
                if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                    continue
                # give up degree > 5 's pattern
                if len(table[ptnId]["relations"]) > 5:
                    continue
                for rela in table[ptnId]["relations"]:
                    # it's a support instance
                    if rela in relation:
                        # NOTE - remove pattern text.
                        if not rela in linesByRela:
                            linesByRela[rela] = {}
                        if not line[0] in linesByRela[rela]:
                            linesByRela[rela][line[0]] = []
                        if not ptnId in linesByRela[rela][line[0]]:
                            linesByRela[rela][line[0]].append(ptnId)
                        # For binary classifier
                        if not rela in pos:
                            pos[rela] = []
                        # lines starting with "^" are wikipedia reference comments
                        if not lineText[0] == "^" and line[0] not in pos[rela]:
                            pos[rela].append(line[0])
                    else:
                        if not rela in linesByNoRela:
                            linesByNoRela[rela] = {}
                        if not line[0] in linesByNoRela[rela]:
                            linesByNoRela[rela][line[0]] = []
                        if not ptnId in linesByNoRela[rela][line[0]]:
                            linesByNoRela[rela][line[0]].append(ptnId)
                        # For binary classifier
                        if not rela in neg:
                            neg[rela] = []
                        if not lineText[0] == "^" and line[0] not in neg[rela]:
                            neg[rela].append(line[0])
        # Fold per-article lines into the global result, stripping the
        # pattern's own tokens so the classifier sees only context words.
        for rela in linesByRela:
            if not rela in linesByRelations:
                linesByRelations[rela] = []
            for lineN in linesByRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesByRelations[rela].append(l)
        for rela in linesByNoRela:
            if not rela in linesNoRelaByRelations:
                linesNoRelaByRelations[rela] = []
            for lineN in linesByNoRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByNoRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesNoRelaByRelations[rela].append(l)
        # For binary classifier
        for rela in pos:
            if not rela in POS:
                POS[rela] = []
            for lineN in pos[rela]:
                POS[rela].append({"text": article[lineN], "label": "pos"})
        for rela in neg:
            if not rela in NEG:
                NEG[rela] = []
            for lineN in neg[rela]:
                NEG[rela].append({"text": article[lineN], "label": "neg"})
        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)
    return linesByRelations, linesNoRelaByRelations, POS, NEG
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath): # read patterns in articles contentPtnJson = json.load(open(os.path.join(inputPtnPath, filename), "r")) # read articles contentJson = json.load(open(os.path.join(inputPath, filename), "r")) print "Worker %d : Read %s into filter" % (jobid, filename) politicalPosition = [ "Secretary", "Premier", "Mayor", "Captain", "Minister", "Chief", "Governor", "General", "Ambassadors", "Member" ] # connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer queries = map(lambda x: x[:-4], contentPtnJson) itr = collection.find({"revid": {"$in": queries}}) print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()) count = 0 for ans in itr: count += 1 key = "%s.txt" % (ans["revid"]) # get revid targetName = projizz.getNamedEntityTokens( ans["_id"]) # get entity name's part types = ans["type"] # Now only consider properties, no references. relation = ans["properties"] ptnEx = contentPtnJson[key] article = projizz.articleSimpleSentenceFileter(contentJson[key]) relaEx = [] for line in ptnEx: # line[0]: line number lineText = article[line[0]] named = False for namedToken in targetName: if namedToken in lineText: named = True break if not named: # No target name in line text continue # go to next line. 
for ptn in line[1]: # line[1]: array of patterns ptnId = "%d" % (ptn[0] ) # ptn[0]: pattern ID, [1]: start, [2]: end rfp = table[ptnId]["relations"] # ignore non-used pattern if not table[ptnId]["used"]: continue if "eval" in table[ptnId] and not table[ptnId]["eval"]: continue # never seen pattern if not ptnId in st: continue # if only one relation if len(rfp) < 2: if "holdsPoliticalPosition" in rfp: foundPosition = False for position in politicalPosition: if position in lineText and not "holdsPoliticalPosition" in relaEx: relaEx.append("holdsPoliticalPosition") break if foundPosition: continue if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx: relaEx.append(rfp[0]) # more than one relation else: if "holdsPoliticalPosition" in rfp: foundPosition = False for position in politicalPosition: if position in lineText and not "holdsPoliticalPosition" in relaEx: relaEx.append("holdsPoliticalPosition") break if foundPosition: continue # using the first as the answer if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx: relaEx.append(rfp[0]) # Remove impossible relations toBeRemove = [] for attribute in relaEx: # speical case, produced if domainRange[attribute] == "": continue if not domainRange[attribute]["domain"] in types: if not attribute in toBeRemove: toBeRemove.append(attribute) for attribute in toBeRemove: relaEx.remove(attribute) # Evaluation for attribute in partAns: postive = False true = False if attribute in relaEx: postive = True if attribute in relation: true = True if true: if postive: partAns[attribute]["tp"].append(ans["revid"]) else: partAns[attribute]["fn"].append(ans["revid"]) else: if postive: partAns[attribute]["fp"].append(ans["revid"]) else: partAns[attribute]["tn"].append(ans["revid"]) if count % 100 == 0: print "worker #%d done %d." % (jobid, count) return partAns
def updateAnswer(jobid,inputPath,filename): contenJson = projizz.jsonRead(os.path.join(inputPath,filename)) print "#%d - %s" % (jobid,filename) connect = Connection() answerCollection = connect.projizz.result.yago.answer factCollection = connect.projizz.yago.facts queries = map(lambda x: x[:-4], contenJson) itr = answerCollection.find({"revid":{"$in":queries}}) print "#%d - query=%d,result=%d" % (jobid,len(queries),itr.count()) count = 0 ty1g = 0 ty2g = 0 updateC = 0 articles = [] for ans in itr: count += 1 articleID = "%s.txt" % (ans["revid"]) articleName = ans["_id"] properties = ans["properties"] #not consider references. #references = ans["references"] if len(properties) == 0: # give up those no properties' article # print "#%d - give up %s (1)" % (jobid,articleID) ty1g += 1 continue needUpdate = len(properties) lines = projizz.articleSimpleSentenceFileter(contenJson[articleID]) text = "" for line in lines: text += (line + " ") observed = [] for pro in properties: pitr = factCollection.find({"property":pro,"subject":articleName}) if pitr.count() < 1: notNeed.append(pro) continue found = False for fact in pitr: tokens = projizz.getNamedEntityTokens(fact["object"]) for token in tokens: if token in text: found = True break if found: break if found: observed.append(pro) if len(observed) > 0: articles.append(articleID) ans["observed"] = observed answerCollection.update({"revid":ans["revid"]},ans,upsert=False) else: ty2g += 1 #print "#%d - give up %s (2)" % (jobid,articleID) print "#%d -> update %d (give up %d + %d)" % (jobid,len(articles),ty1g,ty2g) return (filename,articles)
def mapper(jobid, filename, inputPath, inputPtnPath, model, table, confidence):
    """Collect pos/neg training lines per relation (duplicate variant).

    NOTE(review): this is functionally the same routine as the earlier
    `mapper(jobid, filename, ..., model, table, confidence)` in this file;
    the file looks like a concatenation of several script versions.
    Returns (linesByRelations, linesNoRelaByRelations, POS, NEG).
    """
    # Read article
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # Read ptn
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    print "Worker %d : Read %s into filter" % (jobid, filename)

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # strip ".txt" from article filenames to get revids
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    supportInstanceByFile = {}
    linesByRelations = {}        # relation -> cleaned positive line texts
    linesNoRelaByRelations = {}  # relation -> cleaned negative line texts
    POS = {}                     # relation -> labeled pos examples
    NEG = {}                     # relation -> labeled neg examples
    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])
        supportInstanceByFile[key] = {}
        linesByRela = {}
        linesByNoRela = {}
        pos = {}
        neg = {}
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])
                if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                    continue
                # give up degree > 5 's pattern
                if len(table[ptnId]["relations"]) > 5:
                    continue
                for rela in table[ptnId]["relations"]:
                    # it's a support instance
                    if rela in relation:
                        # NOTE - remove pattern text.
                        if not rela in linesByRela:
                            linesByRela[rela] = {}
                        if not line[0] in linesByRela[rela]:
                            linesByRela[rela][line[0]] = []
                        if not ptnId in linesByRela[rela][line[0]]:
                            linesByRela[rela][line[0]].append(ptnId)
                        # For binary classifier
                        if not rela in pos:
                            pos[rela] = []
                        # "^"-prefixed lines are wikipedia reference comments
                        if not lineText[0] == "^" and line[0] not in pos[rela]:
                            pos[rela].append(line[0])
                    else:
                        if not rela in linesByNoRela:
                            linesByNoRela[rela] = {}
                        if not line[0] in linesByNoRela[rela]:
                            linesByNoRela[rela][line[0]] = []
                        if not ptnId in linesByNoRela[rela][line[0]]:
                            linesByNoRela[rela][line[0]].append(ptnId)
                        # For binary classifier
                        if not rela in neg:
                            neg[rela] = []
                        if not lineText[0] == "^" and line[0] not in neg[rela]:
                            neg[rela].append(line[0])
        # merge per-article data, removing the pattern's own tokens
        for rela in linesByRela:
            if not rela in linesByRelations:
                linesByRelations[rela] = []
            for lineN in linesByRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesByRelations[rela].append(l)
        for rela in linesByNoRela:
            if not rela in linesNoRelaByRelations:
                linesNoRelaByRelations[rela] = []
            for lineN in linesByNoRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByNoRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesNoRelaByRelations[rela].append(l)
        # For binary classifier
        for rela in pos:
            if not rela in POS:
                POS[rela] = []
            for lineN in pos[rela]:
                POS[rela].append({"text": article[lineN], "label": "pos"})
        for rela in neg:
            if not rela in NEG:
                NEG[rela] = []
            for lineN in neg[rela]:
                NEG[rela].append({"text": article[lineN], "label": "neg"})
        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)
    return linesByRelations, linesNoRelaByRelations, POS, NEG
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath, confidence):
    """Run the degree/ambiguity/type experiment grid over one article file.

    Builds an experiment key for each combination of
      degree 1..5  x  ambiguity {one, 50, 75, all}  x  type-filter {n, t}
    extracts relations per article under each setting and records tp/fn/fp
    revids into a deep copy of `partAns` per key.  Returns the dict of
    per-experiment results.
    """
    # read patterns in articles
    contentPtnJson = json.load(open(os.path.join(inputPtnPath, filename), "r"))
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    print "Worker %d : Read %s into filter" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1, 6):
        for typ in ["n", "t"]:
            if not deg == 1:
                for amb in ["one", "50", "75", "all"]:
                    keyname = "%d-%s-%s" % (deg, amb, typ)
                    expResult[keyname] = copy.deepcopy(partAns)
            else:
                # degree 1 has no ambiguity choice
                keyname = "%d-1-%s" % (deg, typ)
                expResult[keyname] = copy.deepcopy(partAns)
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # origin properties, expected to be a superset of `observed`
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:
            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            typ = args[2]

            # Relation extraction
            relaEx = []
            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns
                lineText = article[line[0]]
                if lineText[0] == "^":
                    # It's a wikipeida reference comments, ignore it!
                    continue
                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line
                    ptnId = "%d" % (ptn[0])
                    if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                        continue
                    rfp = table[ptnId]["relations"]
                    # check degree
                    if len(rfp) > degree:
                        continue
                    if len(rfp) == 1:  # or degree == 1
                        if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                            if typ == "t":
                                if domainRange[rfp[0]]["domain"] in types:
                                    relaEx.append(rfp[0])
                            else:
                                relaEx.append(rfp[0])
                    else:
                        if ambigu == "one":
                            if typ == "t":
                                for ptnst in st[ptnId]:
                                    # ptnst[0] = relation
                                    # ptnst[1] = {"support": , "total": }
                                    if ptnst[1]["support"] > 0 and domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                        # take only the top-1 compatible relation
                                        break
                            else:
                                if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                                    relaEx.append(rfp[0])
                        elif ambigu == "all":
                            for ptnst in st[ptnId]:
                                if typ == "t":
                                    if domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                else:
                                    if not ptnst[0] in relaEx:
                                        relaEx.append(ptnst[0])
                        else:
                            # ambigu "50"/"75": keep relations whose support is
                            # within the threshold fraction of the best support
                            th = 0.75
                            if ambigu == "50":
                                th = 0.5
                            b = st[ptnId][0][1]["support"]
                            if b > 0:
                                for ptnst in st[ptnId]:
                                    if float(ptnst[1]["support"]) / float(b) >= th:
                                        if typ == "t":
                                            if domainRange[ptnst[0]]["domain"] in types and not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])
                                        else:
                                            if not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])

            # Evaluation
            for attribute in expResult[keyname]:
                # special case, ignore.
                if attribute == "produced":
                    continue
                postive = False
                true = False
                if attribute in relaEx:
                    postive = True
                if attribute in relation:
                    true = True
                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                    else:
                        # ignore true-negative
                        pass
        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)
    return expResult
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData):
    """VSM-based relation extraction experiment over cosine thresholds 0..0.50.

    For each valid pattern occurrence, computes cosine similarity between the
    line and each candidate relation's vector-space model; a relation is
    extracted when its similarity exceeds the threshold.  Results (tp/fn/fp
    revid lists) are kept per threshold in deep copies of `partAns`.
    """
    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    print "Worker %d : Read %s" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    expResult = {}
    relaEx = {}
    # set thresholds (integer percent: 0, 5, ..., 50)
    for th in range(0, 51, 5):
        expResult[th] = copy.deepcopy(partAns)
        # NOTE(review): relaEx[th] is never cleared inside the answer loop,
        # so extractions accumulate across articles; the sibling
        # filterFunction resets per article — confirm whether this is intended.
        relaEx[th] = []
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # origin properties, expected to be a superset of `observed`
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # TODO
        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])
                ptntks = table[ptnId]["pattern"]
                lineText = article[line[0]]
                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue
                rfp = table[ptnId]["relations"]
                # check degree
                if len(rfp) > 5:
                    continue
                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue
                # TODO - Modlify string, remove pattern text in string?
                cosRlt = projizz.vsmSimilarity(lineText, vsmData, relas=rfp, ptntext=ptntks)
                # NOTE - if cosine value > threshold then there is a relation (?)
                for keyname in expResult:
                    threshold = float(keyname) / 100.0
                    for pr in cosRlt:
                        # Check type
                        if domainRange[pr]["domain"] in types:
                            if cosRlt[pr] > threshold:
                                if pr not in relaEx[keyname]:
                                    relaEx[keyname].append(pr)

        #### Evaluation
        for keyname in expResult:
            for attribute in expResult[keyname]:
                # special case, ignore.
                if attribute == "produced":
                    continue
                postive = False
                true = False
                if attribute in relaEx[keyname]:
                    postive = True
                if attribute in relation:
                    true = True
                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                    else:
                        # ignore true-negative
                        pass
        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)
    return expResult
def main(part, revid): # Paths (on NLG workstation) inputPath = "/tmp2/ccli/yago-part-%s/" % (part) inputPtnPath = "/tmp2/ccli/yago-ptn-part-%s/" % (part) spPath = "../yago/yagoPSv1/ps.%s.json" % (part) # connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer itr = collection.find({"revid": revid}) # find filename a = os.popen("grep -nr \"%s\" %s" % (revid, inputPath)).readline() targetFilename = a.split(":")[0].split("/")[-1] key = "%s.txt" % (revid) pattern = projizz.jsonRead(inputPtnPath + targetFilename)[key] article = projizz.articleSimpleSentenceFileter( projizz.jsonRead(inputPath + targetFilename)[key]) st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath)) domainRange = projizz.getYagoRelationDomainRange() model, table = projizz.readPrefixTreeModelWithTable( "../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table") print "Part %s, RevID=%s, in %s" % (part, revid, targetFilename) for ans in itr: targetName = ans["_id"].replace("(", "").replace(")", "").split( "_") # get entity name's part types = ans["type"] answers = ans["properties"] print "Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"), targetName) print "Type=%s" % (types) print "Answer=%s" % (answers) for line in pattern: lineText = article[line[0]] named = False for namedToken in targetName: if namedToken in lineText: named = True break if not named: # No target name in line text continue # go to next line. for ptn in line[1]: ptnId = "%d" % (ptn[0]) #rfp = table[ptnId]["relations"] if not ptnId in st: continue for ps in st[ptnId]: if float(ps[1]["support"]) / float(ps[1]["total"]) > 0: if domainRange[ps[0]]["domain"] in types: print "#%d" % (line[0]), lineText.encode("utf-8") isIn = "(X)" if ps[0] in answers: isIn = "(O)" print "%s %s/%s/{%d,%d}/ %s" % ( isIn, ptnId, table[ptnId]["pattern"], ps[1]["support"], ps[1]["total"], ps[0]) pass # select top 1 break # prevent second ans break
def updateAnswer(jobid, inputPath, filename): contenJson = projizz.jsonRead(os.path.join(inputPath, filename)) print "#%d - %s" % (jobid, filename) connect = Connection() answerCollection = connect.projizz.result.yago.answer factCollection = connect.projizz.yago.facts queries = map(lambda x: x[:-4], contenJson) itr = answerCollection.find({"revid": {"$in": queries}}) print "#%d - query=%d,result=%d" % (jobid, len(queries), itr.count()) count = 0 ty1g = 0 ty2g = 0 updateC = 0 articles = [] for ans in itr: count += 1 articleID = "%s.txt" % (ans["revid"]) articleName = ans["_id"] properties = ans["properties"] #not consider references. #references = ans["references"] if len(properties) == 0: # give up those no properties' article # print "#%d - give up %s (1)" % (jobid,articleID) ty1g += 1 continue needUpdate = len(properties) lines = projizz.articleSimpleSentenceFileter(contenJson[articleID]) text = "" for line in lines: text += (line + " ") observed = [] for pro in properties: pitr = factCollection.find({ "property": pro, "subject": articleName }) if pitr.count() < 1: notNeed.append(pro) continue found = False for fact in pitr: tokens = projizz.getNamedEntityTokens(fact["object"]) for token in tokens: if token in text: found = True break if found: break if found: observed.append(pro) if len(observed) > 0: articles.append(articleID) ans["observed"] = observed answerCollection.update({"revid": ans["revid"]}, ans, upsert=False) else: ty2g += 1 #print "#%d - give up %s (2)" % (jobid,articleID) print "#%d -> update %d (give up %d + %d)" % (jobid, len(articles), ty1g, ty2g) return (filename, articles)
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, nbcPath): # read articles and patterns contentJson = projizz.jsonRead(os.path.join(inputPath, filename)) contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename)) classifiers = projizz.getNBClassifiers(nbcPath) print "Worker %d : Read %s" % (jobid, filename) # connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer queries = map(lambda x: x[:-4], contentPtnJson) itr = collection.find({"revid": {"$in": queries}}) print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()) count = 0 expResult = partAns relaEx = [] print "worker %d build expResult" % (jobid) for ans in itr: count += 1 key = "%s.txt" % (ans["revid"]) # get revid # targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part types = ans["type"] # Now only consider properties, no references. relation = ans["observed"] # origin properties, 理論上應該會比 observed 還要多 originRela = ans["properties"] ptnEx = contentPtnJson[key] article = projizz.articleSimpleSentenceFileter(contentJson[key]) # Relation extraction for line in ptnEx: # line[0]: line number # line[1]: array of patterns lineText = article[line[0]] if lineText[0] == "^": # It's a wikipeida reference comments, ignore it! 
continue for ptn in line[1]: # ptn[0]: pattern ID # ptn[1]: start position in line # ptn[2]: end position in line ptnId = "%d" % (ptn[0]) ptntks = table[ptnId]["pattern"] if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st): continue rfp = table[ptnId]["relations"] # check degree if len(rfp) > 5: continue # if no support, ignore this pattern if st[ptnId][0][1]["support"] <= 0: continue for ptnst in st[ptnId]: # ptnst[0] = relation # ptnst[1] = {"support":,"total": } if domainRange[ptnst[0]] not in types: continue if classifiers[ptnst[0]] == None: continue if classifiers[ptnst[0]].classify(lineText) == "pos": if not ptnst[0] in relaEx: relaEx.append(ptnst[0]) #### Evaluation for attribute in expResult: # special case, ignore. if attribute == "produced": continue postive = False true = False if attribute in relaEx: postive = True if attribute in relation: true = True if true: if postive: expResult[attribute]["tp"].append(ans["revid"]) else: expResult[attribute]["fn"].append(ans["revid"]) else: if postive: expResult[attribute]["fp"].append(ans["revid"]) else: # ignore true-negative pass if count % 100 == 0: print "worker #%d done %d." % (jobid, count) return expResult
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath, confidence):
    """Degree/ambiguity/type experiment grid (formatted duplicate).

    NOTE(review): this is the same routine as the earlier
    `filterFunction(..., confidence)` in this file — the file concatenates
    several versions of the script.  Returns per-experiment tp/fn/fp results
    keyed by "<degree>-<ambiguity>-<typ>".
    """
    # read patterns in articles
    contentPtnJson = json.load(open(os.path.join(inputPtnPath, filename), "r"))
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    print "Worker %d : Read %s into filter" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1, 6):
        for typ in ["n", "t"]:
            if not deg == 1:
                for amb in ["one", "50", "75", "all"]:
                    keyname = "%d-%s-%s" % (deg, amb, typ)
                    expResult[keyname] = copy.deepcopy(partAns)
            else:
                keyname = "%d-1-%s" % (deg, typ)
                expResult[keyname] = copy.deepcopy(partAns)
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # origin properties, expected to be a superset of `observed`
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:
            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            typ = args[2]

            # Relation extraction
            relaEx = []
            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns
                lineText = article[line[0]]
                if lineText[0] == "^":  # It's a wikipeida reference comments, ignore it!
                    continue
                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line
                    ptnId = "%d" % (ptn[0])
                    if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                        continue
                    rfp = table[ptnId]["relations"]
                    # check degree
                    if len(rfp) > degree:
                        continue
                    if len(rfp) == 1:  # or degree == 1
                        if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                            if typ == "t":
                                if domainRange[rfp[0]]["domain"] in types:
                                    relaEx.append(rfp[0])
                            else:
                                relaEx.append(rfp[0])
                    else:
                        if ambigu == "one":
                            if typ == "t":
                                for ptnst in st[ptnId]:
                                    # ptnst[0] = relation
                                    # ptnst[1] = {"support": , "total": }
                                    if ptnst[1]["support"] > 0 and domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                        # select only the top-1 candidate
                                        break
                            else:
                                if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                                    relaEx.append(rfp[0])
                        elif ambigu == "all":
                            for ptnst in st[ptnId]:
                                if typ == "t":
                                    if domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                else:
                                    if not ptnst[0] in relaEx:
                                        relaEx.append(ptnst[0])
                        else:
                            # "50"/"75": relative-support threshold vs best
                            th = 0.75
                            if ambigu == "50":
                                th = 0.5
                            b = st[ptnId][0][1]["support"]
                            if b > 0:
                                for ptnst in st[ptnId]:
                                    if float(ptnst[1]["support"]) / float(b) >= th:
                                        if typ == "t":
                                            if domainRange[ptnst[0]]["domain"] in types and not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])
                                        else:
                                            if not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])

            # Evaluation
            for attribute in expResult[keyname]:
                # special case, ignore.
                if attribute == "produced":
                    continue
                postive = False
                true = False
                if attribute in relaEx:
                    postive = True
                if attribute in relation:
                    true = True
                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                    else:
                        # ignore true-negative
                        pass
        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)
    return expResult