def _recordPtnMakeRela(ptnId, rela, record):
    """Note in ``record`` (relation -> list of pattern ids) that ``ptnId`` produced ``rela``."""
    ptns = record.setdefault(rela, [])
    if ptnId not in ptns:
        ptns.append(ptnId)


def _tryExtract(ptnId, rela, lineText, classifiers, relaEx, ptnExRela):
    """Accept ``rela`` if it is new and its classifier labels ``lineText`` as "pos"."""
    if rela in relaEx:
        return
    classifier = classifiers[rela]
    if classifier is not None and classifier.classify(lineText) == "pos":
        relaEx.append(rela)
        # Kept for error analysis: which patterns raised which relation.
        _recordPtnMakeRela(ptnId, rela, ptnExRela)


def _extractRelations(ptnEx, article, degree, ambigu, typ, types, table, st,
                      domainRange, classifiers, confidence):
    """Extract relations from one article under one experiment setting.

    Returns ``(relaEx, ptnExRela)``: the list of extracted relations and a
    dict mapping each extracted relation to the pattern ids that raised it.
    """
    relaEx = []
    ptnExRela = {}

    def typeAllows(rela):
        # Without type information ("n") every relation is allowed.
        return typ != "t" or domainRange[rela]["domain"] in types

    for line in ptnEx:
        # line[0]: line number, line[1]: array of patterns
        lineText = article[line[0]]
        if lineText[0] == "^":
            # Lines starting with "^" are Wikipedia reference comments; skip.
            continue

        for ptn in line[1]:
            # ptn[0]: pattern ID, ptn[1]/ptn[2]: start/end position in line
            ptnId = "%d" % (ptn[0])

            if not projizz.isPatternValidate(ptnId, table,
                                             confidence=confidence, st=st):
                continue

            # All possible relations of this pattern.
            rfp = table[ptnId]["relations"]
            if len(rfp) > degree:
                continue

            # Each entry is (relation, {"support": ..., "total": ...}).
            stats = st[ptnId]

            if len(rfp) == 1:
                rela = rfp[0]
                if (stats[0][1]["support"] > 0 and rela not in relaEx
                        and typeAllows(rela)):
                    _tryExtract(ptnId, rela, lineText, classifiers,
                                relaEx, ptnExRela)
            elif ambigu == "one":
                if typ == "t":
                    for ptnst in stats:
                        if (ptnst[1]["support"] > 0
                                and domainRange[ptnst[0]]["domain"] in types):
                            _tryExtract(ptnId, ptnst[0], lineText,
                                        classifiers, relaEx, ptnExRela)
                            # NOTE(review): the scan stops at the first
                            # supported, type-compatible relation whether or
                            # not it was accepted -- confirm against the
                            # original layout of this branch.
                            break
                elif stats[0][1]["support"] > 0:
                    _tryExtract(ptnId, rfp[0], lineText, classifiers,
                                relaEx, ptnExRela)
            elif ambigu == "all":
                for ptnst in stats:
                    if typeAllows(ptnst[0]):
                        _tryExtract(ptnId, ptnst[0], lineText, classifiers,
                                    relaEx, ptnExRela)
            else:
                # Threshold selection: keep relations whose support is at
                # least ``th`` times the support of the first entry.
                th = 0.5 if ambigu == "50" else 0.75
                b = stats[0][1]["support"]
                if b > 0:
                    for ptnst in stats:
                        if (float(ptnst[1]["support"]) / float(b) >= th
                                and typeAllows(ptnst[0])):
                            _tryExtract(ptnId, ptnst[0], lineText,
                                        classifiers, relaEx, ptnExRela)

    return relaEx, ptnExRela


def _falsePositiveKind(attribute, ptns, table, domainRange, types, relation,
                       originRela):
    """Return the error bucket ("et1", "et2" or "et3") for a false positive.

    "et2": the relation is listed in the answer's ``properties``.
    "et1": one of the patterns that raised it can also raise a different
           relation that is type-compatible and present in ``observed``.
    "et3": neither of the above.
    """
    if attribute in originRela:
        return "et2"
    for pid in ptns:
        for psbR in table[pid]["relations"]:
            if psbR == attribute:
                continue
            # The pattern could have raised a correct relation instead.
            if domainRange[psbR]["domain"] in types and psbR in relation:
                return "et1"
    return "et3"


def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st,
                   domainRange, inputPath, confidence, nbcPath):
    """Run every extraction experiment over the articles of one input file.

    Experiments are keyed "<degree>-<ambigu>-<typ>": degree is 1..5, ambigu
    is "one", "50", "75" or "all" (just "1" when degree is 1), and typ is
    "n" (no type information) or "t" (use type information).

    Parameters:
        jobid       -- worker number, only used in progress messages.
        filename    -- name of the JSON file inside both input directories.
        inputPtnPath, inputPath -- directories of the pattern file and the
                       article file.
        model       -- unused; kept so the signature stays unchanged.
        table       -- pattern id -> entry with a "relations" list.
        partAns     -- result template; deep-copied once per experiment.
        st          -- pattern id -> list of (relation, {"support": ...}).
        domainRange -- relation -> entry with a "domain" value.
        confidence  -- forwarded to projizz.isPatternValidate.
        nbcPath     -- forwarded to projizz.getNBClassifiers.

    Returns a dict: experiment key -> copy of ``partAns`` in which the revid
    of each answer has been appended to the "tp"/"fn"/"fp" (and, for false
    positives, "et1"/"et2"/"et3") list of each relation.
    """
    # Read patterns in articles; the file is closed as soon as it is parsed.
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    classifiers = projizz.getNBClassifiers(nbcPath)

    print("Worker %d : Read %s into filter" % (jobid, filename))

    # Connect to database.
    # NOTE: pymongo.Connection only exists in PyMongo 2.x.
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Keys carry a 4-character suffix (".txt") that revids do not.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0

    # Prepare one result table per experiment and remember its settings so
    # the key does not have to be re-parsed for every article.
    expResult = {}
    settings = []
    for deg in range(1, 6):
        for typ in ["n", "t"]:
            ambigus = ["1"] if deg == 1 else ["one", "50", "75", "all"]
            for amb in ambigus:
                keyname = "%d-%s-%s" % (deg, amb, typ)
                expResult[keyname] = copy.deepcopy(partAns)
                settings.append((keyname, deg, amb, typ))

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1

        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Only properties are considered, no references.
        relation = ans["observed"]
        # Original properties; in theory a superset of ``observed``.
        originRela = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname, degree, ambigu, typ in settings:
            relaEx, ptnExRela = _extractRelations(
                ptnEx, article, degree, ambigu, typ, types, table, st,
                domainRange, classifiers, confidence)

            # Evaluation
            result = expResult[keyname]
            for attribute in result:
                # Special case, ignore.
                if attribute == "produced":
                    continue

                postive = attribute in relaEx
                if attribute in relation:
                    result[attribute]["tp" if postive else "fn"].append(revid)
                elif postive:
                    # False positive; also bucket it by error cause.
                    result[attribute]["fp"].append(revid)
                    # Every extracted relation is recorded in ptnExRela, so
                    # this guard is not expected to fail.
                    if attribute in ptnExRela:
                        kind = _falsePositiveKind(
                            attribute, ptnExRela[attribute], table,
                            domainRange, types, relation, originRela)
                        result[attribute][kind].append(revid)
                # True negatives are ignored.

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
def _recordPtnMakeRela(ptnId, rela, record):
    """Note in ``record`` (relation -> list of pattern ids) that ``ptnId`` produced ``rela``."""
    ptns = record.setdefault(rela, [])
    if ptnId not in ptns:
        ptns.append(ptnId)


def _tryExtract(ptnId, rela, lineText, classifiers, relaEx, ptnExRela):
    """Accept ``rela`` if it is new and its classifier labels ``lineText`` as "pos"."""
    if rela in relaEx:
        return
    classifier = classifiers[rela]
    if classifier is not None and classifier.classify(lineText) == "pos":
        relaEx.append(rela)
        # Kept for error analysis: which patterns raised which relation.
        _recordPtnMakeRela(ptnId, rela, ptnExRela)


def _extractRelations(ptnEx, article, degree, ambigu, typ, types, table, st,
                      domainRange, classifiers, confidence):
    """Extract relations from one article under one experiment setting.

    Returns ``(relaEx, ptnExRela)``: the list of extracted relations and a
    dict mapping each extracted relation to the pattern ids that raised it.
    """
    relaEx = []
    ptnExRela = {}

    def typeAllows(rela):
        # Without type information ("n") every relation is allowed.
        return typ != "t" or domainRange[rela]["domain"] in types

    for line in ptnEx:
        # line[0]: line number, line[1]: array of patterns
        lineText = article[line[0]]
        if lineText[0] == "^":
            # Lines starting with "^" are Wikipedia reference comments; skip.
            continue

        for ptn in line[1]:
            # ptn[0]: pattern ID, ptn[1]/ptn[2]: start/end position in line
            ptnId = "%d" % (ptn[0])

            if not projizz.isPatternValidate(ptnId, table,
                                             confidence=confidence, st=st):
                continue

            # All possible relations of this pattern.
            rfp = table[ptnId]["relations"]
            if len(rfp) > degree:
                continue

            # Each entry is (relation, {"support": ..., "total": ...}).
            stats = st[ptnId]

            if len(rfp) == 1:
                rela = rfp[0]
                if (stats[0][1]["support"] > 0 and rela not in relaEx
                        and typeAllows(rela)):
                    _tryExtract(ptnId, rela, lineText, classifiers,
                                relaEx, ptnExRela)
            elif ambigu == "one":
                if typ == "t":
                    for ptnst in stats:
                        if (ptnst[1]["support"] > 0
                                and domainRange[ptnst[0]]["domain"] in types):
                            _tryExtract(ptnId, ptnst[0], lineText,
                                        classifiers, relaEx, ptnExRela)
                            # NOTE(review): the scan stops at the first
                            # supported, type-compatible relation whether or
                            # not it was accepted -- confirm against the
                            # original layout of this branch.
                            break
                elif stats[0][1]["support"] > 0:
                    _tryExtract(ptnId, rfp[0], lineText, classifiers,
                                relaEx, ptnExRela)
            elif ambigu == "all":
                for ptnst in stats:
                    if typeAllows(ptnst[0]):
                        _tryExtract(ptnId, ptnst[0], lineText, classifiers,
                                    relaEx, ptnExRela)
            else:
                # Threshold selection: keep relations whose support is at
                # least ``th`` times the support of the first entry.
                th = 0.5 if ambigu == "50" else 0.75
                b = stats[0][1]["support"]
                if b > 0:
                    for ptnst in stats:
                        if (float(ptnst[1]["support"]) / float(b) >= th
                                and typeAllows(ptnst[0])):
                            _tryExtract(ptnId, ptnst[0], lineText,
                                        classifiers, relaEx, ptnExRela)

    return relaEx, ptnExRela


def _falsePositiveKind(attribute, ptns, table, domainRange, types, relation,
                       originRela):
    """Return the error bucket ("et1", "et2" or "et3") for a false positive.

    "et2": the relation is listed in the answer's ``properties``.
    "et1": one of the patterns that raised it can also raise a different
           relation that is type-compatible and present in ``observed``.
    "et3": neither of the above.
    """
    if attribute in originRela:
        return "et2"
    for pid in ptns:
        for psbR in table[pid]["relations"]:
            if psbR == attribute:
                continue
            # The pattern could have raised a correct relation instead.
            if domainRange[psbR]["domain"] in types and psbR in relation:
                return "et1"
    return "et3"


def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st,
                   domainRange, inputPath, confidence, nbcPath):
    """Run every extraction experiment over the articles of one input file.

    Experiments are keyed "<degree>-<ambigu>-<typ>": degree is 1..5, ambigu
    is "one", "50", "75" or "all" (just "1" when degree is 1), and typ is
    "n" (no type information) or "t" (use type information).

    Parameters:
        jobid       -- worker number, only used in progress messages.
        filename    -- name of the JSON file inside both input directories.
        inputPtnPath, inputPath -- directories of the pattern file and the
                       article file.
        model       -- unused; kept so the signature stays unchanged.
        table       -- pattern id -> entry with a "relations" list.
        partAns     -- result template; deep-copied once per experiment.
        st          -- pattern id -> list of (relation, {"support": ...}).
        domainRange -- relation -> entry with a "domain" value.
        confidence  -- forwarded to projizz.isPatternValidate.
        nbcPath     -- forwarded to projizz.getNBClassifiers.

    Returns a dict: experiment key -> copy of ``partAns`` in which the revid
    of each answer has been appended to the "tp"/"fn"/"fp" (and, for false
    positives, "et1"/"et2"/"et3") list of each relation.
    """
    # Read patterns in articles; the file is closed as soon as it is parsed.
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    classifiers = projizz.getNBClassifiers(nbcPath)

    print("Worker %d : Read %s into filter" % (jobid, filename))

    # Connect to database.
    # NOTE: pymongo.Connection only exists in PyMongo 2.x.
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Keys carry a 4-character suffix (".txt") that revids do not.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0

    # Prepare one result table per experiment and remember its settings so
    # the key does not have to be re-parsed for every article.
    expResult = {}
    settings = []
    for deg in range(1, 6):
        for typ in ["n", "t"]:
            ambigus = ["1"] if deg == 1 else ["one", "50", "75", "all"]
            for amb in ambigus:
                keyname = "%d-%s-%s" % (deg, amb, typ)
                expResult[keyname] = copy.deepcopy(partAns)
                settings.append((keyname, deg, amb, typ))

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1

        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Only properties are considered, no references.
        relation = ans["observed"]
        # Original properties; in theory a superset of ``observed``.
        originRela = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname, degree, ambigu, typ in settings:
            relaEx, ptnExRela = _extractRelations(
                ptnEx, article, degree, ambigu, typ, types, table, st,
                domainRange, classifiers, confidence)

            # Evaluation
            result = expResult[keyname]
            for attribute in result:
                # Special case, ignore.
                if attribute == "produced":
                    continue

                postive = attribute in relaEx
                if attribute in relation:
                    result[attribute]["tp" if postive else "fn"].append(revid)
                elif postive:
                    # False positive; also bucket it by error cause.
                    result[attribute]["fp"].append(revid)
                    # Every extracted relation is recorded in ptnExRela, so
                    # this guard is not expected to fail.
                    if attribute in ptnExRela:
                        kind = _falsePositiveKind(
                            attribute, ptnExRela[attribute], table,
                            domainRange, types, relation, originRela)
                        result[attribute][kind].append(revid)
                # True negatives are ignored.

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns,
           domainRange, confidence, nbcPath):
    """Extract relations from the articles of one file and score them.

    Parameters:
        jobid       -- worker number, only used in progress messages.
        filename    -- name of the JSON file inside both input directories.
        inputPath, inputPtnPath -- directories of the article file and the
                       pattern file.
        table       -- pattern id -> entry with a "relations" list.
        st          -- pattern id -> list of (relation, {"support": ...}).
        partAns     -- result template (relation -> {"tp", "fn", "fp"} lists);
                       it is copied, the caller's object is left untouched.
        domainRange -- relation -> domain entry (see the domain check below).
        confidence  -- forwarded to projizz.isPatternValidate.
        nbcPath     -- forwarded to projizz.getNBClassifiers.

    Returns a copy of ``partAns`` in which the revid of each answer has been
    appended to the "tp", "fn" or "fp" list of each relation.
    """
    # Read articles and patterns.
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    classifiers = projizz.getNBClassifiers(nbcPath)

    print("Worker %d : Read %s" % (jobid, filename))

    # Connect to database.
    # NOTE: pymongo.Connection only exists in PyMongo 2.x.
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Keys carry a 4-character suffix (".txt") that revids do not.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0
    # Work on a private copy, as filterFunction does, so the template passed
    # by the caller is never mutated.
    expResult = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1

        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Only properties are considered, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Relations extracted from THIS article only. Sharing one list
        # across articles would turn every relation found earlier into a
        # positive for all later articles.
        relaEx = []

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number, line[1]: array of patterns
            lineText = article[line[0]]
            if lineText[0] == "^":
                # Lines starting with "^" are Wikipedia reference comments.
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID, ptn[1]/ptn[2]: start/end position
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table,
                                                 confidence=confidence, st=st):
                    continue

                # Check degree: skip patterns with more than 5 relations.
                if len(table[ptnId]["relations"]) > 5:
                    continue

                # If no support, ignore this pattern.
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                for ptnst in st[ptnId]:
                    # ptnst[0] = relation
                    # ptnst[1] = {"support": , "total": }
                    rela = ptnst[0]

                    # filterFunction reads the domain as
                    # domainRange[rela]["domain"]; comparing the whole entry
                    # against ``types`` can never match when it is a dict.
                    # A plain (non-dict) entry is still compared directly.
                    domain = domainRange[rela]
                    if isinstance(domain, dict):
                        domain = domain["domain"]
                    if domain not in types:
                        continue

                    classifier = classifiers[rela]
                    if classifier is None:
                        continue
                    if (classifier.classify(lineText) == "pos"
                            and rela not in relaEx):
                        relaEx.append(rela)

        # Evaluation
        for attribute in expResult:
            # Special case, ignore.
            if attribute == "produced":
                continue

            postive = attribute in relaEx
            if attribute in relation:
                expResult[attribute]["tp" if postive else "fn"].append(revid)
            elif postive:
                expResult[attribute]["fp"].append(revid)
            # True negatives are ignored.

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns,
           domainRange, confidence, nbcPath):
    """Extract relations from the articles of one file and score them.

    Parameters:
        jobid       -- worker number, only used in progress messages.
        filename    -- name of the JSON file inside both input directories.
        inputPath, inputPtnPath -- directories of the article file and the
                       pattern file.
        table       -- pattern id -> entry with a "relations" list.
        st          -- pattern id -> list of (relation, {"support": ...}).
        partAns     -- result template (relation -> {"tp", "fn", "fp"} lists);
                       it is copied, the caller's object is left untouched.
        domainRange -- relation -> domain entry (see the domain check below).
        confidence  -- forwarded to projizz.isPatternValidate.
        nbcPath     -- forwarded to projizz.getNBClassifiers.

    Returns a copy of ``partAns`` in which the revid of each answer has been
    appended to the "tp", "fn" or "fp" list of each relation.
    """
    # Read articles and patterns.
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    classifiers = projizz.getNBClassifiers(nbcPath)

    print("Worker %d : Read %s" % (jobid, filename))

    # Connect to database.
    # NOTE: pymongo.Connection only exists in PyMongo 2.x.
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Keys carry a 4-character suffix (".txt") that revids do not.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0
    # Work on a private copy, as filterFunction does, so the template passed
    # by the caller is never mutated.
    expResult = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1

        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Only properties are considered, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Relations extracted from THIS article only. Sharing one list
        # across articles would turn every relation found earlier into a
        # positive for all later articles.
        relaEx = []

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number, line[1]: array of patterns
            lineText = article[line[0]]
            if lineText[0] == "^":
                # Lines starting with "^" are Wikipedia reference comments.
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID, ptn[1]/ptn[2]: start/end position
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table,
                                                 confidence=confidence, st=st):
                    continue

                # Check degree: skip patterns with more than 5 relations.
                if len(table[ptnId]["relations"]) > 5:
                    continue

                # If no support, ignore this pattern.
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                for ptnst in st[ptnId]:
                    # ptnst[0] = relation
                    # ptnst[1] = {"support": , "total": }
                    rela = ptnst[0]

                    # filterFunction reads the domain as
                    # domainRange[rela]["domain"]; comparing the whole entry
                    # against ``types`` can never match when it is a dict.
                    # A plain (non-dict) entry is still compared directly.
                    domain = domainRange[rela]
                    if isinstance(domain, dict):
                        domain = domain["domain"]
                    if domain not in types:
                        continue

                    classifier = classifiers[rela]
                    if classifier is None:
                        continue
                    if (classifier.classify(lineText) == "pos"
                            and rela not in relaEx):
                        relaEx.append(rela)

        # Evaluation
        for attribute in expResult:
            # Special case, ignore.
            if attribute == "produced":
                continue

            postive = attribute in relaEx
            if attribute in relation:
                expResult[attribute]["tp" if postive else "fn"].append(revid)
            elif postive:
                expResult[attribute]["fp"].append(revid)
            # True negatives are ignored.

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult