示例#1
0
def tryToFindRela(jobid, filename, dataInputPath, ptnOutputPath, model, table):
    """Extract patterns from every article of one combined file and save them.

    The combined file ``dataInputPath/filename`` is read with
    ``projizz.combinedFileReader``.  Each article is split into lines by
    ``projizz.articleSimpleSentenceFileter``; every line is POS-tagged and
    handed to ``projizz.naiveExtractPatterns`` together with ``model``.
    For each article the result is a list of ``(line index, extracted
    patterns)`` pairs, containing only the lines on which something was
    extracted.  The per-article lists are written to
    ``ptnOutputPath/filename`` with ``projizz.combinedFileWriter``.

    ``jobid`` is only used in progress messages; ``table`` is accepted but
    not used here.
    """
    articles = projizz.combinedFileReader(os.path.join(dataInputPath, filename))
    print("Worker %d : Read %s into filter" % (jobid, filename))

    patternEx = {}
    dealL = 0  # lines processed so far, summed over all articles
    for count, articleName in enumerate(articles, 1):
        sentences = projizz.articleSimpleSentenceFileter(articles[articleName])
        hits = []
        for lineCount, line in enumerate(sentences):
            dealL += 1
            extracted = projizz.naiveExtractPatterns(
                projizz._posTagger.tag(line), model)
            if len(extracted) > 0:
                hits.append((lineCount, extracted))
            if dealL % 10000 == 0:
                print("Worker %d deal with %d lines." % (jobid, dealL))

        patternEx[articleName] = hits
        if count % 100 == 0:
            print("Worker %d deal with %d files" % (jobid, count))
            # Explicit collection every 100 articles.
            gc.collect()

    projizz.combinedFileWriter(patternEx, os.path.join(ptnOutputPath, filename))
    print("Worker %d : Write results out to %s." % (jobid, filename))
示例#2
0
def tryToFindRela(jobid, filename, dataInputPath, ptnOutputPath, model, table):
    """Run pattern extraction over one combined article file.

    Reads ``dataInputPath/filename`` via ``projizz.combinedFileReader``,
    then for every article tags each line returned by
    ``projizz.articleSimpleSentenceFileter`` and extracts patterns with
    ``projizz.naiveExtractPatterns(tokens, model)``.  Lines that yield at
    least one pattern are recorded as ``(line index, patterns)``.  The
    mapping ``article name -> list of records`` is written to
    ``ptnOutputPath/filename`` through ``projizz.combinedFileWriter``.

    ``jobid`` only labels the progress output and ``table`` is unused.
    """
    sourcePath = os.path.join(dataInputPath, filename)
    content = projizz.combinedFileReader(sourcePath)
    print("Worker %d : Read %s into filter" % (jobid, filename))

    processedLines = 0
    processedArticles = 0
    patternEx = {}
    for articleName in content:
        perLine = []
        lines = projizz.articleSimpleSentenceFileter(content[articleName])
        for lineNo, text in enumerate(lines):
            processedLines += 1
            tagged = projizz._posTagger.tag(text)
            found = projizz.naiveExtractPatterns(tagged, model)
            if len(found) > 0:
                perLine.append((lineNo, found))
            if processedLines % 10000 == 0:
                print("Worker %d deal with %d lines." % (jobid, processedLines))

        patternEx[articleName] = perLine
        processedArticles += 1
        if processedArticles % 100 == 0:
            print("Worker %d deal with %d files" % (jobid, processedArticles))
            gc.collect()

    targetPath = os.path.join(ptnOutputPath, filename)
    projizz.combinedFileWriter(patternEx, targetPath)
    print("Worker %d : Write results out to %s." % (jobid, filename))
示例#3
0
def tryToFindRela(jobid, filename, dataInputPath, resultOutPath, ptnOutputPath, model, tree):
    """Count unambiguous relations and collect pattern ids per article.

    For every article in ``dataInputPath/filename`` each line is POS-tagged
    and scanned with ``projizz.naiveExtractPatterns``.  For each extracted
    ``(pattern id, start, end)`` triple:

    * if ``tree[ptnId]["relations"]`` holds fewer than two relations, the
      counter of each of those relations is incremented for the article;
    * the pattern id is remembered once per article, in first-seen order.

    The relation counters are written to ``resultOutPath/filename`` and the
    pattern-id lists to ``ptnOutputPath/filename``, both through
    ``projizz.combinedFileWriter``.  ``jobid`` only labels progress output.
    """
    articles = projizz.combinedFileReader(os.path.join(dataInputPath, filename))
    print("Worker %d : Read %s into filter" % (jobid, filename))

    results = {}
    patternEx = {}
    # Counts extracted patterns (not text lines), although the progress
    # message below calls them "lines".
    dealL = 0
    for count, articleName in enumerate(articles, 1):
        relationHits = {}
        seenPatterns = []
        for line in projizz.articleSimpleSentenceFileter(articles[articleName]):
            tokens = projizz._posTagger.tag(line)
            for ptnId, _start, _end in projizz.naiveExtractPatterns(tokens, model):
                dealL += 1
                rels = tree[ptnId]["relations"]

                # Only patterns with fewer than two relations are counted.
                if len(rels) < 2:
                    for r in rels:
                        relationHits[r] = relationHits.get(r, 0) + 1

                if ptnId not in seenPatterns:
                    seenPatterns.append(ptnId)

                if dealL % 10000 == 0:
                    print("Worker %d deal with %d lines." % (jobid, dealL))

        results[articleName] = relationHits
        patternEx[articleName] = seenPatterns
        if count % 100 == 0:
            print("Worker %d deal with %d files" % (jobid, count))
            gc.collect()

    projizz.combinedFileWriter(results, os.path.join(resultOutPath, filename))
    projizz.combinedFileWriter(patternEx, os.path.join(ptnOutputPath, filename))
    print("Worker %d : Write results out to %s." % (jobid, filename))
def testing(filename):
    """Print every line of ``filename`` that matches a pattern, with timing.

    The combined file is read with ``projizz.combinedFileReader`` and the
    prefix-tree model/table are loaded from
    ``./../prefix_tree_model/patternTree.json``.  For each article its name
    is printed, then every line on which ``projizz.naiveExtractPatterns``
    finds something is printed (UTF-8 encoded) followed by one
    ``[id] pattern`` row per extracted pattern.  Finally the elapsed
    wall-clock time of the extraction loop is printed.
    """
    content = projizz.combinedFileReader(filename)

    model, table = projizz.readPrefixTreeModel("./../prefix_tree_model/patternTree.json")

    start_time = datetime.now()
    for articleName in content:
        print(articleName)
        article = projizz.articleSimpleSentenceFileter(content[articleName])

        for line in article:
            tokens = projizz._posTagger.tag(line)
            patternExtracted = projizz.naiveExtractPatterns(tokens, model)
            if len(patternExtracted) > 0:
                print(line.encode("utf-8"))
                # Only the pattern id is needed; start/end positions are ignored.
                for ptnId, _start, _end in patternExtracted:
                    print("\t[%d] %s" % (ptnId, table[ptnId]["pattern"]))

        print("\n----")

    diff = datetime.now() - start_time
    # timedelta stores days, seconds and microseconds separately: fold the
    # days into the seconds and zero-pad the microseconds, otherwise
    # 1 s + 5000 us would be printed as "1.5000" instead of "1.005000".
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds,
                                     diff.microseconds))
示例#5
0
def testing(filename):
    """Dump the patterns found in each article of ``filename`` and time it.

    Loads the articles through ``projizz.combinedFileReader`` and the
    model/table pair through ``projizz.readPrefixTreeModel``.  For every
    article: prints its name, then each line with at least one extracted
    pattern (UTF-8 encoded) followed by ``[id] pattern`` for every pattern
    found on that line, then a ``----`` separator.  The total elapsed time
    of the loop is printed at the end.
    """
    content = projizz.combinedFileReader(filename)

    model, table = projizz.readPrefixTreeModel(
        "./../prefix_tree_model/patternTree.json")

    start_time = datetime.now()
    for articleName in content:
        print(articleName)
        article = projizz.articleSimpleSentenceFileter(content[articleName])

        for line in article:
            tokens = projizz._posTagger.tag(line)
            patternExtracted = projizz.naiveExtractPatterns(tokens, model)
            if len(patternExtracted) > 0:
                print(line.encode("utf-8"))
                # Start/end offsets are part of each triple but unused here.
                for ptnId, _start, _end in patternExtracted:
                    print("\t[%d] %s" % (ptnId, table[ptnId]["pattern"]))

        print("\n----")

    diff = datetime.now() - start_time
    # Microseconds must be zero-padded to six digits to form a correct
    # decimal fraction, and whole days have to be folded into the seconds.
    elapsedSeconds = diff.days * 86400 + diff.seconds
    print("Spend %d.%06d seconds" % (elapsedSeconds, diff.microseconds))
def filterFunction(jobid,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath):
    """Extract relations for each answered article and score them.

    ``inputPtnPath/filename`` (patterns per article) and
    ``inputPath/filename`` (article texts) are JSON files keyed by
    ``"<revid>.txt"``.  For every document of ``projizz.result.yago.answer``
    whose ``revid`` matches one of those keys:

    1. Lines that contain none of the entity-name tokens returned by
       ``projizz.getNamedEntityTokens`` are skipped.
    2. Patterns that are not marked ``used``, that have ``eval`` set to a
       false value, or that are missing from ``st`` are skipped.
    3. If the pattern can express ``holdsPoliticalPosition`` and the line
       contains one of the position keywords, that relation is recorded and
       the generic rule is skipped for this pattern.  Otherwise the first
       relation of the pattern is recorded when the first entry of
       ``st[ptnId]`` has a positive support.
    4. Relations whose ``domainRange`` domain is not among the answer's
       types are dropped (an empty-string entry means "no constraint").
    5. Every attribute of ``partAns`` gets the answer's ``revid`` appended
       to its ``tp``/``fn``/``fp``/``tn`` list, using ``ans["properties"]``
       as the ground truth.

    ``partAns`` is modified in place and also returned.  ``model`` is not
    used; ``jobid`` only labels log output.
    """
    # Context managers close the files as soon as they are parsed.
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    with open(os.path.join(inputPath, filename), "r") as articleFile:
        contentJson = json.load(articleFile)
    print("Worker %d : Read %s into filter" % (jobid, filename))

    politicalPosition = ["Secretary","Premier","Mayor","Captain","Minister","Chief","Governor","General","Ambassadors","Member"]

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Keys look like "<revid>.txt"; strip the 4-character extension.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])
        targetName = projizz.getNamedEntityTokens(ans["_id"])
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        relaEx = []
        for line in ptnEx:
            # line[0]: line number, line[1]: array of patterns
            lineText = article[line[0]]

            # Skip lines that never mention the target entity.
            if not any(namedToken in lineText for namedToken in targetName):
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID, ptn[1]: start, ptn[2]: end
                ptnId = "%d" % (ptn[0])
                rfp = table[ptnId]["relations"]

                # ignore non-used pattern
                if not table[ptnId]["used"]:
                    continue
                if "eval" in table[ptnId] and not table[ptnId]["eval"]:
                    continue

                # never seen pattern
                if ptnId not in st:
                    continue

                # Special rule for political positions.  The flag guarding
                # the `continue` was previously never set, so the generic
                # rule below always ran as well; a keyword hit now really
                # short-circuits it.
                if ("holdsPoliticalPosition" in rfp
                        and "holdsPoliticalPosition" not in relaEx
                        and any(position in lineText
                                for position in politicalPosition)):
                    relaEx.append("holdsPoliticalPosition")
                    continue

                # Generic rule (identical for one or many relations): take
                # the pattern's first relation if the leading statistic
                # entry has support.
                if st[ptnId][0][1]["support"] > 0 and rfp[0] not in relaEx:
                    relaEx.append(rfp[0])

        # Remove impossible relations: keep a relation only if it has no
        # domain constraint ("" entry, e.g. the special case `produced`) or
        # its domain is one of the entity's types.
        relaEx = [attribute for attribute in relaEx
                  if domainRange[attribute] == ""
                  or domainRange[attribute]["domain"] in types]

        # Evaluation
        for attribute in partAns:
            postive = attribute in relaEx
            true = attribute in relation

            if true:
                bucket = "tp" if postive else "fn"
            else:
                bucket = "fp" if postive else "tn"
            partAns[attribute][bucket].append(ans["revid"])

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))
    return partAns
示例#7
0
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns,
           domainRange, confidence, vsmData):
    """Evaluate VSM-based relation extraction at several cosine thresholds.

    Articles (``inputPath/filename``) and their extracted patterns
    (``inputPtnPath/filename``) are read with ``projizz.jsonRead``.  For
    each matching document of ``projizz.result.yago.answer`` every valid
    pattern with at most five relations and positive leading support is
    scored with ``projizz.vsmSimilarity`` against the line it occurs on.
    A relation is extracted for threshold ``th`` (0, 5, ..., 50, read as
    percent) when its cosine value exceeds ``th / 100`` and its
    ``domainRange`` domain is one of the answer's types.

    Returns a dict ``threshold -> deep copy of partAns`` in which each
    attribute's ``tp``/``fn``/``fp`` list received the ``revid`` of the
    answers falling into that bucket (``ans["observed"]`` is the ground
    truth; true negatives and the attribute ``produced`` are not recorded).
    ``partAns`` itself is left untouched.
    """
    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    print("Worker %d : Read %s" % (jobid, filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Keys look like "<revid>.txt"; strip the 4-character extension.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0

    # One independent result table per threshold.
    expResult = {}
    for th in range(0, 51, 5):
        expResult[th] = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Extracted relations must start empty for every answer; keeping
        # one list for the whole run would carry relations found in earlier
        # articles into the evaluation of all later ones.
        relaEx = dict((th, []) for th in expResult)

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(
                        ptnId, table, confidence=confidence, st=st):
                    continue

                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > 5:
                    continue

                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                ptntks = table[ptnId]["pattern"]
                lineText = article[line[0]]

                # TODO - Modify string, remove pattern text in string?
                cosRlt = projizz.vsmSimilarity(lineText,
                                               vsmData,
                                               relas=rfp,
                                               ptntext=ptntks)

                # NOTE - if cosine value > threshold then there is a relation (?)
                for th in expResult:
                    threshold = float(th) / 100.0
                    extracted = relaEx[th]

                    for pr in cosRlt:
                        # Type check first, then the similarity threshold.
                        if (domainRange[pr]["domain"] in types
                                and cosRlt[pr] > threshold
                                and pr not in extracted):
                            extracted.append(pr)

        #### Evaluation
        for th in expResult:
            for attribute in expResult[th]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                postive = attribute in relaEx[th]
                true = attribute in relation

                if true:
                    bucket = "tp" if postive else "fn"
                    expResult[th][attribute][bucket].append(ans["revid"])
                elif postive:
                    expResult[th][attribute]["fp"].append(ans["revid"])
                # true negatives are intentionally not recorded

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
def mapper(jobid,filename,inputPath,inputPtnPath,model,table):
    """Collect, per article, the sentences that support each pattern/relation.

    Articles and patterns are read from ``inputPath/filename`` and
    ``inputPtnPath/filename`` with ``projizz.jsonRead``.  For every matching
    document of ``projizz.result.yago.answer`` a pattern occurrence counts
    as a *support instance* of a relation when the pattern passes
    ``projizz.isPatternValidate`` and the relation is both listed in
    ``table[ptnId]["relations"]`` and present in ``ans["observed"]``.

    Returns ``{"<revid>.txt": {ptnId: {relation: [line text, ...]}}}`` where
    each line appears at most once per pattern/relation pair.  ``model`` is
    not used; ``jobid`` only labels log output.
    """
    # Read article
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # Read ptn
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    print("Worker %d : Read %s into filter" % (jobid, filename))

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Keys look like "<revid>.txt"; strip the 4-character extension.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    supportInstanceByFile = {}

    for count, ans in enumerate(itr, 1):
        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        byPattern = {}
        supportInstanceByFile[key] = byPattern

        # Pass 1: remember the line numbers of all support instances.
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineNo = line[0]
            lineText = article[lineNo]  # looked up but not used in this pass

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table):
                    continue

                for rela in table[ptnId]["relations"]:
                    # Only relations observed for this entity are support
                    # instances.
                    if rela not in relation:
                        continue

                    lineNos = byPattern.setdefault(ptnId, {}).setdefault(rela, [])
                    if lineNo not in lineNos:
                        lineNos.append(lineNo)

        # Pass 2: replace the collected line numbers by the line texts.
        for ptnId in byPattern:
            for rela in byPattern[ptnId]:
                byPattern[ptnId][rela] = [article[lineN]
                                          for lineN in byPattern[ptnId][rela]]

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return supportInstanceByFile
示例#9
0
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns,
           domainRange, confidence, nbcPath):
    """Evaluate naive-Bayes-filtered relation extraction for one file.

    Articles and patterns are read with ``projizz.jsonRead``; one
    classifier per relation is loaded with ``projizz.getNBClassifiers``.
    For each matching document of ``projizz.result.yago.answer``, every
    valid pattern (at most five relations, positive leading support)
    proposes the relations listed in ``st[ptnId]``.  A proposed relation is
    extracted when its ``domainRange`` domain is among the answer's types
    and its classifier labels the line ``"pos"``.  Lines starting with
    ``^`` (Wikipedia reference comments) are ignored.

    The ``revid`` of each answer is appended to the ``tp``/``fn``/``fp``
    list of every attribute in ``partAns`` (ground truth:
    ``ans["observed"]``; true negatives and ``produced`` are skipped).

    NOTE: ``partAns`` is updated in place and the same object is returned.
    """
    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print("Worker %d : Read %s" % (jobid, filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Keys look like "<revid>.txt"; strip the 4-character extension.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0
    expResult = partAns  # alias, not a copy: the caller's dict is filled

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Start from an empty list for every answer; otherwise relations
        # extracted from earlier articles would be counted for this one.
        relaEx = []

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]

            # It's a Wikipedia reference comment, ignore it.  startswith
            # is also safe on an empty line.
            if lineText.startswith("^"):
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(
                        ptnId, table, confidence=confidence, st=st):
                    continue

                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > 5:
                    continue

                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                for ptnst in st[ptnId]:
                    # ptnst[0] = relation
                    # ptnst[1] = {"support":,"total": }
                    rela = ptnst[0]

                    # Already extracted: no need to classify again.
                    if rela in relaEx:
                        continue

                    # Compare the relation's *domain* with the entity
                    # types; the whole domainRange entry is never a member
                    # of the type list.
                    if domainRange[rela]["domain"] not in types:
                        continue

                    classifier = classifiers[rela]
                    if classifier is None:
                        continue

                    if classifier.classify(lineText) == "pos":
                        relaEx.append(rela)

        #### Evaluation
        for attribute in expResult:

            # special case, ignore.
            if attribute == "produced":
                continue

            postive = attribute in relaEx
            true = attribute in relation

            if true:
                bucket = "tp" if postive else "fn"
                expResult[attribute][bucket].append(ans["revid"])
            elif postive:
                expResult[attribute]["fp"].append(ans["revid"])
            # true negatives are intentionally not recorded

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
示例#10
0
def _nbcExpRecordPtnMakeRela(ptnId, rela, record):
    """Remember (once) that pattern ``ptnId`` produced relation ``rela``."""
    ptns = record.setdefault(rela, [])
    if ptnId not in ptns:
        ptns.append(ptnId)


def _nbcExpCandidateRelations(ptnId, rfp, ambigu, typ, st):
    """Pick the relations a pattern may propose under one experiment setting.

    Returns ``(candidates, firstOnly)``.  ``candidates`` are relation names
    in the order they have to be tried; when ``firstOnly`` is true the
    caller must stop after the first candidate it accepts.
    """
    stats = st[ptnId]  # entries: [relation, {"support": ..., "total": ...}]

    if len(rfp) == 1:
        # Unambiguous pattern: its single relation, if it has support.
        return ([rfp[0]] if stats[0][1]["support"] > 0 else []), False

    if ambigu == "one":
        if typ == "t":
            # First supported relation that survives the caller's checks.
            return [ptnst[0] for ptnst in stats if ptnst[1]["support"] > 0], True
        return ([rfp[0]] if stats[0][1]["support"] > 0 else []), False

    if ambigu == "all":
        return [ptnst[0] for ptnst in stats], False

    # Ratio selection: keep relations whose support is at least `th` times
    # the support of the leading entry.  Any ambiguity tag other than "50"
    # uses 0.75.
    th = 0.5 if ambigu == "50" else 0.75
    b = stats[0][1]["support"]
    if not b > 0:
        return [], False
    return [ptnst[0] for ptnst in stats
            if float(ptnst[1]["support"]) / float(b) >= th], False


def filterFunction(jobid,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,nbcPath):
    """Run the naive-Bayes extraction experiments and classify the errors.

    One experiment exists per key ``"<degree>-<ambiguity>-<type>"``:

    * degree 1..5: maximum number of relations a pattern may have;
    * ambiguity: ``1`` (degree 1 only), ``one``, ``50``, ``75`` or ``all``,
      i.e. how relations of an ambiguous pattern are selected;
    * type: ``t`` to require the relation's ``domainRange`` domain among
      the answer's types, ``n`` to skip that check.

    For each matching document of ``projizz.result.yago.answer`` and each
    experiment, a selected relation is extracted when its classifier (from
    ``projizz.getNBClassifiers``) exists and labels the line ``"pos"``.
    Lines starting with ``^`` (Wikipedia reference comments) are ignored.

    Returns ``{experiment key: deep copy of partAns}`` where every
    attribute's ``tp``/``fn``/``fp`` list received the answer ``revid``
    (ground truth: ``ans["observed"]``; true negatives and ``produced`` are
    skipped).  Each false positive is additionally filed under ``et2`` (the
    relation is in ``ans["properties"]``), ``et1`` (one of the patterns
    that produced it could also have produced an observed, type-compatible
    relation) or ``et3`` (neither).  ``partAns`` must therefore provide
    those lists.  ``model`` is unused.
    """
    # read patterns in articles; the context manager closes the file
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print("Worker %d : Read %s into filter" % (jobid, filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Keys look like "<revid>.txt"; strip the 4-character extension.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1, 6):
        ambiguities = ["1"] if deg == 1 else ["one", "50", "75", "all"]
        for typ in ["n", "t"]:
            for amb in ambiguities:
                keyname = "%d-%s-%s" % (deg, amb, typ)
                expResult[keyname] = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        # original properties; in theory a superset of `observed`
        originRela = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:

            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            typ = args[2]

            # Relation extraction
            relaEx = []
            ptnExRela = {}  # relation -> ids of the patterns that raised it

            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns
                lineText = article[line[0]]

                # It's a Wikipedia reference comment, ignore it.
                # startswith is also safe on an empty line.
                if lineText.startswith("^"):
                    continue

                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line
                    ptnId = "%d" % (ptn[0])

                    # validate the pattern
                    if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                        continue

                    # get all possible relations of this pattern
                    rfp = table[ptnId]["relations"]

                    # check degree
                    if len(rfp) > degree:
                        continue

                    candidates, firstOnly = _nbcExpCandidateRelations(
                        ptnId, rfp, ambigu, typ, st)

                    for rela in candidates:
                        if rela in relaEx:
                            continue
                        if typ == "t" and domainRange[rela]["domain"] not in types:
                            continue
                        classifier = classifiers[rela]
                        if classifier is None or classifier.classify(lineText) != "pos":
                            continue

                        relaEx.append(rela)
                        # Kept for the false-positive analysis below.
                        _nbcExpRecordPtnMakeRela(ptnId, rela, ptnExRela)
                        if firstOnly:
                            break

            # Evaluation
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                postive = attribute in relaEx
                true = attribute in relation
                scores = expResult[keyname][attribute]

                if true:
                    scores["tp" if postive else "fn"].append(ans["revid"])
                    continue

                if not postive:
                    # ignore true-negative
                    continue

                # False positive
                scores["fp"].append(ans["revid"])

                # Error analysis.  A false positive that no recorded
                # pattern raised should not occur; it is left unclassified.
                if attribute not in ptnExRela:
                    continue

                if attribute in originRela:
                    # type 2 error: stated in `properties`, not in `observed`
                    scores["et2"].append(ans["revid"])
                    continue

                # Could one of the patterns that raised this relation have
                # produced a different, observed and type-compatible
                # relation instead (whether or not it was selected)?
                found = any(
                    psbR != attribute
                    and domainRange[psbR]["domain"] in types
                    and psbR in relation
                    for pid in ptnExRela[attribute]
                    for psbR in table[pid]["relations"])

                if found:
                    # type 1 error
                    scores["et1"].append(ans["revid"])
                else:
                    # type 3 error
                    scores["et3"].append(ans["revid"])

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
示例#11
0
def main(part,revid):
    """Print the relations suggested by the patterns found in one article.

    The answer document(s) with the given ``revid`` are read from the
    ``projizz.result.yago.answer`` MongoDB collection.  The combined file that
    holds the article is located by running ``grep -nr`` for the revid under
    the part's input directory.  For each pattern-bearing line that contains
    at least one token of the entity name, the first statistics entry of each
    pattern is printed when its support ratio is positive and its domain is
    one of the entity's types; it is marked ``(O)`` when the relation is among
    the observed answers and ``(X)`` otherwise.

    Only the first answer document returned by the query is processed.

    part  -- part identifier, interpolated into the input paths.
    revid -- revision id of the article to inspect.

    Raises IOError when no file under the input directory mentions ``revid``.
    """
    # Local import: this is the only place that spawns an external command.
    import subprocess

    # Paths (on NLG workstation)
    inputPath = "/tmp2/ccli/y-part-%s/" % (part)
    inputPtnPath = "/tmp2/ccli/y-ptn-part-%s/" % (part)
    spPath = "../yago/yagoPSv2/ps.%s.json" % (part)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    itr = collection.find({"revid":revid})

    # find filename
    # grep is started with an argument list (no shell), so the revid is never
    # interpreted as shell syntax; "--" keeps it from being read as an option.
    grep = subprocess.Popen(["grep", "-nr", "--", "%s" % (revid), inputPath],
                            stdout=subprocess.PIPE, universal_newlines=True)
    firstHit = grep.stdout.readline()
    grep.stdout.close()
    grep.wait()
    if not firstHit.strip():
        # Without a hit the filename would be empty and the reads below would
        # fail with a message that does not mention the real cause.
        raise IOError("revid %s not found under %s" % (revid, inputPath))

    # grep -n output is "path:line:text"; keep only the file's base name.
    targetFilename = firstHit.split(":")[0].split("/")[-1]
    key = "%s.txt" % (revid)

    pattern = projizz.jsonRead(inputPtnPath+targetFilename)[key]
    article = projizz.articleSimpleSentenceFileter(projizz.jsonRead(inputPath+targetFilename)[key])
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath))
    domainRange = projizz.getYagoRelationDomainRange()
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../yago/yagoPatternTree.table")

    print("Part %s, RevID=%s, in %s" % (part,revid,targetFilename))

    for ans in itr:

        targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]
        answers = ans["observed"]

        print("Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"),targetName))
        print("Type=%s" % (types))
        print("Answer=%s" % (answers))

        for line in pattern:
            # line[0]: line number, line[1]: array of patterns
            lineText = article[line[0]]

            # No target name in line text -> go to next line.
            if not any(namedToken in lineText for namedToken in targetName):
                continue

            for ptn in line[1]:
                ptnId = "%d" % (ptn[0])
                if ptnId not in st:
                    continue

                for ps in st[ptnId]:
                    # ps[0]: relation, ps[1]: {"support": , "total": }
                    total = float(ps[1]["total"])
                    # A zero total carries no evidence; skip it instead of
                    # dividing by zero.
                    if total > 0 and float(ps[1]["support"]) / total > 0:
                        if domainRange[ps[0]]["domain"] in types:
                            print("#%d %s" % (line[0], lineText.encode("utf-8")))
                            isIn = "(O)" if ps[0] in answers else "(X)"
                            print("%s %s/%s/{%d,%d}/ %s" % (isIn,ptnId,table[ptnId]["pattern"],ps[1]["support"],ps[1]["total"],ps[0]))

                    # select top 1
                    break

        # prevent second ans
        break
示例#12
0
def mapper(jobid, filename, inputPath, inputPtnPath, model, table):
    """Collect the sentences that support each (pattern, relation) pair.

    Reads the article file and the pattern file named ``filename``, fetches
    the answer documents whose ``revid`` matches an entry of the pattern file
    from ``projizz.result.yago.answer``, and for every valid pattern whose
    relation is among the article's observed relations records the sentence
    in which the pattern occurred.

    ``model`` is accepted but not used here.

    Returns a dict:
        "<revid>.txt" -> pattern id -> relation -> list of sentence texts
    """
    # The article file and its pattern file share the same name.
    articleJson = projizz.jsonRead(os.path.join(inputPath, filename))
    ptnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    print("Worker %d : Read %s into filter" % (jobid, filename))

    # Entries are named "<revid>.txt"; drop the 4-character extension to
    # obtain the revids to query for.
    answerCollection = pymongo.Connection().projizz.result.yago.answer
    revids = [name[:-4] for name in ptnJson]
    cursor = answerCollection.find({"revid": {"$in": revids}})
    print("worker %d query=%d, result=%d" % (jobid, len(revids),
                                             cursor.count()))

    done = 0
    supportInstanceByFile = {}

    for ans in cursor:
        done += 1

        key = "%s.txt" % (ans["revid"])
        observed = ans["observed"]

        matched = ptnJson[key]
        sentences = projizz.articleSimpleSentenceFileter(articleJson[key])

        perPattern = supportInstanceByFile[key] = {}

        for line in matched:
            # line[0]: line number, line[1]: array of patterns
            lineNo = line[0]
            # Value unused; the lookup is kept so that a line number missing
            # from the article still raises here.
            sentences[lineNo]

            for ptn in line[1]:
                # ptn[0]: pattern ID, ptn[1]/ptn[2]: start/end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table):
                    continue

                for rela in table[ptnId]["relations"]:
                    if rela not in observed:
                        continue
                    # it's a support instance: remember the line once
                    lineNos = perPattern.setdefault(ptnId, {}).setdefault(
                        rela, [])
                    if lineNo not in lineNos:
                        lineNos.append(lineNo)

        # Swap the collected line numbers for the sentence text.
        for byRela in perPattern.values():
            for rela in byRela:
                byRela[rela] = [sentences[lineN] for lineN in byRela[rela]]

        if done % 100 == 0:
            print("worker #%d done %d." % (jobid, done))

    return supportInstanceByFile
示例#13
0
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st,
                   domainRange, inputPath, confidence, nbcPath):
    """Evaluate relation extraction for one file under every experiment setup.

    An experiment is named "<degree>-<ambiguity>-<type>":
      degree    -- 1..5, the largest number of relations a pattern may have
      ambiguity -- "one", "50", "75" or "all" ("1" for degree 1): how many of
                   a multi-relation pattern's relations are taken
      type      -- "t" to require the relation's domain to be one of the
                   entity's types, "n" to ignore type information

    For every answer document whose revid appears in the pattern file, the
    relations are extracted per experiment (a relation is only accepted when
    its classifier labels the sentence "pos") and compared with the
    document's observed relations.  The revid is appended to the "tp", "fn"
    or "fp" list of each relation; false positives are additionally filed
    under "et1", "et2" or "et3".  True negatives are not recorded.

    ``model`` is accepted but not used here.

    Returns a dict mapping experiment name to a deep copy of ``partAns``
    filled with the revid lists.
    """
    # read patterns in articles (closed explicitly instead of relying on
    # garbage collection)
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print("Worker %d : Read %s into filter" % (jobid, filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # entries are named "<revid>.txt"; strip the extension
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries),
                                             itr.count()))

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1, 6):
        for typ in ["n", "t"]:
            if deg == 1:
                expResult["%d-1-%s" % (deg, typ)] = copy.deepcopy(partAns)
            else:
                for amb in ["one", "50", "75", "all"]:
                    keyname = "%d-%s-%s" % (deg, amb, typ)
                    expResult[keyname] = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    def _accept(ptnId, rela, lineText, relaEx, ptnExRela):
        """Extract rela unless already taken or rejected by its classifier.

        On success the relation is appended to relaEx and the pattern is
        recorded in ptnExRela (relation -> pattern ids) for error analysis.
        Returns True when the relation was newly extracted.
        """
        if rela in relaEx:
            return False
        classifier = classifiers[rela]
        if classifier is None or not classifier.classify(lineText) == "pos":
            return False
        relaEx.append(rela)
        ptns = ptnExRela.setdefault(rela, [])
        if ptnId not in ptns:
            ptns.append(ptnId)
        return True

    for ans in itr:
        count += 1
        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        # origin properties; in theory a superset of the observed ones
        originRela = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:

            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            useType = args[2] == "t"

            # Relation extraction
            relaEx = []
            ptnExRela = {}  # rela: ptns

            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns
                lineText = article[line[0]]
                if lineText[0] == "^":
                    # It's a wikipedia reference comment, ignore it!
                    continue

                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line
                    ptnId = "%d" % (ptn[0])

                    # validate the pattern
                    if not projizz.isPatternValidate(
                            ptnId, table, confidence=confidence, st=st):
                        continue

                    # get all possible relation of this pattern
                    rfp = table[ptnId]["relations"]

                    # check degree
                    if len(rfp) > degree:
                        continue

                    #
                    #   Decide which relation(s) to choose
                    #
                    if len(rfp) == 1:  # or degree == 1
                        rela = rfp[0]
                        if st[ptnId][0][1]["support"] > 0 and rela not in relaEx:
                            if not useType or domainRange[rela]["domain"] in types:
                                _accept(ptnId, rela, lineText, relaEx,
                                        ptnExRela)

                    elif ambigu == "one":
                        if useType:
                            # take the first supported, type-compatible
                            # relation that can actually be accepted
                            for ptnst in st[ptnId]:
                                # ptnst[0] = relation
                                # ptnst[1] = {"support": , "total": }
                                if ptnst[1]["support"] > 0 and domainRange[
                                        ptnst[0]]["domain"] in types:
                                    if _accept(ptnId, ptnst[0], lineText,
                                               relaEx, ptnExRela):
                                        break
                        else:
                            # NOTE(review): the support comes from the first
                            # statistics entry while the relation is rfp[0];
                            # presumably both name the same relation - confirm.
                            if st[ptnId][0][1]["support"] > 0:
                                _accept(ptnId, rfp[0], lineText, relaEx,
                                        ptnExRela)

                    elif ambigu == "all":
                        for ptnst in st[ptnId]:
                            if not useType or domainRange[
                                    ptnst[0]]["domain"] in types:
                                _accept(ptnId, ptnst[0], lineText, relaEx,
                                        ptnExRela)

                    else:
                        # keep relations whose support is at least `th` times
                        # the support of the first statistics entry
                        th = 0.5 if ambigu == "50" else 0.75
                        b = st[ptnId][0][1]["support"]
                        if b > 0:
                            for ptnst in st[ptnId]:
                                if float(ptnst[1]["support"]) / float(b) >= th:
                                    if not useType or domainRange[
                                            ptnst[0]]["domain"] in types:
                                        _accept(ptnId, ptnst[0], lineText,
                                                relaEx, ptnExRela)

            # Evaluation
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                outcome = expResult[keyname][attribute]
                postive = attribute in relaEx
                true = attribute in relation

                if true:
                    outcome["tp" if postive else "fn"].append(revid)
                    continue

                if not postive:
                    # ignore true-negative
                    continue

                # False Positive
                outcome["fp"].append(revid)

                # TODO - analyse the cause of the error
                if attribute not in ptnExRela:
                    # Every extracted relation is recorded together with its
                    # pattern, so this should not happen.
                    continue

                if attribute in originRela:
                    # type 2 error
                    outcome["et2"].append(revid)
                    continue

                # Could one of the patterns that raised this relation also
                # have raised a `correct' relation of the answer, whether it
                # was chosen or not?  Stops at the first such relation.
                found = any(
                    psbR != attribute and
                    domainRange[psbR]["domain"] in types and
                    psbR in relation
                    for pid in ptnExRela[attribute]
                    for psbR in table[pid]["relations"])

                if found:
                    # type 1 error
                    outcome["et1"].append(revid)
                else:
                    # type 3 error
                    outcome["et3"].append(revid)

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
示例#14
0
def mapper(jobid, filename, inputPath, inputPtnPath, model, table, confidence):
    """Gather training sentences per relation from one article file.

    For every answer document whose revid appears in the pattern file, each
    valid pattern with at most five relations is examined.  A relation of the
    pattern that is among the article's observed relations makes the line a
    positive example for that relation, otherwise a negative one.

    ``model`` is accepted but not used here.

    Returns a 4-tuple of dicts keyed by relation:
      linesByRelations       -- lines (lower-cased, tokenised, pattern tokens
                                removed, re-joined with spaces) where the
                                relation is observed
      linesNoRelaByRelations -- the same for lines where it is not observed
      POS -- [{"text": sentence, "label": "pos"}, ...]
      NEG -- [{"text": sentence, "label": "neg"}, ...]
    Lines starting with "^" are left out of POS and NEG.
    """
    # Read article
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # Read ptn
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    print("Worker %d : Read %s into filter" % (jobid, filename))

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # entries are named "<revid>.txt"; strip the extension
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries),
                                             itr.count()))

    count = 0

    linesByRelations = {}
    linesNoRelaByRelations = {}

    POS = {}
    NEG = {}

    for ans in itr:
        count += 1

        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # per-article: relation -> line number -> pattern ids
        linesByRela = {}
        linesByNoRela = {}

        # per-article: relation -> line numbers (for the binary classifier)
        pos = {}
        neg = {}

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(
                        ptnId, table, confidence=confidence):
                    continue

                # give up degree > 5 's pattern
                if len(table[ptnId]["relations"]) > 5:
                    continue

                for rela in table[ptnId]["relations"]:
                    # observed relation -> support instance, else counter-example
                    if rela in relation:
                        byLine, labelled = linesByRela, pos
                    else:
                        byLine, labelled = linesByNoRela, neg

                    # NOTE - the pattern text is removed from these lines later
                    ptnIds = byLine.setdefault(rela, {}).setdefault(
                        line[0], [])
                    if ptnId not in ptnIds:
                        ptnIds.append(ptnId)

                    # For binary classifier
                    lineNos = labelled.setdefault(rela, [])
                    if not lineText[0] == "^" and line[0] not in lineNos:
                        lineNos.append(line[0])

        # Merge this article's lines into the per-file result, with the
        # tokens of the matching patterns taken out of each line.
        for perArticle, merged in ((linesByRela, linesByRelations),
                                   (linesByNoRela, linesNoRelaByRelations)):
            for rela in perArticle:
                stripped = merged.setdefault(rela, [])
                for lineN in perArticle[rela]:
                    text = projizz.getTokens(article[lineN].lower())
                    for ptnId in perArticle[rela][lineN]:
                        for ptntk in table[ptnId]["pattern"].split():
                            if ptntk in text:
                                text.remove(ptntk)
                    stripped.append(' '.join(text))

        # For binary classifier
        for perArticle, merged, label in ((pos, POS, "pos"),
                                          (neg, NEG, "neg")):
            for rela in perArticle:
                samples = merged.setdefault(rela, [])
                for lineN in perArticle[rela]:
                    samples.append({"text": article[lineN], "label": label})

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return linesByRelations, linesNoRelaByRelations, POS, NEG
示例#15
0
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st,
                   domainRange, inputPath):
    """Extract relations for the articles of one file and score them.

    For every answer document whose revid appears in the pattern file, the
    lines that mention a token of the entity name are scanned.  Each used,
    known pattern contributes its first relation when the pattern's first
    statistics entry has positive support; a pattern that can express
    "holdsPoliticalPosition" contributes that relation instead when the line
    contains one of the position keywords.  Relations whose domain is not one
    of the entity's types are then dropped, and the revid is appended to the
    "tp", "fn", "fp" or "tn" list of every relation in ``partAns``.

    ``model`` is accepted but not used here.

    ``partAns`` is modified in place and also returned.
    """
    # read patterns in articles
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    # read articles
    with open(os.path.join(inputPath, filename), "r") as articleFile:
        contentJson = json.load(articleFile)
    print("Worker %d : Read %s into filter" % (jobid, filename))

    politicalPosition = [
        "Secretary", "Premier", "Mayor", "Captain", "Minister", "Chief",
        "Governor", "General", "Ambassadors", "Member"
    ]

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # entries are named "<revid>.txt"; strip the extension
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries),
                                             itr.count()))

    count = 0

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        targetName = projizz.getNamedEntityTokens(
            ans["_id"])  # get entity name's part
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])
        relaEx = []
        for line in ptnEx:  # line[0]: line number

            lineText = article[line[0]]

            # No target name in line text -> go to next line.
            if not any(namedToken in lineText for namedToken in targetName):
                continue

            for ptn in line[1]:  # line[1]: array of patterns
                # ptn[0]:  pattern ID, [1]: start, [2]: end
                ptnId = "%d" % (ptn[0])
                rfp = table[ptnId]["relations"]

                # ignore non-used pattern
                if not table[ptnId]["used"]:
                    continue
                if "eval" in table[ptnId] and not table[ptnId]["eval"]:
                    continue

                # never seen pattern
                if ptnId not in st:
                    continue

                # The handling is the same whether the pattern has one
                # relation or several.
                if "holdsPoliticalPosition" in rfp:
                    if "holdsPoliticalPosition" not in relaEx and any(
                            position in lineText
                            for position in politicalPosition):
                        relaEx.append("holdsPoliticalPosition")
                        # The position keyword decided this pattern, so its
                        # default relation is not added.  (The flag guarding
                        # this skip used to stay False forever.)
                        continue

                # using the first as the answer
                if st[ptnId][0][1]["support"] > 0 and rfp[0] not in relaEx:
                    relaEx.append(rfp[0])

        # Remove impossible relations: keep a relation only when it has no
        # domain/range entry (special case, "produced") or when its domain is
        # one of the entity's types.  relaEx holds no duplicates, so filtering
        # equals removing the offenders one by one.
        relaEx = [
            attribute for attribute in relaEx
            if domainRange[attribute] == "" or
            domainRange[attribute]["domain"] in types
        ]

        # Evaluation
        for attribute in partAns:
            postive = attribute in relaEx
            true = attribute in relation

            if true:
                bucket = "tp" if postive else "fn"
            else:
                bucket = "fp" if postive else "tn"
            partAns[attribute][bucket].append(ans["revid"])

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))
    return partAns
def updateAnswer(jobid,inputPath,filename):
    """Store, for each article of one file, which properties its text shows.

    For every answer document whose revid appears in the file, each property
    is looked up in the ``projizz.yago.facts`` collection (by property and
    subject).  A property counts as observed when a name token of one of its
    fact objects occurs in the article text.  Documents with at least one
    observed property get an "observed" field and are written back to
    ``projizz.result.yago.answer``.

    Returns (filename, list of "<revid>.txt" ids that were updated).
    """
    contenJson = projizz.jsonRead(os.path.join(inputPath,filename))
    print("#%d - %s" % (jobid,filename))
    connect = Connection()
    answerCollection = connect.projizz.result.yago.answer
    factCollection = connect.projizz.yago.facts

    # entries are named "<revid>.txt"; strip the extension
    queries = [name[:-4] for name in contenJson]

    itr = answerCollection.find({"revid":{"$in":queries}})
    print("#%d - query=%d,result=%d" % (jobid,len(queries),itr.count()))

    ty1g = 0    # articles given up because they have no properties
    ty2g = 0    # articles given up because no property was observed
    articles = []
    for ans in itr:
        articleID = "%s.txt" % (ans["revid"])
        articleName = ans["_id"]
        properties = ans["properties"]
        # references (ans["references"]) are not considered.

        if not properties:
            # give up those no properties' article
            ty1g += 1
            continue

        # One string for substring search; every line is followed by a space.
        lines = projizz.articleSimpleSentenceFileter(contenJson[articleID])
        text = "".join(line + " " for line in lines)

        observed = []
        for pro in properties:

            pitr = factCollection.find({"property":pro,"subject":articleName})
            if pitr.count() < 1:
                # No fact stored for this property, so it cannot be observed.
                # (This branch used to append to an undefined name.)
                continue

            # Stops at the first object token that occurs in the text.
            found = any(token in text
                        for fact in pitr
                        for token in projizz.getNamedEntityTokens(fact["object"]))
            if found:
                observed.append(pro)

        if len(observed) > 0:
            articles.append(articleID)
            ans["observed"] = observed
            answerCollection.update({"revid":ans["revid"]},ans,upsert=False)
        else:
            ty2g += 1

    print("#%d -> update %d (give up %d + %d)" % (jobid,len(articles),ty1g,ty2g))

    return (filename,articles)
示例#17
0
def _appendPatternStrippedLines(target, linesOfRela, article, table):
    """Append, per relation, every recorded line with its pattern tokens removed.

    target      -- dict: relation -> list of strings; extended in place.
    linesOfRela -- dict: relation -> {line number -> [pattern id, ...]}.
    article     -- sequence of sentence strings indexed by line number.
    table       -- pattern table; table[ptnId]["pattern"] is split on
                   whitespace to obtain the tokens to remove.
    """
    for rela in linesOfRela:
        sentences = target.setdefault(rela, [])
        for lineN in linesOfRela[rela]:
            tokens = projizz.getTokens( article[lineN].lower() )
            for ptnId in linesOfRela[rela][lineN]:
                for ptntk in table[ptnId]["pattern"].split():
                    # list.remove() drops only the first occurrence of the token.
                    if ptntk in tokens:
                        tokens.remove(ptntk)
            sentences.append(' '.join(tokens))


def mapper(jobid,filename,inputPath,inputPtnPath,model,table,confidence):
    """Collect per-relation training sentences from one article bundle.

    For every answer document whose "revid" matches an article in the pattern
    file, each valid pattern with at most 5 relations is inspected.  A line is
    a supporting instance of a relation when the relation is listed in the
    answer's "observed" field, otherwise it is a non-supporting instance.

    jobid        -- worker number, only used in progress messages.
    filename     -- bundle file name, looked up in both input directories.
    inputPath    -- directory of the article bundles.
    inputPtnPath -- directory of the extracted-pattern bundles.
    model        -- unused here; kept for interface compatibility.
    table        -- pattern table (pattern id -> {"relations", "pattern", ...}).
    confidence   -- forwarded to projizz.isPatternValidate.

    Returns a 4-tuple:
      linesByRelations       -- relation -> supporting lines, pattern tokens removed
      linesNoRelaByRelations -- relation -> non-supporting lines, pattern tokens removed
      POS                    -- relation -> [{"text": line, "label": "pos"}, ...]
      NEG                    -- relation -> [{"text": line, "label": "neg"}, ...]
    Lines starting with "^" are left out of POS/NEG.
    """
    # Read article
    contentJson = projizz.jsonRead( os.path.join(inputPath,filename) )
    # Read ptn
    contentPtnJson = projizz.jsonRead( os.path.join(inputPtnPath,filename) )

    print("Worker %d : Read %s into filter" % (jobid,filename))

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Article keys look like "<revid>.txt"; strip the 4-character suffix.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid":{"$in":queries}})
    print("worker %d query=%d, result=%d" % (jobid,len(queries),itr.count()))

    count = 0

    linesByRelations = {}
    linesNoRelaByRelations = {}

    POS = {}
    NEG = {}

    for ans in itr:
        count += 1

        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        linesByRela = {}
        linesByNoRela = {}

        pos = {}
        neg = {}

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineNo = line[0]
            lineText = article[lineNo]
            # startswith() is safe on an empty line, unlike lineText[0].
            isReference = lineText.startswith("^")

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                    continue

                relations = table[ptnId]["relations"]

                # give up degree > 5 's pattern
                if len(relations) > 5:
                    continue

                for rela in relations:
                    # Pick the supporting or the non-supporting accumulators.
                    if rela in relation:
                        byLine, labelled = linesByRela, pos
                    else:
                        byLine, labelled = linesByNoRela, neg

                    ptnIds = byLine.setdefault(rela, {}).setdefault(lineNo, [])
                    if ptnId not in ptnIds:
                        ptnIds.append(ptnId)

                    # For binary classifier: the relation entry is created
                    # even when the line itself is skipped as a reference.
                    lineNos = labelled.setdefault(rela, [])
                    if not isReference and lineNo not in lineNos:
                        lineNos.append(lineNo)

        _appendPatternStrippedLines(linesByRelations, linesByRela, article, table)
        _appendPatternStrippedLines(linesNoRelaByRelations, linesByNoRela, article, table)

        # For binary classifier
        for rela in pos:
            samples = POS.setdefault(rela, [])
            for lineN in pos[rela]:
                samples.append( {"text":article[lineN],"label":"pos"} )

        for rela in neg:
            samples = NEG.setdefault(rela, [])
            for lineN in neg[rela]:
                samples.append( {"text":article[lineN],"label":"neg"} )

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid,count))

    return linesByRelations,linesNoRelaByRelations,POS,NEG
示例#18
0
def _typeAccepts(rela, typ, domainRange, types):
    """Return True unless type checking is on (typ == "t") and the relation's domain is not in types."""
    return typ != "t" or domainRange[rela]["domain"] in types


def _selectRelations(relaEx, rfp, ptnStats, ambigu, typ, domainRange, types):
    """Add to relaEx (in place) the relations one pattern yields under one setting.

    relaEx   -- list of relations extracted so far for the article.
    rfp      -- relations listed for the pattern in the pattern table.
    ptnStats -- st[ptnId]: sequence of (relation, {"support":, "total":}).
    ambigu   -- "one", "all", "50" or "75" (any other value is treated as "75").
    typ      -- "t" to require the relation's domain to be in types.
    """
    if len(rfp) == 1:
        if ptnStats[0][1]["support"] > 0 and rfp[0] not in relaEx:
            if _typeAccepts(rfp[0], typ, domainRange, types):
                relaEx.append(rfp[0])
        return

    if ambigu == "one":
        if typ == "t":
            for ptnst in ptnStats:
                if ptnst[1]["support"] > 0 and domainRange[ptnst[0]]["domain"] in types:
                    # NOTE(review): the scan only stops when a NEW relation is
                    # added; an already-extracted match does not end it.
                    # Behaviour kept as in the original.
                    if ptnst[0] not in relaEx:
                        relaEx.append(ptnst[0])
                        break
        elif ptnStats[0][1]["support"] > 0 and rfp[0] not in relaEx:
            relaEx.append(rfp[0])
        return

    if ambigu == "all":
        for ptnst in ptnStats:
            if _typeAccepts(ptnst[0], typ, domainRange, types) and ptnst[0] not in relaEx:
                relaEx.append(ptnst[0])
        return

    # Threshold selection: keep relations whose support is at least
    # th * (support of the first entry).
    th = 0.5 if ambigu == "50" else 0.75
    b = ptnStats[0][1]["support"]
    if b > 0:
        for ptnst in ptnStats:
            if float(ptnst[1]["support"])/float(b) >= th:
                if _typeAccepts(ptnst[0], typ, domainRange, types) and ptnst[0] not in relaEx:
                    relaEx.append(ptnst[0])


def _recordOutcome(result, relaEx, relation, revid):
    """Append revid to the tp / fn / fp list of every attribute in result."""
    for attribute in result:

        # special case, ignore.
        if attribute == "produced":
            continue

        predicted = attribute in relaEx
        actual = attribute in relation

        if actual:
            bucket = "tp" if predicted else "fn"
        elif predicted:
            bucket = "fp"
        else:
            # ignore true-negative
            continue
        result[attribute][bucket].append(revid)


def filterFunction(jobid,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence):
    """Evaluate pattern-based relation extraction under every experiment setting.

    Experiment keys have the form "<degree>-<ambiguity>-<type>":
      degree    -- 1..5, maximum number of relations a pattern may have
      ambiguity -- "1" for degree 1, otherwise "one", "50", "75" or "all"
      type      -- "n" (no type check) or "t" (relation domain must be in
                   the answer's "type" list)

    jobid        -- worker number, only used in progress messages.
    filename     -- bundle file name, looked up in both input directories.
    inputPtnPath -- directory of the extracted-pattern bundles (JSON).
    model        -- unused here; kept for interface compatibility.
    table        -- pattern table (pattern id -> {"relations": [...], ...}).
    partAns      -- template copied (deep) once per experiment key; each
                    attribute entry must hold "tp", "fn" and "fp" lists.
    st           -- pattern id -> sequence of (relation, {"support":, "total":}).
    domainRange  -- relation -> {"domain": ...}.
    inputPath    -- directory of the article bundles.
    confidence   -- forwarded to projizz.isPatternValidate.

    Returns a dict: experiment key -> filled copy of partAns.
    """
    # read patterns in articles; the with-block closes the file handle.
    with open(os.path.join(inputPtnPath,filename),"r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    contentJson = projizz.jsonRead(os.path.join(inputPath,filename))

    print("Worker %d : Read %s into filter" % (jobid,filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Article keys look like "<revid>.txt"; strip the 4-character suffix.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid":{"$in":queries}})
    print("worker %d query=%d, result=%d" % (jobid,len(queries),itr.count()))

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    expResult = {}

    for deg in range(1,6):
        for typ in ["n","t"]:
            if deg == 1:
                ambiguities = ["1"]
            else:
                ambiguities = ["one","50","75","all"]
            for amb in ambiguities:
                keyname = "%d-%s-%s" % (deg,amb,typ)
                expResult[keyname] = copy.deepcopy(partAns)

    # Parse every key once instead of once per article.
    settings = []
    for keyname in expResult:
        args = keyname.split("-")
        settings.append((keyname, int(args[0]), args[1], args[2]))

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])     # get revid
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Validation does not depend on the experiment key, so do it once per
        # article.  NOTE(review): assumes projizz.isPatternValidate has no
        # side effects that rely on being called once per key - confirm.
        candidates = []
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]
            if lineText.startswith("^"):  # It's a wikipedia reference comment, ignore it!
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue

                candidates.append((ptnId, table[ptnId]["relations"]))

        for keyname, degree, ambigu, typ in settings:

            # Relation extraction
            relaEx = []
            for ptnId, rfp in candidates:
                # check degree
                if len(rfp) > degree:
                    continue
                _selectRelations(relaEx, rfp, st[ptnId], ambigu, typ, domainRange, types)

            # Evaluation
            _recordOutcome(expResult[keyname], relaEx, relation, ans["revid"])

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid,count))

    return expResult
示例#19
0
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData):
    """Evaluate VSM-similarity relation extraction at thresholds 0.00 .. 0.50.

    For every matching answer document, each valid pattern (at most 5
    relations, positive support on its first statistic entry) is scored with
    projizz.vsmSimilarity.  A relation counts as extracted at threshold t when
    its score is greater than t and its domain is in the answer's "type" list.

    jobid        -- worker number, only used in progress messages.
    filename     -- bundle file name, looked up in both input directories.
    inputPath    -- directory of the article bundles.
    inputPtnPath -- directory of the extracted-pattern bundles.
    table        -- pattern table (pattern id -> {"relations", "pattern", ...}).
    st           -- pattern id -> sequence of (relation, {"support":, ...}).
    partAns      -- template copied (deep) once per threshold; each attribute
                    entry must hold "tp", "fn" and "fp" lists.
    domainRange  -- relation -> {"domain": ...}.
    confidence   -- forwarded to projizz.isPatternValidate.
    vsmData      -- forwarded to projizz.vsmSimilarity.

    Returns a dict keyed by threshold in percent (0, 5, ..., 50) -> filled
    copy of partAns.
    """
    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath,filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath,filename))

    print("Worker %d : Read %s" % (jobid,filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Article keys look like "<revid>.txt"; strip the 4-character suffix.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid":{"$in":queries}})
    print("worker %d query=%d, result=%d" % (jobid,len(queries),itr.count()))

    count = 0
    expResult = {}

    # set thresholds
    for th in range(0,51,5):
        expResult[th] = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])     # get revid
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Start from an empty extraction for every article.  The lists used to
        # be created once before this loop, so relations found in earlier
        # articles were counted as positives for every later article.
        relaEx = dict((th, []) for th in expResult)

        # Relation extraction
        # NOTE(review): unlike the sibling filters, lines starting with "^"
        # are not skipped here - confirm whether that is intended.
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue

                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > 5:
                    continue

                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                ptntks = table[ptnId]["pattern"]

                # TODO - Modify string, remove pattern text in string?
                cosRlt = projizz.vsmSimilarity( lineText, vsmData, relas=rfp, ptntext=ptntks )

                # NOTE - if cosine value > threshold then there is a relation (?)
                for keyname in expResult:
                    threshold = float(keyname)/100.0
                    extracted = relaEx[keyname]

                    for pr in cosRlt:
                        # Check type first, then score, then de-duplicate.
                        if domainRange[pr]["domain"] in types:
                            if cosRlt[pr] > threshold and pr not in extracted:
                                extracted.append(pr)

        #### Evaluation
        for keyname in expResult:
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                predicted = attribute in relaEx[keyname]
                actual = attribute in relation

                if actual:
                    bucket = "tp" if predicted else "fn"
                elif predicted:
                    bucket = "fp"
                else:
                    # ignore true-negative
                    continue
                expResult[keyname][attribute][bucket].append(ans["revid"])

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid,count))

    return expResult
示例#20
0
def main(part, revid):
    """Print, for one article, the first-ranked relation of each pattern hit.

    Locates the bundle file of the given part that mentions revid (via grep),
    loads the article and its extracted patterns, and for every line that
    contains a token of the entity name prints the first statistic entry of
    each pattern when it has positive support and its domain is in the
    entity's "type" list.  "(O)" marks relations listed in the answer's
    "properties", "(X)" the others.

    part  -- part identifier, substituted into the hard-coded data paths.
    revid -- revision id of the article to inspect.

    Raises IOError when no bundle file under the part directory mentions revid.
    """
    import subprocess  # local import: only this debugging entry point needs it

    # Paths (on NLG workstation)
    inputPath = "/tmp2/ccli/yago-part-%s/" % (part)
    inputPtnPath = "/tmp2/ccli/yago-ptn-part-%s/" % (part)
    spPath = "../yago/yagoPSv1/ps.%s.json" % (part)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    itr = collection.find({"revid": revid})

    # find filename
    # The command is passed as an argument list, so no shell ever interprets
    # revid or the path; "-e" keeps a revid starting with "-" from being
    # parsed as an option.
    grep = subprocess.Popen(
        ["grep", "-nr", "-e", "%s" % (revid), inputPath],
        stdout=subprocess.PIPE,
        universal_newlines=True)
    try:
        # Only the first hit ("path:lineno:content") is needed.
        firstHit = grep.stdout.readline()
    finally:
        grep.stdout.close()
        grep.wait()

    if not firstHit:
        raise IOError("revid %s not found under %s" % (revid, inputPath))

    targetFilename = firstHit.split(":")[0].split("/")[-1]
    key = "%s.txt" % (revid)

    pattern = projizz.jsonRead(inputPtnPath + targetFilename)[key]
    article = projizz.articleSimpleSentenceFileter(
        projizz.jsonRead(inputPath + targetFilename)[key])
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath))
    domainRange = projizz.getYagoRelationDomainRange()
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")

    print("Part %s, RevID=%s, in %s" % (part, revid, targetFilename))

    for ans in itr:

        # get entity name's part
        targetName = ans["_id"].replace("(", "").replace(")", "").split("_")
        types = ans["type"]
        answers = ans["properties"]

        print("Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"),
                                              targetName))
        print("Type=%s" % (types))
        print("Answer=%s" % (answers))

        for line in pattern:
            lineText = article[line[0]]

            # No target name in line text: go to next line.
            if not any(namedToken in lineText for namedToken in targetName):
                continue

            for ptn in line[1]:
                ptnId = "%d" % (ptn[0])
                if ptnId not in st:
                    continue

                for ps in st[ptnId]:
                    support = ps[1]["support"]
                    total = ps[1]["total"]
                    # A zero total used to raise ZeroDivisionError; treat it
                    # as "no support".
                    if total and float(support) / float(total) > 0:
                        if domainRange[ps[0]]["domain"] in types:
                            print("#%d %s" % (line[0],
                                              lineText.encode("utf-8")))
                            isIn = "(X)"
                            if ps[0] in answers:
                                isIn = "(O)"
                            print("%s %s/%s/{%d,%d}/ %s" % (
                                isIn, ptnId, table[ptnId]["pattern"],
                                support, total, ps[0]))

                    # select top 1
                    break

        # prevent second ans
        break
示例#21
0
def updateAnswer(jobid, inputPath, filename):
    """Store, for each article of one bundle, the properties observable in its text.

    A property of an answer document is "observed" when at least one token
    (projizz.getNamedEntityTokens) of the object of one of its facts occurs
    as a substring of the article text.  Answer documents with at least one
    observed property are written back with an "observed" field.

    jobid     -- worker number, only used in progress messages.
    inputPath -- directory of the article bundles.
    filename  -- bundle file name inside inputPath.

    Returns (filename, list of article ids that were updated).
    """
    contenJson = projizz.jsonRead(os.path.join(inputPath, filename))
    print("#%d - %s" % (jobid, filename))
    connect = Connection()
    answerCollection = connect.projizz.result.yago.answer
    factCollection = connect.projizz.yago.facts

    # Article keys look like "<revid>.txt"; strip the 4-character suffix.
    queries = [name[:-4] for name in contenJson]

    itr = answerCollection.find({"revid": {"$in": queries}})
    print("#%d - query=%d,result=%d" % (jobid, len(queries), itr.count()))

    ty1g = 0  # articles given up because they have no properties
    ty2g = 0  # articles given up because no property was observed
    articles = []
    for ans in itr:
        articleID = "%s.txt" % (ans["revid"])
        articleName = ans["_id"]
        properties = ans["properties"]
        # not consider references.

        if len(properties) == 0:
            # give up those no properties' article
            ty1g += 1
            continue

        lines = projizz.articleSimpleSentenceFileter(contenJson[articleID])
        # Every line is followed by one space, as before, but built in a
        # single join instead of repeated concatenation.
        text = "".join([line + " " for line in lines])

        observed = []
        for pro in properties:

            pitr = factCollection.find({
                "property": pro,
                "subject": articleName
            })
            if pitr.count() < 1:
                # No fact for this property.  The original appended to an
                # undefined name ("notNeed") here, which raised NameError;
                # the property is simply skipped.
                continue

            found = False
            for fact in pitr:
                tokens = projizz.getNamedEntityTokens(fact["object"])
                if any(token in text for token in tokens):
                    found = True
                    break
            if found:
                observed.append(pro)

        if len(observed) > 0:
            articles.append(articleID)
            ans["observed"] = observed
            answerCollection.update({"revid": ans["revid"]}, ans, upsert=False)
        else:
            ty2g += 1

    print("#%d -> update %d (give up %d + %d)" % (jobid, len(articles), ty1g,
                                                  ty2g))

    return (filename, articles)
示例#22
0
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, nbcPath):
    """Evaluate relation extraction done with per-relation classifiers.

    For every matching answer document, each valid pattern (at most 5
    relations, positive support on its first statistic entry) proposes the
    relations of its statistic entries.  A relation is extracted when its
    domain is in the answer's "type" list and its classifier labels the line
    text "pos".

    jobid        -- worker number, only used in progress messages.
    filename     -- bundle file name, looked up in both input directories.
    inputPath    -- directory of the article bundles.
    inputPtnPath -- directory of the extracted-pattern bundles.
    table        -- pattern table (pattern id -> {"relations": [...], ...}).
    st           -- pattern id -> sequence of (relation, {"support":, ...}).
    partAns      -- result template; each attribute entry must hold "tp",
                    "fn" and "fp" lists.  It is deep-copied, not modified.
    domainRange  -- relation -> {"domain": ...}.
    confidence   -- forwarded to projizz.isPatternValidate.
    nbcPath      -- forwarded to projizz.getNBClassifiers.

    Returns the filled copy of partAns.
    """
    import copy  # local import so the block does not depend on file-level imports

    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print("Worker %d : Read %s" % (jobid, filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Article keys look like "<revid>.txt"; strip the 4-character suffix.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0
    # Work on a private copy so the caller's template is left untouched
    # (the sibling mappers/filters copy it the same way).
    expResult = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Start from an empty extraction for every article; the list used to
        # be created once before this loop, so relations found in earlier
        # articles were counted as positives for every later article.
        relaEx = []

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns

            lineText = article[line[0]]
            if lineText.startswith("^"):  # It's a wikipedia reference comment, ignore it!
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line

                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue

                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > 5:
                    continue

                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                for ptnst in st[ptnId]:
                    # ptnst[0] = relation
                    # ptnst[1] = {"support":,"total": }
                    rela = ptnst[0]

                    # Compare the relation's *domain* with the entity types.
                    # The original tested the whole domainRange entry, which
                    # is never a member of types, so every relation was
                    # skipped.
                    if domainRange[rela]["domain"] not in types:
                        continue

                    classifier = classifiers[rela]
                    if classifier is None:
                        continue

                    if classifier.classify(lineText) == "pos":
                        if rela not in relaEx:
                            relaEx.append(rela)

        #### Evaluation
        for attribute in expResult:

            # special case, ignore.
            if attribute == "produced":
                continue

            predicted = attribute in relaEx
            actual = attribute in relation

            if actual:
                bucket = "tp" if predicted else "fn"
            elif predicted:
                bucket = "fp"
            else:
                # ignore true-negative
                continue
            expResult[attribute][bucket].append(ans["revid"])

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
示例#23
0
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st,
                   domainRange, inputPath, confidence):
    """Evaluate pattern-based relation extraction under every experiment setting.

    Experiment keys have the form "<degree>-<ambiguity>-<type>":
      degree    -- 1..5, maximum number of relations a pattern may have
      ambiguity -- "1" for degree 1, otherwise "one", "50", "75" or "all"
      type      -- "n" (no type check) or "t" (relation domain must be in
                   the answer's "type" list)

    jobid        -- worker number, only used in progress messages.
    filename     -- bundle file name, looked up in both input directories.
    inputPtnPath -- directory of the extracted-pattern bundles (JSON).
    model        -- unused here; kept for interface compatibility.
    table        -- pattern table (pattern id -> {"relations": [...], ...}).
    partAns      -- template copied (deep) once per experiment key; each
                    attribute entry must hold "tp", "fn" and "fp" lists.
    st           -- pattern id -> sequence of (relation, {"support":, "total":}).
    domainRange  -- relation -> {"domain": ...}.
    inputPath    -- directory of the article bundles.
    confidence   -- forwarded to projizz.isPatternValidate.

    Returns a dict: experiment key -> filled copy of partAns.
    """
    # read patterns in articles; the with-block closes the file handle.
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))

    print("Worker %d : Read %s into filter" % (jobid, filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Article keys look like "<revid>.txt"; strip the 4-character suffix.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info

    expResult = {}

    for deg in range(1, 6):
        for typ in ["n", "t"]:
            if not deg == 1:
                for amb in ["one", "50", "75", "all"]:
                    keyname = "%d-%s-%s" % (deg, amb, typ)
                    expResult[keyname] = copy.deepcopy(partAns)
            else:
                keyname = "%d-1-%s" % (deg, typ)
                expResult[keyname] = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Pattern validation does not depend on the experiment key, so it is
        # done once per article instead of once per key.
        # NOTE(review): assumes projizz.isPatternValidate has no side effects
        # that rely on being called once per key - confirm.
        validPatterns = []
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns

            # startswith() is safe on an empty line, unlike lineText[0].
            if article[line[0]].startswith("^"):
                # It's a wikipedia reference comment, ignore it!
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line

                ptnId = "%d" % (ptn[0])

                if projizz.isPatternValidate(
                        ptnId, table, confidence=confidence, st=st):
                    validPatterns.append((ptnId, table[ptnId]["relations"]))

        for keyname in expResult:

            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            useType = args[2] == "t"

            # Relation extraction
            relaEx = []
            for ptnId, rfp in validPatterns:

                # check degree
                if len(rfp) > degree:
                    continue

                ptnStats = st[ptnId]

                if len(rfp) == 1:  # or degree == 1
                    if ptnStats[0][1]["support"] > 0 and rfp[0] not in relaEx:
                        if not useType or domainRange[rfp[0]]["domain"] in types:
                            relaEx.append(rfp[0])

                elif ambigu == "one":
                    if useType:
                        for ptnst in ptnStats:
                            # ptnst[0] = relation
                            # ptnst[1] = {"support": , "total": }
                            if (ptnst[1]["support"] > 0 and
                                    domainRange[ptnst[0]]["domain"] in types):
                                # NOTE(review): the scan only stops when a NEW
                                # relation is added; behaviour kept as in the
                                # original.
                                if ptnst[0] not in relaEx:
                                    relaEx.append(ptnst[0])
                                    break
                    elif ptnStats[0][1]["support"] > 0 and rfp[0] not in relaEx:
                        relaEx.append(rfp[0])

                elif ambigu == "all":
                    for ptnst in ptnStats:
                        if useType and domainRange[ptnst[0]]["domain"] not in types:
                            continue
                        if ptnst[0] not in relaEx:
                            relaEx.append(ptnst[0])

                else:
                    # Threshold selection: keep relations whose support is at
                    # least th * (support of the first entry).
                    th = 0.75
                    if ambigu == "50":
                        th = 0.5

                    b = ptnStats[0][1]["support"]
                    if b > 0:
                        for ptnst in ptnStats:
                            if float(ptnst[1]["support"]) / float(b) < th:
                                continue
                            if useType and domainRange[ptnst[0]]["domain"] not in types:
                                continue
                            if ptnst[0] not in relaEx:
                                relaEx.append(ptnst[0])

            # Evaluation
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                predicted = attribute in relaEx
                actual = attribute in relation

                if actual:
                    bucket = "tp" if predicted else "fn"
                elif predicted:
                    bucket = "fp"
                else:
                    # ignore true-negative
                    continue
                expResult[keyname][attribute][bucket].append(ans["revid"])

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult