Exemplo n.º 1
0
def main(inputPtnPath,outputPath,pspath,inputPath):
    
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp":[],"tn":[],"fp":[],"fn":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) 
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath )))
            t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]

    print "start write out to %s" % (outputPath)
    json.dump(properties,open(outputPath,"w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath,outputPath,pspath,inputPath):
    
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"tn":[],"fp":[],"fn":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) 
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath )))
            t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]

    print "start write out to %s" % (outputPath)
    json.dump(properties,open(outputPath,"w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Exemplo n.º 3
0
def main(inputPtnPath,outputPath,pspath,inputPath,confidence,outputFilename,nbcPath):
    
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable("../yago//yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[],"et1":[],"et2":[],"et3":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()


    start_time = datetime.now()

    cpuCount = multiprocessing.cpu_count()
    if cpuCount > 8:
        cpuCount = 8

    pool = multiprocessing.Pool(processes=cpuCount) 
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            #result.append(filterFunction(t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,classifiers ))
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,nbcPath )))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:

            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]
                expResult[keyname][m]["et1"] += r[keyname][m]["et1"]
                expResult[keyname][m]["et2"] += r[keyname][m]["et2"]
                expResult[keyname][m]["et3"] += r[keyname][m]["et3"]


    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for keyname in expResult:
        p = expResult[keyname]
        if not os.path.isdir(os.path.join(outputPath,keyname)):
            os.mkdir(os.path.join(outputPath,keyname))
        projizz.jsonWrite(p,os.path.join(outputPath,keyname,outputFilename))
        print "start write out to %s" % (os.path.join(outputPath,keyname))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Exemplo n.º 4
0
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath,
         outputFilename):

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": []})
    domainRange = projizz.getYagoRelationDomainRange()
    idf, docs, lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(psfile))
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(
                pool.apply_async(
                    mapper, (t, filename, inputPath, inputPtnPath, table, st,
                             partAns, domainRange, confidence, vsmData)))
            #result.append( mapper( t, filename, inputPath, inputPtnPath, table, partAns, domainRange, confidence, vsmData  ))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:

            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]

    for keyname in expResult:
        p = expResult[keyname]
        keydirName = "vsm-%d" % (keyname)
        projizz.checkPath(os.path.join(outputPath, keydirName))
        projizz.jsonWrite(p,
                          os.path.join(outputPath, keydirName, outputFilename))
        print "start write out to %s" % (os.path.join(outputPath, keydirName))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Exemplo n.º 5
0
def trainModel(jobid,relation,inputpath,outputPath):
    print "%d -> train %s" % (jobid,relation)
    instances = []
    filepath = os.path.join(inputpath,"%s.pos" % (relation))
    if os.path.exists(filepath):
        posInstances = projizz.jsonRead( filepath )
        for data in posInstances:
            instances.append( (data["text"], data["label"]) )
    filepath = os.path.join(inputpath,"%s.neg" % (relation))
    if os.path.exists(filepath):
        negInstances = projizz.jsonRead( filepath )
        for data in negInstances:
            instances.append( (data["text"], data["label"]) )

    if len(instances) == 0:
        print "Cannot build %s.nbc because there are no training data." % (relation)

    classifier = projizz.NaiveBayesClassifier(instances)
    classifier.save( os.path.join(outputPath,"%s.nbc" % (relation)) )
    print "%d -> Write to %s %s.nbc" % (jobid,outputPath,relation)
Exemplo n.º 6
0
def trainModel(jobid, relation, inputpath, outputPath):
    print "%d -> train %s" % (jobid, relation)
    instances = []
    filepath = os.path.join(inputpath, "%s.pos" % (relation))
    if os.path.exists(filepath):
        posInstances = projizz.jsonRead(filepath)
        for data in posInstances:
            instances.append((data["text"], data["label"]))
    filepath = os.path.join(inputpath, "%s.neg" % (relation))
    if os.path.exists(filepath):
        negInstances = projizz.jsonRead(filepath)
        for data in negInstances:
            instances.append((data["text"], data["label"]))

    if len(instances) == 0:
        print "Cannot build %s.nbc because there are no training data." % (
            relation)

    classifier = projizz.NaiveBayesClassifier(instances)
    classifier.save(os.path.join(outputPath, "%s.nbc" % (relation)))
    print "%d -> Write to %s %s.nbc" % (jobid, outputPath, relation)
Exemplo n.º 7
0
def main(inputPtnPath, outputPath, pspath):

    model, table = projizz.readPrefixTreeModelWithTable(
        "./yagoPatternTree.model", "./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({
        "tp": [],
        "tn": [],
        "fp": [],
        "fn": []
    })
    sp = projizz.getSortedStatistic(projizz.jsonRead(pspath))
    validate = []

    # Get Top 200 Relation
    for relation in sp:
        count = 0
        for ptnId, ptnS in sp[relation]:
            ptnData = table[ptnId]
            if len(ptnData["relations"]) == 1:
                count += 1
                validate.append(ptnId)
            if count >= 200:
                break

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(
                pool.apply_async(filterFunction,
                                 (t, filename, inputPtnPath, model, table,
                                  partAns, validate)))
            t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]

    print "start write out to %s" % (outputPath)
    json.dump(properties, open(outputPath, "w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath,outputPath,pspath,inputPath,confidence,outputFilename):
    
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable("../yago//yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[],"et1":[],"et2":[],"et3":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) 
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence )))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:

            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]
                expResult[keyname][m]["et1"] += r[keyname][m]["et1"]
                expResult[keyname][m]["et2"] += r[keyname][m]["et2"]
                expResult[keyname][m]["et3"] += r[keyname][m]["et3"]


    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for keyname in expResult:
        p = expResult[keyname]
        if not os.path.isdir(os.path.join(outputPath,keyname)):
            os.mkdir(os.path.join(outputPath,keyname))
        projizz.jsonWrite(p,os.path.join(outputPath,keyname,outputFilename))
        print "start write out to %s" % (os.path.join(outputPath,keyname))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Exemplo n.º 9
0
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath, outputFilename): 
    
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[]})
    domainRange = projizz.getYagoRelationDomainRange()
    idf,docs,lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic( projizz.jsonRead(psfile) )
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) 
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(mapper, ( t, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData  )))
            #result.append( mapper( t, filename, inputPath, inputPtnPath, table, partAns, domainRange, confidence, vsmData  ))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:

            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]


    for keyname in expResult:
        p = expResult[keyname]
        keydirName = "vsm-%d" % (keyname)
        projizz.checkPath( os.path.join(outputPath,keydirName))
        projizz.jsonWrite(p,os.path.join(outputPath,keydirName,outputFilename))
        print "start write out to %s" % (os.path.join(outputPath,keydirName))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def outputStatistics(jsonPath):

    # Read file in.
    properties = projizz.jsonRead(jsonPath)

    occDocs = []
    for degree in range(1, 18):
        degree = "%d" % (degree)
        if not degree in properties:
            print "%s\t%d" % (degree, 0)
        else:
            print "%s\t%d" % (degree, len(properties[degree]))
            for ptnId in properties[degree]:
                for articleId in properties[degree][ptnId]["occ"]:
                    if not articleId in occDocs:
                        occDocs.append(articleId)

            print len(occDocs)
Exemplo n.º 11
0
def main(inputPtnPath,outputPath,pspath):
    
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp":[],"tn":[],"fp":[],"fn":[]})
    sp = projizz.getSortedStatistic(projizz.jsonRead(pspath))
    validate = []
   
    # Get Top 100 Relation
    for relation in sp:
        count = 0
        for ptnId,ptnS in sp[relation]:
            ptnData = table[ptnId]
            if len(ptnData["relations"]) == 1:
                count += 1
                validate.append(ptnId)
            if count >= 100:
                break

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) 
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,validate )))
            t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]

    print "start write out to %s" % (outputPath)
    json.dump(properties,open(outputPath,"w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Exemplo n.º 12
0
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st,
                   domainRange, inputPath, confidence, nbcPath):
    # read patterns in articles
    contentPtnJson = json.load(open(os.path.join(inputPtnPath, filename), "r"))
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print "Worker %d : Read %s into filter" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info

    expResult = {}

    for deg in range(1, 6):
        for typ in ["n", "t"]:
            if not deg == 1:
                for amb in ["one", "50", "75", "all"]:
                    keyname = "%d-%s-%s" % (deg, amb, typ)
                    expResult[keyname] = copy.deepcopy(partAns)
            else:
                keyname = "%d-1-%s" % (deg, typ)
                expResult[keyname] = copy.deepcopy(partAns)

    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        # origin properties, 理論上應該會比 observed 還要多
        originRela = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:

            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            typ = args[2]

            # Relation extraction
            relaEx = []
            ptnExRela = {}  # rela: ptns

            def recordPtnMakeRela(ptnId, rela, record):
                if not rela in record:
                    record[rela] = []
                if not ptnId in record[rela]:
                    record[rela].append(ptnId)

            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns

                lineText = article[line[0]]
                if lineText[
                        0] == "^":  # It's a wikipeida reference comments, ignore it!
                    continue

                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line

                    ptnId = "%d" % (ptn[0])

                    # validate the pattern
                    if not projizz.isPatternValidate(
                            ptnId, table, confidence=confidence, st=st):
                        continue

                    # get all possible relation of this pattern
                    rfp = table[ptnId]["relations"]

                    # check degree
                    if len(rfp) > degree:
                        continue

                    #
                    #   Decide to choice relation
                    #

                    if len(rfp) == 1:  # or degree == 1
                        if st[ptnId][0][1]["support"] > 0 and not rfp[
                                0] in relaEx:
                            if typ == "t":
                                if domainRange[rfp[0]]["domain"] in types:
                                    pr = rfp[0]
                                    if not classifiers[
                                            pr] == None and classifiers[
                                                pr].classify(
                                                    lineText) == "pos":
                                        relaEx.append(rfp[0])
                                        # FIXME For error checking
                                        recordPtnMakeRela(
                                            ptnId, rfp[0], ptnExRela)
                            else:
                                pr = rfp[0]
                                if not classifiers[pr] == None and classifiers[
                                        pr].classify(lineText) == "pos":
                                    relaEx.append(rfp[0])
                                    # FIXME For error checking
                                    recordPtnMakeRela(ptnId, rfp[0], ptnExRela)
                    else:
                        if ambigu == "one":
                            if typ == "t":
                                for ptnst in st[ptnId]:
                                    # ptnst[0] = relation
                                    # ptnst[1] = {"support": , "total": }
                                    if ptnst[1]["support"] > 0 and domainRange[
                                            ptnst[0]]["domain"] in types:
                                        if not ptnst[
                                                0] in relaEx and not classifiers[
                                                    ptnst[
                                                        0]] == None and classifiers[
                                                            ptnst[0]].classify(
                                                                lineText
                                                            ) == "pos":
                                            relaEx.append(ptnst[0])
                                            # FIXME For error checking
                                            recordPtnMakeRela(
                                                ptnId, ptnst[0], ptnExRela)
                                            break

                            else:
                                if st[ptnId][0][1]["support"] > 0 and not rfp[
                                        0] in relaEx and not classifiers[
                                            rfp[0]] == None and classifiers[
                                                rfp[0]].classify(
                                                    lineText) == "pos":
                                    relaEx.append(rfp[0])
                                    # FIXME For error checking
                                    recordPtnMakeRela(ptnId, rfp[0], ptnExRela)

                        elif ambigu == "all":
                            for ptnst in st[ptnId]:
                                if typ == "t":
                                    if domainRange[
                                            ptnst[0]]["domain"] in types:
                                        if not ptnst[
                                                0] in relaEx and not classifiers[
                                                    ptnst[
                                                        0]] == None and classifiers[
                                                            ptnst[0]].classify(
                                                                lineText
                                                            ) == "pos":
                                            relaEx.append(ptnst[0])
                                            # FIXME For error checking
                                            recordPtnMakeRela(
                                                ptnId, ptnst[0], ptnExRela)
                                else:
                                    if not ptnst[
                                            0] in relaEx and not classifiers[
                                                ptnst[
                                                    0]] == None and classifiers[
                                                        ptnst[0]].classify(
                                                            lineText) == "pos":
                                        relaEx.append(ptnst[0])
                                        # FIXME For error checking
                                        recordPtnMakeRela(
                                            ptnId, ptnst[0], ptnExRela)
                        else:
                            th = 0.75
                            if ambigu == "50":
                                th = 0.5

                            b = st[ptnId][0][1]["support"]
                            if b > 0:
                                for ptnst in st[ptnId]:
                                    if float(ptnst[1]["support"]) / float(
                                            b) >= th:
                                        if typ == "t":
                                            if domainRange[ptnst[0]][
                                                    "domain"] in types and not ptnst[
                                                        0] in relaEx and not classifiers[
                                                            ptnst[0]] == None and classifiers[
                                                                ptnst[
                                                                    0]].classify(
                                                                        lineText
                                                                    ) == "pos":
                                                relaEx.append(ptnst[0])
                                                # FIXME For error checking
                                                recordPtnMakeRela(
                                                    ptnId, ptnst[0], ptnExRela)
                                        else:
                                            if not ptnst[0] in relaEx and not classifiers[
                                                    ptnst[
                                                        0]] == None and classifiers[
                                                            ptnst[0]].classify(
                                                                lineText
                                                            ) == "pos":
                                                relaEx.append(ptnst[0])
                                                # FIXME For error checking
                                                recordPtnMakeRela(
                                                    ptnId, ptnst[0], ptnExRela)

            # Evaluation
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                postive = False
                true = False

                if attribute in relaEx:
                    postive = True
                if attribute in relation:
                    true = True

                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(
                            ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(
                            ans["revid"])
                else:
                    if postive:
                        # False Positive
                        expResult[keyname][attribute]["fp"].append(
                            ans["revid"])
                        # TODO - 分析錯誤原因
                        if attribute in ptnExRela:
                            if attribute in originRela:
                                # type 2 error
                                expResult[keyname][attribute]["et2"].append(
                                    ans["revid"])
                            else:
                                found = False
                                ptns = ptnExRela[
                                    attribute]  # get the patterns raise the Relation
                                for pid in ptns:
                                    for psbR in table[pid]["relations"]:
                                        if psbR == attribute:
                                            continue

                                        # here means that the pattern can raise a `correct' relation in answer, may it choice or not
                                        if domainRange[psbR][
                                                "domain"] in types and psbR in relation:
                                            found = True
                                            break

                                if found:
                                    # type 1 error
                                    expResult[keyname][attribute][
                                        "et1"].append(ans["revid"])
                                else:
                                    # type 3 error
                                    expResult[keyname][attribute][
                                        "et3"].append(ans["revid"])
                        else:
                            # 這是什麼情況?@@ 這種狀況基`本不可能發生吧XD
                            pass
                    else:
                        # ignore true-negative
                        pass

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return expResult
Exemplo n.º 13
0
def mapper(jobid,filename,inputPath,outputPath,model,table):

    # Read article
    article = projizz.jsonRead( os.path.join(inputPath,filename) )

    stemmer = PorterStemmer()
    tks = {}

    print "Worker %d : Read %s into filter" % (jobid,filename)

    count = 0
    total = 0
    for line in article:
        count += 1
        tokens = projizz.getTokens(line)

        for token in tokens:
            t = stemmer.stem(token) 

            if t not in tks:
                tks[t] = 0

            tks[t] += 1
            total += 1

        if count % 1000 == 0:
            print "worker %d done %d lines" % (jobid,count)


    # Remove stopwords
    for sw in projizz._stopwords:
        _sw = stemmer.stem(sw)
        if _sw in tks:
            total -= tks[_sw]
            tks.pop(_sw)
        
    needRemove = []
    maxTF = 0
    for t in tks:
        # ignore only one time word
        if tks[t] <= 1:
            needRemove.append(t)
            total -= tks[t]
            continue

        # ignore the case contain number
        if "0" in t or "1" in t or "2" in t or "3" in t or "4" in t or "5" in t or "6" in t or "7" in t or "8" in t or "9" in t:
            needRemove.append(t)
            total -= tks[t]
            continue

        if tks[t] > maxTF:
            maxTF = tks[t]

    for rm in needRemove:
        tks.pop(rm)

    projizz.jsonWrite(tks,os.path.join(outputPath,filename.replace(".json",".tfc")))
    
    # Calculate tf
    for t in tks:
        tc = tks[t]
        tks[t] = float(tc)/float(maxTF)

    projizz.jsonWrite(tks,os.path.join(outputPath,filename.replace(".json",".tf")))
    print "worker %d write out." % (jobid)

    return (filename,tks)
Exemplo n.º 14
0
def viewer(path,threshold):
    model = projizz.jsonRead(path)
    sortedModel = sorted(model.items(), key=lambda x:x[1], reverse=True)
    for word, score in sortedModel:
        if score >= threshold:
            print "%s\t%f" % (word.encode("utf-8"),score)
def main(inputPath,inputPtnPath,outputPath,outputPtnPath):

    debug = False

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)
    if not os.path.isdir(outputPtnPath):
        os.mkdir(outputPtnPath)

    result = []
    count = 0

    # Update answer
    cpuN = multiprocessing.cpu_count()
    print "CoreNumber = %d" % (cpuN)
    pool = multiprocessing.Pool(processes=12) 
    t = 0
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            t += 1
            if debug:
                result.append(updateAnswer(t,inputPath,filename))
            else:
                result.append(pool.apply_async(updateAnswer, (t,inputPath,filename)))
    
    pool.close()
    pool.join()

    # Rebuild articles and patterns

    tmpArticle = {}
    tmpPtn = {}

    dataSize = 0
    for res in result:
        if debug:
            filename,articles = res
        else:
            filename,articles = res.get()

        print filename,len(articles)
        a = projizz.jsonRead(os.path.join(inputPath,filename))
        p = projizz.jsonRead(os.path.join(inputPtnPath,filename))

        for key in articles:
            dataSize += 1
            tmpArticle[key] = a[key]
            tmpPtn[key] = p[key]

            if len(tmpPtn) == 1000:
                print "write to %05d.json" % (count)
                projizz.jsonWrite(tmpArticle,os.path.join(outputPath,"%05d.json" % (count)))
                projizz.jsonWrite(tmpPtn,os.path.join(outputPtnPath,"%05d.json" % (count)))
                tmpArticle = {}
                tmpPtn = {}
                count += 1

    if len(tmpPtn) > 0:
        print "write to %05d.json" % (count)
        projizz.jsonWrite(tmpArticle,os.path.join(outputPath,"%05d.json" % (count)))
        projizz.jsonWrite(tmpPtn,os.path.join(outputPtnPath,"%05d.json" % (count)))
        tmpArticle = {}
        tmpPtn = {}
        count += 1

    # Split to 5 
    splitTo5part("/tmp2/r01922024","y-all","/tmp2/r01922024","y")
    splitTo5part("/tmp2/r01922024","y-ptn-all","/tmp2/r01922024","y-ptn")
    
    print "write %d files. (%d)" % (count,dataSize)
Exemplo n.º 16
0
def filterFunction(jobid,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,nbcPath):
    # read patterns in articles
    contentPtnJson = json.load(open(os.path.join(inputPtnPath,filename),"r"))
    contentJson = projizz.jsonRead(os.path.join(inputPath,filename))
    
    classifiers = projizz.getNBClassifiers(nbcPath)
    print "Worker %d : Read %s into filter" % (jobid,filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid":{"$in":queries}})
    print "worker %d query=%d, result=%d" % (jobid,len(queries),itr.count())

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    
    expResult = {}

    for deg in range(1,6):
        for typ in ["n","t"]:
            if not deg == 1:
                for amb in ["one","50","75","all"]:
                    keyname = "%d-%s-%s" % (deg,amb,typ)
                    expResult[keyname] = copy.deepcopy(partAns)
            else:
                keyname = "%d-1-%s" % (deg,typ)
                expResult[keyname] = copy.deepcopy(partAns)
    
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])     # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        # origin properties, 理論上應該會比 observed 還要多
        originRela = ans["properties"]
        
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:

            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            typ    = args[2]

            # Relation extraction
            relaEx = []
            ptnExRela = {}  # rela: ptns

            def recordPtnMakeRela(ptnId,rela,record):
                if not rela in record:
                    record[rela] = []
                if not ptnId in record[rela]:
                    record[rela].append(ptnId)

            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns

                lineText = article[line[0]]
                if lineText[0] == "^":  # It's a wikipeida reference comments, ignore it!
                    continue
                
                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line

                    ptnId = "%d" % (ptn[0])

                    # validate the pattern 
                    if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                        continue

                    # get all possible relation of this pattern
                    rfp = table[ptnId]["relations"]

                    # check degree
                    if len(rfp) > degree:
                        continue

                    #
                    #   Decide to choice relation
                    # 

                    if len(rfp) == 1:   # or degree == 1
                        if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                            if typ == "t":
                                if domainRange[rfp[0]]["domain"] in types:
                                    pr = rfp[0]
                                    if not classifiers[pr] == None and classifiers[pr].classify(lineText) == "pos":
                                        relaEx.append(rfp[0])
                                        # FIXME For error checking
                                        recordPtnMakeRela(ptnId, rfp[0], ptnExRela)
                            else:
                                pr = rfp[0]
                                if not classifiers[pr] == None and classifiers[pr].classify(lineText) == "pos":
                                    relaEx.append(rfp[0])
                                    # FIXME For error checking
                                    recordPtnMakeRela(ptnId, rfp[0], ptnExRela)
                    else:
                        if ambigu == "one":
                            if typ == "t":
                                for ptnst in st[ptnId]:
                                    # ptnst[0] = relation
                                    # ptnst[1] = {"support": , "total": }
                                    if ptnst[1]["support"] > 0 and domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                            relaEx.append(ptnst[0])
                                            # FIXME For error checking
                                            recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                                            break
                            
                            else:
                                if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx and not classifiers[rfp[0]] == None and classifiers[rfp[0]].classify(lineText) == "pos":
                                    relaEx.append(rfp[0])
                                    # FIXME For error checking
                                    recordPtnMakeRela(ptnId, rfp[0], ptnExRela)
                                
                        elif ambigu == "all":
                            for ptnst in st[ptnId]:
                                if typ == "t":
                                    if domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                            relaEx.append(ptnst[0])
                                            # FIXME For error checking
                                            recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                                else:
                                    if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                        relaEx.append(ptnst[0])
                                        # FIXME For error checking
                                        recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                        else:
                            th = 0.75
                            if ambigu == "50":
                                th = 0.5
                            
                            b = st[ptnId][0][1]["support"]
                            if b > 0:
                                for ptnst in st[ptnId]:
                                    if float(ptnst[1]["support"])/float(b) >= th:
                                        if typ == "t":
                                            if domainRange[ptnst[0]]["domain"] in types and not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                                relaEx.append(ptnst[0])
                                                # FIXME For error checking
                                                recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                                        else:
                                            if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                                relaEx.append(ptnst[0])
                                                # FIXME For error checking
                                                recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)

            
            # Evaluation
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                postive = False
                true = False

                if attribute in relaEx:
                    postive = True
                if attribute in relation:
                    true = True
            
                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        # False Positive
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                        # TODO - 分析錯誤原因
                        if attribute in ptnExRela:
                            if attribute in originRela:
                                # type 2 error
                                expResult[keyname][attribute]["et2"].append(ans["revid"])
                            else:
                                found = False
                                ptns =  ptnExRela[attribute]    # get the patterns raise the Relation
                                for pid in ptns:
                                    for psbR in table[pid]["relations"]:
                                        if psbR == attribute:
                                            continue

                                        # here means that the pattern can raise a `correct' relation in answer, may it choice or not
                                        if domainRange[psbR]["domain"] in types and psbR in relation:
                                            found = True
                                            break

                                if found:
                                    # type 1 error
                                    expResult[keyname][attribute]["et1"].append(ans["revid"])
                                else:
                                    # type 3 error
                                    expResult[keyname][attribute]["et3"].append(ans["revid"])
                        else:
                            # 這是什麼情況?@@ 這種狀況基`本不可能發生吧XD
                            pass
                    else:
                        # ignore true-negative
                        pass
    
        if count % 100 == 0:
            print "worker #%d done %d." % (jobid,count)

    return expResult
Exemplo n.º 17
0
def mapper(jobid, filename, inputPath, inputPtnPath, model, table):

    # Read article
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # Read ptn
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    print "Worker %d : Read %s into filter" % (jobid, filename)

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0

    supportInstanceByFile = {}

    for ans in itr:
        count += 1

        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        supportInstanceByFile[key] = {}

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table):
                    continue

                for rela in table[ptnId]["relations"]:
                    # it's a support instance
                    if rela in relation:

                        if not ptnId in supportInstanceByFile[key]:
                            supportInstanceByFile[key][ptnId] = {}
                        if not rela in supportInstanceByFile[key][ptnId]:
                            supportInstanceByFile[key][ptnId][rela] = []

                        if not line[0] in supportInstanceByFile[key][ptnId][
                                rela]:
                            supportInstanceByFile[key][ptnId][rela].append(
                                line[0])

        for ptnId in supportInstanceByFile[key]:
            for rela in supportInstanceByFile[key][ptnId]:
                lines = supportInstanceByFile[key][ptnId][rela]
                supportInstanceByFile[key][ptnId][rela] = []
                for lineN in lines:
                    supportInstanceByFile[key][ptnId][rela].append(
                        article[lineN])

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return supportInstanceByFile
Exemplo n.º 18
0
def main(inputPath, inputPtnPath, outputPath, outputPtnPath):

    debug = False

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)
    if not os.path.isdir(outputPtnPath):
        os.mkdir(outputPtnPath)

    result = []
    count = 0

    # Update answer
    cpuN = multiprocessing.cpu_count()
    print "CoreNumber = %d" % (cpuN)
    pool = multiprocessing.Pool(processes=12)
    t = 0
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            t += 1
            if debug:
                result.append(updateAnswer(t, inputPath, filename))
            else:
                result.append(
                    pool.apply_async(updateAnswer, (t, inputPath, filename)))

    pool.close()
    pool.join()

    # Rebuild articles and patterns

    tmpArticle = {}
    tmpPtn = {}

    dataSize = 0
    for res in result:
        if debug:
            filename, articles = res
        else:
            filename, articles = res.get()

        print filename, len(articles)
        a = projizz.jsonRead(os.path.join(inputPath, filename))
        p = projizz.jsonRead(os.path.join(inputPtnPath, filename))

        for key in articles:
            dataSize += 1
            tmpArticle[key] = a[key]
            tmpPtn[key] = p[key]

            if len(tmpPtn) == 1000:
                print "write to %05d.json" % (count)
                projizz.jsonWrite(
                    tmpArticle, os.path.join(outputPath,
                                             "%05d.json" % (count)))
                projizz.jsonWrite(
                    tmpPtn, os.path.join(outputPtnPath, "%05d.json" % (count)))
                tmpArticle = {}
                tmpPtn = {}
                count += 1

    if len(tmpPtn) > 0:
        print "write to %05d.json" % (count)
        projizz.jsonWrite(tmpArticle,
                          os.path.join(outputPath, "%05d.json" % (count)))
        projizz.jsonWrite(tmpPtn,
                          os.path.join(outputPtnPath, "%05d.json" % (count)))
        tmpArticle = {}
        tmpPtn = {}
        count += 1

    # Split to 5
    splitTo5part("/tmp2/r01922024", "y-all", "/tmp2/r01922024", "y")
    splitTo5part("/tmp2/r01922024", "y-ptn-all", "/tmp2/r01922024", "y-ptn")

    print "write %d files. (%d)" % (count, dataSize)
Exemplo n.º 19
0
def main(part,revid):

    # Paths (on NLG workstation)
    inputPath = "/tmp2/ccli/y-part-%s/" % (part)
    inputPtnPath = "/tmp2/ccli/y-ptn-part-%s/" % (part)
    spPath = "../yago/yagoPSv2/ps.%s.json" % (part)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    itr = collection.find({"revid":revid})

    # find filename
    a = os.popen("grep -nr \"%s\" %s" % (revid,inputPath)).readline()
    targetFilename = a.split(":")[0].split("/")[-1]
    key = "%s.txt" % (revid)

    

    pattern = projizz.jsonRead(inputPtnPath+targetFilename)[key]
    article = projizz.articleSimpleSentenceFileter(projizz.jsonRead(inputPath+targetFilename)[key])
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath))
    domainRange = projizz.getYagoRelationDomainRange();
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../yago/yagoPatternTree.table")

    print "Part %s, RevID=%s, in %s" % (part,revid,targetFilename)

    for ans in itr:

        targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]
        answers = ans["observed"]

        print "Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"),targetName)
        print "Type=%s" % (types)
        print "Answer=%s" % (answers)
       
        for line in pattern:
            lineText = article[line[0]]
            named = False
            for namedToken in targetName:
                if namedToken in lineText:
                    named = True
                    break

            if not named:   # No target name in line text
                continue    # go to next line.

            for ptn in line[1]:
                ptnId = "%d" % (ptn[0])
                #rfp = table[ptnId]["relations"]
                if not ptnId in st:
                    continue

                for ps in st[ptnId]:
                    if float(ps[1]["support"])/float(ps[1]["total"]) > 0:
                        if domainRange[ps[0]]["domain"] in types:
                            print "#%d" % (line[0]),lineText.encode("utf-8")
                            isIn = "(X)"
                            if ps[0] in answers:
                                isIn = "(O)"
                            print "%s %s/%s/{%d,%d}/ %s" % (isIn,ptnId,table[ptnId]["pattern"],ps[1]["support"],ps[1]["total"],ps[0])

                        pass

                    # select top 1
                    break


        # prevent second ans
        break
Exemplo n.º 20
0
def generate(inputSPIpath,inputTestPath,outputVSMpath,confidence):
    
    # Checking output path
    projizz.checkPath(outputVSMpath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)

    # Collect not used keys
    # because using 5-fold CV
    t = 0
    result = []
    for filename in os.listdir(inputTestPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputTestPath) )  )
            t += 1
    pool.close()
    pool.join()

    notUsedKeys = []
    for r in result:
        ks = r.get()
        notUsedKeys += ks

    ### Build Model
    # Paatern Selection
    modelArticles = projizz.buildYagoProperties([])
    words = []
    count = 0
    for filename in os.listdir(inputSPIpath):
        if ".json" in filename:
            ptnId = filename[:-5]

            # ignore invalidate pattern
            if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                continue

            count += 1
            print count,ptnId

            ptnInstance = projizz.jsonRead( os.path.join(inputSPIpath,filename) )
            for rela in ptnInstance:
                for key in ptnInstance[rela]:
                    # ignore in testing data's key
                    if key in notUsedKeys:
                        continue

                    for line in ptnInstance[rela][key]:
                        modelArticles[rela].append(line)
    
            if count%100 == 0:
                print "Read",count,"files"

    for relation in modelArticles:
        print relation
        projizz.jsonWrite(modelArticles[relation],os.path.join(outputVSMpath,"%s.txt" % (relation)))
Exemplo n.º 21
0
def mapper(jobid, filename, inputTestPath):
    contentPtnJson = projizz.jsonRead( os.path.join(inputTestPath, filename) )
    keys = map(lambda x: x, contentPtnJson)
    print "Worker %d read %s, done." % (jobid, filename)
    return keys
Exemplo n.º 22
0
# -*- coding: utf-8 -*-
# qcl
#

import sys
import projizz

if len(sys.argv) <= 1:
    print "$ python ./simpleSortedViewer.py [ps json]"
else:
    filename = sys.argv[1]
    ps = projizz.jsonRead(filename)
    sortedp = projizz.getSortedStatistic(ps)
    model,table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    for relation in sortedp:
        print relation
        for ptnId,ptnS in sortedp[relation]:
            print "%s\t%s %s %s" % (relation,table[ptnId]["pattern"],ptnId,ptnS)
Exemplo n.º 23
0
def filterFunction(jobid,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence):
    # read patterns in articles
    contentPtnJson = json.load(open(os.path.join(inputPtnPath,filename),"r"))
    contentJson = projizz.jsonRead(os.path.join(inputPath,filename))

    print "Worker %d : Read %s into filter" % (jobid,filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid":{"$in":queries}})
    print "worker %d query=%d, result=%d" % (jobid,len(queries),itr.count())

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    
    expResult = {}

    for deg in range(1,6):
        for typ in ["n","t"]:
            if not deg == 1:
                for amb in ["one","50","75","all"]:
                    keyname = "%d-%s-%s" % (deg,amb,typ)
                    expResult[keyname] = copy.deepcopy(partAns)
            else:
                keyname = "%d-1-%s" % (deg,typ)
                expResult[keyname] = copy.deepcopy(partAns)
    
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])     # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        # origin properties, 理論上應該會比 observed 還要多
        originRela = ans["properties"]
        
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:

            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            typ    = args[2]

            # Relation extraction
            relaEx = []
            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns

                lineText = article[line[0]]
                if lineText[0] == "^":  # It's a wikipeida reference comments, ignore it!
                    continue

                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line

                    ptnId = "%d" % (ptn[0])

                    if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                        continue

                    rfp = table[ptnId]["relations"]

                    # check degree
                    if len(rfp) > degree:
                        continue

                    if len(rfp) == 1:   # or degree == 1
                        if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                            if typ == "t":
                                if domainRange[rfp[0]]["domain"] in types:
                                    relaEx.append(rfp[0])
                            else:
                                relaEx.append(rfp[0])

                    else:
                        if ambigu == "one":
                            if typ == "t":
                                for ptnst in st[ptnId]:
                                    # ptnst[0] = relation
                                    # ptnst[1] = {"support": , "total": }
                                    if ptnst[1]["support"] > 0 and domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                            break

                            
                            else:
                                if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                                    relaEx.append(rfp[0])
                                
                        elif ambigu == "all":
                            for ptnst in st[ptnId]:
                                if typ == "t":
                                    if domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                else:
                                    if not ptnst[0] in relaEx:
                                        relaEx.append(ptnst[0])
                        else:
                            th = 0.75
                            if ambigu == "50":
                                th = 0.5
                            
                            b = st[ptnId][0][1]["support"]
                            if b > 0:
                                for ptnst in st[ptnId]:
                                    if float(ptnst[1]["support"])/float(b) >= th:
                                        if typ == "t":
                                            if domainRange[ptnst[0]]["domain"] in types and not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])
                                        else:
                                            if not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])

            
            # Evaluation
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                postive = False
                true = False

                if attribute in relaEx:
                    postive = True
                if attribute in relation:
                    true = True
            
                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                    else:
                        # ignore true-negative
                        pass
    
        if count % 100 == 0:
            print "worker #%d done %d." % (jobid,count)

    return expResult
Exemplo n.º 24
0
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData):

    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath,filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath,filename))

    print "Worker %d : Read %s" % (jobid,filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid":{"$in":queries}})
    print "worker %d query=%d, result=%d" % (jobid,len(queries),itr.count())

    count = 0
    expResult = {}
    relaEx = {}

    # set thresholds
    for th in range(0,51,5):
        expResult[th] = copy.deepcopy(partAns)
        relaEx[th] = []
    
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])     # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        # origin properties, 理論上應該會比 observed 還要多
        originRela = ans["properties"]
        
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # TODO

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line

                ptnId = "%d" % (ptn[0])

                ptntks = table[ptnId]["pattern"]
                lineText = article[line[0]]

                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue
        
                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > 5:
                    continue

                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                # TODO - Modlify string, remove pattern text in string?
                cosRlt = projizz.vsmSimilarity( lineText, vsmData, relas=rfp, ptntext=ptntks )

                # NOTE - if cosine value > threshold then there is a relation (?)
                for keyname in expResult:
                    threshold = float(keyname)/100.0

                    for pr in cosRlt:
                        # Check type
                        if domainRange[pr]["domain"] in types:
                            if cosRlt[pr] > threshold:
                                if pr not in relaEx[keyname]:
                                    relaEx[keyname].append(pr)

        #### Evaluation
        for keyname in expResult: 
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                postive = False
                true = False

                if attribute in relaEx[keyname]:
                    postive = True
                if attribute in relation:
                    true = True
            
                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                    else:
                        # ignore true-negative
                        pass
    
        if count % 100 == 0:
            print "worker #%d done %d." % (jobid,count)

    return expResult
Exemplo n.º 25
0
def mapper(jobid, filename, inputPath, inputPtnPath, model, table, confidence):

    # Read article
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # Read ptn
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    print "Worker %d : Read %s into filter" % (jobid, filename)

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0

    supportInstanceByFile = {}

    linesByRelations = {}
    linesNoRelaByRelations = {}

    POS = {}
    NEG = {}

    for ans in itr:
        count += 1

        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        supportInstanceByFile[key] = {}
        linesByRela = {}
        linesByNoRela = {}

        pos = {}
        neg = {}

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(
                        ptnId, table, confidence=confidence):
                    continue

                # give up degree > 5 's pattern
                if len(table[ptnId]["relations"]) > 5:
                    continue

                for rela in table[ptnId]["relations"]:
                    # it's a support instance
                    if rela in relation:

                        # NOTE - remove pattern text.
                        if not rela in linesByRela:
                            linesByRela[rela] = {}
                        if not line[0] in linesByRela[rela]:
                            linesByRela[rela][line[0]] = []
                        if not ptnId in linesByRela[rela][line[0]]:
                            linesByRela[rela][line[0]].append(ptnId)

                        # For binary classifier
                        if not rela in pos:
                            pos[rela] = []
                        if not lineText[0] == "^" and line[0] not in pos[rela]:
                            pos[rela].append(line[0])

                    else:
                        if not rela in linesByNoRela:
                            linesByNoRela[rela] = {}
                        if not line[0] in linesByNoRela[rela]:
                            linesByNoRela[rela][line[0]] = []
                        if not ptnId in linesByNoRela[rela][line[0]]:
                            linesByNoRela[rela][line[0]].append(ptnId)

                        # For binary classifier
                        if not rela in neg:
                            neg[rela] = []
                        if not lineText[0] == "^" and line[0] not in neg[rela]:
                            neg[rela].append(line[0])

        for rela in linesByRela:
            if not rela in linesByRelations:
                linesByRelations[rela] = []
            for lineN in linesByRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesByRelations[rela].append(l)

        for rela in linesByNoRela:
            if not rela in linesNoRelaByRelations:
                linesNoRelaByRelations[rela] = []
            for lineN in linesByNoRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByNoRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesNoRelaByRelations[rela].append(l)

        # For binary classifier
        for rela in pos:
            if not rela in POS:
                POS[rela] = []
            for lineN in pos[rela]:
                POS[rela].append({"text": article[lineN], "label": "pos"})

        for rela in neg:
            if not rela in NEG:
                NEG[rela] = []
            for lineN in neg[rela]:
                NEG[rela].append({"text": article[lineN], "label": "neg"})

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return linesByRelations, linesNoRelaByRelations, POS, NEG
Exemplo n.º 26
0
def updateAnswer(jobid, inputPath, filename):
    contenJson = projizz.jsonRead(os.path.join(inputPath, filename))
    print "#%d - %s" % (jobid, filename)
    connect = Connection()
    answerCollection = connect.projizz.result.yago.answer
    factCollection = connect.projizz.yago.facts

    queries = map(lambda x: x[:-4], contenJson)

    itr = answerCollection.find({"revid": {"$in": queries}})
    print "#%d - query=%d,result=%d" % (jobid, len(queries), itr.count())

    count = 0
    ty1g = 0
    ty2g = 0
    updateC = 0
    articles = []
    for ans in itr:
        count += 1
        articleID = "%s.txt" % (ans["revid"])
        articleName = ans["_id"]
        properties = ans["properties"]
        #not consider references.
        #references = ans["references"]

        if len(properties) == 0:
            # give up those no properties' article
            # print "#%d - give up %s (1)" % (jobid,articleID)
            ty1g += 1
            continue

        needUpdate = len(properties)

        lines = projizz.articleSimpleSentenceFileter(contenJson[articleID])
        text = ""
        for line in lines:
            text += (line + " ")

        observed = []
        for pro in properties:

            pitr = factCollection.find({
                "property": pro,
                "subject": articleName
            })
            if pitr.count() < 1:
                notNeed.append(pro)
                continue

            found = False
            for fact in pitr:
                tokens = projizz.getNamedEntityTokens(fact["object"])
                for token in tokens:
                    if token in text:
                        found = True
                        break
                if found:
                    break
            if found:
                observed.append(pro)

        if len(observed) > 0:
            articles.append(articleID)
            ans["observed"] = observed
            answerCollection.update({"revid": ans["revid"]}, ans, upsert=False)
        else:
            ty2g += 1
            #print "#%d - give up %s (2)" % (jobid,articleID)

    print "#%d -> update %d (give up %d + %d)" % (jobid, len(articles), ty1g,
                                                  ty2g)

    return (filename, articles)
def updateAnswer(jobid,inputPath,filename):
    contenJson = projizz.jsonRead(os.path.join(inputPath,filename))
    print "#%d - %s" % (jobid,filename)
    connect = Connection()
    answerCollection = connect.projizz.result.yago.answer
    factCollection = connect.projizz.yago.facts

    queries = map(lambda x: x[:-4], contenJson)

    itr = answerCollection.find({"revid":{"$in":queries}})
    print "#%d - query=%d,result=%d" % (jobid,len(queries),itr.count())
    
    count = 0
    ty1g = 0
    ty2g = 0
    updateC = 0
    articles = []
    for ans in itr:
        count += 1
        articleID = "%s.txt" % (ans["revid"])
        articleName = ans["_id"]
        properties = ans["properties"]
        #not consider references.
        #references = ans["references"]

        if len(properties) == 0:
            # give up those no properties' article
            # print "#%d - give up %s (1)" % (jobid,articleID)
            ty1g += 1
            continue
        
        needUpdate = len(properties)

        lines = projizz.articleSimpleSentenceFileter(contenJson[articleID])
        text = ""
        for line in lines:
            text += (line + " ")

        observed = []
        for pro in properties:
            
            pitr = factCollection.find({"property":pro,"subject":articleName})
            if pitr.count() < 1:
                notNeed.append(pro)
                continue

            found = False
            for fact in pitr:
                tokens = projizz.getNamedEntityTokens(fact["object"])
                for token in tokens:
                    if token in text:
                        found = True
                        break
                if found:
                    break
            if found:
                observed.append(pro)
            
        if len(observed) > 0:
            articles.append(articleID)
            ans["observed"] = observed
            answerCollection.update({"revid":ans["revid"]},ans,upsert=False)
        else:
            ty2g += 1
            #print "#%d - give up %s (2)" % (jobid,articleID)

    print "#%d -> update %d (give up %d + %d)" % (jobid,len(articles),ty1g,ty2g)

    return (filename,articles)
Exemplo n.º 28
0
def mapper(jobid, filename, inputPath, topN, outputPath, model, table):

    # Read article
    article = projizz.jsonRead(os.path.join(inputPath, filename))

    stemmer = PorterStemmer()
    tks = {}

    print "Worker %d : Read %s into filter" % (jobid, filename)

    count = 0
    total = 0
    for line in article:
        count += 1
        tokens = projizz.getTokens(line)

        for token in tokens:
            t = stemmer.stem(token)

            if t not in tks:
                tks[t] = 0

            tks[t] += 1
            total += 1

        if count % 1000 == 0:
            print "worker %d done %d lines" % (jobid, count)

    # Remove stopwords
    for sw in projizz._stopwords:
        _sw = stemmer.stem(sw)
        if _sw in tks:
            total -= tks[_sw]
            tks.pop(_sw)

    needRemove = []
    maxTF = 1
    for t in tks:
        # ignore only one time word
        if tks[t] <= 1:
            needRemove.append(t)
            total -= tks[t]
            continue

        # ignore the case contain number
        if "0" in t or "1" in t or "2" in t or "3" in t or "4" in t or "5" in t or "6" in t or "7" in t or "8" in t or "9" in t:
            needRemove.append(t)
            total -= tks[t]
            continue

        #if tks[t] > maxTF:
        #    maxTF = tks[t]

    for rm in needRemove:
        tks.pop(rm)

    projizz.jsonWrite(
        tks, os.path.join(outputPath, filename.replace(".json", ".tfc")))

    ### select top N words
    # sort by tfc
    sortedTks = sorted(tks.items(), key=lambda x: x[1], reverse=True)
    tks = {}
    maxTF = sortedTks[0][1]
    # Calculate tf
    top = 0
    for t, c in sortedTks:
        top += 1
        tks[t] = float(c) / float(maxTF)
        if top == topN:
            break

    projizz.jsonWrite(
        tks, os.path.join(outputPath, filename.replace(".json", ".tf")))
    print "worker %d write out." % (jobid)

    return (filename, tks)
Exemplo n.º 29
0
def generate(inputSPIpath, inputTestPath, outputVSMpath, confidence):

    # Checking output path
    projizz.checkPath(outputVSMpath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)

    # Collect not used keys
    # because using 5-fold CV
    t = 0
    result = []
    for filename in os.listdir(inputTestPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(mapper, (t, filename, inputTestPath)))
            t += 1
    pool.close()
    pool.join()

    notUsedKeys = []
    for r in result:
        ks = r.get()
        notUsedKeys += ks

    ### Build Model
    # Paatern Selection
    modelArticles = projizz.buildYagoProperties([])
    words = []
    count = 0
    for filename in os.listdir(inputSPIpath):
        if ".json" in filename:
            ptnId = filename[:-5]

            # ignore invalidate pattern
            if not projizz.isPatternValidate(
                    ptnId, table, confidence=confidence):
                continue

            count += 1
            print count, ptnId

            ptnInstance = projizz.jsonRead(os.path.join(
                inputSPIpath, filename))
            for rela in ptnInstance:
                for key in ptnInstance[rela]:
                    # ignore in testing data's key
                    if key in notUsedKeys:
                        continue

                    for line in ptnInstance[rela][key]:
                        modelArticles[rela].append(line)

            if count % 100 == 0:
                print "Read", count, "files"

    for relation in modelArticles:
        print relation
        projizz.jsonWrite(modelArticles[relation],
                          os.path.join(outputVSMpath, "%s.txt" % (relation)))
Exemplo n.º 30
0
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns,
           domainRange, confidence, nbcPath):

    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print "Worker %d : Read %s" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    expResult = partAns
    relaEx = []

    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        # origin properties, 理論上應該會比 observed 還要多
        originRela = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns

            lineText = article[line[0]]
            if lineText[
                    0] == "^":  # It's a wikipeida reference comments, ignore it!
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line

                ptnId = "%d" % (ptn[0])

                ptntks = table[ptnId]["pattern"]

                if not projizz.isPatternValidate(
                        ptnId, table, confidence=confidence, st=st):
                    continue

                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > 5:
                    continue

                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                for ptnst in st[ptnId]:
                    # ptnst[0] = relation
                    # ptnst[1] = {"support":,"total": }
                    if domainRange[ptnst[0]] not in types:
                        continue

                    if classifiers[ptnst[0]] == None:
                        continue

                    if classifiers[ptnst[0]].classify(lineText) == "pos":
                        if not ptnst[0] in relaEx:
                            relaEx.append(ptnst[0])

        #### Evaluation
        for attribute in expResult:

            # special case, ignore.
            if attribute == "produced":
                continue

            postive = False
            true = False

            if attribute in relaEx:
                postive = True
            if attribute in relation:
                true = True

            if true:
                if postive:
                    expResult[attribute]["tp"].append(ans["revid"])
                else:
                    expResult[attribute]["fn"].append(ans["revid"])
            else:
                if postive:
                    expResult[attribute]["fp"].append(ans["revid"])
                else:
                    # ignore true-negative
                    pass

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return expResult
Exemplo n.º 31
0
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st,
                   domainRange, inputPath, confidence):
    # read patterns in articles
    contentPtnJson = json.load(open(os.path.join(inputPtnPath, filename), "r"))
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))

    print "Worker %d : Read %s into filter" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info

    expResult = {}

    for deg in range(1, 6):
        for typ in ["n", "t"]:
            if not deg == 1:
                for amb in ["one", "50", "75", "all"]:
                    keyname = "%d-%s-%s" % (deg, amb, typ)
                    expResult[keyname] = copy.deepcopy(partAns)
            else:
                keyname = "%d-1-%s" % (deg, typ)
                expResult[keyname] = copy.deepcopy(partAns)

    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        # origin properties, 理論上應該會比 observed 還要多
        originRela = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:

            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            typ = args[2]

            # Relation extraction
            relaEx = []
            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns

                lineText = article[line[0]]
                if lineText[
                        0] == "^":  # It's a wikipeida reference comments, ignore it!
                    continue

                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line

                    ptnId = "%d" % (ptn[0])

                    if not projizz.isPatternValidate(
                            ptnId, table, confidence=confidence, st=st):
                        continue

                    rfp = table[ptnId]["relations"]

                    # check degree
                    if len(rfp) > degree:
                        continue

                    if len(rfp) == 1:  # or degree == 1
                        if st[ptnId][0][1]["support"] > 0 and not rfp[
                                0] in relaEx:
                            if typ == "t":
                                if domainRange[rfp[0]]["domain"] in types:
                                    relaEx.append(rfp[0])
                            else:
                                relaEx.append(rfp[0])

                    else:
                        if ambigu == "one":
                            if typ == "t":
                                for ptnst in st[ptnId]:
                                    # ptnst[0] = relation
                                    # ptnst[1] = {"support": , "total": }
                                    if ptnst[1]["support"] > 0 and domainRange[
                                            ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                            break

                            else:
                                if st[ptnId][0][1]["support"] > 0 and not rfp[
                                        0] in relaEx:
                                    relaEx.append(rfp[0])

                        elif ambigu == "all":
                            for ptnst in st[ptnId]:
                                if typ == "t":
                                    if domainRange[
                                            ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                else:
                                    if not ptnst[0] in relaEx:
                                        relaEx.append(ptnst[0])
                        else:
                            th = 0.75
                            if ambigu == "50":
                                th = 0.5

                            b = st[ptnId][0][1]["support"]
                            if b > 0:
                                for ptnst in st[ptnId]:
                                    if float(ptnst[1]["support"]) / float(
                                            b) >= th:
                                        if typ == "t":
                                            if domainRange[ptnst[0]][
                                                    "domain"] in types and not ptnst[
                                                        0] in relaEx:
                                                relaEx.append(ptnst[0])
                                        else:
                                            if not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])

            # Evaluation
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                postive = False
                true = False

                if attribute in relaEx:
                    postive = True
                if attribute in relation:
                    true = True

                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(
                            ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(
                            ans["revid"])
                else:
                    if postive:
                        expResult[keyname][attribute]["fp"].append(
                            ans["revid"])
                    else:
                        # ignore true-negative
                        pass

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return expResult
Exemplo n.º 32
0
def main(part, revid):

    # Paths (on NLG workstation)
    inputPath = "/tmp2/ccli/yago-part-%s/" % (part)
    inputPtnPath = "/tmp2/ccli/yago-ptn-part-%s/" % (part)
    spPath = "../yago/yagoPSv1/ps.%s.json" % (part)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    itr = collection.find({"revid": revid})

    # find filename
    a = os.popen("grep -nr \"%s\" %s" % (revid, inputPath)).readline()
    targetFilename = a.split(":")[0].split("/")[-1]
    key = "%s.txt" % (revid)

    pattern = projizz.jsonRead(inputPtnPath + targetFilename)[key]
    article = projizz.articleSimpleSentenceFileter(
        projizz.jsonRead(inputPath + targetFilename)[key])
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath))
    domainRange = projizz.getYagoRelationDomainRange()
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")

    print "Part %s, RevID=%s, in %s" % (part, revid, targetFilename)

    for ans in itr:

        targetName = ans["_id"].replace("(", "").replace(")", "").split(
            "_")  # get entity name's part
        types = ans["type"]
        answers = ans["properties"]

        print "Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"),
                                              targetName)
        print "Type=%s" % (types)
        print "Answer=%s" % (answers)

        for line in pattern:
            lineText = article[line[0]]
            named = False
            for namedToken in targetName:
                if namedToken in lineText:
                    named = True
                    break

            if not named:  # No target name in line text
                continue  # go to next line.

            for ptn in line[1]:
                ptnId = "%d" % (ptn[0])
                #rfp = table[ptnId]["relations"]
                if not ptnId in st:
                    continue

                for ps in st[ptnId]:
                    if float(ps[1]["support"]) / float(ps[1]["total"]) > 0:
                        if domainRange[ps[0]]["domain"] in types:
                            print "#%d" % (line[0]), lineText.encode("utf-8")
                            isIn = "(X)"
                            if ps[0] in answers:
                                isIn = "(O)"
                            print "%s %s/%s/{%d,%d}/ %s" % (
                                isIn, ptnId, table[ptnId]["pattern"],
                                ps[1]["support"], ps[1]["total"], ps[0])

                        pass

                    # select top 1
                    break

        # prevent second ans
        break
Exemplo n.º 33
0
def mapper(jobid,filename,inputPath,inputPtnPath,model,table,confidence):

    # Read article
    contentJson = projizz.jsonRead( os.path.join(inputPath,filename) )
    # Read ptn
    contentPtnJson = projizz.jsonRead( os.path.join(inputPtnPath,filename) )

    print "Worker %d : Read %s into filter" % (jobid,filename)
    
    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid":{"$in":queries}})
    print "worker %d query=%d, result=%d" % (jobid,len(queries),itr.count())

    count = 0

    supportInstanceByFile = {}

    linesByRelations = {}
    linesNoRelaByRelations = {}

    POS = {}
    NEG = {}


    for ans in itr:
        count += 1

        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])
       
        supportInstanceByFile[key] = {}
        linesByRela = {}
        linesByNoRela = {}

        pos = {}
        neg = {}

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                    continue

                # give up degree > 5 's pattern
                if len(table[ptnId]["relations"]) > 5:
                    continue

                for rela in table[ptnId]["relations"]:
                    # it's a support instance
                    if rela in relation:
       
                        # NOTE - remove pattern text.
                        if not rela in linesByRela:
                            linesByRela[rela] = {}
                        if not line[0] in linesByRela[rela]:
                            linesByRela[rela][line[0]] = []
                        if not ptnId in linesByRela[rela][line[0]]:
                            linesByRela[rela][line[0]].append(ptnId)

                        # For binary classifier
                        if not rela in pos:
                            pos[rela] = []
                        if not lineText[0] == "^" and line[0] not in pos[rela]:
                            pos[rela].append(line[0])

                    else:
                        if not rela in linesByNoRela:
                            linesByNoRela[rela] = {}
                        if not line[0] in linesByNoRela[rela]:
                            linesByNoRela[rela][line[0]] = []
                        if not ptnId in linesByNoRela[rela][line[0]]:
                            linesByNoRela[rela][line[0]].append(ptnId)
                        
                        # For binary classifier
                        if not rela in neg:
                            neg[rela] = []
                        if not lineText[0] == "^" and line[0] not in neg[rela]:
                            neg[rela].append(line[0])

        for rela in linesByRela:
            if not rela in linesByRelations:
                linesByRelations[rela] = []
            for lineN in linesByRela[rela]:
                text = projizz.getTokens( article[lineN].lower() )
                for ptnId in linesByRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesByRelations[rela].append(l)

        for rela in linesByNoRela:
            if not rela in linesNoRelaByRelations:
                linesNoRelaByRelations[rela] = []
            for lineN in linesByNoRela[rela]:
                text = projizz.getTokens( article[lineN].lower() )
                for ptnId in linesByNoRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesNoRelaByRelations[rela].append(l)

        # For binary classifier
        for rela in pos:
            if not rela in POS:
                POS[rela] = []
            for lineN in pos[rela]:
                POS[rela].append( {"text":article[lineN],"label":"pos"} )

        for rela in neg:
            if not rela in NEG:
                NEG[rela] = []
            for lineN in neg[rela]:
                NEG[rela].append( {"text":article[lineN],"label":"neg"} )

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid,count)

    return linesByRelations,linesNoRelaByRelations,POS,NEG
Exemplo n.º 34
0
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns,
           domainRange, confidence, vsmData):

    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    print "Worker %d : Read %s" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    expResult = {}
    relaEx = {}

    # set thresholds
    for th in range(0, 51, 5):
        expResult[th] = copy.deepcopy(partAns)
        relaEx[th] = []

    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        # origin properties, 理論上應該會比 observed 還要多
        originRela = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # TODO

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line

                ptnId = "%d" % (ptn[0])

                ptntks = table[ptnId]["pattern"]
                lineText = article[line[0]]

                if not projizz.isPatternValidate(
                        ptnId, table, confidence=confidence, st=st):
                    continue

                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > 5:
                    continue

                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                # TODO - Modlify string, remove pattern text in string?
                cosRlt = projizz.vsmSimilarity(lineText,
                                               vsmData,
                                               relas=rfp,
                                               ptntext=ptntks)

                # NOTE - if cosine value > threshold then there is a relation (?)
                for keyname in expResult:
                    threshold = float(keyname) / 100.0

                    for pr in cosRlt:
                        # Check type
                        if domainRange[pr]["domain"] in types:
                            if cosRlt[pr] > threshold:
                                if pr not in relaEx[keyname]:
                                    relaEx[keyname].append(pr)

        #### Evaluation
        for keyname in expResult:
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                postive = False
                true = False

                if attribute in relaEx[keyname]:
                    postive = True
                if attribute in relation:
                    true = True

                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(
                            ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(
                            ans["revid"])
                else:
                    if postive:
                        expResult[keyname][attribute]["fp"].append(
                            ans["revid"])
                    else:
                        # ignore true-negative
                        pass

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return expResult
Exemplo n.º 35
0
def mapper(jobid,filename,inputPath,inputPtnPath,model,table):

    # Read article
    contentJson = projizz.jsonRead( os.path.join(inputPath,filename) )
    # Read ptn
    contentPtnJson = projizz.jsonRead( os.path.join(inputPtnPath,filename) )

    print "Worker %d : Read %s into filter" % (jobid,filename)
    
    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid":{"$in":queries}})
    print "worker %d query=%d, result=%d" % (jobid,len(queries),itr.count())

    count = 0

    supportInstanceByFile = {}

    for ans in itr:
        count += 1

        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])
       
        supportInstanceByFile[key] = {}

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table):
                    continue

                for rela in table[ptnId]["relations"]:
                    # it's a support instance
                    if rela in relation:
                        
                        if not ptnId in supportInstanceByFile[key]:
                            supportInstanceByFile[key][ptnId] = {}
                        if not rela in supportInstanceByFile[key][ptnId]:
                            supportInstanceByFile[key][ptnId][rela] = []

                        if not line[0] in supportInstanceByFile[key][ptnId][rela]:
                            supportInstanceByFile[key][ptnId][rela].append(line[0])

        for ptnId in supportInstanceByFile[key]:
            for rela in supportInstanceByFile[key][ptnId]:
                lines = supportInstanceByFile[key][ptnId][rela]
                supportInstanceByFile[key][ptnId][rela] = []
                for lineN in lines:
                    supportInstanceByFile[key][ptnId][rela].append(article[lineN])

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid,count)

    return supportInstanceByFile
Exemplo n.º 36
0
# -*- coding: utf-8 -*-
# qcl
#

import sys
import projizz

if len(sys.argv) <= 1:
    print "$ python ./simpleSortedViewer.py [ps json]"
else:
    filename = sys.argv[1]
    ps = projizz.jsonRead(filename)
    sortedp = projizz.getSortedStatistic(ps)
    model, table = projizz.readPrefixTreeModelWithTable(
        "./yagoPatternTree.model", "./yagoPatternTree.table")
    for relation in sortedp:
        print relation
        for ptnId, ptnS in sortedp[relation]:
            print "%s\t%s %s %s" % (relation, table[ptnId]["pattern"], ptnId,
                                    ptnS)
Exemplo n.º 37
0
def viewer(path, threshold):
    model = projizz.jsonRead(path)
    sortedModel = sorted(model.items(), key=lambda x: x[1], reverse=True)
    for word, score in sortedModel:
        if score >= threshold:
            print "%s\t%f" % (word.encode("utf-8"), score)
Exemplo n.º 38
0
def mapper(jobid, filename, inputTestPath):
    contentPtnJson = projizz.jsonRead(os.path.join(inputTestPath, filename))
    keys = map(lambda x: x, contentPtnJson)
    print "Worker %d read %s, done." % (jobid, filename)
    return keys
Exemplo n.º 39
0
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, nbcPath):

    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print "Worker %d : Read %s" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    expResult = partAns
    relaEx = []

    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        # targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        # origin properties, 理論上應該會比 observed 還要多
        originRela = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns

            lineText = article[line[0]]
            if lineText[0] == "^":  # It's a wikipeida reference comments, ignore it!
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line

                ptnId = "%d" % (ptn[0])

                ptntks = table[ptnId]["pattern"]

                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue

                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > 5:
                    continue

                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                for ptnst in st[ptnId]:
                    # ptnst[0] = relation
                    # ptnst[1] = {"support":,"total": }
                    if domainRange[ptnst[0]] not in types:
                        continue

                    if classifiers[ptnst[0]] == None:
                        continue

                    if classifiers[ptnst[0]].classify(lineText) == "pos":
                        if not ptnst[0] in relaEx:
                            relaEx.append(ptnst[0])

        #### Evaluation
        for attribute in expResult:

            # special case, ignore.
            if attribute == "produced":
                continue

            postive = False
            true = False

            if attribute in relaEx:
                postive = True
            if attribute in relation:
                true = True

            if true:
                if postive:
                    expResult[attribute]["tp"].append(ans["revid"])
                else:
                    expResult[attribute]["fn"].append(ans["revid"])
            else:
                if postive:
                    expResult[attribute]["fp"].append(ans["revid"])
                else:
                    # ignore true-negative
                    pass

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return expResult