예제 #1
0
def main(inputPtnPath,outputPath,pspath,inputPath):
    """Run ``filterFunction`` on every pattern file in parallel and merge the results.

    One ``filterFunction`` task is queued on a process pool for each entry of
    ``inputPtnPath`` whose name contains ".json".  Each task result is a
    mapping whose keys also exist in ``properties``; its "tp"/"tn"/"fp"/"fn"
    values are accumulated with ``+=`` and the merged mapping is written as
    JSON to ``outputPath``.

    Args:
        inputPtnPath: directory that is listed for ".json" pattern files.
        outputPath: file that receives the merged result as JSON.
        pspath: JSON file read with ``projizz.jsonRead`` and handed to
            ``projizz.getSortedPatternStatistic``.
        inputPath: forwarded unchanged to ``filterFunction``.
    """
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp":[],"tn":[],"fp":[],"fn":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    ptnFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(ptnFiles):
        # every task gets its own empty answer structure
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath )))
    pool.close()
    pool.join()

    # res.get() re-raises any exception that happened inside a worker
    for res in result:
        r = res.get()
        for m in r:
            for key in ("tp","tn","fp","fn"):
                properties[m][key] += r[m][key]

    # print(...) with a single argument behaves the same as the print statement
    print("start write out to %s" % (outputPath))
    # the with-block closes (and flushes) the output file deterministically
    with open(outputPath,"w") as fout:
        json.dump(properties,fout)

    diff = datetime.now() - start_time
    # %06d zero-pads the microseconds (the old "%d.%d" showed 5s+1234us as
    # "5.1234"); days are folded in because timedelta.seconds excludes them.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
def main(inputPtnPath,outputPath,pspath,inputPath):
    """Run ``filterFunction`` on every pattern file in parallel and merge the results.

    Uses the pattern table that carries confidence values
    ("../patty/yagoPatternTreeWithConfidence.table").  One ``filterFunction``
    task is queued on a process pool for each entry of ``inputPtnPath`` whose
    name contains ".json".  Each task result is a mapping whose keys also
    exist in ``properties``; its "tp"/"tn"/"fp"/"fn" values are accumulated
    with ``+=`` and the merged mapping is written as JSON to ``outputPath``.

    Args:
        inputPtnPath: directory that is listed for ".json" pattern files.
        outputPath: file that receives the merged result as JSON.
        pspath: JSON file read with ``projizz.jsonRead`` and handed to
            ``projizz.getSortedPatternStatistic``.
        inputPath: forwarded unchanged to ``filterFunction``.
    """
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"tn":[],"fp":[],"fn":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    ptnFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(ptnFiles):
        # every task gets its own empty answer structure
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath )))
    pool.close()
    pool.join()

    # res.get() re-raises any exception that happened inside a worker
    for res in result:
        r = res.get()
        for m in r:
            for key in ("tp","tn","fp","fn"):
                properties[m][key] += r[m][key]

    # print(...) with a single argument behaves the same as the print statement
    print("start write out to %s" % (outputPath))
    # the with-block closes (and flushes) the output file deterministically
    with open(outputPath,"w") as fout:
        json.dump(properties,fout)

    diff = datetime.now() - start_time
    # %06d zero-pads the microseconds (the old "%d.%d" showed 5s+1234us as
    # "5.1234"); days are folded in because timedelta.seconds excludes them.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
예제 #3
0
def main(inputPtnPath,outputPath,pspath,inputPath,confidence,outputFilename,nbcPath):
    """Run ``filterFunction`` on every pattern file and write one result per experiment key.

    One ``filterFunction`` task is queued on a process pool (at most 8
    workers) for each entry of ``inputPtnPath`` whose name contains ".json".
    Each task result maps an experiment key to a per-relation mapping; the
    "tp"/"fp"/"fn"/"et1"/"et2"/"et3" values are accumulated with ``+=`` per
    key (the relation "produced" is skipped).  Every key's merged mapping is
    written with ``projizz.jsonWrite`` to
    ``<outputPath>/<key>/<outputFilename>``.

    Args:
        inputPtnPath: directory that is listed for ".json" pattern files.
        outputPath: root output directory; created when missing.
        pspath: JSON file read with ``projizz.jsonRead`` and handed to
            ``projizz.getSortedPatternStatistic``.
        inputPath: forwarded unchanged to ``filterFunction``.
        confidence: forwarded unchanged to ``filterFunction``.
        outputFilename: file name used inside every per-key directory.
        nbcPath: forwarded unchanged to ``filterFunction``.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago//yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[],"et1":[],"et2":[],"et3":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    # never start more than 8 worker processes
    cpuCount = min(multiprocessing.cpu_count(), 8)

    pool = multiprocessing.Pool(processes=cpuCount)
    ptnFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(ptnFiles):
        # every task gets its own empty answer structure
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,nbcPath )))
    pool.close()
    pool.join()

    # res.get() re-raises any exception that happened inside a worker
    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:

            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                for key in ("tp","fp","fn","et1","et2","et3"):
                    expResult[keyname][m][key] += r[keyname][m][key]

    # makedirs also creates missing parent directories, where mkdir would fail
    if not os.path.isdir(outputPath):
        os.makedirs(outputPath)

    for keyname in expResult:
        keyDir = os.path.join(outputPath,keyname)
        if not os.path.isdir(keyDir):
            os.makedirs(keyDir)
        # announce the write before doing it so the message matches its wording
        print("start write out to %s" % (keyDir))
        projizz.jsonWrite(expResult[keyname],os.path.join(keyDir,outputFilename))

    diff = datetime.now() - start_time
    # %06d zero-pads the microseconds (the old "%d.%d" showed 5s+1234us as
    # "5.1234"); days are folded in because timedelta.seconds excludes them.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
예제 #4
0
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath,
         outputFilename):
    """Run ``mapper`` on every pattern file and write one result per VSM key.

    One ``mapper`` task is queued on a process pool for each entry of
    ``inputPtnPath`` whose name contains ".json".  Each task result maps a
    numeric key to a per-relation mapping; the "tp"/"fp"/"fn" values are
    accumulated with ``+=`` per key (the relation "produced" is skipped).
    Every key's merged mapping is written with ``projizz.jsonWrite`` to
    ``<outputPath>/vsm-<key>/<outputFilename>``.

    Args:
        inputPath: forwarded unchanged to ``mapper``.
        inputPtnPath: directory that is listed for ".json" pattern files.
        vsmPath: location handed to ``projizz.getVSMmodels``.
        confidence: forwarded unchanged to ``mapper``.
        psfile: JSON file read with ``projizz.jsonRead`` and handed to
            ``projizz.getSortedPatternStatistic``.
        outputPath: root output directory, passed to ``projizz.checkPath``.
        outputFilename: file name used inside every ``vsm-<key>`` directory.
    """
    # only the table is used here; the model half of the pair is ignored
    _unusedModel, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": []})
    domainRange = projizz.getYagoRelationDomainRange()
    idf, docs, lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(psfile))
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    ptnFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(ptnFiles):
        # every task gets its own empty answer structure
        partAns = copy.deepcopy(properties)
        result.append(
            pool.apply_async(
                mapper, (t, filename, inputPath, inputPtnPath, table, st,
                         partAns, domainRange, confidence, vsmData)))
    pool.close()
    pool.join()

    # res.get() re-raises any exception that happened inside a worker
    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:

            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                for key in ("tp", "fp", "fn"):
                    expResult[keyname][m][key] += r[keyname][m][key]

    for keyname in expResult:
        keyDir = os.path.join(outputPath, "vsm-%d" % (keyname))
        projizz.checkPath(keyDir)
        projizz.jsonWrite(expResult[keyname],
                          os.path.join(keyDir, outputFilename))
        print("start write out to %s" % (keyDir))

    diff = datetime.now() - start_time
    # %06d zero-pads the microseconds (the old "%d.%d" showed 5s+1234us as
    # "5.1234"); days are folded in because timedelta.seconds excludes them.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds,
                                     diff.microseconds))
def main(inputPtnPath,outputPath,pspath,inputPath,confidence,outputFilename):
    """Run ``filterFunction`` on every pattern file and write one result per experiment key.

    One ``filterFunction`` task is queued on a process pool for each entry of
    ``inputPtnPath`` whose name contains ".json".  Each task result maps an
    experiment key to a per-relation mapping; the
    "tp"/"fp"/"fn"/"et1"/"et2"/"et3" values are accumulated with ``+=`` per
    key (the relation "produced" is skipped).  Every key's merged mapping is
    written with ``projizz.jsonWrite`` to
    ``<outputPath>/<key>/<outputFilename>``.

    Args:
        inputPtnPath: directory that is listed for ".json" pattern files.
        outputPath: root output directory; created when missing.
        pspath: JSON file read with ``projizz.jsonRead`` and handed to
            ``projizz.getSortedPatternStatistic``.
        inputPath: forwarded unchanged to ``filterFunction``.
        confidence: forwarded unchanged to ``filterFunction``.
        outputFilename: file name used inside every per-key directory.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago//yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[],"et1":[],"et2":[],"et3":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    ptnFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(ptnFiles):
        # every task gets its own empty answer structure
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence )))
    pool.close()
    pool.join()

    # res.get() re-raises any exception that happened inside a worker
    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:

            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                for key in ("tp","fp","fn","et1","et2","et3"):
                    expResult[keyname][m][key] += r[keyname][m][key]

    # makedirs also creates missing parent directories, where mkdir would fail
    if not os.path.isdir(outputPath):
        os.makedirs(outputPath)

    for keyname in expResult:
        keyDir = os.path.join(outputPath,keyname)
        if not os.path.isdir(keyDir):
            os.makedirs(keyDir)
        # announce the write before doing it so the message matches its wording
        print("start write out to %s" % (keyDir))
        projizz.jsonWrite(expResult[keyname],os.path.join(keyDir,outputFilename))

    diff = datetime.now() - start_time
    # %06d zero-pads the microseconds (the old "%d.%d" showed 5s+1234us as
    # "5.1234"); days are folded in because timedelta.seconds excludes them.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
예제 #6
0
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath, outputFilename): 
    """Run ``mapper`` on every pattern file and write one result per VSM key.

    One ``mapper`` task is queued on a process pool for each entry of
    ``inputPtnPath`` whose name contains ".json".  Each task result maps a
    numeric key to a per-relation mapping; the "tp"/"fp"/"fn" values are
    accumulated with ``+=`` per key (the relation "produced" is skipped).
    Every key's merged mapping is written with ``projizz.jsonWrite`` to
    ``<outputPath>/vsm-<key>/<outputFilename>``.

    Args:
        inputPath: forwarded unchanged to ``mapper``.
        inputPtnPath: directory that is listed for ".json" pattern files.
        vsmPath: location handed to ``projizz.getVSMmodels``.
        confidence: forwarded unchanged to ``mapper``.
        psfile: JSON file read with ``projizz.jsonRead`` and handed to
            ``projizz.getSortedPatternStatistic``.
        outputPath: root output directory, passed to ``projizz.checkPath``.
        outputFilename: file name used inside every ``vsm-<key>`` directory.
    """
    # only the table is used here; the model half of the pair is ignored
    _unusedModel, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[]})
    domainRange = projizz.getYagoRelationDomainRange()
    idf,docs,lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic( projizz.jsonRead(psfile) )
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    ptnFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(ptnFiles):
        # every task gets its own empty answer structure
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(mapper, ( t, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData  )))
    pool.close()
    pool.join()

    # res.get() re-raises any exception that happened inside a worker
    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:

            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                for key in ("tp","fp","fn"):
                    expResult[keyname][m][key] += r[keyname][m][key]

    for keyname in expResult:
        keyDir = os.path.join(outputPath,"vsm-%d" % (keyname))
        projizz.checkPath(keyDir)
        projizz.jsonWrite(expResult[keyname],os.path.join(keyDir,outputFilename))
        print("start write out to %s" % (keyDir))

    diff = datetime.now() - start_time
    # %06d zero-pads the microseconds (the old "%d.%d" showed 5s+1234us as
    # "5.1234"); days are folded in because timedelta.seconds excludes them.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
예제 #7
0
def main(part,revid):
    """Print which patterns fire for one article revision and whether they are correct.

    The answer documents for ``revid`` are fetched from the
    ``result.yago.answer`` collection of the ``projizz`` MongoDB database, and
    grep locates the part file under the input directory that mentions the
    revision.  Only the first answer document is handled.  For every pattern
    line whose text contains a token of the entity name, the first entry of
    each pattern's sorted statistic is taken; when its support ratio is
    positive and its relation's domain is among the entity's types, the line
    and the pattern are printed, marked "(O)" if the relation is in the stored
    "observed" answers and "(X)" otherwise.

    Args:
        part: suffix used to build the part directories and the statistic
            file name.
        revid: revision id; used as the database query, as the grep needle
            and to build the "<revid>.txt" key inside the part files.
    """
    # Paths (on NLG workstation)
    inputPath = "/tmp2/ccli/y-part-%s/" % (part)
    inputPtnPath = "/tmp2/ccli/y-ptn-part-%s/" % (part)
    spPath = "../yago/yagoPSv2/ps.%s.json" % (part)

    # connect to database
    # pymongo.Connection no longer exists in PyMongo 3+; prefer MongoClient
    # when the installed driver offers it and fall back to Connection otherwise.
    clientFactory = getattr(pymongo, "MongoClient", None) or pymongo.Connection
    connect = clientFactory()
    db = connect.projizz
    collection = db.result.yago.answer
    itr = collection.find({"revid":revid})

    # find filename: only the first grep hit is needed, then the pipe is
    # closed explicitly instead of being left to garbage collection
    grepPipe = os.popen("grep -nr \"%s\" %s" % (revid,inputPath))
    try:
        a = grepPipe.readline()
    finally:
        grepPipe.close()
    if not a:
        # without a hit there is no part file to load
        print("RevID=%s was not found under %s" % (revid,inputPath))
        return
    targetFilename = a.split(":")[0].split("/")[-1]
    key = "%s.txt" % (revid)

    pattern = projizz.jsonRead(inputPtnPath+targetFilename)[key]
    article = projizz.articleSimpleSentenceFileter(projizz.jsonRead(inputPath+targetFilename)[key])
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath))
    domainRange = projizz.getYagoRelationDomainRange()
    # only the table is used here; the model half of the pair is ignored
    _unusedModel, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../yago/yagoPatternTree.table")

    print("Part %s, RevID=%s, in %s" % (part,revid,targetFilename))

    for ans in itr:

        targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]
        answers = ans["observed"]

        print("Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"),targetName))
        print("Type=%s" % (types))
        print("Answer=%s" % (answers))

        for line in pattern:
            lineText = article[line[0]]

            # No target name in line text -> go to next line.
            if not any(namedToken in lineText for namedToken in targetName):
                continue

            for ptn in line[1]:
                ptnId = "%d" % (ptn[0])
                if ptnId not in st:
                    continue

                for ps in st[ptnId]:
                    total = float(ps[1]["total"])
                    # a zero total would raise ZeroDivisionError; treat it as no support
                    if total and float(ps[1]["support"])/total > 0:
                        if domainRange[ps[0]]["domain"] in types:
                            print("#%d %s" % (line[0],lineText.encode("utf-8")))
                            isIn = "(O)" if ps[0] in answers else "(X)"
                            print("%s %s/%s/{%d,%d}/ %s" % (isIn,ptnId,table[ptnId]["pattern"],ps[1]["support"],ps[1]["total"],ps[0]))

                    # select top 1
                    break

        # prevent second ans
        break
예제 #8
0
def main(part, revid):
    """Print which patterns fire for one article revision and whether they are correct.

    The answer documents for ``revid`` are fetched from the
    ``result.yago.answer`` collection of the ``projizz`` MongoDB database, and
    grep locates the part file under the input directory that mentions the
    revision.  Only the first answer document is handled.  For every pattern
    line whose text contains a token of the entity name, the first entry of
    each pattern's sorted statistic is taken; when its support ratio is
    positive and its relation's domain is among the entity's types, the line
    and the pattern are printed, marked "(O)" if the relation is in the stored
    "properties" answers and "(X)" otherwise.

    Args:
        part: suffix used to build the part directories and the statistic
            file name.
        revid: revision id; used as the database query, as the grep needle
            and to build the "<revid>.txt" key inside the part files.
    """
    # Paths (on NLG workstation)
    inputPath = "/tmp2/ccli/yago-part-%s/" % (part)
    inputPtnPath = "/tmp2/ccli/yago-ptn-part-%s/" % (part)
    spPath = "../yago/yagoPSv1/ps.%s.json" % (part)

    # connect to database
    # pymongo.Connection no longer exists in PyMongo 3+; prefer MongoClient
    # when the installed driver offers it and fall back to Connection otherwise.
    clientFactory = getattr(pymongo, "MongoClient", None) or pymongo.Connection
    connect = clientFactory()
    db = connect.projizz
    collection = db.result.yago.answer
    itr = collection.find({"revid": revid})

    # find filename: only the first grep hit is needed, then the pipe is
    # closed explicitly instead of being left to garbage collection
    grepPipe = os.popen("grep -nr \"%s\" %s" % (revid, inputPath))
    try:
        a = grepPipe.readline()
    finally:
        grepPipe.close()
    if not a:
        # without a hit there is no part file to load
        print("RevID=%s was not found under %s" % (revid, inputPath))
        return
    targetFilename = a.split(":")[0].split("/")[-1]
    key = "%s.txt" % (revid)

    pattern = projizz.jsonRead(inputPtnPath + targetFilename)[key]
    article = projizz.articleSimpleSentenceFileter(
        projizz.jsonRead(inputPath + targetFilename)[key])
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath))
    domainRange = projizz.getYagoRelationDomainRange()
    # only the table is used here; the model half of the pair is ignored
    _unusedModel, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")

    print("Part %s, RevID=%s, in %s" % (part, revid, targetFilename))

    for ans in itr:

        targetName = ans["_id"].replace("(", "").replace(")", "").split(
            "_")  # get entity name's part
        types = ans["type"]
        answers = ans["properties"]

        print("Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"),
                                              targetName))
        print("Type=%s" % (types))
        print("Answer=%s" % (answers))

        for line in pattern:
            lineText = article[line[0]]

            # No target name in line text -> go to next line.
            if not any(namedToken in lineText for namedToken in targetName):
                continue

            for ptn in line[1]:
                ptnId = "%d" % (ptn[0])
                if ptnId not in st:
                    continue

                for ps in st[ptnId]:
                    total = float(ps[1]["total"])
                    # a zero total would raise ZeroDivisionError; treat it as no support
                    if total and float(ps[1]["support"]) / total > 0:
                        if domainRange[ps[0]]["domain"] in types:
                            print("#%d %s" % (line[0],
                                              lineText.encode("utf-8")))
                            isIn = "(O)" if ps[0] in answers else "(X)"
                            print("%s %s/%s/{%d,%d}/ %s" % (
                                isIn, ptnId, table[ptnId]["pattern"],
                                ps[1]["support"], ps[1]["total"], ps[0]))

                    # select top 1
                    break

        # prevent second ans
        break
예제 #9
0
# -*- coding:utf-8 -*-
# qcl

import projizz

# Relation table from projizz; each entry is read for its "domain" and "range".
dr = projizz.getYagoRelationDomainRange()

# relation name -> (domain, range, number of lines in the relation's file)
result = {}

for r in dr:
    if r == "produced":
        continue

    # Count the lines of the relation's file.  The with-block closes the
    # handle even when reading fails part-way through.
    l = 0
    with open("../patty/yagoSortedRela/%s.txt" % (r), "r") as f:
        for line in f:
            l += 1

    domain = dr[r]["domain"]
    rang   = dr[r]["range"]

    if "wl:" in rang:
        rang = "Thing"

    # Keep only the first two "_"-separated pieces and join them with a
    # literal backslash-underscore (presumably a LaTeX escape -- TODO confirm).
    # "\\_" spells the same two characters as the former invalid escape "\_".
    if "wordnet_" in domain:
        domain = domain.split("_")
        domain = "%s\\_%s" % (domain[0],domain[1])
    if "wordnet_" in rang:
        rang = rang.split("_")
        rang = "%s\\_%s" % (rang[0],rang[1])

    result[r] = (domain,rang,l)