Example No. 1
def main(inputPtnPath,outputPath,pspath,inputPath,confidence,outputFilename,nbcPath):
    
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable("../yago//yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[],"et1":[],"et2":[],"et3":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()


    start_time = datetime.now()

    cpuCount = multiprocessing.cpu_count()
    if cpuCount > 8:
        cpuCount = 8

    pool = multiprocessing.Pool(processes=cpuCount) 
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            #result.append(filterFunction(t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,classifiers ))
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,nbcPath )))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:

            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]
                expResult[keyname][m]["et1"] += r[keyname][m]["et1"]
                expResult[keyname][m]["et2"] += r[keyname][m]["et2"]
                expResult[keyname][m]["et3"] += r[keyname][m]["et3"]


    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for keyname in expResult:
        p = expResult[keyname]
        if not os.path.isdir(os.path.join(outputPath,keyname)):
            os.mkdir(os.path.join(outputPath,keyname))
        projizz.jsonWrite(p,os.path.join(outputPath,keyname,outputFilename))
        print "start write out to %s" % (os.path.join(outputPath,keyname))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Example No. 2
def preprocess(inputPath, inputPtnPath, outputPath):

    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool
    processorNumber = multiprocessing.cpu_count()
    if processorNumber > 20:
        processorNumber = 20
    pool = multiprocessing.Pool(processes=processorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(
                    mapper,
                    (t, filename, inputPath, inputPtnPath, model, table)))
            t += 1
    pool.close()
    pool.join()

    patternInstances = {}

    # Reducer
    for r in result:
        sibf = r.get()
        for key in sibf:
            for ptnId in sibf[key]:
                if not ptnId in patternInstances:
                    patternInstances[ptnId] = {}
                for rela in sibf[key][ptnId]:
                    for inst in sibf[key][ptnId][rela]:
                        if not rela in patternInstances[ptnId]:
                            patternInstances[ptnId][rela] = {}
                        if not key in patternInstances[ptnId][rela]:
                            patternInstances[ptnId][rela][key] = []
                        patternInstances[ptnId][rela][key].append(inst)

    # Write to files
    # NOTE
    # Output Format:
    # ptnId.json (json)
    # rela: keys
    #   key: line text
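    # Illustrative shape of one ptnId.json (relation, key, and text are made up):
    #   { "isMarriedTo": { "SomeArticleKey": ["a sentence that matched the pattern",
    #                                         "another matching sentence"] } }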
    for ptnId in patternInstances:
        projizz.jsonWrite(patternInstances[ptnId],
                          os.path.join(outputPath, "%s.json" % (ptnId)))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Exemplo n.º 3
0
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath,
         outputFilename):

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": []})
    domainRange = projizz.getYagoRelationDomainRange()
    idf, docs, lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(psfile))
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(
                pool.apply_async(
                    mapper, (t, filename, inputPath, inputPtnPath, table, st,
                             partAns, domainRange, confidence, vsmData)))
            #result.append( mapper( t, filename, inputPath, inputPtnPath, table, partAns, domainRange, confidence, vsmData  ))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:

            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]

    for keyname in expResult:
        p = expResult[keyname]
        keydirName = "vsm-%d" % (keyname)
        projizz.checkPath(os.path.join(outputPath, keydirName))
        projizz.jsonWrite(p,
                          os.path.join(outputPath, keydirName, outputFilename))
        print "start write out to %s" % (os.path.join(outputPath, keydirName))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Example No. 4
def preprocess(inputPath,inputPtnPath,outputPath):

    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool
    processorNumber = multiprocessing.cpu_count()
    if processorNumber > 20:
        processorNumber = 20
    pool = multiprocessing.Pool(processes=processorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputPath,inputPtnPath, model, table))  )
            t += 1
    pool.close()
    pool.join()

    patternInstances = {}

    # Reducer
    for r in result:
        sibf = r.get()
        for key in sibf:
            for ptnId in sibf[key]:
                if not ptnId in patternInstances:
                    patternInstances[ptnId] = {}
                for rela in sibf[key][ptnId]:
                    for inst in sibf[key][ptnId][rela]:
                        if not rela in patternInstances[ptnId]:
                            patternInstances[ptnId][rela] = {}
                        if not key in patternInstances[ptnId][rela]:
                            patternInstances[ptnId][rela][key] = []
                        patternInstances[ptnId][rela][key].append(inst)

    
    # Write to files
    # NOTE
    # Output Format:
    # ptnId.json (json)
    # rela: keys
    #   key: line text
    for ptnId in patternInstances:
        projizz.jsonWrite(patternInstances[ptnId],os.path.join(outputPath,"%s.json" % (ptnId))) 

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath,outputPath,pspath,inputPath,confidence,outputFilename):
    
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable("../yago//yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[],"et1":[],"et2":[],"et3":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) 
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence )))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:

            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]
                expResult[keyname][m]["et1"] += r[keyname][m]["et1"]
                expResult[keyname][m]["et2"] += r[keyname][m]["et2"]
                expResult[keyname][m]["et3"] += r[keyname][m]["et3"]


    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for keyname in expResult:
        p = expResult[keyname]
        if not os.path.isdir(os.path.join(outputPath,keyname)):
            os.mkdir(os.path.join(outputPath,keyname))
        projizz.jsonWrite(p,os.path.join(outputPath,keyname,outputFilename))
        print "start write out to %s" % (os.path.join(outputPath,keyname))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Exemplo n.º 6
0
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath, outputFilename): 
    
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[]})
    domainRange = projizz.getYagoRelationDomainRange()
    idf,docs,lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic( projizz.jsonRead(psfile) )
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) 
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(mapper, ( t, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData  )))
            #result.append( mapper( t, filename, inputPath, inputPtnPath, table, partAns, domainRange, confidence, vsmData  ))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:

            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]


    for keyname in expResult:
        p = expResult[keyname]
        keydirName = "vsm-%d" % (keyname)
        projizz.checkPath( os.path.join(outputPath,keydirName))
        projizz.jsonWrite(p,os.path.join(outputPath,keydirName,outputFilename))
        print "start write out to %s" % (os.path.join(outputPath,keydirName))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Exemplo n.º 7
0
def parseYagoData():
    
    phase = "used"
    
    if phase == "build":
        model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")
    else:
        model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "./yagoPatternTreeWithConfidence.table")
    
    # function testing.
    #test = "has appeared like [[num]]"
    ##test = "has appeared like [[num]"
    #i = projizz.naiveMatchPattern(test,model) 
    #print i


    a = table.keys()
    originL = len(a)

    ptnByRelation = {}

    

    for relation in projizz.getYagoRelation():
        if not phase == "build":
            break
        
        f = open("./yagoRela/%s.txt" % (relation))
        
        print relation

        text = f.readline()
        ptnSynsetTxt = text.split("\",\" ")[1:]
        ptnSynsetTxt = ptnSynsetTxt[:-1] + [ ptnSynsetTxt[-1][:-7] ]

        ptnByRelation[relation] = []
        

        evC = 0
        for text in ptnSynsetTxt:
            ptns = text.split("#")
            # ptns[1] : pattern synset id in patty
            # ptns[3] : pattern domain
            # ptns[4] : pattern plain text
            # ptns[5] : pattern range
            # ptns[6] : confidence
            # ptns[7] : support co-occurrence
            # ptns[8] : present only in some records; presumably an eval result.
            if len(ptns) > 8:
                evC += 1

            patterns = ptns[4].split(";%")
            patterns = patterns[:-1] + [patterns[-1][:-1]]

            for pattern in patterns:
                pid = projizz.naiveMatchPattern(pattern,model)
                if pid < 0:
                    pass
                    #print relation,pattern
                else:
                    pid = str(pid)
                    if pid in a:
                        a.remove(pid)
                    if not pid in ptnByRelation[relation]:
                        ptnByRelation[relation].append(pid)

                    if not relation in table[pid]["relations"]:
                        table[pid]["relations"].append(relation)
                        #print relation,pid,pattern

                    ptnS = table[pid]
                    if not "confidence" in ptnS:
                        table[pid]["confidence"] = float(ptns[6])
                        table[pid]["support"] = int(ptns[7])
                        table[pid]["used"] = True
            
                        if len(ptns) > 8:
                            if ptns[8] == "false":
                                table[pid]["eval"] = False
                                #print pid,table[pid]["relations"],pattern,ptns[8]
                            else:
                                table[pid]["eval"] = True

        f.close()

    if phase == "build":

        for pid in a:
            table[pid]["used"] = False
    
        for pid in table:
            if table[pid]["used"]:
                needRemove = []
                for relation in table[pid]["relations"]:
                    if not pid in ptnByRelation[relation]:
                        print pid,table[pid]["pattern"],relation
                        needRemove.append(relation)
                for p in needRemove:
                    table[pid]["relations"].remove(p)
                if len(table[pid]["relations"]) == 0:
                    print pid,table[pid]["pattern"],"!!!"
            else:
                pass

        projizz.jsonWrite(table,"./yagoPatternTreeWithConfidence.table")

    else:
        c = 0
        used = 0
        for pid in table:
            if table[pid]["used"]:
                # If an eval flag (true/false) is present, keep only patterns whose eval is True
                if "eval" in table[pid]:
                    if not table[pid]["eval"]:
                        continue
                used += 1
                for relation in table[pid]["relations"]:
                    if not relation in ptnByRelation:
                        ptnByRelation[relation] = []
                    if not pid in ptnByRelation[relation]:
                        ptnByRelation[relation].append(pid)
            else:
                c += 1

    # Some quick statistics
    #for relation in ptnByRelation:
    #    print relation,len(ptnByRelation[relation])
    
    # Find each relation's highest confidence value (i.e. it cannot go any higher),
    # then take the smallest of those per-relation maxima

    minC = 1.0
    minCR = ""
    for relation in ptnByRelation:
        c75 = 0
        c50 = 0
        ptns = []
        for pid in ptnByRelation[relation]:
            ptns.append(table[pid])
            ptns[-1]["pid"] = pid
        ptns.sort(key=lambda x:x["confidence"],reverse=True)
        if ptns[0]["confidence"] < minC:
            minC = ptns[0]["confidence"]
            minCR = relation
        
        #print relation,ptns[0]
        f = open("./yagoSortedRela/%s.txt" % (relation),"w")
        for ptn in ptns:
            if ptn["confidence"] > .75:
                c75 += 1
            if ptn["confidence"] > .5:
                c50 += 1
            f.write("%s\t%s\t%.3f\t%d\t%s\n" % (ptn["pid"],ptn["pattern"],ptn["confidence"],ptn["support"],ptn["relations"]))
        f.close()

        print relation,len(ptns),c75,c50

    print minCR,minC,"pattern used:",used
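
Each relation file parsed above is a line of "#"-delimited PATTY records, and the loop indexes the fields by position as listed in the comment block inside the function. As a worked illustration, here is one hypothetical record parsed into a dict (the sample string is invented, not real PATTY data):

sample = "x#12345#x#person#was born in [[loc]];%#location#0.87#153#true"
ptns = sample.split("#")
record = {
    "synsetId": ptns[1],
    "domain": ptns[3],
    "patterns": [p.rstrip(";%") for p in ptns[4].split(";%") if p],
    "range": ptns[5],
    "confidence": float(ptns[6]),
    "support": int(ptns[7]),
    "eval": (ptns[8] != "false") if len(ptns) > 8 else None,
}
print(record)
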
Example No. 8
def mapper(jobid, filename, inputPath, topN, outputPath, model, table):

    # Read article
    article = projizz.jsonRead(os.path.join(inputPath, filename))

    stemmer = PorterStemmer()
    tks = {}

    print "Worker %d : Read %s into filter" % (jobid, filename)

    count = 0
    total = 0
    for line in article:
        count += 1
        tokens = projizz.getTokens(line)

        for token in tokens:
            t = stemmer.stem(token)

            if t not in tks:
                tks[t] = 0

            tks[t] += 1
            total += 1

        if count % 1000 == 0:
            print "worker %d done %d lines" % (jobid, count)

    # Remove stopwords
    for sw in projizz._stopwords:
        _sw = stemmer.stem(sw)
        if _sw in tks:
            total -= tks[_sw]
            tks.pop(_sw)

    needRemove = []
    maxTF = 1
    for t in tks:
        # ignore words that occur only once
        if tks[t] <= 1:
            needRemove.append(t)
            total -= tks[t]
            continue

        # ignore tokens that contain a digit
        if any(d in t for d in "0123456789"):
            needRemove.append(t)
            total -= tks[t]
            continue

        #if tks[t] > maxTF:
        #    maxTF = tks[t]

    for rm in needRemove:
        tks.pop(rm)

    projizz.jsonWrite(
        tks, os.path.join(outputPath, filename.replace(".json", ".tfc")))

    ### select top N words
    # sort by tfc
    sortedTks = sorted(tks.items(), key=lambda x: x[1], reverse=True)
    tks = {}
    maxTF = sortedTks[0][1]
    # Calculate tf
    top = 0
    for t, c in sortedTks:
        top += 1
        tks[t] = float(c) / float(maxTF)
        if top == topN:
            break

    projizz.jsonWrite(
        tks, os.path.join(outputPath, filename.replace(".json", ".tf")))
    print "worker %d write out." % (jobid)

    return (filename, tks)
Example No. 9
def preprocess(inputPath, topN, outputPath):

    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool
    processorNumber = multiprocessing.cpu_count()
    if processorNumber > 20:
        processorNumber = 20
    pool = multiprocessing.Pool(processes=processorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(
                    mapper,
                    (t, filename, inputPath, topN, outputPath, model, table)))
            t += 1
    pool.close()
    pool.join()

    words = {}
    idf = {}
    tfs = {}

    # Reducer - DF
    types = 0
    for r in result:
        fn, tks = r.get()
        tfs[fn] = tks
        types += 1

        for t in tks:
            if t not in words:
                words[t] = 0
            words[t] += 1

    print "Doc#", types, "words#", len(words)

    projizz.jsonWrite(words, os.path.join(outputPath, "documentFreq.df"))

    # Calculate idf
    for w in words:
        if words[w] == 0:
            continue

        idf[w] = math.log(float(types) / float(words[w]), 10)

    projizz.jsonWrite(idf, os.path.join(outputPath, "idf.idf"))
    print "Write out idf file"

    # Calculate tf-idf weight
    for fn in tfs:
        tks = tfs[fn]
        weight = {}
        for t in tks:
            tf = tks[t]
            if t not in idf:
                continue

            weight[t] = tf * idf[t]

        projizz.jsonWrite(weight, os.path.join(outputPath, fn))
        print "build", fn, "tf-idf weight"

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Example No. 10
def main(inputPath, inputPtnPath, outputPath, outputPtnPath):

    debug = False

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)
    if not os.path.isdir(outputPtnPath):
        os.mkdir(outputPtnPath)

    result = []
    count = 0

    # Update answer
    cpuN = multiprocessing.cpu_count()
    print "CoreNumber = %d" % (cpuN)
    pool = multiprocessing.Pool(processes=12)
    t = 0
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            t += 1
            if debug:
                result.append(updateAnswer(t, inputPath, filename))
            else:
                result.append(
                    pool.apply_async(updateAnswer, (t, inputPath, filename)))

    pool.close()
    pool.join()

    # Rebuild articles and patterns

    tmpArticle = {}
    tmpPtn = {}

    dataSize = 0
    for res in result:
        if debug:
            filename, articles = res
        else:
            filename, articles = res.get()

        print filename, len(articles)
        a = projizz.jsonRead(os.path.join(inputPath, filename))
        p = projizz.jsonRead(os.path.join(inputPtnPath, filename))

        for key in articles:
            dataSize += 1
            tmpArticle[key] = a[key]
            tmpPtn[key] = p[key]

            if len(tmpPtn) == 1000:
                print "write to %05d.json" % (count)
                projizz.jsonWrite(
                    tmpArticle, os.path.join(outputPath,
                                             "%05d.json" % (count)))
                projizz.jsonWrite(
                    tmpPtn, os.path.join(outputPtnPath, "%05d.json" % (count)))
                tmpArticle = {}
                tmpPtn = {}
                count += 1

    if len(tmpPtn) > 0:
        print "write to %05d.json" % (count)
        projizz.jsonWrite(tmpArticle,
                          os.path.join(outputPath, "%05d.json" % (count)))
        projizz.jsonWrite(tmpPtn,
                          os.path.join(outputPtnPath, "%05d.json" % (count)))
        tmpArticle = {}
        tmpPtn = {}
        count += 1

    # Split to 5
    splitTo5part("/tmp2/r01922024", "y-all", "/tmp2/r01922024", "y")
    splitTo5part("/tmp2/r01922024", "y-ptn-all", "/tmp2/r01922024", "y-ptn")

    print "write %d files. (%d)" % (count, dataSize)
Example No. 11
def generate(inputSPIpath, inputTestPath, outputVSMpath, confidence):

    # Checking output path
    projizz.checkPath(outputVSMpath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool
    processorNumber = multiprocessing.cpu_count()
    if processorNumber > 20:
        processorNumber = 20
    pool = multiprocessing.Pool(processes=processorNumber)

    # Collect keys that are not used for model building
    # (they belong to the held-out fold of the 5-fold CV)
    t = 0
    result = []
    for filename in os.listdir(inputTestPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(mapper, (t, filename, inputTestPath)))
            t += 1
    pool.close()
    pool.join()

    notUsedKeys = []
    for r in result:
        ks = r.get()
        notUsedKeys += ks

    ### Build Model
    # Pattern Selection
    modelArticles = projizz.buildYagoProperties([])
    words = []
    count = 0
    for filename in os.listdir(inputSPIpath):
        if ".json" in filename:
            ptnId = filename[:-5]

            # ignore invalid patterns
            if not projizz.isPatternValidate(
                    ptnId, table, confidence=confidence):
                continue

            count += 1
            print count, ptnId

            ptnInstance = projizz.jsonRead(os.path.join(
                inputSPIpath, filename))
            for rela in ptnInstance:
                for key in ptnInstance[rela]:
                    # ignore keys that belong to the testing data
                    if key in notUsedKeys:
                        continue

                    for line in ptnInstance[rela][key]:
                        modelArticles[rela].append(line)

            if count % 100 == 0:
                print "Read", count, "files"

    for relation in modelArticles:
        print relation
        projizz.jsonWrite(modelArticles[relation],
                          os.path.join(outputVSMpath, "%s.txt" % (relation)))
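
The notUsedKeys collected above are the article keys of the held-out fold in the 5-fold cross-validation. The project's own splitTo5part (called in Example No. 10) is not shown; a generic, purely positional 5-fold split over keys could look like this sketch.

def fiveFolds(keys):
    folds = [[] for _ in range(5)]
    for i, key in enumerate(sorted(keys)):
        folds[i % 5].append(key)
    return folds

# One fold serves as the test set; its keys become the notUsedKeys that the
# model-building pass above skips.
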
Example No. 12
def generate(inputSPIpath,inputTestPath,outputVSMpath,confidence):
    
    # Checking output path
    projizz.checkPath(outputVSMpath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool
    processorNumber = multiprocessing.cpu_count()
    if processorNumber > 20:
        processorNumber = 20
    pool = multiprocessing.Pool(processes=processorNumber)

    # Collect keys that are not used for model building
    # (they belong to the held-out fold of the 5-fold CV)
    t = 0
    result = []
    for filename in os.listdir(inputTestPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputTestPath) )  )
            t += 1
    pool.close()
    pool.join()

    notUsedKeys = []
    for r in result:
        ks = r.get()
        notUsedKeys += ks

    ### Build Model
    # Pattern Selection
    modelArticles = projizz.buildYagoProperties([])
    words = []
    count = 0
    for filename in os.listdir(inputSPIpath):
        if ".json" in filename:
            ptnId = filename[:-5]

            # ignore invalid patterns
            if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                continue

            count += 1
            print count,ptnId

            ptnInstance = projizz.jsonRead( os.path.join(inputSPIpath,filename) )
            for rela in ptnInstance:
                for key in ptnInstance[rela]:
                    # ignore keys that belong to the testing data
                    if key in notUsedKeys:
                        continue

                    for line in ptnInstance[rela][key]:
                        modelArticles[rela].append(line)
    
            if count%100 == 0:
                print "Read",count,"files"

    for relation in modelArticles:
        print relation
        projizz.jsonWrite(modelArticles[relation],os.path.join(outputVSMpath,"%s.txt" % (relation)))
Example No. 13
def preprocess(inputPath,outputPath):

    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool
    processorNumber = multiprocessing.cpu_count()
    if processorNumber > 20:
        processorNumber = 20
    pool = multiprocessing.Pool(processes=processorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputPath,outputPath, model, table))  )
            t += 1
    pool.close()
    pool.join()

    words = {}
    idf = {}
    tfs = {}

    # Reducer - DF
    types = 0
    for r in result:
        fn,tks = r.get()
        tfs[fn] = tks
        types += 1

        for t in tks:
            if t not in words:
                words[t] = 0
            words[t] += 1

    print "Doc#",types,"words#",len(words)

    projizz.jsonWrite(words,os.path.join(outputPath,"documentFreq.df"))

    # Calculate idf
    for w in words:
        if words[w] == 0:
            continue

        idf[w] = math.log(float(types)/float(words[w]),10)
    
    projizz.jsonWrite(idf,os.path.join(outputPath,"idf.idf"))
    print "Write out idf file"

    # Calculate tf-idf weight
    for fn in tfs:
        tks = tfs[fn]
        weight = {}
        for t in tks:
            tf = tks[t]
            if t not in idf:
                continue
            
            weight[t] = tf * idf[t]

        projizz.jsonWrite(weight,os.path.join(outputPath,fn))
        print "build",fn,"tf-idf weight"


    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Exemplo n.º 14
0
def mapper(jobid,filename,inputPath,outputPath,model,table):

    # Read article
    article = projizz.jsonRead( os.path.join(inputPath,filename) )

    stemmer = PorterStemmer()
    tks = {}

    print "Worker %d : Read %s into filter" % (jobid,filename)

    count = 0
    total = 0
    for line in article:
        count += 1
        tokens = projizz.getTokens(line)

        for token in tokens:
            t = stemmer.stem(token) 

            if t not in tks:
                tks[t] = 0

            tks[t] += 1
            total += 1

        if count % 1000 == 0:
            print "worker %d done %d lines" % (jobid,count)


    # Remove stopwords
    for sw in projizz._stopwords:
        _sw = stemmer.stem(sw)
        if _sw in tks:
            total -= tks[_sw]
            tks.pop(_sw)
        
    needRemove = []
    maxTF = 0
    for t in tks:
        # ignore words that occur only once
        if tks[t] <= 1:
            needRemove.append(t)
            total -= tks[t]
            continue

        # ignore tokens that contain a digit
        if any(d in t for d in "0123456789"):
            needRemove.append(t)
            total -= tks[t]
            continue

        if tks[t] > maxTF:
            maxTF = tks[t]

    for rm in needRemove:
        tks.pop(rm)

    projizz.jsonWrite(tks,os.path.join(outputPath,filename.replace(".json",".tfc")))
    
    # Calculate tf
    for t in tks:
        tc = tks[t]
        tks[t] = float(tc)/float(maxTF)

    projizz.jsonWrite(tks,os.path.join(outputPath,filename.replace(".json",".tf")))
    print "worker %d write out." % (jobid)

    return (filename,tks)
Example No. 15
def preprocess(inputPath,inputPtnPath,outputPath,confidence):

    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool
    processorNumber = multiprocessing.cpu_count()
    if processorNumber > 20:
        processorNumber = 20
    pool = multiprocessing.Pool(processes=processorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputPath,inputPtnPath, model, table, confidence))  )
            #result.append( mapper(t,filename,inputPath,inputPtnPath, model, table, confidence))
            t += 1
    pool.close()
    pool.join()

    modelArticles = {}
    negArticles = {}

    POSArticles = {}
    NEGArticles = {}

    # Reducer
    for r in result:
        sibr, osibr, p, n = r.get()

        for rela in sibr:
            if not rela in modelArticles:
                modelArticles[rela] = []
            modelArticles[rela] += sibr[rela]

        for rela in osibr:
            if not rela in negArticles:
                negArticles[rela] = []
            negArticles[rela] += osibr[rela]

        for rela in p:
            if not rela in POSArticles:
                POSArticles[rela] = []
            POSArticles[rela] += p[rela]

        for rela in n:
            if not rela in NEGArticles:
                NEGArticles[rela] = []
            NEGArticles[rela] += n[rela]

    #
    #   relation.json: [line, line, line, ....]
    #

    for rela in modelArticles:
        print rela
        projizz.jsonWrite(modelArticles[rela],os.path.join(outputPath,"%s.json" % (rela))) 

    for rela in negArticles:
        print rela
        projizz.jsonWrite(negArticles[rela],os.path.join(outputPath,"%s.other" % (rela)))

    for rela in POSArticles:
        print rela
        projizz.jsonWrite(POSArticles[rela],os.path.join(outputPath,"%s.pos" % (rela)))
        
    for rela in NEGArticles:
        print rela
        projizz.jsonWrite(NEGArticles[rela],os.path.join(outputPath,"%s.neg" % (rela)))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Exemplo n.º 16
0
def preprocess(inputPath, inputPtnPath, outputPath, confidence):

    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool
    processorNumber = multiprocessing.cpu_count()
    if processorNumber > 20:
        processorNumber = 20
    pool = multiprocessing.Pool(processes=processorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(mapper, (t, filename, inputPath, inputPtnPath,
                                          model, table, confidence)))
            #result.append( mapper(t,filename,inputPath,inputPtnPath, model, table, confidence))
            t += 1
    pool.close()
    pool.join()

    modelArticles = {}
    negArticles = {}

    POSArticles = {}
    NEGArticles = {}

    # Reducer
    for r in result:
        sibr, osibr, p, n = r.get()

        for rela in sibr:
            if not rela in modelArticles:
                modelArticles[rela] = []
            modelArticles[rela] += sibr[rela]

        for rela in osibr:
            if not rela in negArticles:
                negArticles[rela] = []
            negArticles[rela] += osibr[rela]

        for rela in p:
            if not rela in POSArticles:
                POSArticles[rela] = []
            POSArticles[rela] += p[rela]

        for rela in n:
            if not rela in NEGArticles:
                NEGArticles[rela] = []
            NEGArticles[rela] += n[rela]

    #
    #   relation.json: [line, line, line, ....]
    #

    for rela in modelArticles:
        print rela
        projizz.jsonWrite(modelArticles[rela],
                          os.path.join(outputPath, "%s.json" % (rela)))

    for rela in negArticles:
        print rela
        projizz.jsonWrite(negArticles[rela],
                          os.path.join(outputPath, "%s.other" % (rela)))

    for rela in POSArticles:
        print rela
        projizz.jsonWrite(POSArticles[rela],
                          os.path.join(outputPath, "%s.pos" % (rela)))

    for rela in NEGArticles:
        print rela
        projizz.jsonWrite(NEGArticles[rela],
                          os.path.join(outputPath, "%s.neg" % (rela)))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Example No. 17
def parseYagoData():

    phase = "used"

    if phase == "build":
        model, table = projizz.readPrefixTreeModelWithTable(
            "../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")
    else:
        model, table = projizz.readPrefixTreeModelWithTable(
            "../yago/yagoPatternTree.model",
            "./yagoPatternTreeWithConfidence.table")

    # function testing.
    #test = "has appeared like [[num]]"
    ##test = "has appeared like [[num]"
    #i = projizz.naiveMatchPattern(test,model)
    #print i

    a = table.keys()
    originL = len(a)

    ptnByRelation = {}

    for relation in projizz.getYagoRelation():
        if not phase == "build":
            break

        f = open("./yagoRela/%s.txt" % (relation))

        print relation

        text = f.readline()
        ptnSynsetTxt = text.split("\",\" ")[1:]
        ptnSynsetTxt = ptnSynsetTxt[:-1] + [ptnSynsetTxt[-1][:-7]]

        ptnByRelation[relation] = []

        evC = 0
        for text in ptnSynsetTxt:
            ptns = text.split("#")
            # ptns[1] : pattern synset id in patty
            # ptns[3] : pattern domain
            # ptns[4] : pattern plain text
            # ptns[5] : pattern range
            # ptns[6] : confidence
            # ptns[7] : support co-occurrence
            # ptns[8] : present only in some records; presumably an eval result.
            if len(ptns) > 8:
                evC += 1

            patterns = ptns[4].split(";%")
            patterns = patterns[:-1] + [patterns[-1][:-1]]

            for pattern in patterns:
                pid = projizz.naiveMatchPattern(pattern, model)
                if pid < 0:
                    pass
                    #print relation,pattern
                else:
                    pid = str(pid)
                    if pid in a:
                        a.remove(pid)
                    if not pid in ptnByRelation[relation]:
                        ptnByRelation[relation].append(pid)

                    if not relation in table[pid]["relations"]:
                        table[pid]["relations"].append(relation)
                        #print relation,pid,pattern

                    ptnS = table[pid]
                    if not "confidence" in ptnS:
                        table[pid]["confidence"] = float(ptns[6])
                        table[pid]["support"] = int(ptns[7])
                        table[pid]["used"] = True

                        if len(ptns) > 8:
                            if ptns[8] == "false":
                                table[pid]["eval"] = False
                                #print pid,table[pid]["relations"],pattern,ptns[8]
                            else:
                                table[pid]["eval"] = True

        f.close()

    if phase == "build":

        for pid in a:
            table[pid]["used"] = False

        for pid in table:
            if table[pid]["used"]:
                needRemove = []
                for relation in table[pid]["relations"]:
                    if not pid in ptnByRelation[relation]:
                        print pid, table[pid]["pattern"], relation
                        needRemove.append(relation)
                for p in needRemove:
                    table[pid]["relations"].remove(p)
                if len(table[pid]["relations"]) == 0:
                    print pid, table[pid]["pattern"], "!!!"
            else:
                pass

        projizz.jsonWrite(table, "./yagoPatternTreeWithConfidence.table")

    else:
        c = 0
        used = 0
        for pid in table:
            if table[pid]["used"]:
                # If an eval flag (true/false) is present, keep only patterns whose eval is True
                if "eval" in table[pid]:
                    if not table[pid]["eval"]:
                        continue
                used += 1
                for relation in table[pid]["relations"]:
                    if not relation in ptnByRelation:
                        ptnByRelation[relation] = []
                    if not pid in ptnByRelation[relation]:
                        ptnByRelation[relation].append(pid)
            else:
                c += 1

    # Some quick statistics
    #for relation in ptnByRelation:
    #    print relation,len(ptnByRelation[relation])

    # Find each relation's highest confidence value (i.e. it cannot go any higher),
    # then take the smallest of those per-relation maxima

    minC = 1.0
    minCR = ""
    for relation in ptnByRelation:
        c75 = 0
        c50 = 0
        ptns = []
        for pid in ptnByRelation[relation]:
            ptns.append(table[pid])
            ptns[-1]["pid"] = pid
        ptns.sort(key=lambda x: x["confidence"], reverse=True)
        if ptns[0]["confidence"] < minC:
            minC = ptns[0]["confidence"]
            minCR = relation

        #print relation,ptns[0]
        f = open("./yagoSortedRela/%s.txt" % (relation), "w")
        for ptn in ptns:
            if ptn["confidence"] > .75:
                c75 += 1
            if ptn["confidence"] > .5:
                c50 += 1
            f.write("%s\t%s\t%.3f\t%d\t%s\n" %
                    (ptn["pid"], ptn["pattern"], ptn["confidence"],
                     ptn["support"], ptn["relations"]))
        f.close()

        print relation, len(ptns), c75, c50

    print minCR, minC, "pattern used:", used
Example No. 18
def main(inputPath,inputPtnPath,outputPath,outputPtnPath):

    debug = False

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)
    if not os.path.isdir(outputPtnPath):
        os.mkdir(outputPtnPath)

    result = []
    count = 0

    # Update answer
    cpuN = multiprocessing.cpu_count()
    print "CoreNumber = %d" % (cpuN)
    pool = multiprocessing.Pool(processes=12) 
    t = 0
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            t += 1
            if debug:
                result.append(updateAnswer(t,inputPath,filename))
            else:
                result.append(pool.apply_async(updateAnswer, (t,inputPath,filename)))
    
    pool.close()
    pool.join()

    # Rebuild articles and patterns

    tmpArticle = {}
    tmpPtn = {}

    dataSize = 0
    for res in result:
        if debug:
            filename,articles = res
        else:
            filename,articles = res.get()

        print filename,len(articles)
        a = projizz.jsonRead(os.path.join(inputPath,filename))
        p = projizz.jsonRead(os.path.join(inputPtnPath,filename))

        for key in articles:
            dataSize += 1
            tmpArticle[key] = a[key]
            tmpPtn[key] = p[key]

            if len(tmpPtn) == 1000:
                print "write to %05d.json" % (count)
                projizz.jsonWrite(tmpArticle,os.path.join(outputPath,"%05d.json" % (count)))
                projizz.jsonWrite(tmpPtn,os.path.join(outputPtnPath,"%05d.json" % (count)))
                tmpArticle = {}
                tmpPtn = {}
                count += 1

    if len(tmpPtn) > 0:
        print "write to %05d.json" % (count)
        projizz.jsonWrite(tmpArticle,os.path.join(outputPath,"%05d.json" % (count)))
        projizz.jsonWrite(tmpPtn,os.path.join(outputPtnPath,"%05d.json" % (count)))
        tmpArticle = {}
        tmpPtn = {}
        count += 1

    # Split to 5 
    splitTo5part("/tmp2/r01922024","y-all","/tmp2/r01922024","y")
    splitTo5part("/tmp2/r01922024","y-ptn-all","/tmp2/r01922024","y-ptn")
    
    print "write %d files. (%d)" % (count,dataSize)