Пример #1
0
def tryToFindRela(jobid, filename, dataInputPath, ptnOutputPath, model, table):
    """Extract patterns from each article of one combined file and save them.

    The file ``filename`` is read from ``dataInputPath``; every article is
    split by ``projizz.articleSimpleSentenceFileter``, each resulting line is
    POS-tagged and passed to ``projizz.naiveExtractPatterns`` together with
    ``model``.  For every article a list of ``(line index, extraction result)``
    tuples is kept (only for lines with a non-empty result) and the mapping
    ``article name -> list`` is written to ``ptnOutputPath`` under the same
    file name.

    ``jobid`` is only used to label progress messages.  ``table`` is accepted
    but not used by this function.
    """
    sourcePath = os.path.join(dataInputPath, filename)
    articles = projizz.combinedFileReader(sourcePath)
    print("Worker %d : Read %s into filter" % (jobid, filename))

    fileTotal = 0
    lineTotal = 0  # running line count across all articles, for progress only
    extracted = {}
    for articleName in articles:
        sentences = projizz.articleSimpleSentenceFileter(articles[articleName])
        hits = []
        for lineNo, sentence in enumerate(sentences):
            lineTotal += 1
            tagged = projizz._posTagger.tag(sentence)
            found = projizz.naiveExtractPatterns(tagged, model)
            if len(found) > 0:
                hits.append((lineNo, found))
            if lineTotal % 10000 == 0:
                print("Worker %d deal with %d lines." % (jobid, lineTotal))

        extracted[articleName] = hits
        fileTotal += 1
        if fileTotal % 100 == 0:
            print("Worker %d deal with %d files" % (jobid, fileTotal))
            # Periodic explicit collection, once per 100 processed articles.
            gc.collect()

    targetPath = os.path.join(ptnOutputPath, filename)
    projizz.combinedFileWriter(extracted, targetPath)
    print("Worker %d : Write results out to %s." % (jobid, filename))
Пример #2
0
def tryToFindRela(jobid, filename, dataInputPath, ptnOutputPath, model, table):
    """Run pattern extraction over one combined file and write the result.

    Reads ``filename`` from ``dataInputPath``, tags every line returned by
    ``projizz.articleSimpleSentenceFileter`` for each article, and collects
    the output of ``projizz.naiveExtractPatterns`` for lines where it is
    non-empty.  The per-article lists of ``(line index, extraction result)``
    are written to ``ptnOutputPath`` using the same file name.

    ``jobid`` only appears in the progress messages; ``table`` is unused here.
    """
    inputFile = os.path.join(dataInputPath, filename)
    combined = projizz.combinedFileReader(inputFile)
    print("Worker %d : Read %s into filter" % (jobid, filename))

    doneArticles = 0
    doneLines = 0
    perArticle = {}
    for articleName in combined:
        matches = []
        lines = projizz.articleSimpleSentenceFileter(combined[articleName])
        for index, text in enumerate(lines):
            doneLines += 1
            posTokens = projizz._posTagger.tag(text)
            extraction = projizz.naiveExtractPatterns(posTokens, model)
            if len(extraction) > 0:
                entry = (index, extraction)
                matches.append(entry)
            if doneLines % 10000 == 0:
                print("Worker %d deal with %d lines." % (jobid, doneLines))

        perArticle[articleName] = matches
        doneArticles += 1
        if doneArticles % 100 == 0:
            print("Worker %d deal with %d files" % (jobid, doneArticles))
            # Explicit garbage collection every 100 articles.
            gc.collect()

    outputFile = os.path.join(ptnOutputPath, filename)
    projizz.combinedFileWriter(perArticle, outputFile)
    print("Worker %d : Write results out to %s." % (jobid, filename))
def combinedFileViewer(filename):
    """Print every article of a combined file to standard output.

    For each article the name is printed, then an empty line, then each line
    of the article, then a ``----`` separator.  Names and lines are encoded
    as UTF-8 before printing.
    """
    articles = projizz.combinedFileReader(filename)

    for name in articles:
        body = articles[name]
        print(name.encode("utf-8"))
        print("")
        for text in body:
            print(text.encode("utf-8"))
        print("----")
Пример #4
0
def testing(filename):
    """Time POS tagging over every line of every article in a combined file.

    The file is read with ``projizz.combinedFileReader``; each article is
    split with ``projizz.articleSimpleLineFileter`` and every resulting line
    is tagged.  The tagger output is discarded; only the elapsed wall-clock
    time is printed.
    """
    content = projizz.combinedFileReader(filename)

    start_time = datetime.now()
    for articleName in content:
        article = projizz.articleSimpleLineFileter(content[articleName])

        for line in article:
            projizz._posTagger.tag(line)

    diff = datetime.now() - start_time
    # timedelta keeps days, seconds and microseconds separately.  Fold the
    # days into the seconds and zero-pad the microseconds to six digits,
    # otherwise e.g. 1 s + 5000 us would be shown as "1.5000".
    whole_seconds = diff.days * 86400 + diff.seconds
    print("Spend %d.%06d seconds" % (whole_seconds, diff.microseconds))
Пример #5
0
def testing(filename):
    """Measure how long it takes to POS-tag a whole combined file.

    Every article read by ``projizz.combinedFileReader`` is split with
    ``projizz.articleSimpleLineFileter`` and each line is passed to the
    tagger.  Tagging results are thrown away; the elapsed time is printed
    at the end.
    """
    content = projizz.combinedFileReader(filename)

    start_time = datetime.now()
    for articleName in content:
        article = projizz.articleSimpleLineFileter(content[articleName])

        for line in article:
            projizz._posTagger.tag(line)

    diff = datetime.now() - start_time
    # The microsecond field is a fraction of a second and therefore needs
    # six zero-padded digits; the days field is folded into the seconds so
    # that long runs are not under-reported.
    whole_seconds = diff.days * 86400 + diff.seconds
    print("Spend %d.%06d seconds" % (whole_seconds, diff.microseconds))
Пример #6
0
def tryToFindRela(jobid, filename, dataInputPath, resultOutPath, ptnOutputPath, model, tree):
    """Count relations and collect pattern ids per article of a combined file.

    ``filename`` is read from ``dataInputPath``.  For each article, every
    line returned by ``projizz.articleSimpleSentenceFileter`` is POS-tagged
    and passed to ``projizz.naiveExtractPatterns`` with ``model``.  For each
    extracted ``(ptnId, start, to)``:

    * ``tree[ptnId]["relations"]`` is looked up; if it holds fewer than two
      relations, each of them has its counter incremented for the article;
    * ``ptnId`` is recorded once per article, in first-seen order.

    The per-article relation counts are written to ``resultOutPath`` and the
    per-article pattern id lists to ``ptnOutputPath``, both under
    ``filename``.  ``jobid`` is only used in progress messages.
    """
    content = projizz.combinedFileReader(os.path.join(dataInputPath, filename))
    print("Worker %d : Read %s into filter" % (jobid, filename))
    count = 0
    dealL = 0
    results = {}
    patternEx = {}
    for articleName in content:
        result = {}
        pattern = []
        # Companion set for O(1) duplicate checks; ``pattern`` keeps the order.
        seenPtnIds = set()
        article = projizz.articleSimpleSentenceFileter(content[articleName])
        for line in article:
            tokens = projizz._posTagger.tag(line)
            patternExtracted = projizz.naiveExtractPatterns(tokens, model)

            for ptnId, start, to in patternExtracted:
                # NOTE(review): this counter advances once per extracted
                # pattern, although the progress message below says "lines".
                dealL += 1
                rels = tree[ptnId]["relations"]

                # Only patterns tied to fewer than two relations are counted.
                if len(rels) < 2:
                    for r in rels:
                        result[r] = result.get(r, 0) + 1

                if ptnId not in seenPtnIds:
                    seenPtnIds.add(ptnId)
                    pattern.append(ptnId)

                if dealL % 10000 == 0:
                    print("Worker %d deal with %d lines." % (jobid, dealL))

        results[articleName] = result
        patternEx[articleName] = pattern
        count += 1
        if count % 100 == 0:
            print("Worker %d deal with %d files" % (jobid, count))
            gc.collect()

    projizz.combinedFileWriter(results, os.path.join(resultOutPath, filename))
    projizz.combinedFileWriter(patternEx, os.path.join(ptnOutputPath, filename))
    print("Worker %d : Write results out to %s." % (jobid, filename))
Пример #7
0
def testing(filename):
    """Print the patterns found in each line of a combined file, with timing.

    Loads the prefix-tree model and its table from
    ``./../prefix_tree_model/patternTree.json``, then for every article in
    ``filename`` prints the article name, each line in which
    ``projizz.naiveExtractPatterns`` found something, and below it the id and
    ``table[ptnId]["pattern"]`` of each match.  The elapsed time of the whole
    loop is printed at the end.
    """
    content = projizz.combinedFileReader(filename)

    model, table = projizz.readPrefixTreeModel("./../prefix_tree_model/patternTree.json")

    start_time = datetime.now()
    for articleName in content:
        # Encode the name the same way the lines are encoded below, so that
        # non-ASCII article names do not depend on the stdout encoding.
        print(articleName.encode("utf-8"))
        article = projizz.articleSimpleSentenceFileter(content[articleName])

        for line in article:
            tokens = projizz._posTagger.tag(line)
            patternExtracted = projizz.naiveExtractPatterns(tokens, model)
            if len(patternExtracted) > 0:
                print(line.encode("utf-8"))
                for ptnId, start, to in patternExtracted:
                    print("\t[%d] %s" % (ptnId, table[ptnId]["pattern"]))

        print("\n----")
    diff = datetime.now() - start_time
    # Zero-pad microseconds to six digits and include the days component so
    # the printed value is a correct decimal number of seconds.
    whole_seconds = diff.days * 86400 + diff.seconds
    print("Spend %d.%06d seconds" % (whole_seconds, diff.microseconds))
Пример #8
0
def testing(filename):
    """Dump extracted patterns for every article of a combined file and time it.

    The prefix-tree model and table are loaded from
    ``./../prefix_tree_model/patternTree.json``.  For each article the name
    is printed, followed by every line for which
    ``projizz.naiveExtractPatterns`` returned a non-empty result and, indented
    below it, the id and ``table[ptnId]["pattern"]`` of each match.  Finally
    the elapsed time of the loop is printed.
    """
    content = projizz.combinedFileReader(filename)

    model, table = projizz.readPrefixTreeModel(
        "./../prefix_tree_model/patternTree.json")

    start_time = datetime.now()
    for articleName in content:
        # Article names are encoded like the lines below, so output of
        # non-ASCII names does not rely on the stdout encoding.
        print(articleName.encode("utf-8"))
        article = projizz.articleSimpleSentenceFileter(content[articleName])

        for line in article:
            tokens = projizz._posTagger.tag(line)
            patternExtracted = projizz.naiveExtractPatterns(tokens, model)
            if len(patternExtracted) > 0:
                print(line.encode("utf-8"))
                for ptnId, start, to in patternExtracted:
                    print("\t[%d] %s" % (ptnId, table[ptnId]["pattern"]))

        print("\n----")
    diff = datetime.now() - start_time
    # The microseconds field must be zero-padded to six digits to form a
    # valid fractional part; days are folded into the seconds.
    whole_seconds = diff.days * 86400 + diff.seconds
    print("Spend %d.%06d seconds" % (whole_seconds, diff.microseconds))