def main(dataInputPath, resultOutPath, ptnOutputPath):
    """Run tryToFindRela over every ".json" file of a directory in parallel.

    The prefix-tree model and its table are loaded once from
    "../prefix_tree_model/patternTree.json", one job per matching input file
    is submitted to a process pool sized to the CPU count, and after all jobs
    finish the model and table are written into ptnOutputPath.

    Args:
        dataInputPath: Directory whose entries are listed; every entry whose
            name contains ".json" becomes one job.
        resultOutPath: Output directory handed to each job; created when it
            does not exist.
        ptnOutputPath: Output directory handed to each job; created when it
            does not exist. Also receives the "model" and "table" files.
    """
    model, table = projizz.readPrefixTreeModel("../prefix_tree_model/patternTree.json")

    for out_dir in (resultOutPath, ptnOutputPath):
        if not os.path.isdir(out_dir):
            os.mkdir(out_dir)

    core_count = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=core_count)
    print("Number of core: %d" % (core_count))

    start_time = datetime.now()
    jobN = 0
    submitted = []
    for filename in os.listdir(dataInputPath):
        if ".json" in filename:
            job = pool.apply_async(
                tryToFindRela,
                (jobN, filename, dataInputPath, resultOutPath, ptnOutputPath, model, table),
            )
            # Keep the handle: an exception raised inside a worker is only
            # visible through AsyncResult.get(); dropping it hides failures.
            submitted.append((jobN, filename, job))
            jobN += 1
    pool.close()
    pool.join()

    # All jobs are finished after join(), so get() returns immediately or
    # re-raises the worker's exception. Report it and keep going so one bad
    # file does not prevent the model/table from being written.
    for jobid, filename, job in submitted:
        try:
            job.get()
        except Exception as exc:
            print("Worker %d : FAILED on %s (%r)" % (jobid, filename, exc))

    diff = datetime.now() - start_time
    # timedelta.seconds excludes whole days, and microseconds must be
    # zero-padded to six digits to form a correct decimal fraction.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))

    projizz.combinedFileWriter(model, os.path.join(ptnOutputPath, "model"))
    projizz.combinedFileWriter(table, os.path.join(ptnOutputPath, "table"))
def tryToFindRela(jobid, filename, dataInputPath, ptnOutputPath, model, table):
    """Extract patterns from each article of one input file and save them.

    For every article in the file, each sentence returned by
    projizz.articleSimpleSentenceFileter is POS-tagged and passed to
    projizz.naiveExtractPatterns. Sentences with at least one extracted
    pattern are recorded as (sentence index, extracted patterns). The
    per-article lists are written to ptnOutputPath under the same file name.

    NOTE(review): further definitions named tryToFindRela appear later in
    this file; the last one defined is the one bound at import time.

    Args:
        jobid: Number used only in the progress messages.
        filename: Name of the file inside dataInputPath; reused as the
            output file name.
        dataInputPath: Directory containing the input file.
        ptnOutputPath: Directory receiving the extracted patterns.
        model: Passed through to projizz.naiveExtractPatterns.
        table: Not used by this function.
    """
    source_path = os.path.join(dataInputPath, filename)
    content = projizz.combinedFileReader(source_path)
    print("Worker %d : Read %s into filter" % (jobid, filename))

    articles_done = 0
    lines_seen = 0
    extracted_by_article = {}

    for articleName in content:
        hits = []
        sentences = projizz.articleSimpleSentenceFileter(content[articleName])

        for line_index, line in enumerate(sentences):
            lines_seen += 1
            tagged = projizz._posTagger.tag(line)
            found = projizz.naiveExtractPatterns(tagged, model)
            if len(found) > 0:
                hits.append((line_index, found))
            if lines_seen % 10000 == 0:
                print("Worker %d deal with %d lines." % (jobid, lines_seen))

        extracted_by_article[articleName] = hits
        articles_done += 1
        if articles_done % 100 == 0:
            print("Worker %d deal with %d files" % (jobid, articles_done))
            # NOTE(review): the original indentation was lost; gc.collect()
            # is assumed to belong to this periodic branch - confirm.
            gc.collect()

    target_path = os.path.join(ptnOutputPath, filename)
    projizz.combinedFileWriter(extracted_by_article, target_path)
    print("Worker %d : Write results out to %s." % (jobid, filename))
def tryToFindRela(jobid, filename, dataInputPath, ptnOutputPath, model, table):
    """Collect, per article, the sentences in which patterns were found.

    Reads dataInputPath/filename with projizz.combinedFileReader, runs the
    POS tagger and projizz.naiveExtractPatterns on every filtered sentence,
    and stores a list of (sentence index, extracted patterns) for each
    article. The resulting dict is written to ptnOutputPath/filename.

    NOTE(review): this name is defined more than once in this file, and a
    later definition replaces this one when the module is loaded.

    Args:
        jobid: Identifier printed in the progress messages.
        filename: Input file name; the output file uses the same name.
        dataInputPath: Directory the input file is read from.
        ptnOutputPath: Directory the output file is written to.
        model: Forwarded to projizz.naiveExtractPatterns.
        table: Accepted but never read in this function.
    """
    content = projizz.combinedFileReader(os.path.join(dataInputPath, filename))
    print("Worker %d : Read %s into filter" % (jobid, filename))

    processed_articles = 0
    processed_lines = 0
    output = {}

    for articleName in content:
        matches = []
        output[articleName] = matches
        filtered = projizz.articleSimpleSentenceFileter(content[articleName])

        for position, sentence in enumerate(filtered):
            processed_lines += 1
            patterns = projizz.naiveExtractPatterns(
                projizz._posTagger.tag(sentence), model
            )
            if len(patterns) != 0:
                matches.append((position, patterns))
            if processed_lines % 10000 == 0:
                print("Worker %d deal with %d lines." % (jobid, processed_lines))

        processed_articles += 1
        if processed_articles % 100 == 0:
            print("Worker %d deal with %d files" % (jobid, processed_articles))
            # NOTE(review): placement reconstructed from a whitespace-mangled
            # original; assumed to run only on this periodic branch - confirm.
            gc.collect()

    projizz.combinedFileWriter(output, os.path.join(ptnOutputPath, filename))
    print("Worker %d : Write results out to %s." % (jobid, filename))
def tryToFindRela(jobid, filename, dataInputPath, resultOutPath, ptnOutputPath, model, tree):
    """Count relations and collect matched pattern ids for one input file.

    Every filtered sentence of every article is POS-tagged and passed to
    projizz.naiveExtractPatterns. For each extracted pattern occurrence:

    * if the pattern's entry in ``tree`` lists fewer than two relations,
      each listed relation's counter for the article is incremented;
    * the pattern id is recorded once per article, in first-seen order.

    Two files named ``filename`` are written: the per-article relation
    counts into resultOutPath and the per-article pattern id lists into
    ptnOutputPath.

    Args:
        jobid: Identifier printed in the progress messages.
        filename: Input file name; both output files reuse it.
        dataInputPath: Directory the input file is read from.
        resultOutPath: Directory receiving the relation counts.
        ptnOutputPath: Directory receiving the pattern id lists.
        model: Forwarded to projizz.naiveExtractPatterns.
        tree: Mapping indexed by pattern id; each entry is read through its
            "relations" key.
    """
    content = projizz.combinedFileReader(os.path.join(dataInputPath, filename))
    print("Worker %d : Read %s into filter" % (jobid, filename))

    count = 0
    dealL = 0
    results = {}
    patternEx = {}

    for articleName in content:
        result = {}
        pattern = []
        # Companion set gives O(1) "already recorded?" checks; the list keeps
        # the first-seen order that gets written out.
        seen_ids = set()

        article = projizz.articleSimpleSentenceFileter(content[articleName])
        for line in article:
            tokens = projizz._posTagger.tag(line)
            patternExtracted = projizz.naiveExtractPatterns(tokens, model)

            # NOTE(review): the original indentation was lost; the dedupe and
            # progress checks are assumed to run once per extracted pattern,
            # and gc.collect() only on the periodic file branch - confirm.
            for ptnId, _start, _to in patternExtracted:
                # dealL counts pattern occurrences, although the progress
                # message below words it as "lines".
                dealL += 1
                rels = tree[ptnId]["relations"]
                # Only patterns tied to fewer than two relations are counted.
                if len(rels) < 2:
                    for r in rels:
                        result[r] = result.get(r, 0) + 1
                if ptnId not in seen_ids:
                    seen_ids.add(ptnId)
                    pattern.append(ptnId)
                if dealL % 10000 == 0:
                    print("Worker %d deal with %d lines." % (jobid, dealL))

        results[articleName] = result
        patternEx[articleName] = pattern
        count += 1
        if count % 100 == 0:
            print("Worker %d deal with %d files" % (jobid, count))
            gc.collect()

    projizz.combinedFileWriter(results, os.path.join(resultOutPath, filename))
    projizz.combinedFileWriter(patternEx, os.path.join(ptnOutputPath, filename))
    print("Worker %d : Write results out to %s." % (jobid, filename))