def tryToFindRela(jobid, filename, dataInputPath, ptnOutputPath, model, table):
    """Extract patterns from every article of one combined file and save them.

    The file ``filename`` under ``dataInputPath`` is loaded with
    ``projizz.combinedFileReader``.  Each line produced by
    ``projizz.articleSimpleSentenceFileter`` is tagged with
    ``projizz._posTagger`` and passed to ``projizz.naiveExtractPatterns``
    together with ``model``.  For every article the output stores a list of
    ``(line index, extracted patterns)`` tuples, keeping only the lines for
    which the extraction result is non-empty.  The resulting mapping is
    written to ``filename`` under ``ptnOutputPath`` with
    ``projizz.combinedFileWriter``.

    ``jobid`` only labels the progress messages.  ``table`` is accepted but
    not used by this function.
    """
    articles = projizz.combinedFileReader(os.path.join(dataInputPath, filename))
    print("Worker %d : Read %s into filter" % (jobid, filename))

    articlesDone = 0
    linesSeen = 0
    extractedByArticle = {}

    for articleName in articles:
        hits = []
        sentences = projizz.articleSimpleSentenceFileter(articles[articleName])

        for lineIndex, sentence in enumerate(sentences):
            linesSeen += 1
            tagged = projizz._posTagger.tag(sentence)
            found = projizz.naiveExtractPatterns(tagged, model)
            if len(found) > 0:
                hits.append((lineIndex, found))

            # Progress report every 10000 processed lines (across articles).
            if linesSeen % 10000 == 0:
                print("Worker %d deal with %d lines." % (jobid, linesSeen))

        extractedByArticle[articleName] = hits
        articlesDone += 1

        # Every 100 articles: report progress and force a garbage collection.
        if articlesDone % 100 == 0:
            print("Worker %d deal with %d files" % (jobid, articlesDone))
            gc.collect()

    projizz.combinedFileWriter(extractedByArticle,
                               os.path.join(ptnOutputPath, filename))
    print("Worker %d : Write results out to %s." % (jobid, filename))
def tryToFindRela(jobid, filename, dataInputPath, ptnOutputPath, model, table):
    """Run pattern extraction over one combined file and write the matches.

    Loads ``filename`` from ``dataInputPath`` via
    ``projizz.combinedFileReader``, then for each article iterates the lines
    returned by ``projizz.articleSimpleSentenceFileter``, tags each one with
    ``projizz._posTagger`` and calls ``projizz.naiveExtractPatterns`` with
    ``model``.  Lines whose extraction result is non-empty are recorded as
    ``(line index, extracted patterns)``.  The per-article lists are written
    with ``projizz.combinedFileWriter`` to ``filename`` under
    ``ptnOutputPath``.

    ``jobid`` is used only in progress messages; ``table`` is unused here.
    """
    inputPath = os.path.join(dataInputPath, filename)
    articles = projizz.combinedFileReader(inputPath)
    print("Worker %d : Read %s into filter" % (jobid, filename))

    fileCounter = 0
    lineCounter = 0
    patternsPerArticle = {}

    for articleName in articles:
        matched = []
        filtered = projizz.articleSimpleSentenceFileter(articles[articleName])

        for position, text in enumerate(filtered):
            lineCounter += 1
            extracted = projizz.naiveExtractPatterns(
                projizz._posTagger.tag(text), model)
            if len(extracted) > 0:
                matched.append((position, extracted))

            # Report once per 10000 lines handled by this worker.
            if lineCounter % 10000 == 0:
                print("Worker %d deal with %d lines." % (jobid, lineCounter))

        patternsPerArticle[articleName] = matched
        fileCounter += 1

        # Report and trigger garbage collection once per 100 articles.
        if fileCounter % 100 == 0:
            print("Worker %d deal with %d files" % (jobid, fileCounter))
            gc.collect()

    outputPath = os.path.join(ptnOutputPath, filename)
    projizz.combinedFileWriter(patternsPerArticle, outputPath)
    print("Worker %d : Write results out to %s." % (jobid, filename))
def combinedFileViewer(filename):
    """Dump the contents of a combined file to standard output.

    The file is loaded with ``projizz.combinedFileReader``.  For each article
    this prints the article name (UTF-8 encoded), an empty line, every line
    of the article (UTF-8 encoded) and finally a ``----`` separator.
    """
    articles = projizz.combinedFileReader(filename)
    for name in articles:
        body = articles[name]
        print(name.encode("utf-8"))
        print("")
        for text in body:
            print(text.encode("utf-8"))
        print("----")
def testing(filename):
    """Time how long it takes to tag every line of a combined file.

    Loads ``filename`` with ``projizz.combinedFileReader``, passes each
    article through ``projizz.articleSimpleLineFileter`` and tags every
    resulting line with ``projizz._posTagger``.  The tagger output is
    discarded; only the elapsed wall-clock time is printed.
    """
    content = projizz.combinedFileReader(filename)
    start_time = datetime.now()
    for articleName in content:
        article = projizz.articleSimpleLineFileter(content[articleName])
        for line in article:
            projizz._posTagger.tag(line)
    diff = datetime.now() - start_time

    # timedelta.seconds excludes whole days, and the microsecond part must be
    # zero-padded to six digits to form a correct decimal fraction (otherwise
    # 1 s + 5000 us would be shown as "1.5000").
    elapsed_seconds = diff.days * 86400 + diff.seconds
    print("Spend %d.%06d seconds" % (elapsed_seconds, diff.microseconds))
def testing(filename):
    """Measure the time needed to tag all lines of a combined file.

    Every article read by ``projizz.combinedFileReader`` is filtered with
    ``projizz.articleSimpleLineFileter`` and each resulting line is tagged
    with ``projizz._posTagger``.  Tagging results are thrown away; the
    function only prints the elapsed wall-clock time.
    """
    content = projizz.combinedFileReader(filename)
    start_time = datetime.now()
    for articleName in content:
        article = projizz.articleSimpleLineFileter(content[articleName])
        for line in article:
            projizz._posTagger.tag(line)
    diff = datetime.now() - start_time

    # Fold whole days into the second count (timedelta.seconds alone drops
    # them) and zero-pad microseconds so the fraction reads correctly, e.g.
    # 1 s + 5000 us -> "1.005000" rather than "1.5000".
    whole_seconds = diff.days * 86400 + diff.seconds
    print("Spend %d.%06d seconds" % (whole_seconds, diff.microseconds))
def tryToFindRela(jobid, filename, dataInputPath, resultOutPath, ptnOutputPath, model, tree):
    """Count relations hit by extracted patterns for each article of a file.

    Loads ``filename`` from ``dataInputPath`` with
    ``projizz.combinedFileReader``.  Each line from
    ``projizz.articleSimpleSentenceFileter`` is tagged with
    ``projizz._posTagger`` and passed to ``projizz.naiveExtractPatterns``
    with ``model``.  For every extracted pattern id, ``tree[ptnId]
    ["relations"]`` is looked up; when it holds fewer than two relations,
    each relation's counter for the current article is incremented and the
    pattern id is remembered (once per article).

    Two outputs are written with ``projizz.combinedFileWriter`` under the
    same ``filename``: the per-article relation counts to ``resultOutPath``
    and the per-article pattern id lists to ``ptnOutputPath``.

    ``jobid`` is used only in progress messages.

    NOTE(review): the nesting of the pattern-recording check and of the
    progress message was reconstructed from statement order; this assumes
    patterns are recorded only when they have fewer than two relations and
    that progress is checked once per extracted pattern -- confirm.
    """
    articles = projizz.combinedFileReader(os.path.join(dataInputPath, filename))
    print("Worker %d : Read %s into filter" % (jobid, filename))

    articlesDone = 0
    patternsSeen = 0
    countsByArticle = {}
    patternsByArticle = {}

    for articleName in articles:
        relationCounts = {}
        usedPatterns = []
        sentences = projizz.articleSimpleSentenceFileter(articles[articleName])

        for sentence in sentences:
            tagged = projizz._posTagger.tag(sentence)
            extracted = projizz.naiveExtractPatterns(tagged, model)

            for ptnId, _start, _end in extracted:
                patternsSeen += 1
                relations = tree[ptnId]["relations"]

                # Only patterns tied to fewer than two relations are counted.
                if len(relations) < 2:
                    for relation in relations:
                        relationCounts[relation] = relationCounts.get(relation, 0) + 1
                    if ptnId not in usedPatterns:
                        usedPatterns.append(ptnId)

                # The counter advances per extracted pattern, although the
                # message wording says "lines".
                if patternsSeen % 10000 == 0:
                    print("Worker %d deal with %d lines." % (jobid, patternsSeen))

        countsByArticle[articleName] = relationCounts
        patternsByArticle[articleName] = usedPatterns
        articlesDone += 1

        # Every 100 articles: report progress and force a garbage collection.
        if articlesDone % 100 == 0:
            print("Worker %d deal with %d files" % (jobid, articlesDone))
            gc.collect()

    projizz.combinedFileWriter(countsByArticle,
                               os.path.join(resultOutPath, filename))
    projizz.combinedFileWriter(patternsByArticle,
                               os.path.join(ptnOutputPath, filename))
    print("Worker %d : Write results out to %s." % (jobid, filename))
def testing(filename):
    """Print the patterns extracted from each line of a combined file.

    Loads ``filename`` with ``projizz.combinedFileReader`` and the prefix
    tree model from ``./../prefix_tree_model/patternTree.json``.  For every
    article its name is printed, then each line with at least one extracted
    pattern is printed followed by one ``[id] pattern`` entry per match.
    The total elapsed wall-clock time is printed at the end.
    """
    content = projizz.combinedFileReader(filename)
    model, table = projizz.readPrefixTreeModel("./../prefix_tree_model/patternTree.json")
    start_time = datetime.now()
    for articleName in content:
        # Encode explicitly, as is done for the article lines below, so a
        # non-ASCII name does not depend on the stdout encoding.
        print(articleName.encode("utf-8"))
        article = projizz.articleSimpleSentenceFileter(content[articleName])
        for line in article:
            tokens = projizz._posTagger.tag(line)
            patternExtracted = projizz.naiveExtractPatterns(tokens, model)
            if len(patternExtracted) > 0:
                print(line.encode("utf-8"))
                for ptnId, _start, _end in patternExtracted:
                    print("\t[%d] %s" % (ptnId, table[ptnId]["pattern"]))
        print("\n----")
    diff = datetime.now() - start_time

    # timedelta.seconds excludes whole days, and the microsecond part must be
    # zero-padded to six digits to form a correct decimal fraction (otherwise
    # 1 s + 5000 us would be shown as "1.5000").
    elapsed_seconds = diff.days * 86400 + diff.seconds
    print("Spend %d.%06d seconds" % (elapsed_seconds, diff.microseconds))
def testing(filename):
    """Show, per article, which patterns are extracted from each line.

    The combined file ``filename`` is read with
    ``projizz.combinedFileReader`` and the model/table pair is loaded from
    ``./../prefix_tree_model/patternTree.json``.  Each article name is
    printed; every line yielding at least one pattern is printed together
    with the id and pattern text of each match.  The elapsed wall-clock time
    is printed once all articles are processed.
    """
    content = projizz.combinedFileReader(filename)
    model, table = projizz.readPrefixTreeModel(
        "./../prefix_tree_model/patternTree.json")
    start_time = datetime.now()
    for articleName in content:
        # Article names are encoded like the article lines below, so that
        # non-ASCII names print regardless of the stdout encoding.
        print(articleName.encode("utf-8"))
        article = projizz.articleSimpleSentenceFileter(content[articleName])
        for line in article:
            tokens = projizz._posTagger.tag(line)
            patternExtracted = projizz.naiveExtractPatterns(tokens, model)
            if len(patternExtracted) > 0:
                print(line.encode("utf-8"))
                for ptnId, _start, _end in patternExtracted:
                    print("\t[%d] %s" % (ptnId, table[ptnId]["pattern"]))
        print("\n----")
    diff = datetime.now() - start_time

    # Fold whole days into the second count (timedelta.seconds alone drops
    # them) and zero-pad microseconds so the fraction reads correctly, e.g.
    # 1 s + 5000 us -> "1.005000" rather than "1.5000".
    whole_seconds = diff.days * 86400 + diff.seconds
    print("Spend %d.%06d seconds" % (whole_seconds, diff.microseconds))