def main(inputFiles, outputPath, n):
    """Extract n-grams from every JSON file of a directory using a worker pool.

    Each input file is read as a JSON object mapping a sub-file name to a
    list of text lines.  For every input file, a file with the same name is
    written to ``outputPath``; it maps each sub-file name to the list of
    n-gram strings collected from that sub-file's lines.

    Args:
        inputFiles: Directory scanned for file names containing ``.json``.
        outputPath: Directory receiving the results; created when missing.
        n: N-gram size.  ``n == 2`` uses a simple punctuation-stripping
            whitespace tokenizer and emits tab-joined token pairs; any other
            value goes through ``word_tokenize``/``ngrams`` and
            ``toStringForm``.
    """
    rule = ".json"
    files = Queue.Queue(0)

    # if target dir not exist, create it.
    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    def bigramsOf(line):
        """Return the tab-joined adjacent token pairs of one text line."""
        text = line.lower()
        # These punctuation marks are treated as token separators.
        for mark in "[]!?,)(":
            text = text.replace(mark, " ")
        tokens = text.split()
        # Lines with two tokens or fewer contribute nothing (guard kept from
        # the original code).
        # NOTE(review): a two-token line still holds one bigram -- confirm
        # that skipping it is intended.
        if len(tokens) <= 2:
            return []
        return ["%s\t%s" % pair for pair in zip(tokens, tokens[1:])]

    def workerFunction(jobObj, tid, args):
        """Convert the input file named ``jobObj``; ``tid`` labels log lines."""
        with open(os.path.join(inputFiles, jobObj), "r") as src:
            content = json.load(src)
        print("worker #%02d read file %s" % (tid, jobObj))

        dealL = 0  # lines handled so far by this call
        rawGram = {}
        for subFilename in content:
            ngl = []
            for line in content[subFilename]:
                dealL += 1
                if n == 2:  # FIXME: hand-rolled special case for bigrams
                    ngl.extend(bigramsOf(line))
                else:
                    # do the n-gram using nltk
                    # Iterate directly instead of calling len(): the old
                    # ``if len(ngs) > 1: continue`` guard discarded every
                    # line that produced more than one n-gram, and len()
                    # fails when ngrams() returns a generator.
                    for ng in ngrams(word_tokenize(line.lower()), n):
                        ngl.append(toStringForm(ng))
                if dealL % 10000 == 0:
                    print("worker #%02d deal with %d lines" % (tid, dealL))
            rawGram[subFilename] = ngl

        # Close the output deterministically so the JSON is fully flushed.
        with open(os.path.join(outputPath, jobObj), "w") as dst:
            json.dump(rawGram, dst)

    # Queue the matching files in sorted order.
    for filename in sorted(f for f in os.listdir(inputFiles) if rule in f):
        files.put(filename)

    # Manager is presumably a project-local pool that calls workerFunction
    # for each queued file name -- TODO confirm against its definition.
    manager = Manager(workerNumber=15)
    manager.setJobQueue(files)
    manager.setWorkerFunction(workerFunction)
    manager.startWorking()
def main(inputFiles, outputPath, n):
    """Build per-sub-file n-gram lists for all JSON files in a directory.

    Every input file is loaded as a JSON object whose keys are sub-file
    names and whose values are lists of text lines.  The n-grams of all
    lines of a sub-file are gathered into one list, and the resulting
    ``{sub-file name: [n-gram, ...]}`` object is dumped as JSON to
    ``outputPath`` under the input file's name.

    Args:
        inputFiles: Directory scanned for file names containing ``.json``.
        outputPath: Output directory; created when it does not exist.
        n: N-gram size.  For ``n == 2`` a punctuation-stripping whitespace
            tokenizer is used and pairs are joined with a tab; otherwise
            ``word_tokenize``, ``ngrams`` and ``toStringForm`` are used.
    """
    rule = ".json"
    files = Queue.Queue(0)

    # if target dir not exist, create it.
    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    def bigramsOf(line):
        """Return the tab-joined adjacent token pairs of one text line."""
        text = line.lower()
        # These punctuation marks are treated as token separators.
        for mark in "[]!?,)(":
            text = text.replace(mark, " ")
        tokens = text.split()
        # Original guard preserved: lines of at most two tokens are skipped.
        # NOTE(review): a two-token line still holds one bigram -- confirm
        # that skipping it is intended.
        if len(tokens) <= 2:
            return []
        return ["%s\t%s" % pair for pair in zip(tokens, tokens[1:])]

    def workerFunction(jobObj, tid, args):
        """Convert the input file named ``jobObj``; ``tid`` labels log lines."""
        with open(os.path.join(inputFiles, jobObj), "r") as src:
            content = json.load(src)
        print("worker #%02d read file %s" % (tid, jobObj))

        dealL = 0  # lines handled so far by this call
        rawGram = {}
        for subFilename in content:
            ngl = []
            for line in content[subFilename]:
                dealL += 1
                if n == 2:  # FIXME: hand-rolled special case for bigrams
                    ngl.extend(bigramsOf(line))
                else:
                    # do the n-gram using nltk
                    # The previous ``if len(ngs) > 1: continue`` check threw
                    # away every line with more than one n-gram and breaks
                    # when ngrams() yields a generator; iterating directly
                    # handles empty and non-empty results alike.
                    for ng in ngrams(word_tokenize(line.lower()), n):
                        ngl.append(toStringForm(ng))
                if dealL % 10000 == 0:
                    print("worker #%02d deal with %d lines" % (tid, dealL))
            rawGram[subFilename] = ngl

        # Close the output deterministically so the JSON is fully flushed.
        with open(os.path.join(outputPath, jobObj), "w") as dst:
            json.dump(rawGram, dst)

    # Queue the matching files in sorted order.
    for filename in sorted(f for f in os.listdir(inputFiles) if rule in f):
        files.put(filename)

    # Manager is presumably a project-local pool that calls workerFunction
    # for each queued file name -- TODO confirm against its definition.
    manager = Manager(workerNumber=15)
    manager.setJobQueue(files)
    manager.setWorkerFunction(workerFunction)
    manager.startWorking()
def main(modelPath, inputPath, outputPath):
    """Count, per sub-file, how many of its n-grams occur in each model.

    Every input file is loaded as a JSON object mapping a sub-file name to
    a list of n-gram strings.  For each sub-file the number of its n-grams
    contained in each model is counted (duplicates count every time), and a
    ``{sub-file name: {model name: hits}}`` object is written as JSON to
    ``outputPath`` under the input file's name.  Models without any hit are
    left out of a sub-file's entry.

    Args:
        modelPath: Passed to ``readModel``, which returns the n-gram size
            and a mapping of model name to a container of n-grams.
        inputPath: Directory scanned for file names containing ``.json``.
        outputPath: Output directory; created when it does not exist.
    """
    ngram, models = readModel(modelPath)
    files = Queue.Queue(0)
    print("Read %d %d-gram models" % (len(models), ngram))

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    def workerFunction(jobObj, tid, args):
        """Score the input file named ``jobObj``; ``tid`` labels log lines."""
        a = datetime.now()
        print("worker #%02d start read file %s" % (tid, jobObj))
        with open(os.path.join(inputPath, jobObj), "r") as f:
            content = json.load(f)
        diff = datetime.now() - a
        # Microseconds are zero-padded to six digits; plain %d would print
        # 1 s + 500 us as "1.500".
        print("worker #%02d read file %s, use [%d.%06d s]" % (
            tid, jobObj, diff.seconds, diff.microseconds))

        results = {}
        count = 0
        for subFilename in content:
            ngs = content[subFilename]
            count += 1
            result = {}
            for model in models:
                mn = models[model]
                hits = sum(1 for ng in ngs if ng in mn)
                # Only models that were actually hit get an entry.
                if hits:
                    result[model] = hits
            if count % 100 == 0:
                print("worker #%02d scan %d files" % (tid, count))
            results[subFilename] = result

        a = datetime.now()
        print("worker #%02d start write file %s" % (tid, jobObj))
        with open(os.path.join(outputPath, jobObj), "w") as f:
            json.dump(results, f)
        diff = datetime.now() - a
        print("worker #%02d write file %s, use [%d.%06d s]" % (
            tid, jobObj, diff.seconds, diff.microseconds))

    for filename in os.listdir(inputPath):
        if ".json" in filename:
            files.put(filename)

    # Manager is presumably a project-local pool that calls workerFunction
    # for each queued file name -- TODO confirm against its definition.
    manager = Manager(25)
    manager.setJobQueue(files)
    manager.setWorkerFunction(workerFunction)
    manager.startWorking()
def main(modelPath, inputFiles, outputPath):
    """Extract bigrams from raw text files and count hits per model.

    Every input file is loaded as a JSON object mapping a sub-file name to
    a list of text lines.  The bigrams of all lines of a sub-file are
    collected, the number of them contained in each model is counted, and a
    ``{sub-file name: {model name: hits}}`` object is written as JSON to
    ``outputPath`` under the input file's name.  Sub-files that yield no
    bigram at all are omitted from the output, and models without any hit
    are left out of a sub-file's entry.

    Args:
        modelPath: Passed to ``readModel``, which returns the n-gram size
            and a mapping of model name to a container of n-grams.
        inputFiles: Directory scanned for file names containing ``.json``.
        outputPath: Output directory; created when it does not exist.
    """
    ngram, models = readModel(modelPath)
    rule = ".json"
    files = Queue.Queue(0)
    start_time = datetime.now()

    # if target dir not exist, create it.
    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    def workerFunction(jobObj, tid, args):
        """Score the input file named ``jobObj``; ``tid`` labels log lines."""
        with open(os.path.join(inputFiles, jobObj), "r") as src:
            content = json.load(src)
        print("worker #%02d read file %s" % (tid, jobObj))

        dealL = 0  # lines handled so far by this call
        count = 0  # sub-files handled so far by this call
        results = {}
        for subFilename in content:
            count += 1
            ngl = []
            for line in content[subFilename]:
                dealL += 1
                # FIXME - only implement bigram
                text = line.lower()
                # These punctuation marks are treated as token separators.
                for mark in "[]!?,)(":
                    text = text.replace(mark, " ")
                tokens = text.split()
                ngl.extend(
                    "%s\t%s" % pair for pair in zip(tokens, tokens[1:]))

            # try to find hit.
            if not ngl:
                continue
            result = {}
            for model in models:
                mn = models[model]
                # Match the collected bigrams.  The previous code iterated
                # the token list of the last line here, so bigram models
                # were compared against single words.
                hits = sum(1 for ng in ngl if ng in mn)
                if hits:
                    result[model] = hits

            if dealL % 10000 == 0:
                print("worker #%02d deal with %d lines" % (tid, dealL))
            if count % 100 == 0:
                print("worker #%02d scan %d files" % (tid, count))
            results[subFilename] = result

        # Close the output deterministically so the JSON is fully flushed.
        with open(os.path.join(outputPath, jobObj), "w") as dst:
            json.dump(results, dst)

    for filename in os.listdir(inputFiles):
        if rule in filename:
            files.put(filename)

    # Manager is presumably a project-local pool that calls workerFunction
    # for each queued file name and returns from startWorking() once the
    # queue is drained -- TODO confirm against its definition.
    manager = Manager(workerNumber=25)
    manager.setJobQueue(files)
    manager.setWorkerFunction(workerFunction)
    manager.startWorking()

    diff = datetime.now() - start_time
    # Microseconds are zero-padded to six digits; plain %d would print
    # 1 s + 500 us as "1.500".
    print("All job done, use %d.%06d secs" % (diff.seconds, diff.microseconds))
def main(modelPath, inputPath, outputFileName):
    """Evaluate per-model hit results against answers stored in MongoDB.

    Every file in ``inputPath`` whose name contains ``.json`` is loaded as a
    JSON object mapping ``"<revid>.txt"`` to ``{model name: hit count}``.
    The answer documents with matching ``revid`` are fetched from the
    ``projizz.result.data.instance`` collection.  For every model a
    confusion matrix is accumulated: the model "said yes" when its hit count
    is positive, and the real answer is "yes" when the model name appears in
    the document's ``features``.  The summed matrices are written as JSON to
    ``outputFileName``.

    Args:
        modelPath: Passed to ``readModel``; only the model names of the
            returned mapping are used.
        inputPath: Directory holding the result files to evaluate.
        outputFileName: Path of the JSON file receiving the totals.
    """
    ngram, models = readModel(modelPath)
    resultQueue = Queue.Queue(0)

    # Replace each model's content by an empty confusion matrix.
    #
    #                   Real Ans:   True    False
    #   Model told:  Yes            tp      fp
    #                No             fn      tn
    for model in models:
        models[model] = {"tp": 0, "tn": 0, "fp": 0, "fn": 0}

    connect = pymongo.Connection()
    db = connect.projizz
    ansCol = db.result.data.instance

    def workerFunction(jobObj, tid, args):
        """Evaluate the result file ``jobObj``; ``tid`` labels log lines."""
        with open(os.path.join(inputPath, jobObj), "r") as src:
            resultJson = json.load(src)
        print("worker #%02d read file %s" % (tid, jobObj))

        # Drop the last four characters of each key (the keys are looked up
        # again below as "<revid>.txt").
        queries = [name[:-4] for name in resultJson]
        itr = ansCol.find({"revid": {"$in": queries}})
        print("worker #%02d query=%d, result=%d" % (
            tid, len(queries), itr.count()))

        # Private copy of the counter table; merged by the caller afterwards.
        partAns = copy.deepcopy(models)
        count = 0
        for ans in itr:
            count += 1
            features = ans["features"]
            resultInstance = resultJson["%s.txt" % (ans["revid"])]
            for relationship in partAns:
                predicted = (relationship in resultInstance
                             and resultInstance[relationship] > 0)
                actual = relationship in features
                if actual:
                    cell = "tp" if predicted else "fn"
                else:
                    cell = "fp" if predicted else "tn"
                partAns[relationship][cell] += 1
            if count % 100 == 0:
                print("worker #%02d done %d." % (tid, count))
        resultQueue.put(partAns)

    files = Queue.Queue(0)
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            files.put(filename)

    # Manager is presumably a project-local pool that calls workerFunction
    # for each queued file name and returns from startWorking() once all
    # jobs are finished -- TODO confirm against its definition.
    manager = Manager(8)
    manager.setJobQueue(files)
    manager.setWorkerFunction(workerFunction)
    manager.startWorking()

    print("Result Queue size %d" % resultQueue.qsize())

    # Drain the partial results.  Only the "queue is empty" condition ends
    # the loop; any other error propagates instead of being swallowed by a
    # bare except, which would silently truncate the totals.
    while True:
        try:
            r = resultQueue.get_nowait()
        except Queue.Empty:
            break
        for m in r:
            for cell in ("tp", "tn", "fp", "fn"):
                models[m][cell] += r[m][cell]

    print("start write out to %s" % (outputFileName))
    # Close the output deterministically so the JSON is fully flushed.
    with open(outputFileName, "w") as dst:
        json.dump(models, dst)
    print("done")
def main(modelPath, inputFiles, outputPath):
    """Turn raw text into bigrams and count, per sub-file, the model hits.

    Each input file is loaded as a JSON object mapping a sub-file name to a
    list of text lines.  All bigrams of a sub-file's lines are gathered and
    counted against every model; the resulting
    ``{sub-file name: {model name: hits}}`` object is dumped as JSON to
    ``outputPath`` under the input file's name.  A sub-file without any
    bigram is omitted from the output, and a model without any hit is left
    out of a sub-file's entry.

    Args:
        modelPath: Passed to ``readModel``, which returns the n-gram size
            and a mapping of model name to a container of n-grams.
        inputFiles: Directory scanned for file names containing ``.json``.
        outputPath: Output directory; created when it does not exist.
    """
    ngram, models = readModel(modelPath)
    rule = ".json"
    files = Queue.Queue(0)
    start_time = datetime.now()

    # if target dir not exist, create it.
    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    def workerFunction(jobObj, tid, args):
        """Score the input file named ``jobObj``; ``tid`` labels log lines."""
        with open(os.path.join(inputFiles, jobObj), "r") as src:
            content = json.load(src)
        print("worker #%02d read file %s" % (tid, jobObj))

        dealL = 0  # lines handled so far by this call
        count = 0  # sub-files handled so far by this call
        results = {}
        for subFilename in content:
            count += 1
            ngl = []
            for line in content[subFilename]:
                dealL += 1
                # FIXME - only implement bigram
                text = line.lower()
                # These punctuation marks are treated as token separators.
                for mark in "[]!?,)(":
                    text = text.replace(mark, " ")
                tokens = text.split()
                ngl.extend(
                    "%s\t%s" % pair for pair in zip(tokens, tokens[1:]))

            # try to find hit.
            if not ngl:
                continue
            result = {}
            for model in models:
                mn = models[model]
                # Count the collected bigrams.  The earlier code looped over
                # the tokens of the last line instead, so bigram models were
                # tested against single words.
                hits = sum(1 for ng in ngl if ng in mn)
                if hits:
                    result[model] = hits

            if dealL % 10000 == 0:
                print("worker #%02d deal with %d lines" % (tid, dealL))
            if count % 100 == 0:
                print("worker #%02d scan %d files" % (tid, count))
            results[subFilename] = result

        # Close the output deterministically so the JSON is fully flushed.
        with open(os.path.join(outputPath, jobObj), "w") as dst:
            json.dump(results, dst)

    for filename in os.listdir(inputFiles):
        if rule in filename:
            files.put(filename)

    # Manager is presumably a project-local pool that calls workerFunction
    # for each queued file name and returns from startWorking() once the
    # queue is drained -- TODO confirm against its definition.
    manager = Manager(workerNumber=25)
    manager.setJobQueue(files)
    manager.setWorkerFunction(workerFunction)
    manager.startWorking()

    diff = datetime.now() - start_time
    # Microseconds are zero-padded to six digits; plain %d would print
    # 1 s + 500 us as "1.500".
    print("All job done, use %d.%06d secs" % (diff.seconds, diff.microseconds))
def main(modelPath, inputPath, outputFileName):
    """Build per-model confusion matrices from hit results and MongoDB answers.

    Each file in ``inputPath`` whose name contains ``.json`` is loaded as a
    JSON object mapping ``"<revid>.txt"`` to ``{model name: hit count}``.
    Answer documents with a matching ``revid`` are read from the
    ``projizz.result.data.instance`` collection.  A model counts as
    predicted when its hit count is positive and as true when its name
    appears in the document's ``features``.  The per-file matrices are
    summed and written as JSON to ``outputFileName``.

    Args:
        modelPath: Passed to ``readModel``; only the model names of the
            returned mapping are used.
        inputPath: Directory holding the result files to evaluate.
        outputFileName: Path of the JSON file receiving the totals.
    """
    ngram, models = readModel(modelPath)
    resultQueue = Queue.Queue(0)

    # Replace each model's content by an empty confusion matrix.
    #
    #                   Real Ans:   True    False
    #   Model told:  Yes            tp      fp
    #                No             fn      tn
    for model in models:
        models[model] = {"tp": 0, "tn": 0, "fp": 0, "fn": 0}

    connect = pymongo.Connection()
    db = connect.projizz
    ansCol = db.result.data.instance

    def workerFunction(jobObj, tid, args):
        """Evaluate the result file ``jobObj``; ``tid`` labels log lines."""
        with open(os.path.join(inputPath, jobObj), "r") as src:
            resultJson = json.load(src)
        print("worker #%02d read file %s" % (tid, jobObj))

        # Drop the last four characters of each key (the keys are looked up
        # again below as "<revid>.txt").
        queries = [name[:-4] for name in resultJson]
        itr = ansCol.find({"revid": {"$in": queries}})
        print("worker #%02d query=%d, result=%d" % (
            tid, len(queries), itr.count()))

        # Private copy of the counter table; merged by the caller afterwards.
        partAns = copy.deepcopy(models)
        count = 0
        for ans in itr:
            count += 1
            features = ans["features"]
            resultInstance = resultJson["%s.txt" % (ans["revid"])]
            for relationship in partAns:
                predicted = (relationship in resultInstance
                             and resultInstance[relationship] > 0)
                actual = relationship in features
                if actual:
                    cell = "tp" if predicted else "fn"
                else:
                    cell = "fp" if predicted else "tn"
                partAns[relationship][cell] += 1
            if count % 100 == 0:
                print("worker #%02d done %d." % (tid, count))
        resultQueue.put(partAns)

    files = Queue.Queue(0)
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            files.put(filename)

    # Manager is presumably a project-local pool that calls workerFunction
    # for each queued file name and returns from startWorking() once all
    # jobs are finished -- TODO confirm against its definition.
    manager = Manager(8)
    manager.setJobQueue(files)
    manager.setWorkerFunction(workerFunction)
    manager.startWorking()

    print("Result Queue size %d" % resultQueue.qsize())

    # Merge the partial results.  The loop stops only on Queue.Empty; the
    # former bare except also hid real errors (e.g. a missing key) and
    # silently dropped the remaining results.
    while True:
        try:
            r = resultQueue.get_nowait()
        except Queue.Empty:
            break
        for m in r:
            for cell in ("tp", "tn", "fp", "fn"):
                models[m][cell] += r[m][cell]

    print("start write out to %s" % (outputFileName))
    # Close the output deterministically so the JSON is fully flushed.
    with open(outputFileName, "w") as dst:
        json.dump(models, dst)
    print("done")