def handleOneIndex(indexpath, subdir, indexname, fast):
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'Prepare'):
        raise utils.EpochError('Please prepare first.\n')
    if utils.check_epoch(indexstatus, 'Populate'):
        return

    workdir = config.getWordRecognizerDir() + os.sep + \
        subdir + os.sep + indexname
    print(workdir)

    shmdir = config.getInMemoryFileSystem()

    for i in range(1, N + 1):
        if fast:
            #copy file
            filename = config.getNgramFileName(i)
            filepath = workdir + os.sep + filename
            shmfilepath = shmdir + os.sep + filename
            utils.copyfile(filepath, shmfilepath)

            handleOnePass(indexpath, shmdir, i)
            pruneNgramTable(indexpath, shmdir, i)

            utils.copyfile(shmfilepath, filepath)
            os.unlink(shmfilepath)
        else:
            handleOnePass(indexpath, workdir, i)
            pruneNgramTable(indexpath, workdir, i)

    #sign epoch
    utils.sign_epoch(indexstatus, 'Populate')
    utils.store_status(indexstatuspath, indexstatus)
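
# The same status/epoch bookkeeping brackets every stage in these scripts:
# load the per-file status dictionary, refuse to run before the previous
# stage, skip if this stage already ran, then sign and persist on success.
# Below is a minimal sketch of those helpers; the real utils module may
# serialize differently (JSON and the 'Epoch' key layout here are
# assumptions for illustration only).

import json
import os

def load_status(path):
    #a missing status file simply means no stage has run yet
    if not os.access(path, os.F_OK):
        return {}
    with open(path, 'r') as f:
        return json.load(f)

def store_status(path, status):
    with open(path, 'w') as f:
        json.dump(status, f)

def check_epoch(status, epoch):
    #hypothetical marker layout: finished stages collected in a list
    return epoch in status.get('Epoch', [])

def sign_epoch(status, epoch):
    status.setdefault('Epoch', []).append(epoch)
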
def generateOneText(infile, modelfile, reportfile):
    infilestatuspath = infile + config.getStatusPostfix()
    infilestatus = utils.load_status(infilestatuspath)
    if not utils.check_epoch(infilestatus, 'MergeSequence'):
        raise utils.EpochError('Please mergeseq first.\n')
    if utils.check_epoch(infilestatus, 'Generate'):
        return False

    #begin processing
    cmdline = ['../utils/training/gen_k_mixture_model', \
               '--maximum-occurs-allowed', \
               str(config.getMaximumOccursAllowed()), \
               '--maximum-increase-rates-allowed', \
               str(config.getMaximumIncreaseRatesAllowed()), \
               '--k-mixture-model-file', \
               modelfile, infile + \
               config.getMergedPostfix()]

    subprocess = Popen(cmdline, shell=False, stderr=PIPE, \
                       close_fds=True)

    lines = subprocess.stderr.readlines()
    if lines:
        print('found error report')
        with open(reportfile, 'ab') as f:
            f.writelines(lines)

    (pid, status) = os.waitpid(subprocess.pid, 0)
    if status != 0:
        sys.exit('gen_k_mixture_model encounters error.')
    #end processing

    utils.sign_epoch(infilestatus, 'Generate')
    utils.store_status(infilestatuspath, infilestatus)
    return True
def handleOneIndex(indexpath):
    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'Segment'):
        raise utils.EpochError('Please segment first.\n')
    if utils.check_epoch(indexstatus, 'MergeSequence'):
        return

    #begin processing
    indexfile = open(indexpath, 'r')
    for oneline in indexfile.readlines():
        #remove trailing '\n'
        oneline = oneline.rstrip(os.linesep)

        (title, textpath) = oneline.split('#')
        infile = config.getTextDir() + textpath
        outfile = config.getTextDir() + textpath + config.getMergedPostfix()
        reportfile = config.getTextDir() + textpath + \
            config.getMergedReportPostfix()

        print("Processing " + title + '#' + textpath)
        mergeOneText(infile, outfile, reportfile)
        print("Processed " + title + '#' + textpath)
    indexfile.close()
    #end processing

    utils.sign_epoch(indexstatus, 'MergeSequence')
    utils.store_status(indexstatuspath, indexstatus)
def mergeOneText(infile, outfile, reportfile):
    infilestatuspath = infile + config.getStatusPostfix()
    infilestatus = utils.load_status(infilestatuspath)
    if not utils.check_epoch(infilestatus, 'Segment'):
        raise utils.EpochError('Please segment first.\n')
    if utils.check_epoch(infilestatus, 'MergeSequence'):
        return

    infile = infile + config.getSegmentPostfix()

    #begin processing
    cmdline = ['../utils/segment/mergeseq', \
               '-o', outfile, infile]

    subprocess = Popen(cmdline, shell=False, stderr=PIPE, \
                       close_fds=True)

    lines = subprocess.stderr.readlines()
    if lines:
        print('found error report')
        with open(reportfile, 'wb') as f:
            f.writelines(lines)

    os.waitpid(subprocess.pid, 0)
    #end processing

    utils.sign_epoch(infilestatus, 'MergeSequence')
    utils.store_status(infilestatuspath, infilestatus)
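
# For a hypothetical text 'novel/a.txt' with '.segmented' and '.merged'
# postfixes (assumed return values of getSegmentPostfix/getMergedPostfix),
# mergeOneText spawns roughly:
#
#   ../utils/segment/mergeseq -o <textdir>/novel/a.txt.merged \
#       <textdir>/novel/a.txt.segmented
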
def handleOneDocument(infile, cur, length):
    print(infile, length)

    infilestatuspath = infile + config.getStatusPostfix()
    infilestatus = utils.load_status(infilestatuspath)
    if not utils.check_epoch(infilestatus, 'Segment'):
        raise utils.EpochError('Please segment first.\n')
    if utils.check_epoch(infilestatus, 'Populate'):
        return False

    sep = config.getWordSep()

    #train
    docfile = open(infile + config.getSegmentPostfix(), 'r')
    words = []

    for oneline in docfile.readlines():
        oneline = oneline.rstrip(os.linesep)

        if len(oneline) == 0:
            continue

        (token, word) = oneline.split(" ", 1)
        token = int(token)

        if 0 == token:
            words = []
        else:
            words.append(word)

        if len(words) < length:
            continue
        if len(words) > length:
            words.pop(0)
        assert len(words) == length

        #do sqlite training
        words_str = sep + sep.join(words) + sep
        #print(words_str)

        rowcount = cur.execute(UPDATE_NGRAM_DML, (words_str, )).rowcount
        #print(rowcount)
        assert rowcount <= 1

        if 0 == rowcount:
            cur.execute(INSERT_NGRAM_DML, (words_str, ))

    docfile.close()

    #sign epoch only after last pass
    if N == length:
        utils.sign_epoch(infilestatus, 'Populate')
        utils.store_status(infilestatuspath, infilestatus)

    return True
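
# UPDATE_NGRAM_DML and INSERT_NGRAM_DML are defined elsewhere in the
# script; from the way they are used above (one bound parameter, at most
# one row touched, insert on miss), they plausibly look like the
# following -- the exact table and column names are assumptions:

UPDATE_NGRAM_DML = 'UPDATE ngram SET freq = freq + 1 WHERE words = ?;'
INSERT_NGRAM_DML = 'INSERT INTO ngram (words, freq) VALUES (?, 1);'
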
def handleOneIndex(indexpath, subdir, indexname):
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'NewWord'):
        raise utils.EpochError('Please new word first.\n')
    if utils.check_epoch(indexstatus, 'MarkPinyin'):
        return

    workdir = config.getWordRecognizerDir() + os.sep + \
        subdir + os.sep + indexname
    print(workdir)

    markPinyins(workdir)

    #sign epoch
    utils.sign_epoch(indexstatus, 'MarkPinyin')
    utils.store_status(indexstatuspath, indexstatus)
def handleOneModel(modelfile, reportfile):
    modelfilestatuspath = modelfile + config.getStatusPostfix()
    modelfilestatus = utils.load_status(modelfilestatuspath)
    if not utils.check_epoch(modelfilestatus, 'Generate'):
        raise utils.EpochError('Please generate first.\n')
    if utils.check_epoch(modelfilestatus, 'Estimate'):
        return

    reporthandle = open(reportfile, 'wb')

    result_line_prefix = "average lambda:"
    avg_lambda = 0.

    #begin processing
    cmdline = ['../utils/training/estimate_k_mixture_model', \
               '--deleted-bigram-file', \
               config.getEstimatesModel(), \
               '--bigram-file', \
               modelfile]

    subprocess = Popen(cmdline, shell=False, stdout=PIPE, \
                       close_fds=True)

    for line in subprocess.stdout.readlines():
        reporthandle.writelines([line])

        #remove trailing '\n'
        line = line.decode('utf-8')
        line = line.rstrip(os.linesep)

        if line.startswith(result_line_prefix):
            avg_lambda = float(line[len(result_line_prefix):])

    reporthandle.close()

    (pid, status) = os.waitpid(subprocess.pid, 0)
    if status != 0:
        sys.exit('estimate k mixture model returns error.')
    #end processing

    print('average lambda:', avg_lambda)
    modelfilestatus['EstimateScore'] = avg_lambda
    utils.sign_epoch(modelfilestatus, 'Estimate')
    utils.store_status(modelfilestatuspath, modelfilestatus)
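
# Hypothetical stdout line from estimate_k_mixture_model, as implied by
# the parsing above (the numeric value is made up):
#
#   average lambda:0.692224
#
# handleOneModel strips the "average lambda:" prefix and records the
# remaining float as the model's EstimateScore.
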
def handleOneIndex(indexpath, subdir, indexname):
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'Segment'):
        raise utils.EpochError('Please segment first.\n')
    if utils.check_epoch(indexstatus, 'Prepare'):
        return

    #create directory
    onedir = config.getWordRecognizerDir() + os.sep + \
        subdir + os.sep + indexname
    os.path.exists(onedir) or os.makedirs(onedir)

    #create sqlite databases
    createSqliteDatabases(onedir)

    #sign epoch
    utils.sign_epoch(indexstatus, 'Prepare')
    utils.store_status(indexstatuspath, indexstatus)
def handleOneIndex(indexpath, subdir, indexname):
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'PartialWord'):
        raise utils.EpochError('Please partial word first.\n')
    if utils.check_epoch(indexstatus, 'NewWord'):
        return

    workdir = config.getWordRecognizerDir() + os.sep + \
        subdir + os.sep + indexname
    print(workdir)

    createBigramSqlite(workdir)
    populateBigramSqlite(workdir)

    filename = config.getBigramFileName()
    filepath = workdir + os.sep + filename
    conn = sqlite3.connect(filepath)

    prethres = computeThreshold(conn, "prefix")
    indexstatus['NewWordPrefixThreshold'] = prethres
    postthres = computeThreshold(conn, "postfix")
    indexstatus['NewWordPostfixThreshold'] = postthres
    utils.store_status(indexstatuspath, indexstatus)

    filterPartialWord(workdir, conn, prethres, postthres)

    conn.commit()
    if conn:
        conn.close()

    #sign epoch
    utils.sign_epoch(indexstatus, 'NewWord')
    utils.store_status(indexstatuspath, indexstatus)
def handleOneIndex(indexpath, subdir, indexname):
    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'Populate'):
        raise utils.EpochError('Please populate first.\n')
    if utils.check_epoch(indexstatus, 'PartialWord'):
        return

    workdir = config.getWordRecognizerDir() + os.sep + \
        subdir + os.sep + indexname
    print(workdir)

    threshold = getThreshold(workdir)
    indexstatus['PartialWordThreshold'] = threshold
    utils.store_status(indexstatuspath, indexstatus)

    recognizePartialWord(workdir, threshold)

    #sign epoch
    utils.sign_epoch(indexstatus, 'PartialWord')
    utils.store_status(indexstatuspath, indexstatus)
def gatherModels(path, indexname):
    indexfilestatuspath = indexname + config.getStatusPostfix()
    indexfilestatus = utils.load_status(indexfilestatuspath)
    if utils.check_epoch(indexfilestatus, 'Estimate'):
        return

    #begin processing
    indexfile = open(indexname, "w")
    for root, dirs, files in os.walk(path, topdown=True, onerror=handleError):
        for onefile in files:
            filepath = os.path.join(root, onefile)
            if onefile.endswith(config.getModelPostfix()):
                #append one record to index file
                subdir = os.path.relpath(root, path)
                statusfilepath = filepath + config.getStatusPostfix()
                status = utils.load_status(statusfilepath)
                if not (utils.check_epoch(status, 'Estimate') and \
                            'EstimateScore' in status):
                    raise utils.EpochError('Unknown Error:\n' + \
                                               'Try re-run estimate.\n')
                avg_lambda = status['EstimateScore']
                line = subdir + '#' + onefile + '#' + str(avg_lambda)
                indexfile.writelines([line, os.linesep])
                #record written
            elif onefile.endswith(config.getStatusPostfix()):
                pass
            elif onefile.endswith(config.getIndexPostfix()):
                pass
            elif onefile.endswith(config.getReportPostfix()):
                pass
            else:
                print('Unexpected file: ' + filepath)
    indexfile.close()
    #end processing

    utils.sign_epoch(indexfilestatus, 'Estimate')
    utils.store_status(indexfilestatuspath, indexfilestatus)
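
# Each record in the gathered index file is 'subdir#modelfile#score';
# with hypothetical names, one line might read:
#
#   novel/wuxia#candidate.7.db#0.692224
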
                    default=config.getFinalModelDir())
parser.add_argument('tryname', action='store', \
                        help='the storage directory')

args = parser.parse_args()
print(args)

tryname = 'try' + args.tryname
trydir = os.path.join(args.finaldir, tryname)
if not os.access(trydir, os.F_OK):
    sys.exit(tryname + " doesn't exist.")

cwdstatuspath = os.path.join(trydir, config.getFinalStatusFileName())
cwdstatus = utils.load_status(cwdstatuspath)
if not utils.check_epoch(cwdstatus, 'Prune'):
    raise utils.EpochError('Please tryprune first.')
if utils.check_epoch(cwdstatus, 'Evaluate'):
    sys.exit('already evaluated.')

print('checking')
checkData()

modelfile = os.path.join(trydir, config.getFinalModelFileName())
destfile = os.path.join(libpinyin_dir, 'data', \
                            config.getFinalModelFileName())
utils.copyfile(modelfile, destfile)

print('cleaning')
cleanUpData()
def handleOneIndex(indexpath, subdir, indexname, fast):
    inMemoryFile = "model.db"

    modeldir = os.path.join(config.getModelDir(), subdir, indexname)
    os.path.exists(modeldir) or os.makedirs(modeldir)

    def cleanupInMemoryFile():
        modelfile = os.path.join(config.getInMemoryFileSystem(), inMemoryFile)
        reportfile = modelfile + config.getReportPostfix()
        if os.access(modelfile, os.F_OK):
            os.unlink(modelfile)
        if os.access(reportfile, os.F_OK):
            os.unlink(reportfile)

    def copyoutInMemoryFile(modelfile):
        inmemoryfile = os.path.join\
            (config.getInMemoryFileSystem(), inMemoryFile)
        inmemoryreportfile = inmemoryfile + config.getReportPostfix()
        reportfile = modelfile + config.getReportPostfix()

        if os.access(inmemoryfile, os.F_OK):
            utils.copyfile(inmemoryfile, modelfile)
        if os.access(inmemoryreportfile, os.F_OK):
            utils.copyfile(inmemoryreportfile, reportfile)

    def cleanupFiles(modelnum):
        modeldir = os.path.join(config.getModelDir(), subdir, indexname)
        modelfile = os.path.join( \
            modeldir, config.getCandidateModelName(modelnum))
        reportfile = modelfile + config.getReportPostfix()
        if os.access(modelfile, os.F_OK):
            os.unlink(modelfile)
        if os.access(reportfile, os.F_OK):
            os.unlink(reportfile)

    def storeModelStatus(modelfile, textnum, nexttextnum):
        #store model info in status file
        modelstatuspath = modelfile + config.getStatusPostfix()
        #create None status
        modelstatus = {}
        modelstatus['GenerateStart'] = textnum
        modelstatus['GenerateEnd'] = nexttextnum
        utils.sign_epoch(modelstatus, 'Generate')
        utils.store_status(modelstatuspath, modelstatus)

    print(indexpath, subdir, indexname)

    indexstatuspath = indexpath + config.getStatusPostfix()
    indexstatus = utils.load_status(indexstatuspath)
    if not utils.check_epoch(indexstatus, 'MergeSequence'):
        raise utils.EpochError('Please mergeseq first.\n')
    if utils.check_epoch(indexstatus, 'Generate'):
        return

    #continue generating
    textnum, modelnum, aggmodelsize = 0, 0, 0
    if 'GenerateTextEnd' in indexstatus:
        textnum = indexstatus['GenerateTextEnd']
    if 'GenerateModelEnd' in indexstatus:
        modelnum = indexstatus['GenerateModelEnd']

    #clean up previous file
    if fast:
        cleanupInMemoryFile()
    cleanupFiles(modelnum)

    #begin processing
    indexfile = open(indexpath, 'r')
    for i, oneline in enumerate(indexfile.readlines()):
        #continue last generating
        if i < textnum:
            continue

        #remove trailing '\n'
        oneline = oneline.rstrip(os.linesep)

        (title, textpath) = oneline.split('#')
        infile = config.getTextDir() + textpath
        infilesize = utils.get_file_length(infile + config.getMergedPostfix())
        if infilesize < config.getMinimumFileSize():
            print("Skipping " + title + '#' + textpath)
            continue

        if fast:
            modelfile = os.path.join(config.getInMemoryFileSystem(), \
                                     inMemoryFile)
        else:
            modelfile = os.path.join(modeldir, \
                                     config.getCandidateModelName(modelnum))
        reportfile = modelfile + config.getReportPostfix()

        print("Processing " + title + '#' + textpath)
        if generateOneText(infile, modelfile, reportfile):
            aggmodelsize += infilesize
        print("Processed " + title + '#' + textpath)

        if aggmodelsize > config.getCandidateModelSize():
            #copy out in memory file
            if fast:
                modelfile = os.path.join\
                    (modeldir, config.getCandidateModelName(modelnum))
                copyoutInMemoryFile(modelfile)
                cleanupInMemoryFile()

            #the model file is on disk now
            nexttextnum = i + 1
            storeModelStatus(modelfile, textnum, nexttextnum)

            #new model candidate
            aggmodelsize = 0
            textnum = nexttextnum
            modelnum += 1

            #clean up next file
            cleanupFiles(modelnum)

            #save current progress in status file
            indexstatus['GenerateTextEnd'] = nexttextnum
            indexstatus['GenerateModelEnd'] = modelnum
            utils.store_status(indexstatuspath, indexstatus)

    #copy out in memory file
    if fast:
        modelfile = os.path.join\
            (modeldir, config.getCandidateModelName(modelnum))
        copyoutInMemoryFile(modelfile)
        cleanupInMemoryFile()

    #the model file is on disk now
    nexttextnum = i + 1
    storeModelStatus(modelfile, textnum, nexttextnum)

    indexfile.close()
    #end processing

    #save current progress in status file
    modelnum += 1
    indexstatus['GenerateTextEnd'] = nexttextnum
    indexstatus['GenerateModelEnd'] = modelnum

    utils.sign_epoch(indexstatus, 'Generate')
    utils.store_status(indexstatuspath, indexstatus)
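
# Resume sketch: if a previous run stored GenerateTextEnd=120 and
# GenerateModelEnd=3 (hypothetical numbers), a rerun of handleOneIndex
# skips the first 120 index lines and keeps appending text into candidate
# model 3, so interrupted generation continues instead of restarting.
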