def main(args): parser = argparse.ArgumentParser() parser.add_argument("config_file", nargs='?', default= os.path.join(os.getcwd(),"mseg_config.cfg"), help="configuration file for the multistage segmenter") args = parser.parse_args() print "running main_run" config_fname = args.config_file with open(config_fname) as data_file: config = json.load(data_file) base_dir = config['base_dir'] batches = config['batches'] for batch in batches: if(not batch['run_batch']): continue print "RUNNING BATCH", batch batch_name = batch['batch_dir'] batch_dir = os.path.join(base_dir, batch_name) lm_dir = os.path.join(base_dir, batch['language_model']) pm_dir = os.path.join(base_dir, batch['prosodic_model']) slm_dir = os.path.join(base_dir,batch['length_model']) te_file = batch['test_file'] use_pos_tags = batch.get('use_pos_tags', False) pm_weight = batch.get('pm_weight', 1) gold_dir = os.path.join(batch_dir, "gold") all_models_dir = os.path.join(batch_dir, "pm_lm_slm") all_models_out_dir = os.path.join(all_models_dir, "output") all_models_in_dir = os.path.join(all_models_dir, "composed") all_models_shp_dir = os.path.join(all_models_dir, "shortest") pm_only_dir = os.path.join(batch_dir, "pm_only") pm_shp_dir = os.path.join(pm_only_dir, "shortest") pm_outs_dir = os.path.join(pm_only_dir, "output") pm_lm_dir = os.path.join(batch_dir, "pm_lm") pm_lm_in_dir = os.path.join(pm_lm_dir, "composed") pm_lm_shp_dir = os.path.join(pm_lm_dir, "shortest") pm_lm_outs_dir = os.path.join(pm_lm_dir, "output") pm_slm_dir = os.path.join(batch_dir, "pm_slm") pm_slm_in_dir = os.path.join(pm_lm_dir, "composed") pm_slm_shp_dir = os.path.join(pm_lm_dir, "shortest") pm_slm_outs_dir = os.path.join(pm_lm_dir, "output") if(do_build): #lmdir_global = os.path.join(base_dir,lm_dir) batch_input_fst_dir = os.path.join(batch_dir, "speech_fsts") if not os.path.exists(batch_input_fst_dir): os.makedirs(batch_input_fst_dir) te_rows = read_file(os.path.join(base_dir, te_file), ',', skip_header=True) create_gold_files.generate_gold_files(gold_dir, te_rows) #ONE: make speech_fsts from te_rows lmsym_fname = os.path.join(lm_dir,LM_SYM_FILE) lm_syms = load_symbol_table(lmsym_fname) te_syms = [r[SYMBOL_COL] for r in te_rows] all_syms = set(lm_syms + te_syms) pmsym_fname = os.path.join(batch_dir, SYM_FILE) save_symbol_table(all_syms, pmsym_fname) probability_file = os.path.join(pm_dir, (te_file+"-probabilities.dat")) if not os.path.exists(probability_file): print "No prosodic probability file found: ", probability_file, " - you need to create this first with train_pm.py" continue #go onto the next batch TODO should create prob file here! 
prob_rows = read_file(probability_file, ' ', skip_header=True) if use_pos_tags: emission_vals = posify(te_rows) else: emission_vals = te_syms generate_pm_text_files(batch_input_fst_dir, lm_syms, te_rows, prob_rows, max_count=-1, emission_values=emission_vals, pm_weight=pm_weight) compile_pm_files(batch_input_fst_dir, pmsym_fname, lmsym_fname) #TWO: Assuming all the other model files are complete, we should be good to go #lang_mod = os.path.join(lm_dir,LM_PRUNED) lang_mod = os.path.join(lm_dir,"mod.pru") # lang_mod = os.path.join(lm_dir,"lm.mod") print "joined up working dir names" find_shortest_paths.stringify_shortest_paths(batch_input_fst_dir, pm_shp_dir, pm_outs_dir) #now just use the pruned LM file without slen modifier process_inputs(batch_input_fst_dir, lang_mod, pm_lm_in_dir) find_shortest_paths.stringify_shortest_paths(pm_lm_in_dir, pm_lm_shp_dir, pm_lm_outs_dir) #(input_dir, shortpath_dir, strings_dir #use combined LM and slen modifier slm_file = os.path.join(slm_dir,"slm.fst") lm_slm = os.path.join(batch_dir,"lm_slm.fst") if compose_lm_slm: lm_utils.fstarcsort(slm_file, ilabel_sort=True) lm_utils.fstcompose(lang_mod, slm_file, lm_slm) lm_utils.fstimmut(lm_slm, lm_slm) process_inputs(batch_input_fst_dir, lm_slm, all_models_in_dir) print "doing find shortest paths..." find_shortest_paths.stringify_shortest_paths(all_models_in_dir, all_models_shp_dir, all_models_out_dir) R = convert_to_single_file("*.gld", gold_dir) C = convert_to_single_file("*.fst", all_models_out_dir) PM_C = convert_to_single_file("*.fst", pm_outs_dir) PMLM_C = convert_to_single_file("*.fst", pm_lm_outs_dir) cands = (("PM", PM_C), ("PM_LM",PMLM_C), ("PM_LM_SLM",C)) evaluate_output.eval_segmenter_output(batch_dir) mc_report = evaluate_output.multi_col_report(batch_dir) mcrfile = open(os.path.join(batch_dir, "mc_report.csv"),"w") for r in mc_report: rec_id = r["rec_id"] words = r["words"] gold = r["gold"] pm = r["pm_only"] pm_lm = r["pm_lm"] pm_lm_slm = r["pm_lm_slm"] mcrfile.write("recording, word, gold, pm_only, pm_lm, pm_lm_slm\n") for row in zip(words, gold, pm, pm_lm, pm_lm_slm): s = ",".join(row) s = rec_id + "," + s + "\n" mcrfile.write(s) mcrfile.close() bfile = open(os.path.join(batch_dir, batch_name+"-SCORES.txt"),"w") bfile.write(batch_name+"\n\n"); #report BLEU-like scores for 4- and 3-grammes in both strict and lax counting modes for o in (4,3): for s in (True,False): write_bleus_to_file(R, cands, bfile, o, strict=s) #create a list of {0,1} values to show break or no break #golds=[r["gold"] for r in mc_report] golds=[ int(item) for r in mc_report for item in r['gold']] for m in ("pm_only", "pm_lm", "pm_lm_slm"): hyps=[int(item) for r in mc_report for item in r[m]] prF = report_utils.get_prF(golds, hyps) b_acc = report_utils.get_baseline_accuracy(golds, hyps) acc = report_utils.get_accuracy(golds, hyps) bfile.write("prF (%s)=%s\n" % (m,str(prF))) bfile.write("acc (%s)=%s with delta=%s\n" % (m,str(acc),str(acc-b_acc))) bfile.write("- - - - - -\n") bfile.close() print "Wrote bleu scores to file: ", bfile
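# For reference, main() above expects the JSON config file to provide 'base_dir' and a
# 'batches' list, where each batch supplies the keys read in the loop. A minimal sketch
# (illustrative only -- the directory and file names below are invented, not from a real run):
#
# {
#     "base_dir": "/path/to/mseg_workspace",
#     "batches": [
#         {
#             "run_batch": true,
#             "batch_dir": "batch_01",
#             "language_model": "lm_default",
#             "prosodic_model": "pm_default",
#             "length_model": "lm_default",
#             "test_file": "test1.csv",
#             "use_pos_tags": false,
#             "pm_weight": 1
#         }
#     ]
# }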
def main(args):
    ## tr_data: training set
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "base_dir",
        nargs="?",
        default=os.path.join(os.getcwd(), "mseg_workspace"),
        help="this is the working directory, all sub dirs live under it",
    )
    parser.add_argument(
        "pm_dir",
        nargs="?",
        default="pm_default",
        help="this is the directory in which to store the prosodic model file",
    )
    parser.add_argument(
        "training_file",
        nargs="?",
        default=TRAIN_FILE_DEFAULT,
        help="name of CSV file that contains correctly annotated training examples",
    )
    parser.add_argument(
        "test_file",
        nargs="?",
        default=TEST_FILE_DEFAULT,
        help="name of CSV file that contains unlabelled cases to be tested",
    )
    parser.add_argument(
        "-lr",
        "--logistic_regression",
        default=False,
        action="store_true",
        help="use logistic regression classifier (default is RBF-SVM)",
    )
    args = parser.parse_args()

    base_dir = args.base_dir
    pm_dir = args.pm_dir
    tr_file = args.training_file
    test_fname = args.test_file
    use_lr = args.logistic_regression

    n_samples = -1
    cache = 800

    tr_data = read_file(os.path.join(base_dir, tr_file), ",", skip_header=True)

    if not use_lr:
        # the RBF-SVM is slow, so cap the training set size
        n_samples = 6000
        out_fname = test_fname + "-probabilities.dat"
        report_fname = test_fname + "-report.txt"
    else:
        out_fname = test_fname + "-probabilities.dat"
        report_fname = test_fname + "-report-LR.txt"

    out_file = os.path.join(base_dir, pm_dir, out_fname)
    report_fname = os.path.join(base_dir, pm_dir, report_fname)

    # clear extant predictions file
    if os.path.exists(out_file):
        os.remove(out_file)
        print "removed", out_file

    print base_dir + "/" + tr_file + " -SVM-> ", out_file
    test_data = read_file(os.path.join(base_dir, test_fname), ",", skip_header=True)

    # columns of the CSV to use as features
    # sel = [12,13,14,15,21,22,23,24]
    sel = range(7, 30)
    # sel = [8,21,29, 24,25,27]
    (_, _, tr_samples, tr_classes) = dissect(tr_data, sel)
    (_, te_words, te_samples, te_classes) = dissect(test_data, sel)

    if n_samples > 0:
        tr_samples, _, tr_classes, _ = train_test_split(
            tr_samples, tr_classes, train_size=n_samples, stratify=tr_classes
        )

    p = sum(c == 1.0 for c in tr_classes)  # count the positive instances
    n = len(tr_classes) - p  # derive the negative instances
    print "n=", n, " p=", p
    wgt = float(n) / float(p)  # cast and divide
    print "wgt=", wgt
    # classWeight = { 1: wgt }

    scaler = preprocessing.StandardScaler().fit(np.array(tr_samples))
    tr_samples = scaler.transform(tr_samples)

    clf = None
    best_params = None

    # load a pickled classifier if one exists, otherwise run a randomised
    # hyper-parameter search and pickle the best estimator (takes a while)
    pkl_dir = os.path.join(base_dir, pm_dir, "pkl")
    pickled_model = os.path.join(pkl_dir, "svm_classifier.pkl")
    if os.path.exists(pickled_model) and not overwrite_pkl:
        clf = joblib.load(pickled_model)
        clf.set_params(verbose=True)
        print "loaded pickled model...", pickled_model
    else:
        if not os.path.exists(pkl_dir):  # output dir doesn't exist so make it
            os.makedirs(pkl_dir)
            print "made dir for pickled model:", pkl_dir

        cmin, cmax, cstep = -5, 17, 2
        cr = range(cmin, cmax, cstep)
        print(cr)
        # c_range = [pow(2, y) for y in cr]
        # c_range = (0.005, 0.5, 5, 50, 500, 5000, 50000)
        c_range = (0.5, 50, 5000)
        print("c_range", c_range)

        gmin, gmax, gstep = -15, 5, 2
        gr = range(gmin, gmax, gstep)
        print(gr)
        # gamma_range = [pow(2, y) for y in gr]
        # gamma_range = (0.00005, 0.0005, 0.005, 0.05, 0.5, 5.0, 50, 500)
        gamma_range = (0.0005, 0.05, 5.0, 500)
        print("gamma_range", gamma_range)

        c_dist = scipy.stats.expon(scale=100)
        gamma_dist = scipy.stats.expon(scale=0.01)

        if use_lr:
            estr = LogisticRegression(class_weight="balanced")
            # estr = LogisticRegression()
            param_dist = {"C": c_dist}
        else:
            estr = svm.SVC(kernel="rbf", cache_size=800, probability=True, class_weight="balanced")
            # estr = svm.LinearSVC(class_weight='balanced')
            param_dist = {"C": c_dist, "gamma": gamma_dist}

        # searcher = RandomizedSearchCV(estr, param_distributions=param_dist, n_iter=100, n_jobs=-1, cv=5, verbose=True)  #, scoring="recall")
        searcher = RandomizedSearchCV(
            estr, param_distributions=param_dist, n_iter=100, n_jobs=-1, verbose=True, scoring="recall"
        )
        searcher.fit(tr_samples, tr_classes)
        report(searcher.grid_scores_)
        clf = searcher.best_estimator_
        print "COMPARING CLF PARAMS WITH BEST PARAMS (shd be same)"
        print clf.get_params()
        print best_params
        joblib.dump(clf, pickled_model)
        print clf

    # print "FITTING"
    # clf.set_params(verbose=True)
    # clf.fit(tr_samples, tr_classes)
    # print clf

    # NOW TO TEST AGAINST HELD-OUT/TEST DATA
    te_samples = scaler.transform(te_samples)
    print "no. of test cases", len(te_samples)
    predictions = -1.0 * clf.predict_log_proba(
        te_samples
    )  # this is a list of pairs of probs in form [ [1-p, p], ... ]
    # predictions = -1.0 * clf.decision_function(te_samples)
    print predictions
    predicted_classes = clf.predict(te_samples)
    print(
        "TEST: Number of mislabelled points out of a total %d points : %d"
        % (len(te_samples), (te_classes != predicted_classes).sum())
    )
    print(classification_report(te_classes, predicted_classes))

    rpt = open(report_fname, "w")
    rpt.write(classification_report(te_classes, predicted_classes))
    rpt.write("\n")
    rpt.close()
    print "wrote report file", report_fname

    pred_file = open(out_file, "w")
    pred_file.write("labels 0 1\n")  # this emulates an earlier file format for compatibility
    for word, prob_tuple, guessed_class in zip(te_words, predictions, predicted_classes):
        pred_file.write("%d %f %f %s\n" % (guessed_class, prob_tuple[0], prob_tuple[1], word))
    pred_file.close()
    print "wrote predictions file:", out_file
import os

from mseg.common import LM_SYM_FILE, SYM_FILE, TRAIN_FILE_DEFAULT, \
    DIR, TEST_FILE_DEFAULT, read_file, save_symbol_table
from mseg.lm_utils import generate_normed_text_file

gen_ntxt = False
tr_fname, slm_dir = TRAIN_FILE_DEFAULT, "eval1n"
#tr_fname, slm_dir = "test1.csv", "test1"

if __name__ == '__main__':
    #TRAIN_FILE_DEFAULT = "smalltest_norm.csv"
    slm_dir = raw_input("Type in LM dir or hit return to use default [%s]" % slm_dir) or slm_dir
    print "using ", slm_dir
    lmdir_global = os.path.join(DIR, slm_dir)
    tr_fname = raw_input("enter training file name: [%s]" % tr_fname) or tr_fname
    te_file = raw_input("enter test file name: [%s]" % TEST_FILE_DEFAULT) or TEST_FILE_DEFAULT

    tr_data = read_file(os.path.join(DIR, tr_fname), ',', skip_header=True)
    te_rows = read_file(os.path.join(DIR, te_file), ',', skip_header=True)

    rawtext_file = generate_normed_text_file(tr_data, lmdir_global)

    # column 5 holds the word symbol; the LM symbol table covers training words only
    all_syms = set([r[5] for r in (tr_data + te_rows)])
    lm_syms = set([r[5] for r in tr_data])
    save_symbol_table(all_syms, lmdir_global, SYM_FILE)
    save_symbol_table(lm_syms, lmdir_global, LM_SYM_FILE)
        w = r[5]  #[1:-1]
        b = r[6]
        if (next_transcript_id != transcript_id):
            # new transcript: close the current gold file and start a fresh one
            transcript_id = next_transcript_id
            ofile.flush()
            ofile.close()
            # print("wrote", ofile)
            ofile = open_wfile(odir, transcript_id)
        ofile.write(w + "\t" + b + "\n")
        # if(b):
        #     ofile.write("<break>\n")
    ofile.flush()
    ofile.close()
    print("wrote {} rows of gold data to {}".format(len(rows), odir))


def open_wfile(odir, transcript_id):
    ofilename = os.path.join(odir, transcript_id + ".gld")
    fhandle = codecs.open(ofilename, 'w')
    return fhandle


if __name__ == '__main__':
    lmdir = "eval1n"
    te_file = raw_input("enter test file name: [%s]" % TEST_FILE_DEFAULT) or TEST_FILE_DEFAULT
    te_rows = read_file(os.path.join(DIR, te_file), ',', skip_header=True)
    generate_gold_files(te_rows)
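# Each .gld file written above holds one token per line: the word and its break label
# (columns 5 and 6 of the input CSV) separated by a tab. An illustrative sketch, assuming
# the break column holds 0/1 flags (tokens invented):
#
#   hello\t0
#   world\t1
#   how\t0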
def main(args): parser = argparse.ArgumentParser() parser.add_argument("base_dir", nargs='?', default= os.path.join(os.getcwd(),"mseg_workspace"), help="this is the working directory, all sub dirs live under it") parser.add_argument("lm_dir", nargs='?', default="lm_default", help="this is the directory in which to store the Language Model (LM) files") parser.add_argument("training_file", nargs='?', default=TRAIN_FILE_DEFAULT, help="name of CSV file that contains correctly annotated training examples") parser.add_argument("-o", "--order", type=int, default=4, help="sets the n-gramme order of the LM (default=4)") args = parser.parse_args() base_dir = args.base_dir # pm_dir = "pm_default" currently not used lm_dir = args.lm_dir tr_file = args.training_file ngo = args.order #SECTION ONE: dedicated to creating the Language Model files lmdir_global = os.path.join(base_dir, lm_dir) # if(os.path.exists(lmdir_global)): # shutil.rmtree(lmdir_global) # os.makedirs(lmdir_global) tr_rows = read_file(os.path.join(base_dir, tr_file), ',', skip_header=True) rawtext_file = generate_normed_text_file(tr_rows, lmdir_global) #lm_syms = set([r[5] for r in tr_rows]) lm_txt = open(rawtext_file, "r").readlines() lm_syms= set( open(rawtext_file, "r").read().split() ) if BREAK in lm_syms: lm_syms.remove(BREAK) if UNK in lm_syms: lm_syms.remove(UNK) buildmod = "y" modfile = os.path.join(lmdir_global,"lm.mod") modpru = os.path.join(lmdir_global,LM_PRUNED) print "checking for LM file",modfile if(os.path.exists(modfile)): buildmod = raw_input("model file in "+lmdir_global+" already exists. Overwrite? [n]").lower() or "n" if(not buildmod=="n"): modfile = compile_lm(rawtext_file, lmdir_global, lm_syms, ngo) print "Created unpruned lang model file:", modfile print "Now pruning LM..." ngramshrink(modfile, modpru) #print "Now minimising LM..." #lm_utils.fstmin(modpru,modpru) #we don't use the unpruned modfile again, so switch over to the pruned version here modfile = modpru remap_fname = os.path.join(lmdir_global,"lm_remap.dat") create_remap_table(lm_syms, remap_fname) osymfile = os.path.join(base_dir,"slm_sym.dat") create_slm_sym_file(osymfile) remap_lm(modfile, remap_fname, osymfile) print "remapped modfile output symbols" #create_converter(lmdir_global) #print "created converter." # slm_dir = raw_input("Type in SLM dir or hit return to match LM [%s]" % lm_dir) or lm_dir # slm_dir = os.path.join(base_dir,slm_dir) # # print "using ",slm_dir #put the SLM model into the same directory as its corresponding LM slm_dir = os.path.join(base_dir, lm_dir) generate_slm_from_txt(lm_txt, slm_dir, do_plot=True) #generate_slm(tr_rows, slm_dir, do_plot=False) # build the sentence length model, plot it so we can see it's sane print "slm generated from", tr_file, "in", slm_dir print "all constituent system files now compiled"
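# None of the main(args) functions above use the args parameter directly (each builds its
# own argparse parser), so a minimal entry point suffices. A sketch of the assumed pipeline
# order -- build the LM/SLM, train the prosodic model, then run the batches -- with
# illustrative invocations; the script file names and paths here are assumptions, not
# confirmed by the source:
#
#   python build_lm.py ~/mseg_workspace lm_default train1.csv -o 4
#   python train_pm.py ~/mseg_workspace pm_default train1.csv test1.csv
#   python main_run.py ~/mseg_workspace/mseg_config.cfg
#
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])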