Example #1
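# NOTE: this excerpt omits its module-level imports and flags. A minimal header,
# assuming the helpers live where the other examples suggest (the exact module
# paths for the project helpers are a guess), might look like:
#
#   import argparse, json, os
#   from mseg.common import LM_SYM_FILE, SYM_FILE, SYMBOL_COL, read_file, \
#       load_symbol_table, save_symbol_table
#   from mseg import lm_utils, create_gold_files, find_shortest_paths, \
#       evaluate_output, report_utils
#
#   do_build = True        # gates the per-batch FST build stage below
#   compose_lm_slm = True  # compose the pruned LM with the sentence-length model
#
# (posify, generate_pm_text_files, compile_pm_files, process_inputs,
#  convert_to_single_file and write_bleus_to_file are assumed to be imported
#  or defined in the same module.)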
def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument("config_file", nargs='?', default= os.path.join(os.getcwd(),"mseg_config.cfg"), help="configuration file for the multistage segmenter")
    args = parser.parse_args()
    
    print "running main_run"
    config_fname = args.config_file
    with open(config_fname) as data_file:
        config = json.load(data_file)
        
    base_dir = config['base_dir']
    batches = config['batches']

    for batch in batches:
        if(not batch['run_batch']):
            continue
        
        print "RUNNING BATCH", batch
        
        batch_name = batch['batch_dir']
        batch_dir = os.path.join(base_dir, batch_name)
        lm_dir = os.path.join(base_dir, batch['language_model'])
        pm_dir = os.path.join(base_dir, batch['prosodic_model'])
        slm_dir = os.path.join(base_dir,batch['length_model'])
        te_file = batch['test_file']
        
        use_pos_tags = batch.get('use_pos_tags', False)
        pm_weight = batch.get('pm_weight', 1)

        gold_dir = os.path.join(batch_dir, "gold")
        all_models_dir = os.path.join(batch_dir, "pm_lm_slm")
        
        all_models_out_dir = os.path.join(all_models_dir, "output")
        all_models_in_dir = os.path.join(all_models_dir, "composed")
        all_models_shp_dir = os.path.join(all_models_dir, "shortest")
        
        pm_only_dir = os.path.join(batch_dir, "pm_only")
        pm_shp_dir = os.path.join(pm_only_dir, "shortest")
        pm_outs_dir = os.path.join(pm_only_dir, "output")
        
        pm_lm_dir = os.path.join(batch_dir, "pm_lm")
        pm_lm_in_dir = os.path.join(pm_lm_dir, "composed")
        pm_lm_shp_dir = os.path.join(pm_lm_dir, "shortest")
        pm_lm_outs_dir = os.path.join(pm_lm_dir, "output")

        pm_slm_dir = os.path.join(batch_dir, "pm_slm")
        pm_slm_in_dir = os.path.join(pm_slm_dir, "composed")
        pm_slm_shp_dir = os.path.join(pm_slm_dir, "shortest")
        pm_slm_outs_dir = os.path.join(pm_slm_dir, "output")


        if(do_build): 
            #lmdir_global = os.path.join(base_dir,lm_dir)
            batch_input_fst_dir = os.path.join(batch_dir, "speech_fsts")
            if not os.path.exists(batch_input_fst_dir):
                os.makedirs(batch_input_fst_dir)
        
            te_rows = read_file(os.path.join(base_dir, te_file), ',', skip_header=True)
        
            create_gold_files.generate_gold_files(gold_dir, te_rows)
        
            #ONE: make speech_fsts from te_rows
            lmsym_fname = os.path.join(lm_dir,LM_SYM_FILE)
            lm_syms = load_symbol_table(lmsym_fname)
              
            te_syms = [r[SYMBOL_COL] for r in te_rows]
    
            all_syms = set(lm_syms + te_syms)
            pmsym_fname = os.path.join(batch_dir, SYM_FILE)
            save_symbol_table(all_syms, pmsym_fname)
            
            probability_file = os.path.join(pm_dir, (te_file+"-probabilities.dat"))
            if not os.path.exists(probability_file):
                print "No prosodic probability file found: ", probability_file, " - you need to create this first with train_pm.py"
                continue #go onto the next batch TODO should create prob file here!
            
            prob_rows = read_file(probability_file, ' ', skip_header=True)
        
            if use_pos_tags:
                emission_vals = posify(te_rows)
            else:
                emission_vals = te_syms
                        
        
            generate_pm_text_files(batch_input_fst_dir, lm_syms, te_rows, prob_rows, max_count=-1, emission_values=emission_vals, pm_weight=pm_weight)
            compile_pm_files(batch_input_fst_dir, pmsym_fname, lmsym_fname)
            
            #TWO: Assuming all the other model files are complete, we should be good to go
            
            #lang_mod = os.path.join(lm_dir,LM_PRUNED)
            lang_mod = os.path.join(lm_dir,"mod.pru")
#             lang_mod = os.path.join(lm_dir,"lm.mod")

                    
            print "joined up working dir names"
   
            find_shortest_paths.stringify_shortest_paths(batch_input_fst_dir, pm_shp_dir, pm_outs_dir)
            
  
            #now just use the pruned LM file without slen modifier
            process_inputs(batch_input_fst_dir, lang_mod, pm_lm_in_dir)
            find_shortest_paths.stringify_shortest_paths(pm_lm_in_dir, pm_lm_shp_dir, pm_lm_outs_dir) #(input_dir, shortpath_dir, strings_dir)
            
            #use combined LM and slen modifier
            slm_file = os.path.join(slm_dir,"slm.fst")
            lm_slm = os.path.join(batch_dir,"lm_slm.fst") 
            
            if compose_lm_slm:
                lm_utils.fstarcsort(slm_file, ilabel_sort=True)
                lm_utils.fstcompose(lang_mod, slm_file, lm_slm)
                lm_utils.fstimmut(lm_slm, lm_slm)
                     
                process_inputs(batch_input_fst_dir, lm_slm, all_models_in_dir)
            
        print "doing find shortest paths..."
        find_shortest_paths.stringify_shortest_paths(all_models_in_dir, all_models_shp_dir, all_models_out_dir)

        R = convert_to_single_file("*.gld", gold_dir)        
        C = convert_to_single_file("*.fst", all_models_out_dir)
        
        PM_C = convert_to_single_file("*.fst", pm_outs_dir)
        PMLM_C = convert_to_single_file("*.fst", pm_lm_outs_dir)
        
        cands = (("PM", PM_C),
                 ("PM_LM",PMLM_C),
                 ("PM_LM_SLM",C))
        
        evaluate_output.eval_segmenter_output(batch_dir)
        
        mc_report = evaluate_output.multi_col_report(batch_dir)
        
        mcrfile = open(os.path.join(batch_dir, "mc_report.csv"),"w")
        for r in mc_report:
            rec_id = r["rec_id"]
            words = r["words"]
            gold = r["gold"]
            pm = r["pm_only"]
            pm_lm = r["pm_lm"]
            pm_lm_slm = r["pm_lm_slm"]
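            # note: a header row is written at the start of each recording's block in mc_report.csv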
        
            mcrfile.write("recording, word, gold, pm_only, pm_lm, pm_lm_slm\n")
            for row in zip(words, gold, pm, pm_lm, pm_lm_slm):
                s = ",".join(row)
                s = rec_id + "," + s + "\n"
                mcrfile.write(s)
        mcrfile.close()

        bfile = open(os.path.join(batch_dir, batch_name+"-SCORES.txt"),"w")
        bfile.write(batch_name+"\n\n")
        
        #report BLEU-like scores for 4- and 3-grammes in both strict and lax counting modes
        for o in (4,3):
            for s in (True,False):
                write_bleus_to_file(R, cands, bfile, o, strict=s)

        #create a list of {0,1} values to show break or no break
        #golds=[r["gold"] for r in mc_report]
        golds=[ int(item) for r in mc_report for item in r['gold']]
        for m in ("pm_only", "pm_lm", "pm_lm_slm"):        
            hyps=[int(item) for r in mc_report for item in r[m]]
            prF = report_utils.get_prF(golds, hyps)
            b_acc = report_utils.get_baseline_accuracy(golds, hyps)
            acc = report_utils.get_accuracy(golds, hyps)
            bfile.write("prF (%s)=%s\n" % (m,str(prF)))
            bfile.write("acc (%s)=%s with delta=%s\n" % (m,str(acc),str(acc-b_acc)))
            bfile.write("- - - - - -\n")
        bfile.close()
        
        print "Wrote bleu scores to file: ", bfile
Example #2
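# NOTE: the imports and the overwrite_pkl flag are not shown in this excerpt.
# A plausible header (module paths are assumptions and depend on the
# scikit-learn version this was written against) might be:
#
#   import argparse, os
#   import numpy as np
#   import scipy.stats
#   from sklearn import preprocessing, svm
#   from sklearn.linear_model import LogisticRegression
#   from sklearn.model_selection import RandomizedSearchCV, train_test_split
#   from sklearn.metrics import classification_report
#   from sklearn.externals import joblib
#   from mseg.common import TRAIN_FILE_DEFAULT, TEST_FILE_DEFAULT, read_file
#
#   overwrite_pkl = False  # set True to force a fresh hyper-parameter search
#
# dissect() and report() are project helpers assumed to be defined or imported elsewhere.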
def main(args):
    ## tr_data training set
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "base_dir",
        nargs="?",
        default=os.path.join(os.getcwd(), "mseg_workspace"),
        help="this is the working directory, all sub dirs live under it",
    )
    parser.add_argument(
        "pm_dir",
        nargs="?",
        default="pm_default",
        help="this is the directory in which to store the prosodic model file",
    )
    parser.add_argument(
        "training_file",
        nargs="?",
        default=TRAIN_FILE_DEFAULT,
        help="name of CSV file that contains correctly annotated training examples",
    )
    parser.add_argument(
        "test_file",
        nargs="?",
        default=TEST_FILE_DEFAULT,
        help="name of CSV file that contains mysterious cases that must be tested",
    )
    parser.add_argument(
        "-lr",
        "--logistic_regression",
        default=False,
        action="store_true",
        help="use logistic regression classifier (default is RBF-SVM)",
    )
    args = parser.parse_args()

    base_dir = args.base_dir
    pm_dir = args.pm_dir
    tr_file = args.training_file
    test_fname = args.test_file
    use_lr = args.logistic_regression

    #     if(len(args)==3):
    #         base_dir = args[0]
    #         pm_dir = args[1]
    #         tr_file = args[2]
    #         test_fname = args[3]
    #     else:
    #         base_dir = DIR
    #         pm_dir = "pm_default"
    #         tr_file = TRAIN_FILE_DEFAULT
    #         test_fname = TEST_FILE_DEFAULT
    # #         do_search = False
    # #         use_pilot = False
    n_samples = -1
    cache = 800

    #     pm_dir= raw_input("enter PM name: [%s]" % pm_dir) or pm_dir
    #     tr_file = raw_input("enter PM training file name: [%s]" % tr_file) or tr_file

    tr_data = read_file(os.path.join(base_dir, tr_file), ",", skip_header=True)

    #     test_fname = raw_input("enter file to test on: [%s]" % test_fname) or test_fname
    #     use_lr = bool(raw_input("use logistic regression [False]?")) or False

    if not use_lr:
        n_samples = 6000
        out_fname = test_fname + "-probabilities.dat"
        report_fname = test_fname + "-report.txt"
    else:
        out_fname = test_fname + "-probabilities.dat"
        report_fname = test_fname + "-report-LR.txt"

    out_file = os.path.join(base_dir, pm_dir, out_fname)
    report_fname = os.path.join(base_dir, pm_dir, report_fname)
    # clear extant predictions file
    if os.path.exists(out_file):
        os.remove(out_file)
        print "removed", out_file

    print base_dir + "/" + tr_file + " -SVM-> ", out_file

    test_data = read_file(os.path.join(base_dir, test_fname), ",", skip_header=True)

    # sel = [12,13,14,15,21,22,23,24]
    sel = range(7, 30)
    # sel = [8,21,29, 24,25,27]
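    # sel holds the column indices of the feature fields that dissect() pulls out of each CSV row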

    (_, _, tr_samples, tr_classes) = dissect(tr_data, sel)
    (_, te_words, te_samples, te_classes) = dissect(test_data, sel)

    if n_samples > 0:
        tr_samples, _, tr_classes, _ = train_test_split(
            tr_samples, tr_classes, train_size=n_samples, stratify=tr_classes
        )

    p = sum(c == 1.0 for c in tr_classes)  # count the positive instances
    n = len(tr_classes) - p  # derive the negative instances
    print "n=", n, " p=", p
    wgt = float(n) / float(p)  # cast and divide
    print "wgt=", wgt
    #     classWeight = { 1: wgt }
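    # note: wgt is only printed for inspection; the estimators below use class_weight="balanced" instead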

    # tr_samples, te_samples, tr_classes, te_classes = train_test_split(samples, classes, test_size=0.20, random_state=0, stratify=classes)

    scaler = preprocessing.StandardScaler().fit(np.array(tr_samples))
    tr_samples = scaler.transform(tr_samples)

    clf = None
    best_params = None
    # override the defaults with the results of a grid search if desired (takes a while)

    # pickled = False
    pkl_dir = os.path.join(base_dir, pm_dir, "pkl")
    pickled_model = os.path.join(pkl_dir, "svm_classifier.pkl")

    if os.path.exists(pickled_model) and not overwrite_pkl:
        clf = joblib.load(pickled_model)
        clf.set_params(verbose=True)
        print "loaded pickled model...", pickled_model

    else:
        if not os.path.exists(pkl_dir):  # output dir doesn't exist so make it
            os.makedirs(pkl_dir)
            print "made dir for pickled model:", pkl_dir

        cmin, cmax, cstep = -5, 17, 2
        cr = range(cmin, cmax, cstep)
        print (cr)
        # c_range = [ pow(2, y) for y in cr]
        # c_range =(0.005, 0.5, 5, 50, 500, 5000, 50000)
        c_range = (0.5, 50, 5000)
        print ("c_range", c_range)

        gmin, gmax, gstep = -15, 5, 2
        gr = range(gmin, gmax, gstep)
        print (gr)
        # gamma_range = [ pow(2, y) for y in gr ]
        # gamma_range = (0.00005, 0.0005, 0.005, 0.05, 0.5, 5.0, 50, 500)
        gamma_range = (0.0005, 0.05, 5.0, 500)

        print ("gamma_range", gamma_range)

        c_dist = scipy.stats.expon(scale=100)
        gamma_dist = scipy.stats.expon(scale=0.01)

        if use_lr:
            estr = LogisticRegression(class_weight="balanced")
            #             estr = LogisticRegression()
            param_dist = {"C": c_dist}
        else:
            estr = svm.SVC(kernel="rbf", cache_size=800, probability=True, class_weight="balanced")
            # estr = svm.LinearSVC(class_weight='balanced')
            param_dist = {"C": c_dist, "gamma": gamma_dist}

        # searcher = RandomizedSearchCV(estr, param_distributions=param_dist, n_iter=100, n_jobs=-1, cv=5, verbose=True ) #, scoring="recall")
        searcher = RandomizedSearchCV(
            estr, param_distributions=param_dist, n_iter=100, n_jobs=-1, verbose=True, scoring="recall"
        )
        searcher.fit(tr_samples, tr_classes)
        report(searcher.grid_scores_)
        clf = searcher.best_estimator_
        best_params = searcher.best_params_

        print "COMPARING CLF PARAMS WITH BEST PARAMS (shd be same)"
        print clf.get_params()
        print best_params

        joblib.dump(clf, pickled_model)

    print clf

    #     print "FITTING"
    #     clf.set_params(verbose=True)
    #     clf.fit(tr_samples, tr_classes)
    #     print clf

    # NOW TO TEST AGAINST HELD-OUT/TEST DATA
    te_samples = scaler.transform(te_samples)

    print "no test cases", len(te_samples)

    predictions = -1.0 * clf.predict_log_proba(
        te_samples
    )  # this is a list of pairs of probs in form [ [1-p, p],  ... ]
    # predictions = -1.0 * clf.decision_function(te_samples)
    print predictions
    predicted_classes = clf.predict(te_samples)

    print (
        "TEST: Number of mislabelled points out of a total %d points : %d"
        % (len(te_samples), (te_classes != predicted_classes).sum())
    )
    print (classification_report(te_classes, predicted_classes))

    rpt = open(report_fname, "w")
    rpt.write(classification_report(te_classes, predicted_classes))
    rpt.write("\n")
    rpt.close()
    print "wrote report file", rpt

    pred_file = open(out_file, "w")
    pred_file.write("labels 0 1\n")  # this emulates an earlier file format for compatibility
    for word, prob_tuple, guessed_class in zip(te_words, predictions, predicted_classes):
        pred_file.write("%d %f %f %s\n" % (guessed_class, prob_tuple[0], prob_tuple[1], word))

    pred_file.close()
    print "wrote predictions file:", pred_file
Example #3
import os

from mseg.common import LM_SYM_FILE, SYM_FILE, TRAIN_FILE_DEFAULT, \
    DIR, TEST_FILE_DEFAULT, read_file, save_symbol_table
from mseg.lm_utils import generate_normed_text_file


gen_ntxt = False
tr_fname, slm_dir = TRAIN_FILE_DEFAULT, "eval1n"
#tr_fname, slm_dir = "test1.csv", "test1"
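# the __main__ block below builds the shared (SYM_FILE) and LM-only (LM_SYM_FILE) symbol tables from the training and test CSVs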

if __name__ == '__main__':
        
    #TRAIN_FILE_DEFAULT = "smalltest_norm.csv"     
    slm_dir = raw_input("Type in LM dir or hit return to use default [%s]" % slm_dir) or slm_dir
    print "using ",slm_dir
    lmdir_global = os.path.join(DIR,slm_dir)
    
    tr_fname = raw_input("enter training file name: [%s]" % tr_fname) or tr_fname
    te_file = raw_input("enter test file name: [%s]" % TEST_FILE_DEFAULT) or TEST_FILE_DEFAULT
    
    tr_data = read_file(os.path.join(DIR, tr_fname), ',', skip_header=True)
    te_rows = read_file(os.path.join(DIR, te_file), ',', skip_header=True)
    rawtext_file = generate_normed_text_file(tr_data, lmdir_global)
    
    all_syms = set([r[5] for r in (tr_data + te_rows)])
    lm_syms = set([r[5] for r in tr_data])
    
    save_symbol_table(all_syms, lmdir_global, SYM_FILE)
    save_symbol_table(lm_syms, lmdir_global, LM_SYM_FILE)
Example #4
def main(args):
    ## tr_data training set
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "base_dir",
        nargs='?',
        default=os.path.join(os.getcwd(), "mseg_workspace"),
        help="this is the working directory, all sub dirs live under it")
    parser.add_argument(
        "pm_dir",
        nargs='?',
        default="pm_default",
        help="this is the directory in which to store the prosodic model file")
    parser.add_argument(
        "training_file",
        nargs='?',
        default=TRAIN_FILE_DEFAULT,
        help=
        "name of CSV file that contains correctly annotated training examples")
    parser.add_argument(
        "test_file",
        nargs='?',
        default=TEST_FILE_DEFAULT,
        help=
        "name of CSV file that contains mysterious cases that must be tested")
    parser.add_argument(
        "-lr",
        "--logistic_regression",
        default=False,
        action="store_true",
        help="use logistic regression classifier (default is RBF-SVM)")
    args = parser.parse_args()

    base_dir = args.base_dir
    pm_dir = args.pm_dir
    tr_file = args.training_file
    test_fname = args.test_file
    use_lr = args.logistic_regression

    #     if(len(args)==3):
    #         base_dir = args[0]
    #         pm_dir = args[1]
    #         tr_file = args[2]
    #         test_fname = args[3]
    #     else:
    #         base_dir = DIR
    #         pm_dir = "pm_default"
    #         tr_file = TRAIN_FILE_DEFAULT
    #         test_fname = TEST_FILE_DEFAULT
    # #         do_search = False
    # #         use_pilot = False
    n_samples = -1
    cache = 800

    #     pm_dir= raw_input("enter PM name: [%s]" % pm_dir) or pm_dir
    #     tr_file = raw_input("enter PM training file name: [%s]" % tr_file) or tr_file

    tr_data = read_file(os.path.join(base_dir, tr_file), ',', skip_header=True)

    #     test_fname = raw_input("enter file to test on: [%s]" % test_fname) or test_fname
    #     use_lr = bool(raw_input("use logistic regression [False]?")) or False

    if not use_lr:
        n_samples = 6000
        out_fname = test_fname + "-probabilities.dat"
        report_fname = test_fname + "-report.txt"
    else:
        out_fname = test_fname + "-probabilities.dat"
        report_fname = test_fname + "-report-LR.txt"

    out_file = os.path.join(base_dir, pm_dir, out_fname)
    report_fname = os.path.join(base_dir, pm_dir, report_fname)
    #clear extant predictions file
    if (os.path.exists(out_file)):
        os.remove(out_file)
        print "removed", out_file

    print base_dir + "/" + tr_file + " -SVM-> ", out_file

    test_data = read_file(os.path.join(base_dir, test_fname),
                          ',',
                          skip_header=True)

    #sel = [12,13,14,15,21,22,23,24]
    sel = range(7, 30)
    #sel = [8,21,29, 24,25,27]

    (_, _, tr_samples, tr_classes) = dissect(tr_data, sel)
    (_, te_words, te_samples, te_classes) = dissect(test_data, sel)

    if n_samples > 0:
        tr_samples, _, tr_classes, _ = train_test_split(tr_samples,
                                                        tr_classes,
                                                        train_size=n_samples,
                                                        stratify=tr_classes)

    p = sum(c == 1.0 for c in tr_classes)  # count the positive instances
    n = len(tr_classes) - p  # derive the negative instances
    print "n=", n, " p=", p
    wgt = float(n) / float(p)  # cast and divide
    print "wgt=", wgt
    #     classWeight = { 1: wgt }

    #tr_samples, te_samples, tr_classes, te_classes = train_test_split(samples, classes, test_size=0.20, random_state=0, stratify=classes)

    scaler = preprocessing.StandardScaler().fit(np.array(tr_samples))
    tr_samples = scaler.transform(tr_samples)

    clf = None
    best_params = None
    #override the defaults with the results of a grid search if desired (takes a while)

    #pickled = False
    pkl_dir = os.path.join(base_dir, pm_dir, "pkl")
    pickled_model = os.path.join(pkl_dir, "svm_classifier.pkl")

    if (os.path.exists(pickled_model) and not overwrite_pkl):
        clf = joblib.load(pickled_model)
        clf.set_params(verbose=True)
        print "loaded pickled model...", pickled_model

    else:
        if not os.path.exists(pkl_dir):  #output dir doesn't exist so make it
            os.makedirs(pkl_dir)
            print "made dir for pickled model:", pkl_dir

        cmin, cmax, cstep = -5, 17, 2
        cr = range(cmin, cmax, cstep)
        print(cr)
        #c_range = [ pow(2, y) for y in cr]
        #c_range =(0.005, 0.5, 5, 50, 500, 5000, 50000)
        c_range = (0.5, 50, 5000)
        print('c_range', c_range)

        gmin, gmax, gstep = -15, 5, 2
        gr = range(gmin, gmax, gstep)
        print(gr)
        #gamma_range = [ pow(2, y) for y in gr ]
        #gamma_range = (0.00005, 0.0005, 0.005, 0.05, 0.5, 5.0, 50, 500)
        gamma_range = (0.0005, 0.05, 5.0, 500)

        print('gamma_range', gamma_range)

        c_dist = scipy.stats.expon(scale=100)
        gamma_dist = scipy.stats.expon(scale=.01)

        if use_lr:
            estr = LogisticRegression(class_weight='balanced')
            #             estr = LogisticRegression()
            param_dist = {'C': c_dist}
        else:
            estr = svm.SVC(kernel='rbf',
                           cache_size=800,
                           probability=True,
                           class_weight='balanced')
            #estr = svm.LinearSVC(class_weight='balanced')
            param_dist = {'C': c_dist, 'gamma': gamma_dist}

        #searcher = RandomizedSearchCV(estr, param_distributions=param_dist, n_iter=100, n_jobs=-1, cv=5, verbose=True ) #, scoring="recall")
        searcher = RandomizedSearchCV(estr,
                                      param_distributions=param_dist,
                                      n_iter=100,
                                      n_jobs=-1,
                                      verbose=True,
                                      scoring="recall")
        searcher.fit(tr_samples, tr_classes)
        report(searcher.grid_scores_)
        clf = searcher.best_estimator_
        best_params = searcher.best_params_

        print "COMPARING CLF PARAMS WITH BEST PARAMS (shd be same)"
        print clf.get_params()
        print best_params

        joblib.dump(clf, pickled_model)

    print clf

    #     print "FITTING"
    #     clf.set_params(verbose=True)
    #     clf.fit(tr_samples, tr_classes)
    #     print clf

    #NOW TO TEST AGAINST HELD-OUT/TEST DATA
    te_samples = scaler.transform(te_samples)

    print "no test cases", len(te_samples)

    predictions = -1.0 * clf.predict_log_proba(
        te_samples
    )  #this is a list of pairs of probs in form [ [1-p, p],  ... ]
    #predictions = -1.0 * clf.decision_function(te_samples)
    print predictions
    predicted_classes = clf.predict(te_samples)

    print("TEST: Number of mislabelled points out of a total %d points : %d" %
          (len(te_samples), (te_classes != predicted_classes).sum()))
    print(classification_report(te_classes, predicted_classes))

    rpt = open(report_fname, "w")
    rpt.write(classification_report(te_classes, predicted_classes))
    rpt.write("\n")
    rpt.close()
    print "wrote report file", rpt

    pred_file = open(out_file, "w")
    pred_file.write("labels 0 1\n"
                    )  #this emulates an earlier file format for compatibility
    for word, prob_tuple, guessed_class in zip(te_words, predictions,
                                               predicted_classes):
        pred_file.write("%d %f %f %s\n" %
                        (guessed_class, prob_tuple[0], prob_tuple[1], word))

    pred_file.close()
    print "wrote predictions file:", pred_file
Example #5
def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument("config_file",
                        nargs='?',
                        default=os.path.join(os.getcwd(), "mseg_config.cfg"),
                        help="configuration file for the multistage segmenter")
    args = parser.parse_args()

    print "running main_run"
    config_fname = args.config_file
    with open(config_fname) as data_file:
        config = json.load(data_file)

    base_dir = config['base_dir']
    batches = config['batches']

    for batch in batches:
        if (not batch['run_batch']):
            continue

        print "RUNNING BATCH", batch

        batch_name = batch['batch_dir']
        batch_dir = os.path.join(base_dir, batch_name)
        lm_dir = os.path.join(base_dir, batch['language_model'])
        pm_dir = os.path.join(base_dir, batch['prosodic_model'])
        slm_dir = os.path.join(base_dir, batch['length_model'])
        te_file = batch['test_file']

        use_pos_tags = batch.get('use_pos_tags', False)
        pm_weight = batch.get('pm_weight', 1)

        gold_dir = os.path.join(batch_dir, "gold")
        all_models_dir = os.path.join(batch_dir, "pm_lm_slm")

        all_models_out_dir = os.path.join(all_models_dir, "output")
        all_models_in_dir = os.path.join(all_models_dir, "composed")
        all_models_shp_dir = os.path.join(all_models_dir, "shortest")

        pm_only_dir = os.path.join(batch_dir, "pm_only")
        pm_shp_dir = os.path.join(pm_only_dir, "shortest")
        pm_outs_dir = os.path.join(pm_only_dir, "output")

        pm_lm_dir = os.path.join(batch_dir, "pm_lm")
        pm_lm_in_dir = os.path.join(pm_lm_dir, "composed")
        pm_lm_shp_dir = os.path.join(pm_lm_dir, "shortest")
        pm_lm_outs_dir = os.path.join(pm_lm_dir, "output")

        pm_slm_dir = os.path.join(batch_dir, "pm_slm")
        pm_slm_in_dir = os.path.join(pm_slm_dir, "composed")
        pm_slm_shp_dir = os.path.join(pm_slm_dir, "shortest")
        pm_slm_outs_dir = os.path.join(pm_slm_dir, "output")

        if (do_build):
            #lmdir_global = os.path.join(base_dir,lm_dir)
            batch_input_fst_dir = os.path.join(batch_dir, "speech_fsts")
            if not os.path.exists(batch_input_fst_dir):
                os.makedirs(batch_input_fst_dir)

            te_rows = read_file(os.path.join(base_dir, te_file),
                                ',',
                                skip_header=True)

            create_gold_files.generate_gold_files(gold_dir, te_rows)

            #ONE: make speech_fsts from te_rows
            lmsym_fname = os.path.join(lm_dir, LM_SYM_FILE)
            lm_syms = load_symbol_table(lmsym_fname)

            te_syms = [r[SYMBOL_COL] for r in te_rows]

            all_syms = set(lm_syms + te_syms)
            pmsym_fname = os.path.join(batch_dir, SYM_FILE)
            save_symbol_table(all_syms, pmsym_fname)

            probability_file = os.path.join(pm_dir,
                                            (te_file + "-probabilities.dat"))
            if not os.path.exists(probability_file):
                print "No prosodic probability file found: ", probability_file, " - you need to create this first with train_pm.py"
                continue  #go onto the next batch TODO should create prob file here!

            prob_rows = read_file(probability_file, ' ', skip_header=True)

            if use_pos_tags:
                emission_vals = posify(te_rows)
            else:
                emission_vals = te_syms

            generate_pm_text_files(batch_input_fst_dir,
                                   lm_syms,
                                   te_rows,
                                   prob_rows,
                                   max_count=-1,
                                   emission_values=emission_vals,
                                   pm_weight=pm_weight)
            compile_pm_files(batch_input_fst_dir, pmsym_fname, lmsym_fname)

            #TWO: Assuming all the other model files are complete, we should be good to go

            #lang_mod = os.path.join(lm_dir,LM_PRUNED)
            lang_mod = os.path.join(lm_dir, "mod.pru")
            #             lang_mod = os.path.join(lm_dir,"lm.mod")

            print "joined up working dir names"

            find_shortest_paths.stringify_shortest_paths(
                batch_input_fst_dir, pm_shp_dir, pm_outs_dir)

            #now just use the pruned LM file without slen modifier
            process_inputs(batch_input_fst_dir, lang_mod, pm_lm_in_dir)
            find_shortest_paths.stringify_shortest_paths(
                pm_lm_in_dir, pm_lm_shp_dir,
                pm_lm_outs_dir)  #(input_dir, shortpath_dir, strings_dir)

            #use combined LM and slen modifier
            slm_file = os.path.join(slm_dir, "slm.fst")
            lm_slm = os.path.join(batch_dir, "lm_slm.fst")

            if compose_lm_slm:
                lm_utils.fstarcsort(slm_file, ilabel_sort=True)
                lm_utils.fstcompose(lang_mod, slm_file, lm_slm)
                lm_utils.fstimmut(lm_slm, lm_slm)

                process_inputs(batch_input_fst_dir, lm_slm, all_models_in_dir)

        print "doing find shortest paths..."
        find_shortest_paths.stringify_shortest_paths(all_models_in_dir,
                                                     all_models_shp_dir,
                                                     all_models_out_dir)

        R = convert_to_single_file("*.gld", gold_dir)
        C = convert_to_single_file("*.fst", all_models_out_dir)

        PM_C = convert_to_single_file("*.fst", pm_outs_dir)
        PMLM_C = convert_to_single_file("*.fst", pm_lm_outs_dir)

        cands = (("PM", PM_C), ("PM_LM", PMLM_C), ("PM_LM_SLM", C))

        evaluate_output.eval_segmenter_output(batch_dir)

        mc_report = evaluate_output.multi_col_report(batch_dir)

        mcrfile = open(os.path.join(batch_dir, "mc_report.csv"), "w")
        for r in mc_report:
            rec_id = r["rec_id"]
            words = r["words"]
            gold = r["gold"]
            pm = r["pm_only"]
            pm_lm = r["pm_lm"]
            pm_lm_slm = r["pm_lm_slm"]

            mcrfile.write("recording, word, gold, pm_only, pm_lm, pm_lm_slm\n")
            for row in zip(words, gold, pm, pm_lm, pm_lm_slm):
                s = ",".join(row)
                s = rec_id + "," + s + "\n"
                mcrfile.write(s)
        mcrfile.close()

        bfile = open(os.path.join(batch_dir, batch_name + "-SCORES.txt"), "w")
        bfile.write(batch_name + "\n\n")

        #report BLEU-like scores for 4- and 3-grammes in both strict and lax counting modes
        for o in (4, 3):
            for s in (True, False):
                write_bleus_to_file(R, cands, bfile, o, strict=s)

        #create a list of {0,1} values to show break or no break
        #golds=[r["gold"] for r in mc_report]
        golds = [int(item) for r in mc_report for item in r['gold']]
        for m in ("pm_only", "pm_lm", "pm_lm_slm"):
            hyps = [int(item) for r in mc_report for item in r[m]]
            prF = report_utils.get_prF(golds, hyps)
            b_acc = report_utils.get_baseline_accuracy(golds, hyps)
            acc = report_utils.get_accuracy(golds, hyps)
            bfile.write("prF (%s)=%s\n" % (m, str(prF)))
            bfile.write("acc (%s)=%s with delta=%s\n" %
                        (m, str(acc), str(acc - b_acc)))
            bfile.write("- - - - - -\n")
        bfile.close()

        print "Wrote bleu scores to file: ", bfile
from mseg.common import LM_SYM_FILE, SYM_FILE, TRAIN_FILE_DEFAULT, \
    DIR, TEST_FILE_DEFAULT, read_file, save_symbol_table
from mseg.lm_utils import generate_normed_text_file

gen_ntxt = False
tr_fname, slm_dir = TRAIN_FILE_DEFAULT, "eval1n"
#tr_fname, slm_dir = "test1.csv", "test1"

if __name__ == '__main__':

    #TRAIN_FILE_DEFAULT = "smalltest_norm.csv"
    slm_dir = raw_input("Type in LM dir or hit return to use default [%s]" %
                        slm_dir) or slm_dir
    print "using ", slm_dir
    lmdir_global = os.path.join(DIR, slm_dir)

    tr_fname = raw_input(
        "enter training file name: [%s]" % tr_fname) or tr_fname
    te_file = raw_input(
        "enter test file name: [%s]" % TEST_FILE_DEFAULT) or TEST_FILE_DEFAULT

    tr_data = read_file(os.path.join(DIR, tr_fname), ',', skip_header=True)
    te_rows = read_file(os.path.join(DIR, te_file), ',', skip_header=True)
    rawtext_file = generate_normed_text_file(tr_data, lmdir_global)

    all_syms = set([r[5] for r in (tr_data + te_rows)])
    lm_syms = set([r[5] for r in tr_data])

    save_symbol_table(all_syms, lmdir_global, SYM_FILE)
    save_symbol_table(lm_syms, lmdir_global, LM_SYM_FILE)
        w = r[5]  #[1:-1]
        b = r[6]
        if (next_transcript_id != transcript_id):
            transcript_id = next_transcript_id
            ofile.flush()
            ofile.close()
            #             print("wrote",ofile)
            ofile = open_wfile(odir, transcript_id)
        ofile.write(w + "\t" + b + "\n")


#         if(b):
#             ofile.write("<break>\n")
    ofile.flush()
    ofile.close()
    print("wrote {} gold files to {}".format(len(rows), odir))


def open_wfile(odir, transcript_id):
    ofilename = os.path.join(odir, transcript_id + ".gld")
    fhandle = codecs.open(ofilename, 'w')
    return fhandle


if __name__ == '__main__':
    lmdir = "eval1n"

    te_file = raw_input(
        "enter test file name: [%s]" % TEST_FILE_DEFAULT) or TEST_FILE_DEFAULT
    te_rows = read_file(os.path.join(DIR, te_file), ',', skip_header=True)
    generate_gold_files(te_rows)
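
# NOTE: the LM-building script below also omits its imports. It relies on
# argparse and os plus helpers such as read_file, generate_normed_text_file,
# compile_lm, ngramshrink, create_remap_table, create_slm_sym_file, remap_lm
# and generate_slm_from_txt, together with the BREAK, UNK, LM_PRUNED and
# TRAIN_FILE_DEFAULT constants; their exact module locations are not shown here.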
def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument("base_dir", nargs='?', default= os.path.join(os.getcwd(),"mseg_workspace"), help="this is the working directory, all sub dirs live under it")
    parser.add_argument("lm_dir", nargs='?', default="lm_default", help="this is the directory in which to store the Language Model (LM) files")
    parser.add_argument("training_file", nargs='?', default=TRAIN_FILE_DEFAULT, help="name of CSV file that contains correctly annotated training examples")
    parser.add_argument("-o", "--order", type=int, default=4, help="sets the n-gramme order of the LM (default=4)")
    args = parser.parse_args()
    
    base_dir = args.base_dir
#     pm_dir = "pm_default" currently not used
    lm_dir = args.lm_dir
    tr_file = args.training_file
    ngo = args.order
    
    #SECTION ONE: dedicated to creating the Language Model files    
    lmdir_global = os.path.join(base_dir, lm_dir)
#     if(os.path.exists(lmdir_global)):
#         shutil.rmtree(lmdir_global)
#     os.makedirs(lmdir_global)

    tr_rows = read_file(os.path.join(base_dir, tr_file), ',', skip_header=True)

    rawtext_file = generate_normed_text_file(tr_rows, lmdir_global)
    
    #lm_syms = set([r[5] for r in tr_rows])
    
    lm_txt = open(rawtext_file, "r").readlines()
    lm_syms= set( open(rawtext_file, "r").read().split() )
    if BREAK in lm_syms: lm_syms.remove(BREAK)
    if UNK in lm_syms: lm_syms.remove(UNK)
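    # lm_syms is now the LM vocabulary with the break and unknown markers stripped out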
    
    buildmod = "y"
    modfile = os.path.join(lmdir_global,"lm.mod")
    modpru = os.path.join(lmdir_global,LM_PRUNED)
    
    print "checking for LM file",modfile
    if(os.path.exists(modfile)):
        buildmod = raw_input("model file in "+lmdir_global+" already exists.  Overwrite? [n]").lower() or "n"
    
    if(not buildmod=="n"):
        modfile = compile_lm(rawtext_file, lmdir_global, lm_syms, ngo)
        
        print "Created unpruned lang model file:", modfile
        print "Now pruning LM..."
        ngramshrink(modfile, modpru)
        #print "Now minimising LM..."
        #lm_utils.fstmin(modpru,modpru)
        
        #we don't use the unpruned modfile again, so switch over to the pruned version here
        modfile = modpru
        
        remap_fname = os.path.join(lmdir_global,"lm_remap.dat")
        create_remap_table(lm_syms, remap_fname)
        
        
        osymfile = os.path.join(base_dir,"slm_sym.dat")
        create_slm_sym_file(osymfile)
        remap_lm(modfile, remap_fname, osymfile)
        print "remapped modfile output symbols"
                
    #create_converter(lmdir_global)
    #print "created converter."
    
#     slm_dir = raw_input("Type in SLM dir or hit return to match LM [%s]" % lm_dir) or lm_dir
#     slm_dir = os.path.join(base_dir,slm_dir)
#         
#     print "using ",slm_dir

    #put the SLM model into the same directory as its corresponding LM
    slm_dir = os.path.join(base_dir, lm_dir)

    generate_slm_from_txt(lm_txt, slm_dir, do_plot=True)
    #generate_slm(tr_rows, slm_dir, do_plot=False) # build the sentence length model, plot it so we can see it's sane
    print "slm generated from", tr_file, "in", slm_dir
    
    print "all constituent system files now compiled"