def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train a supervised HMM tagger on one fold and collect binary labels.

    The sentences used for training and prediction are produced by
    ``to_most_common_code_tagged_sentences``; the code frequencies it needs
    are tallied from the training essays only. The actual labels used for
    evaluation come from the label-powerset tagging of the same essays.

    Args:
        essays_TD: training essays for this fold.
        essays_VD: validation essays for this fold.
        regular_tags: codes handed to the tagging and flattening helpers.
        fold: fold number; only used in the progress message.

    Returns:
        A 4-tuple ``(td_actual, vd_actual, td_predicted, vd_predicted)``
        where every element is the result of
        ``to_flattened_binary_tags_by_code``.
    """
    # Words go through ``stem`` only when the module-level STEM switch is
    # on; otherwise they are passed through unchanged.
    projection = stem if STEM else (lambda x: x)

    print("Fold %i Training code" % fold)

    # Frequencies are computed from the training essays alone so that no
    # information from the validation essays leaks into training.
    train_code_freq = tally_code_frequencies(essays_TD)

    train_tagged = to_most_common_code_tagged_sentences(
        essays_TD, regular_tags, train_code_freq, projection=projection)
    valid_tagged = to_most_common_code_tagged_sentences(
        essays_VD, regular_tags, train_code_freq, projection=projection)

    hmm = HiddenMarkovModelTrainer().train_supervised(train_tagged)
    train_predicted = hmm.tag_sents(to_sentences(train_tagged))
    valid_predicted = hmm.tag_sents(to_sentences(valid_tagged))

    # Actual labels: taken from the powerset tagging, not from the
    # single-code tagging the model was trained on.
    train_powerset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    valid_powerset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)
    actual_train = to_flattened_binary_tags_by_code(train_powerset, regular_tags)
    actual_valid = to_flattened_binary_tags_by_code(valid_powerset, regular_tags)

    # Predicted labels.
    predicted_train = to_flattened_binary_tags_by_code(train_predicted, regular_tags)
    predicted_valid = to_flattened_binary_tags_by_code(valid_predicted, regular_tags)

    return actual_train, actual_valid, predicted_train, predicted_valid
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train a supervised HMM tagger on label-powerset tags for one fold.

    The same powerset-tagged sentences serve both as training input and as
    the source of the actual labels used for evaluation.

    Args:
        essays_TD: training essays for this fold.
        essays_VD: validation essays for this fold.
        regular_tags: codes handed to the tagging and flattening helpers.
        fold: fold number; only used in the progress message.

    Returns:
        A 4-tuple ``(td_actual, vd_actual, td_predicted, vd_predicted)``
        where every element is the result of
        ``to_flattened_binary_tags_by_code``.
    """
    # Words go through ``stem`` only when the module-level STEM switch is
    # on; otherwise they are passed through unchanged.
    projection = stem if STEM else (lambda x: x)

    print("Fold %i Training code" % fold)

    train_tagged = to_label_powerset_tagged_sentences(
        essays_TD, regular_tags, projection=projection)
    valid_tagged = to_label_powerset_tagged_sentences(
        essays_VD, regular_tags, projection=projection)

    hmm = HiddenMarkovModelTrainer().train_supervised(train_tagged)
    train_predicted = hmm.tag_sents(to_sentences(train_tagged))
    valid_predicted = hmm.tag_sents(to_sentences(valid_tagged))

    # Actual labels, flattened to binary tags per code.
    actual_train = to_flattened_binary_tags_by_code(train_tagged, regular_tags)
    actual_valid = to_flattened_binary_tags_by_code(valid_tagged, regular_tags)

    # Predicted labels, flattened the same way.
    predicted_train = to_flattened_binary_tags_by_code(train_predicted, regular_tags)
    predicted_valid = to_flattened_binary_tags_by_code(valid_predicted, regular_tags)

    return actual_train, actual_valid, predicted_train, predicted_valid
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold, code_freq, training_opt):
    """Train a CRF tagger on most-common-code tags for one fold.

    The model is written to a randomly named file under ``models_folder``
    while training; that file is removed again before returning, including
    when training or tagging raises.

    Args:
        essays_TD: training essays for this fold.
        essays_VD: validation essays for this fold.
        regular_tags: codes handed to the tagging and flattening helpers.
        fold: fold number; used in the progress message and the model
            file name.
        code_freq: code frequencies forwarded to
            ``to_most_common_code_tagged_sentences``.
        training_opt: forwarded to ``CRFTagger`` as ``training_opt``.

    Returns:
        A 4-tuple ``(td_actual, vd_actual, td_predicted, vd_predicted)``
        where every element is the result of
        ``to_flattened_binary_tags_by_code``.
    """
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    model_filename = models_folder + "/" + "%i_%s__%s" % (
        fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False,
                      training_opt=training_opt)
    try:
        model.train(td_sents, model_filename)

        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))
    finally:
        # Always clean up the temporary model file, even when training or
        # tagging fails. The existence check keeps a failure that happened
        # before the file was written from being masked by a second error.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    # for evaluation - binary tags
    # YS (ACTUAL): taken from the powerset tagging, not from the single-code
    # tagging the model was trained on.
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold, training_opt):
    """Train a CRF tagger on label-powerset tags for one fold.

    The same powerset-tagged sentences serve both as training input and as
    the source of the actual labels used for evaluation. The model is
    written to a randomly named file under ``models_folder`` while training;
    that file is removed again before returning, including when training or
    tagging raises.

    Args:
        essays_TD: training essays for this fold.
        essays_VD: validation essays for this fold.
        regular_tags: codes handed to the tagging and flattening helpers.
        fold: fold number; used in the progress message and the model
            file name.
        training_opt: forwarded to ``CRFTagger`` as ``training_opt``.

    Returns:
        A 4-tuple ``(td_actual, vd_actual, td_predicted, vd_predicted)``
        where every element is the result of
        ``to_flattened_binary_tags_by_code``.
    """
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    model_filename = models_folder + "/" + "%i_%s__%s" % (
        fold, "power_set", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False,
                      training_opt=training_opt)
    try:
        model.train(td_sents, model_filename)

        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))
    finally:
        # Always clean up the temporary model file, even when training or
        # tagging fails. The existence check keeps a failure that happened
        # before the file was written from being masked by a second error.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    # for evaluation - binary tags
    # YS (ACTUAL)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold, window_size):
    """Train a fastText classifier on word windows for one fold.

    Essays are tagged with label-powerset tags, turned into tagged word
    windows, and the training windows are written to a text file that
    ``fasttext.supervised`` trains from. Each window is then classified from
    its space-joined tokens.

    Args:
        essays_TD: training essays for this fold.
        essays_VD: validation essays for this fold.
        regular_tags: codes handed to the tagging and conversion helpers.
        fold: fold number; used in the progress message and file names.
        window_size: forwarded to ``tagged_sents_to_word_windows``.

    Returns:
        A 4-tuple ``(td_actual, vd_actual, td_predicted, vd_predicted)``.
        The actual labels come from ``to_flattened_binary_tags_by_code`` and
        the predicted ones from ``predictions_to_ys_by_code``.
    """
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    # To word windows
    td_tagged_windows = tagged_sents_to_word_windows(td_sents, window_size)
    vd_tagged_windows = tagged_sents_to_word_windows(vd_sents, window_size)

    model_filename = "{folder}/model_{fold}_{random}".format(
        folder=models_folder, fold=fold, random=randint(0, 9999999))
    training_filename = "{folder}/training_{fold}_{random}.txt".format(
        folder=models_folder, fold=fold, random=randint(0, 9999999))

    tagged_windows_to_file(td_tagged_windows, training_filename)

    # TRAIN MODEL
    model = fasttext.supervised(training_filename, model_filename)

    # Each window is a (tokens, tag) pair; only the tokens are fed to the
    # model. A list comprehension replaces the tuple-parameter lambda, which
    # is a syntax error on Python 3, and still yields a list on Python 2.
    td_predictions = model.predict(
        [" ".join(tokens) for tokens, _tag in td_tagged_windows])
    vd_predictions = model.predict(
        [" ".join(tokens) for tokens, _tag in vd_tagged_windows])

    # for evaluation - binary tags
    # YS (ACTUAL)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = predictions_to_ys_by_code(
        td_predictions, regular_tags)
    vd_wd_predictions_by_code = predictions_to_ys_by_code(
        vd_predictions, regular_tags)

    # NOTE(review): the model and training files are left on disk. The
    # original os.remove calls for both were commented out - confirm whether
    # keeping the files is intentional before re-enabling cleanup.
    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train a CRF tagger on most-common-code tags for one fold.

    Code frequencies are tallied from the training essays only. The model is
    written to a randomly named file under ``models_folder`` while training;
    that file is removed again before returning, including when training or
    tagging raises.

    Args:
        essays_TD: training essays for this fold.
        essays_VD: validation essays for this fold.
        regular_tags: codes handed to the tagging and flattening helpers.
        fold: fold number; used in the progress message and the model
            file name.

    Returns:
        A 4-tuple ``(td_actual, vd_actual, td_predicted, vd_predicted)``
        where every element is the result of
        ``to_flattened_binary_tags_by_code``.
    """
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    model_filename = models_folder + "/" + "%i_%s__%s" % (
        fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
    try:
        model.train(td_sents, model_filename)

        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))
    finally:
        # Always clean up the temporary model file, even when training or
        # tagging fails. The existence check keeps a failure that happened
        # before the file was written from being masked by a second error.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    # for evaluation - binary tags
    # YS (ACTUAL): taken from the powerset tagging, not from the single-code
    # tagging the model was trained on.
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code