示例#1
0
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train an HMM tagger on one fold and return binary tags for evaluation.

    The tagger is trained on sentences produced by
    ``to_most_common_code_tagged_sentences``; the actual labels used for
    evaluation come from ``to_label_powerset_tagged_sentences``.

    Returns a 4-tuple ``(actual_td, actual_vd, predicted_td, predicted_vd)``
    where every element is the result of ``to_flattened_binary_tags_by_code``
    for ``regular_tags`` (td = training essays, vd = validation essays).
    """
    # Use `stem` as the word projection when STEM is set, identity otherwise.
    projection = stem if STEM else (lambda x: x)

    print("Fold %i Training code" % fold)

    # Code frequencies are tallied from the training essays only, so nothing
    # from the validation essays influences the training tags.
    code_freq = tally_code_frequencies(essays_TD)

    # Tagged sentences for training (td) and validation (vd), in that order.
    td_sents, vd_sents = [
        to_most_common_code_tagged_sentences(
            essays, regular_tags, code_freq, projection=projection)
        for essays in (essays_TD, essays_VD)
    ]

    model = HiddenMarkovModelTrainer().train_supervised(td_sents)

    td_predictions, vd_predictions = [
        model.tag_sents(to_sentences(sents)) for sents in (td_sents, vd_sents)
    ]

    # Actual labels are built from the label power-set tagging.
    td_sents_pset, vd_sents_pset = [
        to_label_powerset_tagged_sentences(essays, regular_tags)
        for essays in (essays_TD, essays_VD)
    ]

    # Flatten to binary tags per code: actual (td, vd), then predicted (td, vd).
    return tuple(
        to_flattened_binary_tags_by_code(tagged, regular_tags)
        for tagged in (td_sents_pset, vd_sents_pset,
                       td_predictions, vd_predictions)
    )
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train an HMM tagger on label power-set tags for a single fold.

    Both the training and the evaluation labels come from
    ``to_label_powerset_tagged_sentences``.

    Returns a 4-tuple ``(actual_td, actual_vd, predicted_td, predicted_vd)``
    where every element is the result of ``to_flattened_binary_tags_by_code``
    for ``regular_tags`` (td = training essays, vd = validation essays).
    """
    # Word projection: `stem` when STEM is set, otherwise the identity.
    if STEM:
        projection = stem
    else:
        projection = lambda x: x

    print("Fold %i Training code" % fold)

    train_tagged = to_label_powerset_tagged_sentences(
        essays_TD, regular_tags, projection=projection)
    valid_tagged = to_label_powerset_tagged_sentences(
        essays_VD, regular_tags, projection=projection)

    hmm = HiddenMarkovModelTrainer().train_supervised(train_tagged)

    train_predicted = hmm.tag_sents(to_sentences(train_tagged))
    valid_predicted = hmm.tag_sents(to_sentences(valid_tagged))

    def binary_tags(tagged_sents):
        # Flatten tagged sentences into binary tags per code.
        return to_flattened_binary_tags_by_code(tagged_sents, regular_tags)

    # Actual labels first, then the model's predictions.
    actual = binary_tags(train_tagged), binary_tags(valid_tagged)
    predicted = binary_tags(train_predicted), binary_tags(valid_predicted)
    return actual + predicted
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold, code_freq, training_opt):
    """Train a CRF tagger on most-common-code tags for a single fold.

    Args:
        essays_TD: training essays.
        essays_VD: validation essays.
        regular_tags: codes passed to the tagging and flattening helpers.
        fold: fold number; used in the progress message and model file name.
        code_freq: code frequencies handed to
            ``to_most_common_code_tagged_sentences``.
        training_opt: forwarded to ``CRFTagger`` as ``training_opt``.

    Returns:
        A 4-tuple ``(actual_td, actual_vd, predicted_td, predicted_vd)``, each
        the result of ``to_flattened_binary_tags_by_code``. Actual labels are
        built from ``to_label_powerset_tagged_sentences``.

    The model file written during training is removed before returning, and
    also when training or tagging raises.
    """
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    # Random suffix in the file name; presumably to avoid clashes between runs.
    model_filename = models_folder + "/" + "%i_%s__%s" % (fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False, training_opt=training_opt)
    try:
        model.train(td_sents, model_filename)

        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))

        # for evaluation - binary tags
        # YS (ACTUAL)
        td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
        vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

        wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
        wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

        # YS (PREDICTED)
        td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
        vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)
    finally:
        # Always clean up the model file, even on failure. The existence check
        # keeps a failed training run from masking its own exception.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Fit an HMM tagger on label power-set tags and score one fold.

    The same power-set tagged sentences serve as training data and as the
    actual labels for evaluation.

    Returns a 4-tuple ``(actual_td, actual_vd, predicted_td, predicted_vd)``
    where every element is the result of ``to_flattened_binary_tags_by_code``
    for ``regular_tags`` (td = training essays, vd = validation essays).
    """
    # `stem` is applied as the projection only when STEM is set.
    projection = stem if STEM else (lambda x: x)

    print("Fold %i Training code" % fold)

    # Tagged sentences for the training and validation essays, in that order.
    td_sents, vd_sents = [
        to_label_powerset_tagged_sentences(
            essays, regular_tags, projection=projection)
        for essays in (essays_TD, essays_VD)
    ]

    tagger = HiddenMarkovModelTrainer().train_supervised(td_sents)

    td_predictions, vd_predictions = [
        tagger.tag_sents(to_sentences(sents)) for sents in (td_sents, vd_sents)
    ]

    # Binary tags per code: actual (td, vd) followed by predicted (td, vd).
    return tuple(
        to_flattened_binary_tags_by_code(tagged, regular_tags)
        for tagged in (td_sents, vd_sents, td_predictions, vd_predictions)
    )
示例#5
0
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold,
                            training_opt):
    """Train a CRF tagger on label power-set tags for a single fold.

    Args:
        essays_TD: training essays.
        essays_VD: validation essays.
        regular_tags: codes passed to the tagging and flattening helpers.
        fold: fold number; used in the progress message and model file name.
        training_opt: forwarded to ``CRFTagger`` as ``training_opt``.

    Returns:
        A 4-tuple ``(actual_td, actual_vd, predicted_td, predicted_vd)``, each
        the result of ``to_flattened_binary_tags_by_code``.

    The model file written during training is removed before returning, and
    also when training or tagging raises.
    """
    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    # Random suffix in the file name; presumably to avoid clashes between runs.
    model_filename = models_folder + "/" + "%i_%s__%s" % (
        fold, "power_set", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor,
                      verbose=False,
                      training_opt=training_opt)
    try:
        model.train(td_sents, model_filename)

        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))

        # for evaluation - binary tags
        # YS (ACTUAL)
        wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents,
                                                          regular_tags)
        wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents,
                                                          regular_tags)

        # YS (PREDICTED)
        td_wd_predictions_by_code = to_flattened_binary_tags_by_code(
            td_predictions, regular_tags)
        vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(
            vd_predictions, regular_tags)
    finally:
        # Always clean up the model file, even on failure. The existence check
        # keeps a failed training run from masking its own exception.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
示例#6
0
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold,
                            window_size):
    """Train a fastText classifier on word windows for a single fold.

    Power-set tagged sentences are turned into tagged word windows of
    ``window_size``; the training windows are written to a text file that
    ``fasttext.supervised`` is trained on.

    Args:
        essays_TD: training essays.
        essays_VD: validation essays.
        regular_tags: codes passed to the tagging and flattening helpers.
        fold: fold number; used in the progress message and file names.
        window_size: forwarded to ``tagged_sents_to_word_windows``.

    Returns:
        A 4-tuple ``(actual_td, actual_vd, predicted_td, predicted_vd)``.
        Actual values come from ``to_flattened_binary_tags_by_code``,
        predicted values from ``predictions_to_ys_by_code``.
    """
    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    # To word windows
    td_tagged_windows = tagged_sents_to_word_windows(td_sents, window_size)
    vd_tagged_windows = tagged_sents_to_word_windows(vd_sents, window_size)

    # The two file names draw independent random suffixes.
    model_filename = "{folder}/model_{fold}_{random}".format(
        folder=models_folder, fold=fold, random=randint(0, 9999999))
    training_filename = "{folder}/training_{fold}_{random}.txt".format(
        folder=models_folder, fold=fold, random=randint(0, 9999999))

    tagged_windows_to_file(td_tagged_windows, training_filename)

    # TRAIN MODEL
    model = fasttext.supervised(training_filename, model_filename)

    # Each window is a (tokens, tag) pair; only the space-joined tokens are
    # sent to the model. A comprehension replaces the tuple-parameter lambda,
    # which is a syntax error on Python 3.
    td_predictions = model.predict(
        [" ".join(tokens) for tokens, _tag in td_tagged_windows])
    vd_predictions = model.predict(
        [" ".join(tokens) for tokens, _tag in vd_tagged_windows])

    # for evaluation - binary tags
    # YS (ACTUAL)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = predictions_to_ys_by_code(
        td_predictions, regular_tags)
    vd_wd_predictions_by_code = predictions_to_ys_by_code(
        vd_predictions, regular_tags)

    # NOTE(review): cleanup is disabled, so the model and training files stay
    # on disk after every fold - confirm whether that is intended.
    #os.remove(model_filename)
    #os.remove(training_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train a CRF tagger on most-common-code tags for a single fold.

    Code frequencies are tallied from the training essays only and used to
    build the training tags for both the training and validation essays.

    Args:
        essays_TD: training essays.
        essays_VD: validation essays.
        regular_tags: codes passed to the tagging and flattening helpers.
        fold: fold number; used in the progress message and model file name.

    Returns:
        A 4-tuple ``(actual_td, actual_vd, predicted_td, predicted_vd)``, each
        the result of ``to_flattened_binary_tags_by_code``. Actual labels are
        built from ``to_label_powerset_tagged_sentences``.

    The model file written during training is removed before returning, and
    also when training or tagging raises.
    """
    # Start Training
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data, so the
    # validation essays never influence the training tags.
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags,
                                                    code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags,
                                                    code_freq)

    # Random suffix in the file name; presumably to avoid clashes between runs.
    model_filename = models_folder + "/" + "%i_%s__%s" % (
        fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
    try:
        model.train(td_sents, model_filename)

        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))

        # for evaluation - binary tags
        # YS (ACTUAL)
        td_sents_pset = to_label_powerset_tagged_sentences(essays_TD,
                                                           regular_tags)
        vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD,
                                                           regular_tags)

        wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset,
                                                          regular_tags)
        wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset,
                                                          regular_tags)

        # YS (PREDICTED)
        td_wd_predictions_by_code = to_flattened_binary_tags_by_code(
            td_predictions, regular_tags)
        vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(
            vd_predictions, regular_tags)
    finally:
        # Always clean up the model file, even on failure. The existence check
        # keeps a failed training run from masking its own exception.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code