Exemplo n.º 1
0
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train a supervised HMM tagger on one fold and collect binary tags.

    Args:
        essays_TD: training essays for this fold.
        essays_VD: validation essays for this fold.
        regular_tags: the codes to tag for and to evaluate against.
        fold: fold number, only used in the progress message.

    Returns:
        A 4-tuple ``(train_actual, valid_actual, train_predicted,
        valid_predicted)``, each being whatever
        ``to_flattened_binary_tags_by_code`` returns for the given
        sentences and ``regular_tags``.
    """
    # Tokens go through the stemmer when STEM is set, otherwise they are
    # passed through unchanged.
    projection = stem if STEM else (lambda x: x)

    # Start Training
    print("Fold %i Training code" % fold)

    # Code frequencies are tallied from the training essays only, so nothing
    # about the validation essays leaks into how sentences get labelled.
    code_freq = tally_code_frequencies(essays_TD)

    # Tagged sentences used for fitting (training) and for scoring (both).
    train_tagged, valid_tagged = [
        to_most_common_code_tagged_sentences(
            essays, regular_tags, code_freq, projection=projection)
        for essays in (essays_TD, essays_VD)
    ]

    model = HiddenMarkovModelTrainer().train_supervised(train_tagged)

    # Strip the gold tags off and let the model re-tag both splits.
    train_predicted = model.tag_sents(to_sentences(train_tagged))
    valid_predicted = model.tag_sents(to_sentences(valid_tagged))

    # Actual labels: label-powerset sentences flattened to binary tags.
    train_pset, valid_pset = [
        to_label_powerset_tagged_sentences(essays, regular_tags)
        for essays in (essays_TD, essays_VD)
    ]
    train_actual_by_code = to_flattened_binary_tags_by_code(
        train_pset, regular_tags)
    valid_actual_by_code = to_flattened_binary_tags_by_code(
        valid_pset, regular_tags)

    # Predicted labels, flattened the same way.
    train_predicted_by_code = to_flattened_binary_tags_by_code(
        train_predicted, regular_tags)
    valid_predicted_by_code = to_flattened_binary_tags_by_code(
        valid_predicted, regular_tags)

    return (train_actual_by_code, valid_actual_by_code,
            train_predicted_by_code, valid_predicted_by_code)
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train a CRF tagger on one fold and collect actual/predicted binary tags.

    The CRF model is written to a temporary, randomly named file inside
    ``models_folder``. That file is removed before returning, including when
    training, tagging or flattening raises.

    Args:
        essays_TD: training essays for this fold.
        essays_VD: validation essays for this fold.
        regular_tags: the codes to tag for and to evaluate against.
        fold: fold number; used in the progress message and the model
            file name.

    Returns:
        A 4-tuple ``(wd_td_ys_bytag, wd_vd_ys_bytag,
        td_wd_predictions_by_code, vd_wd_predictions_by_code)``: actual
        training / validation tags followed by predicted training /
        validation tags, each as produced by
        ``to_flattened_binary_tags_by_code``.
    """
    # Start Training
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    # The random suffix keeps concurrent / repeated runs from sharing a file.
    model_filename = os.path.join(
        models_folder,
        "%i_%s__%s" % (fold, "most_freq_code", str(randint(0, 9999999))))

    try:
        model = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
        model.train(td_sents, model_filename)

        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))

        # for evaluation - binary tags
        # YS (ACTUAL)
        td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
        vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

        wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
        wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

        # YS (PREDICTED)
        td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
        vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)
    finally:
        # Always clean up the temporary model file. The existence check keeps
        # a failure that happened before the file was written from being
        # masked by a FileNotFoundError raised here.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Fit a CRF tagger on a fold and return actual vs. predicted binary tags.

    A model file with a random suffix is created under ``models_folder`` for
    the duration of the call and deleted afterwards, whether or not an
    exception is raised along the way.

    Args:
        essays_TD: training essays for this fold.
        essays_VD: validation essays for this fold.
        regular_tags: the codes to tag for and to evaluate against.
        fold: fold number; appears in the progress message and in the
            model file name.

    Returns:
        ``(wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code,
        vd_wd_predictions_by_code)`` - actual tags for the training and
        validation essays, then the predicted tags for each, all in the form
        returned by ``to_flattened_binary_tags_by_code``.
    """
    # Start Training
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags,
                                                    code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags,
                                                    code_freq)

    # Random suffix so separate runs do not write to the same model file.
    model_basename = "%i_%s__%s" % (
        fold, "most_freq_code", str(randint(0, 9999999)))
    model_filename = os.path.join(models_folder, model_basename)

    try:
        model = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
        model.train(td_sents, model_filename)

        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))

        # for evaluation - binary tags
        # YS (ACTUAL)
        td_sents_pset = to_label_powerset_tagged_sentences(essays_TD,
                                                           regular_tags)
        vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD,
                                                           regular_tags)

        wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset,
                                                          regular_tags)
        wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset,
                                                          regular_tags)

        # YS (PREDICTED)
        td_wd_predictions_by_code = to_flattened_binary_tags_by_code(
            td_predictions, regular_tags)
        vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(
            vd_predictions, regular_tags)
    finally:
        # Remove the temporary model even on failure. Skip the removal when
        # the file was never written so the original error is not hidden.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
                                                 ngram_size=1,
                                                 positional=False,
                                                 stem_words=False)

# Feature extractors that are combined into the single composite extractor
# below. Entries that are commented out are currently disabled.
extractors = [
    unigram_bow_window,
    unigram_window_stemmed,
    biigram_window_stemmed,
    #trigram_window_stemmed,
    extract_brown_cluster,
    #extract_dependency_relation
]

# One callable built from all of the extractors listed above.
comp_feat_extactor = fact_composite_feature_extractor(extractors)

# Code frequencies are tallied from the training essays only.
code_freq = tally_code_frequencies(train_tagged_essays)
# A single "fold": the full training set paired with the test set.
folds = [(train_tagged_essays, test_tagged_essays)]

# Hyper-parameter grid. Each list currently holds a single value, so the
# nested loops run exactly once.
for feat_poss_state in [False]:
    for feat_poss_transitions in [True]:
        for c2 in [1.0]:  # different to CB
            # Per-tag accumulators for actual labels and predictions on the
            # training data (td) ...
            cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(
                list), defaultdict(list)
            # ... and on the validation data (vd).
            cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(
                list), defaultdict(list)

            # Training options for this grid point.
            # NOTE(review): the keys look like CRFsuite trainer parameter
            # names - confirm against the code that consumes this dict.
            training_opt = {
                "feature.possible_states": feat_poss_state,
                "feature.possible_transitions": feat_poss_transitions,
                "c2": c2
            }
# Unigram (ngram_size=1) extractor that is neither positional nor stemmed;
# `offset` is passed straight through to the factory.
unigram_bow_window = fact_extract_ngram_features(offset=offset, ngram_size=1, positional=False, stem_words=False)

# Feature extractors that are combined into the single composite extractor
# below. Entries that are commented out are currently disabled.
extractors = [
    unigram_bow_window,

    unigram_window_stemmed,
    biigram_window_stemmed,
    #trigram_window_stemmed,

    extract_brown_cluster,
    #extract_dependency_relation
]

# One callable built from all of the extractors listed above.
comp_feat_extactor = fact_composite_feature_extractor(extractors)

# Code frequencies are tallied from the training essays only.
code_freq = tally_code_frequencies(train_tagged_essays)
# A single "fold": the full training set paired with the test set.
folds = [(train_tagged_essays, test_tagged_essays)]

# Hyper-parameter grid. Each list currently holds a single value, so the
# nested loops run exactly once.
for feat_poss_state in [False]:
    for feat_poss_transitions in [True]:
        for c2 in [1.0]: # different to CB
            # Per-tag accumulators for actual labels and predictions on the
            # training data (td) and on the validation data (vd).
            cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
            cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

            # Training options for this grid point.
            # NOTE(review): the keys look like CRFsuite trainer parameter
            # names - confirm against the code that consumes this dict.
            training_opt = {
                "feature.possible_states": feat_poss_state,
                "feature.possible_transitions": feat_poss_transitions,
                "c2": c2,
            }

            # Same options with "." replaced by "_" in every key.
            training_opt_copy = {k.replace(".", "_"): v for k, v in training_opt.items()}