# Merge this fold's gold labels and predictions into the cross-validation
# accumulators so metrics/persistence below cover all folds at once.
merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

logger.info("Training completed")

""" Persist Results to Mongo DB """

wd_algo = "CRF_MOST_COMMON_TAG"
SUFFIX = "_CRF_MOST_COMMON_TAG"
SC_TAGGING_TD, SC_TAGGING_VD = "TEST_SC_TAGGING_TD" + SUFFIX, "TEST_SC_TAGGING_VD" + SUFFIX

parameters = dict(train_config)
# list(...) + __name__ (not bare map / func_name): under Python 3 map() is a
# lazy iterator and func_name no longer exists, so the persisted parameter
# value would otherwise be empty or raise. Matches the style used elsewhere
# in this file for the sentence-level experiments.
parameters["extractors"] = list(map(lambda fn: fn.__name__, extractors))
parameters["min_feat_freq"] = MIN_FEAT_FREQ
parameters.update(training_opt_copy)

wd_td_objectid = processor.persist_results(
    SC_TAGGING_TD, cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag, parameters, wd_algo)
wd_vd_objectid = processor.persist_results(
    SC_TAGGING_VD, cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag, parameters, wd_algo)

# This outputs 0's for MEAN CONCEPT CODES as we aren't including those in the outputs
print(processor.results_to_string(wd_td_objectid, SC_TAGGING_TD, wd_vd_objectid, SC_TAGGING_VD, "TAGGING"))
logger.info("Results Processed")
# Keep the trained per-tag classifier, then score it on both folds.
tag2word_classifier[tag] = tagger
td_wd_predictions_by_code[tag] = tagger.classify_many(td_feats)
vd_wd_predictions_by_code[tag] = tagger.classify_many(vd_feats)

td_metrics = toDict(compute_metrics(wd_td_ys_bytag, td_wd_predictions_by_code)[tag])
vd_metrics = toDict(compute_metrics(wd_vd_ys_bytag, vd_wd_predictions_by_code)[tag])
print("Fold:", i, "Tag:", tag)
print(processor.__metrics_to_str__(pad_str, tag, td_metrics, vd_metrics))

# Accumulate this fold's labels / predictions into the cross-validation totals.
merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)
pass

""" Persist Results to Mongo DB """

CB_TAGGING_TD, CB_TAGGING_VD = "CB_TAGGING_TD", "CB_TAGGING_VD"

parameters = dict(config)
#parameters["no_bias"] = True # better with
#parameters["AverageWeights"] = False # Bad - averaging really helps
# list(...) + __name__ (not bare map / func_name): map() is lazy in Python 3
# and func_name was removed, so the persisted value would otherwise be unusable.
parameters["extractors"] = list(map(lambda fn: fn.__name__, extractors))
parameters["min_feat_freq"] = MIN_FEAT_FREQ

wd_algo = "MaxEnt-Binary-NLTK"
wd_td_objectid = processor.persist_results(
    CB_TAGGING_TD, cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag, parameters, wd_algo)
wd_vd_objectid = processor.persist_results(
    CB_TAGGING_VD, cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag, parameters, wd_algo)

print(processor.results_to_string(wd_td_objectid, CB_TAGGING_TD, wd_vd_objectid, CB_TAGGING_VD, "TAGGING"))
""" WEIGHTED MEAN F1 CONCEPT CODES = 0.727. Better than WINDOW BASED """
# sent_algo = "Shift_Reduce_Parser_WTD_RF_25"
# sent_algo = "Shift_Reduce_Parser_WTD_GBT_3"

# Snapshot the full experimental configuration so the stored results are
# reproducible; key insertion order matches the original assignments.
parameters = dict(config)
parameters.update({
    "extractors":     [fn.__name__ for fn in extractors],
    "no_stacking":    True,
    "min_feat_freq":  MIN_FEAT_FREQ,
    "num_extractors": len(extractors),
    "cost_function":  cost_fn.__name__,
    "beta":           BETA,
    "max_epochs":     MAX_EPOCHS,
    "algorithm":      str(LogisticRegression()),
    "ngrams":         str(NGRAMS),
    "stemmed":        False,
})

# Persist the cross-validated sentence-level results for both folds.
sent_td_objectid = processor.persist_results(
    CB_SENT_TD, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, parameters, sent_algo)
sent_vd_objectid = processor.persist_results(
    CB_SENT_VD, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, parameters, sent_algo)

print(processor.results_to_string(sent_td_objectid, CB_SENT_TD, sent_vd_objectid, CB_SENT_VD, "SENTENCE"))

## TODO
# - Re-train tagging model, adding tags where reg tag is missing but is included in a causer or result tag.
# - Also include explicit in the predicted tags.
# - Need to handle relations where same code -> same code

# -TODO - Neat Ideas
# Inject a random action (unform distribution) with a specified probability during training also
# Ensures better exploration of the policy space. Initial algo predictions will be random but converges very quickly so this may be lost

# TODO * Need to make sure the tagger tags EXCPLICIT tags. These can then be skipped by the parser, but will be included in the features used to train the parser and taggger. Do we want to train a separate tagger that determines if a tagged word is a cause, explict or result. That will then resolve the direction of the relation?
# print results for each code logger.info("Training completed") """ Persist Results to Mongo DB """ wd_algo = str(fn_create_wd_cls()) sent_algo = str(fn_create_sent_cls()) SUFFIX = "_CAUSE_EFFECT_LBLS" SC_TAGGING_TD, SC_TAGGING_VD, SC_SENT_TD, SC_SENT_VD = "SC_TAGGING_TD" + SUFFIX, "SC_TAGGING_VD" + SUFFIX, "SC_SENT_TD" + SUFFIX, "SC_SENT_VD" + SUFFIX parameters = dict(config) parameters["extractors"] = map(lambda fn: fn.func_name, extractors) parameters["min_feat_freq"] = MIN_FEAT_FREQ wd_td_objectid = processor.persist_results(SC_TAGGING_TD, cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag, parameters, wd_algo) wd_vd_objectid = processor.persist_results(SC_TAGGING_VD, cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag, parameters, wd_algo) sent_td_objectid = processor.persist_results(SC_SENT_TD, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, parameters, sent_algo, tagger_id=wd_td_objectid) sent_vd_objectid = processor.persist_results(SC_SENT_VD, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, parameters, sent_algo, tagger_id=wd_vd_objectid) print processor.results_to_string(wd_td_objectid, SC_TAGGING_TD, wd_vd_objectid, SC_TAGGING_VD, "TAGGING") print processor.results_to_string(sent_td_objectid, SC_SENT_TD, sent_vd_objectid, SC_SENT_VD, "SENTENCE") logger.info("Results Processed") """ # PLAN # WORD LEVEL FEATURE EXTRACTION - use functions specific to the individual word, but that can look around at the # previous and next words and sentences if needed. This can handle every scenario where I want to leverage features # across sentences and at the essay level. # MEMOIZE SENTENCE LEVEL FEATS (e.g. deps) - Will need memoizing when extracting dependency parse features per sentence (as called once for every word in sentence) # WORD \ SENTENCE PARTITIONING FOR WORD AND SENTENCE LEVEL TAGGING
# sent_algo = "Shift_Reduce_Parser_WTD_GBT_3"

# Capture the experiment configuration to persist with the results.
# A (key, value) list keeps the dict insertion order identical to the
# original sequential assignments.
experiment_settings = [
    ("extractors", [fn.__name__ for fn in extractors]),
    ("no_stacking", True),
    ("min_feat_freq", MIN_FEAT_FREQ),
    ("num_extractors", len(extractors)),
    ("cost_function", cost_fn.__name__),
    ("beta", BETA),
    ("max_epochs", MAX_EPOCHS),
    ("algorithm", str(LogisticRegression())),
    ("ngrams", str(NGRAMS)),
    ("stemmed", False),
]
parameters = dict(config)
for setting_name, setting_value in experiment_settings:
    parameters[setting_name] = setting_value

# Store cross-validated sentence-level predictions for both folds in Mongo.
sent_td_objectid = processor.persist_results(
    CB_SENT_TD, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, parameters, sent_algo)
sent_vd_objectid = processor.persist_results(
    CB_SENT_VD, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, parameters, sent_algo)

print( processor.results_to_string(sent_td_objectid, CB_SENT_TD, sent_vd_objectid, CB_SENT_VD, "SENTENCE"))

## TODO
# - Re-train tagging model, adding tags where reg tag is missing but is included in a causer or result tag.
# - Also include explicit in the predicted tags.
# - Need to handle relations where same code -> same code

# -TODO - Neat Ideas