def cross_validation(use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size):
    """Run one RNN tagging configuration over every CV fold and return its micro F1.

    Each fold is evaluated via evaluate_fold (serially: n_jobs=1), the per-fold
    label/prediction dictionaries are merged, the merged results are persisted
    via `processor`, and the validation-set micro-averaged F1 is returned.
    """
    fold_results = Parallel(n_jobs=1)(
        delayed(evaluate_fold)(fold_ix, use_pretrained_embedding, bi_directional,
                               num_rnns, merge_mode, hidden_size)
        for fold_ix in range(len(fold2training_data)))

    # Accumulators for actual labels and predictions, keyed by tag code
    cv_wd_td_ys_by_tag = defaultdict(list)
    cv_wd_td_predictions_by_tag = defaultdict(list)
    cv_wd_vd_ys_by_tag = defaultdict(list)
    cv_wd_vd_predictions_by_tag = defaultdict(list)

    for td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag in fold_results:
        merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
        merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

    SUFFIX = "_RNN_MOST_COMMON_TAG"
    CB_TAGGING_TD = "TEST_CB_TAGGING_TD" + SUFFIX
    CB_TAGGING_VD = "TEST_CB_TAGGING_VD" + SUFFIX

    # Record the hyper-parameters of this run alongside the results
    parameters = dict(train_config)
    parameters.update({
        "extractors": [],
        "min_feat_freq": 0,
        "use_pretrained_embedding": use_pretrained_embedding,
        "bi-directional": bi_directional,
        "hidden_size": hidden_size,
        "merge_mode": merge_mode,
        "num_rnns": num_rnns,
    })

    wd_algo = "RNN"
    wd_td_objectid = processor.persist_results(
        CB_TAGGING_TD, cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag, parameters, wd_algo)
    wd_vd_objectid = processor.persist_results(
        CB_TAGGING_VD, cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag, parameters, wd_algo)

    # Validation micro F1 is the objective value reported to the caller
    avg_f1 = float(processor.get_metric(CB_TAGGING_VD, wd_vd_objectid, __MICRO_F1__)["f1_score"])
    return avg_f1
def evaluate_model_essay_level(
        folds: List[Tuple[Any, Any]],
        extractor_fn_names_lst: List[str],
        all_extractor_fns: List[Any],
        beta: float,
        ngrams: int,
        stemmed: bool,
        max_epochs: int,
        min_feat_freq: int,
        cr_tags: Set[str],
        base_learner_fact: Any,
        down_sample_rate=1.0,
        model: Union[SearnModelTemplateFeatures,
                     SearnModelEssayParser,
                     SearnModelEssayParserBreadthFirst] = None) -> Tuple[Any, Any, Any, Any, Any]:
    """Train and evaluate an essay-level SEARN parser on each CV fold, serially.

    Args:
        folds: list of (training essays, validation essays) pairs.
        extractor_fn_names_lst: names of the feature extractor functions to use.
        all_extractor_fns: the full pool of available extractor functions.
        beta: SEARN cost interpolation parameter.
        ngrams: n-gram size for the ngram extractor.
        stemmed: whether tokens are stemmed.
        max_epochs: maximum training epochs per fold.
        min_feat_freq: minimum feature frequency threshold.
        cr_tags: set of causal-relation tags to predict.
        base_learner_fact: factory for the underlying classifier.
        down_sample_rate: if < 1.0, keep only this fraction of each fold's essays.
        model: parser model class to train; defaults to SearnModelEssayParser.

    Returns:
        A 5-tuple: (per-fold parser models,
                    merged training ys by tag, merged training predictions by tag,
                    merged validation ys by tag, merged validation predictions by tag).
    """
    # FIX: return annotation was Tuple[Any] (a 1-tuple) but five values are returned.
    if not model:
        model = SearnModelEssayParser

    if down_sample_rate < 1.0:
        # Shrink every fold to the requested fraction (taken from the front)
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
            essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
            essays_VD = essays_VD[:int(down_sample_rate * len(essays_VD))]
            new_folds.append((essays_TD, essays_VD))
        folds = new_folds  # type: List[Tuple[Any, Any]]

    # NOTE(review): cr_tags is passed twice below (9th and 11th positional args),
    # exactly as in the original call — confirm against train_sr_parser's signature.
    serial_results = [
        train_sr_parser(essays_TD, essays_VD, extractor_fn_names_lst, all_extractor_fns,
                        ngrams, stemmed, beta, max_epochs, cr_tags, min_feat_freq,
                        cr_tags, base_learner_fact, model)
        for essays_TD, essays_VD in folds
    ]

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []
    parser_models = []
    for (model, num_feats,
         sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode) in serial_results:
        number_of_feats.append(num_feats)
        parser_models.append(model)
        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)

    return (parser_models,
            cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag,
            cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag)
def cross_validation(use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size):
    """Evaluate one RNN hyper-parameter configuration across CV_FOLDS folds.

    Runs evaluate_fold for each fold (serially: n_jobs=1), merges the fold
    label/prediction dictionaries, persists the merged results via `processor`,
    and returns the validation micro F1 used as the tuning objective.
    """
    fold_results = Parallel(n_jobs=1)(
        delayed(evaluate_fold)(fold_ix, use_pretrained_embedding, bi_directional,
                               num_rnns, merge_mode, hidden_size)
        for fold_ix in range(CV_FOLDS))

    # Accumulators for actual labels and predictions, keyed by tag code
    cv_wd_td_ys_by_tag = defaultdict(list)
    cv_wd_td_predictions_by_tag = defaultdict(list)
    cv_wd_vd_ys_by_tag = defaultdict(list)
    cv_wd_vd_predictions_by_tag = defaultdict(list)

    for td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag in fold_results:
        merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
        merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

    SUFFIX = "_RNN_MOST_COMMON_TAG_HYPER_PARAM_TUNING"
    SC_TAGGING_TD = "SC_TAGGING_TD" + SUFFIX
    SC_TAGGING_VD = "SC_TAGGING_VD" + SUFFIX

    # Record the hyper-parameters of this run alongside the results
    parameters = dict(config)
    parameters.update({
        "extractors": [],
        "min_feat_freq": 0,
        "use_pretrained_embedding": use_pretrained_embedding,
        "bi-directional": bi_directional,
        "hidden_size": hidden_size,
        "merge_mode": merge_mode,
        "num_rnns": num_rnns,
    })

    wd_algo = "RNN"
    wd_td_objectid = processor.persist_results(
        SC_TAGGING_TD, cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag, parameters, wd_algo)
    wd_vd_objectid = processor.persist_results(
        SC_TAGGING_VD, cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag, parameters, wd_algo)

    # Validation micro F1 is the objective value reported to the caller
    avg_f1 = float(processor.get_metric(SC_TAGGING_VD, wd_vd_objectid, __MICRO_F1__)["f1_score"])
    return avg_f1
def evaluate_features(folds: List[Tuple[Any, Any]],
                      extractor_names: Set[str],
                      cost_function_name: str,
                      beta: float = 0.3,
                      base_learner: Any = LogisticRegression,
                      ngrams: int = 2,
                      down_sample_rate=1.0) -> float:
    """Cross-validate a shift-reduce parser feature set; return the validation micro F1.

    Trains/predicts one model per fold in parallel, merges the per-fold label and
    prediction dictionaries, persists the merged results, and returns the
    micro-averaged F1 score on the validation data.

    Args:
        folds: list of (training essays, validation essays) pairs.
        extractor_names: names of the feature extractors to evaluate.
        cost_function_name: name of the SEARN cost function.
        beta: cost interpolation parameter.
        base_learner: classifier factory, recorded in the experiment parameters.
        ngrams: n-gram size.
        down_sample_rate: if < 1.0, keep only this fraction of each fold and
            persist to temporary ("__tmp_"-prefixed) collections.
    """
    if down_sample_rate < 1.0:
        # Shrink every fold to the requested fraction (taken from the front)
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
            essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
            essays_VD = essays_VD[:int(down_sample_rate * len(essays_VD))]
            new_folds.append((essays_TD, essays_VD))
        folds = new_folds  # type: List[Tuple[Any, Any]]

    # Parallel is almost 5X faster than serial training here
    parallel_results = Parallel(n_jobs=len(folds))(
        delayed(model_train_predict)(essays_TD, essays_VD, extractor_names,
                                     cost_function_name, ngrams, beta)
        for essays_TD, essays_VD in folds)

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []
    for (num_feats, sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode) in parallel_results:
        number_of_feats.append(num_feats)
        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)

    # Mongo settings recording
    avg_feats = np.mean(number_of_feats)
    sent_algo = "Shift_Reduce_Parser_LR"

    parameters = dict(config)
    parameters["extractors"] = list(sorted(extractor_names))
    parameters["num_extractors"] = len(extractor_names)
    parameters["cost_function"] = cost_function_name
    parameters["beta"] = beta
    parameters["no_stacking"] = True
    parameters["algorithm"] = str(base_learner())
    parameters["ngrams"] = str(ngrams)
    parameters["num_feats_MEAN"] = avg_feats
    parameters["num_feats_per_fold"] = number_of_feats
    parameters["min_feat_freq"] = MIN_FEAT_FREQ
    parameters["stemmed"] = False

    logger.info("\t\tMean num feats: {avg_feats:.2f}".format(avg_feats=avg_feats))

    TD = "CR_CB_SHIFT_REDUCE_PARSER_MULITNOMIAL_FEATURE_SEL_TD"
    VD = "CR_CB_SHIFT_REDUCE_PARSER_MULITNOMIAL_FEATURE_SEL_VD"
    if down_sample_rate < 1.0:
        logger.info("\t\tDown sampling at rate: {rate:.5f}, storing temp results".format(rate=down_sample_rate))
        parameters["down_sample"] = down_sample_rate
        # BUG FIX: CB_SENT_VD was previously "__tmp_" + TD, so temp validation
        # results were persisted under the training collection name.
        CB_SENT_TD, CB_SENT_VD = "__tmp_" + TD, "__tmp_" + VD
    else:
        CB_SENT_TD, CB_SENT_VD = TD, VD

    sent_td_objectid = processor.persist_results(
        CB_SENT_TD, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, parameters, sent_algo)
    sent_vd_objectid = processor.persist_results(
        CB_SENT_VD, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, parameters, sent_algo)

    micro_f1 = float(processor.get_metric(CB_SENT_VD, sent_vd_objectid, __MICRO_F1__)["f1_score"])
    return micro_f1
"c2": c2 } training_opt_copy = dict([(k.replace(".", "_"), v) for k, v in training_opt.items()]) results = Parallel(n_jobs=len(folds))( delayed(train_classifer_on_fold)(essays_TD, essays_VD, regular_tags, fold, code_freq, training_opt) for fold, (essays_TD, essays_VD) in enumerate(folds)) for result in results: wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code = result merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag) merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag) merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag) merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag) logger.info("Training completed") """ Persist Results to Mongo DB """ wd_algo = "CRF_MOST_COMMON_TAG" SUFFIX = "_CRF_MOST_COMMON_TAG" SC_TAGGING_TD, SC_TAGGING_VD = "TEST_SC_TAGGING_TD" + SUFFIX, "TEST_SC_TAGGING_VD" + SUFFIX parameters = dict(train_config) parameters["extractors"] = map(lambda fn: fn.func_name, extractors) parameters["min_feat_freq"] = MIN_FEAT_FREQ
def evaluate_model(
        collection_prefix: str,
        folds: List[Tuple[Any, Any]],
        extractor_fn_names_lst: List[str],
        cost_function_name: str,
        beta: float,
        ngrams: int,
        stemmed: bool,
        max_epochs: int,
        down_sample_rate=1.0) -> float:
    """Cross-validate the shift-reduce parser; return the validation micro F1.

    Trains/predicts one model per fold in parallel, merges the per-fold label
    and prediction dictionaries, persists the merged results under
    `collection_prefix`-derived collection names, and returns the
    micro-averaged F1 on the validation data.

    Args:
        collection_prefix: prefix for the Mongo collection names ("_TD"/"_VD" appended).
        folds: list of (training essays, validation essays) pairs.
        extractor_fn_names_lst: names of the feature extractor functions.
        cost_function_name: name of the SEARN cost function.
        beta: cost interpolation parameter.
        ngrams: n-gram size.
        stemmed: whether tokens are stemmed.
        max_epochs: maximum training epochs per fold.
        down_sample_rate: if < 1.0, keep only this fraction of each fold and
            persist to temporary ("__tmp_"-prefixed) collections.
    """
    if down_sample_rate < 1.0:
        # Shrink every fold to the requested fraction (taken from the front)
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
            essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
            essays_VD = essays_VD[:int(down_sample_rate * len(essays_VD))]
            new_folds.append((essays_TD, essays_VD))
        folds = new_folds  # type: List[Tuple[Any, Any]]

    # Parallel is almost 5X faster than serial training here
    parallel_results = Parallel(n_jobs=len(folds))(
        delayed(model_train_predict)(essays_TD, essays_VD, extractor_fn_names_lst,
                                     cost_function_name, ngrams, stemmed, beta, max_epochs)
        for essays_TD, essays_VD in folds)

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []
    for (num_feats, sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode) in parallel_results:
        number_of_feats.append(num_feats)
        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)

    # Mongo settings recording
    avg_feats = np.mean(number_of_feats)
    sent_algo = "Shift_Reduce_Parser_LR"

    parameters = dict(config)
    parameters["extractors"] = list(extractor_fn_names_lst)
    parameters["num_extractors"] = len(extractor_fn_names_lst)
    parameters["cost_function"] = cost_function_name
    parameters["beta"] = beta
    parameters["max_epochs"] = max_epochs
    parameters["no_stacking"] = True
    parameters["algorithm"] = str(BASE_LEARNER_FACT())
    parameters["ngrams"] = str(ngrams)
    parameters["num_feats_MEAN"] = avg_feats
    parameters["num_feats_per_fold"] = number_of_feats
    parameters["min_feat_freq"] = MIN_FEAT_FREQ
    parameters["stemmed"] = stemmed

    logger.info("\t\tMean num feats: {avg_feats:.2f}".format(avg_feats=avg_feats))

    TD = collection_prefix + "_TD"
    VD = collection_prefix + "_VD"
    if down_sample_rate < 1.0:
        logger.info("\t\tDown sampling at rate: {rate:.5f}, storing temp results".format(rate=down_sample_rate))
        parameters["down_sample"] = down_sample_rate
        # BUG FIX: CB_SENT_VD was previously "__tmp_" + TD, so temp validation
        # results were persisted under the training collection name.
        CB_SENT_TD, CB_SENT_VD = "__tmp_" + TD, "__tmp_" + VD
    else:
        CB_SENT_TD, CB_SENT_VD = TD, VD

    sent_td_objectid = results_processor.persist_results(
        CB_SENT_TD, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, parameters, sent_algo)
    sent_vd_objectid = results_processor.persist_results(
        CB_SENT_VD, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, parameters, sent_algo)

    micro_f1 = float(results_processor.get_metric(CB_SENT_VD, sent_vd_objectid, __MICRO_F1__)["f1_score"])
    return micro_f1
def evaluate_model(collection_prefix: str,
                   folds: List[Tuple[Any, Any]],
                   extractor_fn_names_lst: List[str],
                   cost_function_name: str,
                   beta: float,
                   ngrams: int,
                   stemmed: bool,
                   max_epochs: int,
                   down_sample_rate=1.0) -> float:
    """Cross-validate the shift-reduce parser; return the validation micro F1.

    Trains/predicts one model per fold in parallel, merges the per-fold label
    and prediction dictionaries, persists the merged results under
    `collection_prefix`-derived collection names, and returns the
    micro-averaged F1 on the validation data.

    Args:
        collection_prefix: prefix for the Mongo collection names ("_TD"/"_VD" appended).
        folds: list of (training essays, validation essays) pairs.
        extractor_fn_names_lst: names of the feature extractor functions.
        cost_function_name: name of the SEARN cost function.
        beta: cost interpolation parameter.
        ngrams: n-gram size.
        stemmed: whether tokens are stemmed.
        max_epochs: maximum training epochs per fold.
        down_sample_rate: if < 1.0, keep only this fraction of each fold and
            persist to temporary ("__tmp_"-prefixed) collections.
    """
    if down_sample_rate < 1.0:
        # Shrink every fold to the requested fraction (taken from the front)
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
            essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
            essays_VD = essays_VD[:int(down_sample_rate * len(essays_VD))]
            new_folds.append((essays_TD, essays_VD))
        folds = new_folds  # type: List[Tuple[Any, Any]]

    # Parallel is almost 5X faster than serial training here
    parallel_results = Parallel(n_jobs=len(folds))(
        delayed(model_train_predict)(essays_TD, essays_VD, extractor_fn_names_lst,
                                     cost_function_name, ngrams, stemmed, beta, max_epochs)
        for essays_TD, essays_VD in folds)

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []
    for (num_feats, sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode) in parallel_results:
        number_of_feats.append(num_feats)
        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)

    # Mongo settings recording
    avg_feats = np.mean(number_of_feats)
    sent_algo = "Shift_Reduce_Parser_LR"

    parameters = dict(config)
    parameters["extractors"] = list(extractor_fn_names_lst)
    parameters["num_extractors"] = len(extractor_fn_names_lst)
    parameters["cost_function"] = cost_function_name
    parameters["beta"] = beta
    parameters["max_epochs"] = max_epochs
    parameters["no_stacking"] = True
    parameters["algorithm"] = str(BASE_LEARNER_FACT())
    parameters["ngrams"] = str(ngrams)
    parameters["num_feats_MEAN"] = avg_feats
    parameters["num_feats_per_fold"] = number_of_feats
    parameters["min_feat_freq"] = MIN_FEAT_FREQ
    parameters["stemmed"] = stemmed

    logger.info("\t\tMean num feats: {avg_feats:.2f}".format(avg_feats=avg_feats))

    TD = collection_prefix + "_TD"
    VD = collection_prefix + "_VD"
    if down_sample_rate < 1.0:
        logger.info("\t\tDown sampling at rate: {rate:.5f}, storing temp results".format(rate=down_sample_rate))
        parameters["down_sample"] = down_sample_rate
        # BUG FIX: CB_SENT_VD was previously "__tmp_" + TD, so temp validation
        # results were persisted under the training collection name.
        CB_SENT_TD, CB_SENT_VD = "__tmp_" + TD, "__tmp_" + VD
    else:
        CB_SENT_TD, CB_SENT_VD = TD, VD

    sent_td_objectid = results_processor.persist_results(
        CB_SENT_TD, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, parameters, sent_algo)
    sent_vd_objectid = results_processor.persist_results(
        CB_SENT_VD, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, parameters, sent_algo)

    micro_f1 = float(results_processor.get_metric(CB_SENT_VD, sent_vd_objectid, __MICRO_F1__)["f1_score"])
    return micro_f1
""" FEATURE EXTRACTION """ cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list) cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list) folds = [(train_tagged_essays, test_tagged_essays)] results = Parallel(n_jobs=len(folds))( delayed(train_classifer_on_fold)(essays_TD, essays_VD, regular_tags, fold) for fold, (essays_TD, essays_VD) in enumerate(folds)) for result in results: wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code = result merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag) merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag) merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag) merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag) logger.info("Training completed") """ Persist Results to Mongo DB """ wd_algo = "HMM_MOST_COMMON_TAG_MULTICLASS" SUFFIX = "_HMM_MOST_COMMON_TAG_MULTICLASS" CB_TAGGING_TD, CB_TAGGING_VD= "TEST_CB_TAGGING_TD" + SUFFIX, "TEST_CB_TAGGING_VD" + SUFFIX parameters = dict(train_config) parameters["min_feat_freq"] = MIN_FEAT_FREQ if STEM: parameters["extractors"] = "stemmed_unigrams"
cost_function=cost_fn, min_feature_freq=MIN_FEAT_FREQ, ngram_extractor=ngram_extractor, cr_tags=cr_tags, base_learner_fact=LogisticRegression, beta=BETA ) parse_model.train(essays_TD, MAX_EPOCHS) sent_td_ys_bycode = parse_model.get_label_data(essays_TD) sent_vd_ys_bycode = parse_model.get_label_data(essays_VD) sent_td_pred_ys_bycode = parse_model.predict(essays_TD) sent_vd_pred_ys_bycode = parse_model.predict(essays_VD) merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag) merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag) merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag) merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag) # break # CB_SENT_TD, CB_SENT_VD = "CR_CB_SHIFT_REDUCE_PARSER_TD" , "CR_CB_SHIFT_REDUCE_PARSER_VD" CB_SENT_TD, CB_SENT_VD = "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_TD", "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_VD" # sent_algo = "Shift_Reduce_Parser" sent_algo = "Shift_Reduce_Parser_LR" # sent_algo = "Shift_Reduce_Parser_XGB_10" # sent_algo = "Shift_Reduce_Parser_CLA_LR" # sent_algo = "Shift_Reduce_Parser_WTD_LR" # sent_algo = "Shift_Reduce_Parser_WTD_RF" # sent_algo = "Shift_Reduce_Parser_WTD_RF_25" # sent_algo = "Shift_Reduce_Parser_WTD_GBT_3"
def evaluate_features(folds: List[Tuple[Any, Any]],
                      extractor_names: Set[str],
                      cost_function_name: str,
                      beta: float = 0.3,
                      base_learner: Any = LogisticRegression,
                      ngrams: int = 2,
                      down_sample_rate=1.0) -> float:
    """Cross-validate a shift-reduce parser feature set; return the validation micro F1.

    Trains/predicts one model per fold in parallel, merges the per-fold label and
    prediction dictionaries, persists the merged results, and returns the
    micro-averaged F1 score on the validation data.

    Args:
        folds: list of (training essays, validation essays) pairs.
        extractor_names: names of the feature extractors to evaluate.
        cost_function_name: name of the SEARN cost function.
        beta: cost interpolation parameter.
        base_learner: classifier factory, recorded in the experiment parameters.
        ngrams: n-gram size.
        down_sample_rate: if < 1.0, keep only this fraction of each fold and
            persist to temporary ("__tmp_"-prefixed) collections.
    """
    if down_sample_rate < 1.0:
        # Shrink every fold to the requested fraction (taken from the front)
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
            essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
            essays_VD = essays_VD[:int(down_sample_rate * len(essays_VD))]
            new_folds.append((essays_TD, essays_VD))
        folds = new_folds  # type: List[Tuple[Any, Any]]

    # Parallel is almost 5X faster than serial training here
    parallel_results = Parallel(n_jobs=len(folds))(
        delayed(model_train_predict)(essays_TD, essays_VD, extractor_names,
                                     cost_function_name, ngrams, beta)
        for essays_TD, essays_VD in folds)

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []
    for (num_feats, sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode) in parallel_results:
        number_of_feats.append(num_feats)
        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)

    # Mongo settings recording
    avg_feats = np.mean(number_of_feats)
    sent_algo = "Shift_Reduce_Parser_LR"

    parameters = dict(config)
    parameters["extractors"] = list(sorted(extractor_names))
    parameters["num_extractors"] = len(extractor_names)
    parameters["cost_function"] = cost_function_name
    parameters["beta"] = beta
    parameters["no_stacking"] = True
    parameters["algorithm"] = str(base_learner())
    parameters["ngrams"] = str(ngrams)
    parameters["num_feats_MEAN"] = avg_feats
    parameters["num_feats_per_fold"] = number_of_feats
    parameters["min_feat_freq"] = MIN_FEAT_FREQ
    parameters["stemmed"] = False

    logger.info("\t\tMean num feats: {avg_feats:.2f}".format(avg_feats=avg_feats))

    TD = "CR_CB_SHIFT_REDUCE_PARSER_REGRESSION_SEL_TD"
    VD = "CR_CB_SHIFT_REDUCE_PARSER_REGRESSION_SEL_VD"
    if down_sample_rate < 1.0:
        logger.info("\t\tDown sampling at rate: {rate:.5f}, storing temp results".format(rate=down_sample_rate))
        parameters["down_sample"] = down_sample_rate
        # BUG FIX: CB_SENT_VD was previously "__tmp_" + TD, so temp validation
        # results were persisted under the training collection name.
        CB_SENT_TD, CB_SENT_VD = "__tmp_" + TD, "__tmp_" + VD
    else:
        CB_SENT_TD, CB_SENT_VD = TD, VD

    sent_td_objectid = processor.persist_results(
        CB_SENT_TD, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, parameters, sent_algo)
    sent_vd_objectid = processor.persist_results(
        CB_SENT_VD, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag, parameters, sent_algo)

    micro_f1 = float(processor.get_metric(CB_SENT_VD, sent_vd_objectid, __MICRO_F1__)["f1_score"])
    return micro_f1
feature_extractor=template_feature_extractor, cost_function=cost_fn, min_feature_freq=MIN_FEAT_FREQ, ngram_extractor=ngram_extractor, cr_tags=cr_tags, base_learner_fact=LogisticRegression, beta=BETA) parse_model.train(essays_TD, MAX_EPOCHS) sent_td_ys_bycode = parse_model.get_label_data(essays_TD) sent_vd_ys_bycode = parse_model.get_label_data(essays_VD) sent_td_pred_ys_bycode = parse_model.predict(essays_TD) sent_vd_pred_ys_bycode = parse_model.predict(essays_VD) merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag) merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag) merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag) merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag) # break # CB_SENT_TD, CB_SENT_VD = "CR_CB_SHIFT_REDUCE_PARSER_TD" , "CR_CB_SHIFT_REDUCE_PARSER_VD" CB_SENT_TD, CB_SENT_VD = "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_TD", "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_VD" # sent_algo = "Shift_Reduce_Parser" sent_algo = "Shift_Reduce_Parser_LR" # sent_algo = "Shift_Reduce_Parser_XGB_10" # sent_algo = "Shift_Reduce_Parser_CLA_LR" # sent_algo = "Shift_Reduce_Parser_WTD_LR" # sent_algo = "Shift_Reduce_Parser_WTD_RF" # sent_algo = "Shift_Reduce_Parser_WTD_RF_25" # sent_algo = "Shift_Reduce_Parser_WTD_GBT_3"