def cross_validation(use_pretrained_embedding, bi_directional, num_rnns, merge_mode, hidden_size):
    results = Parallel(n_jobs=1)(
        delayed(evaluate_fold)(i, use_pretrained_embedding, bi_directional,
                               num_rnns, merge_mode, hidden_size)
        for i in range(len(fold2training_data)))

    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)
    for result in results:
        td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag = result
        merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
        merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

    SUFFIX = "_RNN_MOST_COMMON_TAG"
    CB_TAGGING_TD, CB_TAGGING_VD = "TEST_CB_TAGGING_TD" + SUFFIX, "TEST_CB_TAGGING_VD" + SUFFIX
    parameters = dict(train_config)
    parameters["extractors"] = []
    parameters["min_feat_freq"] = 0

    parameters["use_pretrained_embedding"] = use_pretrained_embedding
    parameters["bi-directional"] = bi_directional
    parameters["hidden_size"] = hidden_size
    parameters["merge_mode"] = merge_mode
    parameters["num_rnns"] = num_rnns

    wd_algo = "RNN"
    wd_td_objectid = processor.persist_results(CB_TAGGING_TD, cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag,
                                               parameters, wd_algo)
    wd_vd_objectid = processor.persist_results(CB_TAGGING_VD, cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag,
                                               parameters, wd_algo)
    avg_f1 = float(processor.get_metric(CB_TAGGING_VD, wd_vd_objectid, __MICRO_F1__)["f1_score"])
    return avg_f1
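# `merge_dictionaries` is imported from elsewhere in this codebase and is not
# shown in this snippet. A minimal sketch consistent with how it is called
# above (extending the accumulator's per-tag lists with each fold's values,
# where the accumulator is a defaultdict(list)):
def merge_dictionaries(source, target):
    """Append every per-tag value list in `source` onto the matching list in `target`."""
    for tag, values in source.items():
        target[tag].extend(values)
    return target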
def evaluate_model_essay_level(
        folds: List[Tuple[Any, Any]],
        extractor_fn_names_lst: List[str],
        all_extractor_fns: List[Any],
        beta: float,
        ngrams: int,
        stemmed: bool,
        max_epochs: int,
        min_feat_freq: int,
        cr_tags: Set[str],
        base_learner_fact: Any,
        down_sample_rate: float = 1.0,
        model: Union[SearnModelTemplateFeatures, SearnModelEssayParser,
                     SearnModelEssayParserBreadthFirst] = None) -> Tuple[Any]:

    if not model:
        # fall back to the essay-level parser class when none is supplied
        model = SearnModelEssayParser

    if down_sample_rate < 1.0:
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
            essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
            essays_VD = essays_VD[:int(down_sample_rate * len(essays_VD))]
            new_folds.append((essays_TD, essays_VD))
        folds = new_folds  # type: List[Tuple[Any, Any]]

    serial_results = [
        train_sr_parser(essays_TD, essays_VD, extractor_fn_names_lst, all_extractor_fns, ngrams, stemmed, beta,
                        max_epochs, cr_tags, min_feat_freq, cr_tags, base_learner_fact, model)
        for essays_TD, essays_VD in folds
    ]

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []

    # NOTE: the folds above are trained serially here; a joblib.Parallel
    # variant (sketched after this function) is almost 5X faster.
    parser_models = []
    for (fold_model, num_feats,
         sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode) in serial_results:
        number_of_feats.append(num_feats)

        parser_models.append(fold_model)
        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)

    # print(processor.results_to_string(sent_td_objectid, CB_SENT_TD, sent_vd_objectid, CB_SENT_VD, "SENTENCE"))
    return parser_models, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag
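# The fold loop in evaluate_model_essay_level above runs serially. A joblib
# variant mirroring the other functions in this file - a sketch only, and it
# assumes train_sr_parser and the trained SEARN models pickle cleanly across
# worker processes (returning models from workers is often why such code
# stays serial). The helper name is hypothetical:
def train_folds_in_parallel(folds, *train_args):
    return Parallel(n_jobs=len(folds))(
        delayed(train_sr_parser)(essays_TD, essays_VD, *train_args)
        for essays_TD, essays_VD in folds)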
def cross_validation(use_pretrained_embedding, bi_directional, num_rnns,
                     merge_mode, hidden_size):
    results = Parallel(n_jobs=1)(
        delayed(evaluate_fold)(i, use_pretrained_embedding, bi_directional,
                               num_rnns, merge_mode, hidden_size)
        for i in range(CV_FOLDS))

    cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)
    for result in results:
        td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag = result
        merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
        merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
        merge_dictionaries(td_wd_predictions_by_code,
                           cv_wd_td_predictions_by_tag)
        merge_dictionaries(vd_wd_predictions_by_code,
                           cv_wd_vd_predictions_by_tag)

    SUFFIX = "_RNN_MOST_COMMON_TAG_HYPER_PARAM_TUNING"
    SC_TAGGING_TD, SC_TAGGING_VD = "SC_TAGGING_TD" + SUFFIX, "SC_TAGGING_VD" + SUFFIX
    parameters = dict(config)
    parameters["extractors"] = []
    parameters["min_feat_freq"] = 0

    parameters["use_pretrained_embedding"] = use_pretrained_embedding
    parameters["bi-directional"] = bi_directional
    parameters["hidden_size"] = hidden_size
    parameters["merge_mode"] = merge_mode
    parameters["num_rnns"] = num_rnns

    wd_algo = "RNN"
    wd_td_objectid = processor.persist_results(SC_TAGGING_TD,
                                               cv_wd_td_ys_by_tag,
                                               cv_wd_td_predictions_by_tag,
                                               parameters, wd_algo)
    wd_vd_objectid = processor.persist_results(SC_TAGGING_VD,
                                               cv_wd_vd_ys_by_tag,
                                               cv_wd_vd_predictions_by_tag,
                                               parameters, wd_algo)
    avg_f1 = float(
        processor.get_metric(SC_TAGGING_VD, wd_vd_objectid,
                             __MICRO_F1__)["f1_score"])
    return avg_f1
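# Hypothetical driver loop for the hyper-parameter tuning variant above. The
# grid values are illustrative only; "sum" and "concat" are standard Keras
# Bidirectional merge modes, not values confirmed by this code.
best_f1, best_config = -1.0, None
for hidden_size in [64, 128, 256]:
    for merge_mode in ["sum", "concat"]:
        f1 = cross_validation(use_pretrained_embedding=True,
                              bi_directional=True,
                              num_rnns=1,
                              merge_mode=merge_mode,
                              hidden_size=hidden_size)
        if f1 > best_f1:
            best_f1, best_config = f1, (hidden_size, merge_mode)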
def evaluate_features(folds: List[Tuple[Any, Any]],
                      extractor_names: Set[str],
                      cost_function_name: str,
                      beta: float = 0.3,
                      base_learner: Any = LogisticRegression,
                      ngrams: int = 2,
                      down_sample_rate=1.0) -> float:
    if down_sample_rate < 1.0:
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
            essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
            essays_VD = essays_VD[:int(down_sample_rate * len(essays_VD))]
            new_folds.append((essays_TD, essays_VD))
        folds = new_folds  # type: List[Tuple[Any, Any]]

    parallel_results = Parallel(n_jobs=len(folds))(
        delayed(model_train_predict)(essays_TD, essays_VD, extractor_names,
                                     cost_function_name, ngrams, beta)
        for essays_TD, essays_VD in folds)

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []

    # Parallel is almost 5X faster!!!
    for (num_feats, sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode) in parallel_results:
        number_of_feats.append(num_feats)

        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode,
                           cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode,
                           cv_sent_vd_predictions_by_tag)
        # break

    # Mongo settings recording
    avg_feats = np.mean(number_of_feats)
    sent_algo = "Shift_Reduce_Parser_LR"

    parameters = dict(config)
    parameters["extractors"] = list(sorted(extractor_names))
    parameters["num_extractors"] = len(extractor_names)
    parameters["cost_function"] = cost_function_name
    parameters["beta"] = beta
    parameters["no_stacking"] = True
    parameters["algorithm"] = str(base_learner())
    parameters["ngrams"] = str(ngrams)
    parameters["num_feats_MEAN"] = avg_feats
    parameters["num_feats_per_fold"] = number_of_feats
    parameters["min_feat_freq"] = MIN_FEAT_FREQ
    parameters["stemmed"] = False

    logger.info(
        "\t\tMean num feats: {avg_feats:.2f}".format(avg_feats=avg_feats))

    TD = "CR_CB_SHIFT_REDUCE_PARSER_MULITNOMIAL_FEATURE_SEL_TD"
    VD = "CR_CB_SHIFT_REDUCE_PARSER_MULITNOMIAL_FEATURE_SEL_VD"
    if down_sample_rate < 1.0:
        logger.info(
            "\t\tDown sampling at rate: {rate:.5f}, storing temp results".
            format(rate=down_sample_rate))
        parameters["down_sample"] = down_sample_rate
        CB_SENT_TD, CB_SENT_VD = "__tmp_" + TD, "__tmp_" + TD
    else:
        CB_SENT_TD, CB_SENT_VD = TD, VD

    sent_td_objectid = processor.persist_results(
        CB_SENT_TD, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag,
        parameters, sent_algo)
    sent_vd_objectid = processor.persist_results(
        CB_SENT_VD, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag,
        parameters, sent_algo)

    # print(processor.results_to_string(sent_td_objectid, CB_SENT_TD, sent_vd_objectid, CB_SENT_VD, "SENTENCE"))
    micro_f1 = float(
        processor.get_metric(CB_SENT_VD, sent_vd_objectid,
                             __MICRO_F1__)["f1_score"])
    return micro_f1
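# evaluate_features persists to *_FEATURE_SEL collections, which suggests it
# is scored inside a feature-selection loop. A hypothetical greedy
# forward-selection driver (the function and argument names below are
# assumptions, not from the original code):
def greedy_forward_selection(folds, all_extractor_names, cost_function_name):
    selected, remaining, best_f1 = set(), set(all_extractor_names), -1.0
    while remaining:
        # score each candidate extractor added to the current selection
        scored = [(evaluate_features(folds, selected | {name}, cost_function_name), name)
                  for name in remaining]
        f1, name = max(scored)
        if f1 <= best_f1:
            break  # no remaining extractor improves the validation micro-F1
        selected.add(name)
        remaining.remove(name)
        best_f1 = f1
    return selected, best_f1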
                "c2": c2
            }

            # MongoDB field names cannot contain '.', so keep a sanitised copy
            # of the training options for persisting alongside the results
            training_opt_copy = {k.replace(".", "_"): v
                                 for k, v in training_opt.items()}

            results = Parallel(n_jobs=len(folds))(
                delayed(train_classifer_on_fold)(essays_TD, essays_VD,
                                                 regular_tags, fold, code_freq,
                                                 training_opt)
                for fold, (essays_TD, essays_VD) in enumerate(folds))

            for result in results:
                wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code = result

                merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
                merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
                merge_dictionaries(td_wd_predictions_by_code,
                                   cv_wd_td_predictions_by_tag)
                merge_dictionaries(vd_wd_predictions_by_code,
                                   cv_wd_vd_predictions_by_tag)

            logger.info("Training completed")
            """ Persist Results to Mongo DB """
            wd_algo = "CRF_MOST_COMMON_TAG"
            SUFFIX = "_CRF_MOST_COMMON_TAG"
            SC_TAGGING_TD, SC_TAGGING_VD = "TEST_SC_TAGGING_TD" + SUFFIX, "TEST_SC_TAGGING_VD" + SUFFIX

            parameters = dict(train_config)
            parameters["extractors"] = map(lambda fn: fn.func_name, extractors)
            parameters["min_feat_freq"] = MIN_FEAT_FREQ
def evaluate_model(
        collection_prefix: str,
        folds: List[Tuple[Any, Any]],
        extractor_fn_names_lst: List[str],
        cost_function_name: str,
        beta: float,
        ngrams: int,
        stemmed: bool,
        max_epochs: int,
        down_sample_rate=1.0) -> float:

    if down_sample_rate < 1.0:
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
            essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
            essays_VD = essays_VD[:int(down_sample_rate * len(essays_VD))]
            new_folds.append((essays_TD, essays_VD))
        folds = new_folds  # type: List[Tuple[Any, Any]]

    #logger.info("\tModei={model}".format(model=str(BASE_LEARNER_FACT())))

    parallel_results = Parallel(n_jobs=len(folds))(
        delayed(model_train_predict)(essays_TD, essays_VD, extractor_fn_names_lst, cost_function_name, ngrams, stemmed,
                                     beta, max_epochs)
        for essays_TD, essays_VD in folds)

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []

    # Parallel is almost 5X faster!!!
    for (num_feats,
         sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode) in parallel_results:
        number_of_feats.append(num_feats)

        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)
        # break

    # Mongo settings recording
    avg_feats = np.mean(number_of_feats)
    sent_algo = "Shift_Reduce_Parser_LR"

    parameters = dict(config)
    parameters["extractors"] = list(extractor_fn_names_lst)
    parameters["num_extractors"] = len(extractor_fn_names_lst)
    parameters["cost_function"] = cost_function_name
    parameters["beta"] = beta
    parameters["max_epochs"] = max_epochs
    parameters["no_stacking"] = True
    parameters["algorithm"] = str(BASE_LEARNER_FACT())
    parameters["ngrams"] = str(ngrams)
    parameters["num_feats_MEAN"] = avg_feats
    parameters["num_feats_per_fold"] = number_of_feats
    parameters["min_feat_freq"] = MIN_FEAT_FREQ
    parameters["stemmed"] = stemmed

    logger.info("\t\tMean num feats: {avg_feats:.2f}".format(avg_feats=avg_feats))

    TD = collection_prefix + "_TD"
    VD = collection_prefix + "_VD"
    if down_sample_rate < 1.0:
        logger.info("\t\tDown sampling at rate: {rate:.5f}, storing temp results".format(rate=down_sample_rate))
        parameters["down_sample"] = down_sample_rate
        CB_SENT_TD, CB_SENT_VD = "__tmp_" + TD, "__tmp_" + TD
    else:
        CB_SENT_TD, CB_SENT_VD = TD, VD

    sent_td_objectid = results_processor.persist_results(CB_SENT_TD, cv_sent_td_ys_by_tag,
                                                         cv_sent_td_predictions_by_tag, parameters, sent_algo)
    sent_vd_objectid = results_processor.persist_results(CB_SENT_VD, cv_sent_vd_ys_by_tag,
                                                         cv_sent_vd_predictions_by_tag, parameters, sent_algo)

    # print(processor.results_to_string(sent_td_objectid, CB_SENT_TD, sent_vd_objectid, CB_SENT_VD, "SENTENCE"))
    micro_f1 = float(results_processor.get_metric(CB_SENT_VD, sent_vd_objectid, __MICRO_F1__)["f1_score"])
    return micro_f1
def evaluate_model(collection_prefix: str,
                   folds: List[Tuple[Any, Any]],
                   extractor_fn_names_lst: List[str],
                   cost_function_name: str,
                   beta: float,
                   ngrams: int,
                   stemmed: bool,
                   max_epochs: int,
                   down_sample_rate=1.0) -> float:

    if down_sample_rate < 1.0:
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
            essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
            essays_VD = essays_VD[:int(down_sample_rate * len(essays_VD))]
            new_folds.append((essays_TD, essays_VD))
        folds = new_folds  # type: List[Tuple[Any, Any]]

    #logger.info("\tModei={model}".format(model=str(BASE_LEARNER_FACT())))

    parallel_results = Parallel(n_jobs=len(folds))(
        delayed(model_train_predict)(essays_TD, essays_VD,
                                     extractor_fn_names_lst,
                                     cost_function_name, ngrams, stemmed,
                                     beta, max_epochs)
        for essays_TD, essays_VD in folds)

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []

    # Parallel is almost 5X faster!!!
    for (num_feats, sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode) in parallel_results:
        number_of_feats.append(num_feats)

        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode,
                           cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode,
                           cv_sent_vd_predictions_by_tag)
        # break

    # Mongo settings recording
    avg_feats = np.mean(number_of_feats)
    sent_algo = "Shift_Reduce_Parser_LR"

    parameters = dict(config)
    parameters["extractors"] = list(extractor_fn_names_lst)
    parameters["num_extractors"] = len(extractor_fn_names_lst)
    parameters["cost_function"] = cost_function_name
    parameters["beta"] = beta
    parameters["max_epochs"] = max_epochs
    parameters["no_stacking"] = True
    parameters["algorithm"] = str(BASE_LEARNER_FACT())
    parameters["ngrams"] = str(ngrams)
    parameters["num_feats_MEAN"] = avg_feats
    parameters["num_feats_per_fold"] = number_of_feats
    parameters["min_feat_freq"] = MIN_FEAT_FREQ
    parameters["stemmed"] = stemmed

    logger.info(
        "\t\tMean num feats: {avg_feats:.2f}".format(avg_feats=avg_feats))

    TD = collection_prefix + "_TD"
    VD = collection_prefix + "_VD"
    if down_sample_rate < 1.0:
        logger.info(
            "\t\tDown sampling at rate: {rate:.5f}, storing temp results".
            format(rate=down_sample_rate))
        parameters["down_sample"] = down_sample_rate
        CB_SENT_TD, CB_SENT_VD = "__tmp_" + TD, "__tmp_" + TD
    else:
        CB_SENT_TD, CB_SENT_VD = TD, VD

    sent_td_objectid = results_processor.persist_results(
        CB_SENT_TD, cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag,
        parameters, sent_algo)
    sent_vd_objectid = results_processor.persist_results(
        CB_SENT_VD, cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag,
        parameters, sent_algo)

    # print(processor.results_to_string(sent_td_objectid, CB_SENT_TD, sent_vd_objectid, CB_SENT_VD, "SENTENCE"))
    micro_f1 = float(
        results_processor.get_metric(CB_SENT_VD, sent_vd_objectid,
                                     __MICRO_F1__)["f1_score"])
    return micro_f1
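# Illustrative call to evaluate_model. The argument values are assumptions;
# the collection prefix is taken from the constants used later in this file,
# and `folds`, `extractor_names` and `cost_function_name` are assumed to be
# set up as elsewhere here:
micro_f1 = evaluate_model(
    collection_prefix="CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED",
    folds=folds,
    extractor_fn_names_lst=extractor_names,
    cost_function_name=cost_function_name,
    beta=0.3, ngrams=2, stemmed=False, max_epochs=10)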
""" FEATURE EXTRACTION """

cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

folds = [(train_tagged_essays, test_tagged_essays)]

results = Parallel(n_jobs=len(folds))(
    delayed(train_classifer_on_fold)(essays_TD, essays_VD, regular_tags, fold)
    for fold, (essays_TD, essays_VD) in enumerate(folds))

for result in results:
    wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code = result

    merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
    merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
    merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
    merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

logger.info("Training completed")

""" Persist Results to Mongo DB """
wd_algo = "HMM_MOST_COMMON_TAG_MULTICLASS"
SUFFIX = "_HMM_MOST_COMMON_TAG_MULTICLASS"
CB_TAGGING_TD, CB_TAGGING_VD= "TEST_CB_TAGGING_TD" + SUFFIX, "TEST_CB_TAGGING_VD" + SUFFIX

parameters = dict(train_config)
parameters["min_feat_freq"] = MIN_FEAT_FREQ
if STEM:
    parameters["extractors"] = "stemmed_unigrams"
                                             cost_function=cost_fn,
                                             min_feature_freq=MIN_FEAT_FREQ,
                                             ngram_extractor=ngram_extractor,
                                             cr_tags=cr_tags,
                                             base_learner_fact=LogisticRegression,
                                             beta=BETA
                                             )
    parse_model.train(essays_TD, MAX_EPOCHS)

    sent_td_ys_bycode = parse_model.get_label_data(essays_TD)
    sent_vd_ys_bycode = parse_model.get_label_data(essays_VD)

    sent_td_pred_ys_bycode = parse_model.predict(essays_TD)
    sent_vd_pred_ys_bycode = parse_model.predict(essays_VD)

    merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
    merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
    merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
    merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)
    # break

# CB_SENT_TD, CB_SENT_VD = "CR_CB_SHIFT_REDUCE_PARSER_TD" , "CR_CB_SHIFT_REDUCE_PARSER_VD"
CB_SENT_TD, CB_SENT_VD = "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_TD", "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_VD"
# sent_algo = "Shift_Reduce_Parser"
sent_algo = "Shift_Reduce_Parser_LR"
# sent_algo = "Shift_Reduce_Parser_XGB_10"
# sent_algo = "Shift_Reduce_Parser_CLA_LR"
# sent_algo = "Shift_Reduce_Parser_WTD_LR"
# sent_algo = "Shift_Reduce_Parser_WTD_RF"
# sent_algo = "Shift_Reduce_Parser_WTD_RF_25"
# sent_algo = "Shift_Reduce_Parser_WTD_GBT_3"
def evaluate_features(folds: List[Tuple[Any, Any]],
                      extractor_names: Set[str],
                      cost_function_name: str,
                      beta: float = 0.3,
                      base_learner: Any = LogisticRegression,
                      ngrams: int = 2, down_sample_rate=1.0) -> float:
    if down_sample_rate < 1.0:
        new_folds = []  # type: List[Tuple[Any, Any]]
        for i, (essays_TD, essays_VD) in enumerate(folds):
            essays_TD = essays_TD[:int(down_sample_rate * len(essays_TD))]
            essays_VD = essays_VD[:int(down_sample_rate * len(essays_VD))]
            new_folds.append((essays_TD, essays_VD))
        folds = new_folds  # type: List[Tuple[Any, Any]]

    parallel_results = Parallel(n_jobs=len(folds))(
        delayed(model_train_predict)(essays_TD, essays_VD, extractor_names, cost_function_name, ngrams, beta)
        for essays_TD, essays_VD in folds)

    cv_sent_td_ys_by_tag, cv_sent_td_predictions_by_tag = defaultdict(list), defaultdict(list)
    cv_sent_vd_ys_by_tag, cv_sent_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

    # record the number of features in each fold
    number_of_feats = []

    # Parallel is almost 5X faster!!!
    for (num_feats,
         sent_td_ys_bycode, sent_vd_ys_bycode,
         sent_td_pred_ys_bycode, sent_vd_pred_ys_bycode) in parallel_results:
        number_of_feats.append(num_feats)

        merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
        merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
        merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
        merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)
        # break

    # Mongo settings recording
    avg_feats = np.mean(number_of_feats)
    sent_algo = "Shift_Reduce_Parser_LR"

    parameters = dict(config)
    parameters["extractors"] = list(sorted(extractor_names))
    parameters["num_extractors"] = len(extractor_names)
    parameters["cost_function"] = cost_function_name
    parameters["beta"] = beta
    parameters["no_stacking"] = True
    parameters["algorithm"] = str(base_learner())
    parameters["ngrams"] = str(ngrams)
    parameters["num_feats_MEAN"] = avg_feats
    parameters["num_feats_per_fold"] = number_of_feats
    parameters["min_feat_freq"] = MIN_FEAT_FREQ
    parameters["stemmed"] = False

    logger.info("\t\tMean num feats: {avg_feats:.2f}".format(avg_feats=avg_feats))

    TD = "CR_CB_SHIFT_REDUCE_PARSER_REGRESSION_SEL_TD"
    VD = "CR_CB_SHIFT_REDUCE_PARSER_REGRESSION_SEL_VD"
    if down_sample_rate < 1.0:
        logger.info("\t\tDown sampling at rate: {rate:.5f}, storing temp results".format(rate=down_sample_rate))
        parameters["down_sample"] = down_sample_rate
        CB_SENT_TD, CB_SENT_VD = "__tmp_" + TD, "__tmp_" + TD
    else:
        CB_SENT_TD, CB_SENT_VD = TD, VD

    sent_td_objectid = processor.persist_results(CB_SENT_TD, cv_sent_td_ys_by_tag,
                                                 cv_sent_td_predictions_by_tag, parameters, sent_algo)
    sent_vd_objectid = processor.persist_results(CB_SENT_VD, cv_sent_vd_ys_by_tag,
                                                 cv_sent_vd_predictions_by_tag, parameters, sent_algo)

    # print(processor.results_to_string(sent_td_objectid, CB_SENT_TD, sent_vd_objectid, CB_SENT_VD, "SENTENCE"))
    micro_f1 = float(processor.get_metric(CB_SENT_VD, sent_vd_objectid, __MICRO_F1__)["f1_score"])
    return micro_f1
        feature_extractor=template_feature_extractor,
        cost_function=cost_fn,
        min_feature_freq=MIN_FEAT_FREQ,
        ngram_extractor=ngram_extractor,
        cr_tags=cr_tags,
        base_learner_fact=LogisticRegression,
        beta=BETA)
    parse_model.train(essays_TD, MAX_EPOCHS)

    sent_td_ys_bycode = parse_model.get_label_data(essays_TD)
    sent_vd_ys_bycode = parse_model.get_label_data(essays_VD)

    sent_td_pred_ys_bycode = parse_model.predict(essays_TD)
    sent_vd_pred_ys_bycode = parse_model.predict(essays_VD)

    merge_dictionaries(sent_td_ys_bycode, cv_sent_td_ys_by_tag)
    merge_dictionaries(sent_vd_ys_bycode, cv_sent_vd_ys_by_tag)
    merge_dictionaries(sent_td_pred_ys_bycode, cv_sent_td_predictions_by_tag)
    merge_dictionaries(sent_vd_pred_ys_bycode, cv_sent_vd_predictions_by_tag)
    # break

# CB_SENT_TD, CB_SENT_VD = "CR_CB_SHIFT_REDUCE_PARSER_TD" , "CR_CB_SHIFT_REDUCE_PARSER_VD"
CB_SENT_TD, CB_SENT_VD = "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_TD", "CR_CB_SHIFT_REDUCE_PARSER_TEMPLATED_VD"
# sent_algo = "Shift_Reduce_Parser"
sent_algo = "Shift_Reduce_Parser_LR"
# sent_algo = "Shift_Reduce_Parser_XGB_10"
# sent_algo = "Shift_Reduce_Parser_CLA_LR"
# sent_algo = "Shift_Reduce_Parser_WTD_LR"
# sent_algo = "Shift_Reduce_Parser_WTD_RF"
# sent_algo = "Shift_Reduce_Parser_WTD_RF_25"
# sent_algo = "Shift_Reduce_Parser_WTD_GBT_3"