Example #1
0
def main():
    """
    Entry point: build pre-trained models if they are missing, then
    generate recommendations with them.
    """
    # command-line arguments
    args = parse_argument()

    # main configuration drives logging and all sub-configs
    main_config = ConfigParser(args.config_file)
    set_logging(log_file=main_config.get_str('log_file'))

    # manager responsible for training / inference
    model_manager = ModelManager()

    # configuration values for this run
    classifier = main_config.get_str('classifier')
    num_classifiers = main_config.get_int('num_classifiers')
    pre_built_models_dir = os.path.join(
        main_config.get_str('pre_built_models_dir'), classifier)

    # build and persist the models first when none exist on disk yet
    if not dir_exists(pre_built_models_dir):
        save_models(
            classifier,
            pre_built_models_dir,
            main_config,
            model_manager,
            num_classifiers)

    make_recommendation(
        classifier,
        pre_built_models_dir,
        main_config,
        model_manager,
        num_classifiers)
Example #2
0
def main():
    """
    Entry point: parse FoodOn, seed the class skeleton, and run scoring.
    """
    # arguments, configuration, logging
    args = parse_argument()
    configparser = ConfigParser(args.config_file)
    set_logging(configparser.getstr('logfile'))

    # build candidate classes and the seeded skeleton from FoodOn
    parse_foodon = ParseFoodOn(configparser.getstr('foodon_parse_config'))
    candidate_classes = parse_foodon.get_candidate_classes()
    skeleton, entities = parse_foodon.get_seeded_skeleton(candidate_classes)

    # run the scoring iteration over the skeleton / entities
    scoring_manager = ScoringManager(
        skeleton, entities, configparser.getstr('scoring_config'))
    scoring_manager.run_iteration()
Example #3
0
def main():
    """
    Entry point: compare the baseline and best classifiers by plotting the
    PR curve and printing the confusion matrix.
    """
    # command-line arguments
    args = parse_argument()

    # main configuration and logging
    main_config = ConfigParser(args.config_file)
    set_logging(log_file=main_config.get_str('log_file'))

    # manager that runs the models
    model_manager = ModelManager()

    # classifiers to compare against each other
    baseline_classifier = main_config.get_str('baseline')
    best_classifier = main_config.get_str('classifier')

    # generate the PR curve and the confusion matrix
    plot_pr_print_cm(baseline_classifier, best_classifier, main_config,
                     model_manager)
Example #4
0
def main():
    """
    Entry point: train word2vec on the configured sentence column and save
    the model, vectors, and training loss.
    """
    # logging, arguments, configuration
    set_logging()
    args = parse_argument()
    configparser = ConfigParser(args.config_file)

    # load the training data and drop rows whose sentence column is empty
    sentence_column = configparser.getstr('sentence_column')
    pd_data = pd.read_csv(configparser.getstr('input_filepath'), sep='\t')
    pd_data.fillna('', inplace=True)
    pd_data = pd_data[pd_data[sentence_column] != '']

    # whitespace-tokenize each sentence in the chosen column
    sentences = [sentence.split()
                 for sentence in pd_data[sentence_column].tolist()]

    # word2vec manager is configured from the same file
    w2vm = Word2VecManager(args.config_file)

    # optionally warm-start from pre-trained vectors
    pretrained = (configparser.getstr('pre_trained_vectors')
                  if configparser.getbool('pre_train') else None)

    w2vm.train(sentences, pretrained=pretrained)

    # persist model, embeddings, and loss curve
    w2vm.save_model(configparser.getstr('model_saveto'))
    w2vm.save_vectors(configparser.getstr('vectors_saveto'))
    w2vm.save_loss(configparser.getstr('loss_saveto'))
Example #5
0
def main():
    """
    Grid-search every combination of preprocessing modes and classifiers.

    For each (scale_mode, mvi_mode, outlier_mode, classifier) combination,
    preprocessing and grid search are run; the best score per classifier and
    the combination that produced it are tracked, and failing combinations
    are collected and logged instead of aborting the sweep.
    """
    # parse args
    args = parse_argument()

    # load main config file and set logging
    main_config = ConfigParser(args.config_file)
    set_logging(log_file=main_config.get_str('log_file'))

    # initialize model manager object
    model_manager = ModelManager()

    # preprocessing / classifier options to sweep over
    scale_modes = main_config.get_str_list('scale_mode')
    mvi_modes = main_config.get_str_list('mvi_mode')
    outlier_modes = main_config.get_str_list('outlier_mode')
    classifiers = main_config.get_str_list('classifier')

    # best score and the combination that produced it, per classifier
    classifier_score_dict = {classifier: 0 for classifier in classifiers}
    classifier_best_combination_dict = {
        classifier: None
        for classifier in classifiers
    }
    all_combinations = list(
        itertools.product(scale_modes, mvi_modes, outlier_modes, classifiers))
    failed_combinations = []

    # hoisted: this key is read inside the loop and in the final logs
    optimize_scoring = main_config.get_str('optimize_scoring')

    for idx, combination in enumerate(all_combinations):
        # unpack the tuple
        scale_mode, mvi_mode, outlier_mode, classifier = combination

        # log current combination
        combination_str_joined = ', '.join(combination)
        log.info('Running grid search %d/%d: (%s)', idx + 1,
                 len(all_combinations), combination_str_joined)

        # some classifiers must use minmax scaler
        if (classifier in ('MultinomialNB', 'CategoricalNB')
                and scale_mode != 'minmax'):
            log.info('Skipping this combination...')
            continue

        # fresh config objects every iteration so overwrites do not leak
        # into the next combination
        preprocess_config = ConfigParser(
            main_config.get_str('preprocess_config'))
        classifier_config = ConfigParser(
            main_config.get_str('classifier_config'))

        preprocess_config.overwrite('scale_mode', scale_mode)
        preprocess_config.overwrite('mvi_mode', mvi_mode)
        preprocess_config.overwrite('outlier_mode', outlier_mode)
        classifier_config.overwrite('classifier', classifier)

        # perform preprocessing
        X, y = model_manager.preprocess(preprocess_config)

        # run classification model; a failing combination is recorded
        # and skipped rather than aborting the whole sweep
        try:
            score = model_manager.grid_search(
                X, y, optimize_scoring, classifier_config,
                main_config.get_str('updated_classifier_config'))
        except (IndexError, ValueError) as e:
            failed_combinations.append(combination_str_joined)
            log.error(e)
            continue

        # update the best preprocessing combination
        if classifier_score_dict[classifier] < score:
            classifier_score_dict[classifier] = score
            classifier_best_combination_dict[
                classifier] = combination_str_joined

    log.info('Best %s score for each classifier: %s', optimize_scoring,
             classifier_score_dict)

    log.info(
        'Preprocessing combination of the best %s score for each classifier: %s',
        optimize_scoring, classifier_best_combination_dict)

    log.info('%d failed combinations: %s', len(failed_combinations),
             failed_combinations)
Example #6
0
def _collect_curve_lines(classifiers_ys, plot_func):
    """
    Plot one curve per classifier onto the current figure.

    Args:
        classifiers_ys: dict mapping classifier name -> (y_trues, y_preds,
            y_probs) fold-wise tuples produced by run_model_cv().
        plot_func: plotting callable (e.g. plot_pr or plot_roc) taking
            (y_trues, y_probs_1, classifier) and returning (line, label).

    Returns:
        (lines, labels) lists for building the figure legend.
    """
    lines = []
    labels = []
    for classifier, ys in classifiers_ys.items():
        y_trues, y_preds, y_probs = ys

        # positive-class probabilities for each CV fold
        y_probs_1 = tuple(y_prob[1].to_numpy() for y_prob in y_probs)
        line, label = plot_func(y_trues, y_probs_1, classifier)

        lines.append(line)
        labels.append(label)

    return lines, labels


def main():
    """
    Main function.

    Runs cross-validated models for the configured classifiers, then saves
    PR-curve and ROC-curve figures comparing them.
    """
    # parse args
    args = parse_argument()

    # load main config file and set logging
    main_config = ConfigParser(args.config_file)
    set_logging(log_file=main_config.get_str('log_file'))

    # initialize model manager object
    model_manager = ModelManager()

    # perform analysis on these classifiers
    classifiers = main_config.get_str_list('classifier')

    # do prediction
    classifiers_ys = {}
    for classifier in classifiers:
        log.info('Running model for classifier \'%s\'', classifier)

        # load config parsers
        preprocess_config = ConfigParser(
            main_config.get_str('preprocess_config'))
        classifier_config = ConfigParser(
            main_config.get_str('classifier_config'))

        # perform preprocessing
        X, y = model_manager.preprocess(preprocess_config, section=classifier)

        # run classification model
        classifier_config.overwrite('classifier', classifier)

        X = model_manager.feature_selector(X, y, classifier_config)

        # only the fold-wise ys tuple is needed; scores are discarded here
        _, _, ys = model_manager.run_model_cv(X, y, 'f1', classifier_config)

        classifiers_ys[classifier] = ys

    # plot PR curve
    fig = plt.figure()
    lines, labels = _collect_curve_lines(classifiers_ys, plot_pr)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('PR Curve')
    plt.legend(lines, labels, loc='lower right', prop={'size': 8})

    save_figure(
        fig,
        os.path.join(main_config.get_str('visualization_dir'), 'pr_curve.png'))

    # plot ROC curve
    fig = plt.figure()
    lines, labels = _collect_curve_lines(classifiers_ys, plot_roc)

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(lines, labels, loc='lower right', prop={'size': 8})

    save_figure(
        fig,
        os.path.join(main_config.get_str('visualization_dir'),
                     'roc_curve.png'))
Example #7
0
def main():
    """
    Entry point: pull Wikipedia summaries for FoodOn vocabulary terms and
    preprocess them for downstream use.
    """
    # logging, arguments, configuration
    set_logging(log_level=log.INFO)
    args = parse_argument()
    configparser = ConfigParser(args.config_file)

    # preprocessing manager for label / summary text
    fpm = FdcPreprocessManager(configparser.getstr('preprocess_config'))

    # collect the unique FoodOn labels (parents and children)
    pd_foodon_pairs = pd.read_csv('./data/FoodOn/foodonpairs.txt', sep='\t')
    labels = list(
        set(pd_foodon_pairs['Parent'].tolist() +
            pd_foodon_pairs['Child'].tolist()))

    log.info('Number of unique labels: %d', len(labels))

    # queries = each preprocessed label plus its individual tokens
    processed_labels = fpm.preprocess_column(pd.Series(labels),
                                             load_model=True).tolist()
    queries = processed_labels.copy()
    for processed_label in processed_labels:
        queries.extend(processed_label.split())
    queries = list(set(queries))

    # Wikipedia summary fetcher
    wm = WikipediaManager()

    # optionally resume from a previous run's pulled / failed results
    prev_summary = None
    prev_failed = None
    if configparser.getbool('reuse_previous'):
        prev_summary = configparser.getstr('prev_summaries_filepath')
        prev_failed = configparser.getstr('prev_failed_filepath')

    pd_summary, pd_failed = wm.get_summary(queries,
                                           prev_summary=prev_summary,
                                           prev_failed=prev_failed)

    # save successful pulls
    summaries_filepath = configparser.getstr('summaries_filepath')
    log.info('Saving successfully pulled wiki summaries to %s',
             summaries_filepath)
    pd_summary.to_csv(summaries_filepath, sep='\t', index=False)

    # save failed queries
    failed_filepath = configparser.getstr('failed_filepath')
    log.info('Saving failed wiki queries to %s', failed_filepath)
    pd_failed.to_csv(failed_filepath, sep='\t', index=False)

    # preprocess the pulled summaries and save the final output
    pd_summary['summary_preprocessed'] = fpm.preprocess_column(
        pd_summary['summary'], load_model=True)

    output_filepath = configparser.getstr('preprocessed_output')
    log.info('Saving preprocessed wikipedia data to %s...', output_filepath)
    pd_summary.to_csv(output_filepath, sep='\t', index=False)
Example #8
0
            else:
                seeds = random.sample(entities, self.num_seeds)
                candidate_entities.extend(list(set(entities) - set(seeds)))

            skeleton_candidate_classes_dict[candidate_class] = (
                candidate_classes_dict[candidate_class][0], seeds)

        candidate_entities = list(set(candidate_entities))
        candidate_entities.sort()

        log.info(
            'Found %d candidate entities to populate out of %d all entities.',
            len(candidate_entities), len(self.all_entities))

        return_value = (skeleton_candidate_classes_dict, candidate_entities)
        save_pkl(return_value, self.skeleton_and_entities_pkl)

        return return_value


if __name__ == '__main__':
    # set up logging; NOTE(review): no CLI args are parsed here — the
    # config path below is hard-coded
    set_logging()

    # parse FoodOn from the fixed config and build the seeded skeleton
    # of candidate classes plus the entities left to populate
    parse_foodon = ParseFoodOn('../config/foodon_parse.ini')
    all_classes_dict = parse_foodon.get_all_classes_dict()
    candidate_classes_dict = parse_foodon.get_candidate_classes()
    (skeleton_candidate_classes_dict, candidate_entities
     ) = parse_foodon.get_seeded_skeleton(candidate_classes_dict)