def main():
    """Build pre-trained models if needed, then run the recommender.

    Reads the main config file given on the command line, sets up logging,
    and trains/saves the classifier models only when the pre-built model
    directory does not already exist.
    """
    # command-line arguments and configuration
    args = parse_argument()
    config = ConfigParser(args.config_file)
    set_logging(log_file=config.get_str('log_file'))

    # model manager drives training and inference
    manager = ModelManager()

    # configuration values for the selected classifier
    selected_classifier = config.get_str('classifier')
    models_dir = os.path.join(
        config.get_str('pre_built_models_dir'), selected_classifier)
    num_classifiers = config.get_int('num_classifiers')

    # train and persist the models only on first run
    if not dir_exists(models_dir):
        save_models(
            selected_classifier,
            models_dir,
            config,
            manager,
            num_classifiers)

    make_recommendation(
        selected_classifier,
        models_dir,
        config,
        manager,
        num_classifiers)
def main():
    """Parse the FoodOn ontology and run the iterative scoring step.

    Loads configuration from the command line, extracts candidate classes
    and their seeded skeleton from FoodOn, then hands both to the scoring
    manager to populate the ontology.
    """
    # arguments, configuration, and logging
    args = parse_argument()
    config = ConfigParser(args.config_file)
    set_logging(config.getstr('logfile'))

    # extract candidate classes and a seeded skeleton from FoodOn
    foodon = ParseFoodOn(config.getstr('foodon_parse_config'))
    candidate_classes = foodon.get_candidate_classes()
    skeleton, entities = foodon.get_seeded_skeleton(candidate_classes)

    # score and populate the skeleton iteratively
    manager = ScoringManager(
        skeleton, entities, config.getstr('scoring_config'))
    manager.run_iteration()
def main():
    """Compare the baseline and best classifiers visually.

    Loads configuration, then plots the precision-recall curve and prints
    the confusion matrix for both classifiers.
    """
    # arguments, configuration, and logging
    args = parse_argument()
    config = ConfigParser(args.config_file)
    set_logging(log_file=config.get_str('log_file'))

    # model manager drives preprocessing and evaluation
    manager = ModelManager()

    # classifiers to compare: configured baseline vs. configured best
    baseline = config.get_str('baseline')
    best = config.get_str('classifier')

    plot_pr_print_cm(baseline, best, config, manager)
def main():
    """Train a word2vec model on sentences from a TSV file and save results.

    Reads the configured sentence column, tokenizes it on whitespace,
    optionally resumes from pre-trained vectors, then writes the model,
    embedding vectors, and training loss to the configured paths.
    """
    # logging, arguments, and configuration
    set_logging()
    args = parse_argument()
    config = ConfigParser(args.config_file)

    # load the training data and drop rows with empty sentences
    column = config.getstr('sentence_column')
    data = pd.read_csv(config.getstr('input_filepath'), sep='\t')
    data.fillna('', inplace=True)
    data = data[data[column] != '']

    # whitespace-tokenize the configured sentence column
    tokenized = [row.split() for row in data[column].tolist()]

    # word2vec manager reads its own settings from the same config file
    w2vm = Word2VecManager(args.config_file)

    # resume from pre-trained vectors only when requested
    pretrained = (config.getstr('pre_trained_vectors')
                  if config.getbool('pre_train') else None)
    w2vm.train(tokenized, pretrained=pretrained)

    # persist model, embeddings, and loss curve
    w2vm.save_model(config.getstr('model_saveto'))
    w2vm.save_vectors(config.getstr('vectors_saveto'))
    w2vm.save_loss(config.getstr('loss_saveto'))
def main():
    """Grid-search every preprocessing/classifier combination.

    Enumerates the cartesian product of scale, missing-value-imputation,
    and outlier modes with each classifier, runs a grid search for each
    combination, and logs the best-scoring combination per classifier
    along with any combinations that failed.
    """
    # arguments, configuration, and logging
    args = parse_argument()
    main_config = ConfigParser(args.config_file)
    set_logging(log_file=main_config.get_str('log_file'))

    # model manager drives preprocessing and grid search
    model_manager = ModelManager()

    # preprocessing options and classifiers to sweep over
    scale_modes = main_config.get_str_list('scale_mode')
    mvi_modes = main_config.get_str_list('mvi_mode')
    outlier_modes = main_config.get_str_list('outlier_mode')
    classifiers = main_config.get_str_list('classifier')

    # best score and best combination seen so far, per classifier
    best_score = {clf: 0 for clf in classifiers}
    best_combination = {clf: None for clf in classifiers}

    combinations = list(itertools.product(
        scale_modes, mvi_modes, outlier_modes, classifiers))
    failed_combinations = []

    for idx, combination in enumerate(combinations):
        scale_mode, mvi_mode, outlier_mode, classifier = combination

        combination_str_joined = ', '.join(list(combination))
        log.info('Running grid search %d/%d: (%s)',
                 idx + 1, len(combinations), combination_str_joined)

        # these classifiers require non-negative features, so only the
        # minmax scaler is compatible
        if classifier in ['MultinomialNB', 'CategoricalNB'] \
                and scale_mode != 'minmax':
            log.info('Skipping this combination...')
            continue

        # fresh config parsers, overwritten with the current combination
        preprocess_config = ConfigParser(
            main_config.get_str('preprocess_config'))
        classifier_config = ConfigParser(
            main_config.get_str('classifier_config'))
        preprocess_config.overwrite('scale_mode', scale_mode)
        preprocess_config.overwrite('mvi_mode', mvi_mode)
        preprocess_config.overwrite('outlier_mode', outlier_mode)
        classifier_config.overwrite('classifier', classifier)

        X, y = model_manager.preprocess(preprocess_config)

        # some combinations are invalid for some models; record and move on
        try:
            score = model_manager.grid_search(
                X,
                y,
                main_config.get_str('optimize_scoring'),
                classifier_config,
                main_config.get_str('updated_classifier_config'))
        except (IndexError, ValueError) as e:
            failed_combinations.append(combination_str_joined)
            log.error(e)
            continue

        # keep the best-scoring preprocessing combination per classifier
        if best_score[classifier] < score:
            best_score[classifier] = score
            best_combination[classifier] = combination_str_joined

    log.info('Best %s score for each classifier: %s',
             main_config.get_str('optimize_scoring'), best_score)
    log.info(
        'Preprocessing combination of the best %s score for each classifier: %s',
        main_config.get_str('optimize_scoring'), best_combination)
    log.info('%d failed combinations: %s',
             len(failed_combinations), failed_combinations)
def main():
    """Run cross-validated models for each classifier and plot PR/ROC curves.

    For every configured classifier: preprocess, select features, and run
    cross-validation, collecting the per-fold truths/predictions/probabilities.
    Then draws one combined precision-recall figure and one combined ROC
    figure and saves both to the configured visualization directory.
    """
    # arguments, configuration, and logging
    args = parse_argument()
    main_config = ConfigParser(args.config_file)
    set_logging(log_file=main_config.get_str('log_file'))

    # model manager drives preprocessing and cross-validation
    model_manager = ModelManager()

    classifiers = main_config.get_str_list('classifier')

    # collect (y_trues, y_preds, y_probs) per classifier
    classifiers_ys = {}
    for classifier in classifiers:
        log.info('Running model for classifier \'%s\'', classifier)

        preprocess_config = ConfigParser(
            main_config.get_str('preprocess_config'))
        classifier_config = ConfigParser(
            main_config.get_str('classifier_config'))

        # preprocessing is parameterized per classifier via config sections
        X, y = model_manager.preprocess(preprocess_config, section=classifier)

        classifier_config.overwrite('classifier', classifier)
        X = model_manager.feature_selector(X, y, classifier_config)
        score_avg, score_std, ys = model_manager.run_model_cv(
            X, y, 'f1', classifier_config)
        classifiers_ys[classifier] = ys

    # combined PR curve over all classifiers
    fig = plt.figure()
    lines, labels = [], []
    for classifier, ys in classifiers_ys.items():
        y_trues, y_preds, y_probs = ys
        # probability of the positive class, one array per fold
        y_probs_1 = tuple(y_prob[1].to_numpy() for y_prob in y_probs)
        line, label = plot_pr(y_trues, y_probs_1, classifier)
        lines.append(line)
        labels.append(label)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('PR Curve')
    plt.legend(lines, labels, loc='lower right', prop={'size': 8})
    save_figure(
        fig,
        os.path.join(main_config.get_str('visualization_dir'),
                     'pr_curve.png'))

    # combined ROC curve over all classifiers
    fig = plt.figure()
    lines, labels = [], []
    for classifier, ys in classifiers_ys.items():
        y_trues, y_preds, y_probs = ys
        y_probs_1 = tuple(y_prob[1].to_numpy() for y_prob in y_probs)
        line, label = plot_roc(y_trues, y_probs_1, classifier)
        lines.append(line)
        labels.append(label)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(lines, labels, loc='lower right', prop={'size': 8})
    save_figure(
        fig,
        os.path.join(main_config.get_str('visualization_dir'),
                     'roc_curve.png'))
def main():
    """Pull Wikipedia summaries for FoodOn vocabulary and preprocess them.

    Builds a query list from the unique parent/child labels of the FoodOn
    pairs file (whole labels plus their individual tokens), fetches each
    query's Wikipedia summary (optionally reusing earlier results), saves
    successes and failures, and writes a preprocessed copy of the summaries.
    """
    # logging, arguments, and configuration
    set_logging(log_level=log.INFO)
    args = parse_argument()
    config = ConfigParser(args.config_file)

    # preprocessing manager used for both labels and pulled summaries
    fpm = FdcPreprocessManager(config.getstr('preprocess_config'))

    # unique FoodOn labels from both columns of the pairs file
    # NOTE(review): input path is hard-coded rather than configured
    pairs = pd.read_csv('./data/FoodOn/foodonpairs.txt', sep='\t')
    labels = []
    labels.extend(pairs['Parent'].tolist())
    labels.extend(pairs['Child'].tolist())
    labels = list(set(labels))
    log.info('Number of unique labels: %d', len(labels))

    processed_labels = fpm.preprocess_column(
        pd.Series(labels), load_model=True).tolist()

    # query each whole label and each of its individual tokens
    queries = processed_labels.copy()
    for label in processed_labels:
        queries.extend(label.split())
    queries = list(set(queries))

    # fetch Wikipedia summaries, optionally resuming from earlier output
    wm = WikipediaManager()
    if config.getbool('reuse_previous'):
        prev_summary = config.getstr('prev_summaries_filepath')
        prev_failed = config.getstr('prev_failed_filepath')
    else:
        prev_summary = None
        prev_failed = None
    pd_summary, pd_failed = wm.get_summary(
        queries, prev_summary=prev_summary, prev_failed=prev_failed)

    # persist successes and failures separately
    log.info('Saving successfully pulled wiki summaries to %s',
             config.getstr('summaries_filepath'))
    pd_summary.to_csv(config.getstr('summaries_filepath'),
                      sep='\t', index=False)
    log.info('Saving failed wiki queries to %s',
             config.getstr('failed_filepath'))
    pd_failed.to_csv(config.getstr('failed_filepath'),
                     sep='\t', index=False)

    # add a preprocessed copy of the summary text and save it
    pd_summary['summary_preprocessed'] = fpm.preprocess_column(
        pd_summary['summary'], load_model=True)
    output_filepath = config.getstr('preprocessed_output')
    log.info('Saving preprocessed wikipedia data to %s...', output_filepath)
    pd_summary.to_csv(output_filepath, sep='\t', index=False)
else: seeds = random.sample(entities, self.num_seeds) candidate_entities.extend(list(set(entities) - set(seeds))) skeleton_candidate_classes_dict[candidate_class] = ( candidate_classes_dict[candidate_class][0], seeds) candidate_entities = list(set(candidate_entities)) candidate_entities.sort() log.info( 'Found %d candidate entities to populate out of %d all entities.', len(candidate_entities), len(self.all_entities)) return_value = (skeleton_candidate_classes_dict, candidate_entities) save_pkl(return_value, self.skeleton_and_entities_pkl) return return_value if __name__ == '__main__': # set log, parse args, and read configuration set_logging() # parse FoodOn parse_foodon = ParseFoodOn('../config/foodon_parse.ini') all_classes_dict = parse_foodon.get_all_classes_dict() candidate_classes_dict = parse_foodon.get_candidate_classes() (skeleton_candidate_classes_dict, candidate_entities ) = parse_foodon.get_seeded_skeleton(candidate_classes_dict)