def conf(tmpdir):
    """Build a default experiment configuration for tests.

    Loads the shared ``conf/confrc`` defaults by parsing an empty temp file,
    then overrides the feature-extraction and feature-selection sections with
    test-friendly settings.
    """
    # parse_config_file needs a config file on disk; an empty one yields pure defaults
    blank_path = str(tmpdir.join('blank'))
    with open(blank_path, 'w'):
        pass  # touch
    settings, _ = parse_config_file(blank_path, confrc='conf/confrc', quit_on_error=False)

    # identical unigram-only extraction options at train and decode time
    unigram_opts = {'extract_unigram_features': ['J', 'N', 'V'],
                    'extract_phrase_features': []}
    settings['feature_extraction'].update({
        'class': 'eval.pipeline.bov.ThesaurusVectorizer',
        'min_df': 1,
        'k': 10,  # use all thesaurus entries
        'train_token_handler': 'eval.pipeline.feature_handlers.BaseFeatureHandler',
        'decode_token_handler': 'eval.pipeline.feature_handlers.BaseFeatureHandler',
        'random_neighbour_thesaurus': False,
        'train_time_opts': dict(unigram_opts),   # independent copies, as in the
        'decode_time_opts': dict(unigram_opts),  # original two dict() literals
    })
    settings['feature_selection'].update({
        'run': True,
        'method': 'eval.pipeline.feature_selectors.VectorBackedSelectKBest',
        'scoring_function': 'sklearn.feature_selection.chi2',
        'must_be_in_thesaurus': False,
        'k': 'all',
    })
    settings['vector_sources']['is_thesaurus'] = True
    return settings
def test_jsonify_XML_corpus():
    """Round-trip check: tokenizing the XML corpus directly must yield the
    same documents and labels as tokenizing its jsonified (gzipped) copy.

    Fix: the generated ``.gz`` file is now removed in a ``finally`` block, so
    a failing assertion no longer leaks the file and poisons later reruns.
    """
    conf_file = 'tests/resources/conf/exp0/exp0.conf'
    conf, _ = parse_config_file(conf_file)
    train_set = conf['training_data']
    json_train_set = train_set + '.gz'
    tk = get_tokenizer_settings_from_conf(conf)

    # parse the XML directly
    x_tr, y_tr, _, _ = get_tokenized_data(train_set, tk)
    jsonify_single_labelled_corpus('unit_tests', train_set, conf_file)
    try:
        x_tr1, y_tr1, _, _ = get_tokenized_data(json_train_set, tk)

        # because the process of converting to json merges the train and test set,
        # if a test set exists, we need to merge them too in this test.
        for a, b in zip(x_tr, x_tr1):
            assert len(a[0]) == len(b) == 3
            assert set(str(f) for f in a[0].nodes()) == set(b)
        np.testing.assert_array_equal(y_tr, y_tr1)
    finally:
        # clean up even when an assertion above fails
        os.unlink(json_train_set)
def get_tokenizer_settings_from_conf_file(conf_file):
    """Parse *conf_file* and return the tokenizer settings it defines."""
    parsed, _ = parse_config_file(conf_file)
    return get_tokenizer_settings_from_conf(parsed)
def is_valid_file(arg):
    """argparse ``type=`` checker: return *arg* if it exists on disk, else abort
    with a parser error (relies on the module-global ``parser``)."""
    if os.path.exists(arg):
        return arg
    parser.error("The conf file %s does not exist!" % arg)


if __name__ == '__main__':
    # parse command-line arguments (conf file only)
    parser = argparse.ArgumentParser(description='Evaluate vector via document classification')
    parser.add_argument('conf_file', help='Conf file that defines the experiment', type=is_valid_file)
    args = parser.parse_args()
    conf, configspec_file = parse_config_file(args.conf_file)
    mkdirs_if_not_exists(conf['output_dir'])

    # one format string shared by the file and console handlers
    log_fmt = "%(asctime)s\t%(module)s.%(funcName)s (line %(lineno)d)\t%(levelname)s : %(message)s"

    # set up logging to file
    logging.basicConfig(level=logging.INFO,
                        format=log_fmt,
                        datefmt='%m-%d %H:%M',
                        filename=os.path.join(conf['output_dir'], 'log.txt'),
                        filemode='w')

    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # tell the handler to use the same format as the file log
    console.setFormatter(logging.Formatter(log_fmt))
    # NOTE(review): `console` is configured but never attached via
    # logging.getLogger('').addHandler(console) in the visible code —
    # confirm whether that happens later in the file.
def conf():
    """Parse the module-level ``conf_file``, ensure its output directory
    exists, and return the parsed configuration."""
    parsed, _ = parse_config_file(conf_file)
    mkdirs_if_not_exists(parsed['output_dir'])
    return parsed