Example #1
0
def test_jsonify_XML_corpus():
    """Check that JSON-ifying an XML corpus preserves documents and labels.

    Tokenizes the XML training set directly, converts it to the gzipped
    JSON format, tokenizes that, and verifies both paths yield the same
    per-document features and identical label arrays.
    """
    conf_file = 'tests/resources/conf/exp0/exp0.conf'
    conf, _ = parse_config_file(conf_file)
    xml_corpus = conf['training_data']
    json_corpus = xml_corpus + '.gz'
    tokenizer_conf = get_tokenizer_settings_from_conf(conf)

    # Tokenize the original XML corpus directly.
    x_xml, y_xml, _, _ = get_tokenized_data(xml_corpus, tokenizer_conf)

    # Convert the corpus to JSON, then tokenize the converted version.
    jsonify_single_labelled_corpus('unit_tests', xml_corpus, conf_file)
    x_json, y_json, _, _ = get_tokenized_data(json_corpus, tokenizer_conf)

    # because the process of converting to json merges the train and test set, if a test set exists,
    # we need to merge them too in this test.
    for xml_doc, json_doc in zip(x_xml, x_json):
        assert 3 == len(xml_doc[0]) == len(json_doc)
        assert set(json_doc) == {str(feat) for feat in xml_doc[0].nodes()}
    np.testing.assert_array_equal(y_xml, y_json)
    os.unlink(json_corpus)
Example #2
0
def run_experiment(conf):
    """Run the cross-validated experiment described by *conf*.

    Loads and tokenizes the training data (and the optional test set),
    builds a cross-validation iterator over the combined data, evaluates
    each fold, stores all scores under ``conf['output_dir']``, and logs
    the total run time in minutes.

    :param conf: parsed experiment configuration; must provide the keys
        ``output_dir``, ``training_data``, ``test_data``, ``crossvalidation``
        and ``name``.
    """
    start_time = datetime.now()
    mkdirs_if_not_exists(conf['output_dir'])
    test_path = ''
    tr_data = conf['training_data']
    if conf['test_data']:
        test_path = conf['test_data']

    # LOADING RAW TEXT
    x_tr, y_tr, x_test, y_test = get_tokenized_data(tr_data,
                                                    get_tokenizer_settings_from_conf(conf),
                                                    test_data=test_path)

    # CREATE CROSSVALIDATION ITERATOR
    cv_iterator, y_vals = _build_crossvalidation_iterator(conf['crossvalidation'],
                                                          y_tr, y_test)
    if x_test is not None:
        # concatenate all data, the CV iterator will make sure x_test is used for testing
        x_vals = list(x_tr)
        x_vals.extend(list(x_test))
    else:
        x_vals = x_tr

    all_scores = []
    params = []
    for i, (train_idx, test_idx) in enumerate(cv_iterator):
        params.append((conf, i, multiple_scores, test_idx, train_idx, x_vals, y_vals))
        if conf['crossvalidation']['break_after_first']:
            # only do one train/test split to save time
            # BUG FIX: this warning used to fire unconditionally on every
            # fold, even when all folds were actually run; it now fires
            # only when we genuinely stop after the first fold.
            logging.warning('Only using the first CV fold')
            break

    scores_over_cv = [_cv_loop(*foo) for foo in params]
    # flatten the per-fold score lists into one list
    all_scores.extend(score for fold_scores in scores_over_cv for score in fold_scores)
    _store_scores(all_scores, conf['output_dir'], conf['name'])
    total_time = (datetime.now() - start_time).seconds / 60
    logging.info('MINUTES TAKEN %.2f' % total_time)