Пример #1
0
def data(request):
    """
    Return the paths to a labelled training/evaluation dataset pair on disk.

    :param request: object whose ``param`` attribute selects the corpus
        format, either ``'xml'`` or ``'json'`` (presumably a pytest fixture
        request -- the decorator is not visible here, confirm at call site)
    :return: tuple ``(training_path, evaluation_path)``; for ``'json'`` both
        paths carry a ``.gz`` suffix
    :raises ValueError: if ``request.param`` is not a supported format
    """
    kind = request.param
    prefix = 'tests/resources/test-baseline'
    tr_path = '%s-tr' % prefix
    ev_path = '%s-ev' % prefix

    if kind == 'xml':
        # return the raw corpus in XML
        return tr_path, ev_path
    if kind == 'json':
        # convert corpus to gzipped JSON and hand back the converted files
        jsonify_single_labelled_corpus('unit_tests', tr_path, tokenizer_conf=tokenizer_opts)
        jsonify_single_labelled_corpus('unit_tests', ev_path, tokenizer_conf=tokenizer_opts)
        return tr_path + '.gz', ev_path + '.gz'

    # An unrecognised kind used to fall through and return None, which only
    # surfaced later as an unrelated unpacking error; fail loudly instead.
    raise ValueError('Unsupported dataset kind: %r' % (kind,))
Пример #2
0
def test_jsonify_XML_corpus():
    """
    Check that tokenizing the JSON-converted corpus gives the same features
    and labels as tokenizing the original XML corpus.

    The converted file is expected at ``<training_data>.gz`` and is removed
    when the test finishes, whether it passes or fails.
    """
    conf_file = 'tests/resources/conf/exp0/exp0.conf'
    conf, _ = parse_config_file(conf_file)
    train_set = conf['training_data']
    json_train_set = train_set + '.gz'
    tk = get_tokenizer_settings_from_conf(conf)

    # parse the XML directly
    x_tr, y_tr, _, _ = get_tokenized_data(train_set, tk)

    try:
        jsonify_single_labelled_corpus('unit_tests', train_set, conf_file)
        x_tr1, y_tr1, _, _ = get_tokenized_data(json_train_set, tk)

        # NOTE(review): the original comment here said that converting to
        # JSON merges the train and test set and that this test must merge
        # them too; only the training portion is compared below -- confirm
        # whether that note is still relevant.
        # NOTE(review): zip stops at the shorter sequence, so a length
        # mismatch is presumably only caught by the label comparison.
        for a, b in zip(x_tr, x_tr1):
            assert len(a[0]) == len(b) == 3
            assert {str(f) for f in a[0].nodes()} == set(b)
        np.testing.assert_array_equal(y_tr, y_tr1)
    finally:
        # Clean up even when an assertion fails so a stale converted file
        # cannot leak into later runs; the existence check keeps a failed
        # conversion from being masked by a FileNotFoundError.
        if os.path.exists(json_train_set):
            os.unlink(json_train_set)