def data(request):
    """Return (train_path, eval_path) for a labelled dataset on disk.

    Parametrised via ``request.param``:

    - ``'xml'``  — hand back the raw XML corpus paths untouched.
    - ``'json'`` — convert both corpora to gzipped JSON first and hand
      back the corresponding ``.gz`` paths.
    """
    corpus_format = request.param
    prefix = 'tests/resources/test-baseline'
    tr_path = '%s-tr' % prefix
    ev_path = '%s-ev' % prefix

    if corpus_format == 'xml':
        # raw corpus in XML — nothing to convert
        return tr_path, ev_path

    if corpus_format == 'json':
        # convert each corpus to gzipped JSON, then point at the .gz files
        for labelled_path in (tr_path, ev_path):
            jsonify_single_labelled_corpus('unit_tests', labelled_path,
                                           tokenizer_conf=tokenizer_opts)
        return tr_path + '.gz', ev_path + '.gz'
def test_jsonify_XML_corpus():
    """Round-trip check: tokenising the gzipped-JSON conversion of a corpus
    must yield the same documents and labels as parsing the XML directly.
    """
    config_path = 'tests/resources/conf/exp0/exp0.conf'
    config, _ = parse_config_file(config_path)
    xml_corpus = config['training_data']
    json_corpus = xml_corpus + '.gz'
    tok_settings = get_tokenizer_settings_from_conf(config)

    # tokenise the raw XML corpus directly
    docs_xml, labels_xml, _, _ = get_tokenized_data(xml_corpus, tok_settings)
    # convert to gzipped JSON, then tokenise the converted corpus
    jsonify_single_labelled_corpus('unit_tests', xml_corpus, config_path)
    docs_json, labels_json, _, _ = get_tokenized_data(json_corpus, tok_settings)

    # NOTE: converting to JSON merges the train and test set when a test set
    # exists, so this comparison relies on the sets being merged consistently.
    for xml_doc, json_doc in zip(docs_xml, docs_json):
        assert len(xml_doc[0]) == len(json_doc) == 3
        assert set(str(node) for node in xml_doc[0].nodes()) == set(json_doc)
    np.testing.assert_array_equal(labels_xml, labels_json)

    # clean up the converted corpus so the test leaves no artefacts behind
    os.unlink(json_corpus)