def test_jsonify_XML_corpus():
    conf_file = 'tests/resources/conf/exp0/exp0.conf'
    conf, _ = parse_config_file(conf_file)
    train_set = conf['training_data']
    json_train_set = train_set + '.gz'
    tk = get_tokenizer_settings_from_conf(conf)

    # parse the XML directly
    x_tr, y_tr, _, _ = get_tokenized_data(train_set, tk)

    jsonify_single_labelled_corpus('unit_tests', train_set, conf_file)
    x_tr1, y_tr1, _, _ = get_tokenized_data(json_train_set, tk)

    # because the process of converting to json merges the train and test set, if a test set exists,
    # we need to merge them too in this test
    for a, b in zip(x_tr, x_tr1):
        assert len(a[0]) == len(b) == 3
        assert set(str(f) for f in a[0].nodes()) == set(b)
    np.testing.assert_array_equal(y_tr, y_tr1)
    os.unlink(json_train_set)
def jsonify_single_labelled_corpus(corpus_name, corpus_path,
                                   conf_file=None, tokenizer_conf=None,
                                   unigram_features=set('JNV'),
                                   phrase_features=set(['AN', 'NN', 'VO', 'SVO']),
                                   write_feature_set=False):
    """
    Tokenizes an entire XML/CoNLL corpus (sentence segmented and dependency parsed), incl. its
    train and test chunks, and writes its content to a single JSON gzip-ed file, one document
    per line. Each line is a JSON array: the first value is the label of the document and the
    second is a list of all document features of interest, e.g. nouns, adjectives, NPs, VPs, etc.
    The resulting file can be loaded with a GzippedJsonTokenizer.

    :param corpus_path: path to the corpus
    """

    def _write_corpus_to_json(x_tr, y_tr):
        extr = FeatureExtractor(extract_unigram_features=unigram_features,
                                extract_phrase_features=phrase_features)
        documents = []
        for doc in x_tr:
            documents.append([str(f) for f in extr.extract_features_from_tree_list(doc)])

        for document, label in zip(documents, y_tr):
            outfile.write(bytes(json.dumps([label, document]), 'UTF8'))
            outfile.write(bytes('\n', 'UTF8'))
        return set(feat for doc in documents for feat in doc)

    # load the dataset from XML/JSON/CoNLL
    if conf_file:
        conf = get_tokenizer_settings_from_conf_file(conf_file)
    elif tokenizer_conf:
        conf = tokenizer_conf
    else:
        raise ValueError('Must provide a dict or a file containing tokenizer config')
    x_tr, y_tr, x_test, y_test = get_tokenized_data(corpus_path, conf)

    with gzip.open('%s.gz' % corpus_path, 'wb') as outfile:
        feats = _write_corpus_to_json(x_tr, y_tr)
        logging.info('Writing %s to gzip json', corpus_path)
        if x_test:
            feats |= _write_corpus_to_json(x_test, y_test)
    if write_feature_set:
        _write_features_of_single_corpus_to_file(feats, corpus_name)
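# For reference, each line that _write_corpus_to_json emits is a two-element JSON array of the
# form [label, [feature, feature, ...]]. The sketch below shows how such a line could be decoded
# with the standard library alone, independently of GzippedJsonTokenizer; the file name
# 'my_corpus.gz' is a placeholder, not a path used elsewhere in this code.
import gzip
import json

with gzip.open('my_corpus.gz', 'rb') as infile:
    for line in infile:
        label, features = json.loads(line.decode('UTF8'))  # one document per line
        print(label, features[:5])
        break  # only inspect the first document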
def test_distributional_with_vector_clusters(conf, tmpdir):
    # generate random vectors for the appropriate features and cluster them first
    x_tr, _, _, _ = get_tokenized_data(conf['training_data'], conf['tokenizer'])
    feats = FeatureExtractor().extract_features_from_tree_list([foo[0] for foo in x_tr])
    vectors = np.random.random((len(feats), 10))
    v = DenseVectors(pd.DataFrame(vectors, index=feats))
    tmpfile = str(tmpdir.join('tmp_random_vectors'))
    v.to_tsv(tmpfile, dense_hd5=True)

    tmpclusters = str(tmpdir.join('tmp_random_clusters'))
    cluster_vectors(tmpfile, tmpclusters, n_clusters=5, n_jobs=1)

    conf['vector_sources']['neighbours_file'] = []
    conf['vectorizer']['class'] = 'eval.pipeline.multivectors.KmeansVectorizer'
    conf['vector_sources']['clusters_file'] = tmpclusters
    # the features of the document are cluster ids, not phrases,
    # so there is no point in checking if they are in the thesaurus
    conf['feature_selection']['must_be_in_thesaurus'] = False

    for debug_level in [0, 1, 2]:
        conf['debug_level'] = debug_level
        run_experiment(conf)
def run_experiment(conf):
    start_time = datetime.now()
    mkdirs_if_not_exists(conf['output_dir'])

    test_path = ''
    tr_data = conf['training_data']
    if conf['test_data']:
        test_path = conf['test_data']

    # LOADING RAW TEXT
    x_tr, y_tr, x_test, y_test = get_tokenized_data(tr_data,
                                                    get_tokenizer_settings_from_conf(conf),
                                                    test_data=test_path)

    # CREATE CROSSVALIDATION ITERATOR
    cv_iterator, y_vals = _build_crossvalidation_iterator(conf['crossvalidation'],
                                                          y_tr, y_test)
    if x_test is not None:
        # concatenate all data, the CV iterator will make sure x_test is used for testing
        x_vals = list(x_tr)
        x_vals.extend(list(x_test))
    else:
        x_vals = x_tr

    all_scores = []
    params = []
    for i, (train_idx, test_idx) in enumerate(cv_iterator):
        params.append((conf, i, multiple_scores, test_idx, train_idx, x_vals, y_vals))
        if conf['crossvalidation']['break_after_first']:
            # only do one train/test split to save time
            logging.warning('Only using the first CV fold')
            logging.info('Exiting after first fold')
            break

    scores_over_cv = [_cv_loop(*foo) for foo in params]
    all_scores.extend([score for one_set_of_scores in scores_over_cv
                       for score in one_set_of_scores])
    _store_scores(all_scores, conf['output_dir'], conf['name'])

    total_time = (datetime.now() - start_time).total_seconds() / 60
    logging.info('MINUTES TAKEN %.2f', total_time)
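# The configuration keys that run_experiment reads directly are visible above. A hypothetical,
# minimal conf dict might look like the sketch below; every value is a placeholder, and the real
# experiments build conf by parsing a .conf file (see parse_config_file) rather than by hand.
conf_sketch = {
    'name': 'exp0',
    'training_data': 'tests/resources/corpora/train.xml',  # placeholder path
    'test_data': '',                                       # falsy -> no separate test set loaded
    'output_dir': 'conf/exp0/output',                      # created by mkdirs_if_not_exists
    'debug_level': 0,
    'crossvalidation': {
        'break_after_first': True,  # stop after the first train/test split
        # ... remaining settings are consumed by _build_crossvalidation_iterator
    },
    # ... plus the tokenizer, vectorizer, vector_sources and feature_selection
    # sections used elsewhere in the pipeline
}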
def _vectorize_data(data_paths, config, dummy=False):
    # tsv_file and tokenizer_opts are defined at module level in the test file
    if dummy:
        config['vector_sources']['dummy_thesaurus'] = True
        config['vector_sources']['neighbours_file'] = []
    else:
        config['vector_sources']['neighbours_file'] = [tsv_file]

    config['vector_sources']['neighbour_strategy'] = 'linear'
    config['name'] = 'test_main'
    config['debug_level'] = 2
    config['output_dir'] = '.'

    pipeline, fit_params = evaluate._build_pipeline(config, 12345)

    x_tr, y_tr, x_test, y_test = get_tokenized_data(data_paths[0],
                                                    tokenizer_opts,
                                                    test_data=data_paths[1])

    x1 = pipeline.fit_transform(x_tr, y_tr, **fit_params)
    if 'fs' in pipeline.named_steps:
        pipeline.named_steps['vect'].vocabulary_ = pipeline.named_steps['fs'].vocabulary_
    # read the vocabulary from the vectorizer step, which now holds the feature-selected
    # vocabulary if a feature selection step was present
    voc = pipeline.named_steps['vect'].vocabulary_
    x2 = pipeline.transform(x_test)

    return x1, x2, voc
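# Hypothetical usage sketch of _vectorize_data: data_paths is a (train, test) pair of corpus
# paths and config is a parsed configuration dict. tsv_file and tokenizer_opts are assumed to be
# set up at module level as in the test file; the paths below are placeholders.
def example_vectorize_call(config):
    data_paths = ('tests/resources/corpora/train.xml',
                  'tests/resources/corpora/test.xml')
    # dummy=True makes _vectorize_data fall back to the dummy thesaurus instead of tsv_file
    return _vectorize_data(data_paths, config, dummy=True)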