Example No. 1
def test_loading_from_h5():
    t1 = Vectors.from_tsv('discoutils/tests/resources/exp0-0a.strings')
    t2 = Vectors.from_tsv('discoutils/tests/resources/exp0-0a.strings.h5')
    for k in t1.keys():
        assert k in t2
        v1 = t1.get_vector(k)
        v2 = t2.get_vector(k)
        np.testing.assert_array_equal(v1.A, v2.A)
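The same round trip can be reproduced with the calls shown elsewhere in these examples (to_tsv with dense_hd5=True writes the HDF variant, and from_tsv reads it back). The file names below are placeholders, so treat this as a sketch rather than part of the test suite.

import numpy as np
from discoutils.thesaurus_loader import Vectors

v = Vectors.from_tsv('discoutils/tests/resources/exp0-0a.strings')
v.to_tsv('exp0-0a.copy.h5', dense_hd5=True)   # write an HDF copy of the same vectors
v2 = Vectors.from_tsv('exp0-0a.copy.h5')      # from_tsv also reads the HDF representation
for k in v.keys():
    np.testing.assert_array_equal(v.get_vector(k).A, v2.get_vector(k).A)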
Example No. 2
def merge_vectors(composed_dir, unigrams, output, workers=4, chunk_size=10000):
    # this particular dataset uses spaces instead of underscores. State this to avoid parsing issues
    DocumentFeature.ngram_separator = " "
    DIMS = 100  # SVD dimensionality

    files = glob(os.path.join(composed_dir, "*apt.vec.gz"))
    logging.info("Found %d composed phrase files", len(files))

    # ignore anything that isn't a unigram; it will cause problems later
    unigrams = Vectors.from_tsv(unigrams, row_filter=lambda x, y: y.type == "1-GRAM")
    logging.info("Found %d unigram vectors", len(unigrams))

    mat, cols, rows = unigrams.to_sparse_matrix()
    unigrams.v.vocabulary_ = {x: i for i, x in enumerate(list(cols))}
    cols = set(cols)
    svd = TruncatedSVD(DIMS, random_state=0)
    logging.info("Reducing dimensionality of matrix of shape %r...", mat.shape)
    start = time.time()
    reduced_mat = svd.fit_transform(mat)
    logging.info(
        "Reduced using {} from shape {} to shape {} in {} seconds".format(
            svd, mat.shape, reduced_mat.shape, time.time() - start
        )
    )
    write_vectors_to_hdf(
        reduced_mat,
        rows,
        ["SVD:feat{0:03d}".format(i) for i in range(reduced_mat.shape[1])],
        "%s-unigrams-SVD%d" % (output, DIMS),
    )
    del mat

    for i, chunk in enumerate(grouper(chunk_size, files)):
        d = {}
        logging.info("Reading composed vectors, chunk %d...", i)
        for phrase, features in Parallel(n_jobs=workers)(delayed(_read_vector)(f) for f in chunk if f):
            if features:
                d[phrase] = features

        if not d:
            continue
        logging.info("Found %d non-empty composed vectors in this chunk, running SVD now...", len(d))

        composed_vec = Vectors(d, column_filter=lambda foo: foo in cols)
        # vectorize second matrix with the vocabulary (columns) of the first thesaurus to ensure shapes match
        # "project" composed matrix into space of unigram thesaurus
        extra_matrix = unigrams.v.transform([dict(fv) for fv in composed_vec.values()])
        assert extra_matrix.shape == (len(composed_vec), len(cols))
        logging.info("Composed matrix is of shape %r before SVD", extra_matrix.shape)

        extra_matrix = svd.transform(extra_matrix)
        write_vectors_to_hdf(
            extra_matrix,
            list(composed_vec.keys()),
            ["SVD:feat{0:03d}".format(i) for i in range(extra_matrix.shape[1])],
            "%s-phrases-chunk%d-SVD%d" % (output, i, DIMS),
        )
        del composed_vec
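A hypothetical invocation of merge_vectors, with placeholder paths. Per the code above, it writes one '<output>-unigrams-SVD100' store plus one '<output>-phrases-chunkN-SVD100' store per chunk of composed files.

merge_vectors(composed_dir='composed/',                       # directory of *apt.vec.gz files
              unigrams='unigrams.events.filtered.strings',    # TSV store of 1-GRAM vectors
              output='merged',                                # prefix of the HDF output files
              workers=4,
              chunk_size=10000)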
Example No. 3
def test_loading_unordered_feature_lists(tmpdir):
    d = {
        'a/N': [('f1', 1), ('f2', 2), ('f3', 3)],
        'b/N': [('f3', 3), ('f1', 1), ('f2', 2), ],
        'c/N': [('f3', 3), ('f2', 2), ('f1', 1)],
    }  # three identical vectors
    v = Vectors(d)
    filename = str(tmpdir.join('outfile.txt'))
    v.to_tsv(filename)

    v1 = v.from_tsv(filename)
    assert v.columns == v1.columns # rows can be in any order, but columns need to be sorted
    for word in d.keys():
        assert_array_equal(v.get_vector(word).A, v1.get_vector(word).A)
def density_window(vectors,
                   words=None,
                   num_neighbours=10,
                   window_size=0.1,
                   alpha='auto',
                   nn_metric='cosine',
                   **kwargs):
    """
    Perform smoothing by associative inference.
    :param vectors: Original elementary APTs
    :param words: Lexemes of interest to apply distributional inference on (pass None for all lexemes)
    :param num_neighbours: Maximum number of neighbours used for distributional inference
    :param window_size: proportional distance to the nearest neighbour, defining the Parzen window for each vector individually (default=0.1)
    :param alpha: weighting of the original vector (default='auto', which multiplies the original vector by `num_neighbours`)
    :param nn_metric: nearest-neighbour metric to use (default='cosine'; supported are 'cosine' and 'euclidean')
    :return: tuple of (initialised Vectors object, dict of smoothed APT vectors)
    """
    smoothed_vectors = {}
    if (isinstance(vectors, Vectors)):
        disco_vectors = vectors
    else:  # Passive-Aggressive-Defensive loading cascade
        if (isinstance(vectors, dict)):
            disco_vectors = Vectors.from_dict_of_dicts(vectors)
        else:
            raise ValueError(
                'Unsupported type [{}] for `vectors` supplied. Supported types are '
                '[`discoutils.thesaurus_loader.Vectors` and `dict`]!'.format(type(vectors)))

    if (not kwargs.pop('is_initialised', False)):
        disco_vectors.init_sims(
            n_neighbors=num_neighbours,
            nn_metric=nn_metric,
            knn='brute' if nn_metric == 'cosine' else 'kd_tree')

    words = words if words is not None else vectors.keys()

    a = alpha if alpha != 'auto' else num_neighbours
    for w in words:
        if (w not in disco_vectors): continue
        # Retrieve top neighbour
        top_neighbour = disco_vectors.get_nearest_neighbours(w)[0]

        # Anything within `distance_threshold` is still considered for inference
        distance_threshold = top_neighbour[1] * (1 + window_size)

        neighbours = []
        for neighbour, distance in disco_vectors.get_nearest_neighbours(w):
            if (distance > distance_threshold): break

            neighbours.append((neighbour, distance))

        # Enrich original vector
        apt = disco_vectors.get_vector(w) * a

        for neighbour, _ in neighbours:
            apt += disco_vectors.get_vector(neighbour)

        smoothed_vectors[w] = apt.copy()

    return disco_vectors, smoothed_vectors
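A minimal usage sketch for density_window, assuming the discoutils imports used above are available. The APT-style feature names and weights below are made up; a plain dict of dicts is accepted and converted via Vectors.from_dict_of_dicts.

toy_vectors = {
    'dog/N':  {'amod:big': 3, 'dobj:walk': 5},
    'cat/N':  {'amod:big': 2, 'dobj:feed': 4},
    'wolf/N': {'amod:wild': 6, 'dobj:hunt': 1},
}
disco, smoothed = density_window(toy_vectors, words=['dog/N'],
                                 num_neighbours=2, window_size=0.1)
# smoothed['dog/N'] is the original vector scaled by alpha plus all neighbours
# whose distance falls within the Parzen window of the nearest one
print(smoothed['dog/N'])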
def train_verb_tensors(svos_file, noun_vectors_file, output_filename):
    """
    Trains Verb-bar matrices, as described in Milajevs et al (EMNLP-14, §3)
    :param svos_file: file containing a list of all SVOs in unlabelled data, one per line. May contain other document
     features too. Such a file is output by `find_all_NPs.py`, which is called from `observed_vectors.py`
    :param noun_vectors_file: a vector store containing noun vectors
    :param output_filename: name of the output file; it must identify the noun vectors and the unlabelled corpus
    """
    mkdirs_if_not_exists(os.path.dirname(output_filename))

    v = Vectors.from_tsv(noun_vectors_file)

    with open(svos_file) as infile:
        phrases = set()
        for line in infile:
            if DocumentFeature.from_string(line.strip()).type == 'SVO':
                phrases.add(tuple(line.strip().split('_')))
    phrases = [(subj, verb, obj) for subj, verb, obj in phrases if subj in v and obj in v]
    phrases = sorted(phrases, key=itemgetter(1))
    logging.info('Found %d SVOs in list', len(phrases))

    verb_tensors = dict()
    for verb, svos in groupby(phrases, itemgetter(1)):
        svos = list(svos)
        if len(svos) < MIN_SVO_PER_VERB:
            continue
        logging.info('Training matrix for %s from %d SVOs', verb, len(svos))
        # use the builtin sum: np.sum over a generator is deprecated and unreliable
        vt = sum(np.outer(v.get_vector(subj).A, v.get_vector(obj).A) for subj, _, obj in svos)
        verb_tensors[verb] = vt

    logging.info('Trained %d verb matrices, saving...', len(verb_tensors))
    for verb, tensor in verb_tensors.items():
        df = pd.DataFrame(tensor)
        df.to_hdf(output_filename, verb.split('/')[0], complevel=9, complib='zlib')
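The core of the training step is just a sum of outer products. The toy sketch below (made-up 3-dimensional noun vectors) shows the shape of the resulting verb matrix.

import numpy as np

nouns = {'dog/N': np.array([1., 0., 2.]), 'bone/N': np.array([0., 3., 1.])}
svos = [('dog/N', 'chew/V', 'bone/N')]
# one (dims x dims) matrix per verb, accumulated over all of its SVO triples
verb_matrix = sum(np.outer(nouns[subj], nouns[obj]) for subj, _, obj in svos)
print(verb_matrix.shape)   # (3, 3)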
def wordnet_synsets(vectors,
                    words,
                    num_neighbours,
                    alpha='auto',
                    nn_metric='cosine',
                    **kwargs):
    """
    Perform smoothing by associative inference using WordNet synsets.
    :param vectors: Original elementary APTs
    :param words: Lexemes of interest to apply distributional inference on (pass None for all lexemes); note these need to be (word, pos) tuples
    :param num_neighbours: Maximum number of neighbours used for distributional inference
    :param alpha: weighting of the original vector (default='auto', which multiplies the original vector by `num_neighbours`)
    :param nn_metric: nearest-neighbour metric to use (default='cosine'; supported are 'cosine' and 'euclidean')
    :return: tuple of (initialised Vectors object, dict of smoothed APT vectors)
    """
    smoothed_vectors = {}
    if (isinstance(vectors, Vectors)):
        disco_vectors = vectors
    else:  # Passive-Aggressive-Defensive loading cascade
        if (isinstance(vectors, dict)):
            disco_vectors = Vectors.from_dict_of_dicts(vectors)
        else:
            raise ValueError(
                'Unsupported type [{}] for `vectors` supplied. Supported types are '
                '[`discoutils.thesaurus_loader.Vectors` and `dict`]!'.format(type(vectors)))

    if (not kwargs.pop('is_initialised', False)):
        disco_vectors.init_sims(
            n_neighbors=num_neighbours,
            nn_metric=nn_metric,
            knn='brute' if nn_metric == 'cosine' else 'kd_tree')

    words = words if words is not None else vectors.keys()

    a = alpha if alpha != 'auto' else num_neighbours
    for w, pos in words:
        if (w not in disco_vectors): continue
        neighbours = set()
        for syn in wordnet.synsets(w, pos=pos):
            n = syn.name().split('.')[0]
            if (n != w):
                neighbours.add(n)

        # Get indices of neighbours
        idx = []
        for i, n in enumerate(neighbours, 1):
            if (i > num_neighbours): break
            if (n in disco_vectors):
                idx.append(disco_vectors.name2row[n])

        # dtype=int keeps the indexing valid even when no neighbour is in the vector set
        A = disco_vectors.matrix[np.array(idx, dtype=int)]

        # Retrieve vector for `w` and add `A` to it and apply alpha weighting to original APT
        apt = sparse.csr_matrix(
            disco_vectors.get_vector(w).multiply(a) +
            A.sum(axis=0))  # Should still be sparse enough

        smoothed_vectors[w] = apt.copy()

    return disco_vectors, smoothed_vectors
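The WordNet half of the procedure in isolation, using NLTK directly (requires the wordnet corpus to be downloaded); the word/PoS pair is an arbitrary example.

from nltk.corpus import wordnet

w, pos = 'dog', 'n'
neighbours = {syn.name().split('.')[0]
              for syn in wordnet.synsets(w, pos=pos)
              if syn.name().split('.')[0] != w}
print(neighbours)   # synset head lemmas for 'dog', excluding 'dog' itself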
def _do_feature_selection(must_be_in_thesaurus, k, handler='Base', vector_source='default', max_feature_len=1,
                          delete_kid=False):
    """
    Loads a data set, vectorizes it by extracting n-grams (default n=1) using a feature handler (default
    BaseFeatureHandler) and then performs feature selection based on either a vector source or on chi2 scores.
    Returns the encode/decode matrices and the stripped vocabulary of the Vectorizer after feature selection.

    The vector source by default has a unigrams source that covers all unigrams in the training set
    (feature vectors are made up), and does not know about n-grams. Optionally, another vector
    source can be passed in.
    """
    handler_pattern = 'eval.pipeline.feature_handlers.{}FeatureHandler'
    raw_data, data_ids = load_text_data_into_memory(
        training_path='tests/resources/test-tr',
        test_path='tests/resources/test-ev',
    )

    tokenizer = XmlTokenizer()
    x_train, y_train, x_test, y_test = tokenize_data(raw_data, tokenizer, data_ids)

    if vector_source == 'default':
        unigrams_vect = Vectors.from_tsv('tests/resources/thesauri/exp0-0a.txt.events-unfiltered.strings')
        vector_source = unigrams_vect

    if delete_kid:
        # the set of vectors we load from disk covers all unigrams in the training set, which makes it boring,
        # so remove one entry (NB: this assumes the default vector source was loaded above)
        del unigrams_vect['kid/N']
        unigrams_vect.matrix = unigrams_vect.matrix[:, :-1]

    if max_feature_len == 1:
        # extract only unigram features
        feat_extr_opts = {'extract_unigram_features': ['J', 'N', 'V'],
                          'extract_phrase_features': []}
        standard_ngram_features = 0
    else:
        feat_extr_opts = {'extract_unigram_features': ['J', 'N', 'V'],
                          'extract_phrase_features': ['AN', 'NN', 'VO', 'SVO']}
        standard_ngram_features = max_feature_len

    feature_extractor = FeatureExtractor(standard_ngram_features=standard_ngram_features).update(**feat_extr_opts)
    pipeline_list = [
        ('vect',
         ThesaurusVectorizer(min_df=1, use_tfidf=False,
                             decode_token_handler=handler_pattern.format(handler))),
        ('fs', VectorBackedSelectKBest(must_be_in_thesaurus=must_be_in_thesaurus, k=k)),
        ('dumper', FeatureVectorsCsvDumper('fs-test'))
    ]
    p = Pipeline(pipeline_list)
    fit_params = {'vect__vector_source': vector_source,
                  'vect__train_time_extractor':feature_extractor,
                  'vect__decode_time_extractor':feature_extractor,
                  'fs__vector_source': vector_source}

    tr_matrix, tr_voc = p.fit_transform(x_train, y_train, **fit_params)
    if 'fs' in p.named_steps:
        p.named_steps['vect'].vocabulary_ = p.named_steps['fs'].vocabulary_
    ev_matrix, ev_voc = p.transform(x_test)
    return tr_matrix.A, strip(tr_voc), ev_matrix.A, strip(ev_voc)
Example No. 8
def test_loading_dict_of_dicts():
    d = {
        'monday': {
            'det:the': 23,
            'amod:terrible': 321
        },
        'tuesday': {
            'amod:awful': 231,
            'det:a': 12
        }
    }
    v = Vectors(d)

    v1 = v.from_dict_of_dicts(d)
    assert v.columns == v1.columns
    for word in d.keys():
        assert_array_equal(v.get_vector(word).A, v1.get_vector(word).A)
Example No. 9
def cluster_vectors(path_to_vectors, output_path, n_clusters=100, noise=0, n_jobs=4):
    vectors = Vectors.from_tsv(path_to_vectors, noise=noise)
    km = KMeans(n_clusters=n_clusters, n_jobs=n_jobs, random_state=0, verbose=1)
    clusters = km.fit_predict(vectors.matrix)
    num2word = np.array(vectors.row_names)
    idx = np.argsort(num2word)
    df = pd.DataFrame(dict(clusters=clusters[idx]), index=num2word[idx])
    df.to_hdf(output_path, key='clusters', complevel=9, complib='zlib')
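The same clustering step on a random matrix, to make the shapes explicit; only scikit-learn, numpy and pandas are needed, and all inputs are synthetic.

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
matrix = rng.rand(50, 10)                                  # 50 "words", 10 dimensions
row_names = np.array(['w%02d' % i for i in range(50)])
clusters = KMeans(n_clusters=5, random_state=0).fit_predict(matrix)
df = pd.DataFrame(dict(clusters=clusters), index=row_names)
print(df.head())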
Example No. 10
def _generate_hdf_gzip_repr(kind, tmpdir, v):
    if kind == 'txt':
        # just read the plaintext file
        return v
    else:
        outfile = str(tmpdir.join('events.txt'))
        if kind == 'gz':
            v.to_tsv(outfile, gzipped=True)
        if kind == 'hdf':
            v.to_tsv(outfile, dense_hd5=True)
        return Vectors.from_tsv(outfile)
Example No. 11
def _translate_byblo_to_dissect(events_file, row_transform=lambda x: x):
    """
    Translates Byblo-made vectors file to dissect format in the absence of features/entries files
    :param events_file: path to byblo-made vectors
    :type events_file: str
    :return: prefix of dissect-compatible data files
    :rtype: str
    """
    # remove duplicate head noun vectors, converting to a dissect sparse matrix format
    logging.info('Converting %s to DISSECT format', events_file)
    t = Vectors.from_tsv(events_file)
    t.to_dissect_sparse_files(events_file, row_transform=row_transform)
Example No. 12
def test_all_neighbours_overlap(call_init):
    FEATURE = 'daily/J_pais/N'
    v = Vectors.from_tsv('tests/resources/only_overlapping.txt', allow_lexical_overlap=False)
    mv = MultiVectors([v] * 3)
    if call_init:
        mv.init_sims()
    assert FEATURE in v
    assert FEATURE in mv  # feature is contained in vector set, but...
    # when we look up its neighbours, they all overlap, so nothing is left
    assert mv.get_nearest_neighbours(FEATURE) == []
    assert mv.get_nearest_neighbours('asdf') == []

    assert mv.get_nearest_neighbours('pais/N') is not None
def write_gensim_vectors_to_tsv(model, output_path, vocab=None):
    # get word2vec vectors for each word, write to TSV
    if not vocab:
        vocab = model.vocab.keys()
    vectors = dict()

    dims = len(model[next(iter(vocab))])  # vector dimensionality
    dimension_names = ['f%02d' % i for i in range(dims)]
    for word in vocab:
        # watch for non-DocumentFeatures, these break to_tsv
        # also ignore words with non-ascii characters
        # if DocumentFeature.from_string(word).type == 'EMPTY': # todo assumes there is a PoS tag
        # logging.info('Ignoring vector for %s', word)
        # continue
        vectors[word] = zip(dimension_names, model[word])
    vectors = Vectors(vectors)
    vectors.to_tsv(output_path, gzipped=True,
                   enforce_word_entry_pos_format=True,
                   entry_filter=lambda _: True,
                   dense_hd5=True)
    del model
    return vectors
def test_application_after_learning_with_selective_write(tmpdir):
    """
    Test if when SVD is trained on matrix A and applied to matrix B, and
    it is requested that just the reduced version of only A or B is output,
    the shape of the output is right
    """
    tmpfile = tmpdir.join('tmp.thesaurus')
    for w, exp_row_len in zip([1, 2, 3], [4, 5, 7]):
        do_svd('discoutils/tests/resources/exp0-0b.strings',
               tmpfile,
               reduce_to=[2], # some small number, not what we are testing for here
               apply_to='discoutils/tests/resources/exp0-0c.strings',
               write=w)
        t = Vectors.from_tsv(str(tmpfile) + '-SVD2.events.filtered.strings', lowercasing=False)
        mat, _, _ = t.to_sparse_matrix()
        assert mat.shape == (exp_row_len, 2)
def test_application_after_learning(tmpdir, first, second, exp_row_len):
    """
    Test that applying a learnt SVD to another matrix works. We are mostly interested in whether
    the matrix dimensions match; no exception should be raised. Other than that,
    this is a useless test
    """
    tmpfile = tmpdir.join('tmp.thesaurus')
    do_svd('discoutils/tests/resources/exp0-0%s.strings' % first,
           tmpfile,
           reduce_to=[2], # some small number, not what we are testing for here
           apply_to='discoutils/tests/resources/exp0-0%s.strings' % second)

    # when made into a thesaurus, the reduced matrix will have some duplicates
    # these will be summed out, leaving us with a matrix of a specific size
    t = Vectors.from_tsv(str(tmpfile) + '-SVD2.events.filtered.strings',
                           lowercasing=False)
    mat, cols, rows = t.to_sparse_matrix()
    assert mat.shape == (exp_row_len, 2)
Example No. 19
def test_vectors_to_tsv(vectors_c, tmpdir):
    """

    :type vectors_c: Vectors
    :type tmpdir: py.path.local
    """
    # these are feature vectors, columns(features) can be reordered
    filename = str(tmpdir.join('outfile.txt'))
    vectors_c.to_tsv(filename, gzipped=True)
    from_disk = Vectors.from_tsv(filename)

    if hasattr(vectors_c, 'df'):
        # this is in dense format
        np.testing.assert_array_equal(vectors_c.matrix, from_disk.matrix)
    else:
        # sparse format: can't just assert from_disk == vectors_c, because to_tsv may reorder the columns
        for k, v in vectors_c.items():
            assert k in from_disk.keys()
            assert set(v) == set(vectors_c[k])
Example No. 20
def test_from_pandas_data_frame(vectors_c):
    mat, cols, rows = vectors_c.to_sparse_matrix()
    df = DataFrame(mat.A, index=rows, columns=cols)
    v = Vectors.from_pandas_df(df)

    # compare the round-tripped copy against the original
    mat1, cols1, rows1 = v.to_sparse_matrix()
    assert rows == rows1
    assert cols == cols1
    np.testing.assert_almost_equal(mat.A, mat1.A)

    vectors_c.init_sims()
    v.init_sims()
    for entry in vectors_c.keys():
        np.testing.assert_almost_equal(vectors_c.get_vector(entry).A,
                                       v.get_vector(entry).A)

        n1 = [x[0] for x in vectors_c.get_nearest_neighbours(entry)]
        n2 = [x[0] for x in v.get_nearest_neighbours(entry)]
        print(entry, n1, n2)
        assert n1 == n2
Example No. 21
def build_thesaurus_out_of_vectors(vectors_path, out_dir, threads=4, num_neighbours=100, sim_function='Cosine'):
    """
    Builds a Byblo thesaurus out of the provided vectors, however these were constructed. This function will make an
    uncompressed copy of the provided vectors file, which might be slow and use up a lot of extra space.

    :param vectors_path: input vectors in Byblo format, compressed or not
    :param out_dir: where to put the thesaurus and all temp files
    :param threads: number of Byblo threads
    :param num_neighbours: number of nearest neighbours per entry to output
    :param sim_function: similarity measure between vectors to use; see the Byblo docs
    """
    from discoutils.thesaurus_loader import Vectors

    BYBLO_BASE_DIR = '/lustre/scratch/inf/mmb28/FeatureExtractionToolkit/Byblo-2.2.0'
    vectors_path = os.path.abspath(vectors_path)
    out_dir = os.path.abspath(out_dir)
    mkdirs_if_not_exists(out_dir)
    v = Vectors.from_tsv(vectors_path)

    # prepare the files that byblo expects
    outf_basename = os.path.join(out_dir, 'input')
    # outf_basename is already an absolute path under out_dir
    events_file = outf_basename + '.events.filtered.strings'
    entries_file = outf_basename + '.entries.filtered.strings'
    features_file = outf_basename + '.features.filtered.strings'

    v.to_plain_txt(events_file, entries_file, features_file)
    # write the byblo conf file
    conf = '--input {} --output {} --threads {} --similarity-min 0.01 -k {} ' \
           '--measure {} --stages allpairs,knn,unenumerate'.format(outf_basename, out_dir, threads,
                                                                   num_neighbours, sim_function)
    conf_path = os.path.join(out_dir, 'conf.txt')
    with open(conf_path, 'w') as outf:
        for line in conf.split():
            outf.write(line)
            outf.write('\n')

    # go baby go
    with temp_chdir(BYBLO_BASE_DIR):
        reindex_all_byblo_vectors(outf_basename)
        run_byblo(conf_path, touch_input_file=True)
        unindex_all_byblo_vectors(outf_basename)
Example No. 22
def _overlapping_vectors(request):
    return Vectors.from_tsv('discoutils/tests/resources/lexical-overlap-vectors.txt',
                            allow_lexical_overlap=request.param)
def ones_vectors_no_pos():
    return Vectors.from_tsv('tests/resources/ones.vectors.nopos.txt',
                            enforce_word_entry_pos_format=False)
def _do_ppmi(vectors_path, output_dir):
    v = Vectors.from_tsv(vectors_path)
    ppmi_sparse_matrix(v.matrix)
    v.to_tsv(join(output_dir, basename(vectors_path)), gzipped=True)
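For reference, the standard PPMI reweighting on a dense toy count matrix; whether ppmi_sparse_matrix implements exactly this definition is an assumption.

import numpy as np

counts = np.array([[3., 0., 1.],
                   [1., 2., 0.]])                 # word x context co-occurrence counts
total = counts.sum()
p_wc = counts / total                             # joint probabilities
p_w = p_wc.sum(axis=1, keepdims=True)             # word marginals
p_c = p_wc.sum(axis=0, keepdims=True)             # context marginals
with np.errstate(divide='ignore', invalid='ignore'):
    pmi = np.log(p_wc / (p_w * p_c))
ppmi = np.where(np.isfinite(pmi) & (pmi > 0), pmi, 0.0)  # clip negatives and -inf to 0
print(np.round(ppmi, 3))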
Example No. 25
def test_random_vectors(tmpdir):
    output = str(tmpdir.join('vectors.h5'))
    generate(output, 10)
    v = Vectors.from_tsv(output)
    assert v.matrix.shape[1] == 10
Example No. 26
def do_svd(input_path, output_prefix,
           desired_counts_per_feature_type=[('N', 8), ('V', 4), ('J', 4), ('RB', 2), ('AN', 2)],
           reduce_to=[3, 10, 15], apply_to=None, write=3, use_hdf=True):
    """

    Performs truncated SVD. A copy of the trained sklearn SVD estimator will also be saved.

    :param input_path: list of files containing vectors in TSV format. All vectors will be reduced together.
    :type input_path: list of file names or a Vectors object
    :param output_prefix: Where to output the reduced files. An extension will be added.
    :param desired_counts_per_feature_type: how many entries to keep of each DocumentFeature type, by frequency. This
     is the PoS tag for unigram features and the feature type otherwise. For instance, pass in [('N', 2), ('AN', 0)] to
    select 2 unigrams of PoS N and 0 bigrams of type adjective-noun. Types that are not explicitly given a positive
    desired count are treated as if the desired count is 0. If this is None, no filtering is performed.
    :param reduce_to: list of integers, what dimensionalities to reduce to
    :param apply_to: a file path. After SVD has been trained on input_path, it can be applied to
    apply_to. Output will be written to the same file
    :param write: Once SVD is trained on A and applied to B, output either A, B or vstack(A, B). Use values 1,
    2 and 3 respectively. Default is 3.
    :param use_hdf: if true, store results as a pandas DF in HDF. This will enforce some constraints like not having
    duplicate entries in the index, which I deliberately break with some of the unit tests. This switch is the easiest
    way to avoid modifying the unit tests
    :type write: int
    :raise ValueError: If the loaded thesaurus is empty
    """
    if not 1 <= write <= 3:
        raise ValueError('value of parameter write must be 1, 2 or 3')

    if not isinstance(input_path, Vectors):
        thesaurus = Vectors.from_tsv(input_path, lowercasing=False)
    else:
        thesaurus = input_path

    if not thesaurus:
        raise ValueError('Empty thesaurus %r' % input_path)
    mat, _, rows, cols = filter_out_infrequent_entries(desired_counts_per_feature_type, thesaurus)
    if apply_to:
        cols = set(cols)
        if not isinstance(apply_to, Vectors):
            thes_to_apply_to = Vectors.from_tsv(apply_to, lowercasing=False,
                                                column_filter=lambda foo: foo in cols)
        else:
            thes_to_apply_to = apply_to
        # get the names of each thesaurus entry
        extra_rows = [x for x in thes_to_apply_to.keys()]
        # vectorize second matrix with the vocabulary (columns) of the first thesaurus to ensure shapes match
        # "project" second thesaurus into space of first thesaurus
        thesaurus.v.vocabulary_ = {x: i for i, x in enumerate(list(cols))}
        extra_matrix = thesaurus.v.transform([dict(fv) for fv in thes_to_apply_to.values()])
        # make sure the shape is right
        assert extra_matrix.shape[1] == mat.shape[1]

        if write == 3:
            # extend the list of names
            rows = list(rows) + [DocumentFeature.from_string(x) for x in extra_rows]
        elif write == 2:
            rows = [DocumentFeature.from_string(x) for x in extra_rows]
            # no need to do anything if write == 1

    for n_components in reduce_to:
        method, reduced_mat = _do_svd_single(mat, n_components)
        if not method:
            continue
        if apply_to:
            logging.info('Applying learned SVD transform to matrix of shape %r', extra_matrix.shape)
            # apply learned transform to new data
            if write == 3:
                # append to old data
                reduced_mat = np.vstack((reduced_mat, method.transform(extra_matrix)))
            elif write == 2:
                reduced_mat = method.transform(extra_matrix)

        path = '{}-SVD{}'.format(output_prefix, n_components)
        _write_to_disk(scipy.sparse.coo_matrix(reduced_mat), path, rows, use_hdf=use_hdf)
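The effect of the write switch, stripped of all I/O and filtering: the SVD is always fitted on A alone, and write only decides which projected matrix is written out. The matrices below are toy random data.

import numpy as np
from sklearn.decomposition import TruncatedSVD

A = np.random.RandomState(0).rand(20, 50)    # matrix the SVD is trained on
B = np.random.RandomState(1).rand(5, 50)     # matrix the learnt transform is applied to
svd = TruncatedSVD(n_components=2, random_state=0)
reduced_A = svd.fit_transform(A)             # write == 1: output A only
reduced_B = svd.transform(B)                 # write == 2: output B only
stacked = np.vstack((reduced_A, reduced_B))  # write == 3: output A stacked on top of B
print(reduced_A.shape, reduced_B.shape, stacked.shape)   # (20, 2) (5, 2) (25, 2)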
Example No. 27
def vectors_c(request, tmpdir):
    kind = request.param  # txt, gz or hdf
    v = Vectors.from_tsv('discoutils/tests/resources/exp0-0c.strings', sim_threshold=0, ngram_separator='_')
    assert DocumentFeature.from_string('oversized/J') not in v
    assert len(v) == 5
    return _generate_hdf_gzip_repr(kind, tmpdir, v)
Example No. 28
def train_grefenstette_multistep_composer(all_vectors_file, root_dir):
    """
    Train Grefenstette et al's multistep regression VO/SVO model
    Adapted from dissect's ex19.py
    :param all_vectors_file: file containing N, V, VO and SVO vectors
    :param root_dir: where to write temp files and output
    """
    mkdirs_if_not_exists(root_dir)
    vo_composer_output_file = join(root_dir, 'vo_comp.pkl')
    svo_composer_output_file = join(root_dir, 'svo_comp.pkl')

    filename = basename(all_vectors_file)
    noun_events_file = join(root_dir, '%s-onlyN.tmp' % filename)
    # verb_events_file = join(root_dir, '%s-onlyV.tmp' % filename)
    # vo_events_file = join(root_dir, '%s-onlyVO.tmp' % filename)
    svo_events_file = join(root_dir, '%s-onlySVO.tmp' % filename)

    # this has unigrams and observed phrases
    thes = Vectors.from_tsv(all_vectors_file)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)
    # thes.to_tsv(verb_events_file,
    # entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'V')
    # _translate_byblo_to_dissect(verb_events_file)
    # thes.to_tsv(vo_events_file,
    #             entry_filter=lambda x: x.type == 'VO')
    # _translate_byblo_to_dissect(vo_events_file)
    thes.to_tsv(svo_events_file,
                entry_filter=lambda x: x.type == 'SVO')
    _translate_byblo_to_dissect(svo_events_file)

    train_vo_data, train_v_data = [], []
    for phrase in thes.keys():
        df = DocumentFeature.from_string(phrase)
        if df.type == 'SVO':
            train_vo_data.append((str(df[1:]), str(df[0]), str(df)))
        if df.type == 'VO':
            train_v_data.append((str(df[0]), str(df[1]), str(df)))

    # logging.info('train_vo_data %r', len(train_vo_data))
    # logging.info('train_v_data %r', len(train_v_data))

    # load N and SVO spaces
    n_space = Space.build(data=noun_events_file + '.sm',
                          cols=noun_events_file + '.cols',
                          format="sm")

    svo_space = Space.build(data=svo_events_file + '.sm',
                            cols=svo_events_file + '.cols',
                            format="sm")

    logging.info("Input SVO training space:")
    logging.info(svo_space.id2row)
    # logging.info(svo_space.cooccurrence_matrix)

    # 1. train a model to learn VO functions on train data: VO N -> SVO
    logging.info("Step 1 training")
    vo_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)  # Gref et al 2013, §5 says 3
    vo_model.train(train_vo_data, n_space, svo_space)
    io_utils.save(vo_model, vo_composer_output_file)

    # 2. train a model to learn V functions on train data: V N -> VO
    # where VO space: function space learned in step 1
    logging.info("Step 2 training")
    vo_space = vo_model.function_space
    v_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)
    v_model.train(train_v_data, n_space, vo_space)
    io_utils.save(v_model, svo_composer_output_file)
Example No. 29
def train_baroni_guevara_composers(all_vectors,
                                   ROOT_DIR,
                                   baroni_output_path, guevara_output_path,
                                   baroni_threshold=10):
    """

    :type all_vectors: str; path to vectors file containing both N and observed AN vectors
    :type ROOT_DIR: str; where to write temp files
    :type baroni_output_path: str; where to write pickled baroni composer
    :type guevara_output_path: str
    :type baroni_threshold: int
    """
    SVD_DIMS = 100
    baroni_training_phrase_types = {'AN', 'NN'}  # what kind of NPs to train Baroni composer for

    # prepare the input files to be fed into Dissect
    mkdirs_if_not_exists(ROOT_DIR)

    filename = basename(all_vectors)
    noun_events_file = join(ROOT_DIR, '%s-onlyN-SVD%d.tmp' % (filename, SVD_DIMS))
    NPs_events_file = join(ROOT_DIR, '%s-onlyPhrases-SVD%d.tmp' % (filename, SVD_DIMS))

    thes = Vectors.from_tsv(all_vectors, lowercasing=False)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)

    thes.to_tsv(NPs_events_file,
                entry_filter=lambda x: x.type in baroni_training_phrase_types,
                row_transform=lambda x: str(x).replace(' ', '_'))
    _translate_byblo_to_dissect(NPs_events_file)

    my_space = Space.build(data="{}.sm".format(noun_events_file),
                           rows="{}.rows".format(noun_events_file),
                           cols="{}.cols".format(noun_events_file),
                           format="sm")
    logging.info('Each unigram vector has dimensionality %r', my_space.element_shape)

    # create a peripheral space
    my_per_space = PeripheralSpace.build(my_space,
                                         data="{}.sm".format(NPs_events_file),
                                         rows="{}.rows".format(NPs_events_file),
                                         # The columns of the peripheral space have to be identical to those
                                         # in the core space (including their order)!
                                         cols="{}.cols".format(NPs_events_file),
                                         format="sm")
    logging.info('Each phrase vector has dimensionality %r', my_per_space.element_shape)

    # use the model to compose words in my_space
    all_data = []
    for phrase in my_per_space._row2id:
        # make sure there are only NPs here
        if DocumentFeature.from_string(phrase.replace(' ', '_')).type in baroni_training_phrase_types:
            adj, noun = phrase.split('_')
            all_data.append((adj, noun, '%s_%s' % (adj, noun)))

    # train a composition model on the data and save it
    baroni = LexicalFunction(min_samples=baroni_threshold, learner=RidgeRegressionLearner())
    guevara = FullAdditive(learner=RidgeRegressionLearner())
    for composer, out_path in zip([baroni, guevara],
                                  [baroni_output_path, guevara_output_path]):
        composer.train(all_data, my_space, my_per_space)
        io_utils.save(composer, out_path)
        logging.info('Saved trained composer to %s', out_path)
Example No. 30
def get_thesaurus_entries(tsv_file):
    """
    Returns the set of entries contained in a thesaurus
    :param tsv_file: path to vectors file
    """
    return set(Vectors.from_tsv(tsv_file).keys())
Example No. 31
def compose_and_write_vectors(unigram_vectors, short_vector_dataset_name, composer_classes, remove_pos=False,
                              pretrained_Baroni_composer_file=None, pretrained_Guevara_composer_file=None,
                              pretrained_Gref_composer_file=None, categorical_vector_matrix_file=None,
                              output_dir='.', gzipped=True, dense_hd5=False,
                              row_filter=default_row_filter):
    """
    Extracts all composable features from a labelled classification corpus and dumps a composed vector for each of them
    to disk. The output file will also contain all unigram vectors that were passed in, and only unigrams!
    :param unigram_vectors: a file in Byblo events format that contains vectors for all unigrams, OR
    a Vectors object. This will be used in the composition process.
    :type unigram_vectors: str or Vectors
    :param pretrained_Baroni_composer_file: path to pre-trained Baroni AN/NN composer file
    :param output_dir: directory to write the composed vector files to
    :param composer_classes: what composers to use
    :type composer_classes: list
    """

    phrases_to_compose = get_all_document_features(remove_pos=remove_pos)
    # if this isn't a Vectors object assume it's the name of a file containing vectors and load them
    if not isinstance(unigram_vectors, Vectors):
        # ensure there are only unigrams in the set of unigram vectors;
        # composers do not need any ngram vectors contained in this file, they may well be
        # observed ones
        unigram_vectors = Vectors.from_tsv(unigram_vectors,
                                           row_filter=row_filter)
        logging.info('Starting composition with %d unigram vectors', len(unigram_vectors))

    # doing this loop in parallel isn't worth it as pickling or shelving `vectors` is so slow
    # it negates any gains from using multiple cores
    for composer_class in composer_classes:
        if composer_class == BaroniComposer:
            assert pretrained_Baroni_composer_file is not None
            composer = BaroniComposer(unigram_vectors, pretrained_Baroni_composer_file)
        elif composer_class == GuevaraComposer:
            assert pretrained_Guevara_composer_file is not None
            composer = GuevaraComposer(unigram_vectors, pretrained_Guevara_composer_file)
        elif composer_class == GrefenstetteMultistepComposer:
            assert pretrained_Gref_composer_file is not None
            composer = GrefenstetteMultistepComposer(unigram_vectors, pretrained_Gref_composer_file)
        elif composer_class in [CopyObject, FrobeniusAdd, FrobeniusMult]:
            composer = composer_class(categorical_vector_matrix_file, unigram_vectors)
        else:
            composer = composer_class(unigram_vectors)

        try:
            # compose_all returns all unigrams and composed phrases
            mat, cols, rows = composer.compose_all(phrases_to_compose)

            events_path = os.path.join(output_dir,
                                       'composed_%s_%s.events.filtered.strings' % (short_vector_dataset_name,
                                                                                   composer.name))
            if dense_hd5:
                write_vectors_to_hdf(mat, rows, cols, events_path)
            else:
                rows2idx = {i: DocumentFeature.from_string(x) for (x, i) in rows.items()}
                write_vectors_to_disk(mat.tocoo(), rows2idx, cols, events_path,
                                      entry_filter=lambda x: x.type in {'AN', 'NN', 'VO', 'SVO', '1-GRAM'},
                                      gzipped=gzipped)
        except ValueError as e:
            logging.error('RED ALERT, RED ALERT')
            logging.error(e)
            continue
def static_top_n(vectors,
                 words=None,
                 num_neighbours=10,
                 alpha='auto',
                 nn_metric='cosine',
                 **kwargs):
    """
    Perform smoothing by associative inference.
    :param vectors: Original elementary APTs
    :param words: Lexemes of interest to apply distributional inference on (pass None for all lexemes)
    :param num_neighbours: Number of neighbours used for distributional inference
    :param alpha: weighting of the original vector (default='auto', which multiplies the original vector by `num_neighbours`)
    :param nn_metric: nearest-neighbour metric to use (default='cosine'; supported are 'cosine' and 'euclidean')
    :return: tuple of (initialised Vectors object, dict of smoothed APT vectors)
    """
    smoothed_vectors = {}
    if (isinstance(vectors, Vectors)):
        disco_vectors = vectors
    else:  # Passive-Aggressive-Defensive loading cascade
        if (isinstance(vectors, dict)):
            disco_vectors = Vectors.from_dict_of_dicts(vectors)
        else:
            raise ValueError(
                'Unsupported type [{}] for `vectors` supplied. Supported types are '
                '[`discoutils.thesaurus_loader.Vectors` and `dict`]!'.format(type(vectors)))

    if (not kwargs.pop('is_initialised', False)):
        disco_vectors.init_sims(
            n_neighbors=num_neighbours,
            nn_metric=nn_metric,
            knn='brute' if nn_metric == 'cosine' else 'kd_tree')

    words = words if words is not None else vectors.keys()

    a = alpha if alpha != 'auto' else num_neighbours
    for w in words:
        if (w not in disco_vectors):
            smoothed_vectors[w] = sparse.csr_matrix(
                (1, disco_vectors.matrix.shape[1]))
            continue

        neighbours = []
        try:
            neighbours = disco_vectors.get_nearest_neighbours(w)
        except ValueError as ex:
            import logging
            logging.error(
                'Failed to retrieve neighbours for w={}: {}...'.format(w, ex))
            raise ex

        # Enrich original vector
        apt = disco_vectors.get_vector(w)
        if (apt is None):  # OOV
            apt = sparse.csr_matrix((1, disco_vectors.matrix.shape[1]))
        apt *= a

        for neighbour, _ in neighbours:
            apt += disco_vectors.get_vector(neighbour)

        smoothed_vectors[w] = apt.copy()

    return disco_vectors, smoothed_vectors
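A sketch of reusing the initialised store across calls, with made-up inputs and assuming the discoutils imports used above are available: the first call builds the nearest-neighbour index, later calls skip it by passing the returned Vectors object back in with is_initialised=True.

toy_vectors = {
    'dog/N':  {'amod:big': 3, 'dobj:walk': 5},
    'cat/N':  {'amod:big': 2, 'dobj:feed': 4},
    'wolf/N': {'amod:wild': 6, 'dobj:hunt': 1},
}
disco, smoothed = static_top_n(toy_vectors, words=['dog/N'], num_neighbours=2)
disco, more_smoothed = static_top_n(disco, words=['cat/N'], num_neighbours=2,
                                    is_initialised=True)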
Example No. 33
def get_pipeline_fit_args(conf):
    """
    Builds a dict of resources that document vectorizers require at fit time. These currently include
    various kinds of distributional information, e.g. word vectors or cluster ID for words and phrases.
    Example:
    {'vector_source': <DenseVectors object>} or {'clusters': <pd.DataFrame of word clusters>}
    :param conf: configuration dict
    :raise ValueError: if the conf is wrong in any way
    """
    result = dict()
    train_time_extractor = FeatureExtractor().update(**conf['feature_extraction']). \
        update(**conf['feature_extraction']['train_time_opts'])
    result['train_time_extractor'] = train_time_extractor
    decode_time_extractor = FeatureExtractor().update(**conf['feature_extraction']). \
        update(**conf['feature_extraction']['decode_time_opts'])
    result['decode_time_extractor'] = decode_time_extractor

    vectors_exist = conf['feature_selection']['must_be_in_thesaurus']
    handler_ = conf['vectorizer']['decode_token_handler']
    random_thes = conf['vectorizer']['random_neighbour_thesaurus']
    dummy_thes = conf['vector_sources']['dummy_thesaurus']
    vs_params = conf['vector_sources']
    vectors_path = vs_params['neighbours_file']
    clusters_path = vs_params['clusters_file']

    if 'Base' in handler_:
        # don't need vectors, this is a non-distributional experiment
        return result
    if vectors_path and clusters_path:
        raise ValueError('Cannot use both word vectors and word clusters')

    if random_thes and dummy_thes:
        raise ValueError('Cant use both random and dummy thesauri')
    elif random_thes:
        result['vector_source'] = RandomThesaurus(k=conf['vectorizer']['k'])
    elif dummy_thes:
        result['vector_source'] = DummyThesaurus()
    else:
        if vectors_path and clusters_path:
            raise ValueError('Cannot use both word vectors and word clusters')
        if 'signified' in handler_.lower() or vectors_exist:
            # vectors are needed either at decode time (signified handler) or during feature selection
            if not (vectors_path or clusters_path):
                raise ValueError('You must provide at least one source of distributional information '
                                 'because you requested %s and must_be_in_thesaurus=%s' % (handler_, vectors_exist))

    if len(vectors_path) == 1:
        # set up a row filter, if needed
        entries = vs_params['entries_of']
        if entries:
            entries = get_thesaurus_entries(entries)
            vs_params['row_filter'] = lambda x, y: x in entries
        if conf['vector_sources']['is_thesaurus']:
            result['vector_source'] = Thesaurus.from_tsv(vectors_path[0], **vs_params)
        else:
            result['vector_source'] = Vectors.from_tsv(vectors_path[0], **vs_params)
    if len(vectors_path) > 1:
        all_vect = [Vectors.from_tsv(p, **vs_params) for p in vectors_path]
        result['vector_source'] = MultiVectors(all_vect)

    if clusters_path:
        result['clusters'] = pd.read_hdf(clusters_path, key='clusters')

    return result
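A minimal configuration sketch showing only the keys this function reads; every path and value below is a placeholder, and the handler name is only assumed to contain 'Signified' so that the vector-loading branch is exercised.

conf = {
    'feature_extraction': {'train_time_opts': {}, 'decode_time_opts': {}},
    'feature_selection': {'must_be_in_thesaurus': True},
    'vectorizer': {'decode_token_handler': 'eval.pipeline.feature_handlers.SignifiedFeatureHandler',
                   'random_neighbour_thesaurus': False,
                   'k': 3},
    'vector_sources': {'dummy_thesaurus': False,
                       'neighbours_file': ['path/to/vectors.events.filtered.strings'],
                       'clusters_file': '',
                       'entries_of': '',
                       'is_thesaurus': False},
}
fit_args = get_pipeline_fit_args(conf)   # {'train_time_extractor': ..., 'vector_source': <Vectors>, ...}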