def density_window(vectors, words=None, num_neighbours=10, window_size=0.1, alpha='auto', nn_metric='cosine', **kwargs):
    """
    Perform smoothing by associative inference over a per-word parzen window of neighbours.

    :param vectors: Original elementary APTs (`discoutils.thesaurus_loader.Vectors` or a dict of dicts)
    :param words: Lexemes of interest to apply distributional inference on (pass None for all lexemes)
    :param num_neighbours: Maximum number of neighbours used for distributional inference
    :param window_size: proportional distance to nearest neighbour, defining the parzen window
        for each vector individually (default=0.1)
    :param alpha: weighting of original vector (default='auto', which multiplies the original
        vectors by `num_neighbours`)
    :param nn_metric: nearest neighbour metric to use (default='cosine'; supported are 'cosine'
        and 'euclidean')
    :return: tuple of (initialised `Vectors`, dict of smoothed apt vectors)
    """
    smoothed_vectors = {}
    if isinstance(vectors, Vectors):
        disco_vectors = vectors
    elif isinstance(vectors, dict):
        # Passive-Aggressive-Defensive loading cascade
        disco_vectors = Vectors.from_dict_of_dicts(vectors)
    else:
        # BUG FIX: the original message contained an unfilled '{}' placeholder
        raise ValueError(
            'Unsupported type[{}] for `vectors` supplied. Supported types are '
            '[`discoutils.thesaurus_loader.Vectors` and `dict`]!'.format(type(vectors)))

    if not kwargs.pop('is_initialised', False):
        disco_vectors.init_sims(n_neighbors=num_neighbours, nn_metric=nn_metric,
                                knn='brute' if nn_metric == 'cosine' else 'kd_tree')

    words = words if words is not None else vectors.keys()
    a = alpha if alpha != 'auto' else num_neighbours

    for w in words:
        if w not in disco_vectors:
            continue

        # Hoisted: the original called get_nearest_neighbours(w) twice per word
        all_neighbours = disco_vectors.get_nearest_neighbours(w)

        # Anything within `distance_threshold` of the top neighbour is still
        # considered for inference (neighbours are assumed sorted by distance)
        distance_threshold = all_neighbours[0][1] * (1 + window_size)

        neighbours = []
        for neighbour, distance in all_neighbours:
            if distance > distance_threshold:
                break
            neighbours.append((neighbour, distance))

        # Enrich original vector: alpha-weighted original plus in-window neighbours
        apt = disco_vectors.get_vector(w) * a
        for neighbour, _ in neighbours:
            apt += disco_vectors.get_vector(neighbour)

        smoothed_vectors[w] = apt.copy()

    return disco_vectors, smoothed_vectors
def wordnet_synsets(vectors, words, num_neighbours, alpha='auto', nn_metric='cosine', **kwargs):
    """
    Perform smoothing by associative inference over WordNet synset neighbours.

    :param vectors: Original elementary APTs (`discoutils.thesaurus_loader.Vectors` or a dict of dicts)
    :param words: Lexemes of interest to apply distributional inference on (pass None for all
        lexemes), !!!Need to be (word, pos) tuples!!!
    :param num_neighbours: Maximum number of neighbours used for distributional inference
    :param alpha: weighting of original vector (default='auto', which multiplies the original
        vectors by `num_neighbours`)
    :param nn_metric: nearest neighbour metric to use (default='cosine'; supported are 'cosine'
        and 'euclidean')
    :return: tuple of (initialised `Vectors`, dict of smoothed apt vectors)
    """
    smoothed_vectors = {}
    if isinstance(vectors, Vectors):
        disco_vectors = vectors
    elif isinstance(vectors, dict):
        # Passive-Aggressive-Defensive loading cascade
        disco_vectors = Vectors.from_dict_of_dicts(vectors)
    else:
        # BUG FIX: the original message contained an unfilled '{}' placeholder
        raise ValueError(
            'Unsupported type[{}] for `vectors` supplied. Supported types are '
            '[`discoutils.thesaurus_loader.Vectors` and `dict`]!'.format(type(vectors)))

    if not kwargs.pop('is_initialised', False):
        disco_vectors.init_sims(n_neighbors=num_neighbours, nn_metric=nn_metric,
                                knn='brute' if nn_metric == 'cosine' else 'kd_tree')

    words = words if words is not None else vectors.keys()
    a = alpha if alpha != 'auto' else num_neighbours

    for w, pos in words:
        if w not in disco_vectors:
            continue

        # Distinct WordNet lemma names for this (word, pos), excluding the word itself
        neighbours = set()
        for syn in wordnet.synsets(w, pos=pos):
            n = syn.name().split('.')[0]
            if n != w:
                neighbours.add(n)

        # Map up to `num_neighbours` candidates to matrix row indices; candidates
        # not in the vocabulary still count toward the num_neighbours budget,
        # matching the original behaviour. NOTE(review): iteration order over a
        # set is arbitrary, so which neighbours are kept when the budget is
        # exceeded is not deterministic — preserved as-is.
        idx = []
        for i, n in enumerate(neighbours, 1):
            if i > num_neighbours:
                break
            if n in disco_vectors:
                idx.append(disco_vectors.name2row[n])

        # BUG FIX: indexing with np.array([]) (dtype float64) raises an IndexError;
        # use a zero contribution when no synset neighbour is in the vocabulary.
        neighbour_mass = disco_vectors.matrix[np.array(idx)].sum(axis=0) if idx else 0

        # Alpha-weight the original APT and add the summed neighbour rows
        apt = sparse.csr_matrix(disco_vectors.get_vector(w).multiply(a) + neighbour_mass)  # Should still be sparse enough
        smoothed_vectors[w] = apt.copy()

    return disco_vectors, smoothed_vectors
def static_top_n(vectors, words=None, num_neighbours=10, alpha='auto', nn_metric='cosine', **kwargs):
    """
    Perform smoothing by associative inference using a fixed top-n neighbour set.

    :param vectors: Original elementary APTs (`discoutils.thesaurus_loader.Vectors` or a dict of dicts)
    :param words: Lexemes of interest to apply distributional inference on (pass None for all lexemes)
    :param num_neighbours: Number of neighbours used for distributional inference
    :param alpha: weighting of original vector (default='auto', which multiplies the original
        vectors by `num_neighbours`)
    :param nn_metric: nearest neighbour metric to use (default='cosine'; supported are 'cosine'
        and 'euclidean')
    :return: tuple of (initialised `Vectors`, dict of smoothed apt vectors)
    """
    import logging  # hoisted from inside the loop body

    smoothed_vectors = {}
    if isinstance(vectors, Vectors):
        disco_vectors = vectors
    elif isinstance(vectors, dict):
        # Passive-Aggressive-Defensive loading cascade
        disco_vectors = Vectors.from_dict_of_dicts(vectors)
    else:
        # BUG FIX: the original message contained an unfilled '{}' placeholder
        raise ValueError(
            'Unsupported type[{}] for `vectors` supplied. Supported types are '
            '[`discoutils.thesaurus_loader.Vectors` and `dict`]!'.format(type(vectors)))

    if not kwargs.pop('is_initialised', False):
        disco_vectors.init_sims(n_neighbors=num_neighbours, nn_metric=nn_metric,
                                knn='brute' if nn_metric == 'cosine' else 'kd_tree')

    words = words if words is not None else vectors.keys()
    a = alpha if alpha != 'auto' else num_neighbours

    for w in words:
        if w not in disco_vectors:
            # OOV word: emit an all-zero row of the right width
            smoothed_vectors[w] = sparse.csr_matrix((1, disco_vectors.matrix.shape[1]))
            continue

        try:
            neighbours = disco_vectors.get_nearest_neighbours(w)
        except ValueError as ex:
            logging.error('Failed to retrieve neighbours for w={}: {}...'.format(w, ex))
            # BUG FIX: bare `raise` preserves the original traceback (`raise ex` re-raises
            # from this frame)
            raise

        # Enrich original vector: alpha-weighted original plus all retrieved neighbours
        apt = disco_vectors.get_vector(w)
        if apt is None:  # OOV
            apt = sparse.csr_matrix((1, disco_vectors.matrix.shape[1]))
        apt *= a
        for neighbour, _ in neighbours:
            apt += disco_vectors.get_vector(neighbour)

        smoothed_vectors[w] = apt.copy()

    return disco_vectors, smoothed_vectors
def density_window(vectors, words=None, num_neighbours=10, window_size=0.1, alpha='auto', nn_metric='cosine', **kwargs):
    """
    Perform smoothing by associative inference over a per-word parzen window of neighbours.

    :param vectors: Original elementary APTs (`discoutils.thesaurus_loader.Vectors` or a dict of dicts)
    :param words: Lexemes of interest to apply distributional inference on (pass None for all lexemes)
    :param num_neighbours: Maximum number of neighbours used for distributional inference
    :param window_size: proportional distance to nearest neighbour, defining the parzen window
        for each vector individually (default=0.1)
    :param alpha: weighting of original vector (default='auto', which multiplies the original
        vectors by `num_neighbours`)
    :param nn_metric: nearest neighbour metric to use (default='cosine'; supported are 'cosine'
        and 'euclidean')
    :return: tuple of (initialised `Vectors`, dict of smoothed apt vectors)
    """
    smoothed_vectors = {}
    if isinstance(vectors, Vectors):
        disco_vectors = vectors
    elif isinstance(vectors, dict):
        # Passive-Aggressive-Defensive loading cascade
        disco_vectors = Vectors.from_dict_of_dicts(vectors)
    else:
        # BUG FIX: the original message contained an unfilled '{}' placeholder
        raise ValueError(
            'Unsupported type[{}] for `vectors` supplied. Supported types are '
            '[`discoutils.thesaurus_loader.Vectors` and `dict`]!'.format(type(vectors)))

    if not kwargs.pop('is_initialised', False):
        disco_vectors.init_sims(n_neighbors=num_neighbours, nn_metric=nn_metric,
                                knn='brute' if nn_metric == 'cosine' else 'kd_tree')

    words = words if words is not None else vectors.keys()
    a = alpha if alpha != 'auto' else num_neighbours

    for w in words:
        if w not in disco_vectors:
            continue

        # Hoisted: the original called get_nearest_neighbours(w) twice per word
        all_neighbours = disco_vectors.get_nearest_neighbours(w)

        # Anything within `distance_threshold` of the top neighbour is still
        # considered for inference (neighbours are assumed sorted by distance)
        distance_threshold = all_neighbours[0][1] * (1 + window_size)

        neighbours = []
        for neighbour, distance in all_neighbours:
            if distance > distance_threshold:
                break
            neighbours.append((neighbour, distance))

        # Enrich original vector: alpha-weighted original plus in-window neighbours
        apt = disco_vectors.get_vector(w) * a
        for neighbour, _ in neighbours:
            apt += disco_vectors.get_vector(neighbour)

        smoothed_vectors[w] = apt.copy()

    return disco_vectors, smoothed_vectors
def wordnet_synsets(vectors, words, num_neighbours, alpha='auto', nn_metric='cosine', **kwargs):
    """
    Perform smoothing by associative inference over WordNet synset neighbours.

    :param vectors: Original elementary APTs (`discoutils.thesaurus_loader.Vectors` or a dict of dicts)
    :param words: Lexemes of interest to apply distributional inference on (pass None for all
        lexemes), !!!Need to be (word, pos) tuples!!!
    :param num_neighbours: Maximum number of neighbours used for distributional inference
    :param alpha: weighting of original vector (default='auto', which multiplies the original
        vectors by `num_neighbours`)
    :param nn_metric: nearest neighbour metric to use (default='cosine'; supported are 'cosine'
        and 'euclidean')
    :return: tuple of (initialised `Vectors`, dict of smoothed apt vectors)
    """
    smoothed_vectors = {}
    if isinstance(vectors, Vectors):
        disco_vectors = vectors
    elif isinstance(vectors, dict):
        # Passive-Aggressive-Defensive loading cascade
        disco_vectors = Vectors.from_dict_of_dicts(vectors)
    else:
        # BUG FIX: the original message contained an unfilled '{}' placeholder
        raise ValueError(
            'Unsupported type[{}] for `vectors` supplied. Supported types are '
            '[`discoutils.thesaurus_loader.Vectors` and `dict`]!'.format(type(vectors)))

    if not kwargs.pop('is_initialised', False):
        disco_vectors.init_sims(n_neighbors=num_neighbours, nn_metric=nn_metric,
                                knn='brute' if nn_metric == 'cosine' else 'kd_tree')

    words = words if words is not None else vectors.keys()
    a = alpha if alpha != 'auto' else num_neighbours

    for w, pos in words:
        if w not in disco_vectors:
            continue

        # Distinct WordNet lemma names for this (word, pos), excluding the word itself
        neighbours = set()
        for syn in wordnet.synsets(w, pos=pos):
            n = syn.name().split('.')[0]
            if n != w:
                neighbours.add(n)

        # Map up to `num_neighbours` candidates to matrix row indices; candidates
        # not in the vocabulary still count toward the num_neighbours budget,
        # matching the original behaviour. NOTE(review): iteration order over a
        # set is arbitrary, so which neighbours are kept when the budget is
        # exceeded is not deterministic — preserved as-is.
        idx = []
        for i, n in enumerate(neighbours, 1):
            if i > num_neighbours:
                break
            if n in disco_vectors:
                idx.append(disco_vectors.name2row[n])

        # BUG FIX: indexing with np.array([]) (dtype float64) raises an IndexError;
        # use a zero contribution when no synset neighbour is in the vocabulary.
        neighbour_mass = disco_vectors.matrix[np.array(idx)].sum(axis=0) if idx else 0

        # Alpha-weight the original APT and add the summed neighbour rows
        apt = sparse.csr_matrix(disco_vectors.get_vector(w).multiply(a) + neighbour_mass)  # Should still be sparse enough
        smoothed_vectors[w] = apt.copy()

    return disco_vectors, smoothed_vectors
def test_loading_dict_of_dicts():
    """A Vectors built directly from a dict must match one built via from_dict_of_dicts."""
    data = {
        'monday': {'det:the': 23, 'amod:terrible': 321},
        'tuesday': {'amod:awful': 231, 'det:a': 12},
    }
    direct = Vectors(data)
    via_helper = direct.from_dict_of_dicts(data)

    assert direct.columns == via_helper.columns
    # Every word's dense row must agree between the two constructions
    for word in data:
        assert_array_equal(direct.get_vector(word).A, via_helper.get_vector(word).A)
def static_top_n(vectors, words=None, num_neighbours=10, alpha='auto', nn_metric='cosine', **kwargs):
    """
    Perform smoothing by associative inference using a fixed top-n neighbour set.

    :param vectors: Original elementary APTs (`discoutils.thesaurus_loader.Vectors` or a dict of dicts)
    :param words: Lexemes of interest to apply distributional inference on (pass None for all lexemes)
    :param num_neighbours: Number of neighbours used for distributional inference
    :param alpha: weighting of original vector (default='auto', which multiplies the original
        vectors by `num_neighbours`)
    :param nn_metric: nearest neighbour metric to use (default='cosine'; supported are 'cosine'
        and 'euclidean')
    :return: tuple of (initialised `Vectors`, dict of smoothed apt vectors)
    """
    import logging  # hoisted from inside the loop body

    smoothed_vectors = {}
    if isinstance(vectors, Vectors):
        disco_vectors = vectors
    elif isinstance(vectors, dict):
        # Passive-Aggressive-Defensive loading cascade
        disco_vectors = Vectors.from_dict_of_dicts(vectors)
    else:
        # BUG FIX: the original message contained an unfilled '{}' placeholder
        raise ValueError(
            'Unsupported type[{}] for `vectors` supplied. Supported types are '
            '[`discoutils.thesaurus_loader.Vectors` and `dict`]!'.format(type(vectors)))

    if not kwargs.pop('is_initialised', False):
        disco_vectors.init_sims(n_neighbors=num_neighbours, nn_metric=nn_metric,
                                knn='brute' if nn_metric == 'cosine' else 'kd_tree')

    words = words if words is not None else vectors.keys()
    a = alpha if alpha != 'auto' else num_neighbours

    for w in words:
        if w not in disco_vectors:
            # OOV word: emit an all-zero row of the right width
            smoothed_vectors[w] = sparse.csr_matrix((1, disco_vectors.matrix.shape[1]))
            continue

        try:
            neighbours = disco_vectors.get_nearest_neighbours(w)
        except ValueError as ex:
            logging.error('Failed to retrieve neighbours for w={}: {}...'.format(w, ex))
            # BUG FIX: bare `raise` preserves the original traceback (`raise ex` re-raises
            # from this frame)
            raise

        # Enrich original vector: alpha-weighted original plus all retrieved neighbours
        apt = disco_vectors.get_vector(w)
        if apt is None:  # OOV
            apt = sparse.csr_matrix((1, disco_vectors.matrix.shape[1]))
        apt *= a
        for neighbour, _ in neighbours:
            apt += disco_vectors.get_vector(neighbour)

        smoothed_vectors[w] = apt.copy()

    return disco_vectors, smoothed_vectors