Example No. 1
def test_cosine_distances():
    # Check the pairwise Cosine distances computation
    rng = np.random.RandomState(1337)
    x = np.abs(rng.rand(910))
    XA = np.vstack([x, x])
    D = cosine_distances(XA)
    assert_array_almost_equal(D, [[0., 0.], [0., 0.]])
    # check that all elements are in [0, 2]
    assert np.all(D >= 0.)
    assert np.all(D <= 2.)
    # check that diagonal elements are equal to 0
    assert_array_almost_equal(D[np.diag_indices_from(D)], [0., 0.])

    XB = np.vstack([x, -x])
    D2 = cosine_distances(XB)
    # check that all elements are in [0, 2]
    assert np.all(D2 >= 0.)
    assert np.all(D2 <= 2.)
    # check that diagonal elements are equal to 0 and non diagonal to 2
    assert_array_almost_equal(D2, [[0., 2.], [2., 0.]])

    # check large random matrix
    X = np.abs(rng.rand(1000, 5000))
    D = cosine_distances(X)
    # check that diagonal elements are equal to 0
    assert_array_almost_equal(D[np.diag_indices_from(D)], [0.] * D.shape[0])
    assert np.all(D >= 0.)
    assert np.all(D <= 2.)
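The bounds asserted above follow from the definition cosine_distance(u, v) = 1 - cos(u, v): the cosine lies in [-1, 1], so the distance lies in [0, 2]. A minimal sketch illustrating the two extremes:

import numpy as np
from sklearn.metrics.pairwise import cosine_distances

u = np.array([[1.0, 0.0]])
assert np.isclose(cosine_distances(u, u)[0, 0], 0.0)   # identical direction -> distance 0
assert np.isclose(cosine_distances(u, -u)[0, 0], 2.0)  # opposite direction -> distance 2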
Example No. 2
def sumACluster(dist, vecsIn, topK_t, sameTweetThred):
    if dist == "cosine":
        distMatrix = pairwise.cosine_distances(vecsIn)
    elif dist == "eu":
        distMatrix = pairwise.euclidean_distances(vecsIn, vecsIn)

    sameTweetClusters = [[0]]
    for seqid, text in enumerate(vecsIn[1:], start=1):
        added = None
        for stcid, stc in enumerate(sameTweetClusters):
            sameFlag = False
            if distMatrix[seqid][stc[0]] <= sameTweetThred:
                sameFlag = True

            if sameFlag:
                stc.append(seqid)
                added = (stcid, stc)
                break
        if added is None:
            sameTweetClusters.append([seqid])
        else:
            sameTweetClusters[added[0]] = added[1]
    sameTweetClusterNum = [(stcid, len(stc)) for stcid, stc in enumerate(sameTweetClusters)]
    numIn = len(sameTweetClusterNum)
    top = sorted(sameTweetClusterNum, key = lambda a:a[1], reverse=True)[:min(topK_t, numIn)]
    top = [(sameTweetClusters[item[0]][0], item[1]) for item in top]
    return top
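A usage sketch under assumed inputs (the tf-idf matrix below is hypothetical): vecsIn is a 2-D array of tweet vectors, and a tweet whose distance to a cluster's first member is at most sameTweetThred is folded into that cluster.

# top = sumACluster("cosine", tfidf_matrix.toarray(), topK_t=5, sameTweetThred=0.1)
# `top` is a list of (representative tweet index, cluster size) pairs for the largest clusters.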
Example No. 3
def test_linkage_misc():
    # Misc tests on linkage
    X = np.ones((5, 5))
    assert_raises(ValueError,
                  AgglomerativeClustering(linkage='foobar').fit,
                  X)
    assert_raises(ValueError, linkage_tree, X, linkage='foobar')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        # Use the copy argument, to raise a warning
        Ward(copy=True).fit(X)
    # We should be getting 2 warnings: one for using Ward that is
    # deprecated, one for using the copy argument
    assert_equal(len(warning_list), 2)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity (manhattan_distances)
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
Example No. 4
def test_linkage_misc():
    # Misc tests on linkage
    rnd = np.random.RandomState(42)
    X = rnd.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # Deprecation of Ward class
    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        Ward().fit(X)
    assert_equal(len(warning_list), 1)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity (manhattan_distances)
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
Example No. 5
    def get_features(vocab):
        vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_body = vectorizer_body.fit_transform(bodies)

        # calculates the n most important topics of the bodies. Each topic contains all words, ordered by importance.
        # The more of a topic's important words a body contains, the higher the body's value for that topic.
        lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)

        print("latent_dirichlet_allocation_cos: fit and transform body")
        t0 = time()
        lda_body_matrix = lda_body.fit_transform(X_train_body)
        print("done in %0.3fs." % (time() - t0))

        print("latent_dirichlet_allocation_cos: transform head")
        # use the LDA trained on body topics for the headlines => if the headlines and bodies share topics,
        # their vectors should be similar
        lda_head_matrix = lda_body.transform(X_train_head)

        #print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

        print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
        # calculate cosine distance between the body and head
        X = []
        for i in range(len(lda_head_matrix)):
            X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1)) #1d array is deprecated
            X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
            cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
            X.append(cos_dist.tolist())
        return X
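The per-row loop above can also be written with sklearn's paired_cosine_distances, which returns the distance between corresponding rows of two matrices; a sketch of the equivalent call:

from sklearn.metrics.pairwise import paired_cosine_distances
# X = paired_cosine_distances(lda_head_matrix, lda_body_matrix).reshape(-1, 1).tolist()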
Example No. 6
def getModelInfo(model, features):
    print("Shape of the transformed features = {}".format(features.shape))
    # Uncomment to info:
    # vocab = model.get_feature_names()
    # dist = np.sum(features, axis=0)
    # for tag, count in izip(vocab, dist):
    #     print("word = {}, frequency = {}".format(tag, count))
    return cosine_distances(features)
def calcurate_centroid_Matrix(veclist, word2vecdic,DimentionN):
	centroid_Matrix = np.zeros((DimentionN, 200))
	distance_arrays = np.zeros(DimentionN)
	for word in veclist:
		label = word2vecdic[word]
		centroid_Matrix[label] += veclist[word]
	for word in veclist:
		label = word2vecdic[word]
		distance_arrays[label] += cosine_distances(veclist[word].reshape(1, -1), centroid_Matrix[label].reshape(1, -1))[0][0]  # reshape to 2-D: cosine_distances expects matrices
	return centroid_Matrix, distance_arrays
Example No. 8
    def _build_metastore(self):

        medians = np.median(self.X, axis=0).reshape(1, self.dim)

        # how far each data point is from the global median
        dists = cosine_distances(self.X, Y=medians).reshape(-1)

        sorted_index = [self.index[i] for i in dists.argsort()]

        return {'sorted_index': sorted_index}
Example No. 9
def memory_cf(users, movies, k, similarity_measure, weight_schema,
              repr_matrix=rating_matrix_orig, rating_matrix=rating_matrix_orig):
    """
    Memory-based collaborative filtering.
    :param users: a user list.
    :param movies: a movie list.
    :param k: number of nearest users
    :param similarity_measure: 'cosine' or 'dot_product'
    :param weight_schema: 'mean' or 'weighted_mean'
    :param repr_matrix: data point representation
    :param rating_matrix: ratings based on user-movie or cluster centroids
    :return: recommended ratings for the queries
    """

    # construct mapping between input users and unique users
    ratings, user_unique = [], list(set(users))
    user_index_map = dict((u, i) for i, u in enumerate(user_unique))
    users = [(u, user_index_map[u]) for u in users]

    # find k nearest neighbor for each user
    if similarity_measure == 'cosine':
        dist = cosine_distances(repr_matrix[user_unique, :], repr_matrix)
        sims = 1 - dist
    elif similarity_measure == 'dot_product':
        sims = repr_matrix[user_unique, :].dot(repr_matrix.T)
        if issparse(sims):
            sims = sims.toarray()
        dist = -sims

    sorted_neighbors = np.argsort(dist, axis=1)

    # make rating matrix dense for fast access
    rating_matrix = rating_matrix.toarray()
    weight_method = mean if weight_schema == 'mean' else weighted_mean

    for (user_index, neighbor_index), movie in zip(users, movies):
        neighbors = list(islice(ifilter(lambda u: (u, movie) in entry_set,
                                        sorted_neighbors[neighbor_index]),
                                k + 1))

        # no neighbors, regarded as 3
        if not neighbors:
            ratings.append(3)
            continue

        # exclude itself
        if user_index in neighbors:
            neighbors.remove(user_index)

        rating = weight_method(rating_matrix[neighbors, movie],
                               sims[neighbor_index, neighbors])
        ratings.append(rating)

    return ratings
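The mean and weighted_mean helpers referenced above are not shown in this example; a plausible definition, consistent with how they are called (a vector of neighbor ratings plus their similarity weights), might look like this:

import numpy as np

def mean(ratings, sims):
    # assumed implementation: unweighted average of the neighbors' ratings
    return float(np.mean(ratings))

def weighted_mean(ratings, sims):
    # assumed implementation: similarity-weighted average, falling back to the plain mean
    weights = np.asarray(sims, dtype=float)
    if weights.sum() == 0:
        return float(np.mean(ratings))
    return float(np.dot(ratings, weights) / weights.sum())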
Example No. 10
    def get_features(head_and_body):
        filename = "NMF_topics" + str(n_topics) + "topics"

        if include_holdout == True:
            filename += "_holdout"

        if include_unlbled_test == True:
            filename += "unlbled_test"

        if not (os.path.exists(features_dir + "/" + filename + ".pkl")):
            X_all, vocab = get_all_data(head_and_body, filename)

            # calculates the n most important topics of the bodies. Each topic contains all words, ordered by importance.
            # The more of a topic's important words a body contains, the higher the body's value for that topic.
            nfm = NMF(n_components=n_topics, random_state=1, alpha=.1)

            print("NMF_topics: fit and transform body")
            t0 = time()
            nfm.fit_transform(X_all)
            print("done in %0.3fs." % (time() - t0))

            with open(features_dir + "/" + filename + ".pkl", 'wb') as handle:
                joblib.dump(nfm, handle, protocol=pickle.HIGHEST_PROTOCOL)
        else:
            vocab = get_vocab(head_and_body, filename)
            with open(features_dir + "/" + filename + ".pkl", 'rb') as handle:
                nfm = joblib.load(handle)

        vectorizer_head = TfidfVectorizer(vocabulary=vocab, norm='l2')
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, norm='l2')
        X_train_body = vectorizer_body.fit_transform(bodies)

        print("NMF_topics: transform head and body")
        # use the NMF model fitted above on both the headlines and the bodies => if they share topics,
        # their vectors should be similar
        nfm_head_matrix = nfm.transform(X_train_head)
        nfm_body_matrix = nfm.transform(X_train_body)

        if cosinus_dist == False:
            return np.concatenate([nfm_head_matrix, nfm_body_matrix], axis=1)
        else:
            # calculate cosine distance between the body and head
            X = []
            for i in range(len(nfm_head_matrix)):
                X_head_vector = np.array(nfm_head_matrix[i]).reshape((1, -1))  # 1d array is deprecated
                X_body_vector = np.array(nfm_body_matrix[i]).reshape((1, -1))
                cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
                X.append(cos_dist.tolist())
            return X
Example No. 11
def get_sparse_dist_matrix(tweets_tfidf_matrix, eps):
    """Get the sparse distance matrix from the pairwise cosine distance
    computations from the given tfidf vectors. Only distances less than or
    equal to eps are put into the matrix"""
    rows = []
    cols = []
    data = []
    for ndx, tweet in enumerate(tweets_tfidf_matrix):
        rows.append(len(cols))
        distances = cosine_distances(tweet, tweets_tfidf_matrix)[0]
        for other_ndx, dist in enumerate(distances):
            if ndx != other_ndx and dist <= eps:
                cols.append(other_ndx)
                data.append(dist)
    rows.append(len(cols))  # final index pointer: csr_matrix needs len(indptr) == n_rows + 1
    return csr_matrix((data, cols, rows), dtype=float)
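One way such a matrix is typically consumed (an assumption, not shown in the original snippet) is as a precomputed input to DBSCAN:

from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

tweets = ["cat sat on the mat", "the cat sat down", "dogs bark loudly"]  # hypothetical data
tfidf = TfidfVectorizer().fit_transform(tweets)
eps = 0.5
sparse_dist = get_sparse_dist_matrix(tfidf, eps)
labels = DBSCAN(eps=eps, metric="precomputed").fit_predict(sparse_dist)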
Example No. 12
    def get_features(n_topics):
        features_dir = "%s/data/fnc-1/features" % (path.dirname(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))))

        filename = "lda_gensim_cos_" + str(n_topics) + "topics"
        if (os.path.exists(features_dir + "/" + filename + ".pkl")):
            lda = models.LdaMulticore.load(features_dir + "/" + filename + ".pkl")
            dictionary = corpora.Dictionary.load(features_dir + "/" + filename + ".dict")
            print("latent_dirichlet_allocation_gensim_cos model found and load")
        else:
            print("Creating new latent_dirichlet_allocation_gensim_cos model")
            h, b = word_ngrams.get_head_body_tuples()
            head_and_body = combine_and_tokenize_head_and_body(h, b)
            dictionary = corpora.Dictionary(head_and_body)
            dictionary.save(features_dir + "/" + filename + ".dict")
            corpus = [dictionary.doc2bow(text) for text in head_and_body]
            print(dictionary)

            lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=n_topics, workers=1)
            lda.save(features_dir + "/" + filename + ".pkl")

        X = []
        for i in range(len(headlines)):
            X_head_vector = lda[dictionary.doc2bow(nltk.word_tokenize(headlines[i]))]
            X_body_vector = lda[dictionary.doc2bow(nltk.word_tokenize(bodies[i]))]

            # calculate zero-padded vector for cosine distance
            X_head_vector_filled = np.zeros(n_topics, dtype=np.double)
            for id, prob in X_head_vector:
                X_head_vector_filled[id] = prob

            X_body_vector_filled = np.zeros(n_topics, dtype=np.double)
            for id, prob in X_body_vector:
                X_body_vector_filled[id] = prob

            # reshape for sklearn
            X_head_vector_filled_reshaped = np.array(X_head_vector_filled).reshape((1, -1))  # 1d array is deprecated
            X_body_vector_filled_reshaped = np.array(X_body_vector_filled).reshape((1, -1))

            cos_dist = cosine_distances(X_head_vector_filled_reshaped, X_body_vector_filled_reshaped).flatten()
            X.append(cos_dist.tolist())

        return X
Example No. 13
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)

    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity (manhattan_distances)
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
Example No. 14
    def cluster_cf_memory():
        """
        Cluster-based memory CF.
        """
        rating_matrix_cluster = np.empty([k_user, rating_matrix_orig.shape[1]],
                                         dtype=np.float64)

        # build rating matrix for each user cluster, on each movie
        for i in range(k_user):
            cluster_indicator = np.where(user_belonging == i)[0]
            rating_cluster = rating_matrix_orig[cluster_indicator, :]
            rating_sum = rating_cluster.sum(axis=0)
            # take average by dividing count
            rating_cluster.data = np.ones(len(rating_cluster.data))
            mu = rating_sum / rating_cluster.sum(axis=0)
            # fill 0 for nan
            mu[np.isnan(mu)] = 0
            rating_matrix_cluster[i, :] = mu

        # construct mapping between input users and unique users
        ratings, user_unique = [], list(set(users))
        user_index_map = dict((u, i) for i, u in enumerate(user_unique))
        users_neighbors = [user_index_map[u] for u in users]

        if similarity_measure == 'cosine':
            dist = cosine_distances(rating_matrix_orig[user_unique, :], m2uc.T)
            sims = 1 - dist
        else:
            sims = rating_matrix_orig[user_unique, :].dot(m2uc).toarray()
            dist = -sims

        nearest_neighbors = np.argpartition(dist, k, axis=1)[:, :k]
        weight_method = mean if weight_schema == 'mean' else weighted_mean

        for neighbor_index, movie in zip(users_neighbors, movies):
            neighbors = nearest_neighbors[neighbor_index]
            rating = weight_method(rating_matrix_cluster[neighbors, movie],
                                   sims[neighbor_index, neighbors])
            ratings.append(rating)

        return ratings
Example No. 15
File: test.py Project: src-d/kmcuda
 def test_fp16_cosine_metric(self):
     arr = numpy.empty((10000, 2), dtype=numpy.float16)
     angs = numpy.random.rand(10000) * 2 * numpy.pi
     for i in range(10000):
         arr[i] = numpy.sin(angs[i]), numpy.cos(angs[i])
     with self.stdout:
         centroids, assignments = kmeans_cuda(
             arr, 4, init="kmeans++", metric="cos", device=1, verbosity=2,
             seed=3)
     self.assertEqual(self._get_iters_number(self.stdout), 5)
     self.assertEqual(len(centroids), 4)
     for c in centroids:
         norm = numpy.linalg.norm(c)
         self.assertTrue(0.9995 < norm < 1.0005)
     dists = numpy.round(cosine_distances(centroids)).astype(int)
     self.assertTrue((dists == [
         [0, 2, 1, 1],
         [2, 0, 1, 1],
         [1, 1, 0, 2],
         [1, 1, 2, 0],
     ]).all())
     self.assertEqual(numpy.min(assignments), 0)
     self.assertEqual(numpy.max(assignments), 3)
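Why the rounded matrix takes that form: the four centroids end up roughly 90 degrees apart on the unit circle, and cosine distance is 1 - cos(angle), so orthogonal centroids are about 1 apart and opposite ones about 2 apart. A small sketch that reproduces the same 0/1/2 pattern (up to the ordering of the centroids):

import numpy as np
from sklearn.metrics.pairwise import cosine_distances

angles = np.array([0.0, 0.5, 1.0, 1.5]) * np.pi           # four directions, 90 degrees apart
pts = np.stack([np.cos(angles), np.sin(angles)], axis=1)
print(np.round(cosine_distances(pts)).astype(int))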
Example No. 16
def plot_mds(points, genres, n_points=500):
    '''
    Plots a set of documents in MDS space

    Args:
        points: dense array with coordinates of each document
        genres: list of genres for each entry in points
    Returns:
        None
    '''

    genres = np.array(genres)
    genre_sel = np.not_equal(genres, None)
    X, y = points[genre_sel], genres[genre_sel]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, train_size=n_points)

    distances = cosine_distances(X_train, X_train)
    mds = MDS(n_components=2, dissimilarity='precomputed')
    mds.fit(distances)

    plot_embedding(mds.embedding_, y_train)
"""
Testing the change in embeddings over time. Assumes
that we've already generated embeddings in output/.
"""
import pandas as pd
import numpy as np
import os, codecs
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

if __name__ == '__main__':
    out_dir = 'output'
    embedding_files = [os.path.join(out_dir, f) for f in os.listdir(out_dir)]
    # test 0: do the embeddings make semantic sense? 
    end_embedding = pd.read_csv(embedding_files[-1], sep='\t', index_col=0)
    test_words = ['you', 'go', 'road', 'give', 'cold']
    for test_word in test_words:
        sims = end_embedding.apply(lambda r: cosine_similarity(r.values.reshape(1, -1),
                                                               end_embedding.loc[test_word].values.reshape(1, -1))[0][0],
                                   axis=1)
        print('test word %s has top 10 similarities \n%s'%
              (test_word, sims.sort_values(ascending=False)[:10]))
    # TL;DR the embeddings aren't perfect but they work for more common words
    # test 1: how much have embeddings changed from start to end of data?
    start_embedding = pd.read_csv(embedding_files[1], sep='\t', index_col=0)
    embedding_deltas = abs(cosine_distances(end_embedding, start_embedding))
    embedding_deltas = pd.Series(np.diagonal(embedding_deltas), 
                                 index=end_embedding.index).sort_values(ascending=True)
    print('got embedding deltas %s'%(embedding_deltas))
Example No. 18
def get_cos_sim(dset,
                n_cats,
                dtype,
                dset_name,
                version,
                sim_type,
                IPC_dict=None):
    """
    This will take a dataset and calculate the cosine similarity within and
    between classes, producing a csv with results and updating a main doc.

    :param dset: data to be tested, csv, (pd or np array?)
    :param n_cats: number of classes (items per-class calculated as items/classes)
    :param dtype: binary, chan_dist or chanProp.  only needed for labelling
    :param dset_name: of dataset eg HBHW, HBLW, LBHW, LBLW
    :param version: number with 2 versions of each type
    :param sim_type: Describe the similarity e.g., HBHW or vary etc
    :param IPC_dict: default = None.  If the number of items per class is not
                    equal, enter a dict


    """
    print("\nrunning ** get_cos_sim()**")

    file_path = "/home/nm13850/Documents/PhD/python_v2/experiments/" \
                "within_between_dist_july2020/New_data/"
    if running_on_laptop():
        file_path = '/Users/nickmartin/Library/Mobile Documents/com~apple~CloudDocs/' \
                    'Documents/PhD/python_v2/experiments/' \
                    'within_between_dist_july2020/New_data/'

    save_path = os.path.join(file_path, 'similarity_details')

    # # enter either 'cos_sim, 'cos_dist' or 'taxi'
    distance = 'cos_sim'

    dataset = np.asarray(dset)
    items, features = np.shape(dataset)
    print(f'\ndataset: {dataset}')
    print(f'items, features: {items}, {features}')

    # add IPC dict here if class_sizes are not equal
    if IPC_dict is None:
        cat_size = int(items / n_cats)
        IPC_dict = {i: cat_size for i in range(n_cats)}
        print(f'\nequal size IPC dict\n{IPC_dict}')
    else:
        print("using IPC dict")

    # separate out the individual classes
    # start with class indices list containing zero, the index of the first class
    class_indices = [0]
    IPC_vals = list(IPC_dict.values())
    print(f'\nIPC_vals: {IPC_vals}')
    for i in range(n_cats):
        next_val = class_indices[-1] + IPC_vals[i]
        class_indices.append(next_val)

    # list of item numbers that start each class
    start_indices = class_indices[:n_cats]
    # print(f'\nstart_indices: {start_indices}')

    # list of indices to end each class
    end_indices = class_indices[1:]
    # print(f'end_indices: {end_indices}')

    # 1. define classes as slices of dataset array
    class_list = []
    names_list = []

    for cat in range(n_cats):
        this_name = f'class_{cat}'
        names_list.append(this_name)

        this_class = dataset[start_indices[cat]:end_indices[cat], :]
        class_list.append(this_class)

        # print(f'\n{this_name}\n{this_class}\n')

    # within class similarities
    # 3. make empty list to store results.
    within_list = []

    for index, this_cat in enumerate(class_list):
        # print(f'\ngetting within class cos_sim for {names_list[index]}')

        # will do all pairwise comparisons within the given category
        if distance in [
                'cos_sim', 'cosine_similarity', 'cosine_sim', 'cos_similarity'
        ]:
            within_cat = cosine_similarity(this_cat)
            # the SIMILARITY between two identical vectors will be 1
        elif distance in [
                'cos_dist', 'cosine_distance', 'cosine_dist', 'cos_distance'
        ]:
            within_cat = cosine_distances(this_cat)
            # this DISTANCE between two identical vectors will be 0
            # Cosine_distance = 1 - cosine_similarity
        elif distance in ['manhattan', 'taxi']:
            within_cat = manhattan_distances(this_cat)
        else:
            raise ValueError('must input a valid distance name')

        # print(within_cat)

        # just take the upper triangle, since the full matrix would include self-comparisons and duplicate pairs
        triangle_indices = np.triu_indices(IPC_dict[index], 1)
        values_for_descriptives = (within_cat[triangle_indices])
        # print(values_for_descriptives)

        data_similarity_descriptives = scipy.stats.describe(
            values_for_descriptives, axis=None)
        mean_sim = str(np.round(data_similarity_descriptives.mean, decimals=2))
        print(
            f"\nWithin group mean {distance} for {names_list[index]}: {mean_sim}"
        )

        within_list.append(mean_sim)

    print(f'\nwithin_list ({distance}): {within_list}\n')

    # between class similarities.
    print('\nbetween class similarities')
    '''
    For each pair of classes
    - get the similarities of each item in one class to each item in the other class.
    - take the average of the whole matrix (not just the triangle) to get the
    mean similarity between these two classes.

    These mean between-class similarities go into an n_cats x (n_cats - 1) matrix
    (n_cats - 1 because there are no diagonal entries comparing a class with itself).
    Each row shows a class's similarity to all other classes.
    - Take the average of each row to get a class's mean between-class similarity.
    
    Example below shows 4 classes (rows) and the values show which other class is being compared.
    e.g., class1 is compared with classes 2, 3, 4.  Class2 is compared with classes 1, 3, 4.
           compA   compB   compC
    class1: 2       3       4
    class2: 1       3       4
    class3: 1       2       4
    class4: 1       2       3
    '''

    class_pairs_list = list(combinations(class_list, 2))
    class_names_list = list(combinations(names_list, 2))
    class_index_list = list(combinations(range(n_cats), 2))
    print(
        f'running {len(class_index_list)} between class comparisons.\n{class_index_list}'
    )
    between_array = np.zeros(shape=(n_cats, n_cats - 1))

    for index, cat_pair in enumerate(class_pairs_list):
        cat_a = cat_pair[0]
        cat_name_a = class_names_list[index][0]

        cat_b = cat_pair[1]
        cat_name_b = class_names_list[index][1]

        print(f'\nbetween class {distance} for: {cat_name_a} and {cat_name_b}')

        # # do all pairwise comparisons between the classes
        if distance in [
                'cos_sim', 'cosine_similarity', 'cosine_sim', 'cos_similarity'
        ]:
            between_pairs_matrix = cosine_similarity(X=cat_a, Y=cat_b)
        elif distance in [
                'cos_dist', 'cosine_distance', 'cosine_dist', 'cos_distance'
        ]:
            between_pairs_matrix = cosine_distances(X=cat_a, Y=cat_b)
        elif distance in ['manhattan', 'taxi']:
            between_pairs_matrix = manhattan_distances(X=cat_a, Y=cat_b)
        else:
            raise ValueError('must input a valid distance name')

        print(f'{between_pairs_matrix}')
        mean_between_pair = np.mean(between_pairs_matrix)
        print(f'mean_between_pair: {mean_between_pair}')

        # add to the between array at both (offset) positions
        idxA, idxB = class_index_list[index]
        print(f'add to matrix position: {idxA}, {idxB}')
        between_array[idxA, idxB - 1] = mean_between_pair
        between_array[idxB, idxA] = mean_between_pair

    print(f"\nbetween_array:\n{between_array}")

    print(f'\nmean between class {distance}')
    between_list = []
    for index in range(n_cats):
        this_row = between_array[index]
        this_mean = np.mean(this_row)
        between_list.append(this_mean)
        print(index, this_mean)

    print("I want to get the mean of the between list and the within list")
    dset_between_mean = np.mean(between_list)
    dset_between_sd = np.std(between_list)
    print(
        f"dataset mean between class distance: {dset_between_mean} std.dev: {dset_between_sd}"
    )

    print(f"check within list:\n{within_list}")
    within_list_num = [float(i) for i in within_list]
    print(f"check within_list_num:\n{within_list_num}")

    dset_within_mean = np.mean(within_list_num)
    dset_within_sd = np.std(within_list_num)
    print(
        f"dataset mean within class distance: {dset_within_mean} std.dev: {dset_within_sd}"
    )

    # # save output.
    '''for each class:
       mean within
       mean between
       paired between 
    '''
    names_list.append('Dset_means')
    names_list.append('Dset_sd')
    within_list.append(dset_within_mean)
    within_list.append(dset_within_sd)
    between_list.append(dset_between_mean)
    between_list.append(dset_between_sd)

    class_sim_dict = {
        'class': names_list,
        'between': between_list,
        'within': within_list
    }
    class_sim_df = pd.DataFrame(class_sim_dict)
    print(class_sim_df)
    csv_name = f'{dset_name}_{distance}.csv'
    csv_path = os.path.join(save_path, csv_name)
    class_sim_df.to_csv(
        csv_path,
        index_label='class',
    )

    # check if a similarity summary exists
    similarity_info = [
        dtype, dset_name, sim_type, version, n_cats, dset_between_mean,
        dset_between_sd, dset_within_mean, dset_within_sd
    ]
    print(f"similarity_info:\n{similarity_info}")

    # check if the similarity summary csv already exists
    summary_name = 'similarity_summary.csv'
    print(f"\nlooking for file:\n{os.path.join(save_path, summary_name)}")
    if not os.path.isfile(os.path.join(save_path, summary_name)):
        print("making summary page")
        headers = [
            "dtype", "dset_name", 'sim_type', "version", "n_cats", "mean_b",
            "sd_b", "mean_w", "sd_w"
        ]

        similarity_overview = open(os.path.join(save_path, summary_name), 'w')
        mywriter = csv.writer(similarity_overview)
        mywriter.writerow(headers)
    else:
        print("appending to summary page")
        similarity_overview = open(os.path.join(save_path, summary_name), 'a')
        mywriter = csv.writer(similarity_overview)

    mywriter.writerow(similarity_info)
    similarity_overview.close()

    return_dict = {
        "dtype": dtype,
        "dset_name": dset_name,
        'sim_type': sim_type,
        "version": version,
        "n_cats": n_cats,
        "dset_between_mean": dset_between_mean,
        "dset_between_sd": dset_between_sd,
        "dset_within_mean": dset_within_mean,
        "dset_within_sd": dset_within_sd
    }

    return return_dict
Example No. 19
def cosine_similarity(vector_a, vector_b):
	return 1-cosine_distances(vector_a,vector_b)
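A quick sanity check (sketch): this helper should agree with sklearn's own cosine_similarity, since cosine_distances is defined as 1 - cosine_similarity.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity

a = np.array([[1.0, 2.0, 3.0]])
b = np.array([[3.0, 2.0, 1.0]])
assert np.allclose(cosine_similarity(a, b), sk_cosine_similarity(a, b))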
Example No. 20
        print(name + " failed to read")

    dist = cosine_distances(matrix)
    link = hier.linkage(dist, method="average")
    clust = hier.fcluster(link, t=0.1)
    #print(clust)


"""
print("starting predictions")
for name in os.listdir(input_test):
    streng = input_test + "/" + name + "/docs"
    vocabulary = []
    #print(name)
    for site in os.listdir(streng):
        nyStreng = input_test + "/" + name + "/docs/" + site
        voc = handle_html(nyStreng)
        vocabulary.append(voc)
    try:
        matrix = pre_process(vocabulary)
    except ValueError:
        print(name + " failed to predict")

    dist = cosine_distances(matrix)
    link = hier.linkage(dist, method="centroid", metric="euclidean")
    clust = hier.fcluster(link, t=0.08)
    d = sortClusters(clust)
    printout(name,d)
    #print(pred)

print("done")
Example No. 21
def getFeaturesFromVectors_pair(vectorData, seg1, seg2, segData):
    X = myX()
    segDilated = segData.segDilated
    segBoundSize = segData.segBoundSize
    seggSizes = segData.segSizes
    EdgeMapList = vectorData.EdgeMapList
    vectors = vectorData.segVectors
    clustersL2 = vectorData.segClustersL2
    ratios = vectorData.ratios
    regFeatures = []
    boundLine = np.logical_and(segDilated[seg1], segDilated[seg2])
    for e in EdgeMapList:
        bound = e[boundLine]
        bound = bound[bound > 0]
        regFeatures.append(np.mean(bound))  #Feature mean bound dist.

    boundOverlap1 = bound.size / segBoundSize[seg1]
    boundOverlap2 = bound.size / segBoundSize[seg2]
    regFeatures.append(max(boundOverlap1,
                           boundOverlap2))  #Feature max overlap with bound
    regFeatures.append(bound.size)  #Feature bound size
    size1 = seggSizes[seg1]
    size2 = seggSizes[seg2]
    regFeatures.append(size1 + size2)  #Feature New seg area

    X.regFeatures = regFeatures
    for ratioIdx, ratio in enumerate(ratios):
        cnnFeatures = []
        for numLayer in range(0, len(vectors[seg1][ratioIdx])):
            layerFeatures = []
            pair_dist = scipy.spatial.distance.cdist(
                vectors[seg1][ratioIdx][numLayer],
                vectors[seg2][ratioIdx][numLayer])
            layerFeatures.append(
                np.min(pair_dist))  #Feature L2 min dist in vec rep.
            layerFeatures.append(
                np.max(pair_dist))  #Feature L2 max dist in vec rep.
            layerFeatures.append(
                np.mean(pair_dist))  #Feature L2 average dist in vec rep.
            layerFeatures.append(
                bn.median(pair_dist))  #Feature L2 median dist in vec rep.
            layerFeatures.append(
                np.sqrt(
                    np.sum((clustersL2[seg1][ratioIdx][numLayer] -
                            clustersL2[seg2][ratioIdx][numLayer]
                            )**2)))  #Feature L2 dist between L2 clusters
            pair_dist = scipy.spatial.distance.cdist(
                vectors[seg1][ratioIdx][numLayer],
                vectors[seg2][ratioIdx][numLayer],
                metric='cosine')
            layerFeatures.append(
                np.min(pair_dist))  # Feature cosine min dist in vec rep.
            layerFeatures.append(
                np.max(pair_dist))  # Feature cosine max dist in vec rep.
            layerFeatures.append(
                np.mean(pair_dist))  # Feature cosine average dist in vec rep.
            layerFeatures.append(
                bn.median(pair_dist))  # Feature cosine median dist in vec rep.
            layerFeatures.append(
                cosine_distances(
                    np.array([clustersL2[seg1][ratioIdx][numLayer]]),
                    np.array([
                        clustersL2[seg2][ratioIdx][numLayer]
                    ]))[0][0])  #Feature cosine dist between L2 clusters

            cnnFeatures.append(layerFeatures)
        X.cnnFeatures[ratio] = cnnFeatures
    ImageFeatures = []
    ImageFeatures.append(
        np.sqrt((vectorData.segL[seg1] -
                 vectorData.segL[seg2])**2))  #Feature L channel dist
    ImageFeatures.append(
        np.sqrt((vectorData.segA[seg1] -
                 vectorData.segA[seg2])**2))  #Feature A channel dist
    ImageFeatures.append(
        np.sqrt((vectorData.segB[seg1] -
                 vectorData.segB[seg2])**2))  #Feature B channel dist
    X.ImageFeatures = ImageFeatures
    return X
Example No. 22
def local_measure(word, knn, bins):
    # the matrix of size len(bins) x len(bins)
    # where the cosine distance of each pair of bins is computed
    S = []
    print(word, knn)
    for xx in range(len(bins) - 2):
        bin1 = bins[xx]
        bin2 = bins[xx + 1]
        time1 = str(bin1)
        time2 = str(bin2)
        # path to the embeddings
        path = base_all + '3EMB-' + name + '-' + 'win_' + str(
            win) + '-size_' + str(size) + '-min_count_' + str(
                min_count) + '-iter_' + str(time1) + '-' + str(time2)
        # load the embeddings at time t0 using gensim KeyedVectors
        embed = KeyedVectors.load(path)
        # check if the word is in the embedding vocabulary
        if word not in embed.wv.vocab:
            print(word + ' not in base_embed\'s vocabulary')
            continue
        else:
            knn_t = embed.most_similar(word, topn=knn)
            knn_t_words = [k[0] for k in knn_t]
            knn_t_sims = [k[1] for k in knn_t]
        if xx > 0:
            S.append([0] * xx)
        else:
            S.append([])
            knn_t0 = knn_t
            knn_t0_words = knn_t_words
            time0 = str(bin1) + '_' + str(bin2)
        # only the values of S above the main diagonal are non-zero
        # this is because cosine distance is symmetric and the values on the diagonal are 0
        for x in range(xx + 1, len(bins) - 1):
            time11 = str(bins[x])
            time22 = str(bins[x + 1])
            time = time1 + '-' + time2 + '_' + time11 + '-' + time22
            time00 = time0 + '_' + time11 + '-' + time22
            print(time)
            path_t1 = base_all + '3EMB-' + name + '-' + 'win_' + str(
                win) + '-size_' + str(size) + '-min_count_' + str(
                    min_count) + '-iter_' + time11 + '-' + time22

            # load the embeddings at time t1 using gensim KeyedVectors
            embed_t1 = KeyedVectors.load(path_t1)

            if word not in embed_t1.wv.vocab:
                print(word + ' not in embed\'s vocabulary')
                continue
            else:
                knn_t1 = embed_t1.most_similar(word, topn=knn)
                knn_t1_words = [k[0] for k in knn_t1]
                knn_t1_sims = [k[1] for k in knn_t1]
            # create the second order vector as in:
            # Hamilton, William L., Jure Leskovec, and Dan Jurafsky. "Cultural shift or linguistic drift? comparing two computational measures of semantic change." Proceedings of the Conference on Empirical Methods in Natural Language Processing. Conference on Empirical Methods in Natural Language Processing. Vol. 2016. NIH Public Access, 2016.
            # Equation 2
            s_t = getSim(embed, word, knn_t_words + knn_t1_words)
            s_t1 = getSim(embed_t1, word, knn_t_words + knn_t1_words)
            dist = cosine_distances([s_t, s_t1]).tolist()[0][1]

            new_words, lost_words, diffs = comp_changes(knn_t, knn_t1)
            new_words0, lost_words0, diffs0 = comp_changes(knn_t0, knn_t1)

            writeMat(base, 'new_words_' + time + '_' + word + '_' + str(knn),
                     new_words)
            writeMat(base, 'lost_words_' + time + '_' + word + '_' + str(knn),
                     lost_words)
            writeMat(base, 'changes_' + time + '_' + word + '_' + str(knn),
                     diffs)

            writeMat(base, 'new_words_' + time00 + '_' + word + '_' + str(knn),
                     new_words0)
            writeMat(base,
                     'lost_words_' + time00 + '_' + word + '_' + str(knn),
                     lost_words0)
            writeMat(base, 'changes_' + time00 + '_' + word + '_' + str(knn),
                     diffs0)

            S[xx].append(dist)
    writeMat(base, 'Local_Similarity_' + name + '_' + word + '_' + str(knn), S)
Example No. 23
def wmdo(wvvecs,
         ref,
         cand,
         ref_lang='en',
         cand_lang='en',
         delta=0.18,
         alpha=0.1):
    '''
    wvvecs: word vectors -- retrieved from load_wv method
    ref: reference translation
    cand: candidate translation
    ref_lang: language code of the reference translation (default 'en')
    cand_lang: language code of the candidate translation (default 'en')
    delta: weight of fragmentation penalty
    alpha: weight of missing word penalty
    '''

    ref_list = get_input_words(ref)
    cand_list = get_input_words(cand)

    ref = ' '.join(ref_list)
    cand = ' '.join(cand_list)

    common_vectorizer = CountVectorizer().fit(ref_list + cand_list)

    ref_count_vector, cand_count_vector = common_vectorizer.transform(
        [ref, cand])

    ref_count_vector = ref_count_vector.toarray().ravel()
    cand_count_vector = cand_count_vector.toarray().ravel()

    dim = wvvecs[ref_lang].vector_size

    wvoc, missing = create_vocabulary(common_vectorizer, wvvecs, dim, ref_list,
                                      cand_list, ref_lang, cand_lang)

    distance_matrix = cosine_distances(wvoc)
    vocab_words = common_vectorizer.get_feature_names()
    for cand_word_idx, count in enumerate(cand_count_vector):
        if count > 0:
            most_similar_ref_indexes = np.argsort(
                distance_matrix[cand_word_idx])
            for ref_word_index in most_similar_ref_indexes[1:]:
                if ref_count_vector[ref_word_index] > 0:
                    print('{}: {}'.format(vocab_words[cand_word_idx],
                                          vocab_words[ref_word_index]))
                    break

    if np.sum(distance_matrix) == 0.0:
        return 0., {}
        #return float('inf')

    ref_count_vector = ref_count_vector.astype(np.double)
    cand_count_vector = cand_count_vector.astype(np.double)

    ref_count_vector /= ref_count_vector.sum()
    cand_count_vector /= cand_count_vector.sum()

    distance_matrix = distance_matrix.astype(np.double)
    (wmd, flow) = emd_with_flow(ref_count_vector, cand_count_vector,
                                distance_matrix)

    return wmd, {}

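    # NOTE: the early `return wmd, {}` above makes the penalty computation below unreachable as written.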
    # adding penalty
    ratio = fragmentation(ref_list, cand_list, common_vectorizer, flow)
    if ratio > 1:
        ratio = 1
    penalty = delta * ratio

    # missing words penalty
    missingwords = 0
    for w in cand_list:
        if w not in wvvecs:
            missingwords += 1
    missingratio = missingwords / len(cand_list)
    missing_penalty = alpha * missingratio

    penalty += missing_penalty

    wmd += penalty

    return wmd, missing
Example No. 24
def test_pairwise_distances():
    """ Test the pairwise_distance helper function. """
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)
    # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert_equal(S.shape[0], S.shape[1])
    assert_equal(S.shape[0], X.shape[0])
    assert_array_almost_equal(S, S2)
    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # manhattan does not support sparse matrices atm.
    assert_raises(ValueError, pairwise_distances, csr_matrix(X),
                  metric="manhattan")
    # Low-level function for manhattan can divide in blocks to avoid
    # using too much memory during the broadcasting
    S3 = manhattan_distances(X, Y, size_threshold=10)
    assert_array_almost_equal(S, S3)
    # Test cosine as a string metric versus cosine callable
    # "cosine" uses sklearn metric, cosine (function) is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Tests that precomputed metric returns pointer to, and not copy of, X.
    S = np.dot(X, X.T)
    S2 = pairwise_distances(S, metric="precomputed")
    assert_true(S is S2)
    # Test with sparse X and Y,
    # currently only supported for euclidean and cosine
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError, pairwise_distances, X, Y_sparse,
                  metric="minkowski")
Example No. 25
tfidf_matrix = tfidf_matrix.T.round(3)
tfidf_matrix.columns = data['Princesa']

tfidf_matrix

# # Point 3: Cosine distance
#
# - Compute the cosine distance between each pair of princesses
# - Which princesses are the most similar?
# - Which princesses are the most different?

# In[356]:

from sklearn.metrics.pairwise import cosine_distances

dist_cos = cosine_distances(tfidf_matrix.T.values)
dist_cos = pd.DataFrame(dist_cos,
                        columns=tfidf_matrix.columns,
                        index=tfidf_matrix.columns)
dist_cos

# In[480]:

dist_cos.max()

# In[519]:

for col in dist_cos:
    print(col)
    print(max(dist_cos[col]))
    print(" ")
Example No. 26
    # We should be getting 2 warnings: one for using Ward that is
    # deprecated, one for using the copy argument
    assert_equal(len(warning_list), 2)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", UserWarning)
        # Use the copy argument, to raise a warning
        ward_tree(X, copy=True)
    # We should be getting 1 warnings: for using the copy argument
    assert_equal(len(warning_list), 1)

    # Let's test a hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])


def test_structured_linkage_tree():
    """
    Check that we obtain the correct solution for structured linkage trees.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    for tree_builder in _TREE_BUILDERS.values():
Example No. 27
def discr_stat(X,
               Y,
               dissimilarity="euclidean",
               remove_isolates=True,
               return_rdfs=True):
    """
    Computes the discriminability statistic.
    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        Input data. If dissimilarity=='precomputed', the input should be the dissimilarity matrix.
    Y : 1d-array, shape (n_samples)
        Input labels.
    dissimilarity : str, {"euclidean" (default), "precomputed"}
        Dissimilarity measure to use:
        - 'euclidean':
            Pairwise Euclidean distances between points in the dataset.
        - 'precomputed':
            Pre-computed dissimilarities.
    remove_isolates : bool, optional, default=True
        Whether to remove data that have single label.
    return_rdfs : bool, optional, default=True
        Whether to return rdf for all data points.
    Returns
    -------
    stat : float
        Discriminability statistic.
    rdfs : array, shape (n_samples, max{len(id)})
        Rdfs for each sample. Only returned if ``return_rdfs==True``.
    """
    check_X_y(X, Y, accept_sparse=True)

    uniques, counts = np.unique(Y, return_counts=True)
    if (counts != 1).sum() <= 1:
        msg = "You have passed a vector containing only a single unique sample id."
        raise ValueError(msg)
    if remove_isolates:
        idx = np.isin(Y, uniques[counts != 1])
        labels = Y[idx]

        if dissimilarity == "euclidean" or dissimilarity == "cosine" or dissimilarity == "haversine" or \
            dissimilarity == "manhattan" or dissimilarity == "mahalanobis":
            X = X[idx]
        else:
            X = X[np.ix_(idx, idx)]
    else:
        labels = Y

    if dissimilarity == "euclidean":
        dissimilarities = nan_euclidean_distances(X)
    elif dissimilarity == "cosine":
        dissimilarities = cosine_distances(X)
    elif dissimilarity == "haversine":
        dissimilarities = haversine_distances(X)
    elif dissimilarity == "manhattan":
        dissimilarities = manhattan_distances(X)
    else:
        dissimilarities = X

    rdfs = _discr_rdf(dissimilarities, labels)
    rdfs[rdfs < 0.5] = np.nan
    stat = np.nanmean(rdfs)

    if return_rdfs:
        return stat, rdfs
    else:
        return stat
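A usage sketch with hypothetical data (it assumes the _discr_rdf helper used above is importable alongside discr_stat): 100 samples, 5 features, and 10 subject ids repeated 10 times each.

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))       # hypothetical feature matrix
Y = np.repeat(np.arange(10), 10)    # hypothetical repeated subject ids
stat, rdfs = discr_stat(X, Y, dissimilarity="cosine")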
Example No. 28
 def __cal__(self,
             a: np.ndarray,
             b: np.ndarray):
     from sklearn.metrics.pairwise import cosine_distances
     return cosine_distances(a, b)
Example No. 29
    for date in s_text.keys():
        data += 1
        bar.update(data)

        sample = s_text[date]
        sample_doc = s_all_doc[date]

        if (len(sample) < 20):
            continue

        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2)
        tf = tf_vectorizer.fit_transform(sample)
        feature_names = tf_vectorizer.get_feature_names()

        cosine_sim = cosine_similarity(tf)
        cosine_dis = cosine_distances(tf)

        #  get labels_true
        topic_labels, minor_class_labels, major_class_labels, topic_dense = get_labels_true(
            sample_doc)

        # DBSCAN
        db_group, db_labels, db_n_clusters = dbscan(cosine_dis)

        centroid_list, centroid_word_list, score_list, centroid_name_list, size_list = find_centroid_score(
            tf, db_group, feature_names, topic_labels)
        #     print type(centroid_list),centroid_list
        tweet_id_list, author_list = get_tweet_id(sample_doc, db_group)
        write_event(date, 'DBSCAN', feature_names, centroid_list,
                    centroid_word_list, score_list, centroid_name_list,
                    size_list, tweet_id_list, author_list)
Example No. 30
    def search_in_gallery(self, embedding):
        ''' Takes input embedding vector and searches it in the gallery. '''

        distances = cosine_distances(embedding, self.embeddings).reshape([-1])
        sorted_indexes = np.argsort(distances)
        return sorted_indexes, distances
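A brief usage note (attribute names are taken from the snippet; the shapes are assumptions): self.embeddings is an (N, D) gallery matrix and embedding is a single (1, D) query vector.

# indexes, dists = model.search_in_gallery(query_embedding)   # `model` and `query_embedding` are hypothetical
# top5 = indexes[:5]        # indices of the five closest gallery entries
# top5_dist = dists[top5]   # their cosine distances to the query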
Example No. 31
 def cosine_dist():
     return cosine_distances(paths)
                                                    17 + 47 * test_ind:47 * (test_ind + 1), :][sorted_mouse_bool, :, :],
                                             axis=1)
                    missing = np.isnan(train_traces[:, 0]) | np.isnan(test_traces[:, 0])
                #         train_traces = train_traces[~missing & resorted_drop, :]
                #         test_traces = test_traces[~missing & resorted_drop, :]
                train_traces = train_traces[~missing, :]
                test_traces = test_traces[~missing, :]
                if test_traces.shape[0] > 0:
                    logger.info(f'{m} {train_stage}-{test_stage}: {test_traces.shape[0]} cells remain after matching')
                else:
                    logger.warning(f'{m} {train_stage}-{test_stage}: {test_traces.shape[0]} cells matched, skipping')
                    continue

                # take cosine distance
                for c, cue in enumerate(cues):
                    cos[train_ind, test_ind, c, mc] = cosine_distances(train_traces[:, c][None, :],
                                                                      test_traces[:, c][None, :])[0][0]
                    norms[train_ind, test_ind, c,
                          mc] = (np.linalg.norm(train_traces[:, c]) -
                                np.linalg.norm(test_traces[:, c])) / np.linalg.norm(train_traces[:, c])
                    mdiff[train_ind, test_ind, c,
                          mc] = (np.mean(train_traces[:, c]) - np.mean(test_traces[:, c])) / np.mean(train_traces[:, c])
                    mdiff2[train_ind, test_ind, c, mc] = np.mean(train_traces[:, c] - test_traces[:, c])
                    mdiff3[train_ind, test_ind, c, mc] = np.mean(np.abs(train_traces[:, c] - test_traces[:, c]))
                    mdiff_cells[train_ind, test_ind, c, mc] = np.mean(train_traces[:, c]) - np.mean(test_traces[:, c])
    logger.info(f'Finished model: {mod}\n')

    # plot heatmap 
    ax = []
    fig = plt.figure(figsize=(30, 15))
    gs = fig.add_gridspec(100, 110)
    ax.append(fig.add_subplot(gs[:, 3:5]))
Example No. 33
    def fit_joint_all(self,
                      vectors,
                      orig_groups,
                      article_ids,
                      xy_embeddings,
                      sparse_matrix,
                      filtered_matrix,
                      loss_weight,
                      low_weight=0.05):
        N, D = vectors.shape
        K = self.k
        embeddings = xy_embeddings.iloc[:, 1:].values
        best_group = orig_groups

        # initialize the first 'k' elements in the dataset to be the initial centroids
        high_centroid = np.stack(vectors[:K])
        assert high_centroid.shape == (K, D)
        low_centroid = np.stack(embeddings[:K])  # low dimensional clustering

        for i in range(self.max_iterations):
            # get cosine distance between each point and the centroids, N x K
            high_dim_dist = cosine_distances(vectors, high_centroid)
            assert high_dim_dist.shape == (N, K)

            # Calculate normalized euclidean distance in low dimensional space
            low_dim_dist = euclidean_distances(embeddings, low_centroid)
            xy_range = (np.max(embeddings) - np.min(embeddings))
            max_dist = np.sqrt(xy_range * xy_range + xy_range * xy_range)
            low_dim_dist /= max_dist

            # Calculate the label distance
            country_matrix = generate_country_matrix(best_group, article_ids)
            country_label = country_matrix.dot(filtered_matrix)
            country_label = normalize(country_label, axis=1)
            label_scores = cosine_distances(filtered_matrix, country_label)

            # Calculate loss
            dis_mat = high_dim_dist * (
                1 - low_weight - loss_weight
            ) + low_dim_dist * low_weight - label_scores * loss_weight
            best_group = assign_best_groups(dis_mat, article_ids)
            assert best_group.shape == (N, 2)

            # calculate the # of articles per group
            points_per_group = np.zeros(K) + 1e-6
            np.add.at(points_per_group, best_group['country'], 1)

            # calculate the new centroids by averaging the points assigned to each cluster, in both high and low dim
            high_centroid_new = np.zeros((K, D))
            np.add.at(high_centroid_new, best_group['country'], vectors)
            high_centroid_new /= points_per_group.repeat(D).reshape(K, D)

            low_centroid_new = np.zeros((K, 2))
            np.add.at(low_centroid_new, best_group['country'], embeddings)
            low_centroid_new /= points_per_group.repeat(2).reshape(K, 2)

            # break out of the main loop if the results are optimal, i.e. the centroids don't change their
            # positions by more than our tolerance
            centroid_changes = np.sum(np.abs(high_centroid_new -
                                             high_centroid),
                                      axis=1)
            assert centroid_changes.shape == (K, )
            max_centroid_change = np.max(centroid_changes)
            high_centroid = high_centroid_new
            low_centroid = low_centroid_new
            if max_centroid_change < self.tolerance:
                break
        mean_distance = get_mean_centroid_distance(vectors, high_centroid,
                                                   best_group['country'])
        return best_group, mean_distance
Example No. 34
def columns_tfidf_cosine_distances(A: pd.DataFrame, B: pd.DataFrame, vectorizer: TfidfVectorizer):
    return cosine_distances(*[
        vectorizer.transform(df.columns) for df in (A, B)
    ])
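A usage sketch under stated assumptions: the vectorizer must already be fitted on text that covers both frames' column names (here a character n-gram fit over the hypothetical names themselves).

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

A = pd.DataFrame(columns=["customer_id", "order_date"])    # hypothetical frames
B = pd.DataFrame(columns=["client_id", "purchase_date"])
vec = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 3))
vec.fit(list(A.columns) + list(B.columns))
D = columns_tfidf_cosine_distances(A, B, vec)   # shape (2, 2): rows are A's columns, columns are B's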
Example No. 35
ngram_vectorizer = TfidfVectorizer(analyzer='char',ngram_range=(1, 3), min_df=1,sublinear_tf=True,lowercase=False)
tf = ngram_vectorizer.fit_transform(job_docs)
fnames = ngram_vectorizer.get_feature_names()
dense = tf.todense()

Cpp = [i for i, k in enumerate(fnames) if "C++" == k]
new = np.reshape(np.array(dense[:, Cpp]), num_docs)
ngramMat['C++'] = pd.Series(new, index=ngramMat.index)

R = [i for i, k in enumerate(fnames) if " R" == k]
new = np.reshape(np.array(dense[:, R]), num_docs)
ngramMat['R'] = pd.Series(new, index=ngramMat.index)

#%% remove duplicate docs

DM_docs = cosine_distances(ngramMat)

duplicates = np.zeros(num_docs)

for n in range(0,num_docs):
    doc_dupes = np.sort(np.where(DM_docs[n,:] == 0))[0][1:]
    duplicates[doc_dupes] = 1

docs_for_removal = np.where(duplicates.astype('int') == 1)[0]

ngramMat.drop(docs_for_removal, inplace=True, axis=0)

#%%setup some constants used later

word_occurences=np.sum(ngramMat)
num_words = word_occurences.shape[0]
Exemplo n.º 36
0
def findSimilarBourbons(BourbonID):
    ''' Recommend similar bourbons that are not from the same distillery. '''

    bourbons = pd.DataFrame(list(Bourbon.objects.all().values()))

    # Remove all bourbons with the same distillery (keep BourbonID)
    distillery = bourbons.loc[bourbons['BourbonID'] ==
                              BourbonID]['Distillery'].values[0]
    bourbons = bourbons.loc[(bourbons['BourbonID'] == BourbonID) |
                            (bourbons['Distillery'] != distillery)]

    # Remove items not being used for similarities (Bourbon Name, Distillery(we have location), Website, Description, id(from sqlite))
    final = bourbons.drop(
        ['Bourbon', 'Distillery', 'Website', 'Description', 'id'],
        axis=1).set_index('BourbonID')

    # Encode the categorical data values
    le = preprocessing.LabelEncoder()
    final['Style'] = le.fit_transform(final['Style'])
    final['Type'] = le.fit_transform(final['Type'])
    final['Location'] = le.fit_transform(final['Location'])

    # Calculate the different distance similarity calculations (rank by average of each rank)
    cosine = cosine_distances(final.values)
    cosine = pd.DataFrame(cosine,
                          columns=final.index.values,
                          index=final.index)[BourbonID]

    euclidean = euclidean_distances(final.values)
    euclidean = pd.DataFrame(euclidean,
                             columns=final.index.values,
                             index=final.index)[BourbonID]

    manhattan = manhattan_distances(final.values)
    manhattan = pd.DataFrame(manhattan,
                             columns=final.index.values,
                             index=final.index)[BourbonID]

    allSimilarities = pd.concat([cosine, euclidean, manhattan], axis=1)
    allSimilarities.columns = [
        'Cosine Distance', 'Euclidean Distance', 'Manhattan Distance'
    ]

    allRank = allSimilarities.rank(axis=0)
    finalRank = allRank.mean(axis=1)

    bourbons['SimilaritiesRank'] = list(finalRank.values)
    bourbons = bourbons.sort_values(by="SimilaritiesRank")

    # For each recommendation, remove recurring distilleries (so I don't get bourbons from the same distillery)
    bourbons = bourbons[bourbons['BourbonID'] != BourbonID]
    for index, row in bourbons.iterrows():
        if row['BourbonID'] in bourbons['BourbonID'].tolist():
            distillery = bourbons.loc[bourbons['BourbonID'] ==
                                      row['BourbonID']]['Distillery'].values[0]
            bourbons = bourbons.loc[(bourbons['BourbonID'] == row['BourbonID'])
                                    | (bourbons['Distillery'] != distillery)]

    topBourbons = bourbons.head()

    topBourbons.apply(saveSimilarities, axis=1)
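
The rank-averaging idea used above can be seen in isolation on a tiny, made-up distance table: each metric's column is ranked independently (rank 1 = smallest distance) and the per-row mean of those ranks gives the final ordering.

import pandas as pd

allSimilarities = pd.DataFrame({
    'Cosine Distance':    [0.10, 0.40, 0.25],
    'Euclidean Distance': [2.00, 5.00, 3.50],
    'Manhattan Distance': [3.00, 9.00, 4.00],
}, index=['bourbon_a', 'bourbon_b', 'bourbon_c'])

allRank = allSimilarities.rank(axis=0)   # rank within each distance column
finalRank = allRank.mean(axis=1)         # average rank across the three metrics
print(finalRank.sort_values())           # bourbon_a first, bourbon_b last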
Exemplo n.º 37
0
import sys
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_distances
from facedata import FaceData, Spotting

face_data = FaceData()
items = face_data.find_by_location('entrance')
embs = np.stack([item.vector for item in items])
print('faces found: %d ' % embs.shape[0])

db = DBSCAN(0.45, metric='precomputed')
dist = cosine_distances(embs)

clusters = db.fit_predict(dist)
print('clustered: %d' % (len(set(clusters)) - 1))
groups = [[] for i in range(len(set(clusters)) - 1)]
for i, label in enumerate(clusters):
    if label == -1:
        continue
    groups[label].append(i)


def triage(group):
    keyed_by_hour = {}
    for item in group:
        key = item.spotted_at.split(' ')[1].split(':')[0]
        if keyed_by_hour.get(key, None) is None:
            keyed_by_hour[key] = []
        keyed_by_hour[key].append(item)
    keep = []
Exemplo n.º 38
0
# extract the terms-by-documents matrix
# in scipy compressed sparse column format
sparse_movies_tdm = tdm_method.fit_transform(parsed_text)
# convert sparse matrix into regular terms-by-documents matrix
movies_tdm = sparse_movies_tdm.todense()
# define the documents-by-terms matrix
movies_dtm = movies_tdm.transpose()

# dissimilarity measures and multidimensional scaling
# consider alternative pairwise distance metrics from sklearn modules
# euclidean_distances, cosine_distances, manhattan_distances (city-block)
# note that different metrics provide different solutions
# movies_distance_matrix = euclidean_distances(movies_tdm)
# movies_distance_matrix = manhattan_distances(movies_tdm)
movies_distance_matrix = cosine_distances(movies_tdm)

mds_method = manifold.MDS(n_components = 2, random_state = 9999,\
    dissimilarity = 'precomputed')
mds_fit = mds_method.fit(movies_distance_matrix)
mds_coordinates = mds_method.fit_transform(movies_distance_matrix)

# plot tagline text for years in two dimensions
# defined by multidimensional scaling
plt.figure()
plt.scatter(mds_coordinates[:,0],mds_coordinates[:,1],\
    facecolors = 'none', edgecolors = 'none')  # plots points in white (invisible)
labels = []
for iyear in range(1974, 2014):
    labels.append(str(iyear))
for label, x, y in zip(labels, mds_coordinates[:, 0], mds_coordinates[:, 1]):
Exemplo n.º 39
0
def cosine(X, Y=None, Y_norm_squared=None, squared=False):
    return cosine_distances(X, Y)
Exemplo n.º 41
0
# using the word2vec model
word2idx = tokenizer.word_index
idx2word = {v:k for k, v in word2idx.items()}

# retrieve the weights from the first dense layer. This will convert
# the input vector from a one-hot sum of two words to a dense 300 
# dimensional representation
W, b = model.layers[0].get_weights()

idx2emb = {}    
for word in word2idx.keys():
    wid = word2idx[word]
    vec_in = ohe.fit_transform(np.array(wid)).todense()
    vec_emb = np.dot(vec_in, W)
    idx2emb[wid] = vec_emb

for word in ["stupid", "alice", "succeeded"]:
    wid = word2idx[word]
    source_emb = idx2emb[wid]
    distances = []
    for i in range(1, vocab_size):
        if i == wid:
            continue
        target_emb = idx2emb[i]
        distances.append(((wid, i), 
                         cosine_distances(source_emb, target_emb)))
    sorted_distances = sorted(distances, key=operator.itemgetter(1))[0:10]
    predictions = [idx2word[x[0][1]] for x in sorted_distances]
    print("{:s} => {:s}".format(word, ", ".join(predictions)))
Exemplo n.º 42
0
    def analyse(self, test_data, mode):
        # Vectorise
        cluster_test = self.dv.transform(test_data)
        # Scale
        cluster_test = self.scaler.transform(cluster_test)

        cluster_results = []

        if self.number_of_authors is None:
            for i in range(len(cluster_test)):
                closest_center = None
                closest_distance = None
                for j in range(len(self.centers)):
                    # distance between this point and center j
                    # ([0][1] of the 2x2 pairwise matrix; [0][0] is always 0)
                    distance = cosine_distances(
                        [cluster_test[i], self.centers[j]])[0][1]
                    if closest_distance is None:
                        closest_distance = distance
                        closest_center = j
                    elif distance < closest_distance:
                        closest_distance = distance
                        closest_center = j
                self.predictions.append(closest_center)
        else:
            self.predictions = self.kmeans.predict(cluster_test)

        feature_significance_level = 2
        mode_code = 1
        if mode == 'diverge':
            mode_code = 2
            feature_significance_level = 0.2

        for i in range(len(self.predictions)):
            prediction = self.predictions[i]

            result_set = {}
            result_set["significant_features"] = []
            result_set["cluster"] = int(prediction)
            result_set['dist'] = cosine_distances(
                [cluster_test[i], self.centers[prediction]])[0][1] * 100

            for j in range(len(self.centers[prediction])):
                feature_name = self.feats[j]
                if feature_name == 'capital_count':
                    print(feature_name)
                unscaled_test_value = 0
                try:
                    unscaled_test_value = test_data[i][feature_name]
                except:
                    pass
                centroid_value = self.centers[prediction][j]
                if centroid_value != 0 and unscaled_test_value > 0:
                    test_value = cluster_test[i][j]
                    feature_difference = abs(test_value - centroid_value)
                    if mode_code == 2:
                        if feature_significance_level > feature_difference and unscaled_test_value > 0:
                            result_set["significant_features"].append(
                                (feature_name, feature_difference, test_value,
                                 centroid_value))
                    else:
                        if feature_difference > feature_significance_level and unscaled_test_value > 0:
                            result_set["significant_features"].append(
                                (feature_name, feature_difference, test_value,
                                 centroid_value))

            cluster_results.append(result_set)

        return cluster_results
Exemplo n.º 43
0
def test_pairwise_distances():
    """ Test the pairwise_distance helper function. """
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)
    # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert_equal(S.shape[0], S.shape[1])
    assert_equal(S.shape[0], X.shape[0])
    assert_array_almost_equal(S, S2)
    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # manhattan does not support sparse matrices atm.
    assert_raises(ValueError, pairwise_distances, csr_matrix(X),
                  metric="manhattan")
    # Low-level function for manhattan can divide in blocks to avoid
    # using too much memory during the broadcasting
    S3 = manhattan_distances(X, Y, size_threshold=10)
    assert_array_almost_equal(S, S3)
    # Test cosine as a string metric versus cosine callable
    # "cosine" uses sklearn metric, cosine (function) is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Tests that precomputed metric returns pointer to, and not copy of, X.
    S = np.dot(X, X.T)
    S2 = pairwise_distances(S, metric="precomputed")
    assert_true(S is S2)
    # Test with sparse X and Y,
    # currently only supported for euclidean and cosine
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError, pairwise_distances, X, Y_sparse,
                  metric="minkowski")
Exemplo n.º 44
0
def cosine_distance(v1, v2):
    #As cosine similarity interval is [-1.0, 1.0], the cosine distance interval is [0.0, 2.0].
    #This normalizes the cosine distance to interval [0.0, 1.0]
    return pairwise.cosine_distances(v1, v2) / 2.0
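
A quick illustrative check of that scaling (the vectors are made up): identical directions give 0.0, and opposite directions give 1.0 after the raw cosine distance of 2.0 is halved.

import numpy as np

v1 = np.array([[1.0, 0.0]])
v2 = np.array([[-1.0, 0.0]])
print(cosine_distance(v1, v1))   # [[0.]] -- same direction
print(cosine_distance(v1, v2))   # [[1.]] -- opposite direction, raw distance 2.0 halved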
Exemplo n.º 45
0
         'control': 3.721765211295327,
         'democratic': 3.1026721743330414,
         'governments': 4.167571323949673,
         'in': 0.0009654063501214492,
         'law': 2.4538226269605703,
         'popular': 2.764478952022998,
         'response': 4.261461747058352,
         'to': 0.04694493768179923}

word_indices = [map_index_to_word[word] for word in tweet.keys()]

tweet_tf_idf = scipy.sparse.csr_matrix((list(tweet.values()), ([0] * len(word_indices), word_indices)),
                                       shape=(1, tf_idf.shape[1]))
obama_tf_idf = tf_idf[obama_id]
print("The cosine distance between Obama's article and the tweet is {:.6e}."
      .format(cosine_distances(obama_tf_idf, tweet_tf_idf)[0, 0]))
print('''
With cosine distances, the tweet is "nearer" to Barack Obama.
Ignoring article lengths completely resulted in nonsensical results.
In practice, it is common to enforce maximum or minimum document lengths.
''')

# QUIZ QUESTIONS:
print("Quiz Questions:")
# 1. Among the words that appear in both Barack Obama and Francisco Barrio,
#    take the 5 that appear most frequently in Obama.
#    How many of the articles in the Wikipedia dataset contain all of those 5 words?
print("1. Among the words that appear in both Barack Obama and Francisco Barrio, ")
print("   take the 5 that appear most frequently in Obama.")
print("   There are {:d} articles in the Wikipedia dataset contain all of those 5 words.\n"
      .format(has_top_words_count[True]))
Exemplo n.º 46
0
def computeSimilarities(answers, response, vectorizer):
    response_transformed = vectorizer.transform([response])
    answers_transformed = vectorizer.transform(answers)
    distances = cosine_distances(response_transformed, answers_transformed)
    return [1 - x for x in distances[0]]
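
One way the helper above might be called, with a throwaway vectorizer and texts (none of these names or strings come from the surrounding project):

from sklearn.feature_extraction.text import TfidfVectorizer

answers = ["the cat sat on the mat",
           "dogs are loyal animals",
           "cats like to sleep on mats"]
response = "a cat sleeping on a mat"

vectorizer = TfidfVectorizer().fit(answers + [response])
print(computeSimilarities(answers, response, vectorizer))
# one cosine-similarity score per answer; higher means closer to the response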
Exemplo n.º 47
0
print("Test score: {:.3f}, accuracy: {:.3f}".format(score[0], score[1]))

# using the word2vec model
word2idx = tokenizer.word_index
idx2word = {v: k for k, v in word2idx.items()}

# retrieve the weights from the first dense layer. This will convert
# the input vector from a one-hot sum of two words to a dense 300
# dimensional representation
W, b = model.layers[0].get_weights()

idx2emb = {}
for word in word2idx.keys():
    wid = word2idx[word]
    vec_in = ohe.fit_transform(np.array(wid)).todense()
    vec_emb = np.dot(vec_in, W)
    idx2emb[wid] = vec_emb

for word in ["stupid", "alice", "succeeded"]:
    wid = word2idx[word]
    source_emb = idx2emb[wid]
    distances = []
    for i in range(1, vocab_size):
        if i == wid:
            continue
        target_emb = idx2emb[i]
        distances.append(((wid, i), cosine_distances(source_emb, target_emb)))
    sorted_distances = sorted(distances, key=operator.itemgetter(1))[0:10]
    predictions = [idx2word[x[0][1]] for x in sorted_distances]
    print("{:s} => {:s}".format(word, ", ".join(predictions)))
Exemplo n.º 48
0
def test_pairwise_distances():
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)
    # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert_equal(S.shape[0], S.shape[1])
    assert_equal(S.shape[0], X.shape[0])
    assert_array_almost_equal(S, S2)
    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Low-level function for manhattan can divide in blocks to avoid
    # using too much memory during the broadcasting
    S3 = manhattan_distances(X, Y, size_threshold=10)
    assert_array_almost_equal(S, S3)
    # Test cosine as a string metric versus cosine callable
    # "cosine" uses sklearn metric, cosine (function) is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Test with sparse X and Y,
    # currently only supported for Euclidean, L1 and cosine.
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
    S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
    assert_array_almost_equal(S, S2)
    S2 = manhattan_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError, pairwise_distances, X, Y_sparse,
                  metric="minkowski")

    # Test that a value error is raised if the metric is unknown
    assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
Exemplo n.º 49
0
    plt.annotate(label, (x_value, y_value),
                 xytext=(0, space),
                 textcoords='offset points',
                 ha='center',
                 va=va)
fig = plt.gcf()
plt.show()
fig.savefig(os.path.join("Outputs", "lsk_sparse_kmeans_user_counts_6.png"))

print(kmeans.inertia_)

####################################
# Try mds on sparse counts
user_counts.drop(['kcluster'], axis=1, inplace=True)
user_affinity = cosine_distances(user_counts)
clf = manifold.MDS(n_components=2, dissimilarity='precomputed')
ny_mds = clf.fit_transform(user_affinity)

plt.scatter(ny_mds[:, 0], ny_mds[:, 1])
plt.xlabel("First Component")
plt.ylabel("Second Component")
fig = plt.gcf()
plt.show()
fig.savefig(os.path.join("Outputs", "lsmk_sparse_mds.png"))

# Check silhouette scores
range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]
for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters).fit(ny_mds)
    cluster_labels = kmeans.fit_predict(ny_mds)
Exemplo n.º 50
0
def test_pairwise_distances():
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)

    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)

    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)

    # Test haversine distance
    # The data should be valid latitude and longitude
    X = rng.random_sample((5, 2))
    X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi/2
    X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi
    S = pairwise_distances(X, metric="haversine")
    S2 = haversine_distances(X)
    assert_array_almost_equal(S, S2)

    # Test haversine distance, with Y != X
    Y = rng.random_sample((2, 2))
    Y[:, 0] = (Y[:, 0] - 0.5)*2*np.pi/2
    Y[:, 1] = (Y[:, 1] - 0.5)*2*np.pi
    S = pairwise_distances(X, Y, metric="haversine")
    S2 = haversine_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # "cityblock" uses scikit-learn metric, cityblock (function) is
    # scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert_equal(S.shape[0], S.shape[1])
    assert_equal(S.shape[0], X.shape[0])
    assert_array_almost_equal(S, S2)

    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)

    # Test cosine as a string metric versus cosine callable
    # The string "cosine" uses sklearn.metric,
    # while the function cosine is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)

    # Test with sparse X and Y,
    # currently only supported for Euclidean, L1 and cosine.
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
    S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
    assert_array_almost_equal(S, S2)
    S2 = manhattan_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError, pairwise_distances, X, Y_sparse,
                  metric="minkowski")

    # Test that a value error is raised if the metric is unknown
    assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
Exemplo n.º 51
0
def Similarity(filename, resumename):

    array_main = filename
    #df_main = pd.read_csv(filename)
    #df_main = df_main.drop(["Unnamed: 0"], axis = 1)
    #array_main = df_main.values
    #-------------------------
    dataset = "resume_dataset.csv"
    df = table_preprocess(dataset)
    # Create lists of resume texts and their categories

    jd = df['Resume'].tolist()
    categories = df['Category'].tolist()
    print(len(categories))

    #-------------------------
    # Resume vector

    data = Word2Vec_Vectorize(str(resumename))
    data_array = np.array(data)
    data_array_reshaped = data_array.reshape(1, -1)

    #-------------------------

    cos_dist = []

    for vec in array_main:
        vec = np.array(vec)
        vec = vec.reshape(1, -1)
        cos_dist.append(float(cosine_distances(vec, data_array_reshaped)))

    #-----------------------

    ps = PorterStemmer()
    key_list = []

    for j in jd:
        key = ''
        w = set()
        for word in keywords(j).split('\n'):
            w.add(ps.stem(word))
        for x in w:
            key += '{} '.format(x)
        key_list.append(key)

    print(len(cos_dist))
    summary = pd.DataFrame({
        'Cosine Distances': cos_dist,
        "Category": categories,
        'Resume': jd
    })

    z = summary.sort_values('Cosine Distances', ascending=False)
    z.to_csv('Summary_res_vec.csv', encoding="utf-8")

    #--------------------------------
    # Plot graphs
    # array_main = df_main.values
    # array_list = array_main.tolist()

    #data_list = data[0]
    #array_list.append(data_list)

    #mean_vec = array_list
    #plot_pca(array_list)

    #plot_pca(array_list)

    return z.head()
Exemplo n.º 52
0
def test_pairwise_distances():
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)
    # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert_equal(S.shape[0], S.shape[1])
    assert_equal(S.shape[0], X.shape[0])
    assert_array_almost_equal(S, S2)
    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Low-level function for manhattan can divide in blocks to avoid
    # using too much memory during the broadcasting
    S3 = manhattan_distances(X, Y, size_threshold=10)
    assert_array_almost_equal(S, S3)
    # Test cosine as a string metric versus cosine callable
    # "cosine" uses sklearn metric, cosine (function) is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Test with sparse X and Y,
    # currently only supported for Euclidean, L1 and cosine.
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
    S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
    assert_array_almost_equal(S, S2)
    S2 = manhattan_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError,
                  pairwise_distances,
                  X,
                  Y_sparse,
                  metric="minkowski")

    # Test that a value error is raised if the metric is unknown
    assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
Exemplo n.º 54
0
def test_pairwise_distances():
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)

    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)

    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Check to ensure NaNs work with pairwise_distances.
    X_masked = rng.random_sample((5, 4))
    Y_masked = rng.random_sample((2, 4))
    X_masked[0, 0] = np.nan
    Y_masked[0, 0] = np.nan
    S_masked = pairwise_distances(X_masked, Y_masked, metric="nan_euclidean")
    S2_masked = nan_euclidean_distances(X_masked, Y_masked)
    assert_array_almost_equal(S_masked, S2_masked)
    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)

    # Test haversine distance
    # The data should be valid latitude and longitude
    X = rng.random_sample((5, 2))
    X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi / 2
    X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi
    S = pairwise_distances(X, metric="haversine")
    S2 = haversine_distances(X)
    assert_array_almost_equal(S, S2)

    # Test haversine distance, with Y != X
    Y = rng.random_sample((2, 2))
    Y[:, 0] = (Y[:, 0] - 0.5) * 2 * np.pi / 2
    Y[:, 1] = (Y[:, 1] - 0.5) * 2 * np.pi
    S = pairwise_distances(X, Y, metric="haversine")
    S2 = haversine_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # "cityblock" uses scikit-learn metric, cityblock (function) is
    # scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert S.shape[0] == S.shape[1]
    assert S.shape[0] == X.shape[0]
    assert_array_almost_equal(S, S2)

    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert S.shape[0] == X.shape[0]
    assert S.shape[1] == Y.shape[0]
    assert_array_almost_equal(S, S2)

    # Test cosine as a string metric versus cosine callable
    # The string "cosine" uses sklearn.metric,
    # while the function cosine is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert S.shape[0] == X.shape[0]
    assert S.shape[1] == Y.shape[0]
    assert_array_almost_equal(S, S2)

    # Test with sparse X and Y,
    # currently only supported for Euclidean, L1 and cosine.
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
    S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
    assert_array_almost_equal(S, S2)
    S2 = manhattan_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # Test that scipy distance metrics throw an error if sparse matrix given
    with pytest.raises(TypeError):
        pairwise_distances(X_sparse, metric="minkowski")
    with pytest.raises(TypeError):
        pairwise_distances(X, Y_sparse, metric="minkowski")

    # Test that a value error is raised if the metric is unknown
    with pytest.raises(ValueError):
        pairwise_distances(X, Y, metric="blah")
Exemplo n.º 55
0


"""
# extra work : eu-distance of X from origin
eu_origin_normX = euclidean_distances(norm_X, [[0,0,0,0]])
eu_origin_X = euclidean_distances(X, [[0,0,0,0]])
print(eu_origin_normX)
print(eu_origin_X)
"""

"""
c.ii Cosine distance = 1-Cosine_similarity
"""

cosine_dist = cosine_distances(norm_X, norm_X)
#print(cosine_dist)
print("Cosine matrix calculated for normalized X")
plt.matshow(cosine_dist)
plt.title("Cosine distance")
plt.show()
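
As a sanity check of the relation noted in the comment above, cosine_distances should agree with 1 minus cosine_similarity up to floating-point error (toy data, not the norm_X used here):

import numpy as np
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

toy = np.random.RandomState(0).rand(5, 4)
assert np.allclose(cosine_distances(toy), 1.0 - cosine_similarity(toy))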



"""
from sklearn.neighbors import DistanceMetric


maha1 = DistanceMetric.get_metric('mahalanobis')
maha = maha1.pairwise(norm_X, 'mahalanobis', 'V')
print("mahahaaaaa")
Exemplo n.º 56
0
def cosine_metric(w, X, y):
    return cosine_distances([X.dot(w)], [y])[0, 0]
plt.title('Distribution of document length')
plt.xlabel('# of words')
plt.ylabel('Percentage')
plt.rcParams.update({'font.size': 16})
plt.tight_layout()
plt.show()


# drawback of cosine metric: ignores text length completely
tweet = {'act': 3.4597778278724887,
         'control': 3.721765211295327,
         'democratic': 3.1026721743330414,
         'governments': 4.167571323949673,
         'in': 0.0009654063501214492,
         'law': 2.4538226269605703,
         'popular': 2.764478952022998,
         'response': 4.261461747058352,
         'to': 0.04694493768179923}


word_indices = [map_index_to_word[word] for word in tweet.keys()]
#                               data                row id's         col id's
tweet_tf_idf = csr_matrix((list(tweet.values()), ([0] * len(word_indices), word_indices)),
                          shape=(1, tf_idf.shape[1]))
obama_tf_idf = tf_idf[35817]
print(cosine_distances(obama_tf_idf, tweet_tf_idf))

distances, indices = model2_tf_idf.kneighbors(obama_tf_idf, n_neighbors=10)
print(distances)