Example #1
def test_pairwise_distances():
    """ Test the pairwise_distance helper function. """
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)
    # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert_equal(S.shape[0], S.shape[1])
    assert_equal(S.shape[0], X.shape[0])
    assert_array_almost_equal(S, S2)
    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Test cosine as a string metric versus cosine callable
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Tests that precomputed metric returns pointer to, and not copy of, X.
    S = np.dot(X, X.T)
    S2 = pairwise_distances(S, metric="precomputed")
    assert_true(S is S2)
    # Test with sparse X and Y
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError, pairwise_distances, X, Y_sparse,
                  metric="minkowski")
Example #2
def kmeans(X, k, centroids=None, steps=20, verbose=0):
    if centroids is None:
        centroids = X[np.random.choice(np.arange(X.shape[0]), size=k)]

        if sp.sparse.issparse(centroids):
            centroids = centroids.toarray()

    for step in range(steps):
        D = euclidean_distances(centroids, X, squared=False)  # rows are normalized, so this ranks the same as cosine distance
        clusters = D.argmin(axis=0)

        new_centroids, k = cluster_centroids(X, clusters, k)
        
        J = np.abs((new_centroids ** 2).sum() - (centroids ** 2).sum())
        if verbose and step % 10 == 0:
            print('step %d... J=%0.4f' % (step, J))

        if J < 1e-6:
            break

        centroids = new_centroids

    if verbose:
        print('converged after step=%d, final J=%0.4f' % (step, J))
    D = euclidean_distances(centroids, X, squared=False)
    clusters = D.argmin(axis=0)
    return clusters, k, centroids
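The helper cluster_centroids is called above but not shown. A minimal sketch of what it might look like, assuming a dense ndarray X and that the helper returns the mean of each non-empty cluster together with the (possibly reduced) cluster count:

import numpy as np

def cluster_centroids(X, clusters, k):
    # Hypothetical helper: recompute each centroid as the mean of the points
    # assigned to it, dropping clusters that received no points.
    centroids = [X[clusters == c].mean(axis=0) for c in range(k)
                 if np.any(clusters == c)]
    centroids = np.vstack(centroids)
    return centroids, centroids.shape[0]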
Example #3
def get_top_k_match(k, source, targets, source_embeddings, target_embeddings):

	result_dict_average = {}
	result_dict_average_tfidf = {}
	result_dict_sum = {}
	for t in targets:
	
	
		distance_average = euclidean_distances(vector_averaging(source.split(" "),source_embeddings,DIMENSION),vector_averaging(t.split(" "),target_embeddings,DIMENSION))[0][0]
		distance_average_tfidf = euclidean_distances(vector_averaging_with_tfidf(source.split(" "),source_embeddings,cs_word2weight,DIMENSION),vector_averaging_with_tfidf(t.split(" "),target_embeddings,java_word2weight,DIMENSION))[0][0]
		# distance_sum = euclidean_distances(vector_summing(source.split(" "),source_embeddings,DIMENSION),vector_summing(t.split(" "),target_embeddings,DIMENSION))[0][0]
		# distance_sum_tfidf = euclidean_distances(vector_summing_with_tfidf(source.split(" "),source_embeddings,cs_word2weight,DIMENSION),vector_averaging_with_tfidf(t.split(" "),target_embeddings,java_word2weight,DIMENSION))[0][0]
		
		result_dict_average[t] = distance_average
		result_dict_average_tfidf[t] = distance_average_tfidf
		# result_dict_sum[t] = distance_sum
	

	sorted_result_average = sorted(result_dict_average.items(), key=operator.itemgetter(1))

	
	sorted_result_average_tfidf = sorted(result_dict_average_tfidf.items(), key=operator.itemgetter(1))

	
	# sorted_result_sum = sorted(result_dict_sum.items(), key=operator.itemgetter(1))
	return sorted_result_average[:k], sorted_result_average_tfidf[:k] #sorted_result_sum[:k]
Example #4
def make_sample_df(labels, np, labeled_data, limit, algorithm_name, dims, cores):
  used_labels = np.unique(labels)[0:3]
  label_dfs = []
  label = used_labels[0]
  
  # sub-sample the stratified subset
  subset = labeled_data[labeled_data[:,0] == label,1:]   # select all those elements with this label
  num_samples = min(limit,subset.shape[0])
  indices = np.arange(subset.shape[0])
  np.random.shuffle(indices)
  label_pts = subset[indices[:num_samples],:]
  
  # repeat for the same number of pts from one opposing label
  first_comparators = labeled_data[labeled_data[:,0] == label_opposites[label][0],1:]
  num_samples = min(limit,first_comparators.shape[0])
  indices = np.arange(first_comparators.shape[0])
  np.random.shuffle(indices)
  opposing_pts = first_comparators[indices[:num_samples],:]      
  distances = euclidean_distances(label_pts,opposing_pts)
  num_records = distances.size      
  label_dfs.append(pd.DataFrame({"distances": distances.ravel(), "dimension": [dims for i in range(num_records)], "label": [label_dict[label] for i in range(num_records)], "opposing label": [label_dict[label_opposites[label][0]] for i in range(num_records)], "algorithm": [algorithm_name for i in range(num_records)]}))      
  
  # repeat for the same number of pts from the other opposing label
  second_comparators = labeled_data[labeled_data[:,0] == label_opposites[label][1],1:]
  num_samples = min(limit,second_comparators.shape[0])
  indices = np.arange(second_comparators.shape[0])
  np.random.shuffle(indices)
  opposing_pts = second_comparators[indices[:num_samples],:]      
  distances = euclidean_distances(label_pts,opposing_pts)
  num_records = distances.size      
  label_dfs.append(pd.DataFrame({"distances": distances.ravel(), "dimension": [dims for i in range(num_records)], "label": [label_dict[label] for i in range(num_records)], "opposing label": [label_dict[label_opposites[label][1]] for i in range(num_records)], "algorithm": [algorithm_name for i in range(num_records)]}))       
      
  return label_dfs
Example #5
    def _fit_process_bagirov(self, X):
        """
        Clusters using the global K-means algorithm Bagirov variation
        :param X:
        :return:
        """

        # Create a KNN structure for fast search
        self._neighbors = NearestNeighbors()
        self._neighbors.fit(X)

        # Compute the centroid of the dataset
        centroids = sum(X) / X.shape[0]
        assignments = [0 for i in range(X.shape[0])]

        centroids.shape = (1, X.shape[1])

        # compute the distance of the examples to the centroids
        mindist = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            mindist[i] = \
            euclidean_distances(X[i].reshape(1, -1), centroids[assignments[i]].reshape(1, -1), squared=True)[0]

        for k in range(2, self.n_clusters + 1):
            newCentroid = self._compute_next_centroid(X, centroids, assignments, mindist)
            centroids = np.vstack((centroids, newCentroid))
            km = KMeans(n_clusters=k, init=centroids, n_init=1)
            km.fit(X)
            assignments = km.labels_
            for i in range(X.shape[0]):
                mindist[i] = \
                euclidean_distances(X[i].reshape(1, -1), centroids[assignments[i]].reshape(1, -1), squared=True)[0]

        return km.cluster_centers_, km.labels_, km.inertia_
Example #6
def test_euclidean_distances_with_norms(dtype, y_array_constr):
    # check that we still get the right answers with {X,Y}_norm_squared
    # and that we get a wrong answer with wrong {X,Y}_norm_squared
    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 10)).astype(dtype, copy=False)
    Y = rng.random_sample((20, 10)).astype(dtype, copy=False)

    # norms will only be used if their dtype is float64
    X_norm_sq = (X.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1)
    Y_norm_sq = (Y.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1)

    Y = y_array_constr(Y)

    D1 = euclidean_distances(X, Y)
    D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
    D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
    D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq,
                             Y_norm_squared=Y_norm_sq)
    assert_allclose(D2, D1)
    assert_allclose(D3, D1)
    assert_allclose(D4, D1)

    # check we get the wrong answer with wrong {X,Y}_norm_squared
    wrong_D = euclidean_distances(X, Y,
                                  X_norm_squared=np.zeros_like(X_norm_sq),
                                  Y_norm_squared=np.zeros_like(Y_norm_sq))
    with pytest.raises(AssertionError):
        assert_allclose(wrong_D, D1)
Example #7
def bhargavi_gowda_score(X, labels):
    """
    Score from:

    Bhargavi, M. & Gowda, S. D. "A novel validity index with dynamic cut-off for determining true clusters"
    Pattern Recognition , 2015, 48, 3673 - 3687

    :param X:
    :param labels:
    :return:
    """
    llabels = np.unique(labels)
    poslabels = maplabels(llabels)

    nclust = len(llabels)
    nex = len(labels)

    # Centroid of the data

    centroid = np.zeros((1, X.shape[1]))
    centroid += np.sum(X, axis=0)
    centroid /= X.shape[0]

    # Compute SSB and intracluster distance
    ccentroid = np.zeros((nclust, X.shape[1]))
    dist = 0.0
    for idx in llabels:
        center = np.zeros((1, X.shape[1]))
        center_mask = labels == idx
        center += np.sum(X[center_mask], axis=0)
        center /= center_mask.sum()
        ccentroid[poslabels[idx]] = center
        dvector = euclidean_distances(centroid.reshape(1, -1), ccentroid[poslabels[idx]].reshape(1, -1), squared=True)
        dist += dvector.sum() * center_mask.sum()

    SSB = dist / len(labels)

    # Compute SSW
    dist = 0.0
    Intra = 0.0
    for idx in llabels:
        center_mask = labels == idx
        dvector = euclidean_distances(X[center_mask], ccentroid[poslabels[idx]].reshape(1, -1), squared=True)
        dist += dvector.sum()
        sdvector = euclidean_distances(X[center_mask], ccentroid[poslabels[idx]].reshape(1, -1), squared=False)
        Intra += sdvector.sum()

    SSW = dist / len(labels)

    SST = SSB + SSW


    # Centroids distance matrix
    cdistances = euclidean_distances(ccentroid, squared=False)

    Inter = np.sum(cdistances)/(nclust**2)


    return(np.abs((SSW/SSB)*SST) - (Intra/Inter) - (nex - nclust))
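maplabels is used above but not defined in this snippet. A minimal sketch, assuming it simply maps each distinct label value to a consecutive row index of the centroid matrix:

def maplabels(llabels):
    # Hypothetical helper: label value -> position in the centroid array.
    return {label: pos for pos, label in enumerate(llabels)}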
Example #8
    def getSimMat(self, type = 'euclidean', ftr_type = 'data', orderFlag = True, pca_dim=20):
        if ftr_type == 'ftr':
            #use input features
            self.slctData = [ts for ts in self.slctData if ((ts.ftr is not None) and (len(ts.ftr) > 0))]
            dataMat = [ts.ftr for ts in self.slctData]
        elif ftr_type == 'data':
            #use input data
            dataMat = [ts.val for ts in self.slctData]
        else:
            print('unknown ftr_type for ftr_type:', ftr_type)
        if pca_dim > len(dataMat):
            pca_dim = int(math.ceil(len(dataMat)/2.0))

        if type  == 'euclidean': #euclidean distance based on time series data
            self.simMat = skmpw.euclidean_distances(dataMat)
        elif type == 'pca_euc': #extract feature based on PCA, then use Euclidean distance
            pca = skd.PCA(n_components=pca_dim)
            dataMat = pca.fit_transform(dataMat)
            self.simMat = skmpw.euclidean_distances(dataMat)
        elif type == 'nmf_euc': #extract feature based on NMF, then use Euclidean distance
            nmf = skd.NMF(n_components=pca_dim)
            dataMat = nmf.fit_transform(dataMat)
            self.simMat = skmpw.euclidean_distances(dataMat)
        elif type =='ica_euc': #extract feature based on ICA, then use Euclidean distance
            ica = skd.FastICA(n_components=pca_dim)
            dataMat = ica.fit_transform(dataMat)
            self.simMat = skmpw.euclidean_distances(dataMat)
        elif type =='cosine':
            self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
        elif type == 'pca_cos': #extract feature based on PCA, then use cosine distance
            pca = skd.PCA(n_components=pca_dim)
            dataMat = pca.fit_transform(dataMat)
            self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
        elif type == 'nmf_cos': #extract feature based on NMF, then use cosine distance
            nmf = skd.NMF(n_components=pca_dim)
            dataMat = nmf.fit_transform(dataMat)
            self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
        elif type =='ica_cos': #extract feature based on ICA, then use cosine distance
            ica = skd.FastICA(n_components=pca_dim)
            dataMat = ica.fit_transform(dataMat)
            self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
        else:
            print('unknown type for similarity matrix: ', type)

        #rearrange the order of data in simMat
        self.slctDataMat = dataMat
        if orderFlag:
            link = spc.hierarchy.linkage(self.simMat)
            dend = spc.hierarchy.dendrogram(link, no_plot=True)
            order = dend['leaves']
            self.slctData = [self.slctData[i] for i in order] #rearrange order
            self.simMat = [self.simMat[i] for i in order]
            for i in range(len(self.simMat)):
                self.simMat[i] = [self.simMat[i][j] for j in order]
            self.slctDataMat = [self.slctDataMat[i] for i in order]
        # self.patchOrdering = [ts.ptchNm for ts in self.slctData] #record new ordering
        self.patchOrdering = JSONifyData(self.slctData) # Deok wants all the data for each patch in the response
        self.clstData = self.slctData
        self.clstSimMat = self.simMat
Example #9
def bagOfWordsModel():
    #simple vectorization example
    from sklearn.feature_extraction.text import CountVectorizer
    corpus = [
        'UNC played Duke in basketball',
        'Duke lost the basketball game',
        'I ate a sandwich'
    ]
    vectorizer = CountVectorizer()
    print(vectorizer.fit_transform(corpus).todense())
    print(vectorizer.vocabulary_)
    #viewing the euclidean distance between feature vectors
    from sklearn.metrics.pairwise import euclidean_distances
    counts = [[0, 1, 1, 0, 0, 1, 0, 1], [0, 1, 1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 1, 0]]
    print('Distance between 1st and 2nd documents:', euclidean_distances([counts[0]], [counts[1]]))
    print('Distance between 1st and 3rd documents:', euclidean_distances([counts[0]], [counts[2]]))
    print('Distance between 2nd and 3rd documents:', euclidean_distances([counts[1]], [counts[2]]))
    #filtering stop words
    vectorizer = CountVectorizer(stop_words='english')
    print(vectorizer.fit_transform(corpus).todense())
    print(vectorizer.vocabulary_)
    # stemming and lemmatization
    """stemming =  removes all patterns of characters that appear to be affixes,resulting in a token that is not necessarily a valid word.  and lemmatization = finding the roots of a word ex jumping becomes jump
     Lemmatization frequently
    requires a lexical resource, like WordNet, and the word's part of speech. Stemming
    algorithms frequently use rules instead of lexical resources to produce stems and can
    operate on any token, even without its context.

    stem mesh hayfara2 been gathering as a noun and gathering as a verb w hay2lebhom homma el etneen le gather
    lemmatization bey7tag el context 3ashan yeraga3 el verbs lel root w el nouns zay ma heya
    stemming uses rules to remove characters that appear as zyadat fa momken yebawaz kelma ex: was>= wa, lemmatization uses el context
    """
    from nltk import word_tokenize
    from nltk.stem import PorterStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk import pos_tag
    wordnet_tags = ['n','v']

    corpus = [
 'He ate the sandwiches',
 'Every sandwich was eaten by him'
 ]
    stemmer = PorterStemmer()
    print('Stemmed:', [[stemmer.stem(token) for token in word_tokenize(document)] for document in corpus])

    lemmatizer = WordNetLemmatizer()
    tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus]
    print('Lemmatized:', [[lemmatize(token, tag) for token, tag in
                           document] for document in tagged_corpus])


    #TF-IDF => the frequencies of the tokens are taken into account
    from sklearn.feature_extraction.text import CountVectorizer
    corpus = ['The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich']
    vectorizer = CountVectorizer(stop_words='english')
    """The binary argument defaults to False, so instead of a binary representation
    we get the number of occurrences of each token"""
    print(vectorizer.fit_transform(corpus).todense())
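The lemmatize(token, tag) call used earlier in this example is not defined in the snippet. A minimal sketch, assuming it lemmatizes only nouns and verbs (the tags listed in wordnet_tags) and returns every other token unchanged:

from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize(token, tag):
    # Hypothetical helper: WordNet expects 'n' / 'v' part-of-speech codes;
    # fall back to the raw token for other tags.
    if tag[0].lower() in ['n', 'v']:
        return lemmatizer.lemmatize(token, tag[0].lower())
    return token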
Example #10
def distToSeed(tweetVecs, seedTweetVecs):
    #seedNews = []
    distToSeedTweets = pairwise.euclidean_distances(tweetVecs, seedTweetVecs[range(10),:])
    distToSeedTweets = np.mean(distToSeedTweets)#/len(tweetVecs)

    distToSeedNews = pairwise.euclidean_distances(tweetVecs, seedTweetVecs[range(10, 20),:])
    distToSeedNews = np.mean(distToSeedNews)#/len(tweetVecs)

    return distToSeedTweets, distToSeedNews
Example #11
def cluster_centers(data, n_clusters):
    centers_idxs = []
    data_new = data.copy()
    for i in range(n_clusters):
        dist_matrix = euclidean_distances(data_new, data_new)
        c_idx = dist_matrix.sum(axis=1).argsort()[::-1][0]
        centers_idxs.append(c_idx)
        data_new = np.delete(data_new, c_idx, axis=0)

    return euclidean_distances(data, data), np.array(centers_idxs)
Example #12
def test_euclidean_distances():
    """ Check the pairwise Euclidean distances computation"""
    X = [[0]]
    Y = [[1], [2]]
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    X = csr_matrix(X)
    Y = csr_matrix(Y)
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])
Example #13
def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
    random_state = check_random_state(random_state)
    n_samples, n_features = X.shape
    centers = np.empty((n_clusters, n_features))
    assert x_squared_norms is not None, 'x_squared_norms None in _k_init'

    # Set the number of local seeding trials if none is given
    if n_local_trials is None:
        n_local_trials = 2 + int(np.log(n_clusters))


    # Pick the first center randomly
    center_id = random_state.randint(n_samples)
    centers[0] = X[center_id]

    # Initialize list of closest distances and calculate current potential
    closest_dist_sq = euclidean_distances(centers[0, np.newaxis], X,
                                            Y_norm_squared=x_squared_norms, squared=True)

    current_pot = closest_dist_sq.sum()
    # Pick the remaining n_clusters-1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals)

        # Compute distances to center candidates
        distance_to_candidates = euclidean_distances(X[candidate_ids], X,
                                                        Y_norm_squared=x_squared_norms, squared=True)

        # Decide which candidate is the best
        best_candidate = None
        best_pot = None
        best_dist_sq = None
        for trial in range(n_local_trials):
            # Compute potential when including center candidate
            new_dist_sq = np.minimum(closest_dist_sq,
                                     distance_to_candidates[trial])
            new_pot = new_dist_sq.sum()

            # Store result if it is the best local trial so far
            if (best_candidate is None) or (new_pot < best_pot):
                best_candidate = candidate_ids[trial]
                best_pot = new_pot
                best_dist_sq = new_dist_sq

        # Permanently add best center candidate found in local tries
        centers[c] = X[best_candidate]
        current_pot = best_pot
        closest_dist_sq = best_dist_sq

    return centers
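A short usage sketch for the _k_init above (the data here is illustrative, and euclidean_distances / check_random_state are assumed to be imported alongside the function): the caller is expected to pass the pre-computed squared row norms of X.

import numpy as np

X = np.random.RandomState(0).random_sample((100, 5))
x_squared_norms = (X ** 2).sum(axis=1)
centers = _k_init(X, n_clusters=4, x_squared_norms=x_squared_norms, random_state=0)
print(centers.shape)  # (4, 5)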
Example #14
def test_pairwise_parallel():
    rng = np.random.RandomState(0)
    for func in (np.array, csr_matrix):
        X = func(rng.random_sample((5, 4)))
        Y = func(rng.random_sample((3, 4)))

        S = euclidean_distances(X)
        S2 = _parallel_pairwise(X, None, euclidean_distances, n_jobs=-1)
        assert_array_almost_equal(S, S2)

        S = euclidean_distances(X, Y)
        S2 = _parallel_pairwise(X, Y, euclidean_distances, n_jobs=-1)
        assert_array_almost_equal(S, S2)
Example #15
def sammon(data, target_dim=2, max_iterations=250, max_halves=10):
    """
    Adapted from the Matlab implementation by Dr. Gavin C. Cawley.

    Matlab source can be found here:
    
    https://people.sc.fsu.edu/~jburkardt/m_src/profile/sammon_test.m

    """
    TolFun = 1 * 10 ** (-9)
    
    D = euclidean_distances(data, data)
    N = data.shape[0]
    scale = np.sum(D.flatten('F'))
    D = D + np.identity(N)
    D_inv = np.linalg.inv(D)
    
    y = np.random.randn(N, target_dim)
    one = np.ones((N, target_dim))
    d = euclidean_distances(y, y) + np.identity(N)
    d_inv = np.linalg.inv(d)
    delta = D - d
    E = np.sum(np.sum(np.power(delta, 2) * D_inv))

    for i in range(max_iterations):
        delta = d_inv - D_inv
        deltaone = np.dot(delta, one)
        g = np.dot(delta, y) - y * deltaone
        dinv3 = np.power(d_inv, 3)
        y2 = np.power(y, 2)
        H = np.dot(dinv3, y2) - deltaone - 2 * np.multiply(y, np.dot(dinv3, y)) + np.multiply(y2, np.dot(dinv3, one))
        s = np.divide(-np.transpose(g.flatten('F')), np.transpose(np.abs(H.flatten('F'))))
        y_old = y

        # Step-halving: shrink the step until the error no longer increases.
        for j in range(max_halves):
            [rows, columns] = y.shape
            y = y_old.flatten('F') + s
            y = y.reshape(rows, columns, order='F')
            d = euclidean_distances(y, y) + np.identity(N)
            d_inv = np.linalg.inv(d)
            delta = D - d
            E_new = np.sum(np.sum(np.power(delta, 2) * D_inv))

            if E_new < E:
                break
            else:
                s = 0.5 * s

        # Stop once the relative change in the error falls below TolFun.
        if np.abs((E - E_new) / E) < TolFun:
            E = E_new
            break
        E = E_new

    E = E * scale
    return (y, E)
Example #16
def get_sim(dt_frame,n_rows=2000,plt_flag=False,sort_flag=True,out_file="sim.png",plot_every=1):
    if sort_flag:
        dist = euclidean_distances(dt_frame.values, dt_frame.values[0:1]).ravel()
        dt_temp = dt_frame.copy()
        dt_temp["dist"] = dist
        dt_sort = dt_temp.sort_values("dist").drop("dist", axis=1)
    else:
        dt_sort = dt_frame.copy()
    dist_full = euclidean_distances(dt_sort[0:n_rows].values)
    plt.figure()
    plt.imshow(dist_full[::plot_every,::plot_every],extent=(0,n_rows,n_rows,0))
    plt.colorbar()
    plt.savefig(out_file)
    if plt_flag:
        plt.show()
Example #17
def rbf_kernel(Z, X, gamma=None):
    """
    Compute the rbf (gaussian) kernel between X and Y::

        K(x, y) = exp(-γ ||x-y||²)

    for each pair of rows x in X and y in Y.

    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)

    Y : array of shape (n_samples_Y, n_features)

    gamma : float

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
    if gamma is None:
        gamma = 1.0 / X.shape[1]
    K = pw.euclidean_distances(X, Z, squared=True)
    K *= -gamma
    np.exp(K, K)    # exponentiate K in-place
    return K
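A quick sanity check of the rbf_kernel above against scikit-learn's own rbf_kernel (the arrays are illustrative); note that the first argument of the function above plays the role of Y, so the result has shape (n_samples_X, n_samples_Z):

import numpy as np
from sklearn.metrics import pairwise as pw

X = np.random.RandomState(0).random_sample((6, 3))
Z = np.random.RandomState(1).random_sample((4, 3))

K = rbf_kernel(Z, X)                                 # (6, 4), uses gamma = 1 / n_features
K_ref = pw.rbf_kernel(X, Z, gamma=1.0 / X.shape[1])  # scikit-learn reference
print(np.allclose(K, K_ref))                         # expected: True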
Example #18
    def partition_FOV_KMeans(self,tradeoff_weight=.5,fx=.25,fy=.25,n_clusters=4,max_iter=500):
        """
        Partition the FOV into clusters that group pixels close in space and high in mutual correlation

        Parameters
        ------------------------------
        tradeoff_weight: between 0 and 1, weights the contributions of distance and correlation in the overall metric
        fx,fy: downsampling factors to apply to the movie
        n_clusters,max_iter: KMeans algorithm parameters

        Outputs
        -------------------------------
        fovs: 2D array encoding the partitions of the FOV
        mcoef: matrix of pairwise correlation coefficients
        distanceMatrix: matrix of pixel distances

        Example

        """

        _,h1,w1=self.shape
        self.resize(fx,fy)
        T,h,w=self.shape
        Y=np.reshape(self,(T,h*w))
        mcoef=np.corrcoef(Y.T)
        idxA,idxB =  np.meshgrid(list(range(w)),list(range(h)));
        coordmat=np.vstack((idxA.flatten(),idxB.flatten()))
        distanceMatrix=euclidean_distances(coordmat.T);
        distanceMatrix=old_div(distanceMatrix,np.max(distanceMatrix))
        estim=KMeans(n_clusters=n_clusters,max_iter=max_iter);
        kk=estim.fit(tradeoff_weight*mcoef-(1-tradeoff_weight)*distanceMatrix)
        labs=kk.labels_
        fovs=np.reshape(labs,(h,w))
        fovs=cv2.resize(np.uint8(fovs),(w1,h1),old_div(1.,fx),old_div(1.,fy),interpolation=cv2.INTER_NEAREST)
        return np.uint8(fovs), mcoef, distanceMatrix
Example #19
def getClusterFeatures(tweetClusters, documents, feaVecs, seedTweetVecs, snp_comp, symCompHash):
    cLabels, tLabels, centroids, docDist = tweetClusters

    cTexts = []
    cDocs_zip = []
    cComps = []
    cDensity = []
    cDistToST = []
    for clbl in cLabels:
        dataIn = [item[0] for item in enumerate(tLabels) if item[1] == clbl]
        vecsIn = feaVecs[dataIn, :]
        textsIn = [documents[docid] for docid in dataIn]
        textsIn = Counter(textsIn).items()
        dataIn_zip = [(documents.index(text), num) for text, num in textsIn]
        compsIn = compInCluster(textsIn, snp_comp, symCompHash, False, True)
        inDist = pairwise.euclidean_distances(vecsIn, vecsIn)
        distToST = distToSeed(vecsIn, seedTweetVecs)

        cTexts.append(textsIn)
        cComps.append(compsIn)
        cDocs_zip.append(dataIn_zip)
        cDensity.append(np.mean(inDist))
        cDistToST.append(distToST)


        if 0:
            print(clbl, cDensity[-1])
            for item in textsIn: print(item)
            print(compsIn)

    return docDist, cDensity, cTexts, cComps, cDocs_zip, cDistToST
Example #20
 def PrecomputeSimilarities(self):
     from sklearn.metrics.pairwise import euclidean_distances
     if self.verbose > 10:
         print('Precomputing similarities...')
     X=np.matrix(self.usedTrainingData['length']).transpose()
     self.Similarities = \
             np.exp(-self.choice_parameter*euclidean_distances(X))
Example #21
def neighbor_pixel_check(cluster_coords, max_distance=2):
    """
    Check that all events in the cluster have a maximum distance smaller than max_distance

    :param cluster_coords:
    :param max_distance:
    :return:
    """

    hot_pix_flag = True

    all_distances = euclidean_distances(cluster_coords, cluster_coords)
    if args.debug == "yes":
        print(all_distances)

    for distances in all_distances:

        for distance in distances:

            if distance < max_distance:
                pass

            else:
                hot_pix_flag = False
                break

        if not hot_pix_flag:
            break

    return hot_pix_flag
Example #22
    def NN(self, datas, centroids):
        # start = time.time()
        # find which centroid each x is closest to, and put x into that centroid's group
        group = [[] for n in range(len(centroids))]
        # group = [[np.zeros(len(centroids[0]))] for n in range(len(centroids))]

        all_distances = euclidean_distances(centroids, datas, squared=True)

        labels = np.empty(len(datas), dtype=np.int32)
        labels.fill(-1)
        mindist = np.empty(len(datas))
        mindist.fill(np.inf)
        for center_id in range(len(centroids)):
            dist = all_distances[center_id]
            labels[dist < mindist] = center_id
            mindist = np.minimum(dist, mindist)
        # for k in range(len(centroids)):
        #     group.append(list)
        for i in range(len(labels)):
            group[labels[i]].append(datas[i] - centroids[labels[i]])

        # end for
        # print("End of NN: ", (time.time() - start))
        return group
Example #23
    def image_similarity(self, img1):
        """
                returns closest nth image to image
            """
        list_img_score = []
        closest = float("Inf")
        closest_id = ""
        value_img = self.img_dict_train[img1]
        current = euclidean_distances(self.matrix_test, value_img.reshape(1, -1))
        values_array = np.squeeze(np.asarray(current))
        # current = current.tolist()
        # print values_array
        # print np.argmax(current)
        max_indexes = values_array.argsort()[:-100][::1]
        max_list = max_indexes.tolist()
        # print len(max_indexes)
        # print max_indexes
        # current = current.tolist()
        # print 'error'
        for idx in max_list:
            # print idx
            # tuple_=[]
            print(self.img_dict_test[idx], values_array[idx])
            float_val = float(values_array[idx])
            rounded_val = round(float_val, 5)
            # tuple_.append(self.img_dict_test[idx[0]])
            # tuple_.append(current[idx])
            list_img_score.append([self.img_dict_test[idx], rounded_val])

        return list_img_score
Example #24
    def action(self, tweets_list):
        corpus = []
        for tweet in tweets_list:
            #corpus += [t["text"]]
            tweet_str = tweet["text"].encode("utf-8")
            tweet_str = unicode(tweet_str,'utf-8')
            corpus.append(tweet_str)

        print(corpus)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(corpus)
        M,P=X.shape


        dist_corpus=euclidean_distances(X)

        stwf=stopwords.words('french')
        stwf.append('les')
        vectorizer=CountVectorizer(stop_words=stwf)
        X = vectorizer.fit_transform(corpus)
        dico=vectorizer.vocabulary_
        
        # All the prints are grouped here

        print("Results of Birch algorithm")

        clusters = birch_algo(X.toarray(), None)
        quit()
Example #25
def between_scatter_matrix_score(X, labels):
    """
    Computes the between scatter matrix score of a labeling of a clustering
    :param X:
    :param labels:
    :return:
    """

    llabels = np.unique(labels)

    # Centroid of the data

    centroid = np.zeros((1, X.shape[1]))
    centroid += np.sum(X, axis=0)
    centroid /= X.shape[0]

    dist = 0.0
    for idx in llabels:
        center = np.zeros((1, X.shape[1]))
        center_mask = labels == idx
        center += np.sum(X[center_mask], axis=0)
        center /= center_mask.sum()
        dvector = euclidean_distances(centroid, center, squared=True)
        dist += dvector.sum() * center_mask.sum()
    return dist / len(labels)
Example #26
def trimmedrbf_kernel(X, Y=None, gamma=None, robust_gamma = None):
    """
    Compute the rbf (gaussian) kernel between X and Y::

        K(x, y) = exp(-gamma ||x-y||**2)

    for each pair of rows x in X and y in Y.

    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)

    Y : array of shape (n_samples_Y, n_features)

    gamma : float

    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 1.0 / X.shape[1]

    K = euclidean_distances(X, Y, squared=True)
    print(K)
    print("Shape kernel" + str(np.where(np.sqrt(K) > robust_gamma)[0].shape))
    K[np.where(np.sqrt(K) > robust_gamma)] = robust_gamma**2
    
    K *= -gamma
    np.exp(K, K)    # exponentiate K in-place
    return K
Example #27
def sumACluster(dist, vecsIn, topK_t, sameTweetThred):
    if dist == "cosine":
        distMatrix = pairwise.cosine_distances(vecsIn)
    elif dist == "eu":
        distMatrix = pairwise.euclidean_distances(vecsIn, vecsIn)

    sameTweetClusters = [[0]]
    for seqid, text in enumerate(vecsIn[1:], start=1):
        added = None
        for stcid, stc in enumerate(sameTweetClusters):
            sameFlag = False
            if distMatrix[seqid][stc[0]] <= sameTweetThred:
                sameFlag = True

            if sameFlag:
                stc.append(seqid)
                added = (stcid, stc)
                break
        if added is None:
            sameTweetClusters.append([seqid])
        else:
            sameTweetClusters[added[0]] = added[1]
    sameTweetClusterNum = [(stcid, len(stc)) for stcid, stc in enumerate(sameTweetClusters)]
    numIn = len(sameTweetClusterNum)
    top = sorted(sameTweetClusterNum, key = lambda a:a[1], reverse=True)[:min(topK_t, numIn)]
    top = [(sameTweetClusters[item[0]][0], item[1]) for item in top]
    return top
Example #28
def cluster_sentence_vectors(sentences, X, N_CLUSTERS=5):
    """
        given vector results and number of clusters return cluster objects
    """
    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=45)
    cluster_assignments = kmeans.fit_predict(X)
    centroids = kmeans.cluster_centers_

    cluster_dict = {
        x: {"vector": centroids[x], "sentences": [], "reduced": False}
        for x in range(len(centroids))}

    temp_cluster_keywords = {x: {} for x in range(len(centroids))}

    for i, sent in enumerate(sentences):
        sent['feature_vector'] = X.toarray()[i]
        cluster_num = cluster_assignments[i]
        sent["cluster_num"] = cluster_num
        dist_to_centroid = euclidean_distances(centroids[cluster_num].reshape(1, -1),
                                               sent["feature_vector"].reshape(1, -1))[0][0]
        sent["dist_to_centroid"] = dist_to_centroid
        # add to cluster object
        cluster_dict[cluster_num]["sentences"].append(sent)
        # merge keyword dictionaries together
        temp_cluster_keywords[cluster_num] = merge_kwd_counts([temp_cluster_keywords[cluster_num], sent["key_terms"]])

    NUM_KEYWORDS = 5
    for cluster_num in temp_cluster_keywords:
        clustered = sorted(temp_cluster_keywords[cluster_num].items(), key=lambda x: x[1])[0:NUM_KEYWORDS]
        cluster_dict[cluster_num]["keywords"] = [x[0] for x in clustered]

    return cluster_dict
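merge_kwd_counts is used above but not shown. A minimal sketch, assuming it simply sums per-keyword counts across a list of dictionaries:

from collections import Counter

def merge_kwd_counts(count_dicts):
    # Hypothetical helper: add up keyword counts from several dicts.
    merged = Counter()
    for counts in count_dicts:
        merged.update(counts)
    return dict(merged)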
Example #29
def calculate_similarity(movie_name_1, movie_name_2, min_common_users=0):
    
    movie1 = movie_name_to_id_dictionary[movie_name_1]
    movie2 = movie_name_to_id_dictionary[movie_name_2]
    
    #This is the set of UNIQUE user ids  who reviewed  movie1
    users_who_rated_movie1 = set((movielens_df[(movielens_df.MovieId == movie1)].UserId).tolist())
    
    #This is the set of UNIQUE user ids  who reviewed  movie2
    users_who_rated_movie2 = set((movielens_df[(movielens_df.MovieId == movie2)].UserId).tolist())
    
    #Compute the common users who rated both movies: 
    # hint convert both to set and do the intersection
    common_users = users_who_rated_movie1.intersection(users_who_rated_movie2)
    
    #Using the code you wrote in t2a, get the reviews for the movies and common users
    movie1_reviews = get_movie_reviews(movie1, common_users)
    movie2_reviews = get_movie_reviews(movie2, common_users)
    
    #Now you have the data frame for both movies
    # Use the euclidean_distances function from sklearn (imported already)
    # to compute the distance between their rating values
    distance = euclidean_distances(movie1_reviews['Rating'].values.reshape(1, -1),
                                   movie2_reviews['Rating'].values.reshape(1, -1))

    if len(common_users) < min_common_users:
        return [[float('inf')]]
    return distance
Example #30
    def make_connectivity_matrix(self):
        """
        Computes the connectivity matrix of this Population. Each point is
        connected to each other within a radius.
        """

        if self.connectivity_matrix:
            return

        points_arr = np.array([[p.x, p.y] for p in self.points])
        distance_mat = euclidean_distances(points_arr, points_arr)

        # Every point p will be connected to each other point whose distance
        # to p is less than a cut-off value. This value is computed as the
        # mean of {min_nonzero(dist_mat(p)) | p is a point}, times a factor
        min_nonzero = lambda r: min(r[r > 0])

        # apply_along_axis(f, axis=1, arr) applies f to each row
        min_neighbor_distances = np.apply_along_axis(min_nonzero, axis=1, arr=distance_mat)

        factor = 2.2
        neighbor_cutoff = np.mean(min_neighbor_distances) * factor
        connectivity_matrix = distance_mat < neighbor_cutoff

        self.connectivity_matrix = connectivity_matrix
Example #31
        def kmeans_step(frame, K):
            rng = np.random.RandomState(2)
            cluster_ids = np.zeros(X.shape[0])
            centroids = rng.randn(K, 2)

            nsteps = frame // 3

            for i in range(nsteps + 1):
                old_centroids = centroids

                if i < nsteps or frame % 3 > 0:
                    dist = euclidean_distances(X, centroids)
                    cluster_ids = dist.argmin(1)

                if i < nsteps or frame % 3 > 1:
                    centroids = np.array(
                        [X[cluster_ids == k].mean(0) for k in range(K)])
                    nans = np.isnan(centroids)
                    centroids[nans] = old_centroids[nans]

            # plot data
            c = cluster_ids if frame > 0 else 'w'
            plt.scatter(X[:, 0],
                        X[:, 1],
                        c=c,
                        s=50,
                        edgecolors='k',
                        vmin=0,
                        vmax=K - 1,
                        alpha=0.6)

            # plot centroids
            plt.scatter(old_centroids[:, 0],
                        old_centroids[:, 1],
                        marker='o',
                        c=range(K),
                        s=200)
            plt.scatter(old_centroids[:, 0],
                        old_centroids[:, 1],
                        marker='o',
                        c='black',
                        s=50)

            # plot new centers if third frame
            if frame % 3 == 2:
                for i in range(K):
                    plt.annotate('',
                                 xy=centroids[i],
                                 xytext=old_centroids[i],
                                 arrowprops=dict(arrowstyle='->',
                                                 linewidth=1,
                                                 color='k'))
                plt.scatter(centroids[:, 0],
                            centroids[:, 1],
                            marker='o',
                            c=range(K),
                            s=200)
                plt.scatter(centroids[:, 0],
                            centroids[:, 1],
                            marker='o',
                            c='black',
                            s=50)

            plt.xlim(-4, 4)
            plt.ylim(-2, 10)

            if frame % 3 == 1:
                plt.title("Assign data to nearest centroid", size=14)
            elif frame % 3 == 2:
                plt.title("Update centroids to cluster means", size=14)
            else:
                plt.title(" ", size=14)
Example #32
def find_rating(nearby_rid, cuisine):
    print("I am starting to find rating algo")
    #print nearby_rid

    # First read csv files and store it in dataframe
    # Second convert dataframe to array
    df_restaurant = pd.read_csv('data/restaurant.csv', header=0)
    array_restaurant = df_restaurant.values
    #print array_restaurant

    df_cuisine = pd.read_csv('data/cuisine.csv', header=0)
    array_cuisine = df_cuisine.values

    # # Perform natural join on cuisine and restaurant based on key 'rid' and store it in dataframe
    # # convert that dataframe into an array
    # combine = df_cuisine.set_index('rid').join(df_restaurant.set_index('id'))
    # array_combine = combine.values
    # #print array_combine

    #---------------------------------------------------------------------------

    # Select only those restaurant from all which are nearby
    # Convert 2d numpy array to 1d array. For eg. [[1, 2, 3]] into [1, 2, 3]
    nearby_rid = nearby_rid.ravel()
    filter_nearby = df_restaurant.loc[df_restaurant['id'].isin(nearby_rid)]
    array_filter_nearby = filter_nearby.values
    #print array_filter_nearby

    filter_cuisine_id = array_cuisine[array_cuisine[:, 2] == cuisine]
    #print "I WANT THISSSSSSSSSS"
    #print filter_cuisine_id
    filter_cuisine_id = filter_cuisine_id[:, 1]
    #print filter_cuisine_id.astype(int)

    filter_cuisine = filter_nearby.loc[filter_nearby['id'].isin(
        filter_cuisine_id.astype(int))]
    print(filter_cuisine)
    filter_cuisine = filter_cuisine.values

    #---------------------------------------------------------------------------
    # Extract latitude and longitude of above filtered restaurant
    lat_long = filter_cuisine[:, 2:4]
    #print lat_long

    # Apply clustering algo on filtered restaurant data
    kmeans = KMeans(n_clusters=3, random_state=0).fit(lat_long)

    # Cluster number for all the above filtered restaurant in which cluster they fall
    print(kmeans.labels_)
    #print kmeans.predict([[18.95618666,72.81199761], [18.99120402, 72.81458057]])
    print("Clustering centre")
    print(kmeans.cluster_centers_)

    #----------------------------------------------------------------------------

    # calculate distance of each cluster from user's current location
    distance = euclidean_distances([[19.044497, 72.8204535]],
                                   kmeans.cluster_centers_)
    print(np.transpose(distance))
    print(len(distance))

    # append cluster number with above distance array, for knowing which cluster distance is that
    # because after we are sorting these distances
    distance_cluster_centre = np.insert(np.transpose(distance),
                                        1,
                                        np.array([0, 1, 2]),
                                        axis=1)
    print(distance_cluster_centre)

    # sorted distances
    print("sorted distance")
    arr = distance_cluster_centre[distance_cluster_centre[:, 0].argsort()]

    #------------------------------------------------------------------------------

    # make numpy array with columns [id, lat, long, rating, cid]
    # cid = cluster id
    id_after_cuisine = filter_cuisine[:, 0]
    id_lat_long = np.insert(lat_long, 0, id_after_cuisine, axis=1)
    id_lat_long_cid = np.insert(id_lat_long, 3, kmeans.labels_, axis=1)
    id_lat_long_rating_cid = np.insert(id_lat_long_cid,
                                       3,
                                       filter_cuisine[:, 8],
                                       axis=1)
    print(id_lat_long_rating_cid)

    # convert above array to dataframe
    columns = ['id', 'latitude', 'longitude', 'rating', 'cid']
    df = pd.DataFrame(id_lat_long_rating_cid, columns=columns)

    #-----------------------------------------------------------------------------------------------

    # SORT CLUSTER ACCORDING TO CLUSTER CENTRE DISTANCES FROM USER'S LOCATION

    # select [[12.313, 12.375843, 24.7364],[0, 2, 1]] - [[centre distances][cluster id]]
    print(np.array(arr[:, 1][0]))
    #initialize empty dataframe
    sorted_cluster = pd.DataFrame()
    # sort cluster according to cluster centre distance
    for i in range(0, len(arr[:, 1])):
        # dataframe  of single cluster
        single_cluster = df.loc[df['cid'].isin(np.array(arr[:, 1][i]).ravel())]
        single_cluster = single_cluster.sort_values(by='rating',
                                                    ascending=False)
        sorted_cluster = pd.concat([sorted_cluster, single_cluster])
    print(sorted_cluster)

    #df_groupby = sorted_cluster.groupby('cid')
    #print len(df_groupby)

    #for group in  df_groupby:
    #	print group
    #print df_groupby.sort_values('rating', ascending=False)
    #print df_groupby.get_group(0)

    # convert dataframe to array and extract only rid
    sorted_cluster_rid = sorted_cluster.to_numpy()[:, 0]
    # convert long datatype of rid into int
    return sorted_cluster_rid.astype(int)

    #df_groupby = df.groupby('cid')
    #print len(df_groupby)

    #for group in  df_groupby:
    #	print group
    #print df_groupby.sort_values('rating', ascending=False)
    #print df_groupby.get_group(0)

    #--------------------------------------------------------------------------------------------------

    # featureset_all = np.delete(filter_cuisine, np.s_[2:10], axis=1)
    # print "CONVERT THIS ARRAY TO DATFRAMEEEEEEEEEEEEEE"
    # print featureset_all
    # #featureset_all = featureset_all[0:6,:]

    # featureset_X = np.delete(featureset_all, np.s_[0], axis=1)
    # print featureset_X

    # featureset_Y = np.delete(featureset_all, np.s_[1:], axis=1)
    # print featureset_Y

    # columns=['cuisine','homedelivery','smoking','alcohol','wifi', 'valetparking','rooftop']

    # df = pd.DataFrame(featureset_X ,columns=columns)
    # print "CONVERTEDDDDDDDDDDDDDDDDDDD"
    # print df

    # cols_to_retain = ['cuisine', 'homedelivery', 'smoking', 'alcohol', 'wifi', 'valetparking', 'rooftop']
    # #cols_to_retain = ['homedelivery', 'smoking', 'alcohol', 'wifi']
    # feature = df[cols_to_retain].to_dict( orient = 'records' )
    # print "DICTIONARYYYYYYYYY"
    # print feature

    # vec = DictVectorizer()
    # X = vec.fit_transform(feature).toarray()
    # print X

    # columns=['id']
    # df = pd.DataFrame(featureset_Y ,columns=columns)
    # cols_to_retain = ['id']
    # Y = df[cols_to_retain].to_dict( orient = 'records' )
    # vec = DictVectorizer()
    # Y = vec.fit_transform(Y).toarray()
    # print Y

    # X_train, X_test, Y_train_labels, Y_test_labels = train_test_split(X, Y, test_size=0.3, random_state=100)
    # print "-----------Training feature---------------"
    # print X_train
    # print "------------Testing feature--------------"
    # print X_test
    # print "------------Training label--------------"
    # print Y_train_labels
    # print "-----------Testing label---------------"
    # print Y_test_labels
    # print "--------------------------"

    # clf_entropy = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=3, min_samples_leaf=5)
    # clf_entropy.fit(X_train, Y_train_labels)

    # print "Fitting done"
    # # Make predictions
    # y_pred_en = clf_entropy.predict(X_test)
    # print y_pred_en

    # # columns=['cuisine','homedelivery','smoking','alcohol','wifi', 'valetparking','rooftop']
    # # df = pd.DataFrame([['Italian', 'yes', 'no', 'yes', 'no', 'no', 'no'], ['Italian', 'yes', 'no', 'yes', 'no', 'no', 'no']] ,columns=columns)
    # # cols_to_retain = ['cuisine', 'homedelivery', 'smoking', 'alcohol', 'wifi', 'valetparking', 'rooftop']
    # # feature = df[cols_to_retain].to_dict( orient = 'records' )
    # # print feature
    # # vec = DictVectorizer()
    # # user_input = vec.fit_transform(feature).toarray()
    # # print user_input

    # print clf_entropy.predict([[0. ,1. ,1. ,1. , 0., 0., 1., 0., 1., 0., 0., 1., 0.]])

    print("shraddha")
Example #33
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.manifold import MDS

df = pd.read_csv('./train_lyrics_1000.csv')

X_train = df['lyrics'].values
names = df['title'].values

count_vect = CountVectorizer()
dtm = count_vect.fit_transform(X_train.ravel())

vocab = count_vect.get_feature_names_out()

dtm = dtm.toarray()
vocab = np.array(vocab)

dist = euclidean_distances(dtm)

dist = np.round(dist, 1)

mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]

for x, y, name in zip(xs, ys, names):
    color = 'skyblue'
    plt.scatter(x, y, c=color)
    plt.text(x, y, name)

plt.show()
Example #34
__author__ = 'zelalem'
#
import textmining
from math import *
import numpy as np
from scipy.spatial.distance import pdist, euclidean, squareform
# #
doc1 = 'John and Bob are brothers.'
doc2 = 'John went to the store. The store was closed.'
doc3 = 'Bob went to the store too.'

tdm = textmining.TermDocumentMatrix()

tdm.add_doc(doc1)
tdm.add_doc(doc2)
tdm.add_doc(doc3)

tdm.write_csv('/home/zelalem/Downloads//matrix.csv', cutoff=1)

a = list(tdm.rows(cutoff=1))[1:]
x = a[0]
y = a[1]
print(a[2])

from sklearn.metrics.pairwise import euclidean_distances
X = [[0, 1], [1, 1]]
print(euclidean_distances(X, X))
Example #35
def angular_distances(X, Y):
    return euclidean_distances(normalize(X), normalize(Y))
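For unit-normalized rows, the squared Euclidean distance equals 2 * (1 - cosine similarity), so angular_distances is a monotone transform of the cosine distance. A small numeric check (arrays are illustrative):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.preprocessing import normalize

X = np.random.RandomState(0).random_sample((5, 3))
Y = np.random.RandomState(1).random_sample((4, 3))

D = euclidean_distances(normalize(X), normalize(Y))
D_ref = np.sqrt(2.0 * (1.0 - cosine_similarity(X, Y)))
print(np.allclose(D, D_ref))  # expected: True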
Example #36
    def metrics(self, epsilon=1.0, k=None):
        """
    Calculate the metrics (correctness and coverage) for all possible pairs
    of groups. Epsilon defines threshold (if a mapped initial point is further
    away from a target point by a length > epsilon, the point is considered
    false.
    """
        num_dimensions = self.original_X.shape[1]

        correctness = np.zeros((self.num_clusters, self.num_clusters))
        coverage = np.zeros((self.num_clusters, self.num_clusters))

        for initial in range(self.num_clusters):
            for target in range(self.num_clusters):
                x_init = [
                    self.original_X[i] for i in range(len(self.original_X))
                    if self.original_Y[i] == initial
                ]
                x_target = [
                    self.original_X[i] for i in range(len(self.original_X))
                    if self.original_Y[i] == target
                ]

                # Construct the explanation between the initial and target regions
                if initial == target:
                    d = np.zeros((1, num_dimensions))
                elif initial == 0:
                    d = self.delta[target - 1]
                elif target == 0:
                    d = -1.0 * self.delta[initial - 1]
                else:
                    d = -1.0 * self.delta[initial - 1] + self.delta[target - 1]

                if k is not None:
                    d = truncate(d, k)

                r_init = self.transformer(x_init + d)
                #r_target = self.transformer(x_target)
                r_target = [
                    self.latent_X[i] for i in range(len(self.latent_X))
                    if self.latent_Y[i] == target
                ]

                dists = euclidean_distances(r_init, Y=r_target)

                close_enough = 1.0 * (dists <= epsilon)

                if initial == target:
                    threshold = 2.0
                else:
                    threshold = 1.0

                correctness[initial, target] = np.mean(
                    1.0 * (np.sum(close_enough, axis=1) >= threshold))
                coverage[initial, target] = np.mean(
                    1.0 * (np.sum(close_enough, axis=0) >= threshold))

        self.correctness = correctness
        self.coverage = coverage

        return correctness, coverage
Example #37
    def findTPs(self):
        locals = self.locals
        model = self.supportmodel
        epsilon = self.options['epsilon']
        R = model.R + 10**(-7)

        ts = {}
        ts['x'] = []
        ts['f'] = []
        ts['neighbor'] = []
        ts['purturb'] = []
        [N, attr] = locals.shape
        tmp_x = []

        if model.support == 'GP':
            for i in range(N):
                for j in range(i, N):
                    for k in range(10):
                        x0 = locals[i] + 0.1 * (k + 1) * (locals[j] -
                                                          locals[i])
                        sep = fsolve(func=fsolve_R_GP,
                                     x0=x0,
                                     fprime=Hess,
                                     args=model,
                                     xtol=10**(-6))
                        tmp_x.append(sep)
            tmp_x = np.array(tmp_x)
            [dummy, I, J] = np.unique(np.round(10 * tmp_x),
                                      axis=0,
                                      return_index=True,
                                      return_inverse=True)
            tmp_x = tmp_x[I, :]
            for i in range(list(tmp_x.shape)[0]):
                sep = tmp_x[i]
                [f, g, H] = my_R_GP2(sep, model)
                [D, V] = la.eig(H)

                ind = []
                if np.sum(D < 0) == 1:
                    sep1 = sep + epsilon * V[np.where(D < 0)[0]]
                    sep2 = sep - epsilon * V[np.where(D < 0)[0]]

                    if attr == 2:
                        res1 = minimize(fun=my_R_GP1,
                                        x0=sep1,
                                        args=model,
                                        method='Nelder-Mead')
                        [temp1, val] = [res1.x, res1.fun]
                        res2 = minimize(fun=my_R_GP1,
                                        x0=sep2,
                                        args=model,
                                        method='Nelder-Mead')
                        [temp2, val] = [res2.x, res2.fun]
                    else:
                        res1 = minimize(fun=my_R_GP1,
                                        x0=sep1,
                                        args=model,
                                        hess=True)
                        [temp1, val] = [res1.x, res1.fun]
                        res2 = minimize(fun=my_R_GP1,
                                        x0=sep2,
                                        args=model,
                                        hess=True)
                        [temp2, val] = [res2.x, res2.fun]
                    [dummy, ind1] = [
                        np.min(
                            euclidean_distances(temp1.reshape(1, -1), locals)),
                        np.argmin(
                            euclidean_distances(temp1.reshape(1, -1), locals))
                    ]

                    [dummy, ind2] = [
                        np.min(
                            euclidean_distances(temp2.reshape(1, -1), locals)),
                        np.argmin(
                            euclidean_distances(temp2.reshape(1, -1), locals))
                    ]

                    if ind1 != ind2:

                        ts['x'].append(sep)
                        ts['f'].append(f)
                        ts['neighbor'].append([ind1, ind2])
                        ts['purturb'].append([sep1, sep2])
        if model.support == 'SVDD':
            for i in range(N):
                for j in range(i, N):
                    for k in range(10):
                        x0 = locals[i] + 0.1 * (k + 1) * (locals[j] -
                                                          locals[i])
                        sep = fsolve(func=fsolve_R,
                                     x0=x0,
                                     args=model,
                                     maxfev=300,
                                     xtol=10**(-6))
                        tmp_x.append(sep)
            tmp_x = np.array(tmp_x)
            [dummy, I, J] = np.unique(np.round(10 * tmp_x),
                                      axis=0,
                                      return_index=True,
                                      return_inverse=True)
            tmp_x = tmp_x[I, :]

            for i in range(list(tmp_x.shape)[0]):
                sep = tmp_x[i]
                [f, g, H] = my_R2(sep, model)
                [D, V] = la.eig(H)
                ind = []
                if np.sum(D < 0) == 1:

                    sep1 = sep + epsilon * V[np.where(D < 0)[0]]
                    sep2 = sep - epsilon * V[np.where(D < 0)[0]]
                    if attr == 2:
                        res1 = minimize(fun=my_R1,
                                        x0=sep1,
                                        args=model,
                                        method='Nelder-Mead')
                        [temp1, val] = [res1.x, res1.fun]
                        res2 = minimize(fun=my_R1,
                                        x0=sep2,
                                        args=model,
                                        method='Nelder-Mead')
                        [temp2, val] = [res2.x, res2.fun]
                    else:
                        res1 = minimize(fun=my_R1,
                                        x0=sep1,
                                        args=model,
                                        hess=True)
                        [temp1, val] = [res1.x, res1.fun]
                        res2 = minimize(fun=my_R1,
                                        x0=sep2,
                                        args=model,
                                        hess=True)
                        [temp2, val] = [res2.x, res2.fun]
                    [dummy, ind1] = [
                        np.min(
                            euclidean_distances(temp1.reshape(1, -1), locals)),
                        np.argmin(
                            euclidean_distances(temp1.reshape(1, -1), locals))
                    ]
                    [dummy, ind2] = [
                        np.min(
                            euclidean_distances(temp2.reshape(1, -1), locals)),
                        np.argmin(
                            euclidean_distances(temp2.reshape(1, -1), locals))
                    ]
                    if ind1 != ind2:
                        ts['x'].append(sep)
                        ts['f'].append(f)
                        ts['neighbor'].append([ind1, ind2])
                        ts['purturb'].append([sep1, sep2])
        ts['x'] = np.array(ts['x'])
        print(ts['x'])
        ts['f'] = np.array(ts['f'])
        ts['neighbor'] = np.array(ts['neighbor'])
        ts['purturb'] = np.array(ts['purturb'])
        self.ts = ts
Example #38
0
 def between(self, A, B):
     return euclidean_distances(A, B)
Example #39
0
 def within(self, A):
     return euclidean_distances(A, A)
Example #40
0
    y = np.array(movie)

    simr = pearsonr(x, y)
    # simmink = minkowski(x, y, 3)

    # simr_hybrid = pearsonr(vI, vJ)
    # simmink_hybrid = minkowski(vI, vJ, 5)

    x = x.reshape(1, -1)
    y = y.reshape(1, -1)

    # vI = vI.reshape(1, -1)
    # vJ = vJ.reshape(1, -1)

    sim = cosine_similarity([movie], [movief])
    sime = euclidean_distances(x, y)

    # sim_hybrid = cosine_similarity(vI, vJ)
    # sime_hybrid = euclidean_distances(vI, vJ)

    q = "SELECT m.title FROM movies m JOIN trailers t on t.imdbid = m.imdbid WHERE t.id = ? AND t.best_file = 1"
    c = conn.cursor()
    c.execute(q, (key, ))
    title = c.fetchone()
    if (type(title) is tuple):

        # if (len(ratingsI) > 0 and len(ratingsJ) > 0):
        # simratings = cosine_similarity(ratingsI.reshape(1, -1), ratingsJ.reshape(1, -1))
        # similarities_ratings.append((title[0], simratings))

        similarities_cosine.append((title[0], sim))
Example #41
0
def calc_distance(x, y):
    nx = np.asarray(x).reshape(1, -1)
    ny = np.asarray(y).reshape(1, -1)
    dist = euclidean_distances(nx, ny)
    return dist[0][0]
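A quick sanity check of this helper, as a hedged sketch (not part of the original snippet): for a single pair of vectors, euclidean_distances agrees with the plain NumPy norm.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def calc_distance(x, y):
    # mirrors the snippet above
    nx = np.asarray(x).reshape(1, -1)
    ny = np.asarray(y).reshape(1, -1)
    return euclidean_distances(nx, ny)[0][0]

print(calc_distance([0, 0], [3, 4]))                        # 5.0
print(np.linalg.norm(np.array([3, 4]) - np.array([0, 0])))  # 5.0, same value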
Example #42
0
    def fillBox(self,
                solv,
                molNum,
                checkCollisions=False,
                replaceCollisions=False,
                applyPCBs=True,
                progress=None):
        import math, random
        import numpy as np
        from lib.chemicalGraph.molecule.solvent.Solvent import Solvent

        solventMolecules = Solvent(solv)  #.__class__)

        remaining = -1  # to track removing collisions

        totalDim = self.getDrawer().getBoxDimension()
        boxmin = self.getDrawer().getCellOrigin()
        boxmax = [
            boxmin[0] + totalDim[0], boxmin[1] + totalDim[1],
            boxmin[2] + totalDim[2]
        ]

        #calculate the volume for a single solvent molecule
        center = [0., 0., 0.]
        pos = solv.massCenter()
        solv.moveBy(
            [center[0] - pos[0], center[1] - pos[1], center[2] - pos[2]])
        #solvDiameter = solv.diameter()+ _SEPARATION_BETWEEN_SOLVENTS_
        solvDiameter = math.pow(self.getDrawer().getBoxVolume() / molNum,
                                1. / 3.)
        if solvDiameter == 0: return

        row = math.floor(totalDim[1] / solvDiameter)
        col = math.floor(totalDim[0] / solvDiameter)
        dep = math.floor(totalDim[2] / solvDiameter)

        progressMax = molNum
        progressCount = 0
        if progress != None:
            progress.setLabelText("Adding solvent")
            progress.setRange(0, molNum - 1)
            progress.setValue(0)

        solvRadius = solvDiameter / 2
        newPos = solv.massCenter()
        refCoor = boxmin

        #boxmax[0] -= solvDiameter/2.
        #boxmax[1] -= solvDiameter/2.
        #boxmax[2] -= solvDiameter/2.

        if progress != None:
            progress.setLabelText("Adding solvent")
            progress.setRange(progressCount, progressMax)
            progress.setValue(progressCount)

        # loops over a sequence of adding solvent and removing collisions
        originalMolecules = self.getMixture().molecules()
        originalCoords = self.getMixture().getAtomsCoordinatesAsArray()
        solvRadius = solvDiameter / 2.0 + 1.5

        #print "anadiendo... ", progressMax-progressCount
        while progressCount < progressMax:  # adds solvent
            #random rotation for solvent atoms
            rx = random.uniform(0, 360)
            ry = random.uniform(0, 360)
            rz = random.uniform(0, 360)

            #random displacement for solvent atoms
            rdx = random.uniform(boxmin[0], boxmax[0])
            rdy = random.uniform(boxmin[1], boxmax[1])
            rdz = random.uniform(boxmin[2], boxmax[2])

            #generate new molecule and assign next position
            mol = solv.copy()
            mol.rotateDeg(rx, ry, rz)
            newPos = np.array([[rdx, rdy, rdz]])

            if checkCollisions:
                atomDistances = euclidean_distances(originalCoords, newPos)
                if np.min(atomDistances) > solvRadius:
                    #print 'fillBox: adding', progressCount, np.min(atomDistances), newPos
                    mol.moveBy(list(newPos)[0])
                    #nodeName = self.addMolecule(mol, checkForInconsistentNames=False)
                    #self.shownMolecules.show(nodeName)
                    solventMolecules.addCoordinates(mol)
                else:
                    #print 'fillBox: collision'
                    if replaceCollisions: progressCount -= 1

            progressCount += 1
            if progress != None: progress.setValue(progressCount)
            del mol

        # add solvent and rename with the given name
        nodeName = self.addMolecule(solventMolecules,
                                    checkForInconsistentNames=True)
        self.shownMolecules.show(nodeName)
        #print "fillBox ", mol.molname(),progressMax
        solv.rename(solventMolecules.molname())
        progressMax -= 1
Example #43
0
def test_euclidean_distances_known_result(x_array_constr, y_array_constr):
    # Check the pairwise Euclidean distances computation on known result
    X = x_array_constr([[0]])
    Y = y_array_constr([[1], [2]])
    D = euclidean_distances(X, Y)
    assert_allclose(D, [[1., 2.]])
Example #44
0
    def estimate_doc2vec_euclidean_dist(self):

        mat = self.word2vec_model.docvecs.get_normed_vectors()
        ecl_sim = euclidean_distances(mat, mat)
        return ecl_sim
Example #45
0
def d2(c1, vec):
    # squared Euclidean distance
    return math.pow(euclidean_distances([c1], [vec]),2)
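For reference, a minimal sketch (an addition, not part of the original example): euclidean_distances can return the squared distance directly via squared=True, which avoids squaring the square root as d2 does.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

c1 = [0.0, 0.0]
vec = [3.0, 4.0]
# squared Euclidean distance computed directly
d_sq = euclidean_distances([c1], [vec], squared=True)[0][0]
print(d_sq)  # 25.0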
Example #46
0
def test_pairwise_distances():
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)
    # "cityblock" uses scikit-learn metric, cityblock (function) is
    # scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert_equal(S.shape[0], S.shape[1])
    assert_equal(S.shape[0], X.shape[0])
    assert_array_almost_equal(S, S2)
    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Test cosine as a string metric versus cosine callable
    # The string "cosine" uses sklearn.metric,
    # while the function cosine is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Test with sparse X and Y,
    # currently only supported for Euclidean, L1 and cosine.
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
    S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
    assert_array_almost_equal(S, S2)
    S2 = manhattan_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError,
                  pairwise_distances,
                  X,
                  Y_sparse,
                  metric="minkowski")

    # Test that a value error is raised if the metric is unknown
    assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
Example #47
0
def reachability_distance(a, b):
    distance = euclidean_distances(id_to_embedding[a].reshape(1, -1),
                                   id_to_embedding[b].reshape(1, -1))[0][0]
    return max(k_distance(b), distance)
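This helper computes the LOF-style reachability distance, reach-dist_k(a, b) = max(k-distance(b), d(a, b)); id_to_embedding and k_distance are defined elsewhere in the original script. A self-contained sketch with hypothetical stand-ins for those two names:

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# Hypothetical stand-ins for the globals the helper relies on.
id_to_embedding = {0: np.array([0.0, 0.0]),
                   1: np.array([3.0, 4.0]),
                   2: np.array([1.0, 1.0])}

def k_distance(b, k=1):
    # Hypothetical: distance from point b to its k-th nearest other point.
    others = np.stack([v for i, v in id_to_embedding.items() if i != b])
    d = euclidean_distances(id_to_embedding[b].reshape(1, -1), others)[0]
    return np.sort(d)[k - 1]

def reachability_distance(a, b):
    distance = euclidean_distances(id_to_embedding[a].reshape(1, -1),
                                   id_to_embedding[b].reshape(1, -1))[0][0]
    return max(k_distance(b), distance)

print(reachability_distance(0, 1))  # 5.0: the direct distance dominates here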
Example #48
0
# start with a list, but to use numpy it has to become an array
lista_X = []

# build an array of arrays
for i in X:
    lista_X.append([i])
array_lista_X = np.array([np.array(xi) for xi in lista_X])

X = array_lista_X

Y = (X)**2

centroides_num = 10  # number of RBF centers

# pick the centers via random.choice, not via cross-validation
index = np.random.choice(a=n, size=centroides_num)

subsample = X[index, :]

gamma = 0.5

kernel = np.exp(-gamma * euclidean_distances(X=X, Y=subsample, squared=True))
para = np.linalg.lstsq(kernel, Y)[0]

predict_Y = np.dot(kernel, para)

plt.plot(X, Y, 'r', label='Original data')
plt.plot(X, predict_Y, 'b', label='After the data fit')
plt.legend()
plt.show()
Example #49
0
def find_lines(lines_mask: np.ndarray) -> list:
    """
    Finds the longest central line for each connected component in the given binary mask.
    :param lines_mask: Binary mask of the detected line-areas
    :return: a list of Opencv-style polygonal lines (each contour encoded as [N,1,2] elements where each tuple is (x,y) )
    """
    # Make sure one-pixel wide 8-connected mask
    lines_mask = skeletonize(lines_mask)

    class MakeLineMCP(MCP_Connect):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.connections = dict()
            self.scores = defaultdict(lambda: np.inf)

        def create_connection(self, id1, id2, pos1, pos2, cost1, cost2):
            k = (min(id1, id2), max(id1, id2))
            s = cost1 + cost2
            if self.scores[k] > s:
                self.connections[k] = (pos1, pos2, s)
                self.scores[k] = s

        def get_connections(self, subsample=5):
            results = dict()
            for k, (pos1, pos2, s) in self.connections.items():
                path = np.concatenate(
                    [self.traceback(pos1),
                     self.traceback(pos2)[::-1]])
                results[k] = path[::subsample]
            return results

        def goal_reached(self, int_index, float_cumcost):
            if float_cumcost > 0:
                return 2
            else:
                return 0

    if np.sum(lines_mask) == 0:
        return []
    # Find extremities points
    end_points_candidates = np.stack(
        np.where((convolve2d(lines_mask, np.ones((3, 3)), mode='same') == 2)
                 & lines_mask)).T
    connected_components = skimage_label(lines_mask, connectivity=2)
    # Group endpoint by connected components and keep only the two points furthest away
    d = defaultdict(list)
    for pt in end_points_candidates:
        d[connected_components[pt[0], pt[1]]].append(pt)
    end_points = []
    for pts in d.values():
        dists = euclidean_distances(np.stack(pts), np.stack(pts))
        i, j = np.unravel_index(dists.argmax(), dists.shape)
        end_points.append(pts[i])
        end_points.append(pts[j])
    end_points = np.stack(end_points)

    mcp = MakeLineMCP(~lines_mask)
    mcp.find_costs(end_points)
    connections = mcp.get_connections()
    # print(type(connections))
    # print(connections.keys())
    # a = connections[(8, 9)][:, None, ::-1]  # leftover debug; assumes endpoint ids 8 and 9 exist
    # print(type(a))
    # print(a)
    img = np.zeros((lines_mask.shape[0], lines_mask.shape[1], 3),
                   dtype=np.uint8)
    img += 255
    # for c in connections.values():
    #     c = c.astype(np.uint8)
    #     print(type(c))
    #     print(c)
    res = [connections[c][:, None, ::-1] for c in connections.keys()]
    for c in res:
        cv2.polylines(img, c, isClosed=True, color=(0, 0, 255), thickness=10)
        # cv2.fillPoly(img, [c], (255, 0, 0))
    Image.fromarray(img).show()
    if not np.all(
            np.array(sorted([i for k in connections.keys()
                             for i in k])) == np.arange(len(end_points))):
        print('Warning : find_lines seems weird')
    return [c[:, None, ::-1] for c in connections.values()]
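The endpoint detection above relies on a small trick: on a one-pixel-wide skeleton, convolving with a 3x3 box of ones gives the value 2 exactly at line endpoints (the pixel itself plus its single neighbour). A tiny illustrative sketch of just that step:

import numpy as np
from scipy.signal import convolve2d

skeleton = np.zeros((5, 7), dtype=bool)
skeleton[2, 1:6] = True  # a horizontal line segment

conv = convolve2d(skeleton.astype(int), np.ones((3, 3)), mode='same')
endpoints = np.stack(np.where((conv == 2) & skeleton)).T
print(endpoints)  # [[2 1] [2 5]] -- the two line ends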
Example #50
0
gt_pred = []
gt_head = []  # Ground Truth of head entity
for line in open(os.path.join(args.output,'test.txt'), 'r'):
    items = line.strip().split("\t")
    gt_head.append(items[1])
    gt_pred.append(items[3])
    gt_tail.append(items[4])"""

# In[36]:

notmatch = list(set(range(0, total_num)).symmetric_difference(id_match))

# In[37]:

notmatch_idx = euclidean_distances(head_emb[notmatch],
                                   entities_emb,
                                   squared=True).argsort(axis=1)

# In[38]:

for idx, i in enumerate(notmatch):
    for j in notmatch_idx[idx, 0:40]:
        mid = mid_num_dic[j]
        head_mid_idx[i].append((mid, None))
        match_mid_list.append(mid)

# In[39]:

correct, mid_num = 0, 0
for i, head_ids in enumerate(head_mid_idx):
    mids = set()
Example #51
0
 def closest_docs(self, point, docs, num_docs=5):
     distances = euclidean_distances(point, docs)
     
     return distances.argsort()[:, :num_docs]
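A minimal, self-contained sketch of the same nearest-documents lookup (the array names here are illustrative, not from the original class):

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
docs = rng.rand(10, 4)   # 10 "documents" with 4 features each
point = rng.rand(1, 4)   # query point

distances = euclidean_distances(point, docs)   # shape (1, 10)
nearest = distances.argsort()[:, :5]           # indices of the 5 closest documents
print(nearest)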
Example #52
0
def count(thresholded, segmented):
    # find the convex hull of the segmented hand region
    chull = cv2.convexHull(segmented)

    # find the most extreme points in the convex hull
    extreme_top    = tuple(chull[chull[:, :, 1].argmin()][0])
    extreme_bottom = tuple(chull[chull[:, :, 1].argmax()][0])
    extreme_left   = tuple(chull[chull[:, :, 0].argmin()][0])
    extreme_right  = tuple(chull[chull[:, :, 0].argmax()][0])

    # find the center of the palm
    cX = int((extreme_left[0] + extreme_right[0]) / 2)
    cY = int((extreme_top[1] + extreme_bottom[1]) / 2)

    # find the maximum euclidean distance between the center of the palm
    # and the most extreme points of the convex hull
    distance = pairwise.euclidean_distances([(cX, cY)], Y=[extreme_left, extreme_right, extreme_top, extreme_bottom])[0]
    maximum_distance = distance[distance.argmax()]

    # calculate the radius of the circle as 50% of the max euclidean distance obtained
    radius = int(0.5 * maximum_distance)

    # find the circumference of the circle
    circumference = (2 * np.pi * radius)

    # take out the circular region of interest which has
    # the palm and the fingers
    circular_roi = np.zeros(thresholded.shape[:2], dtype="uint8")

    # draw the circular ROI
    cv2.circle(circular_roi, (cX, cY), radius, 255, 1)

    # take bit-wise AND between thresholded hand using the circular ROI as the mask
    # which gives the cuts obtained using mask on the thresholded hand image
    circular_roi = cv2.bitwise_and(thresholded, thresholded, mask=circular_roi)

    # compute the contours in the circular ROI
    (cnts, _) = cv2.findContours(circular_roi.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

    # initialize the finger count
    count = 0

    # loop through the contours found
    for c in cnts:
        (x, y, w, h) = cv2.boundingRect(c)
        if ((cY + (cY * 0.25)) > (y + h)) and ((circumference * 0.25) > c.shape[0]):
            count += 1

    return count
Example #53
0
    def test_bd_test(self):
        x = [1, 2, 3, 4, 5]
        y = [1, 2, 3, 4, 5]
        bd_value = bd_test(x, y)
        bd_value = bd_value[0]
        self.assertAlmostEqual(bd_value, 0.0)

        np.random.seed(7654567)
        x = np.random.normal(0, 1, 50)
        y = np.random.normal(1, 1, 50)
        bd_value = bd_test(x, y)
        bd_value = bd_value[0]
        self.assertAlmostEqual(bd_value, 0.196408479999)

        x = np.random.normal(0, 1, 100).reshape(50, 2)
        y = np.random.normal(3, 1, 100).reshape(50, 2)
        bd_value = bd_test(x, y)
        bd_value = bd_value[0]
        self.assertAlmostEqual(bd_value, 0.5681075200000011)

        x = np.random.normal(0, 1, 100).reshape(50, 2)
        y = np.random.normal(10, 1, 100).reshape(50, 2)
        z = np.random.normal(100, 1, 100).reshape(50, 2)
        bd_value = bd_test(x, y, z)
        bd_value = bd_value[0]
        self.assertAlmostEqual(bd_value, 2.0604000000000022)

        bd_value = bd_test(x, y, z, weight="max")
        bd_value = bd_value[0]
        self.assertAlmostEqual(bd_value, 1.3736000000000015)

        n = 90
        x = np.random.normal(0, 1, n)
        bd_value = bd_test(x, size=np.array([40, 50]))
        bd_value = bd_value[0]
        self.assertAlmostEqual(bd_value, 0.009086599999999997)

        x = [np.random.normal(0, 1, num) for num in [40, 50]]
        x = np.hstack(x)
        bd_value = bd_test(x, [40, 50])
        bd_value = bd_value[0]
        self.assertAlmostEqual(bd_value, 0.9639094650205713)

        x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        x = np.array(x, dtype=np.double)
        bd_value = bd_test(x, size=np.array([5, 5]))
        bd_value = bd_value[0]
        self.assertAlmostEqual(bd_value, 0.7231999999999997)

        x = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]]
        bd_value = bd_test(x)
        bd_value = bd_value[0]
        self.assertAlmostEqual(bd_value, 2.403199999999999)

        from sklearn.metrics.pairwise import euclidean_distances
        sigma = [[1, 0], [0, 1]]
        x = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=50)
        y = np.random.multivariate_normal(mean=[1, 1], cov=sigma, size=50)
        x = np.row_stack((x, y))
        dx = euclidean_distances(x, x)
        data_size = [50, 50]
        bd_value = bd_test(dx, size=data_size, dst=True)
        bd_value = bd_value[0]
        self.assertAlmostEqual(bd_value, 0.10779759999999977)
Example #54
0
    def hierarchicalLabelTSVC(self):
        print("hierarchicalLableTSVC")
        nOfLocals = self.locals.shape[0]
        ts = self.ts
        nOfTS = len(ts['f'])
        K = self.options['K']

        local_clusters_assignments = []
        f_sort = np.sort(ts['f'], 0)  # small --> large
        print("f_sort:", f_sort)
        adjacent = np.zeros([nOfLocals, nOfLocals, nOfTS])
        a = []
        flag = 0
        for m in range(nOfTS):
            # cutting level: large --> small (small number of clusters --> large number of clusters)
            cur_f = f_sort[-m - 1]
            # cur_f = f_sort[i]  # cutting level: small --> large (large number of clusters --> small number of clusters)

            tmp = np.nonzero(ts['f'] < cur_f)[0]
            if len(tmp) > 0:  # % TSs inside the sphere
                for j in range(len(tmp)):
                    adjacent[ts['neighbor'][tmp[j], 0],
                             ts['neighbor'][tmp[j], 1], m] = 1
                    adjacent[ts['neighbor'][tmp[j], 1],
                             ts['neighbor'][tmp[j], 0], m] = 1
                    # %% To connect nodes which can be connected via directly connected edges.
                for i in range(nOfLocals):
                    for j in range(i):
                        if (adjacent[i, j, m] == 1):
                            adjacent[i, :, m] = np.logical_or(
                                adjacent[i, :, m], adjacent[j, :, m])
                    adjacent[i, i] = 1

            a = [a, cur_f]
            my_ts = {}
            my_ts['x'] = ts['x'][tmp, :]
            my_ts['f'] = ts['f'][tmp, :]
            my_ts['purturb'] = ts['purturb'][tmp, :]
            my_ts['neighbor'] = ts['neighbor'][tmp, :]
            my_ts['cuttingLevel'] = cur_f
            ind = np.nonzero(ts['f'] == cur_f)[0]
            my_ts['levelx'] = ts['x'][ind[0], :]
            tmp_ts = {}  ####dictionary
            tmp_ts[m] = my_ts

            assignment = cg.connected_components(adjacent[:, :, m])[1]
            print("assignment:", assignment)
            print("N_clusters:", np.max(assignment) + 1)
            if np.max(assignment) == K - 1:
                print('Found a cluster assignment with exactly K clusters')
                # % clstmodel update
                self.out_ts = tmp_ts[m]
                # % cluster assignment into entire data points
                self.local_ass = assignment
                self.cluster_labels = self.local_ass[self.match_local].T
                flag = 1
                break

            local_clusters_assignments = [
                local_clusters_assignments, assignment
            ]

            # % cannot find k clusters
        if flag == 0:
            print(
                'Cannot find a cluster assignment with exactly K clusters; '
                'using the assignment whose number of clusters is closest to K instead!'
            )
            [dummy, ind] = np.min(
                euclidean_distances(
                    np.max(local_clusters_assignments, 0).T, K),
                0)  ####min/max

            # %ts=[];
            self.out_ts = tmp_ts[ind[0]]
            local_clusters_assignments = local_clusters_assignments[:, ind[0]]
            self.local_ass = local_clusters_assignments
            self.cluster_labels = self.local_ass[self.match_local]
            print(self.cluster_labels)
Example #55
0
def _labels_inertia_precompute_dense(X, x_squared_norms, centers, distances):
    """Compute labels and inertia using a full distance matrix.
    This will overwrite the 'distances' array in-place.
    Parameters
    ----------
    X : numpy array, shape (n_sample, n_features)
        Input data.
    x_squared_norms : numpy array, shape (n_samples,)
        Precomputed squared norms of X.
    centers : numpy array, shape (n_clusters, n_features)
        Cluster centers which data is assigned to.
    distances : numpy array, shape (n_samples,)
        Pre-allocated array in which distances are stored.
    Returns
    -------
    labels : numpy array, dtype=np.int, shape (n_samples,)
        Indices of clusters that samples are assigned to.
    inertia : float
        Sum of distances of samples to their closest cluster center.
    """
    n_samples = X.shape[0]
    k = centers.shape[0]
    all_distances = euclidean_distances(centers, X, x_squared_norms,
                                        squared=True)
    labels = np.empty(n_samples, dtype=np.int32)
    labels.fill(-1)
    mindist = np.empty(n_samples)
    mindist.fill(np.infty)


    n_samples = X.shape[0]
    k = centers.shape[0]
    max_cluster_size = get_clusters_size(n_samples, k)

    labels, mindist = initial_assignment(labels, mindist, n_samples, all_distances, max_cluster_size)
    all_points = np.arange(n_samples)

    for point in all_points:
        for point_dist in get_best_point_distances(point, all_distances):
            cluster_id, point_dist = point_dist
            # initial assignment
            if not is_cluster_full(cluster_id, max_cluster_size, labels):
                labels[point] = cluster_id
                mindist[point] = point_dist
                break

    # refinement of clustering
    transfer_list = []
    best_mindist = mindist.copy()
    best_labels = labels.copy()
    # sort all of the points from largest distance to smallest
    points_by_high_distance = np.argsort(mindist)[::-1]
    for point in points_by_high_distance:
        point_cluster = labels[point]

        # see if there is an opening on the best cluster for this point
        cluster_id, point_dist = get_best_cluster_for_point(point, all_distances)
        if not is_cluster_full(cluster_id, max_cluster_size, labels) and point_cluster != cluster_id:
            labels[point] = cluster_id
            mindist[point] = point_dist
            best_labels = labels.copy()
            best_mindist = mindist.copy()
            continue # on to the next point

        for swap_candidate in transfer_list:
            cand_cluster = labels[swap_candidate]
            if point_cluster != cand_cluster:

                # get the current dist of swap candidate
                cand_distance = mindist[swap_candidate]

                # get the potential dist of point
                point_distance = all_distances[cand_cluster, point]

                # compare
                if point_distance < cand_distance:

                    labels[point] = cand_cluster
                    mindist[point] = all_distances[cand_cluster, point]

                    labels[swap_candidate] = point_cluster
                    mindist[swap_candidate] = all_distances[point_cluster, swap_candidate]

                    if np.absolute(mindist).sum() <  np.absolute(best_mindist).sum():
                        # update the labels since the transfer was a success
                        best_labels = labels.copy()
                        best_mindist = mindist.copy()
                        break

                    else:
                        # reset since the transfer was not a success
                        labels = best_labels.copy()
                        mindist = best_mindist.copy()

        transfer_list.append(point)

    if n_samples == distances.shape[0]:
        # distances will be changed in-place
        distances[:] = mindist
    inertia = best_mindist.sum()

    return best_labels, inertia
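A small sketch of the precomputed-norms call used at the top of this function; note that in newer scikit-learn releases the extra arguments of euclidean_distances are keyword-only, so passing x_squared_norms positionally (as the snippet above does) may only work on older versions.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
X = rng.rand(6, 3)
centers = rng.rand(2, 3)
x_squared_norms = (X ** 2).sum(axis=1)

# Same squared distances, with the norms of the second argument supplied up front.
D1 = euclidean_distances(centers, X, Y_norm_squared=x_squared_norms, squared=True)
D2 = euclidean_distances(centers, X, squared=True)
print(np.allclose(D1, D2))  # True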
Example #56
0
def calculate_distance(cent, player_values):
    dist = euclidean_distances(cent, player_values)
    return dist[0][0]
Example #57
0
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--threshold', type=float, default=0.9)
    parser.add_argument('--delete',
                        action='store_true',
                        help='Delete the outliers.')
    parser.add_argument('--doSubDirs',
                        action='store_true',
                        help='Work on all direct subdirectories.')
    parser.add_argument('--duplicates',
                        action='store_true',
                        help='Identify duplicates rather than outliers.')
    parser.add_argument(
        '--checkDirs',
        action='store_true',
        help=
        'Check directories for high variance, indicating previous clean-up has not worked well.'
    )
    parser.add_argument('directory')

    args = parser.parse_args()

    if args.duplicates and args.checkDirs:
        sys.exit("Combination of --duplicates and --checkDirs is not allowed.")

    if not os.path.isdir(args.directory):
        sys.exit("Input directory not found.")

    if args.doSubDirs:
        d = next(os.walk(args.directory))[1]
    else:
        d = [args.directory]

    for thisdir in d:
        print("=== {} ===".format(os.path.join(args.directory, thisdir)))
        outliers = []
        duplicates = []
        featList = list()
        allfiles = os.listdir(os.path.join(args.directory, thisdir))
        allfilesFaces = list()
        for thisfile in allfiles:
            if thisfile.endswith(".vgg1"):
                with open(os.path.join(args.directory, thisdir, thisfile),
                          'r', newline='') as f:
                    reader = csv.reader(f)
                    for row in reader:
                        featList.append(row)
                        #multiple faces in a single image files
                        allfilesFaces.append(
                            os.path.join(args.directory, thisdir, thisfile))
        thisEmbeddings = np.vstack(featList)
        thisEmbeddings = thisEmbeddings.astype(float)
        if args.duplicates:
            for p1 in range(0, thisEmbeddings.shape[0]):
                for p2 in range(p1 + 1, thisEmbeddings.shape[0]):
                    dist = euclidean_distances(
                        thisEmbeddings[p1].reshape(1, -1),
                        thisEmbeddings[p2].reshape(1, -1))
                    if dist < args.threshold:
                        duplicates.append(
                            (allfilesFaces[p1], allfilesFaces[p2], dist))
            print("Found {} duplicate pairs from {} images.".format(
                len(duplicates), len(allfiles)))
            for p1, p2, dist in duplicates:
                print("{} - {}: {:0.4f}".format(p1, p2, dist[0][0]))
                if args.delete:
                    try:
                        os.remove(p2)
                    except OSError:
                        # might already be removed if there are 3 or more identical images
                        print("could not remove: ", p2)
        elif args.checkDirs:
            std = np.std(thisEmbeddings, axis=0)
            #mean = np.mean(thisEmbeddings, axis=0)
            dists = euclidean_distances(thisEmbeddings, thisEmbeddings)
            o = np.std(dists)
            # little reduction of std in cleaned-up version after outlier removal could be a hint,
            # but could also indicate perfect start, and would need keeping both directories
            # std < 0.2 means probably mostly images of one person, OK
            # std > 0.25 means probably images of two or more persons, not OK
            # std between 0.2 and 0.25 is a bit unclear, either a very varied face, or young to old, or multiple persons
            print(o)
        else:
            #print(type(thisEmbeddings))
            #print(type(thisEmbeddings[0][0]))
            mean = np.mean(thisEmbeddings, axis=0)
            dists = euclidean_distances(thisEmbeddings, mean.reshape(1, -1))
            for path, dist in zip(allfilesFaces, dists):
                dist = dist.take(0)
                if dist > args.threshold:
                    outliers.append((path, dist))
            print("Found {} outlier(s) from {} images.".format(
                len(outliers), len(allfiles)))
            for path, dist in outliers:
                print(" + {} ({:0.2f})".format(path, dist))
                if args.delete:
                    try:
                        os.remove(path)
                    except:
                        pass
Example #58
0
 def _transform(self, X):
     """guts of transform method; no input validation"""
     #print "In _transform(), X =\n", X
     #print "In _transform(), self.cluster_centers_ =\n", self.cluster_centers_
     return euclidean_distances(X, self.cluster_centers_)
Example #59
0
def _kmedoids_run(X, n_clusters, max_iter, tolerance):
    '''
    Main function for running the k-medoids clustering.
    -------------
    X: the input data ndarray for k-medoids clustering, (#samples, #features)
    n_clusters: number of clusters
    max_iter: maximum number of iterations
    tolerance: the tolerance to stop the iterations, in percentage;
               i.e. if tolerance=0.01, the iteration stops once the cost function decreases by less than 1%.
    '''

    n_samples = len(X)
    '''Calculate the pairwise Euclidean distances'''
    dist_mat = euclidean_distances(X)
    ''' Initialize the medoids'''
    currentMedoids = np.asarray(_get_init_centers(n_clusters, n_samples))
    '''Calculate the total cost of the initial medoids'''
    costs_iters = []
    dist_meds = dist_mat[currentMedoids]
    tot_cos = _get_cost(dist_meds, currentMedoids)
    costs_iters.append(tot_cos)
    cc = 0

    for i in range(max_iter):
        dist_meds = dist_mat[currentMedoids]
        '''Associate each data point with the closest medoid
           and calculate the total cost'''
        tot_cos = _get_cost(dist_meds, currentMedoids)
        '''Get new candidate medoids o'''
        newMedoids = []
        for j in range(n_clusters):
            o = np.random.choice(n_samples)
            if (not o in currentMedoids and not o in newMedoids):
                newMedoids.append(o)
        newMedoids = np.asarray(newMedoids).astype(int)
        dist_meds_ = dist_mat[newMedoids]
        tot_cos_ = _get_cost(dist_meds_, newMedoids)
        '''Swap the new medoids with the current medoids if the cost decreases'''
        if (tot_cos_ - tot_cos) < 0:
            currentMedoids = newMedoids
            costs_iters.append(tot_cos_)
            cc += 1
            if abs(costs_iters[cc] / costs_iters[cc - 1] - 1) < tolerance:
                '''Associate data points with the final medoids (tolerance reached)'''
                clsts_membr_ids = []
                dis_min = np.min(dist_meds, axis=0)
                for k in range(n_clusters):
                    clst_mem_ids = np.where(dist_meds[k] == dis_min)[0]
                    clsts_membr_ids.append(clst_mem_ids)

                return currentMedoids, clsts_membr_ids, costs_iters

    costs_iters = np.asarray(costs_iters)
    '''Associate data points with the final medoids (maximum iterations reached)'''
    clsts_membr_ids = []
    dist_meds = dist_mat[currentMedoids]
    dis_min = np.min(dist_meds, axis=0)
    for k in range(n_clusters):
        clst_mem_ids = np.where(dist_meds[k] == dis_min)[0]
        clsts_membr_ids.append(clst_mem_ids)

    return currentMedoids, clsts_membr_ids, costs_iters
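The snippet calls _get_init_centers and _get_cost without showing them; below is a minimal sketch of plausible implementations (an assumption for illustration, not the original helpers):

import numpy as np

def _get_init_centers(n_clusters, n_samples):
    # Hypothetical helper: pick distinct random indices as the initial medoids.
    return np.random.choice(n_samples, size=n_clusters, replace=False)

def _get_cost(dist_meds, medoids):
    # Hypothetical helper: total cost = sum over all points of the distance
    # to the closest medoid (dist_meds has shape (n_clusters, n_samples)).
    return dist_meds.min(axis=0).sum()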
Example #60
0
    def _predict(self, X_predict):
        """
        Auxiliary function to do the kNN prediction based on an approximated
        geodesic metric, while possibly intersecting each new sample with
        a Euclidean ball size self.ball_radius first.

        Parameters
        ------------
        X_predict: np.array of size (D) or of size (N, D)
            Test points at which a prediction is done.

        Returns
        -------------
        An np.array of size (N, len(self.n_neighbors)) containing the prediction
        for the N-th points with all desired choices of neighbors in the N-th row.
        """
        # Handle only case where n_neighbors is a list here
        if isinstance(self.n_neighbors, int):
            tmp_n_neighbors = [self.n_neighbors]
        else:
            tmp_n_neighbors = self.n_neighbors
        if len(X_predict.shape) == 1:  # Single sample case
            tmp_X_predict = np.reshape(X_predict, (1, -1))
        else:
            tmp_X_predict = X_predict
        n_test_samples = tmp_X_predict.shape[0]
        if self.ball_radius is None:
            ball_radius = 1e16
        else:
            ball_radius = self.ball_radius
        # Container
        prediction = np.zeros((n_test_samples, len(tmp_n_neighbors)))
        # Boolean matrix with 1 in (i,j) if training sample X_j is inside the Euclidean ball around test sample i
        inside_euclidean_ball = np.less(
            euclidean_distances(tmp_X_predict, self.X_),
            ball_radius).astype('bool')
        # Get training samples belonging to a certain level set
        assignment = [[] for _ in range(self.n_levelsets)]
        for j in range(self.n_levelsets):
            assignment[j] = np.where(self.labels_ == j)[0]
        # Get maximum number of points in the radius for any test point
        min_idx = list(tmp_n_neighbors)  # copy so the mutation below does not alter n_neighbors
        for k in range(n_test_samples):
            distances = 100.0 * np.ones(self.N)
            for i in range(self.n_levelsets):
                idx = np.where(inside_euclidean_ball[k, assignment[i]])[
                    0]  # Find indices that are inside euclidean ball
                PX_predict = self.tangents_[i, :].dot(tmp_X_predict[k, :].T)
                ind_levelset = np.where(self.labels_[idx] == i)[0]
                # Setting distances inside level set and euclidean ball
                distances[assignment[i][idx]] = np.abs(
                    self.PX_[assignment[i][idx]] - PX_predict)
                # distances[assignemnt[ind_levelset]] = np.abs(self.PX_[idx][ind_levelset] - PX_predict)
            for l, nNei in enumerate(tmp_n_neighbors):
                idx_for_pred = np.argpartition(
                    distances, tmp_n_neighbors[l])[:tmp_n_neighbors[l]]
                idx_below_bound = np.where(distances[idx_for_pred] < 90.0)[0]
                if len(idx_below_bound) < min_idx[l]:
                    min_idx[l] = len(idx_below_bound)
                if len(idx_for_pred[idx_below_bound]) == 0:
                    raise RuntimeError(
                        "kNN Prediction: No neighbours satisfy the requirements."
                    )
                prediction[k,
                           l] = np.mean(self.Y_[idx_for_pred[idx_below_bound]])
        if any(np.array(min_idx) < np.array(tmp_n_neighbors)):
            print "Could use only {0} samples for some predictions".format(
                min_idx)
        return prediction
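The Euclidean-ball pre-filtering near the top of _predict reduces to thresholding a pairwise distance matrix; a tiny self-contained sketch of that step alone (all names here are illustrative):

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
X_train = rng.rand(8, 2)
X_test = rng.rand(3, 2)
ball_radius = 0.5

# Boolean mask: entry (i, j) is True if training point j lies inside the
# Euclidean ball of radius ball_radius around test point i.
inside = np.less(euclidean_distances(X_test, X_train), ball_radius)
print(inside.shape)  # (3, 8)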