def draw_intensity(a, cmap=GREEN_CMAP, metric='euclidean', method='average', sort_x=True, sort_y=True):
    main_axes = plt.gca()
    divider = make_axes_locatable(main_axes)

    if sort_x is True:
        plt.sca(divider.append_axes("top", 0.5, pad=0))
        xlinkage = linkage(pdist(a.T, metric=metric), method=method, metric=metric)
        xdendro = dendrogram(xlinkage, orientation='top', no_labels=True,
                             distance_sort='descending',
                             link_color_func=lambda x: 'black')
        plt.gca().set_axis_off()
        a = a[[a.columns[i] for i in xdendro['leaves']]]

    if sort_y is True:
        plt.sca(divider.append_axes("left", 1.0, pad=0))
        ylinkage = linkage(pdist(a, metric=metric), method=method, metric=metric)
        ydendro = dendrogram(ylinkage, orientation='right', no_labels=True,
                             distance_sort='descending',
                             link_color_func=lambda x: 'black')
        plt.gca().set_axis_off()
        a = a.loc[[a.index[i] for i in ydendro['leaves']]]

    plt.sca(main_axes)
    plt.imshow(a, aspect='auto', interpolation='none',
               cmap=cmap, vmin=0.0, vmax=1.0)
    plt.colorbar(pad=0.15)
    plt.gca().yaxis.tick_right()
    plt.xticks(range(a.shape[1]), a.columns, rotation=90, size='small')
    plt.yticks(range(a.shape[0]), a.index, size='x-small')
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.gca().invert_yaxis()

    plt.show()
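A minimal usage sketch for the snippet above, assuming its module-level imports (matplotlib.pyplot as plt, make_axes_locatable, linkage, dendrogram, pdist) are in scope; the data below is an illustrative stand-in, and 'Greens' stands in for the project's GREEN_CMAP:

import numpy as np
import pandas as pd

# Hypothetical data: 8 samples x 5 conditions, values already in [0, 1].
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random((8, 5)),
                  index=['sample%d' % i for i in range(8)],
                  columns=['cond%d' % j for j in range(5)])

draw_intensity(df, cmap='Greens')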
Example No. 2
def rmsd(ref_cds, est_cds):
    """
    Root-mean-squared-difference
    """
    ref_dists = pdist(ref_cds)
    est_dists = pdist(est_cds)
    return np.sqrt(((ref_dists - est_dists)**2).mean())
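A hedged example of calling rmsd above on two hypothetical coordinate sets (its module is assumed to import numpy as np and pdist); because only the condensed distance vectors are compared, the value is unchanged by rigid motions of either set:

import numpy as np

ref = np.random.rand(10, 3)                                # reference coordinates
est = ref + np.random.normal(scale=0.05, size=ref.shape)   # perturbed estimate
print(rmsd(ref, est))   # small: the internal distance structure is nearly preserved
print(rmsd(ref, ref))   # exactly 0.0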
def compute_distance():
	'''
	Computes distances between congress members for a particular category and writes out the results
	in a text file. Web App reads these text files to show graphs. 
	'''

	category_map = {1: 'Health Care', 2: 'National Security', 3:'Economy', 4:'Environment', 5:'Domestic Issues' }
	vm = Voting_Matrix('114')

	for j in xrange(1,6):
		votes, member_to_row =  vm.generate_matrix(category = [j])
		y = pdist(votes, 'cosine')
		y_dist = squareform(y)
		normed_distances = np.zeros((len(y_dist), len(y_dist)))
		for i in xrange(len(y_dist)):
			min_value = min(y_dist[i,:])
			max_value = max(y_dist[i,:])
			normed_distances[i,:] = (y_dist[i,:]-min_value) / (max_value-min_value)

		np.savetxt("data/%s114Distance.csv" %category_map[j], normed_distances, delimiter=",", fmt='%5.5f')

	votes, member_to_row =  vm.generate_matrix(category = [1,2,3,4,5])
	y = pdist(votes, 'cosine')
	y_dist = squareform(y)
	normed_distances = np.zeros((len(y_dist), len(y_dist)))
	for i in xrange(len(y_dist)):
		min_value = min(y_dist[i,:])
		max_value = max(y_dist[i,:])
		normed_distances[i,:] = (y_dist[i,:]-min_value) / (max_value-min_value)
	np.savetxt("data/All Categories114Distance.csv" , normed_distances, delimiter=",", fmt='%5.5f')

	df = pd.read_csv('../DataCollectionInsertion/Members/114Members.csv')
	row_nums = np.array([member_to_row[str(df.iloc[i]['person__id'])] for i in xrange(len(df))])
	df['row_nums'] = row_nums
	df.to_csv('../DataCollectionInsertion/Members/114Members.csv', sep=',')
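The heart of compute_distance is a cosine pdist followed by a row-wise min-max rescaling of the square distance matrix; a self-contained sketch of just that step, on a made-up voting matrix:

import numpy as np
from scipy.spatial.distance import pdist, squareform

votes = np.random.randint(0, 3, size=(20, 50))      # hypothetical member-by-vote matrix
y_dist = squareform(pdist(votes, 'cosine'))          # square symmetric cosine-distance matrix

# Rescale each member's row of distances to [0, 1], as in the loops above.
row_min = y_dist.min(axis=1, keepdims=True)
row_max = y_dist.max(axis=1, keepdims=True)
normed_distances = (y_dist - row_min) / (row_max - row_min)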
Example No. 4
def writePlotMDS(num, nest, seqs, dbfile, mappos, maparr, map2d, outfile, refdb=None, refseqs=None, rg=None):
    
    #initialize variables
    clusters = range(1,len(num)+1)
    frequency = list(num)
    
    #loop through clusters
    structure = [0 for i in range(len(num))]
    diversity = [[] for i in range(len(num))]
    for i in range(len(num)):
        indices = [j for j, x in enumerate(seqs) if x == nest[i]]
        db = [dbfile[j] for j in indices]
        
        #get cluster structure medoids
        structs = [j.replace('.','0') for j in db]
        structs = [j.replace('(','1') for j in structs]
        structs = [j.replace(')','1') for j in structs]
        structs = [[int(x) for x in list(j)] for j in structs]
        dst = squareform(pdist(1 - np.matrix(structs), 'jaccard'))
        dst = np.sum(dst, axis=0)
        ind = np.argmin(dst)
        structure[i] = db[ind]

        #get diversity
        if refdb is not None:
            indices = [j for j, x in enumerate(refseqs) if x == nest[i]]
            db = [refdb[j] for j in indices]
            db = [x[rg[0]:rg[-1]] for x in db]
            structs = [j.replace('.','0') for j in db]
            structs = [j.replace('(','1') for j in structs]
            structs = [j.replace(')','1') for j in structs]
            structs = [[int(x) for x in list(j)] for j in structs]
            if not indices:
                diversity[i] = [0, 0]
            else:
                d = Counter(db)
                d = sorted(d.items())
                n = [x[0] for x in d] #unique structures
                m = [x[1] for x in d] #frequency
                divsz = 1
                if len(m) > 1:
                    if len(structs) < 2:
                        divsz = 1
                    if len(structs) < 3:
                        divsz = pdist(1-np.matrix(structs),'jaccard').tolist()[0]
                    else:
                        divsz = min(np.diag(np.matrix(pdist(1-np.matrix(structs),'jaccard')),k=1))
                divfreq = max(m)/len(db)
                diversity[i] = [divsz, divfreq]
       
    #write to file
    with open(outfile+'.csv', 'w') as csvfile:
        fieldnames = ['cluster', 'xy-coords', 'frequency', 'mediod-structure', 'vectorization']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i in range(len(num)):
            basic_dict = {'cluster': clusters[i], 'xy-coords': np.array_str(mappos[i]), 'frequency': frequency[i], 'mediod-structure': structure[i], 'vectorization': maparr[i],}
            writer.writerow(basic_dict)

    return{'structs':structure, 'diversity':diversity}
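The medoid step in writePlotMDS picks the structure with the smallest total Jaccard distance to the rest of its cluster; a compact sketch of that idea on its own, with invented 0/1 vectors:

import numpy as np
from scipy.spatial.distance import pdist, squareform

structs = np.random.randint(0, 2, size=(6, 30))   # hypothetical 0/1 structure vectors
dmat = squareform(pdist(structs, 'jaccard'))       # full pairwise Jaccard distance matrix
medoid_idx = int(np.argmin(dmat.sum(axis=0)))      # member with the smallest total distance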
Example No. 5
def stress(ref_cds, est_cds):
    """
    Kruskal's stress
    """
    ref_dists = pdist(ref_cds)
    est_dists = pdist(est_cds)
    return np.sqrt(((ref_dists - est_dists)**2).sum() / (ref_dists**2).sum())
Example No. 6
File: rti.py  Project: npatwari/rti
def initRTI(nodeLocs, delta_p, sigmax2, delta, excessPathLen):

    # Set up pixel locations as a grid.
    personLL        = nodeLocs.min(axis=0)
    personUR        = nodeLocs.max(axis=0)
    pixelCoords, xVals, yVals = calcGridPixelCoords(personLL, personUR, delta_p)
    pixels          = pixelCoords.shape[0]
    #plt.figure(3)
    #plotLocs(pixelCoords)
    

    # Find distances between pixels and transceivers
    DistPixels  = dist.squareform(dist.pdist(pixelCoords))
    DistPixelAndNode = dist.cdist(pixelCoords, nodeLocs)
    DistNodes   = dist.squareform(dist.pdist(nodeLocs))

    # Find the (inverse of) the Covariance matrix between pixels
    CovPixelsInv       = linalg.inv(sigmax2*np.exp(-DistPixels/delta))

    # Calculate weight matrix for each link.
    nodes = len(nodeLocs)
    links = nodes*(nodes-1)
    W = np.zeros((links, pixels))
    for ln in range(links):
        txNum, rxNum  = txRxForLinkNum(ln, nodes)
        ePL           = DistPixelAndNode[:,txNum] + DistPixelAndNode[:,rxNum] - DistNodes[txNum,rxNum]  
        inEllipseInd  = np.argwhere(ePL < excessPathLen)
        pixelsIn      = len(inEllipseInd)
        if pixelsIn > 0:
            W[ln, inEllipseInd] = 1.0 / float(pixelsIn)

    # Compute the projection matrix
    inversion       = np.dot(linalg.inv(np.dot(W.T, W) + CovPixelsInv), W.T)

    return (inversion, xVals, yVals)
Example No. 7
    def test_pdist(self):
        for metric, argdict in self.scipy_metrics.iteritems():
            keys = argdict.keys()
            for vals in itertools.product(*argdict.values()):
                kwargs = dict(zip(keys, vals))
                D_true = pdist(self.X1, metric, **kwargs)
                Dsq_true = squareform(D_true)
                dm = DistanceMetric(metric, **kwargs)
                for X in self.X1, self.spX1:
                    yield self.check_pdist, metric, X, dm, Dsq_true, True

                for X in self.X1, self.spX1:
                    yield self.check_pdist, metric, X, dm, D_true, False

        for rmetric, (metric, func) in self.reduced_metrics.iteritems():
            argdict = self.scipy_metrics[metric]
            keys = argdict.keys()
            for vals in itertools.product(*argdict.values()):
                kwargs = dict(zip(keys, vals))
                D_true = func(pdist(self.X1, metric, **kwargs),
                              **kwargs)
                Dsq_true = squareform(D_true)
                dm = DistanceMetric(rmetric, **kwargs)
                for X in self.X1, self.spX1:
                    yield self.check_pdist, rmetric, X, dm, Dsq_true, True

                for X in self.X1, self.spX1:
                    yield self.check_pdist, rmetric, X, dm, D_true, False
Example No. 8
def optimal_clustering(df, patch, method='kmeans', statistic='gap', max_K=5):
    if len(patch) == 1:
        return [patch]

    if statistic == 'db':
        if method == 'kmeans':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.loc[patch, :]
            for k in range(2, K_max + 1):
                kmeans = cluster.KMeans(n_clusters=k).fit(X)
                clustering[k] = pd.DataFrame(kmeans.predict(X), index=patch)
                dist_mu = squareform(pdist(kmeans.cluster_centers_))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.loc[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index) for i in range(k_optimal)]

        elif method == 'agglomerative':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.loc[patch, :]
            for k in range(2, K_max + 1):
                agglomerative = cluster.AgglomerativeClustering(n_clusters=k, linkage='average').fit(X)
                clustering[k] = pd.DataFrame(agglomerative.fit_predict(X), index=patch)
                tmp = [list(clustering[k][clustering[k][0] == i].index) for i in range(k)]
                centers = np.array([np.mean(X.loc[c, :], axis=0) for c in tmp])
                dist_mu = squareform(pdist(centers))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.loc[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index) for i in range(k_optimal)]

    elif statistic == 'gap':
        X = np.array(df.loc[patch, :])
        if method == 'kmeans':
            f = cluster.KMeans
        gaps = gap(X, ks=range(1, min(max_K, len(patch))), method=f)
        k_optimal = list(gaps).index(max(gaps))+1
        clustering = pd.DataFrame(f(n_clusters=k_optimal).fit_predict(X), index=patch)
        return [list(clustering[clustering[0] == i].index) for i in range(k_optimal)]

    else:
        raise ValueError('error: only db and gap statistics are supported')
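davies_bouldin is defined elsewhere in this project; a hedged sketch of the standard Davies-Bouldin index it presumably computes, taking the same inputs (dist_mu: pairwise center distances from pdist/squareform, sigma: per-cluster scatters):

import numpy as np

def davies_bouldin_sketch(dist_mu, sigma):
    # dist_mu: (k, k) distances between cluster centers; sigma: length-k scatters
    k = len(sigma)
    ratios = np.zeros((k, k))
    for i in range(k):
        for j in range(k):
            if i != j:
                ratios[i, j] = (sigma[i] + sigma[j]) / dist_mu[i, j]
    return np.mean(ratios.max(axis=1))   # average of each cluster's worst-case ratio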
Example No. 9
def plot_transition_clustermap(data_array, gene_names, pseudotimes, n_clusters=10, gradient=False):
    if gradient:
        data_to_plot = zscore(np.gradient(data_array)[1].T, axis=0)
        scale = None
        metric = 'seuclidean'
        row_linkage = linkage(pdist(abs(data_to_plot), metric=metric), method='complete')
    else:
        data_to_plot = data_array.T
        scale = 0
        metric = 'correlation'
        row_linkage = linkage(pdist(data_to_plot, metric=metric), method='complete')
    
    assignments = fcluster(row_linkage, n_clusters, criterion='maxclust')
    cm = sns.clustermap(data_to_plot, col_cluster=False, standard_scale=scale, 
                        yticklabels=gene_names, row_linkage=row_linkage,
                        row_colors=[settings.STATE_COLORS[i] for i in assignments])
    r = np.arange(10, data_array.shape[0], data_array.shape[0]/10)
    plt.setp(cm.ax_heatmap.get_yticklabels(), fontsize=5)
    cm.ax_heatmap.set_xticks(r)
    cm.ax_heatmap.set_xticklabels(['%.1f' % x for x in pseudotimes[r]])
    cm.ax_heatmap.set_xlabel('Pseudotime')
    cm.ax_heatmap.set_ylabel('Gene')
    
    gene_clusters = defaultdict(list)
    for i, cl in enumerate(assignments):
        gene_clusters[settings.STATE_COLORS[cl]].append(gene_names[i])
    return gene_clusters
Example No. 10
    def kcca(self, X, Y, kernel_x=gaussian_kernel, kernel_y=gaussian_kernel, eta=1.0):
        n, p = X.shape
        n, q = Y.shape
        
        Kx = DIST.squareform(DIST.pdist(X, kernel_x))
        Ky = DIST.squareform(DIST.pdist(Y, kernel_y))
        J = np.eye(n) - np.ones((n, n)) / n
        M = np.dot(np.dot(Kx.T, J), Ky) / n
        L = np.dot(np.dot(Kx.T, J), Kx) / n + eta * Kx
        N = np.dot(np.dot(Ky.T, J), Ky) / n + eta * Ky


        sqx = SLA.sqrtm(SLA.inv(L))
        sqy = SLA.sqrtm(SLA.inv(N))
        
        a = np.dot(np.dot(sqx, M), sqy.T)
        A, s, Bh = SLA.svd(a, full_matrices=False)
        B = Bh.T
        
        # U = np.dot(np.dot(A.T, sqx), X).T
        # V = np.dot(np.dot(B.T, sqy), Y).T
        print s.shape
        print A.shape
        print B.shape
        return s, A, B
 def getClosestGID( self , rounded):
     """Calculate the kmer score
     
     returns (score, (closestGID, dist), (furthestGID, dist))
     """
     LGTs = self.lgtGenomes.keys()
     print LGTs
     dgs = []
     for lgt_id in LGTs:
         #print lgt_id
         dg1 = pdist([self.genomeTmers[self.lgtGenomes[lgt_id][0]], self.lgtTmer[lgt_id] ])
         dg2 = pdist([self.genomeTmers[self.lgtGenomes[lgt_id][1]], self.lgtTmer[lgt_id] ])
         dg1_str = ''.join(map(str,dg1))
         dg2_str = ''.join(map(str,dg2)) 
         rounded_score = float(np.round(dg1/(dg1+dg2),decimals=2))
         score = float(dg1/(dg1+dg2))
         #print rounded_score
         self.lgtScores[lgt_id] = [score,float(np.mean([dg1,dg2])),dg1_str,dg2_str]
         print self.lgtScores
         if rounded:
             try:
                 self.Dist_dict[rounded_score]+=1
             except KeyError:
                 self.Dist_dict[rounded_score]=1
         else:
             self.Dist_dict[score]=[float(np.mean([dg1,dg2])),dg1_str,dg2_str]
Example No. 12
def cengci(data):
    X = data
    distMatrix = pdist(X)
    Z = linkage(X, 'ward')
    c, coph_dists = cophenet(Z, pdist(X))
    print c
    dendrogram(Z)
Example No. 13
def distcorr(X, Y, flip=True):
    """ Compute the distance correlation function
    
    >>> a = [1,2,3,4,5]
    >>> b = np.array([1,2,9,4,4])
    >>> distcorr(a, b)
    0.762676242417
    
    Taken from: https://gist.github.com/satra/aa3d19a12b74e9ab7941
    """
    X = np.atleast_1d(X)
    Y = np.atleast_1d(Y)
    if np.prod(X.shape) == len(X):
        X = X[:, None]
    if np.prod(Y.shape) == len(Y):
        Y = Y[:, None]
    X = np.atleast_2d(X)
    Y = np.atleast_2d(Y)
    n = X.shape[0]
    if Y.shape[0] != X.shape[0]:
        raise ValueError('Number of samples must match')
    a = squareform(pdist(X))
    b = squareform(pdist(Y))
    A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
    B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
    
    dcov2_xy = (A * B).sum()/float(n * n)
    dcov2_xx = (A * A).sum()/float(n * n)
    dcov2_yy = (B * B).sum()/float(n * n)
    dcor = np.sqrt(dcov2_xy)/np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))
    if flip == True:
        dcor = 1-dcor
    return dcor
Example No. 14
File: gp.py  Project: davidar/gpo
def K_SE(xs, ys=None, l=1, deriv=False, wrt='l'):
    l = asarray(l)
    sig = 1 #l[0]
    #l = l[1:]
    xs = ascolumn(xs)
    if ys is None:
        d = squareform(pdist(xs/l, 'sqeuclidean'))
    else:
        ys = ascolumn(ys)
        d = cdist(xs/l, ys/l, 'sqeuclidean')
    cov = exp(-d/2)
    if not deriv: return sig * cov

    grads = []
    if wrt == 'l':
        #grads.append(cov) # grad of sig
        for i in xrange(shape(xs)[1]):
            if ys is None:
                grad = sig * cov * squareform(pdist(ascolumn(xs[:,i]), 'sqeuclidean'))
            else:
                grad = sig * cov * cdist(ascolumn(xs[:,i]), ascolumn(ys[:,i]), 'sqeuclidean')
            grad /= l[i] ** 3
            grads.append(grad)
        return sig * cov, grads
    elif wrt == 'y':
        if shape(xs)[0] != 1: print '*** x not a row vector ***'
        jac = sig * cov * ((ys - xs) / l**2).T
        return sig * cov, jac
Example No. 15
def vi_pairwise_matrix(segs, split=False):
    """Compute the pairwise VI distances within a set of segmentations.

    If 'split' is set to True, two matrices are returned, one for each 
    direction of the conditional entropy.

    0-labeled pixels are ignored.

    Parameters
    ----------
    segs : iterable of np.ndarray of int
        A list or iterable of segmentations. All arrays must have the same
        shape.
    split : bool, optional
        Should the split VI be returned, or just the VI itself (default)?

    Returns
    -------
    vi_sq : np.ndarray of float, shape (len(segs), len(segs))
        The distances between segmentations. If `split==False`, this is a
        symmetric square matrix of distances. Otherwise, the lower triangle
        of the output matrix is the false split distance, while the upper
        triangle is the false merge distance.
    """
    d = np.array([s.ravel() for s in segs])
    if split:
        def dmerge(x, y): return split_vi(x, y)[0]
        def dsplit(x, y): return split_vi(x, y)[1]
        merges, splits = [squareform(pdist(d, df)) for df in [dmerge, dsplit]]
        out = merges
        tri = np.tril(np.ones(splits.shape), -1).astype(bool)
        out[tri] = splits[tri]
    else:
        out = squareform(pdist(d, vi))
    return out
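The snippet above leans on pdist accepting a Python callable as its metric; a minimal, self-contained illustration of that pattern with toy label arrays and a made-up disagreement measure:

import numpy as np
from scipy.spatial.distance import pdist, squareform

def label_disagreement(u, v):
    # toy metric: fraction of positions whose labels differ, ignoring positions that are 0 in both
    mask = (u > 0) | (v > 0)
    return float(np.mean(u[mask] != v[mask])) if mask.any() else 0.0

segs = np.random.randint(0, 4, size=(5, 100))     # five flattened toy segmentations
D = squareform(pdist(segs, label_disagreement))    # symmetric distance matrix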
Example No. 16
def test_PDist():
    targets = np.tile(xrange(3),2)
    chunks = np.repeat(np.array((0,1)),3)
    ds = dataset_wizard(samples=data, targets=targets, chunks=chunks)
    data_c = data - np.mean(data,0)
    # DSM matrix elements should come out as samples of one feature
    # to be in line with what e.g. a classifier returns -- facilitates
    # collection in a searchlight ...
    euc = pdist(data, 'euclidean')[None].T
    pear = pdist(data, 'correlation')[None].T
    city = pdist(data, 'cityblock')[None].T
    center_sq = squareform(pdist(data_c,'correlation'))

    # Now center each chunk separately
    dsm1 = PDist()
    dsm2 = PDist(pairwise_metric='euclidean')
    dsm3 = PDist(pairwise_metric='cityblock')
    dsm4 = PDist(center_data=True,square=True)
    assert_array_almost_equal(dsm1(ds).samples,pear)
    assert_array_almost_equal(dsm2(ds).samples,euc)
    dsm_res = dsm3(ds)
    assert_array_almost_equal(dsm_res.samples,city)
    # length correspondings to a single triangular matrix
    assert_equal(len(dsm_res.sa.pairs), len(ds) * (len(ds) - 1) / 2)
    # generate label pairs actually reflect the vectorform generated by
    # squareform()
    dsm_res_square = squareform(dsm_res.samples.T[0])
    for i, p in enumerate(dsm_res.sa.pairs):
        assert_equal(dsm_res_square[p[0], p[1]], dsm_res.samples[i, 0])
    dsm_res = dsm4(ds)
    assert_array_almost_equal(dsm_res.samples,center_sq)
    # sample attributes are carried over
    assert_almost_equal(ds.sa.targets, dsm_res.sa.targets)
Example No. 17
def scene_based_double_corr(ds):
    
    num_subj = ds.shape[0]
    num_voxels = ds.shape[1]
    num_scenes = len(ds.a.event_bounds)
    ds_list = np.zeros((num_subj, num_voxels, num_scenes-1))
    prev_cutoff = 0

    # average correlations for each scene
    for i, scene_cutoff in enumerate(ds.a.event_bounds):
        ds_list[:,:,i] = np.mean(ds.samples[:,:,prev_cutoff:scene_cutoff], axis=2)
        prev_cutoff = scene_cutoff

    self_correlations = []

    # convert each subject to a vector of its pairwise correlations between scenes
    for subj in ds_list:
        corrs = 1 - pdist(subj.T, metric='correlation')
        self_correlations.append(corrs)
    
    # get all pairwise correlations between subjects    
    correlation = 1 - pdist(self_correlations, metric="correlation")

    # return the average isc scene based correlation
    return np.mean(correlation)
Example No. 18
    def _call(self,dataset):

        # Get neural sim b/w pairs of targets
        if self.pairwise_metric == 'correlation':
            pairsim = dict((pair[0]+'-'+pair[1],pdist([dataset[dataset.sa.targets == pair[0]].samples[0], dataset[dataset.sa.targets == pair[1]].samples[0]],metric=self.pairwise_metric)) for pair in self.pairs)
        else: pairsim = dict((pair[0]+'-'+pair[1],pdist([dataset[dataset.sa.targets == pair[0]].samples[0], dataset[dataset.sa.targets == pair[1]].samples[0]],metric=self.pairwise_metric)) for pair in self.pairs)
        return Dataset(np.array([pairsim,]))
Example No. 19
def expand_triangular_mesh(c, offset=2, com_bias=(0,0,0)):
    #find center of mass of current points

    #adding the bias doesn't really make a big effect unless the bias is very
    #large which is not what we want
    u, v, w = np.mean(c, axis=0) + com_bias

    new_c = []

    for pt in c:
        #coordinates of point
        a, b, c = pt
        #distance from point to center, effective coordinates
        x, y, z = a-u, b-v, c-w
        #find (rho, theta, phi)
        theta = np.arctan2(y, x)
        h = pdist( ((a,b), (u,v)) )
        phi = np.arctan2(z, h)
        rho = pdist( ((a,b,c), (u,v,w)) )

        # change rho and call it nu
        nu = rho + offset
        # find new effective coordinates of (nu, theta, phi)
        f = nu*np.sin(phi)
        g = nu*np.cos(phi)
        e = g*np.sin(theta)
        d = g*np.cos(theta)

        #align new effective coordinates to center of mass
        new_c.append( (u+d, v+e, w+f) )

    return np.squeeze(new_c) 
Example No. 20
def covMatrix(X, Y, theta, symmetric = True, kernel = lambda u, theta: theta[0]*theta[0]*np.exp(-0.5*u*u/(theta[1]*theta[1])), \
        dist_f=None):
    if len(np.array(X).shape) == 1:
        _X = np.array([X]).T
    else:
        _X = np.array(X)
        
    if len(np.array(Y).shape) == 1:
        _Y = np.array([Y]).T
    else:
        _Y = np.array(Y)
        
    if dist_f is None:
        if symmetric:
            cM = pdist(_X)
            M = squareform(cM)
            M = kernel(M, theta)
            return M
        else:
            cM = cdist(_X, _Y)
            M = kernel(cM, theta)
            return M
    else:
        if symmetric:
            cM = pdist(_X, dist_f)
            M = squareform(cM)
            M = kernel(M, theta)
            return M
        else:
            cM = cdist(_X, _Y, dist_f)
            M = kernel(cM, theta)
            return M
    return
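A short usage sketch for covMatrix above, with hypothetical 1-D inputs and the default squared-exponential kernel (theta[0] is the amplitude, theta[1] the length scale); pdist, cdist and squareform are assumed imported at module level as in the snippet:

import numpy as np

X = np.linspace(0.0, 1.0, 5)    # hypothetical training inputs
Y = np.linspace(0.0, 1.0, 3)    # hypothetical test inputs
theta = [1.0, 0.2]              # [amplitude, length scale] for the default kernel

K_xx = covMatrix(X, X, theta)                    # symmetric case: pdist + squareform
K_xy = covMatrix(X, Y, theta, symmetric=False)   # rectangular case: cdist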
Example No. 21
def compare_clusters(args):

    ref_df = pd.read_table(args['ref'], sep='\t', skipinitialspace=True, index_col=0).to_numpy()
    check_symmetry(ref_df)
    linkage_ref = linkage(ref_df, 'average')
    c_ref, coph_dists_ref = cophenet(linkage_ref, pdist(ref_df))

    outfile = open(args['output'],"w")
    outfile.write("Tree_cluster\tMantel_Correlation_Coefficient\tMantel_P-value\tCophenetic_Pearson\tCophenetic_P-value\n")

    for i in args['all']:
        fst_df = pd.read_table(i, sep='\t', skipinitialspace=True, index_col=0).to_numpy()
        check_symmetry(fst_df)
        mantel_coeff = 0.0
        p_value_mantel = 0.0
        cophenetic_pearson = 0.0
        p_value_cophenetic = 0.0
        n = 0
        try:
            mantel_coeff, p_value_mantel, n = mantel(ref_df, fst_df)
            linkage_fst = linkage(fst_df, 'average')
            c_fst, coph_dists_fst = cophenet(linkage_fst, pdist(fst_df))
            cophenetic_pearson, p_value_cophenetic = pearsonr(coph_dists_ref, coph_dists_fst)
        except Exception as e:
            print("Error : %s" % str(e))
            mantel_coeff = "Failed"
            p_value_mantel = "Failed"
            cophenetic_pearson = "Failed"
            p_value_cophenetic = "Failed"

        outfile.write(i+"\t"+str(mantel_coeff)+"\t"+str(p_value_mantel)+"\t"+str(cophenetic_pearson)+"\t"+str(p_value_cophenetic)+"\n")

    outfile.close()
Example No. 22
def mds_author_term(fname1='corr_2d_mds_authors_by_terms.png', fname2='corr_2d_mds_terms_by_authors.png'):
    bib_data = get_bib_data()
    mat, authors, term_list, authors_cnt = get_author_by_term_mat(bib_data, tfreq=5, afreq=10)
    adist = dist.squareform(dist.pdist(mat, 'correlation'))
    coords,_ = mds(adist, dim=2)
    
    fig = plt.figure()
    fig.clf()
    plt.xlim(-15, 20)
    plt.ylim(-15, 20)
    for label, x, y in zip(authors, coords[:,0], coords[:,1]):
        plt.annotate(label, xy=(x*20,y*20))
    plt.axis('off')
    plt.savefig(fname1)
    
    
    mat = mat.T
    tdist = dist.squareform(dist.pdist(mat, 'correlation'))
    coords, _ = mds(tdist, dim=2)
    #fig = plt.figure()
    fig.clf();
    plt.xlim(-80,100)
    plt.ylim(-100,100)
    for label, x, y in zip(term_list, coords[:,0], coords[:,1]):
        plt.annotate(label, xy=(x*500,y*500))
    plt.axis('off')
    plt.savefig(fname2)
Example No. 23
def collaspe_fclusters(data=None, t=None, row_labels=None, col_labels=None,
			linkage='average', pdist='euclidean', standardize=3, log=False):
	"""a function to collapse flat clusters by averaging the vectors within
	each flat cluster obtained from hierarchical clustering"""
	## preprocess data
	if log:
		data = np.log2(data + 1.0)
	if standardize == 1: # Standardize along the columns of data
		data = zscore(data, axis=0)
	elif standardize == 2: # Standardize along the rows of data
		data = zscore(data, axis=1)
	
	if row_labels is not None and col_labels is None: ## only get fclusters for rows
		d = dist.pdist(data, metric=pdist)
		axis = 1 ##!!! haven't checked whether this is correct yet
	elif row_labels is None and col_labels is not None: ## only get fclusters for cols
		d = dist.pdist(data.T, metric=pdist)
		axis = 0
	D = dist.squareform(d)
	Y = sch.linkage(d, method=linkage)
	fclusters = sch.fcluster(Y, t, 'distance')
	fcluster_set = set(fclusters)
	data_cf = []
	for fc in fcluster_set:
		mask = np.where(fclusters==fc)
		data_t = data.T
		vector_avg = np.average(data_t[mask],axis=axis)
		data_cf.append(vector_avg)
	data_cf = np.array(data_cf).T
	return data_cf
Example No. 24
def similarities(obj):
    """
    Optional: similarities of entities.
    """
    phi = coo_matrix(np.load(str(obj.directory / 'phi.npy')))
    theta = coo_matrix(np.load(str(obj.directory / 'theta.npy')))

    with CsvWriter(obj.directory, DocumentSimilarity) as out:
        distances = squareform(pdist(theta.T, 'cosine'))
        out << (dict(a_id=i,
                     b_id=sim_i,
                     similarity=1 - row[sim_i])
                for i, row in enumerate(distances)
                for sim_i in row.argsort()[:31]  # first 30 similar docs
                if sim_i != i)

    with CsvWriter(obj.directory, TopicSimilarity) as out:
        distances = squareform(pdist(phi.T, 'cosine'))
        out << (dict(a_id=topic_id(1, i),
                     b_id=topic_id(1, sim_i),
                     similarity=1 - row[sim_i])
                for i, row in enumerate(distances)
                for sim_i in row.argsort()[:]
                if sim_i != i)

    with CsvWriter(obj.directory, TermSimilarity) as out:
        distances = squareform(pdist(phi, 'cosine'))
        out << (dict(a_modality_id=1, a_id=i,
                     b_modality_id=1, b_id=sim_i,
                     similarity=1 - row[sim_i])
                for i, row in enumerate(distances)
                for sim_i in row.argsort()[:21]  # first 20 similar terms
                if sim_i != i)
Example No. 25
def kcca(X, Y, kernel_x=gaussian_kernel, kernel_y=gaussian_kernel, eta=1.0):
    '''
    Kernel canonical correlation analysis
    http://staff.aist.go.jp/s.akaho/papers/ibis00.pdf
    '''
    n, p = X.shape
    n, q = Y.shape

    Kx = DIST.squareform(DIST.pdist(X, kernel_x))
    Ky = DIST.squareform(DIST.pdist(Y, kernel_y))
    J = np.eye(n) - np.ones((n, n)) / n
    M = np.dot(np.dot(Kx.T, J), Ky) / n
    L = np.dot(np.dot(Kx.T, J), Kx) / n + eta * Kx
    N = np.dot(np.dot(Ky.T, J), Ky) / n + eta * Ky

    sqx = LA.sqrtm(LA.inv(L))
    sqy = LA.sqrtm(LA.inv(N))

    a = np.dot(np.dot(sqx, M), sqy.T)
    A, s, Bh = LA.svd(a, full_matrices=False)
    B = Bh.T

    # U = np.dot(np.dot(A.T, sqx), X).T
    # V = np.dot(np.dot(B.T, sqy), Y).T

    return s, A, B
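Both kcca variants pass the kernel straight to pdist as a custom metric, so gaussian_kernel must accept two 1-D sample vectors and return a scalar (squareform then zeroes the diagonal of the resulting Gram-like matrix). A hedged sketch of such a kernel and a call, with an arbitrary bandwidth and the snippet's own imports (np, the DIST/LA aliases) assumed in scope:

import numpy as np

def gaussian_kernel(u, v, sigma=1.0):
    # RBF kernel value for one pair of samples, usable as a pdist "metric"
    diff = u - v
    return np.exp(-np.dot(diff, diff) / (2.0 * sigma ** 2))

X = np.random.rand(30, 4)
Y = np.random.rand(30, 5)
s, A, B = kcca(X, Y, kernel_x=gaussian_kernel, kernel_y=gaussian_kernel, eta=1.0)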
Example No. 26
def edge_matrix(abs_times, camera_types):
    """Returns the edge matrix E in non-squareform, calculated using pdist.

    We consider an edge between two metadata entries to exist if:
        a) those entries were taken within 120 seconds of each other, AND
        b) those entries came from different cameras.
    Note: it is recommended that camera_types contains an index for metadata entries
        that did not have a camera type listed.

    Args:
        abs_times (numpy.array): N-dimensional array of floats of absolute times,
            in seconds, that images were taken.
        camera_types (numpy.array): N-dimensional array of ints corresponding to the
            camera types that were used.

    Returns:
        The edge matrix, E.
    """
    assert len(abs_times) == len(camera_types)
    T = pdist(abs_times)
    T = np.asarray(T < 120, dtype=bool)
    C = pdist(camera_types)
    C = np.asarray(C, dtype=bool)
    E = T & C
    return E, T, C
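pdist expects a 2-D (n_samples, n_features) array, so if the times and camera types are kept as plain 1-D vectors they need a column reshape before the call; a usage sketch under that assumption, with made-up values:

import numpy as np

abs_times = np.array([0.0, 30.0, 200.0, 210.0]).reshape(-1, 1)   # seconds, one column
camera_types = np.array([0, 1, 1, 0]).reshape(-1, 1)              # camera index, one column

E, T, C = edge_matrix(abs_times, camera_types)
# E[k] is True when pair k was taken within 120 s AND by different cameras.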
def get_monk_human_pspace():
	DATPATH = '/mindhive/dicarlolab/u/rishir/monkey_objectome/monkey_behaviour/tmpmonk.mat'
	dat = scipy.io.loadmat(DATPATH)
	monkdata = dat['monkdata']
	humdata = dat['humdata']
	models_oi = dat['models_oi'][0]

	mcent_symm = get_pspace_centers(monkdata, dim=20, symmetrize=True)
	hcent_asymm, obj_ind = get_precomputed_pspace_centers(models_oi)
	hcent_asymm_all, tmp = get_precomputed_pspace_centers()

	mcent_symm_d = d.pdist(mcent_symm)
	hcent_asymm_d = d.pdist(hcent_asymm)

	rho = utils.nnan_consistency(mcent_symm_d, hcent_asymm_d)
	print rho

	mat_data = {}
	mat_data['mcent_symm'] = mcent_symm
	mat_data['hcent_asymm'] = hcent_asymm
	mat_data['mcent_symm_d'] = mcent_symm_d
	mat_data['hcent_asymm_d'] = hcent_asymm_d
	mat_data['hcent_asymm_all'] = hcent_asymm_all
	mat_data['models_oi'] = models_oi
	mat_data['obj_ind'] = obj_ind
	scipy.io.savemat('pspace_res.mat', mat_data)
Example No. 28
File: yo.py  Project: cyanut/ca1-3d
def get_distance_distro(tracked_objects, sample_size=None, repeat=1, neighbours=0):
    '''
    Given a 2d array of coordinates, draw random samples from it and calculate pair-wise distances.
    tracked_objects: input 2d array. Each row is a coordinate.
    sample_size: the size of the random sample to be drawn; if None,
                 calculate pair-wise distances over the whole input.
    repeat: number of random samples to be drawn.
    neighbours: number of nearest neighbours to include in the analysis
    return: a 1d array of distances, pooled from all samples.
    '''

    if sample_size is None:
        sample_size = tracked_objects.shape[0]
    dist = []
    ind_array = np.arange(tracked_objects.shape[0])
    for i in range(repeat):
        np.random.shuffle(ind_array)
        selected_objects = tracked_objects[ind_array[:sample_size],:]
        if neighbours <= 0:
            dist.append(pdist(selected_objects))
        else:
            dist_all = squareform(pdist(selected_objects))
            dist_all.partition(neighbours)
            dist_all = dist_all[:,:neighbours+1]
            dist.append(dist_all[dist_all > 0])

    dist = np.hstack(dist)

    return dist
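A quick usage sketch for get_distance_distro with hypothetical 3-D centroids, once over all pairs and once restricted to the 5 nearest neighbours (numpy, pdist and squareform are assumed imported as in the snippet's module):

import numpy as np

coords = np.random.rand(200, 3)                     # hypothetical object centroids
all_pairs = get_distance_distro(coords)             # one sample, every pairwise distance
nearest = get_distance_distro(coords, sample_size=100, repeat=10, neighbours=5)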
Example No. 29
def getDistances(x, attr, var, cidx, didx, cheader):
    """ This creates the distance array for only discrete or continuous data 
        with no missing data """
    from scipy.spatial.distance import pdist, squareform
    #--------------------------------------------------------------------------
    def pre_normalize(x):
        idx = 0
        for i in cheader:
            cmin = attr[i][2]
            diff = attr[i][3]
            x[:,idx] -= cmin
            x[:,idx] /= diff
            idx += 1

        return x
    #--------------------------------------------------------------------------
    dtype = var['dataType']
    numattr = var['NumAttributes']

    if(dtype == 'discrete'):
        return squareform(pdist(x,metric='hamming'))

    if(dtype == 'mixed'):
        d_dist = squareform(pdist(x[:,didx],metric='hamming'))
        xc = pre_normalize(x[:,cidx])
        c_dist = squareform(pdist(xc,metric='cityblock'))
        return np.add(d_dist, c_dist) / numattr

    else: #(dtype == 'continuous'):
        return squareform(pdist(pre_normalize(x),metric='cityblock'))
Example No. 30
def _compute_AB(x, y, index):
    xa = np.atleast_2d(x)
    ya = np.atleast_2d(y)       

    if xa.ndim > 2 or ya.ndim > 2:
        raise ValueError("x and y must be 1d or 2d array_like objects")

    if xa.shape[0] == 1:
        xa = xa.T

    if ya.shape[0] == 1: 
        ya = ya.T

    if xa.shape[0] != ya.shape[0]:
        raise ValueError("x and y must have the same sample sizes")
       
    if index <= 0 or index > 2:
        raise ValueError("index must be in (0, 2]")

    # compute A
    a_kl = squareform(pdist(xa, 'euclidean')**index)
    a_k = np.mean(a_kl, axis=1).reshape(-1, 1)
    a_l = a_k.T
    a = np.mean(a_kl)
    A = a_kl - a_k - a_l + a

    # compute B
    b_kl = squareform(pdist(ya, 'euclidean')**index)
    b_k = np.mean(b_kl, axis=1).reshape(-1, 1)
    b_l = b_k.T
    b = np.mean(b_kl)
    B = b_kl - b_k - b_l + b

    return A, B
Example No. 31
def kernel_ndvi_outlier_search(band_subset_outlier, sample_k_vol, sample_k_geom, sample_c1, sample_cos_i, sample_slope, sample_ndvi, sample_topo_msk, sample_img_tag, idxRand_dict, hyObj_pointer_dict_list, image_smooth):

    wave_all_samples = np.empty((len(band_subset_outlier), 0), float)
    img_name_list = [x.file_name for x in hyObj_pointer_dict_list]
    group_dict = {}
    group_dict["img_name_list"] = img_name_list
    # print(band_subset_outlier)
    group_dict["band_subset"] = band_subset_outlier

    for i in range(len(hyObj_pointer_dict_list)):
        print(hyObj_pointer_dict_list[i].file_name)
        if hyObj_pointer_dict_list[i].file_type == "ENVI":

            if hyObj_pointer_dict_list[i].interleave == 'bsq':
                spec_data = hyObj_pointer_dict_list[i].data[:, idxRand_dict[i][0], idxRand_dict[i][1]].transpose()
            elif hyObj_pointer_dict_list[i].interleave == 'bil':
                spec_data = hyObj_pointer_dict_list[i].data[idxRand_dict[i][0], :, idxRand_dict[i][1]]
            # hyObj.interleave=='bip':
            else:
                spec_data = hyObj_pointer_dict_list[i].data[idxRand_dict[i][0], idxRand_dict[i][1], :]
        elif hyObj_pointer_dict_list[i].file_type == "HDF":
            spec_data = hyObj_pointer_dict_list[i].data[idxRand_dict[i][0], idxRand_dict[i][1], :]
        else:
            return None

        wave_samples = spec_data[:, band_subset_outlier]
        wave_samples = wave_samples / image_smooth[i][band_subset_outlier]

        sub_index_img_tag = (sample_img_tag == i + 1)
        sample_cos_i_sub = sample_cos_i[sub_index_img_tag]
        sample_slope_sub = sample_slope[sub_index_img_tag]
        sample_c1_sub = sample_c1[sub_index_img_tag]

        topo_mask_sub = (sample_cos_i_sub > COSINE_I_MIN_THRESHOLD) & (sample_slope_sub > SLOPE_MIN_THRESHOLD)

        for iband in range(len(band_subset_outlier)):
            wave_samples_band = wave_samples[:, iband]

            topo_coeff, _, _ = generate_topo_coeff_band(wave_samples_band, (wave_samples_band > REFL_MIN_THRESHOLD) & (wave_samples_band < REFL_MAX_THRESHOLD) & topo_mask_sub, sample_cos_i_sub, non_negative=True)
            correctionFactor = (sample_c1_sub + topo_coeff) / (sample_cos_i_sub + topo_coeff)
            correctionFactor = correctionFactor * topo_mask_sub + 1.0 * (1 - topo_mask_sub)
            wave_samples[:, iband] = wave_samples_band * correctionFactor

        wave_all_samples = np.hstack((wave_all_samples, wave_samples.T))

    ndvi_mask = (sample_ndvi > 0.15) & (sample_ndvi <= 0.95)
    obs_mask = np.isfinite(sample_k_vol) & np.isfinite(sample_k_geom)
    temp_mask = (wave_all_samples[0] > REFL_MIN_THRESHOLD) & (wave_all_samples[0] < REFL_MAX_THRESHOLD) & (obs_mask) & (ndvi_mask)

    for iband in range(len(band_subset_outlier)):
        new_df = pd.DataFrame({'k_geom': sample_k_geom[temp_mask], 'k_vol': sample_k_vol[temp_mask],
                              'reflectance': wave_all_samples[iband, temp_mask], 'line_id': sample_img_tag[temp_mask],
                              "NDVI": sample_ndvi[temp_mask]})

        new_df['ndvi_cut_bins'] = pd.cut(new_df['NDVI'],
                                        bins=[0.15, 0.4, 0.7, 0.95],
                                        labels=['ndvi_1', 'ndvi_2', 'ndvi_3'])

        new_df['geom_cut_bins'] = pd.cut(new_df['k_geom'],
                                bins=np.percentile(sample_k_geom[temp_mask], [5, 33, 67, 95]),  # [5,33,67,95] #[5,25,50,75,95]
                                labels=['k_geom_1', 'k_geom_2', 'k_geom_3'])  # ,'k_geom_4'

        new_df['vol_cut_bins'] = pd.cut(new_df['k_vol'],
                                bins=np.percentile(sample_k_vol[temp_mask], [5, 33, 67, 95]),   # [5,25,50,75,95] # [5,33,67,95]
                                labels=['k_vol_1', 'k_vol_2', 'k_vol_3'])  # 'k_vol_4'

        new_df_bin_group_mean = new_df.groupby(['vol_cut_bins', 'geom_cut_bins', 'ndvi_cut_bins', 'line_id']).median()  # mean()

        new_df_bin_group_mean.reset_index(inplace=True)

        n_bin = new_df_bin_group_mean.shape[0] // len(hyObj_pointer_dict_list)

        ss = new_df_bin_group_mean["reflectance"].values

        bin_avg_array = np.reshape(ss, (n_bin, len(hyObj_pointer_dict_list)))

        bin_mean = np.nanmedian(bin_avg_array, axis=1)
        inds = np.where(np.isnan(bin_avg_array))

        # Place column means in the indices. Align the arrays using take
        bin_avg_array[inds] = np.take(bin_mean, inds[0])

        bin_avg_array = bin_avg_array / bin_mean[:, np.newaxis]

        bin_avg_array = bin_avg_array[~np.isnan(bin_avg_array[:, 0])]

        # Y = pdist(bin_avg_array.T, 'seuclidean', V=None)
        Y = pdist(bin_avg_array.T, 'euclidean', V=None)
        # Y = pdist(bin_avg_array.T, 'canberra')

        print(Y)

        return_dict = {}

        # H_s = hierarchy.single(Y)
        H_s = hierarchy.complete(Y)
        T_ = hierarchy.fcluster(H_s, 1.2, criterion='distance')
        print("Cluster thres 1.2", T_)

        return_dict["Cluster thres 1.2"] = T_.tolist()

        T_ = hierarchy.fcluster(H_s, 1.0, criterion='distance')
        print("Cluster thres 1.0", T_)

        return_dict["Cluster thres 1.0"] = T_.tolist()

        T_ = hierarchy.fcluster(H_s, 0.85, criterion='distance')
        print("Cluster thres 0.85", T_)

        return_dict["Cluster thres 0.85"] = T_.tolist()

        return_dict["distance of metrics"] = Y.tolist()

        major_label_id = np.bincount(np.array(T_)).argmax()

        outlier_img_tag = (np.array(T_) != major_label_id)

        return_dict["outlier_image_bool"] = outlier_img_tag.astype(int).tolist()
        return_dict["outlier_count"] = int(np.count_nonzero(outlier_img_tag))
        group_dict['b' + str(iband + 1)] = return_dict

    return group_dict
Example No. 32
def _initialize_variogram_model(
    X,
    y,
    variogram_model,
    variogram_model_parameters,
    variogram_function,
    nlags,
    weight,
    coordinates_type,
):
    """Initializes the variogram model for kriging. If user does not specify
    parameters, calls automatic variogram estimation routine.
    Returns lags, semivariance, and variogram model parameters.

    Parameters
    ----------
    X: ndarray
        float array [n_samples, n_dim], the input array of coordinates
    y: ndarray
        float array [n_samples], the input array of values to be kriged
    variogram_model: str
        user-specified variogram model to use
    variogram_model_parameters: list
        user-specified parameters for variogram model
    variogram_function: callable
        function that will be called to evaluate variogram model
        (only used if user does not specify variogram model parameters)
    nlags: int
        integer scalar, number of bins into which to group inter-point distances
    weight: bool
        boolean flag that indicates whether the semivariances at smaller lags
        should be weighted more heavily in the automatic variogram estimation
    coordinates_type: str
        type of coordinates in X array, can be 'euclidean' for standard
        rectangular coordinates or 'geographic' if the coordinates are lat/lon

    Returns
    -------
    lags: ndarray
        float array [nlags], distance values for bins into which the
        semivariances were grouped
    semivariance: ndarray
        float array [nlags], averaged semivariance for each bin
    variogram_model_parameters: list
        parameters for the variogram model, either returned unaffected if the
        user specified them or returned from the automatic variogram
        estimation routine
    """

    # distance calculation for rectangular coords now leverages
    # scipy.spatial.distance's pdist function, which gives pairwise distances
    # in a condensed distance vector (distance matrix flattened to a vector)
    # to calculate semivariances...
    if coordinates_type == "euclidean":
        d = pdist(X, metric="euclidean")
        g = 0.5 * pdist(y[:, None], metric="sqeuclidean")

    # geographic coordinates only accepted if the problem is 2D
    # assume X[:, 0] ('x') => lon, X[:, 1] ('y') => lat
    # old method of distance calculation is retained here...
    # could be improved in the future
    elif coordinates_type == "geographic":
        if X.shape[1] != 2:
            raise ValueError(
                "Geographic coordinate type only supported for 2D datasets.")
        x1, x2 = np.meshgrid(X[:, 0], X[:, 0], sparse=True)
        y1, y2 = np.meshgrid(X[:, 1], X[:, 1], sparse=True)
        z1, z2 = np.meshgrid(y, y, sparse=True)
        d = great_circle_distance(x1, y1, x2, y2)
        g = 0.5 * (z1 - z2)**2.0
        indices = np.indices(d.shape)
        d = d[(indices[0, :, :] > indices[1, :, :])]
        g = g[(indices[0, :, :] > indices[1, :, :])]

    else:
        raise ValueError("Specified coordinate type '%s' is not supported." %
                         coordinates_type)

    # Equal-sized bins are now implemented. The upper limit on the bins
    # is appended to the list (instead of calculated as part of the
    # list comprehension) to avoid any numerical oddities
    # (specifically, say, ending up as 0.99999999999999 instead of 1.0).
    # Appending dmax + 0.001 ensures that the largest distance value
    # is included in the semivariogram calculation.
    dmax = np.amax(d)
    dmin = np.amin(d)
    dd = (dmax - dmin) / nlags
    bins = [dmin + n * dd for n in range(nlags)]
    dmax += 0.001
    bins.append(dmax)

    # This old binning method was experimental and doesn't seem
    # to work too well. Bins were computed such that there are more
    # at shorter lags. This effectively weights smaller distances more
    # highly in determining the variogram. As Kitanidis points out,
    # the variogram fit to the data at smaller lag distances is more
    # important. However, the value at the largest lag probably ends up
    # being biased too high for the larger values and thereby throws off
    # automatic variogram calculation and confuses comparison of the
    # semivariogram with the variogram model.
    #
    # dmax = np.amax(d)
    # dmin = np.amin(d)
    # dd = dmax - dmin
    # bins = [dd*(0.5**n) + dmin for n in range(nlags, 1, -1)]
    # bins.insert(0, dmin)
    # bins.append(dmax)

    lags = np.zeros(nlags)
    semivariance = np.zeros(nlags)

    for n in range(nlags):
        # This 'if... else...' statement ensures that there are data
        # in the bin so that numpy can actually find the mean. If we
        # don't test this first, then Python kicks out an annoying warning
        # message when there is an empty bin and we try to calculate the mean.
        if d[(d >= bins[n]) & (d < bins[n + 1])].size > 0:
            lags[n] = np.mean(d[(d >= bins[n]) & (d < bins[n + 1])])
            semivariance[n] = np.mean(g[(d >= bins[n]) & (d < bins[n + 1])])
        else:
            lags[n] = np.nan
            semivariance[n] = np.nan

    lags = lags[~np.isnan(semivariance)]
    semivariance = semivariance[~np.isnan(semivariance)]

    # a few tests to make sure that, if the variogram_model_parameters
    # are supplied, they have been supplied as expected...
    # if variogram_model_parameters was not defined, then estimate the variogram
    if variogram_model_parameters is not None:
        if variogram_model == "linear" and len(
                variogram_model_parameters) != 2:
            raise ValueError(
                "Exactly two parameters required for linear variogram model.")
        elif (variogram_model in [
                "power", "spherical", "exponential", "gaussian", "hole-effect"
        ] and len(variogram_model_parameters) != 3):
            raise ValueError("Exactly three parameters required for "
                             "%s variogram model" % variogram_model)
    else:
        if variogram_model == "custom":
            raise ValueError("Variogram parameters must be specified when "
                             "implementing custom variogram model.")
        else:
            variogram_model_parameters = _calculate_variogram_model(
                lags, semivariance, variogram_model, variogram_function,
                weight)

    return lags, semivariance, variogram_model_parameters
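The euclidean branch above relies on pdist enumerating pairs in the same condensed order for both the coordinate distances and the value semivariances; a small sketch of that pairing and of the crude binning idea, on made-up points and values:

import numpy as np
from scipy.spatial.distance import pdist

X = np.random.rand(50, 2)   # hypothetical sample coordinates
y = np.random.rand(50)      # hypothetical sampled values

d = pdist(X, metric="euclidean")                      # pair distances, condensed order
g = 0.5 * pdist(y[:, None], metric="sqeuclidean")     # matching semivariances, same order

# crude empirical semivariogram: mean semivariance within equal-width distance bins
bins = np.linspace(d.min(), d.max() + 0.001, 11)
semivariance = [g[(d >= lo) & (d < hi)].mean() for lo, hi in zip(bins[:-1], bins[1:])]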
Example No. 33
 def dist(self, X, Y=None):
     if Y is X or Y is None:
         d = scidist.pdist(X, self.metric)
         return scidist.squareform(d)
     else:
         return scidist.cdist(X, Y, self.metric)
Example No. 34
labels_path = 'labels_comma.csv'
#load the .csv files
M_str, nrRows, nrCols = read_tadpole.load_csv_no_header(path_dataset_matrix)
Wrow, _, _ = read_tadpole.load_csv_no_header(path_dataset_affinity_matrix)
labels, _, _ = read_tadpole.load_csv_no_header(labels_path)

#parameters/preprocessing step that do not change during the running
Wrow = preprocessing_dataset.str_to_float(Wrow)
M_init = preprocessing_dataset.normalization(M_str)
labels = preprocessing_dataset.str_to_float(labels)

M = np.concatenate((M_init, labels), axis=1)

#ADD A SIMILARITY MEASURE TO THE GRAPH
# Calculate all pairwise distances
distv = distance.pdist(M, metric='correlation')
# Convert to a square symmetric distance matrix
dist = distance.squareform(distv)
sigma = np.mean(dist)
# Get affinity from similarity matrix
sparse_graph = np.exp(-dist**2 / (2 * sigma**2))
#Wrow=Wrow*sparse_graph
Wrow = preprocessing_dataset.normalize_adj(Wrow)

#creation of a mask for the features: 1 for features and 0 for labels
M_features_ones = np.ones(M_init.shape)
M_labels_zeros = np.zeros(labels.shape)
mask_features = np.concatenate((M_features_ones, M_labels_zeros), axis=1)

#computation of the normalized laplacians
Lrow = csgraph.laplacian(Wrow, normed=True)
Example No. 35
 def __init__(self, points):
     self.points = points
     self.dm = squareform(pdist(points))
Example No. 36
def CosineScore(M):

    cos_M = squareform(pdist(M, 'cosine'))
    alpha_cos = softmax(cos_M, axis=0)

    return np.sum(alpha_cos, axis=1)
Example No. 37
def manhattenScore(M):
    man_M = squareform(pdist(M, 'cityblock'))
    alpha_man = softmax(man_M, axis=0)

    return np.sum(alpha_man, axis=1)
Example No. 38
def euclideanScore(M):
    #Euclidean distance
    euc_M = squareform(pdist(M, 'euclidean'))
    alpha_euc = softmax(euc_M, axis=0)
    return np.sum(alpha_euc, axis=1)
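The three scoring helpers above differ only in the pdist metric and all assume some softmax is in scope; a brief usage sketch on a hypothetical embedding matrix, using scipy.special.softmax as one possible stand-in:

import numpy as np
from scipy.special import softmax   # one possible softmax for the snippets above

M = np.random.rand(12, 64)          # hypothetical embedding matrix, one row per item

cos_scores = CosineScore(M)         # one aggregate score per row of M
man_scores = manhattenScore(M)
euc_scores = euclideanScore(M)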
Example No. 39
    def process_batch(self, lines):
        """Helper function to convert raw lines into a mini-batch as a DotDict.
        """
        batch_edges = []
        batch_edges_values = []
        batch_edges_target = []  # Binary classification targets (0/1)
        batch_nodes = []
        batch_nodes_target = [
        ]  # Multi-class classification targets (`num_nodes` classes)
        batch_nodes_coord = []
        batch_tour_nodes = []
        batch_tour_len = []

        for line_num, line in enumerate(lines):
            line = line.split(" ")  # Split into list

            # Compute signal on nodes
            nodes = np.ones(self.num_nodes)  # All 1s for TSP...

            # Convert node coordinates to required format
            nodes_coord = []
            for idx in range(0, 2 * self.num_nodes, 2):
                nodes_coord.append([float(line[idx]), float(line[idx + 1])])

            # Compute distance matrix
            W_val = squareform(pdist(nodes_coord, metric=self.metric))

            # Compute adjacency matrix
            if self.num_neighbors == -1:
                W = np.ones((self.num_nodes,
                             self.num_nodes))  # Graph is fully connected
            else:
                W = np.zeros((self.num_nodes, self.num_nodes))
                # Determine k-nearest neighbors for each node
                knns = np.argpartition(W_val, kth=self.num_neighbors,
                                       axis=-1)[:, self.num_neighbors::-1]
                # Make connections
                for idx in range(self.num_nodes):
                    W[idx][knns[idx]] = 1
            np.fill_diagonal(W, 2)  # Special token for self-connections

            # Convert tour nodes to required format
            # Don't add final connection for tour/cycle
            tour_nodes = [
                int(node) - 1 for node in line[line.index('output') + 1:-1]
            ][:-1]

            # Compute node and edge representation of tour + tour_len
            tour_len = 0
            nodes_target = np.zeros(self.num_nodes)
            edges_target = np.zeros((self.num_nodes, self.num_nodes))
            for idx in range(len(tour_nodes) - 1):
                i = tour_nodes[idx]
                j = tour_nodes[idx + 1]
                nodes_target[
                    i] = idx  # node targets: ordering of nodes in tour
                edges_target[i][j] = 1
                edges_target[j][i] = 1
                tour_len += W_val[i][j]

            # Add final connection of tour in edge target
            nodes_target[j] = len(tour_nodes) - 1
            edges_target[j][tour_nodes[0]] = 1
            edges_target[tour_nodes[0]][j] = 1
            tour_len += W_val[j][tour_nodes[0]]

            # Concatenate the data
            batch_edges.append(W)
            batch_edges_values.append(W_val)
            batch_edges_target.append(edges_target)
            batch_nodes.append(nodes)
            batch_nodes_target.append(nodes_target)
            batch_nodes_coord.append(nodes_coord)
            batch_tour_nodes.append(tour_nodes)
            batch_tour_len.append(tour_len)

        # From list to tensors as a DotDict
        batch = DotDict()
        batch.edges = np.stack(batch_edges, axis=0)
        batch.edges_values = np.stack(batch_edges_values, axis=0)
        batch.edges_target = np.stack(batch_edges_target, axis=0)
        batch.nodes = np.stack(batch_nodes, axis=0)
        batch.nodes_target = np.stack(batch_nodes_target, axis=0)
        batch.nodes_coord = np.stack(batch_nodes_coord, axis=0)
        batch.tour_nodes = np.stack(batch_tour_nodes, axis=0)
        batch.tour_len = np.stack(batch_tour_len, axis=0)
        return batch
Example No. 40
    newick = final_tree.to_newick()
    tree = Phylo.read(StringIO(newick), 'newick')

    Phylo.draw_graphviz(tree, prog='neato')
    plt.savefig("%s.png" % name, dpi=200, bbox_inches='tight')


X += np.random.normal(scale=0.01, size=X.shape)

pca = PCA(2)
pca.fit(X)

# X = pca.transform(X)
N, D = X.shape

C = pdist(X)
tree = to_tree(single(C))


def construct_node(snode):
    if snode.left is None and snode.right is None:
        return TreeLeaf(snode.get_id())
    node = TreeNode()
    node.add_child(construct_node(snode.left))
    node.add_child(construct_node(snode.right))
    return node


root = construct_node(tree)
linkage_tree = Tree(root=root)
plot_tree(linkage_tree, 'linkage_induced')
Example No. 41
    def add_point(self, newpt=[], newcontour=False):

        zpos = self.image.list_idx

        pts = np.array(self.points)
        if len(pts) >= 3:
            pts = pts[pts[:, 2] == zpos, :]

        if len(pts) < 3:
            if len(newpt) > 0:
                self.points.append([newpt[0], newpt[1], zpos, 0])

        else:

            thr = 1000.

            if newcontour == False:

                cid = np.unique(pts[:, 3])

                if len(newpt) > 0:
                    #### add a point
                    dst = np.array(scipydist.cdist([newpt], pts[:, :2])[0])
                    rg = np.arange(len(dst), dtype=int) + 1
                    for ci in cid:
                        idx = np.where(pts[:, 3] == ci)[0]
                        rg[idx[-1]] = idx[0]
                    #print rg
                    dst += dst[rg]

                    mi = np.argmin(dst)
                    ci = pts[mi, 3]
                    pts = np.insert(pts,
                                    mi + 1,
                                    np.append(newpt, [zpos, ci]),
                                    axis=0)

                allpts = []
                for ci in cid:
                    #### a simple tsp solver...
                    idx = np.where(pts[:, 3] == ci)[0]
                    pp = pts[idx].copy()
                    path = np.arange(len(pp), dtype=int)

                    dmat = scipydist.squareform(scipydist.pdist(pp[:, :2]))
                    dmat += np.eye(len(dmat)) * thr
                    if len(pp) > 3:
                        niter = 2000
                    else:
                        niter = 0

                    calc_dist = lambda path: np.sum([
                        dmat[path[i], path[(i + 1) % len(path)]]
                        for i in range(len(path))
                    ])
                    dst = calc_dist(path)
                    nochange = 0
                    for k in range(niter):
                        p0 = path.copy()
                        i0, i1 = np.sort(np.random.randint(0, len(pp), 2))
                        if abs(i0 - i1) % len(path) < 2: continue
                        path = np.hstack([
                            path[:i0 + 1], path[i0 + 1:i1 + 1][::-1],
                            path[i1 + 1:]
                        ])
                        d = calc_dist(path)
                        if d >= dst:
                            path = p0
                            nochange += 1
                            if nochange > 200: break

                        else:
                            dst = d
                            nochange = 0

                    allpts.extend([[pp[i][0], pp[i][1], zpos, ci]
                                   for i in path])

                self.points = [p for p in self.points if p[2] != zpos]
                self.points.extend(allpts)

            else:
                #### start a new contour
                ci = np.max(pts[:, 3]) + 1
                self.points.append([newpt[0], newpt[1], zpos, ci])

        np.savetxt("pts.save", self.points, fmt='%d')
Example No. 42
def dRMSD(x, y):
    return norm(pdist(x) - pdist(y))/((len(x)*(len(x)-1)/2)**(0.5))
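A short sketch exercising dRMSD on two hypothetical conformations (its module is assumed to import pdist and a vector norm); like rmsd and stress earlier, it compares only internal distances, so rigid motions of either structure leave it unchanged:

import numpy as np

x = np.random.rand(25, 3)                              # hypothetical coordinates
y = x + np.random.normal(scale=0.02, size=x.shape)     # slightly perturbed copy
print(dRMSD(x, y))   # small, since the two distance geometries nearly coincide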
Example No. 43
  for i, data in enumerate(dataloader, 0):
      #print('{}/{}'.format(i*bs, nsamples))
      if i*bs > nsamples:
          break
      else:            
          inputs, _ = data  
          out = newmodel.forward(inputs.to(device)) 
          if i == 0:
              Out = out.view(inputs.shape[0], -1).cpu().data    
          else :               
              Out = torch.cat((Out, out.view(inputs.shape[0], -1).cpu().data),0) 
              Out = Out.detach()     
          del out    
 
  # normal ID
  dist = squareform(pdist(Out,method))
  est = estimate(dist,verbose=verbose) 
  id_ori = est[2]
  ID_original.append(id_ori)
  
  # pca data
  pca = PCA()
  Out = StandardScaler().fit_transform(Out)
  pca.fit(Out)
  
  # the n.of eigenvalues should be the minimum between the n. of features
  # and the n. of data points
  neigs = len(pca.singular_values_)
  
  # id given by the pca : 90 % of variance
  id_pc = get_pca_dim(pca.explained_variance_ratio_,th)
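
# get_pca_dim is not defined in this snippet; a hypothetical helper consistent
# with how it is called above (number of components needed to reach the
# explained-variance threshold th, e.g. 0.9 for 90%) might look like this:
import numpy as np

def get_pca_dim(explained_variance_ratio, th=0.9):
    cumvar = np.cumsum(explained_variance_ratio)
    return int(np.searchsorted(cumvar, th) + 1)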
Exemplo n.º 44
0
    def metric(self, field):
        """
        Compute metric(s) for a single field

        Parameters
        ----------
        field : numpy array of shape (npx,npx) - npx is number of pixels
            Cloud mask field.

        Returns
        -------
        D0 : float
            Mean geometric nearest neighbour distance between objects.
        scai : float
            Simple Convective Aggregation Index.

        """
        cmlab, num = label(field, return_num=True, connectivity=self.con)
        regions = regionprops(cmlab)

        xC = []
        yC = []
        for i in range(num):
            props = regions[i]
            if props.area > self.areaMin:
                y0, x0 = props.centroid
                xC.append(x0)
                yC.append(y0)
        pos = np.vstack((np.asarray(xC), np.asarray(yC))).T
        nCl = pos.shape[0]

        # print('Number of regions: ',pos.shape[0],'/',num)

        if pos.shape[0] < 1:
            print("No sufficiently large cloud objects, returning nan")
            return float("nan"), float("nan")

        if self.bc == "periodic":
            dist_sq = np.zeros(nCl * (nCl - 1) //
                               2)  # to match the result of pdist
            for d in range(field.ndim):
                box = field.shape[d] // 2
                pos_1d = pos[:, d][:, np.newaxis]
                dist_1d = sd.pdist(pos_1d)
                dist_1d[dist_1d > box * 0.5] -= box
                dist_sq += dist_1d**2
            dist = np.sqrt(dist_sq)
        else:
            dist = sd.pdist(pos)

        D0 = gmean(dist)
        Nmax = field.shape[0] * field.shape[1] / 2
        scai = num / Nmax * D0 / self.L * 1000

        # Force SCAI to zero if there is only 1 region (completely aggregated)
        # This is not strictly consistent with the metric (as D0 is
        # objectively undefined), but is consistent with its spirit
        if pos.shape[0] == 1:
            scai = 0

        if self.plot:
            plt.imshow(field, "gray")
            plt.title("scai: " + str(round(scai, 3)))
            plt.show()

        return D0, scai
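
# The periodic branch above accumulates squared separations one axis at a time
# so that a wrap-around correction can be applied per dimension. A standalone
# sketch of the standard minimum-image version of that idea (assuming the
# domain lengths are known; not byte-for-byte identical to the code above):
import numpy as np
from scipy.spatial import distance as sd

def periodic_pdist(pos, domain):
    dist_sq = np.zeros(pos.shape[0] * (pos.shape[0] - 1) // 2)
    for d, L in enumerate(domain):
        dd = sd.pdist(pos[:, d][:, np.newaxis])   # 1D separations along axis d
        dd = np.minimum(dd, L - dd)               # wrap separations beyond L/2
        dist_sq += dd ** 2
    return np.sqrt(dist_sq)

# toy usage: 8 centroids on a 100x100 periodic field
d = periodic_pdist(np.random.rand(8, 2) * 100, domain=(100, 100))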
Exemplo n.º 45
0
def DB_cluster(density_clean,
               distance_cutoff_percent=0.02,
               delta_cutoff=0.5,
               interactive=False):
    distance_mtx_condensed = pdist(density_clean[:, 0:-1])
    density = density_clean[:, -1]
    cluster_center_index = []
    num_datapoint = len(density)
    cluster = np.full(num_datapoint, -1)
    num_cluster = 0

    distance_cutoff_index = math.ceil(distance_cutoff_percent *
                                      len(distance_mtx_condensed))
    distance_cutoff = np.sort(distance_mtx_condensed)[distance_cutoff_index]
    rho, rho_order, nearest_neighbor, delta = calculate_rho_delta(
        distance_mtx_condensed, density, distance_cutoff)

    if interactive:
        global fig, axis, col

        fig, axis = plt.subplots(dpi=200)
        mask = delta > delta_cutoff

        color = np.array([1, 0, 0, 1] * num_datapoint).reshape(
            -1, 4)  # original points: all red

        for index, decider in enumerate(mask):
            if decider:
                color[index] = [0, 1, 0, 1]  # color those above threshold green

        col = axis.scatter(rho, delta, c=color, marker='.', picker=True)

        axis.set_title("Decision Graph", fontsize='xx-large')
        axis.set_ylabel(r"$\delta$", fontsize='x-large')
        axis.set_xlabel(r"$\rho$", fontsize='x-large')

        fig.canvas.mpl_connect('pick_event', onpick3)

        plt.show()

        for index, point_color in enumerate(col.get_facecolors()):
            point_color = point_color.flatten()
            if not point_color[0]:  #if green, meaning selected
                num_cluster += 1
                cluster[index] = num_cluster
                cluster_center_index.append(index)
        plt.close('all')

    else:
        for i in range(num_datapoint):
            if delta[i] >= delta_cutoff:
                num_cluster += 1
                cluster[i] = num_cluster
                cluster_center_index.append(i)

    for i in range(num_datapoint):
        index = rho_order[i]
        if cluster[index] == -1:
            cluster[index] = cluster[nearest_neighbor[index]]

    assert (not np.any(cluster == -1))

    return rho, delta, cluster, cluster_center_index, distance_mtx_condensed, distance_cutoff
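
# calculate_rho_delta (and onpick3 for the interactive path) are defined
# elsewhere.  A hypothetical calculate_rho_delta consistent with how its
# return values are used above (density-peaks / decision-graph quantities in
# the style of Rodriguez & Laio) might look like this; the real helper may
# also use distance_cutoff to recompute rho, which this sketch does not.
import numpy as np
from scipy.spatial.distance import squareform

def calculate_rho_delta(distance_mtx_condensed, density, distance_cutoff):
    D = squareform(distance_mtx_condensed)
    rho = np.asarray(density, dtype=float)
    rho_order = np.argsort(-rho)                 # indices by decreasing density
    n = len(rho)
    delta = np.zeros(n)
    nearest_neighbor = np.zeros(n, dtype=int)
    top = rho_order[0]
    delta[top] = D[top].max()                    # convention for the densest point
    nearest_neighbor[top] = top
    for k in range(1, n):
        i = rho_order[k]
        higher = rho_order[:k]                   # all points of higher density
        j = higher[np.argmin(D[i, higher])]      # closest of those
        delta[i] = D[i, j]
        nearest_neighbor[i] = j
    return rho, rho_order, nearest_neighbor, delta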
Exemplo n.º 46
0
def compute_ssm(X, metric="seuclidean"):
    """Computes the self-similarity matrix of X."""
    D = distance.pdist(X, metric=metric)
    D = distance.squareform(D)
    D /= D.max()
    return 1 - D
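
# quick usage sketch: self-similarity matrix of a toy feature sequence
import numpy as np
from scipy.spatial import distance   # compute_ssm expects this module in scope

X = np.random.rand(20, 12)           # e.g. 20 frames of 12 features
S = compute_ssm(X)                   # values in [0, 1], 1.0 on the diagonal
assert S.shape == (20, 20)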
Exemplo n.º 47
0
def proclus(X,
            k=2,
            l=3,
            minDeviation=0.1,
            A=30,
            B=3,
            niters=30,
            seed=1234,
            verboseFlag=True):
    """ Run PROCLUS on a database to obtain a set of clusters and
        dimensions associated with each one.

        Parameters:
        ----------
        - X:                the data set
        - k:                the desired number of clusters
        - l:                average number of dimensions per cluster
        - minDeviation:     for selection of bad medoids
        - A:                constant for initial set of medoids
        - B:                a smaller constant than A for the final set of medoids
        - niters:           maximum number of iterations for the second phase
        - seed:             seed for the RNG
        - verboseFlag:      True/False flag for verbosity
    """
    np.random.seed(seed)

    N, d = X.shape

    if B > A:
        raise Exception("B has to be smaller than A.")

    if l < 2:
        raise Exception("l must be >=2.")

    ###############################
    # 1.) Initialization phase
    ###############################

    # first find a superset of the set of k medoids by random sampling
    idxs = np.arange(N)
    np.random.shuffle(idxs)
    S = idxs[0:(A * k)]
    M = greedy(X, S, B * k)

    ###############################
    # 2.) Iterative phase
    ###############################

    BestObjective = np.inf

    # choose a random set of k medoids from M:
    Mcurr = np.random.permutation(M)[0:k]  # M current
    Mbest = None  # Best set of medoids found

    D = squareform(pdist(X))  # precompute the euclidean distance matrix

    it = 0  # iteration counter
    L = []  # locality sets of the medoids, i.e., points within delta_i of m_i.
    Dis = []  # important dimensions for each cluster
    assigns = []  # cluster membership assignments

    while True:
        it += 1
        L = []

        for i in range(len(Mcurr)):
            mi = Mcurr[i]
            # compute delta_i, the distance to the nearest medoid of m_i:
            di = D[mi, np.setdiff1d(Mcurr, mi)].min()
            # compute L_i, points in sphere centered at m_i with radius d_i
            L.append(np.where(D[mi] <= di)[0])

        # find dimensions:
        Dis = findDimensions(X, k, l, L, Mcurr)

        # form the clusters:
        assigns = assignPoints(X, Mcurr, Dis)

        # evaluate the clusters:
        ObjectiveFunction = evaluateClusters(X, assigns, Dis, Mcurr)

        badM = []  # bad medoids

        Mold = Mcurr.copy()

        if ObjectiveFunction < BestObjective:
            BestObjective = ObjectiveFunction
            Mbest = Mcurr.copy()
            # compute the bad medoids in Mbest:
            badM = computeBadMedoids(X, assigns, Dis, Mcurr, minDeviation)
            if verboseFlag is True:
                print("bad medoids:")
                print(badM)

        if len(badM) > 0:
            # replace the bad medoids with random points from M:
            if verboseFlag is True:
                print("old mcurr:")
                print(Mcurr)
            Mavail = np.setdiff1d(M, Mbest)
            newSel = np.random.choice(Mavail, size=len(badM), replace=False)
            Mcurr = np.setdiff1d(Mbest, badM)
            Mcurr = np.union1d(Mcurr, newSel)
            if verboseFlag is True:
                print("new mcurr:")
                print(Mcurr)

        if verboseFlag is True:
            print("finished iter: %d" % it)

        if np.allclose(Mold, Mcurr) or it >= niters:
            break

    if verboseFlag is True:
        print("finished iterative phase...")

    ###############################
    # 3.) Refinement phase
    ###############################

    # compute a new L based on assignments:
    L = []
    for i in range(len(Mcurr)):
        mi = Mcurr[i]
        L.append(np.where(assigns == mi)[0])

    Dis = findDimensions(X, k, l, L, Mcurr)
    assigns = assignPoints(X, Mcurr, Dis)

    # handle outliers:

    # smallest Manhattan segmental distance of m_i to all (k-1)
    # other medoids with respect to D_i:
    deltais = np.zeros(k)
    for i in range(k):
        minDist = np.inf
        for j in range(k):
            if j != i:
                dist = manhattanSegmentalDist(X[Mcurr[i]], X[Mcurr[j]], Dis[i])
                if dist < minDist:
                    minDist = dist
        deltais[i] = minDist

    # mark as outliers the points that are not within delta_i of any m_i:
    for i in range(len(assigns)):
        clustered = False
        for j in range(k):
            d = manhattanSegmentalDist(X[Mcurr[j]], X[i], Dis[j])
            if d <= deltais[j]:
                clustered = True
                break
        if not clustered:
            # print "marked an outlier"
            assigns[i] = -1

    return (Mcurr, Dis, assigns)
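
# proclus relies on helpers not shown in this excerpt (greedy, findDimensions,
# assignPoints, evaluateClusters, computeBadMedoids, manhattanSegmentalDist).
# Assuming those are available from the same module, a call might look like:
import numpy as np

X = np.random.rand(200, 10)                       # 200 points, 10 dimensions
medoids, dims, assigns = proclus(X, k=3, l=4, niters=20, seed=42,
                                 verboseFlag=False)
# assigns[i] == -1 marks outliers; dims[c] lists the dimensions of cluster c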
Exemplo n.º 48
0
def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix inputs.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.

    n_jobs : int
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them in
        parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.

    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    """
    if (metric not in _VALID_METRICS and not callable(metric)
            and metric != "precomputed"):
        raise ValueError("Unknown metric %s. "
                         "Valid metrics are %s, or 'precomputed', or a "
                         "callable" % (metric, _VALID_METRICS))

    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True)
        return X
    elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric, **kwds)
    else:
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        X, Y = check_pairwise_arrays(X, Y, dtype=dtype)

        if n_jobs == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
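
# A small sketch of how this function is typically exercised (the public
# scikit-learn entry point sklearn.metrics.pairwise_distances behaves the
# same way for these cases):
import numpy as np
from sklearn.metrics import pairwise_distances

X = np.random.rand(5, 3)
D_self = pairwise_distances(X, metric="euclidean")        # (5, 5), zero diagonal
D_cross = pairwise_distances(X, X[:2], metric="cosine")   # (5, 2) cross distances
D_pre = pairwise_distances(D_self, metric="precomputed")  # returned unchanged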
Exemplo n.º 49
0
def merge_tracklets(video_id, tracks, obj_id=0, obj_sim_thr=0.9):
    """Merge tracklets based on feature similarity
    """

    # get list of boxes for each track
    track_boxes = collections.defaultdict(list)
    for fn in tracks:
        for x1, y1, x2, y2, tid in tracks[fn][obj_id]:
            track_boxes[tid].append(
                [int(fn), int(x1), int(y1),
                 int(x2), int(y2)])
    tids = list(track_boxes.keys())

    # load frames, compute features
    frames = sth_dataset.load_video_frames(video_id)
    track_feats = track_histogram(frames, track_boxes)

    # compute pair-wise distances to obtain merge candidates
    feats = np.array(list(track_feats.values()))
    similarity = 1 - squareform(pdist(feats, metric='cosine'))

    # compute similarity pairs
    cliques = []
    idx1, idx2 = np.where(similarity - np.eye(len(track_feats)) > obj_sim_thr)
    for i1, i2 in zip(idx1, idx2):
        t1, t2 = tids[i1], tids[i2]
        new = True
        for clq in cliques:
            if t1 in clq and t2 in clq:
                new = False
            elif t1 in clq and t2 not in clq:
                new = False
                clq.append(t2)
            elif t1 not in clq and t2 in clq:
                new = False
                clq.append(t1)
        if new:
            cliques.append([t1, t2])

    # convert tids to cliq ids
    tid2cliqid = {}
    for c, clq in enumerate(cliques):
        for tid in clq:
            tid2cliqid[tid] = c
    # fill in the singleton tracks
    for tid in tids:
        if tid not in tid2cliqid:
            tid2cliqid[tid] = len(
                tid2cliqid
            )  # should technically be a new cliq-id, but this works :)

    # update track ids
    for fn in tracks:
        keep = []
        tids_in_fn = []
        for k, box in enumerate(tracks[fn][obj_id]):
            this_tid = tid2cliqid[box[-1]]
            if this_tid in tids_in_fn:
                # ignore duplicated tid
                pass
            else:
                tracks[fn][obj_id][k][-1] = this_tid
                tids_in_fn.append(this_tid)
                keep.append(k)

        # delete duplicated tids
        tracks[fn][obj_id] = tracks[fn][obj_id][keep, :]

    return tracks
Exemplo n.º 50
0
def pairwise_distances(X, Y=None, metric="euclidean", **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Please note that support for sparse matrices is currently limited to those
    metrics listed in pairwise.pairwise_distance_functions.

    Valid values for metric are:

    - from scikits.learn: ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock']

    - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski',
      'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
      'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics.

    Note that in the case of 'euclidean' and 'cityblock' (which are valid
    scipy.spatial.distance metrics), the values will use the scikits.learn
    implementation, which is faster and has support for sparse matrices.
    For a verbose description of the metrics from scikits.learn, see the
    __doc__ of the sklearn.pairwise.distance_metrics function.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features]
        A second feature array only if X has shape [n_samples_a, n_features].

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.pairwise_distance_functions.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.

    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    """
    if metric == "precomputed":
        if X.shape[0] != X.shape[1]:
            raise ValueError("X is not square!")
        return X
    elif metric in pairwise_distance_functions:
        return pairwise_distance_functions[metric](X, Y, **kwds)
    elif callable(metric):
        # Check matrices first (this is usually done by the metric).
        X, Y = check_pairwise_arrays(X, Y)
        n_x, n_y = X.shape[0], Y.shape[0]
        # Calculate distance for each element in X and Y.
        D = np.zeros((n_x, n_y), dtype='float')
        for i in range(n_x):
            start = 0
            if X is Y:
                start = i
            for j in range(start, n_y):
                # Kernel assumed to be symmetric.
                D[i][j] = metric(X[i], Y[j], **kwds)
                if X is Y:
                    D[j][i] = D[i][j]
        return D
    else:
        # Note: the distance module doesn't support sparse matrices!
        if type(X) is csr_matrix:
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")
        if Y is None:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        else:
            if type(Y) is csr_matrix:
                raise TypeError("scipy distance metrics do not"
                                " support sparse matrices.")
            return distance.cdist(X, Y, metric=metric, **kwds)
def speakerDiarization(fileName,
                       numOfSpeakers,
                       mtSize=2.0,
                       mtStep=0.2,
                       stWin=0.05,
                       LDAdim=35,
                       PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT     (opt)   0 for not plotting the results, 1 for plotting
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    [
        Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
        stStep1, computeBEAT1
    ] = aT.loadKNNModel(os.path.join("data", "knnSpeakerAll"))
    [
        Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
        stStep2, computeBEAT2
    ] = aT.loadKNNModel(os.path.join("data", "knnSpeakerFemaleMale"))

    [MidTermFeatures,
     ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs,
                                                 mtSize * Fs, mtStep * Fs,
                                                 round(Fs * stWin),
                                                 round(Fs * stWin * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::,
                         i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2  # TODO
    # SELECT FEATURES:
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                                                                         # SET 0A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                                                                 # SET 0B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    #   97,98, 99,100];     # SET 0C

    iFeaturesSelect = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]  # SET 1A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                                          # SET 1B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];             # SET 2A
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 2C

    #iFeaturesSelect = range(100);                                                                                                    # SET 3
    #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN,
     STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)),
                             axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(MidTermFeatures[1,:])
    #EnergyMean = numpy.mean(MidTermFeatures[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    #print iNonOutLiers

    perOutLier = (100.0 *
                  (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        #for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2),
             mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0],
                                i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[
                mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] +
                                len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN,
         STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        #MDistancesAll = numpy.mean(DistancesAll)
        #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []
        silB = []
        for c in range(iSpeakers
                       ):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(
                len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls ==
                                                              c]  # get subset of feature vectors
                Yt = distance.pdist(
                    MidTermFeaturesNormTemp.T
                )  # compute average distance between samples that belong to the cluster (a values)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(
                        iSpeakers
                ):  # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(
                            cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,
                                                                       cls ==
                                                                       c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(
                    min(silBs)
                )  # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) /
                       (max(silB[c], silA[c]) + 0.00001))  # compute silhouette

        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILHOUETTE

    #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(
            MidTermFeaturesNormOr, cls)
        hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0],
                                       "diag")  # hmm training
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]  # final silhouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gtFile = fileName.replace('.wav', '.segments')
    # check for an annotated ground-truth file
    if os.path.isfile(gtFile):  # if ground truth exists
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels,
                                           mtStep)  # convert to flags

    if PLOT:
        fig = plt.figure()
        if numOfSpeakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(classNames))))
        ax1.axis((0, Duration, -1, len(classNames)))
        ax1.set_yticklabels(classNames)
        ax1.plot(numpy.array(range(len(cls))) * mtStep + mtStep / 2.0, cls)

    if os.path.isfile(gtFile):
        if PLOT:
            ax1.plot(
                numpy.array(range(len(flagsGT))) * mtStep + mtStep / 2.0,
                flagsGT, 'r')
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(
            cls, flagsGT)
        print "{0:.1f}\t{1:.1f}".format(100 * purityClusterMean,
                                        100 * puritySpeakerMean)
        if PLOT:
            plt.title(
                "Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(
                    100 * purityClusterMean, 100 * puritySpeakerMean))
    if PLOT:
        plt.xlabel("time (seconds)")
        #print sRange, silAll
        if numOfSpeakers <= 0:
            plt.subplot(212)
            plt.plot(sRange, silAll)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return cls
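
# hypothetical usage: the WAV path is a placeholder and the pretrained
# data/knnSpeaker* models loaded at the top of the function must be available
cls = speakerDiarization("meeting.wav", 4, PLOT=False)
# cls[i] is the speaker/cluster label assigned to the i-th mid-term window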
    'sparse_fct': 'global_sparse',
    'smooth_param': 2,
    'init': 'convex',
    'neg_time': False,
    'verbose': 0,
    'maxcount': 50  #,
    #'sparse_param':sparseness
}

print("analysing files in {}".format(basepath))
for sparseness in np.arange(0.1, 1.000001, 0.1):
    ts_path = os.path.join(
        basepath, '_'.join([
            "nnmf",
            str(config_dict['num_components']),
            "sm{}".format(config_dict['smooth_param']), config_dict['init'],
            'sp{:02.0f}'.format(sparseness * 10), datafilename
        ]))

    decomposition = ia.TimeSeries()
    decomposition.load(ts_path)
    signal = ia.TrialMean()(ia.CutOut(response_window)(decomposition))
    mode_cor = ia.CalcStimulusDrive()(signal)
    mask = mode_cor._series.squeeze() < 0.5
    if np.sum(mask) > 1:  #if there are stimulus driven components
        selected_modes = ia.SelectObjects()(decomposition, mask)
        cor = np.nanmax(1 - pdist(selected_modes.base._series, 'correlation'))
    else:
        cor = np.nanmax(1 - pdist(decomposition.base._series, 'correlation'))
    print("{}\t{}".format(sparseness, cor))
        datax=dataText_features[Trmask,:]
        
        clusterslabels=[ClusterLabel[i] for i in dayindexes]
        GTclusterslabels=numpy.unique(clusterslabels,return_inverse=True,return_counts=True)
        dayTrlabels=numpy.asarray(ClusterLabel)[Trmask]
        #kmeans = KMeans(n_clusters=len(GTclusterslabels[0]), random_state=0).fit(datax)
        #db = DBSCAN(eps=2, min_samples=2).fit(datax)

        pred_y=[]        
        for x1 in range(datax.shape[0]):
          if sum(datax[x1,:]==0)==datax.shape[1]:
            continue
          for x2 in range(x1+1,datax.shape[0]):
            if sum(datax[x2,:]==0)==datax.shape[1]:
              continue              
            d = distance.pdist(numpy.vstack((datax[x1,:],datax[x2,:])), metric='cosine')[0]
            if numpy.isnan(d):
              print(distance.pdist(numpy.vstack((datax[x1,:],datax[x2,:])), metric='cosine'))
              print(x1, x2)
              print(datax.shape)
              print(Trainingindexes)
              print(sum(datax[x1,:]==0))  # sum(numpy.isnan(datax[x1,:]))
              break
            if dayTrlabels[x1]==dayTrlabels[x2]:
                outfile.append(str(d)+","+str(1))                  
            else:
                outfile.append(str(d)+","+str(-1))
            pred_y.append(d)        
        
        dayLabelsList=[]
        for x1 in range(dayTrlabels.shape[0]):
Exemplo n.º 54
0
    def plot_heatmap(
        self,
        kind="final",
        min_freq=0.01,
        threshold=2,
        name=True,
        indirect=True,
        figsize=None,
        max_number_factors=5,
        aspect=1,
        cmap="RdBu_r",
        **kwargs,
    ):
        """Plot clustered heatmap of predicted motif activity.

        Parameters
        ----------
        kind : str, optional
            Which data type to use for plotting. Default is 'final', which will
            plot the result of the rank aggregation. Other options are 'freq'
            for the motif frequencies, or any of the individual activities such
            as 'rf.score'.

        min_freq : float, optional
            Minimum frequency of motif occurrence.

        threshold : float, optional
            Minimum activity (absolute) of the rank aggregation result.

        name : bool, optional
            Use factor names instead of motif names for plotting.

        indirect : bool, optional
            Include indirect factors (computationally predicted or non-curated). Default is True.

        max_number_factors : int, optional
            Truncate the list of factors to this maximum size.

        figsize : tuple, optional
            Tuple of figure size (width, height).

        aspect : int, optional
            Aspect ratio for tweaking the plot.

        cmap : str, optional
            Color paletter to use, RdBu_r by default.

        kwargs : other keyword arguments
            All other keyword arguments are passed to sns.heatmap

        Returns
        -------
        cg : ClusterGrid
            A seaborn ClusterGrid instance.
        """

        filt = np.any(np.abs(self.result) >= threshold, 1)
        if hasattr(self, "freq"):
            filt = filt & np.any(np.abs(self.freq.T) >= min_freq, 1)
        else:
            filt = filt & (self.counts.sum() / self.counts.shape[0] > min_freq)

        idx = self.result.loc[filt].index

        if idx.shape[0] == 0:
            logger.warning("Empty matrix, try lowering the threshold")
            return

        if idx.shape[0] >= 100:
            logger.warning("The filtered matrix has more than 100 rows.")
            logger.warning(
                "It might be worthwhile to increase the threshold for visualization"
            )

        if kind == "final":
            data = self.result
        elif kind == "freq":
            if hasattr(self, "freq"):
                data = self.freq.T
                cmap = "Reds"
            else:
                raise ValueError(
                    "frequency plot only works with maelstrom output from clusters"
                )
        elif kind in self.activity:
            data = self.activity[kind]
            if kind in ["hypergeom.count", "mwu.score"]:
                cmap = "Reds"
        else:
            raise ValueError("Unknown dtype")

        m = data.loc[idx]

        if "vmax" in kwargs:
            vmax = kwargs.pop("vmax")
        else:
            vmax = max(abs(np.percentile(m, 1)), np.percentile(m, 99))

        if "vmin" in kwargs:
            vmin = kwargs.pop("vmin")
        else:
            vmin = -vmax

        if name:
            m["factors"] = [
                self.motifs[n].format_factors(
                    max_length=max_number_factors,
                    html=False,
                    include_indirect=indirect,
                    extra_str=",..",
                )
                for n in m.index
            ]
            m = m.set_index("factors")
        h, w = m.shape

        if figsize is None:
            figsize = (4 + m.shape[1] / 4, 1 + m.shape[0] / 3)

        fig = plt.figure(figsize=figsize)
        npixels = 30
        g = GridSpec(
            2, 1, height_ratios=(fig.get_figheight() * fig.dpi - npixels, npixels)
        )
        ax1 = fig.add_subplot(g[0, :])
        ax2 = fig.add_subplot(g[1, :])
        ax2.set_title("aggregated z-score")
        dm = pdist(m, metric="correlation")
        hc = linkage(dm, method="ward")
        leaves = dendrogram(hc, no_plot=True)["leaves"]
        cg = sns.heatmap(
            m.iloc[leaves],
            ax=ax1,
            cbar_ax=ax2,
            cbar_kws={"orientation": "horizontal"},
            cmap=cmap,
            linewidths=1,
            vmin=vmin,
            vmax=vmax,
            **kwargs,
        )
        plt.setp(cg.axes.xaxis.get_majorticklabels(), rotation=90)
        plt.tight_layout()
        # cg.ax_col_dendrogram.set_visible(False)
        # plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
        return cg
Exemplo n.º 55
0
def visualize_maelstrom(outdir, sig_cutoff=3, pfmfile=None):

    config = MotifConfig()
    if pfmfile is None:
        pfmfile = config.get_default_params().get("motif_db", None)
        pfmfile = os.path.join(config.get_motif_dir(), pfmfile)

    mapfile = pfmfile.replace(".pwm", ".motif2factors.txt")
    if os.path.exists(mapfile):

        m2f = pd.read_csv(
            mapfile, sep="\t", names=["motif", "factors"], index_col=0, comment="#"
        )
        m2f["factors"] = m2f["factors"].str[:50]
    else:
        motifs = [m.id for m in read_motifs(pfmfile)]
        m2f = pd.DataFrame({"factors": motifs}, index=motifs)

    sig_fname = os.path.join(outdir, "final.out.txt")
    df_sig = pd.read_table(sig_fname, index_col=0, comment="#")
    f = np.any(df_sig >= sig_cutoff, 1)
    vis = df_sig[f]
    if vis.shape[0] == 0:
        logger.info("No motifs reach the threshold, skipping visualization.\n")
        return

    # cluster rows
    row_linkage = hierarchy.linkage(pdist(vis, metric="euclidean"), method="complete")
    idx = hierarchy.leaves_list(row_linkage)

    plt.figure()

    vis = safe_join(vis, m2f).set_index("factors")

    # size of figure
    size = [2 + vis.shape[1] * 0.4, 1.8 + vis.shape[0] * 0.3]

    cg = sns.heatmap(
        vis.iloc[idx],
        cmap="viridis",
        yticklabels=True,
        cbar_kws={"orientation": "horizontal"},
    )
    _ = plt.setp(cg.yaxis.get_majorticklabels(), rotation=0)
    plt.title("Motif Relevance")
    plt.tight_layout()
    plt.savefig(os.path.join(outdir, "motif.relevance.png"), dpi=300)

    freq_fname = os.path.join(outdir, "motif.freq.txt")
    if os.path.exists(freq_fname):
        df_freq = pd.read_table(freq_fname, index_col=0, comment="#")
        df_freq = df_freq.T
        vis_freq = df_freq.loc[vis.iloc[idx].index]
        vis_freq = safe_join(vis_freq, m2f).set_index("factors")
        plt.figure(figsize=size)
        cg = sns.heatmap(
            vis_freq,
            cmap="viridis",
            yticklabels=True,
            vmin=0,
            vmax=0.2,
            cbar_kws={"orientation": "horizontal"},
        )
        # idx = cg.dendrogram_row.reordered_ind
        _ = plt.setp(cg.yaxis.get_majorticklabels(), rotation=0)
        plt.title("Motif Frequency")
        plt.tight_layout()
        plt.savefig(os.path.join(outdir, "motif.frequency.png"), dpi=300)

        plt.figure(figsize=size)

        bla = vis_freq.min(1)
        bla[bla < 0.01] = 0.01

        cg = sns.heatmap(
            np.log2(vis_freq.apply(lambda x: x / bla, 0)),
            yticklabels=True,
            vmin=-5,
            vmax=5,
            cbar_kws={"orientation": "horizontal"},
        )
        # idx = cg.dendrogram_row.reordered_ind
        _ = plt.setp(cg.yaxis.get_majorticklabels(), rotation=0)
        plt.title("Motif Enrichment")
        plt.tight_layout()
        plt.savefig(os.path.join(outdir, "motif.enrichment.png"), dpi=300)
Exemplo n.º 56
0
def vector_dispersion(vectors):
    distances = pdist(vectors, metric="cosine")
    dispersion = np.arccos(1.0 - distances.max())
    return dispersion
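
# usage sketch: nearly parallel vectors give a small dispersion, orthogonal
# vectors give about pi/2
import numpy as np
from scipy.spatial.distance import pdist

tight = np.array([[1.0, 0.0], [0.99, 0.1], [0.98, 0.2]])
spread = np.array([[1.0, 0.0], [0.0, 1.0]])
print(vector_dispersion(tight), vector_dispersion(spread))   # ~0.2 and ~1.57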
Exemplo n.º 57
0
    def _execute_map(cls, ctx, op):
        from scipy.spatial.distance import pdist, cdist

        inputs, device_id, xp = as_same_device(
            [ctx[inp.key] for inp in op.inputs],
            device=op.device,
            ret_extra=True)

        if xp is cp:  # pragma: no cover
            raise NotImplementedError(
                '`pdist` does not support running on GPU yet')

        with device(device_id):
            inputs_iter = iter(inputs)
            a = next(inputs_iter)
            if op.b is not None:
                b = next(inputs_iter)
            else:
                b = None
            kw = dict()
            if op.p is not None:
                kw['p'] = op.p
            if op.w is not None:
                kw['w'] = next(inputs_iter)
            if op.v is not None:
                kw['V'] = next(inputs_iter)
            if op.vi is not None:
                kw['VI'] = next(inputs_iter)
            metric = op.metric if op.metric is not None else op.metric_func

            if b is None:
                # one input, pdist on same chunk
                dists = pdist(a, metric=metric, **kw)
                i_indices, j_indices = xp.triu_indices(a.shape[0], k=1)
                i_indices += op.a_offset
                j_indices += op.a_offset
            else:
                # two inputs, pdist on different chunks
                dists = cdist(a, b, metric=metric, **kw).ravel()
                mgrid = \
                    xp.mgrid[op.a_offset: op.a_offset + a.shape[0],
                    op.b_offset: op.b_offset + b.shape[0]]
                i_indices, j_indices = mgrid[0].ravel(), mgrid[1].ravel()

            out_row_sizes = xp.arange(op.n - 1, -1, -1)
            out_row_cum_sizes = xp.empty((op.n + 1, ), dtype=int)
            out_row_cum_sizes[0] = 0
            xp.cumsum(out_row_sizes, out=out_row_cum_sizes[1:])
            indices = out_row_cum_sizes[i_indices] + j_indices - \
                      (op.n - out_row_sizes[i_indices])

            # save as much memory as possible
            del i_indices, j_indices, out_row_sizes, out_row_cum_sizes

            out_cum_size = xp.cumsum(op.out_sizes)
            out = op.outputs[0]
            for i in range(len(op.out_sizes)):
                start_index = out_cum_size[i - 1] if i > 0 else 0
                end_index = out_cum_size[i]
                to_filter = (indices >= start_index) & (indices < end_index)
                downside_indices = indices[to_filter] - start_index
                downside_dists = dists[to_filter]
                ctx[out.key, str(i)] = (downside_indices, downside_dists)
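
# The index arithmetic above maps each (i, j) pair with i < j onto its
# position in SciPy's condensed pdist vector.  A small independent check of
# that mapping (a sketch, separate from the operator class):
import numpy as np
from scipy.spatial.distance import pdist, squareform

n = 6
X = np.random.rand(n, 3)
condensed = pdist(X)
D = squareform(condensed)
for i in range(n):
    for j in range(i + 1, n):
        idx = n * i - i * (i + 1) // 2 + (j - i - 1)   # condensed index of (i, j)
        assert np.isclose(condensed[idx], D[i, j])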
Exemplo n.º 58
0
def create(patterns):
    rdm = RDM()
    rdm.utv = pdist(patterns, 'correlation')
    rdm.square = squareform(rdm.utv)
    return rdm
    u_cols = list(set([l.rsplit("_", 1)[0] for l in list(counts.columns)]))
    cols = list(counts.columns)
    ss = []
    for uc in u_cols:
        cs = [c for c in cols if c.startswith(uc)]
        ss.append(counts[cs].sum(axis=1).rename(uc))
    dc = pd.concat(ss, axis=1)
    return dc


collapsed_counts = collapse_counts(counts)
lut = dict(zip(list(set([c[:3] for c in collapsed_counts.columns])), "rbg"))
row_colors = [lut[c[:3]] for c in collapsed_counts.columns]
# legend_TN = [mpatches.Patch(color=c, label=l) for (list(set([c[:3] for c in collapsed_counts.columns]))]

distances = pdist(collapsed_counts.T.values, metric='euclidean')
dist_matrix = squareform(distances)
dist_df = pd.DataFrame(
    dist_matrix, columns=collapsed_counts.columns, index=collapsed_counts.columns)
sns.clustermap(dist_df)

pairings_05hr = [['col_c_05h', 'col_w_05h'],
                 ['lym_c_05h', 'lym_w_05h'],
                 ['cer_c_05h', 'cer_w_05h']]

pairings_6hr = [['col_c_6h', 'col_w_6h'],
                ['lym_c_6h', 'lym_w_6h'],
                ['cer_c_6h', 'cer_w_6h']]

pairings_to_lym_05hr = [['col_c_05h', 'lym_w_05h'],
                        ['col_w_05h', 'lym_w_05h'],
Exemplo n.º 60
0
# @Time : 2020/3/4 16:06
# @Author : hqjiang
# @File : distance_points.py

import numpy as np
from scipy.spatial.distance import pdist, squareform

x = np.array([[0, 1], [1, 0], [2, 0]])
print(x)

# pairwise Euclidean distances between every pair of points in x
d = squareform(pdist(x, 'euclidean'))  # Euclidean distance
print(d)