def linkage(y, method):
    '''Hierarchical clustering with an optional custom set-distance function.

    An extended edition of scipy.cluster.hierarchy.linkage.

    Parameters
    ----------
    y : array_like
        Condensed pairwise distance vector. It is validated with
        ``distance.is_valid_y`` when ``method`` is a callable.
    method : str or callable
        A string is forwarded unchanged to scipy.cluster.hierarchy.linkage.
        A callable must have the form ``f(dm, set1, set2)`` and return the
        distance between two sets of observation indices, given the full
        square distance matrix ``dm``.

    Returns
    -------
    Z : ndarray, shape (d - 1, 4)
        Linkage matrix. Row ``i`` holds the ids of the two merged clusters,
        the merge distance and the size of the new cluster (id ``d + i``).
    '''
    if isinstance(method, str):
        return scipy.cluster.hierarchy.linkage(y, method=method)

    distance.is_valid_y(y, throw=True, name='y')
    d = distance.num_obs_y(y)
    Z = np.zeros((d - 1, 4))
    # Working matrix: must be float so that the diagonal can hold NaN.
    dm = distance.squareform(y).astype(float)
    # Pristine copy that is handed to the user-supplied set distance.
    dmo = distance.squareform(y)
    dm[np.diag_indices(d)] = np.nan

    idmap = {i: i for i in range(d)}      # matrix row -> current cluster id
    active = list(range(d))               # matrix rows still representing a cluster
    nodes = {i: {i} for i in range(d)}    # cluster id -> member observations

    for i in range(d - 1):
        m = d - i                         # number of active rows
        mink = np.nanargmin(dm[np.ix_(active, active)])
        minh = active[mink // m]
        minw = active[mink % m]
        left = idmap[minh]
        right = idmap[minw]

        Z[i, 0] = left
        Z[i, 1] = right
        Z[i, 2] = dm[minh, minw]
        Z[i, 3] = len(nodes[left]) + len(nodes[right])

        # The merged cluster reuses row `minh`; row `minw` is retired.
        nid = d + i
        idmap[minh] = nid
        nodes[nid] = nodes[left] | nodes[right]
        active.remove(minw)

        for j in active:
            if j == minh:
                continue
            dm[minh, j] = method(dmo, nodes[nid], nodes[idmap[j]])
            dm[j, minh] = dm[minh, j]
    return Z
def __fullEvalMat(self, cluster, Dissimilarity):
    """Return the pairwise dissimilarities of ``cluster`` as a square matrix.

    ``Dissimilarity.pdist(cluster)`` is evaluated in full; when the result is
    a condensed vector it is expanded with ``scpDist.squareform``.
    """
    # Evaluate all distances even if we have a diss matrix?
    pairwise = Dissimilarity.pdist(cluster)
    is_condensed = scpDist.is_valid_y(pairwise)
    square = scpDist.squareform(pairwise) if is_condensed else pairwise
    # debug: whatever we hand back must be square
    n_rows, n_cols = square.shape[0], square.shape[1]
    assert n_rows == n_cols
    return square
def get_linkage(self, d=None, method='average', skip_condensed=False):
    """Compute a linkage matrix, store it on ``self.Z`` and return it.

    ``d`` defaults to ``self.G_sym or self.G``. A networkx graph is turned
    into a matrix first; unless ``skip_condensed`` is set, anything that is
    not already a valid condensed vector is passed through ``squareform``
    before ``linkage`` is called with the requested ``method``.
    """
    distances = d
    if distances is None:
        distances = self.G_sym or self.G

    # convert to numpy matrix if it isn't already
    if isinstance(distances, nx.Graph):
        distances = nx.to_numpy_matrix(distances)

    needs_condensing = (skip_condensed is False) and (not is_valid_y(distances))
    if needs_condensing:
        distances = squareform(distances)

    Z = linkage(distances, method=method)
    self.Z = Z
    return Z
def setup_distances(data, shortest_path=False, min_distance=1e-4, **kwargs):
    """\
    Sets up condensed distances.

    Parameters
    ----------

    data : array
        Distance/dissimilarity data. Options:
        1) a condensed array containing distances,
           shape (n_samples*(n_samples-1)/2,)
        2) a square matrix containing distances (n_samples, n_samples)
        3) an array with features (length n_samples); any non-square 2-D
           array is treated as features
    shortest_path : boolean
        If True, alter distances by computing shortest path.
    min_distance : float or None
        If not None, every distance is raised to at least
        ``min_distance * max(distances)``.

    kwargs
    ------

    metric : str
        If computing distances from an array of features, this is the
        metric to be passed to scipy.spatial.distance.pdist
        (default 'euclidean'). Other keyword arguments are ignored.

    Returns
    -------

    distances : array, shape (n_samples*(n_samples-1)/2,)
        Condensed distances.
    """
    assert isinstance(data, np.ndarray)

    if len(data.shape) == 1:
        assert distance.is_valid_y(data)
        distances = data
    else:
        assert len(data.shape) == 2
        n_rows, n_cols = data.shape
        if n_rows == n_cols:
            distances = distance.squareform(data, checks=False)
        else:
            # Only `metric` is forwarded, so unrelated kwargs stay harmless.
            metric = kwargs.get('metric', 'euclidean')
            distances = distance.pdist(data, metric=metric)

    if shortest_path:
        distances = distance.squareform(distances)
        distances = csgraph.shortest_path(distances)
        distances = distance.squareform(distances, checks=False)

    # np.max raises on an empty vector (single sample), so skip the floor.
    if min_distance is not None and np.size(distances) > 0:
        distances = np.maximum(distances, min_distance * np.max(distances))

    return distances
def simplex_volume(*, vertices=None, sides=None) -> float:
    """
    Return the volume of the simplex with given vertices or sides.

    If vertices are given they must be in a NumPy array with shape (N+1, N):
    the position vectors of the N+1 vertices in N dimensions. If the sides
    are given, they must be the compressed pairwise distance matrix as
    returned from scipy.spatial.distance.pdist.

    Raises a ValueError if neither or both of ``vertices`` and ``sides`` are
    given, if ``sides`` is not a valid condensed distance vector, or if the
    input does not form a simplex (for example, because the vertices are
    coplanar, colinear or coincident).

    Warning: this algorithm has not been tested for numerical stability.
    """
    # Implements http://mathworld.wolfram.com/Cayley-MengerDeterminant.html

    if (vertices is None) == (sides is None):
        raise ValueError("Exactly one of vertices and sides must be given")

    # β_ij = |v_i - v_j|²
    if sides is None:
        vertices = np.asarray(vertices, dtype=float)
        sq_dists = distance.pdist(vertices, metric='sqeuclidean')
    else:
        sides = np.asarray(sides, dtype=float)
        if not distance.is_valid_y(sides):
            raise ValueError("Invalid number or type of side lengths")
        sq_dists = sides**2

    # Add border while compressed: prepending a row of ones to the condensed
    # vector yields the bordered Cayley-Menger matrix after squareform.
    num_verts = distance.num_obs_y(sq_dists)
    bordered = np.concatenate((np.ones(num_verts), sq_dists))

    # Make matrix and find volume
    sq_dists_mat = distance.squareform(bordered)
    dim = num_verts - 1
    coeff = -(-2)**dim * factorial(dim)**2
    vol_square = np.linalg.det(sq_dists_mat) / coeff

    if vol_square <= 0:
        raise ValueError('Provided vertices or sides do not form a simplex')

    return float(np.sqrt(vol_square))
def computeDistances(descriptors, method, parallel, nprocs, distance_func=None):
    """Compute the full pairwise distance matrix of ``descriptors``.

    Parameters
    ----------
    descriptors : sequence
        Items to compare; indexed by integer position.
    method :
        Passed to ``computeDistance`` when no ``distance_func`` is given.
    parallel : bool
        If true, the work is dispatched through ``pc.parmap`` with
        ``nprocs``; otherwise it runs in this process.
    nprocs :
        Forwarded to ``pc.parmap``.
    distance_func : callable, optional
        ``f(a, b) -> distance``; replaces ``computeDistance`` when given.

    Returns
    -------
    ndarray
        Square float matrix whose diagonal is set to the largest float, so
        that an item is never its own nearest neighbour.

    Raises
    ------
    ValueError
        If the collected distances do not form a condensed distance vector.
    """
    num_desc = len(descriptors)

    indices = [(y, x) for y in range(num_desc - 1)
               for x in range(y + 1, num_desc)]
    splits = np.array_split(np.array(indices), 8)

    def loop(inds):
        dists = []
        for first, second in inds:
            if distance_func is None:
                try:
                    dist = computeDistance(descriptors[first],
                                           descriptors[second], method)
                except Exception:
                    print('method {} failed'.format(method))
                    raise
            else:
                dist = distance_func(descriptors[first], descriptors[second])
            dists.append(dist)
        return dists

    if parallel:
        dists = pc.parmap(loop, splits, nprocs)
    else:
        # A real list: np.concatenate cannot consume a lazy map object.
        dists = [loop(split) for split in splits]

    # convert condensed vector-form to matrix (float, so that the diagonal
    # sentinel below is representable)
    dense_vector = np.asarray(np.concatenate(dists), dtype=float)
    if not spdistance.is_valid_y(dense_vector, warning=True):
        n = dense_vector.shape[0]
        d = int(np.ceil(np.sqrt(n * 2)))
        should = d * (d - 1) // 2
        raise ValueError(
            'not a valid condensed distance matrix! '
            '{} != {}, num: {}'.format(should, n, num_desc))
    dist_matrix = spdistance.squareform(dense_vector)

    # fill diagonal elements with max
    np.fill_diagonal(dist_matrix, np.finfo(float).max)
    return dist_matrix
def most_and_least_similar_pairs(distance_matrix):
    """Find the closest and the farthest pair in a distance matrix.

    Parameters
    ----------
    distance_matrix : ndarray
        Either a redundant (square) distance matrix or a condensed distance
        vector; a condensed vector is expanded with ``squareform``.

    Returns
    -------
    tuple
        ``(similar_score, similar_indices, dissimilar_score,
        dissimilar_indices)`` where the indices are ``(i, j)`` with
        ``i < j``. On ties the first pair in row-major order wins.

    Raises
    ------
    ValueError
        If the input is neither a valid redundant nor a valid condensed
        distance matrix, or if it describes fewer than two objects.
    """
    if not distance.is_valid_dm(distance_matrix):
        if not distance.is_valid_y(distance_matrix):
            raise ValueError('Invalid distance matrix. Please supply a condensed or redundant distance matrix.')
        distance_matrix = distance.squareform(distance_matrix, force='tomatrix')

    n = distance_matrix.shape[0]
    if n < 2:
        raise ValueError('At least two objects are required to form a pair.')

    # Seed both extremes with the first pair instead of the constants 1 and
    # 0, so that distances outside [0, 1] are handled as well.
    similar_score = dissimilar_score = distance_matrix[0, 1]
    similar_indices = dissimilar_indices = (0, 1)

    for i in range(n):
        for j in range(i + 1, n):
            score = distance_matrix[i, j]
            if score < similar_score:
                similar_score = score
                similar_indices = (i, j)
            if score > dissimilar_score:
                dissimilar_score = score
                dissimilar_indices = (i, j)

    return similar_score, similar_indices, dissimilar_score, dissimilar_indices
def computeDistances(descriptors, method, parallel, nprocs, distance_func=None):
    """Compute the full pairwise distance matrix of ``descriptors``.

    Parameters
    ----------
    descriptors : sequence
        Items to compare; indexed by integer position.
    method :
        Passed to ``computeDistance`` when no ``distance_func`` is given.
    parallel : bool
        If true, the work is dispatched through ``pc.parmap`` with
        ``nprocs``; otherwise it runs in this process.
    nprocs :
        Forwarded to ``pc.parmap``.
    distance_func : callable, optional
        ``f(a, b) -> distance``; replaces ``computeDistance`` when given.

    Returns
    -------
    ndarray
        Square float matrix whose diagonal is set to the largest float, so
        that an item is never its own nearest neighbour.

    Raises
    ------
    ValueError
        If the collected distances do not form a condensed distance vector.
    """
    num_desc = len(descriptors)

    indices = [(y, x) for y in range(num_desc - 1)
               for x in range(y + 1, num_desc)]
    splits = np.array_split(np.array(indices), 8)

    def loop(inds):
        dists = []
        for first, second in inds:
            if distance_func is None:
                try:
                    dist = computeDistance(descriptors[first],
                                           descriptors[second], method)
                except Exception:
                    print('method {} failed'.format(method))
                    raise
            else:
                dist = distance_func(descriptors[first], descriptors[second])
            dists.append(dist)
        return dists

    if parallel:
        dists = pc.parmap(loop, splits, nprocs)
    else:
        # A real list: np.concatenate cannot consume a lazy map object.
        dists = [loop(split) for split in splits]

    # convert condensed vector-form to matrix (float, so that the diagonal
    # sentinel below is representable)
    dense_vector = np.asarray(np.concatenate(dists), dtype=float)
    if not spdistance.is_valid_y(dense_vector, warning=True):
        n = dense_vector.shape[0]
        d = int(np.ceil(np.sqrt(n * 2)))
        should = d * (d - 1) // 2
        raise ValueError(
            'not a valid condensed distance matrix! '
            '{} != {}, num: {}'.format(should, n, num_desc))
    dist_matrix = spdistance.squareform(dense_vector)

    # fill diagonal elements with max
    np.fill_diagonal(dist_matrix, np.finfo(float).max)
    return dist_matrix
def Test(X, Y, perms=10000, method='pearson', tail='upper'):
    """
    Takes two distance matrices (either redundant matrices or condensed
    vectors) and performs a Mantel test. The Mantel test is a significance
    test of the correlation between two distance matrices.

    Parameters
    ----------
    X : array_like
        First distance matrix (condensed or redundant).
    Y : array_like
        Second distance matrix (condensed or redundant), where the order of
        elements corresponds to the order of elements in the first matrix.
    perms : int, optional
        The number of permutations to perform (default: 10000). A larger
        number gives more reliable results but takes longer to run. If the
        actual number of possible permutations is smaller, the program will
        enumerate all permutations. Enumeration can be forced by setting
        this argument to 0.
    method : str, optional
        Type of correlation coefficient to use; either 'pearson' or
        'spearman' (default: 'pearson').
    tail : str, optional
        Which tail to test in the calculation of the empirical p-value;
        either 'upper' or 'lower' (default: 'upper').

    Returns
    -------
    r : float
        Veridical correlation
    p : float
        Empirical p-value
    z : float
        Standard score (z-score)

    Raises
    ------
    ValueError
        If X or Y is not a valid distance matrix, if their sizes differ or
        are too small, or if ``method`` or ``tail`` is not recognised.
    """
    # Ensure X and Y are arrays.
    X = asarray(X, dtype=float)
    Y = asarray(Y, dtype=float)

    # Check that X and Y are valid distance matrices/vectors.
    if not distance.is_valid_dm(X) and not distance.is_valid_y(X):
        raise ValueError('X is not a valid distance matrix')
    if not distance.is_valid_dm(Y) and not distance.is_valid_y(Y):
        raise ValueError('Y is not a valid distance matrix')

    # If X or Y is a matrix, condense it to a vector.
    if len(X.shape) == 2:
        X = distance.squareform(X, force='tovector', checks=False)
    if len(Y.shape) == 2:
        Y = distance.squareform(Y, force='tovector', checks=False)

    # Check for size equality.
    if X.shape[0] != Y.shape[0]:
        raise ValueError('X and Y are not of equal size')

    # Check for minimum size.
    if X.shape[0] < 3:
        raise ValueError('X and Y should represent at least 3 objects')

    # Validate both options up front so that a typo is reported before any
    # permutation work is done.
    if method not in ('pearson', 'spearman'):
        raise ValueError('The method should be set to "pearson" or "spearman"')
    if tail not in ('upper', 'lower'):
        raise ValueError('The tail should be set to "upper" or "lower"')

    # If Spearman correlation is requested, convert X and Y to ranks.
    if method == 'spearman':
        X = rankdata(X)
        Y = rankdata(Y)

    # Most parts of the correlation coefficient will be the same for every
    # permutation and can therefore be computed outside the loop.
    X_res = X - X.mean()  # X residuals
    Y_res = Y - Y.mean()  # Y residuals
    X_ss = (X_res * X_res).sum()  # X sum-of-squares
    Y_ss = (Y_res * Y_res).sum()  # Y sum-of-squares
    denominator = sqrt(X_ss * Y_ss)  # Denominator of the correlation coefficient

    # Although Y_res will be the same set of numbers on every permutation,
    # the order will be different each time. Therefore, we reformat Y_res as
    # a matrix so that we can take matrix permutations of the Y residuals.
    Y_res_as_matrix = distance.squareform(Y_res, force='tomatrix', checks=False)

    # Determine the size of the matrix (i.e. number of rows/columns).
    n = Y_res_as_matrix.shape[0]
    n_orders = factorial(n)

    # Scratch buffer for the condensed form of each permuted matrix.
    Y_res_permuted = zeros(Y_res.shape[0], dtype=float)

    # Rather than use distance.squareform(), we call directly into the C
    # wrapper for speed.
    condense = distance._distance_wrap.to_vector_from_squareform_wrap

    # Either enumerate all permutations ...
    if perms >= n_orders or perms == 0:
        corrs = zeros(n_orders, dtype=float)
        # The identity permutation comes first, so corrs[0] is the veridical
        # correlation. `perms` ends up as the number of orders enumerated.
        perms = 0
        for order in permutations(range(n)):
            Y_res_as_matrix_permuted = Y_res_as_matrix[order, :][:, order]
            condense(Y_res_as_matrix_permuted, Y_res_permuted)
            corrs[perms] = (X_res * Y_res_permuted).sum() / denominator
            perms += 1

    # ... or randomly sample from the space of permutations.
    else:
        corrs = zeros(perms, dtype=float)
        # Store the veridical correlation coefficient first.
        corrs[0] = (X_res * Y_res).sum() / denominator
        for i in range(1, perms):
            # Choose a random order in which to permute the rows and columns.
            order = random.permutation(n)
            Y_res_as_matrix_permuted = Y_res_as_matrix[order, :][:, order]
            condense(Y_res_as_matrix_permuted, Y_res_permuted)
            corrs[i] = (X_res * Y_res_permuted).sum() / denominator

    # Assign veridical correlation to r.
    r = corrs[0]

    # Calculate the empirical p-value for the upper or lower tail.
    if tail == 'upper':
        p = (corrs >= r).sum() / float(perms)
    else:
        p = (corrs <= r).sum() / float(perms)

    # Calculate the standard score.
    m = corrs.mean()
    sd = corrs.std()
    z = (r - m) / sd

    return r, p, z
def computeDistances(descriptors, method, distance=True, parallel=True,
                     distance_func=None, nprocs=4):
    """Compute a validated pairwise distance (or similarity) matrix.

    Parameters
    ----------
    descriptors : array-like of row vectors
        Must not contain NaN or inf; all-zero rows only trigger a warning.
    method :
        Passed to ``computeDistance`` when no ``distance_func`` is given.
    distance : bool
        True: values are distances and the diagonal is set to the largest
        float. False: the diagonal is set to the most negative float.
    parallel : bool
        If true, pairs are dispatched through ``pc.parmap``.
    distance_func : callable, optional
        ``f(a, b) -> value``; replaces ``computeDistance`` when given.
    nprocs : int
        Forwarded to ``pc.parmap``.

    Returns
    -------
    ndarray
        Square float matrix with the sentinel diagonal described above.

    Raises
    ------
    ValueError
        On NaN/inf descriptors, if the pairwise values do not form a
        condensed distance vector, or if the matrix already contains the
        sentinel value that is about to be written on the diagonal.
    """
    num_desc = len(descriptors)

    if np.isnan(descriptors).any():
        raise ValueError('nan in descr!')
    if np.isinf(descriptors).any():
        raise ValueError('inf in descr!')

    for i, row in enumerate(descriptors):
        if not row.any():  # faster
            print('WARNING: complete row {} is 0'.format(i))

    indices = [(y, x) for y in range(num_desc - 1)
               for x in range(y + 1, num_desc)]

    def loop(ind):
        if distance_func is None:
            try:
                dist = computeDistance(descriptors[ind[0]],
                                       descriptors[ind[1]], method)
            except Exception:
                print('method {} failed'.format(method))
                raise
        else:
            dist = distance_func(descriptors[ind[0]], descriptors[ind[1]])
        return dist

    if parallel:
        dists = pc.parmap(loop, indices, nprocs=nprocs)
    else:
        # A real list: np.array cannot consume a lazy map object.
        dists = [loop(ind) for ind in indices]

    dense_vector = np.array(dists, dtype=float)

    if not spdistance.is_valid_y(dense_vector, warning=True):
        n = dense_vector.shape[0]
        d = int(np.ceil(np.sqrt(n * 2)))
        should = d * (d - 1) // 2
        raise ValueError(
            'not a valid condensed distance matrix! '
            '{} != {}, num: {}'.format(should, n, num_desc))
    dist_matrix = spdistance.squareform(dense_vector)

    # do some checks
    if np.isnan(dist_matrix).any():
        print('WARNING have a nan in the dist-matrix')
    if np.isinf(dist_matrix).any():
        print('WARNING have a inf in the dist-matrix')

    limits = np.finfo(dist_matrix.dtype)
    if distance:
        if np.count_nonzero(dist_matrix == limits.max) > 0:
            raise ValueError('there is already a float-maximum')
        np.fill_diagonal(dist_matrix, limits.max)
    else:
        if np.count_nonzero(dist_matrix == limits.min) > 0:
            raise ValueError('there is already a float-min')
        np.fill_diagonal(dist_matrix, limits.min)

    return dist_matrix
group_output.add_argument('-f', '--output-format', default="newick", choices=["newick", "json", "png"], help='The output format. [Default: %(default)s]') args = parser.parse_args() # Load distance matrix dist_matrix_io = DistanceMatrixIO(args.input_distances) dist_matrix = dist_matrix_io.dist_matrix rows_names = dist_matrix_io.names # Process tree tree = None data_link = None if len(rows_names) == 1: tree = Node(rows_names[0]) else: # Computing distance and linkage if not is_valid_y(dist_matrix): dist_matrix = squareform(dist_matrix) data_link = linkage(dist_matrix, args.linkage_method) # SciPy format to Node hc_tree = to_tree(data_link, rd=False) id_2_name = dict(zip(range(len(rows_names)), rows_names)) tree = Node.fromClusterNode(hc_tree, id_2_name) # Write output if args.output_format != "png": # Text outputs out_str = None if args.output_format == "newick": out_str = "{};".format(tree.toNewick()) elif args.output_format == "json": out_str = json.dumps(tree.toDict(), default=lambda o: o.__dict__, sort_keys=False) with open(args.output_tree, "w") as FH_out:
dm = d.squareform(d.pdist(coords)) # Distance matrix using scipy.pdist contact = where(dm < cutoff, ones_like(dm), zeros_like( dm)) # An array with 1 if distance < cutoff, 0 otherwise dms.append(dm) # List of distance matrices from all structures contacts.append( contact) # List of contact matrices from all structures rg = sqrt(sum(dm**2) / (2 * nres**2)) # Calculates radius of gyration f_rg.write("%i\t%.3f\n" % (i / nres + 1, rg)) f_rg.close() ave_dms = average(array(dms), axis=0) # Mean distance matrix from all structures std_dms = std(array(dms), axis=0) # Std distance matrix from all structures ave_contacts = average(array(contacts), axis=0) # Mean contact matrix from all structures if d.is_valid_y(ave_dms): ave_dms = d.squareform(ave_dms) if d.is_valid_y(std_dms): std_dms = d.squareform(ave_dms) if d.is_valid_y(ave_contacts): ave_contacts = d.squareform(ave_contacts) savetxt(f[:-4] + '.dm', ave_dms, fmt="%.3f") savetxt(f[:-4] + '.std', std_dms, fmt="%.3f") savetxt(f[:-4] + '.cm', ave_contacts, fmt="%.3f") m, n = ave_dms.shape scalingf = open(f[:-4] + '.nu', 'w') # File with polymer scaling (r vs. N) scalingfs = open(f[:-4] + '.nus', 'w') # File with std of polymer scaling (r vs. N) for i in range(m): dm_diag = diagonal(ave_dms, i) # Diagonals of distance matrix scalingf.write("%s\t%.3f\n" % (i, average(dm_diag))) # Averaged to get mean r dm_diags = diagonal(std_dms, i) # Diagonals of distance matrix scalingfs.write("%s\t%.3f\n" %
def linkage(D, method='single', metric='euclidean', preserve_input=True):
    '''Hierarchical (agglomerative) clustering on a dissimilarity matrix or on
Euclidean data.

The argument D is either a compressed distance matrix or a collection
of m observation vectors in n dimensions as an (m×n) NumPy array. Apart
from the argument preserve_input, the methods have the same input
parameters and output format as the functions of the same name in the
package scipy.cluster.hierarchy. Therefore, the documentation is not
duplicated here. Please refer to the SciPy documentation for further
details.

The additional, optional argument preserve_input specifies whether the
distance matrix is copied first or whether the existing array is handed
to the clustering routine directly. If the distance matrix is only
generated for the clustering step and is not needed afterwards, half the
memory can be saved by specifying preserve_input=False. Note that the
input array D contains unspecified values after this procedure. It is
therefore safer to write

    linkage(D, method="…", preserve_input=False)
    del D

to make sure the matrix D is not accidentally used after it has been
used as scratch memory.

For method 'single' the input is never copied because of preserve_input,
so the flag has no effect in that case.'''
    if not isinstance(D, ndarray):
        raise ValueError('The first argument must be of type numpy.ndarray.')

    # Observation vectors: derive the condensed distances ourselves.
    if len(D.shape) != 1:
        assert len(D.shape) == 2
        N = D.shape[0]
        Z = empty((N - 1, 4))
        D_ = pdist(D, metric)
        assert D_.dtype == double
        for flag in ('c_contiguous', 'owndata', 'writeable', 'aligned'):
            assert getattr(D_.flags, flag)
        if N > 1:
            linkage_wrap(N, D_, Z, mthidx[method])
        return Z

    # Condensed distance vector.
    if method != 'single' and preserve_input:
        # Work on a private copy so the caller's array stays intact.
        D_ = D.copy()
        assert D_.dtype == double
        for flag in ('c_contiguous', 'owndata', 'writeable', 'aligned'):
            assert getattr(D_.flags, flag)
    else:
        assert D.dtype == double
        if method == 'single':
            needed = ['C']
        else:
            needed = ['C', 'A', 'W', 'O']
        D_ = require(D, dtype=double, requirements=needed)
        if D_ is not D:
            stderr.write('The condensed distance matrix had to be copied since it has the following flags:\n')
            stderr.write(str(D.flags) + '\n')

    is_valid_y(D_, throw=True)
    N = num_obs_y(D_)
    Z = empty((N - 1, 4))
    if N > 1:
        linkage_wrap(N, D_, Z, mthidx[method])
    return Z
def linearized_fuzzy_c_medoids(data, distance_matrix, components=10, eps=1e-4, max_iter=1000, fuzzifier=2,
                               membership_subset_size=None, initialization_method="random_choice",
                               empty_clusters_method="nothing", medoids_idx=None):
    """ Performs the linearized fuzzy c-medoids clustering algorithm on a dataset.

    NOTE: the function is not finished. After validating its arguments it
    always raises `NotImplementedError`; the draft implementation that
    follows the `raise` is currently unreachable.

    :param data: The dataset into which the clustering will be performed. The dataset must be 2D np.array with rows as
    examples and columns as features.
    :param distance_matrix: The pairwise distance matrix applied across all examples from the data matrix. The distance
    matrix must be encoded into a condensed distance vector (see:
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.squareform.html)
    :param components: The number of components (clusters) wanted.
    :param eps: Criterion used to define convergence. If the absolute differences between two consecutive losses is
    lower than `eps`, the clustering stop.
    :param max_iter: Criterion used to stop the clustering if the number of iterations exceeds `max_iter`.
    :param fuzzifier: Membership fuzzification coefficient.
    :param membership_subset_size: Size of subset to inspect during the memberships matrix computation. Reduce
    computations length.
    :param initialization_method: Method used to initialise the centroids. Can take one of the following values :
    * "random_uniform" or "uniform", samples values between the min and max across each dimension.
    * "random_gaussian" or "gaussian", samples values from a gaussian with the same mean and std as each data's
    dimension.
    * "random_choice" or "choice", samples random examples from the data without replacement.
    * "central_dissimilar_medoids", sample the first medoid as the most central point of the dataset, then sample all
    successive medoids as the most dissimilar to all medoids that have already been picked.
    * "central_dissimilar_random_medoids", same as "central_dissimilar_medoids", but the first medoid is sampled
    randomly.
    :param empty_clusters_method: Method used at each iteration to handle empty clusters. Can take one of the
    following values :
    * "nothing", do absolutely nothing and ignore empty clusters.
    * "random_example", assign a random example to all empty clusters.
    * "furthest_example_from_its_centroid", assign the furthest example from its centroid to each empty cluster.
    :param medoids_idx: Initials medoids indexes to use instead of randomly initialize them. Must have shape
    `(components,)`.
    :return: Nothing at present (see the note above). The draft implementation builds a dictionary with the keys
    "memberships", "medoids_indexes", "clusters_center", "losses", "affectations", "ambiguity",
    "partition_coefficient", "partition_entropy" and "extended_time".
    :raises AssertionError: If an argument fails validation.
    :raises NotImplementedError: Always, once validation has passed.
    """
    assert len(data.shape) == 2, "The data must be a 2D array"
    assert data.shape[0] > 0, "The data must have at least one example"
    assert data.shape[1] > 0, "The data must have at least one feature"
    assert is_valid_y(distance_matrix), "The distance matrix is not encoded into a condensed distance vector"
    assert 1 <= components <= data.shape[0], \
        "The number of components wanted must be between 1 and %s" % data.shape[0]
    assert 0 <= max_iter, "The number of max iterations must be positive"
    assert fuzzifier > 1, "The fuzzifier must be greater than 1"
    assert (membership_subset_size is None) or (1 <= membership_subset_size <= data.shape[0]), \
        "The membership subset size wanted must be between 1 and %s" % data.shape[0]
    # `shape` is a tuple, so it has to be compared with a tuple; comparing
    # it with the bare integer rejected every user-supplied array.
    assert (medoids_idx is None) or (medoids_idx.shape == (components,)), \
        "The given medoids indexes do not have a correct shape. Expected shape : {}, given shape : {}".format(
            (components,), medoids_idx.shape
        )
    assert (medoids_idx is None) or np.all(medoids_idx < data.shape[0]), \
        "The provided medoid indexes array contains unreachable indexes"

    raise NotImplementedError("TODO")

    # ---- Unreachable draft implementation below (kept as work in progress) ----

    # If no `membership_subset_size` is specified, [1] suggest to use a value much smaller than the average of points
    # in a cluster
    if membership_subset_size is None:
        membership_subset_size = distance_matrix.shape[0] // components

    # Initialisation
    if medoids_idx is None:
        medoids_idx = cluster_initialization(distance_matrix, components, initialization_method, need_idx=True)

    with tqdm(total=max_iter, bar_format=_FORMAT_PROGRESS_BAR) as progress_bar:
        best_memberships = None
        best_medoids_idx = None
        best_loss = np.inf

        memberships = None
        medoids_idx_old = None
        losses = []
        current_iter = 0
        while (current_iter < max_iter) and \
              ((current_iter < 1) or (not all(medoids_idx == medoids_idx_old))) and \
              ((current_iter < 2) or not (abs(losses[-1] - losses[-2]) <= eps)):

            medoids_idx_old = medoids_idx
            memberships = _compute_memberships(distance_matrix, medoids_idx, fuzzifier)
            handle_empty_clusters(distance_matrix, medoids_idx, memberships, strategy=empty_clusters_method)

            top_memberships_mask = _compute_top_membership_subset(memberships, membership_subset_size)
            medoids_idx = _compute_medoids(distance_matrix, memberships, fuzzifier, top_memberships_mask)

            loss = _compute_loss(distance_matrix, medoids_idx, memberships, fuzzifier)
            losses.append(loss)
            if loss < best_loss:
                best_loss = loss
                best_memberships = memberships
                best_medoids_idx = medoids_idx

            # Update the progress bar
            current_iter += 1
            progress_bar.update()
            progress_bar.set_postfix({
                "Loss": "{0:.6f}".format(loss),
                "best_loss": "{0:.6f}".format(best_loss)
            })

    return {
        "memberships": best_memberships,
        "medoids_indexes": best_medoids_idx,
        "clusters_center": data[best_medoids_idx, :],
        "losses": np.array(losses),
        "affectations": best_memberships.argmax(axis=1),
        "ambiguity": ambiguity(best_memberships),
        "partition_coefficient": partition_coefficient(best_memberships),
        "partition_entropy": partition_entropy(best_memberships),
        "extended_time": progress_bar.last_print_t - progress_bar.start_t,
    }
def computeDistances(descriptors, distance=True, parallel=True, nprocs=None,
                     normalize=False):
    """Compute the pairwise cosine-distance matrix of ``descriptors``.

    Parameters
    ----------
    descriptors : array-like of row vectors
        Must not contain NaN or inf; all-zero rows only trigger a warning.
    distance : bool
        Currently unused; kept so existing callers keep working.
    parallel : bool
        If true, the work is dispatched through ``parmap`` with ``nprocs``.
    nprocs :
        Forwarded to ``parmap``.
    normalize : bool
        If true, the matrix is divided by the sum of all its entries.

    Returns
    -------
    ndarray
        Square matrix of cosine distances (zero diagonal).

    Raises
    ------
    ValueError
        On NaN/inf descriptors or if the pairwise values do not form a
        condensed distance vector.
    """
    num_desc = len(descriptors)

    if np.isnan(descriptors).any():
        raise ValueError('nan in descr!')
    if np.isinf(descriptors).any():
        raise ValueError('inf in descr!')

    for i, row in enumerate(descriptors):
        if not row.any():  # faster than counting non-zeros
            print('WARNING: complete row {} is 0'.format(i))

    indices = [(y, x) for y in range(num_desc - 1)
               for x in range(y + 1, num_desc)]
    splits = np.array_split(np.array(indices), 8)

    def loop(inds):
        return [spdistance.cosine(descriptors[ind[0]], descriptors[ind[1]])
                for ind in inds]

    if parallel:
        dists = parmap(loop, splits, nprocs)
    else:
        # A real list: np.concatenate cannot consume a lazy map object.
        dists = [loop(split) for split in splits]

    # convert condensed vector-form to matrix
    dense_vector = np.concatenate(dists)
    if not spdistance.is_valid_y(dense_vector, warning=True):
        print('ERROR: not a valid condensed distance matrix!')
        n = dense_vector.shape[0]
        d = int(np.ceil(np.sqrt(n * 2)))
        should = d * (d - 1) // 2
        raise ValueError('{} != {}, num: {}'.format(should, n, num_desc))
    dist_matrix = spdistance.squareform(dense_vector)

    # do some checks
    if np.isnan(dist_matrix).any():
        print('WARNING have a nan in the dist-matrix')
    if np.isinf(dist_matrix).any():
        print('WARNING have a inf in the dist-matrix')

    if normalize:
        dist_matrix /= np.sum(dist_matrix)

    return dist_matrix
def Test(X, Y, perms=10000, method='pearson', tail='upper'):
    """
    Takes two distance matrices (either redundant matrices or condensed
    vectors) and performs a Mantel test. The Mantel test is a significance
    test of the correlation between two distance matrices.

    Parameters
    ----------
    X : array_like
        First distance matrix (condensed or redundant).
    Y : array_like
        Second distance matrix (condensed or redundant), where the order of
        elements corresponds to the order of elements in the first matrix.
    perms : int, optional
        The number of permutations to perform (default: 10000). A larger
        number gives more reliable results but takes longer to run. If the
        actual number of possible permutations is smaller, the program will
        enumerate all permutations. Enumeration can be forced by setting
        this argument to 0.
    method : str, optional
        Type of correlation coefficient to use; either 'pearson',
        'spearman', or 'kendall' (default: 'pearson'). N.B. Kendall's tau is
        slow for large matrices.
    tail : str, optional
        Which tail to test in the calculation of the empirical p-value;
        either 'upper' or 'lower' (default: 'upper').

    Returns
    -------
    r : float
        Veridical correlation
    p : float
        Empirical p-value
    z : float
        Standard score (z-score)

    Raises
    ------
    ValueError
        If X or Y is not a valid distance matrix, if their sizes differ or
        are too small, or if ``method`` or ``tail`` is not recognised.
    """
    # Ensure X and Y are arrays.
    X = asarray(X, dtype=float)
    Y = asarray(Y, dtype=float)

    # Check that X and Y are valid distance matrices/vectors.
    if not distance.is_valid_dm(X) and not distance.is_valid_y(X):
        raise ValueError('X is not a valid distance matrix')
    if not distance.is_valid_dm(Y) and not distance.is_valid_y(Y):
        raise ValueError('Y is not a valid distance matrix')

    # Both inputs end up as condensed vectors; one of them is additionally
    # kept in matrix form so that its rows/columns can be permuted.
    x_is_matrix = len(X.shape) == 2
    y_is_matrix = len(Y.shape) == 2
    if x_is_matrix and not y_is_matrix:
        # Only X is available as a matrix: swap roles so that the matrix is
        # the one that gets permuted (the correlation is symmetric).
        Y_as_matrix = X
        X, Y = Y, distance.squareform(X, force='tovector', checks=False)
    else:
        if x_is_matrix:
            X = distance.squareform(X, force='tovector', checks=False)
        if y_is_matrix:
            Y_as_matrix = Y
            Y = distance.squareform(Y, force='tovector', checks=False)
        else:
            Y_as_matrix = distance.squareform(Y, force='tomatrix', checks=False)

    # Check for size equality.
    if X.shape[0] != Y.shape[0]:
        raise ValueError('X and Y are not of equal size')

    # Check for minimum size.
    if X.shape[0] < 3:
        raise ValueError('X and Y should represent at least 3 objects')

    # Assign the relevant correlation function to the variable 'correlate'.
    if method == 'pearson':
        correlate = pearsonr
    elif method == 'spearman':
        correlate = spearmanr
    elif method == 'kendall':
        correlate = kendalltau
    else:
        raise ValueError(
            'The method should be set to "pearson", "spearman", or "kendall"')

    # Validate the tail up front so that a typo is reported before any
    # permutation work is done.
    if tail not in ('upper', 'lower'):
        raise ValueError('The tail should be set to "upper" or "lower"')

    # Determine the size of the matrix (i.e. number of rows/columns).
    n = Y_as_matrix.shape[0]
    n_orders = factorial(n)

    # Scratch buffer for the condensed form of each permuted matrix.
    Y_permuted = zeros(Y.shape[0], dtype=float)

    # Rather than use distance.squareform(), we call directly into the C
    # wrapper for speed.
    condense = distance._distance_wrap.to_vector_from_squareform_wrap

    # Either enumerate all permutations ...
    if perms >= n_orders or perms == 0:
        corrs = zeros(n_orders, dtype=float)
        # The identity permutation comes first, so corrs[0] is the veridical
        # correlation. `perms` ends up as the number of orders enumerated.
        perms = 0
        for order in permutations(range(n)):
            Y_as_matrix_permuted = Y_as_matrix[order, :][:, order]
            condense(Y_as_matrix_permuted, Y_permuted)
            corrs[perms] = correlate(X, Y_permuted)[0]
            perms += 1

    # ... or randomly sample from the space of permutations.
    else:
        corrs = zeros(perms, dtype=float)
        # Store the veridical correlation coefficient first.
        corrs[0] = correlate(X, Y)[0]
        for i in range(1, perms):
            # Choose a random order in which to permute the rows and columns.
            order = random.permutation(n)
            Y_as_matrix_permuted = Y_as_matrix[order, :][:, order]
            condense(Y_as_matrix_permuted, Y_permuted)
            corrs[i] = correlate(X, Y_permuted)[0]

    # Assign veridical correlation to r.
    r = corrs[0]

    # Calculate the empirical p-value for the upper or lower tail.
    if tail == 'upper':
        p = (corrs >= r).sum() / float(perms)
    else:
        p = (corrs <= r).sum() / float(perms)

    # Calculate the standard score.
    m = corrs.mean()
    sd = corrs.std()
    z = (r - m) / sd

    return r, p, z