def binarise(U, technique, param=0.0):
    """
    Convert a (fuzzy) membership matrix into a boolean one.

    Supported techniques (case-insensitive):
        'union' / 'ub':                    U > 0
        'intersection' / 'ib':             U == 1
        'max' / 'mvb':                     entries equal to their row maximum
        'valuethreshold' / 'value' / 'vtb': U >= param
        'stdthresh' / 'std':               row-maximum entries of rows whose
                                           standard deviation exceeds param
        'difference' / 'diff' / 'dtb':     row-maximum entries of rows whose two
                                           largest values differ by more than param
        'top' / 'tb':                      nu.subtractaxis(U, row max, axis=1) <= param

    Rows of U that sum to zero are always all-False in the result.

    :param U: 2-D array-like membership matrix (rows x K columns).
    :param technique: name of the binarisation technique (see above).
    :param param: threshold used by the techniques that need one.
    :return: boolean numpy array with the same shape as U.
    :raises ValueError: if the technique is not recognised.
    """
    # Accept any array-like (e.g. nested lists), not only numpy arrays
    U = np.asarray(U)
    technique = technique.lower()
    all_zero_rows = np.sum(U, axis=1) == 0

    if technique in ('union', 'ub'):
        B = U > 0
    elif technique in ('intersection', 'ib'):
        B = U == 1
    elif technique in ('max', 'mvb'):
        B = nu.isequaltoaxis(U, np.max(U, axis=1), axis=1)
    elif technique in ('valuethreshold', 'value', 'vtb'):
        B = U >= param
    elif technique in ('stdthresh', 'std'):
        is_row_max = nu.isequaltoaxis(U, np.max(U, axis=1), axis=1)
        # The per-row condition is broadcast over the columns
        B = is_row_max & (np.std(U, axis=1) > param)[:, np.newaxis]
    elif technique in ('difference', 'diff', 'dtb'):
        is_row_max = nu.isequaltoaxis(U, np.max(U, axis=1), axis=1)
        Usorted = np.sort(U, axis=1)
        # Gap between the largest and the second largest value of each row
        gap = Usorted[:, -1] - Usorted[:, -2]
        B = is_row_max & (gap > param)[:, np.newaxis]
    elif technique in ('top', 'tb'):
        # NOTE(review): assuming nu.subtractaxis returns U minus the row
        # maximum, the difference is never positive, so any param >= 0 would
        # select every entry; confirm whether (max - U) <= param was intended.
        B = nu.subtractaxis(U, np.max(U, axis=1), axis=1) <= param
    else:
        raise ValueError('The given technique is invalid.')

    B = np.array(B, dtype=bool)
    # Rows with no membership anywhere must not be assigned by any technique
    B[all_zero_rows] = False
    return B
def mseclusters(X, B, donormalise=True, GDM=None):
    """
    Mean squared error of each cluster around its own mean profile,
    averaged over the datasets.

    :param X: one 2-D dataset or a collection of datasets.
    :param B: partition matrix (genes x clusters); a 1-D input is treated as
        a single cluster.
    :param donormalise: if True each dataset is first passed through
        pp.normaliseSampleFeatureMat(x, 4).
    :param GDM: boolean matrix (genes x datasets) used to pick, for each
        dataset, the rows of B that are matched against that dataset's rows;
        all True when omitted.
    :return: 1-D array with one value per cluster (mean over datasets);
        clusters without any member yield nan.
    """
    datasets = np.array(X)
    partition = np.array(B)

    # A lone 2-D dataset is wrapped so it is handled as a collection of one
    if ds.maxDepthOfArray(datasets) == 2:
        datasets = np.expand_dims(datasets, axis=0)
    n_datasets = len(datasets)

    if partition.ndim == 1:
        partition = partition.reshape(-1, 1)
    n_genes = partition.shape[0]
    n_clusters = partition.shape[1]

    presence = (np.ones([n_genes, n_datasets], dtype=bool)
                if GDM is None else np.array(GDM))

    # No check that every dataset has n_genes rows: that validation was
    # disabled when GDM support was added.

    # NOTE(review): the divisor below is the cluster size over all rows of B,
    # not only the rows selected through GDM for a dataset - confirm intended.
    cluster_sizes = [np.sum(column) for column in partition.transpose()]
    dims_per_dataset = [x.shape[1] for x in datasets]

    if donormalise:
        # NOTE(review): the result is indexed like an array below; confirm
        # that pp.normaliseSampleFeatureMat returns an array and not an
        # (array, codes) pair.
        datasets = [pp.normaliseSampleFeatureMat(x, 4) for x in datasets]

    mse = np.zeros([n_datasets, n_clusters], dtype=float)
    for nx in range(n_datasets):
        last_reported = 0
        for k in range(n_clusters):
            # Progress is reported in steps of 100 clusters
            if k - last_reported == 100:
                io.updateparallelprogress(100)
                last_reported = k

            if not partition[:, k].any():
                mse[nx, k] = float('nan')
                continue

            members = datasets[nx][partition[presence[:, nx], k], :]
            deviations = nu.subtractaxis(members, np.mean(members, axis=0), axis=0)
            squared_error = np.sum(np.power(deviations, 2))
            mse[nx, k] = squared_error / dims_per_dataset[nx] / cluster_sizes[k]

        # Report whatever is left after the last full step
        if n_clusters > last_reported:
            io.updateparallelprogress(n_clusters - last_reported)

    return np.mean(mse, axis=0)
def mseclustersfuzzy(X, B, donormalise=True, GDM=None):
    """
    Membership-weighted mean squared error of each cluster, averaged over
    the datasets.

    :param X: one 2-D dataset or a collection of datasets.
    :param B: membership matrix (genes x clusters); a 1-D input is treated
        as a single cluster.
    :param donormalise: if True each dataset is first passed through
        pp.normaliseSampleFeatureMat(x, 4).
    :param GDM: boolean matrix (genes x datasets) used to pick, for each
        dataset, the rows of B that are matched against that dataset's rows;
        all True when omitted.
    :return: 1-D array with one value per cluster (mean over datasets);
        clusters whose memberships sum to zero yield nan.
    """
    datasets = np.array(X)
    memberships = np.array(B)

    # A lone 2-D dataset is wrapped so it is handled as a collection of one
    if ds.maxDepthOfArray(datasets) == 2:
        datasets = np.expand_dims(datasets, axis=0)
    n_datasets = len(datasets)

    if memberships.ndim == 1:
        memberships = memberships.reshape(-1, 1)
    n_genes = memberships.shape[0]
    n_clusters = memberships.shape[1]

    presence = (np.ones([n_genes, n_datasets], dtype=bool)
                if GDM is None else np.array(GDM))

    # No check that every dataset has n_genes rows: that validation was
    # disabled when GDM support was added.

    # Total membership per cluster
    cluster_totals = [np.sum(column) for column in memberships.transpose()]
    dims_per_dataset = [x.shape[1] for x in datasets]

    if donormalise:
        # NOTE(review): the result is used like an array below; confirm that
        # pp.normaliseSampleFeatureMat returns an array and not an
        # (array, codes) pair.
        datasets = [pp.normaliseSampleFeatureMat(x, 4) for x in datasets]

    mse = np.zeros([n_datasets, n_clusters], dtype=float)
    for nx in range(n_datasets):
        rows = presence[:, nx]
        for k in range(n_clusters):
            if cluster_totals[k] == 0:
                mse[nx, k] = float('nan')
                continue

            # NOTE(review): described as the weighted cluster mean, but no
            # reduction over genes is visible here; whether this is a vector
            # or a matrix depends on nu.multiplyaxis - confirm.
            centre = nu.multiplyaxis(
                datasets[nx], memberships[rows, k], axis=1) / cluster_totals[k]
            errors = nu.subtractaxis(datasets[nx], centre, axis=0)
            weighted_errors = nu.multiplyaxis(errors, memberships[rows, k], axis=1)
            squared_error = np.sum(np.power(weighted_errors, 2))
            mse[nx, k] = squared_error / dims_per_dataset[nx] / cluster_totals[k]

    return np.mean(mse, axis=0)
def normaliseSampleFeatureMat(X, type):
    """
    Xout, codes = normaliseSampleFeatureMat(X, type)

    Normalise a 2-D matrix X with one normalisation code, or with a sequence
    of codes applied one after the other (e.g. [3, 1] applies code 3 to X
    and then code 1 to the result).

    Codes:
        0    none
        1    divide each row by its mean
        2    divide each row by its first value
        3    take log2 (non-positive values become NaN and are then handed
             to fixnans)
        31   set all values <= 1 to 1, then take log2 (non-negative log)
        4    subtract the row mean and divide by the row std (constant rows
             become zeros)
        5    divide each row by its sum
        6    subtract the row mean
        7    divide each row by its maximum
        8    2 to the power X
        9    subtract the row minimum
        10   rank within each column: 0 for lowest, then 1, 2, ...; average
             on ties
        11   rank like 10 but with arbitrary order on ties
        12   normalise each row to the [0, 1] range
        13   filterBimodal (genes with low values everywhere are set to
             zeros; bimodal distribution is fit to the maxima of rows)
        101  quantile normalisation
        102  subtract the column (sample) means
        103  subtract the global mean
        1000 automatically detect the normalisation (autoNormalise)

    :param X: 2-D array-like.
    :param type: a single code or a list/tuple/array of codes. (The name
        shadows the builtin; it is kept because it is part of the interface.)
    :return: (Xout, codes) where Xout is the normalised copy of X and codes
        are the codes reported as applied; they equal the input unless code
        1000 was expanded.
    """
    Xout = np.array(X)
    # Stays as the input code(s) unless auto-normalisation (1000) changes it
    codes = np.array(type)

    if isinstance(type, (list, tuple, np.ndarray)) and np.ndim(type) > 0:
        # Apply the codes one by one; a code that expands into several (1000)
        # is replaced in `codes` by the codes it expanded to.
        j = 0
        for single_type in type:
            Xout, codesi = normaliseSampleFeatureMat(Xout, single_type)
            if np.ndim(codesi) > 0:
                codes[j] = codesi[0]
                codes = np.insert(codes, j + 1, codesi[1:])
                j += len(codesi)
            else:
                j += 1
        return Xout, codes

    if type == 1:
        # 1: Divide by the mean
        Xout = nu.divideaxis(Xout, np.mean(Xout, axis=1), 1)

    elif type == 2:
        # 2: Divide by the first value (column index 0)
        Xout = nu.divideaxis(Xout, Xout[:, 0], 1)

    elif type == 3:
        # 3: Take log2
        if not np.issubdtype(Xout.dtype, np.floating):
            # NaN cannot be stored in an integer array
            Xout = Xout.astype(float)
        Xout[Xout <= 0] = float('nan')
        Xout = np.log2(Xout)
        ind1 = np.any(isnan(Xout), axis=1)
        Xout[ind1] = fixnans(Xout[ind1])

    elif type == 31:
        # 31: Set all values <= 1 to 1 then take log2 (guarantees a non-negative log)
        Xout[Xout <= 1] = 1
        Xout = np.log2(Xout)

    elif type == 4:
        # 4: Subtract the mean and divide by the std
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=1), axis=1)
        ConstGenesIndices = np.std(Xout, axis=1) == 0
        Xout = nu.divideaxis(Xout, np.std(Xout, axis=1), axis=1)
        # Constant rows were divided by a zero std; set them to zeros
        Xout[ConstGenesIndices] = 0

    elif type == 5:
        # 5: Divide by the sum
        Xout = nu.divideaxis(Xout, np.sum(Xout, axis=1), axis=1)

    elif type == 6:
        # 6: Subtract the mean
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=1), axis=1)

    elif type == 7:
        # 7: Divide by the maximum
        Xout = nu.divideaxis(Xout, np.max(Xout, axis=1), axis=1)

    elif type == 8:
        # 8: (2 to the power X)
        Xout = np.power(2, Xout)

    elif type == 9:
        # 9: Subtract the min
        Xout = nu.subtractaxis(Xout, np.min(Xout, axis=1), axis=1)

    elif type == 10:
        # 10: Rank: 0 for lowest, then 1, 2, ...; average on ties
        Xout = spmstats.rankdata(Xout, axis=0) - 1

    elif type == 11:
        # 11: Rank: 0 for lowest, then 1, 2, ...; arbitrary order on ties
        Xout = np.argsort(np.argsort(Xout, axis=0), axis=0)

    elif type == 12:
        # 12: Normalise to the [0 1] range
        Xout = nu.subtractaxis(Xout, np.min(Xout, axis=1), axis=1)
        Xout = nu.divideaxis(Xout, np.max(Xout, axis=1), axis=1)

    elif type == 13:
        # 13: Genes with low values everywhere are set to zeros; bimodal
        # distribution is fit to maxima of rows
        Xout = filterBimodal(X)

    elif type == 101:
        # 101: quantile - every column gets the same distribution: the mean
        # of the sorted columns, mapped back through each column's ranks
        av = np.mean(np.sort(Xout, axis=0), axis=1)
        II = np.argsort(np.argsort(Xout, axis=0), axis=0)
        Xout = av[II]

    elif type == 102:
        # 102: subtract the mean of each sample (column) from it
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=0), axis=0)

    elif type == 103:
        # 103: subtract the global mean of the data (not in place, so that
        # integer input is promoted to float instead of raising)
        Xout = Xout - np.mean(Xout)

    elif type == 1000:
        # 1000: automatically detect normalisation
        codes = autoNormalise(Xout)
        Xout = normaliseSampleFeatureMat(Xout, codes)[0]
        # NOTE(review): 101 is prepended to the reported codes although only
        # the auto-detected codes are applied here - confirm intended.
        codes = np.append([101], codes)

    return Xout, codes
def correcterrors_withinworse(B, X, GDM, falsepositivestrimmed=0.01):
    """
    Re-assign genes to clusters by requiring, in every dataset and every
    dimension, an absolute deviation from the cluster median that does not
    exceed the largest (optionally trimmed) deviation observed among the
    genes currently assigned to any cluster.

    :param B: boolean partition matrix (genes x clusters).
    :param X: collection of datasets, converted with
        ds.listofarrays2arrayofarrays.
    :param GDM: boolean matrix (genes x datasets); column l selects the rows
        of B that correspond to the rows of dataset l.
    :param falsepositivestrimmed: fraction of the pooled deviations, counted
        from the high end of each sorted column, discarded before the
        tolerance is taken.
    :return: boolean matrix (genes x clusters) with at most one True per row.
    """
    partition = np.array(B)
    datasets = ds.listofarrays2arrayofarrays(X)

    n_genes, n_clusters = partition.shape
    n_datasets = datasets.shape[0]

    # Per dataset: cluster medians (clusters x dims) and the pooled absolute
    # deviations of member genes from their cluster median (members x dims)
    centres = np.array([None] * n_datasets, dtype=object)
    deviations = np.array([None] * n_datasets, dtype=object)
    for l in range(n_datasets):
        present = GDM[:, l]
        n_dims = datasets[l].shape[1]
        centres[l] = np.zeros([n_clusters, n_dims])
        deviations[l] = np.zeros(
            [np.sum(np.sum(partition[present, :], axis=0)), n_dims])

        row = 0
        for k in range(n_clusters):
            members = datasets[l][partition[present, k], :]
            centres[l][k] = np.median(members, axis=0)
            size = np.sum(partition[present, k])
            shifted = nu.subtractaxis(members, centres[l][k], axis=0)
            deviations[l][row:(row + size), :] = np.abs(shifted)
            row += size

        # Drop the rows that are zero in every dimension, then sort each
        # column independently in ascending order
        nonzero_rows = np.any(deviations[l], axis=1)
        deviations[l] = np.sort(deviations[l][nonzero_rows], axis=0)

        if falsepositivestrimmed > 0:
            n_trim = int(falsepositivestrimmed * deviations[l].shape[0])
            if n_trim > 0:
                # After the ascending sort this removes the last n_trim rows,
                # i.e. the largest deviations of every column
                deviations[l] = deviations[l][0:-n_trim]

    # A gene belongs to cluster k in dataset l when, in every dimension, its
    # distance from the cluster median is within the largest kept deviation
    belongs = np.ones([n_genes, n_clusters, n_datasets], dtype=bool)
    for l in range(n_datasets):
        for k in range(n_clusters):
            for d in range(datasets[l].shape[1]):
                distance = np.abs(datasets[l][:, d] - centres[l][k, d])
                belongs[GDM[:, l], k, l] &= distance <= np.max(deviations[l][:, d])

    # Keep only the memberships that hold in all datasets
    assigned = np.all(belongs, axis=2)

    # A gene claimed by several clusters goes to the one whose worst distance
    # (over the datasets holding the gene) is the smallest
    contested = np.nonzero(np.sum(assigned, axis=1) > 1)[0]
    for gene in contested:
        rivals = np.nonzero(assigned[gene])[0]  # competing clusters
        holders = np.nonzero(GDM[gene])[0]  # datasets that have this gene
        distances = np.zeros([len(rivals), len(holders)])
        for column, ll in enumerate(holders):
            local_row = np.sum(GDM[:gene, ll])  # row of the gene in dataset ll
            distances[:, column] = nu.dist_matrices(
                centres[ll][rivals], datasets[ll][local_row]).reshape([len(rivals)])
        winner = np.argmin(np.max(distances, axis=1))
        assigned[gene] = False
        assigned[gene, rivals[winner]] = True

    return assigned
def optimise_tukey_sqrtSCG(B, X, GDM, clustdists=None, smallestClusterSize=11,
                           tails=1, Q3s=2):
    """
    Rebuild the clusters as non-overlapping per-dimension value ranges
    centred on the cluster medians, then re-assign genes to them.

    Steps:
      1. Per dataset, compute the cluster medians and pool the absolute
         deviations of the member genes of the "good" clusters (those
         returned by mnplotsdistancethreshold) from their cluster median.
      2. Zero the outlying deviations (Tukey-style threshold) and use the
         largest remaining deviation per dimension as the half-width of
         every cluster's range around its median.
      3. Resolve overlaps between the ranges of each pair of clusters.
      4. Assign each gene to the clusters whose range contains it in all
         dimensions of all datasets; genes in several clusters keep only the
         earliest one; clusters below the minimum size are dropped.

    :param B: boolean partition matrix (Ng genes x K clusters).
    :param X: collection of L datasets, converted with
        ds.listofarrays2arrayofarrays.
    :param GDM: boolean matrix (genes x datasets); column l selects the rows
        of B that correspond to the rows of dataset l.
    :param clustdists: per-cluster values passed to
        mnplotsdistancethreshold(..., method='largestgap'); ones if omitted.
    :param smallestClusterSize: clusters with fewer genes are removed from
        the output.
    :param tails: 1 -> outliers are deviations above Q3s * Q3;
        2 -> outliers are deviations whose square root is above
        Q3 + 1.5 * IQR of the square-rooted deviations.
    :param Q3s: multiplier of the third quartile when tails == 1.
    :return: boolean matrix (genes x kept clusters).
    :raises ValueError: if tails is neither 1 nor 2 (checked only when there
        are deviations to threshold).
    """
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    Ng, K = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    clustdistsloc = np.ones(K) if clustdists is None else list(clustdists)

    # Clusters' medians (Cmeans) and absolute shifted clusters genes (SCG)
    Cmeans = np.array([None] * L, dtype=object)
    SCG = np.array([None] * L, dtype=object)

    Cgood = mnplotsdistancethreshold(clustdistsloc, method='largestgap')
    for l in range(L):
        present = GDM[:, l]
        D = Xloc[l].shape[1]
        Cmeans[l] = np.zeros([K, D])  # K clusters x D dimensions
        # One row per gene assigned to any cluster; rows of clusters that are
        # not in Cgood stay zero and are removed below
        SCG[l] = np.zeros([np.sum(np.sum(Bloc[present, :], axis=0)), D])
        gi = 0
        for k in range(K):
            members = Xloc[l][Bloc[present, k], :]
            Cmeans[l][k] = np.median(members, axis=0)
            if k in Cgood:
                csize = np.sum(Bloc[present, k])
                tmpSCG = nu.subtractaxis(members, Cmeans[l][k], axis=0)
                SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG)
                gi += csize
        SCG[l] = SCG[l][np.any(SCG[l], axis=1)]  # Remove all-zero rows

        if SCG[l].size > 0:
            if tails == 1:
                Q3 = np.percentile(SCG[l], q=75, axis=0)
                SCGouts = SCG[l] > Q3s * Q3
            elif tails == 2:
                rootSCG = np.sqrt(SCG[l])
                Q1 = np.percentile(rootSCG, q=25, axis=0)
                Q3 = np.percentile(rootSCG, q=75, axis=0)
                SCGouts = rootSCG > Q3 + 1.5 * (Q3 - Q1)
            else:
                raise ValueError(
                    'Invalid number of tails. It should be either 1 or 2.')
            # Zero the outliers so they do not affect the maxima taken below
            SCG[l][SCGouts] = 0.0
        else:
            # Nothing pooled: a single zero row keeps np.max usable below
            SCG[l] = np.zeros((1, SCG[l].shape[1]))

    # Cluster ranges: median -/+ the largest kept deviation per dimension
    Cmins = np.array([None] * L, dtype=object)
    Cmaxes = np.array([None] * L, dtype=object)
    for l in range(L):
        halfwidth = np.max(SCG[l], axis=0)
        Cmins[l] = Cmeans[l] - halfwidth  # K clusters x D dimensions
        Cmaxes[l] = Cmeans[l] + halfwidth  # K clusters x D dimensions

    # Resolve overlaps between clusters; each pair is compared once and k2 is
    # always a later cluster than k1
    for k1 in range(K):
        for k2 in range(k1 + 1, K):
            # Smallest overlap between the ranges of k1 and k2 (value v),
            # where it was found (dataset l, dimension d) and its type t:
            #   -1: one range lies within the other wherever they were
            #       compared (the worst case)
            #    0: the max of k1 lies inside the range of k2
            #    1: the max of k2 lies inside the range of k1
            #    2: no overlap in some dimension
            v_smallestoverlap = 0
            l_smallestoverlap = -1
            d_smallestoverlap = -1
            t_smallestoverlap = -1
            for l in range(L):
                for d in range(len(Cmins[l][k1])):
                    x1 = Cmaxes[l][k1][d]
                    x2 = Cmaxes[l][k2][d]
                    n1 = Cmins[l][k1][d]
                    n2 = Cmins[l][k2][d]
                    if x1 > n2 and x1 <= x2:
                        if n1 < n2:
                            ov = x1 - n2
                            if t_smallestoverlap == -1 or ov < v_smallestoverlap:
                                t_smallestoverlap = 0
                                v_smallestoverlap = ov
                                l_smallestoverlap = l
                                d_smallestoverlap = d
                    elif x2 > n1 and x2 <= x1:
                        if n2 < n1:
                            ov = x2 - n1
                            if t_smallestoverlap == -1 or ov < v_smallestoverlap:
                                t_smallestoverlap = 1
                                v_smallestoverlap = ov
                                l_smallestoverlap = l
                                d_smallestoverlap = d
                    else:
                        # NOTE(review): the scan goes on after a
                        # non-overlapping dimension is found, so a later
                        # dimension can replace type 2 when its overlap is
                        # smaller than one recorded earlier; confirm whether
                        # the scan was meant to stop here.
                        t_smallestoverlap = 2

            if t_smallestoverlap == -1:
                # One cluster swallows the other: remove the later one (k2)
                # by making its minimum larger than its maximum at a single
                # point (l=0, d=0), so no gene can ever fall inside it
                Cmins[0][k2][0] = 1
                Cmaxes[0][k2][0] = 0
            elif t_smallestoverlap == 0:
                # Start k2 just above the end of k1 at the smallest overlap
                Cmins[l_smallestoverlap][k2][d_smallestoverlap] = \
                    Cmaxes[l_smallestoverlap][k1][d_smallestoverlap] + sys.float_info.epsilon
            elif t_smallestoverlap == 1:
                # End k2 just below the start of k1 at the smallest overlap
                Cmaxes[l_smallestoverlap][k2][d_smallestoverlap] = \
                    Cmins[l_smallestoverlap][k1][d_smallestoverlap] - sys.float_info.epsilon
            # Type 2: the clusters are distinct, nothing to resolve

    # A gene belongs to a cluster in a dataset when it lies within the
    # cluster's range in every dimension
    belongs = np.ones([Ng, K, L], dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        for k in range(K):
            tmp1 = nu.largerthanaxis(Xloc[l], Cmins[l][k], axis=0, orequal=True)
            tmp2 = nu.lessthanaxis(Xloc[l], Cmaxes[l][k], axis=0, orequal=True)
            belongs[GDM[:, l], k, l] = np.all(np.logical_and(tmp1, tmp2), axis=1)

    # Keep only the memberships that hold in all datasets
    B_out = np.all(belongs, axis=2)

    # Genes still included in several clusters stay only in the earliest one
    # (smallest k)
    for fi in np.nonzero(np.sum(B_out, axis=1) > 1)[0]:
        earliest = np.min(np.nonzero(B_out[fi])[0])
        B_out[fi] = False
        B_out[fi, earliest] = True

    # Remove clusters smaller than the minimum cluster size
    ClusterSizes = np.sum(B_out, axis=0)
    B_out = B_out[:, ClusterSizes >= smallestClusterSize]

    return B_out
def correcterrors_weighted_outliers2(B, X, GDM, clustdists=None, stds=3,
                                     smallestClusterSize=11):
    """
    Re-assign genes to clusters using a per-dimension tolerance taken from
    the pooled absolute deviations of the member genes from their cluster
    medians, after zeroing the deviations that lie more than `stds` weighted
    standard deviations above the weighted mean.

    :param B: boolean partition matrix (Ng genes x K clusters).
    :param X: collection of L datasets, converted with
        ds.listofarrays2arrayofarrays.
    :param GDM: boolean matrix (genes x datasets); column l selects the rows
        of B that correspond to the rows of dataset l.
    :param clustdists: per-cluster values; cluster k gets the weight
        min(clustdists) / clustdists[k]. All weights are one if omitted.
    :param stds: number of weighted standard deviations above the weighted
        mean beyond which a deviation is treated as an outlier.
    :param smallestClusterSize: clusters with fewer genes are removed from
        the output.
    :return: boolean matrix (genes x kept clusters) with at most one True per
        row.
    """
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    Ng, K = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Normalise clustdists to provide weights; unity for all if not provided
    if clustdists is None:
        clustweights = np.ones(K)
    else:
        clustdistsloc = np.asarray(clustdists)  # a plain list cannot be divided
        clustweights = np.min(clustdistsloc) / clustdistsloc

    # Clusters' medians (Cmeans) and absolute shifted clusters genes (SCG)
    Cmeans = np.array([None] * L, dtype=object)
    SCG = np.array([None] * L, dtype=object)
    for l in range(L):
        present = GDM[:, l]
        D = Xloc[l].shape[1]
        n_assigned = np.sum(np.sum(Bloc[present, :], axis=0))
        Cmeans[l] = np.zeros([K, D])  # K clusters x D dimensions
        SCG[l] = np.zeros([n_assigned, D])  # one row per assigned gene
        w = np.zeros(n_assigned)  # weight of the cluster of each row
        gi = 0
        for k in range(K):
            members = Xloc[l][Bloc[present, k], :]
            Cmeans[l][k] = np.median(members, axis=0)
            csize = np.sum(Bloc[present, k])
            tmpSCG = nu.subtractaxis(members, Cmeans[l][k], axis=0)
            SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG)
            w[gi:(gi + csize)] = clustweights[k]
            gi += csize

        # Remove all-zero rows, and their weights with them so that the
        # weights keep the same length as SCG[l]
        keep = np.any(SCG[l], axis=1)
        SCG[l] = SCG[l][keep]
        w = w[keep]

        # NOTE(review): sorting every column independently breaks the
        # row-to-weight correspondence used just below; kept as in the
        # original - confirm whether the sort is wanted here.
        SCG[l] = np.sort(SCG[l], axis=0)
        SCGmeans = np.average(SCG[l], weights=w, axis=0)
        # NOTE(review): the keyword is spelled `axix` in the original call and
        # is kept because the signature of st.weighted_std_axis is not
        # visible here - confirm.
        SCGstds = st.weighted_std_axis(SCG[l], weights=w, axix=0)
        # Number of stds away from the weighted mean
        SCGouts = nu.divideaxis(nu.subtractaxis(SCG[l], SCGmeans, axis=0),
                                SCGstds, axis=0)
        # TRUE for outliers (bool: rows of SCG[l] x D dimensions)
        SCGouts = SCGouts > stds
        # Zero the outliers so they do not affect the maxima taken below
        SCG[l][SCGouts] = 0.0

    # A gene belongs to cluster k in dataset l when, in every dimension, its
    # distance from the cluster median is within the largest kept deviation
    belongs = np.ones([Ng, K, L], dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        for k in range(K):
            for d in range(Xloc[l].shape[1]):
                tmpX = np.abs(Xloc[l][:, d] - Cmeans[l][k, d])
                belongs[GDM[:, l], k, l] &= tmpX <= np.max(SCG[l][:, d])

    # Keep only the memberships that hold in all datasets
    B_out = np.all(belongs, axis=2)

    # Genes included in several clusters stay only in the earliest one
    # (smallest k)
    for fi in np.nonzero(np.sum(B_out, axis=1) > 1)[0]:
        earliest = np.min(np.nonzero(B_out[fi])[0])
        B_out[fi] = False
        B_out[fi, earliest] = True

    # Remove clusters smaller than the minimum cluster size
    ClusterSizes = np.sum(B_out, axis=0)
    B_out = B_out[:, ClusterSizes >= smallestClusterSize]

    return B_out