def uncles(X, type='A', Ks=[n for n in range(4, 21, 4)], params=None, methods=None, methodsDetailed=None, U=None,
           Utype='PM', relabel_technique='minmin', setsP=None, setsN=None, dofuzzystretch=False, wsets=None,
           wmethods=None, GDM=None, smallestClusterSize=11, CoPaMfinetrials=1, CoPaMfinaltrials=1,
           binarise_techniqueP='DTB', binarise_paramP=np.arange(0.0, 1.1, 0.1, dtype='float'),
           binarise_techniqueN='DTB',
           binarise_paramN=np.concatenate(([sys.float_info.epsilon], np.arange(0.1, 1.1, 0.1, dtype='float'))),
           Xnames=None, deterministic=False, ncores=1):
    """Cluster several datasets at several K values and binarise the consensus partitions.

    For every selected dataset and every K, base partitions are produced with
    ``clustDataset`` (unless ``U`` is given), merged per dataset into a "fine"
    CoPaM with ``generateCoPaM`` / ``generateCoPaMfromidx``, merged across
    datasets into final CoPaMs, and finally binarised with ``binarise`` at
    every binarisation parameter (clusters are reordered with ``sortclusters``
    between two binarisation passes).

    Parameters:
        X: collection of datasets; converted with
            ``ds.listofarrays2arrayofarrays``. The first dimension of the first
            dataset is taken as the number of genes ``Ng`` when ``GDM`` is None.
        type: 'A' (one consensus over all datasets) or 'B' (a positive
            consensus over ``setsP`` from which genes found in the negative
            consensus over ``setsN`` are removed).
        Ks: candidate numbers of clusters; values larger than ``Ng`` are dropped.
            The default list is never mutated by this function.
        params: optional dict; its entries are merged into the returned params.
        methods: list of base clustering method specifications (each itself a
            list). Defaults to ``[['k-means']]``. When given and the largest
            dataset has more rows than the module-level
            ``maxgenesinsetforpdist``, entries containing 'hc' are removed; if
            none remain the process is terminated with ``sys.exit()``.
        methodsDetailed: optional per-dataset methods, indexed by dataset.
        U: optional precomputed partitions (indexed by dataset, then by K).
        Utype: 'PM' or 'idx' (case-insensitive); forced to 'PM' when ``U`` is None.
        relabel_technique: forwarded to the CoPaM generators.
        setsP, setsN: dataset indices of the positive / negative sets. Default
            to the first and second half of the datasets respectively.
        dofuzzystretch: if true, ``fuzzystretch`` is applied to each fine CoPaM.
        wsets: per-dataset weights (original dataset order); defaults to ones.
        wmethods: per-method weights, either one flat sequence shared by all
            datasets or one sequence per dataset; defaults to ones.
        GDM: boolean matrix with one column per dataset (original order);
            defaults to all True. NOTE(review): presumably marks which genes
            are present in which dataset - confirm against callers.
        smallestClusterSize: forwarded to ``sortclusters``.
        CoPaMfinetrials, CoPaMfinaltrials: number of repeated CoPaM
            generations at the per-dataset and the final level.
        binarise_techniqueP, binarise_paramP, binarise_techniqueN,
        binarise_paramN: technique and parameter grid forwarded to
            ``binarise`` for the positive / negative CoPaMs.
        Xnames: dataset names; only defaulted here, not otherwise used.
        deterministic: not used by this function.
        ncores: ``n_jobs`` given to ``Parallel`` for the base clustering.

    Returns:
        ``UnclesRes`` namedtuple ``(B, Mc, params, X, U)``: the object array of
        binarised partitions (trials x P-params x N-params x Ks), the cluster
        size summaries, the merged parameter dict, the selected datasets and
        the base partitions.

    Raises:
        ValueError: if ``type`` is not 'A'/'B' or ``Utype`` is not 'PM'/'idx'.
    """
    Xloc = ds.listofarrays2arrayofarrays(X)
    L = len(Xloc)  # Number of datasets

    # Fix parameters
    if params is None:
        params = {}
    if setsP is None:
        setsP = [x for x in range(int(math.floor(L / 2)))]
    if setsN is None:
        setsN = [x for x in range(int(math.floor(L / 2)), L)]
    setsPN = np.array(np.concatenate((setsP, setsN), axis=0), dtype=int)
    Xloc = Xloc[setsPN]
    L = np.shape(Xloc)[0]  # Number of datasets actually used
    if wsets is None:
        wsets = np.array([1 for x in range(L)])
    else:
        wsets = np.array(wsets)[setsPN]
    if GDM is None:
        Ng = np.shape(Xloc[0])[0]
        GDMloc = np.ones([Ng, L], dtype='bool')
    else:
        GDMloc = GDM[:, setsPN]
        Ng = GDMloc.shape[0]
    if Xnames is None:
        Xnames = ['X{0}'.format(l) for l in range(L)]
    if methods is None:
        methods = [['k-means']]
    else:
        largest_DS = np.max([x.shape[0] for x in Xloc])
        if largest_DS > maxgenesinsetforpdist:
            # Drop every method specification that mentions HC
            methods = [m for m in methods if 'hc' not in [entry.lower() for entry in m]]
            if not methods:
                io.log('No valid base clustering can be used. Please note that clust would not use HC clustering '
                       'on datasets with more than {0} genes. You have a dataset with {1} genes.'
                       ''.format(maxgenesinsetforpdist, largest_DS))
                io.log('Clust will terminate here.')
                io.log(op.bottomline(), addextrastick=False)
                sys.exit()
    if methodsDetailed is None:
        methodsDetailedloc = np.array([methods for l in range(L)])
    else:
        methodsDetailedloc = methodsDetailed[setsPN]
    if wmethods is None:
        wmethods = [[1 for x in m] for m in methodsDetailedloc]
    elif not isinstance(wmethods[0], (list, tuple, np.ndarray)):
        # A flat list of weights is shared by all datasets: repeat the WEIGHTS
        # (not the method names) once per dataset.
        wmethods = np.tile(wmethods, [L, 1])
    else:
        wmethods = np.array(wmethods)[setsPN]

    # Positions of the positive / negative datasets within the reordered Xloc.
    # setsNloc is an empty list (rather than undefined) when there are no
    # negative datasets.
    setsPloc = [ii for ii in range(len(setsP))]
    setsNloc = [ii for ii in range(len(setsPloc), L)]

    Ks = np.array(Ks)
    Ks = Ks[Ks <= Ng]  # Remove Ks that are larger than the number of genes Ng
    Ks = Ks.tolist()
    NKs = len(Ks)  # Number of K values

    NPp = len(binarise_paramP)  # Number of P params
    NNp = len(binarise_paramN)  # Number of N params

    # If the dataset is empty, return basic output
    if Ng == 0:
        if type == 'A':
            B = np.zeros([CoPaMfinaltrials, NPp, 1, NKs], dtype=object)
            Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
        elif type == 'B':
            B = np.zeros([CoPaMfinaltrials, NPp, NNp, NKs], dtype=object)
            Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
        else:
            raise ValueError('Invalid UNCLES type. It has to be either A or B')

        # Same keys as the regular return path below
        params = dict(params, **{
            'methods': methods,
            'setsP': setsPloc,
            'setsN': setsNloc,
            'dofuzzystretch': dofuzzystretch,
            'type': type,
            'Ks': Ks,
            'NKs': NKs,
            'wsets': wsets,
            'wmethods': wmethods,
            'L': L,
            'CoPaMs': np.array([None] * (CoPaMfinaltrials * NKs)).reshape([CoPaMfinaltrials, NKs]),
            'smallestclustersize': smallestClusterSize,
            'GDM': GDMloc
        })

        Uloc = np.array([None] * (L * NKs)).reshape([L, NKs])

        UnclesRes = collections.namedtuple('UnclesRes', ['B', 'Mc', 'params', 'X', 'U'])
        return UnclesRes(B, Mc, params, Xloc, Uloc)

    # Clustering
    if U is None:
        Utype = 'PM'
        Uloc = np.array([None] * (L * NKs)).reshape([L, NKs])
        # Size of the progress counter handed to io.resetparallelprogress
        totalparallel = np.sum(Ks) * np.sum([len(meths) for meths in methodsDetailedloc])
        for meths in methodsDetailedloc:
            for meth in meths:
                if 'k-means' in meth:
                    totalparallel += np.max(Ks) * np.max(Ks)
        io.resetparallelprogress(totalparallel)

        for l in range(L):
            # Cache kmeans initialisations for the dataset once to save time:
            cl.cache_kmeans_init(Xloc[l], Ks, methodsDetailedloc[l], datasetID=l)

            # Now go to parallel clustering
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                Utmp = Parallel(n_jobs=ncores)(
                    delayed(clustDataset)(Xloc[l], Ks[ki], methodsDetailedloc[l], GDMloc[:, l], Ng, l)
                    for ki in range(NKs))

                for ki in range(NKs):
                    Uloc[l, ki] = Utmp[ki]

                gc.collect()
    else:
        Uloc = ds.listofarrays2arrayofarrays(U)[setsPN]

    # Calculate a CoPaM for each dataset at each K
    CoPaMsFine = np.array([None] * (L * NKs)).reshape([L, NKs])
    for l in range(L):
        for ki in range(NKs):
            if Utype.lower() == 'pm':
                CoPaMsFineTmp = [
                    generateCoPaM(Uloc[l, ki], relabel_technique=relabel_technique, X=[Xloc[l]],
                                  w=wmethods[l], K=Ks[ki], GDM=GDMloc[:, l].reshape([-1, 1]))
                    for i in range(CoPaMfinetrials)
                ]
            elif Utype.lower() == 'idx':
                CoPaMsFineTmp = [
                    generateCoPaMfromidx(Uloc[l, ki], relabel_technique=relabel_technique, X=Xloc,
                                         w=wmethods[l], K=Ks[ki])
                    for i in range(CoPaMfinetrials)
                ]
            else:
                raise ValueError('Invalid Utype')
            CoPaMsFine[l, ki] = generateCoPaM(CoPaMsFineTmp, relabel_technique=relabel_technique, X=[Xloc[l]],
                                              GDM=GDMloc[:, l].reshape([-1, 1]))

            if dofuzzystretch:
                CoPaMsFine[l, ki] = fuzzystretch(CoPaMsFine[l, ki])

    # Calculate the final CoPaM for each K
    CoPaMs = np.array([None] * (CoPaMfinaltrials * NKs)).reshape([CoPaMfinaltrials, NKs])
    CoPaMsP = np.array([None] * (CoPaMfinaltrials * NKs)).reshape([CoPaMfinaltrials, NKs])
    CoPaMsN = np.array([None] * (CoPaMfinaltrials * NKs)).reshape([CoPaMfinaltrials, NKs])
    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                if Utype.lower() == 'pm':
                    CoPaMs[t, ki] = generateCoPaM(CoPaMsFine[:, ki], relabel_technique=relabel_technique,
                                                  w=wsets, X=Xloc, GDM=GDMloc)
                elif Utype.lower() == 'idx':
                    CoPaMs[t, ki] = generateCoPaMfromidx(CoPaMsFine[:, ki], relabel_technique=relabel_technique,
                                                         X=Xloc, w=wsets, GDM=GDMloc)
                else:
                    raise ValueError('Invalid Utype')
            elif type == 'B':
                if Utype.lower() == 'pm':
                    CoPaMsP[t, ki] = generateCoPaM(CoPaMsFine[setsPloc, ki], relabel_technique=relabel_technique,
                                                   X=Xloc, w=wsets[setsPloc], GDM=GDMloc[:, setsPloc])
                    CoPaMsN[t, ki] = generateCoPaM(CoPaMsFine[setsNloc, ki], relabel_technique=relabel_technique,
                                                   X=Xloc, w=wsets[setsNloc], GDM=GDMloc[:, setsNloc])
                elif Utype.lower() == 'idx':
                    CoPaMsP[t, ki] = generateCoPaMfromidx(CoPaMsFine[setsPloc, ki],
                                                          relabel_technique=relabel_technique,
                                                          X=Xloc, w=wsets[setsPloc], GDM=GDMloc[:, setsPloc])
                    CoPaMsN[t, ki] = generateCoPaMfromidx(CoPaMsFine[setsNloc, ki],
                                                          relabel_technique=relabel_technique,
                                                          X=Xloc, w=wsets[setsNloc], GDM=GDMloc[:, setsNloc])
                else:
                    raise ValueError('Invalid Utype')
            else:
                raise ValueError('Invalid UNCLES type. It has to be either A or B')

    # Binarise
    if type == 'A':
        B = np.zeros([CoPaMfinaltrials, NPp, 1, NKs], dtype=object)
        Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
    elif type == 'B':
        B = np.zeros([CoPaMfinaltrials, NPp, NNp, NKs], dtype=object)
        Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
    else:
        raise ValueError('Invalid UNCLES type. It has to be either A or B')

    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                # Pre-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t, ki], binarise_techniqueP, binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]

                # Sorting
                CoPaMs[t, ki] = sortclusters(CoPaMs[t, ki], Mc[t, ki], smallestClusterSize)

                # Post-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t, ki], binarise_techniqueP, binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]
            elif type == 'B':
                # Pre-sorting binarisation
                BP = [binarise(CoPaMsP[t, ki], binarise_techniqueP, binarise_paramP[p]) for p in range(NPp)]
                McP = [np.sum(BPp, axis=0) for BPp in BP]

                BN = [binarise(CoPaMsN[t, ki], binarise_techniqueN, binarise_paramN[p]) for p in range(NNp)]
                McN = [np.sum(BNp, axis=0) for BNp in BN]

                # Sorting
                CoPaMsP[t, ki] = sortclusters(CoPaMsP[t, ki], McP, smallestClusterSize)
                CoPaMsN[t, ki] = sortclusters(CoPaMsN[t, ki], McN, smallestClusterSize)

                # Post-sorting binarisation
                BP = [binarise(CoPaMsP[t, ki], binarise_techniqueP, binarise_paramP[p]) for p in range(NPp)]
                BN = [binarise(CoPaMsN[t, ki], binarise_techniqueN, binarise_paramN[p]) for p in range(NNp)]

                # UNCLES B logic: start from the positive partition and clear
                # every gene that belongs to any cluster of the negative one.
                for pp in range(NPp):
                    for pn in range(NNp):
                        # Store a COPY: an object array holds references, so
                        # assigning BP[pp] itself would make every pn share
                        # (and cumulatively mask) one and the same array.
                        B[t, pp, pn, ki] = np.array(BP[pp])
                        B[t, pp, pn, ki][np.any(BN[pn], axis=1)] = False

                # Fill Mc: for each cluster k, a (P-params x N-params) matrix of sizes
                Mc[t, ki] = [None] * Ks[ki]
                for k in range(Ks[ki]):
                    Mc[t, ki][k] = np.zeros([NPp, NNp])
                    for pp in range(NPp):
                        for pn in range(NNp):
                            Mc[t, ki][k][pp, pn] = np.sum(B[t, pp, pn, ki][:, k])

    # Prepare and return the results:
    params = dict(params, **{
        'methods': methods,
        'setsP': setsPloc,
        'setsN': setsNloc,
        'dofuzzystretch': dofuzzystretch,
        'type': type,
        'Ks': Ks,
        'NKs': NKs,
        'wsets': wsets,
        'wmethods': wmethods,
        'L': L,
        'CoPaMs': CoPaMs,
        'smallestclustersize': smallestClusterSize,
        'GDM': GDMloc
    })

    UnclesRes = collections.namedtuple('UnclesRes', ['B', 'Mc', 'params', 'X', 'U'])
    return UnclesRes(B, Mc, params, Xloc, Uloc)
def mnplotsgreedy(X, B, type='A', params=None, allMSE=None, tightnessweight=1, setsP=None, setsN=None, Xtype='data',
                  mseCache=None, wsets=None, GDM=None, msesummary='average', percentageOfClustersKept=100,
                  smallestClusterSize=11, Xnames=None, ncores=1):
    """Greedily select non-overlapping clusters that score best on an MSE-vs-size plot.

    All clusters of all partitions in ``B`` are pooled as columns of one
    matrix. Each cluster gets a 2-D point (normalised MSE scaled by
    ``tightnessweight``, normalised log10 of its size) and is scored by its
    Euclidean distance to the point (0, 1). Clusters are then picked in order
    of increasing distance; after each pick every cluster sharing a member
    with it is discarded.

    Parameters:
        X: collection of datasets; converted with ``ds.listofarrays2arrayofarrays``.
        B: collection of binary partition matrices; converted with
            ``ds.reduceToArrayOfNDArraysAsObjects(B, 2)`` and concatenated
            column-wise.
        type: 'A' (MSE summarised over all datasets) or 'B' (summary over
            ``setsP`` minus summary over ``setsN``).
        params: optional dict; its entries are merged into the returned params.
        allMSE: optional precomputed per-cluster MSE summary.
        tightnessweight: multiplier of the MSE axis before distances are taken.
        setsP, setsN: dataset indices of the positive / negative sets. Default
            to the first and second half of the datasets respectively.
        Xtype: 'data' is supported; 'files' raises NotImplementedError.
        mseCache: optional precomputed (clusters x datasets) MSE matrix.
        wsets: per-dataset weights; defaults to ones.
        GDM: boolean matrix with one column per dataset; defaults to all True.
        msesummary: 'average'/'mean' or 'worse'/'max'.
        percentageOfClustersKept: percentile of best-scoring clusters that are
            eligible; rescaled by the fraction of clusters that are not small.
        smallestClusterSize: clusters with fewer members are never eligible.
        Xnames: accepted for interface compatibility; not used here.
        ncores: ``n_jobs`` given to ``Parallel`` for the MSE computation.

    Returns:
        ``MNResults`` namedtuple ``(B, I, allVecs, allDists, allMSE, mseCache,
        Ball, params)``; all per-cluster outputs are ordered by increasing
        distance, ``I`` is the boolean selection mask and ``B`` the selected
        columns of ``Ball``.

    Raises:
        NotImplementedError: if ``Xtype`` is 'files'.
        ValueError: for an invalid ``Xtype``, ``type`` or ``msesummary``.
    """
    Xloc = ds.listofarrays2arrayofarrays(X)
    Bloc = ds.reduceToArrayOfNDArraysAsObjects(B, 2)
    L = Xloc.shape[0]  # Number of datasets

    # Fix parameters
    if params is None:
        params = {}
    if setsP is None:
        setsP = [x for x in range(int(math.floor(L / 2)))]
    if setsN is None:
        setsN = [x for x in range(int(math.floor(L / 2)), L)]
    setsPN = np.array(np.concatenate((setsP, setsN), axis=0), dtype=int)
    Xloc = Xloc[setsPN]
    L = Xloc.shape[0]
    if wsets is None:
        wsets = np.array([1 for x in range(L)])
    else:
        # The weights are fancy-indexed below, which a plain list does not support
        wsets = np.asarray(wsets)
    if GDM is None:
        Ng = np.shape(Xloc[0])[0]
        GDMloc = np.ones([Ng, L], dtype='bool')
    else:
        GDMloc = GDM[:, setsPN]

    # Put all clusters in one matrix
    N = Bloc.shape[0]  # Number of partitions

    # One big matrix for all clusters
    BB = Bloc[0]
    for n in range(1, N):
        BB = np.append(BB, Bloc[n], axis=1)
    VMc = np.sum(BB, axis=0)  # Column sums, i.e. cluster sizes
    NN = len(VMc)  # Total number of clusters

    # Fill Vmse if not provided
    if mseCache is None and allMSE is None:
        # Cache all mse values
        mseCache = np.zeros([NN, L])
        io.resetparallelprogress(NN * L)

        for l in range(L):
            if Xtype == 'files':
                # load files here
                raise NotImplementedError('Xtype "files" has not been implemented yet.')
            elif Xtype == 'data':
                Xtmp = Xloc[l]
            else:
                raise ValueError('Xtype has to be "files" or "data". The given Xtype is invalid.')

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                mseCachetmp = Parallel(n_jobs=ncores)(
                    delayed(mseclusters)(Xtmp, ds.matlablike_index2D(BB, GDMloc[:, l], nn), 0)
                    for nn in range(NN))
                # Only the first element of each mseclusters result is kept
                mseCachetmp = [mm[0] for mm in mseCachetmp]
                for nn in range(NN):
                    mseCache[nn, l] = mseCachetmp[nn]

                gc.collect()

    # Calculate allMSE if needed (Nx1)
    if allMSE is None:
        msesummary_error = ('msesummary value has to be "average", "mean", "worse", or "max".'
                            ' "average" and "mean" behave similarly, and "worse" and "max" behave similarly.')
        if type == 'A':
            # NOTE(review): the normalised weights below are computed but the
            # raw `wsets` are what is actually applied; this only rescales
            # allMSE by a constant. Left as is - confirm the intent.
            wsetsloc = wsets[setsPN]
            wsetsloc = [float(n) / sum(wsetsloc) for n in wsetsloc]
            if msesummary == 'average' or msesummary == 'mean':
                allMSE = np.dot(mseCache[:, setsPN], wsets)
            elif msesummary == 'worse' or msesummary == 'max':
                allMSE = np.max(np.multiply(mseCache[:, setsPN], wsets), axis=1)
            else:
                raise ValueError(msesummary_error)
        elif type == 'B':
            wsetsP = wsets[setsP]
            wsetsP = [n / sum(wsetsP) for n in wsetsP]
            wsetsN = wsets[setsN]
            wsetsN = [n / sum(wsetsN) for n in wsetsN]
            if msesummary == 'average' or msesummary == 'mean':
                allMSE = np.dot(mseCache[:, setsP], wsetsP) - np.dot(mseCache[:, setsN], wsetsN)
            elif msesummary == 'worse' or msesummary == 'max':
                allMSE = np.max(np.multiply(mseCache[:, setsP], wsetsP), axis=1) \
                         - np.max(np.multiply(mseCache[:, setsN], wsetsN), axis=1)
            else:
                raise ValueError(msesummary_error)
        else:
            raise ValueError('Type should be either A or B; given type is invalid.')

    # Find the distances
    maxx = np.max(allMSE[~np.isnan(allMSE)])
    minx = np.min(allMSE[~np.isnan(allMSE)])
    maxy = np.log10(np.max(VMc))
    miny = 0
    with np.errstate(divide='ignore'):
        allVecs = np.concatenate(([(allMSE - minx) / (maxx - minx)],
                                  [(np.log10(VMc) - miny) / (maxy - miny)]), axis=0).transpose()
    allVecs[:, 0] *= tightnessweight
    # Points containing NaN get a fixed distance instead of a Euclidean one
    allDists = np.array([np.sqrt(1.1 + np.power(tightnessweight, 2)) if np.any(np.isnan(n))
                         else sp.spatial.distance.euclidean(n, [0, 1]) for n in allVecs])

    # Break ties: add small Gaussian noise to duplicated distances until all
    # are distinct. numpy.random is called directly rather than through the
    # deprecated `sp.random` alias, which is not guaranteed to exist in newer
    # SciPy releases.
    alpha = 0.0001
    tmp, uVdsI = np.unique(allDists, return_index=True)
    while len(uVdsI) != len(allDists):
        first_occurrences = set(uVdsI.tolist())
        for n in range(len(allDists)):
            if n not in first_occurrences:
                allDists[n] += alpha * np.random.normal()
        tmp, uVdsI = np.unique(allDists, return_index=True)

    # Helper function for greedy solution below
    def mngreedy(Bloc, I, Vds, iter=float('inf')):
        """Return a boolean mask of greedily picked columns of Bloc.

        Repeatedly picks the still-eligible column (mask I, modified in place)
        with the smallest distance in Vds and removes every column overlapping
        it from I, for at most `iter` picks. Implemented as a loop so that a
        long selection cannot exhaust the recursion limit.
        """
        Vdsloc = np.array(Vds)
        res = np.zeros(len(Vdsloc), dtype=bool)
        while iter != 0 and any(I):
            # Ineligible clusters can never be the minimum
            Vdsloc[np.logical_not(I)] = float('inf')
            p = np.argmin(Vdsloc)
            res[p] = True
            overlaps = np.dot(ds.matlablike_index2D(Bloc, 'all', p).transpose(), Bloc) > 0
            I &= ~overlaps
            # A picked cluster is always retired, even if it is empty and so
            # does not "overlap" itself; otherwise it would be picked forever.
            I[p] = False
            iter -= 1
        return res

    # ** Find greedy solution **
    # Sort clusters based on distances (not important, but benefits the output)
    II = np.argsort(allDists)
    allDists = allDists[II]
    BB = ds.matlablike_index2D(BB, 'a', II)
    allVecs = ds.matlablike_index2D(allVecs, II, 'a')
    allMSE = allMSE[II]
    mseCache = ds.matlablike_index2D(mseCache, II, 'a')
    VMc = VMc[II]

    # include the top XX% of the clusters that have at least smallestClusterSize
    Ismall = VMc < smallestClusterSize
    Inans = np.isnan(allDists)
    # Small or NaN clusters are pushed to the worst distance for the percentile
    tmpDists = np.where(Inans | Ismall, np.max(allDists), allDists)
    percentageOfClustersKept *= float(np.sum(~Ismall)) / len(allDists)
    Iincluded = (tmpDists <= np.percentile(tmpDists, percentageOfClustersKept)) & (np.bitwise_not(Ismall))
    I = mngreedy(BB, Iincluded, allDists)
    B_out = ds.matlablike_index2D(BB, 'a', I)

    # Prepare and return the results:
    params = dict(params, **{
        'tightnessweight': tightnessweight,
        'msesummary': msesummary,
        'percentageofclusterskept': percentageOfClustersKept,
        'smallestclustersize': smallestClusterSize
    })

    MNResults = collections.namedtuple('MNResults',
                                       ['B', 'I', 'allVecs', 'allDists', 'allMSE', 'mseCache', 'Ball', 'params'])
    return MNResults(B_out, I, allVecs, allDists, allMSE, mseCache, BB, params)