def mseclusters(X, B, donormalise=True, GDM=None):
    """Return the mean squared error of every cluster, averaged over the datasets.

    For each dataset and each cluster, the rows of the dataset selected by the
    cluster's column of B are centred on their column-wise mean, the squared
    deviations are summed, and the sum is divided by the number of columns of
    the dataset and by the column sum of B for that cluster.

    :param X: one 2-D dataset or a sequence of 2-D datasets (rows are genes,
        columns are dimensions). The datasets may differ in shape.
    :param B: partition matrix with one row per gene and one column per
        cluster; a 1-D vector is treated as a single cluster. It is used as a
        row mask, so it is assumed to be boolean -- TODO confirm with callers.
    :param donormalise: when true, every dataset is first passed through
        ``pp.normaliseSampleFeatureMat(x, 4)``.
    :param GDM: matrix with one row per gene and one column per dataset that
        selects which rows of B apply to each dataset (presumably gene-presence
        flags, assumed boolean -- TODO confirm). Defaults to all True.
    :return: 1-D array with one value per cluster: the MSE averaged over the
        datasets. A cluster whose column of B has no truthy entry yields NaN.
    """
    try:
        Xloc = np.array(X)
    except ValueError:
        # NumPy >= 1.24 raises here when the datasets differ in shape (older
        # versions silently returned an object array). Build that 1-D object
        # array of datasets explicitly so ragged inputs keep working.
        Xloc = np.empty(len(X), dtype=object)
        for i, x in enumerate(X):
            Xloc[i] = np.asarray(x)
    Bloc = np.array(B)

    # A depth of 2 means a single dataset was given; add a leading axis so that
    # the code below always iterates over a collection of datasets.
    if ds.maxDepthOfArray(Xloc) == 2:
        Xloc = np.expand_dims(Xloc, axis=0)
    Nx = len(Xloc)  # Number of datasets
    if len(Bloc.shape) == 1:
        Bloc = Bloc.reshape(-1, 1)
    M = Bloc.shape[0]  # Number of genes
    K = Bloc.shape[1]  # Number of clusters

    if GDM is None:
        GDMloc = np.ones([M, Nx], dtype=bool)
    else:
        GDMloc = np.array(GDM)

    # NOTE: the check that every dataset has exactly M rows was disabled by the
    # original author when GDM support was added, so it is not enforced here.

    mseC = np.zeros([Nx, K], dtype=float)

    Nk = [np.sum(b) for b in Bloc.transpose()]  # Number of genes per cluster
    Nd = [x.shape[1] for x in Xloc]  # Number of dimensions per dataset

    # Normalise if needed
    if donormalise:
        Xloc = [pp.normaliseSampleFeatureMat(x, 4) for x in Xloc]

    # Calculations
    for nx in range(Nx):
        reportedprogress = 0
        for k in range(K):
            # Report progress in steps of 100 clusters
            if k - reportedprogress == 100:
                io.updateparallelprogress(100)
                reportedprogress = k

            # An empty cluster has no defined error
            if not any(Bloc[:, k]):
                mseC[nx, k] = float('nan')
                continue

            # Rows of this dataset that are members of cluster k
            Xlocloc = Xloc[nx][Bloc[GDMloc[:, nx], k], :]
            tmp = nu.subtractaxis(Xlocloc, np.mean(Xlocloc, axis=0), axis=0)
            tmp = np.sum(np.power(tmp, 2))
            mseC[nx, k] = tmp / Nd[nx] / Nk[k]

        # Report whatever progress has not been reported yet for this dataset
        if K > reportedprogress:
            io.updateparallelprogress(K - reportedprogress)

    return np.mean(mseC, axis=0)
def mseclustersfuzzy(X, B, donormalise=True, GDM=None):
    """Return the membership-weighted MSE of every cluster, averaged over the datasets.

    This is the fuzzy counterpart of the crisp cluster MSE: instead of
    selecting the member rows of a cluster, every row of a dataset is weighted
    by its entry in the cluster's column of B.

    :param X: one 2-D dataset or a sequence of 2-D datasets (rows are genes,
        columns are dimensions). The datasets may differ in shape.
    :param B: membership matrix with one row per gene and one column per
        cluster; a 1-D vector is treated as a single cluster.
    :param donormalise: when true, every dataset is first passed through
        ``pp.normaliseSampleFeatureMat(x, 4)``.
    :param GDM: matrix with one row per gene and one column per dataset that
        selects which rows of B apply to each dataset (presumably gene-presence
        flags, assumed boolean -- TODO confirm). Defaults to all True.
    :return: 1-D array with one value per cluster: the weighted MSE averaged
        over the datasets. A cluster whose memberships sum to 0 yields NaN.
    """
    try:
        Xloc = np.array(X)
    except ValueError:
        # NumPy >= 1.24 raises here when the datasets differ in shape (older
        # versions silently returned an object array). Build that 1-D object
        # array of datasets explicitly so ragged inputs keep working.
        Xloc = np.empty(len(X), dtype=object)
        for i, x in enumerate(X):
            Xloc[i] = np.asarray(x)
    Bloc = np.array(B)

    # A depth of 2 means a single dataset was given; add a leading axis so that
    # the code below always iterates over a collection of datasets.
    if ds.maxDepthOfArray(Xloc) == 2:
        Xloc = np.expand_dims(Xloc, axis=0)
    Nx = len(Xloc)  # Number of datasets
    if len(Bloc.shape) == 1:
        Bloc = Bloc.reshape(-1, 1)
    M = Bloc.shape[0]  # Number of genes
    K = Bloc.shape[1]  # Number of clusters

    if GDM is None:
        GDMloc = np.ones([M, Nx], dtype=bool)
    else:
        GDMloc = np.array(GDM)

    # NOTE: the check that every dataset has exactly M rows was disabled by the
    # original author when GDM support was added, so it is not enforced here.

    mseC = np.zeros([Nx, K], dtype=float)

    Nk = [np.sum(b) for b in Bloc.transpose()]  # Total membership per cluster
    Nd = [x.shape[1] for x in Xloc]  # Number of dimensions per dataset

    # Normalise if needed
    if donormalise:
        Xloc = [pp.normaliseSampleFeatureMat(x, 4) for x in Xloc]

    # Calculations
    for nx in range(Nx):
        for k in range(K):
            # A cluster without any membership has no defined error
            if Nk[k] == 0:
                mseC[nx, k] = float('nan')
                continue

            # Memberships in cluster k of the genes selected for this dataset
            weights = Bloc[GDMloc[:, nx], k]

            Cmeanloc = nu.multiplyaxis(Xloc[nx], weights, axis=1) / Nk[k]  # Weighted mean for the cluster
            tmp = nu.subtractaxis(Xloc[nx], Cmeanloc, axis=0)  # Errors
            tmp = nu.multiplyaxis(tmp, weights, axis=1)  # Weighted errors
            tmp = np.sum(np.power(tmp, 2))  # Squared weighted errors
            mseC[nx, k] = tmp / Nd[nx] / Nk[k]  # Weighted MSE

    return np.mean(mseC, axis=0)
def runclust(X, Map=None, replicatesIDs=None, normalise=1000, Ks=None, tightnessweight=1,
             stds=3.0, OGsIncludedIfAtLeastInDatasets=1, expressionValueThreshold=-float("inf"),
             atleastinconditions=0, atleastindatasets=0, absvalue=False, filteringtype='raw', filflat=True,
             smallestClusterSize=11, ncores=1, optimisation=True, Q3s=2, methods=None, deterministic=False,
             showPlots=True, printToConsole=True):
    """Run the whole clustering pipeline on in-memory datasets.

    The steps, as logged, are: (1) reading the dataset(s), (2) pre-processing,
    (3) seed clusters production, (4) cluster evaluation and selection, and
    (5) optional cluster optimisation and completion, followed by optional
    plotting and a logged summary.

    :param X: a pandas DataFrame, a 2-D array, or a list of those. DataFrames
        supply gene names (index) and replicate names (columns); for anything
        else zero-padded numeric labels are generated. The caller's list is
        not modified.
    :param Map: forwarded to ``pp.calculateGDMandUpdateDatasets``. When None
        the objects are labelled 'Gene', otherwise 'OG'.
    :param replicatesIDs: None, or one entry per dataset holding the replicate
        ID of every column (None for a dataset keeps every column as its own
        condition). Columns with ID -1 do not contribute a condition label.
    :param normalise: normalisation specification, checked by
        ``val.validate_normalisation`` and forwarded to ``pp.preprocess``.
    :param Ks: forwarded to ``unc.uncles``; None means [4, 8, 12, 16, 20].
    :param tightnessweight: forwarded to ``mn.mnplotsgreedy``.
    :param stds: only used by the 'weighted_outliers' post-processing method,
        which is not the method currently selected inside this function.
    :param OGsIncludedIfAtLeastInDatasets: forwarded to
        ``pp.calculateGDMandUpdateDatasets``.
    :param expressionValueThreshold: forwarded to ``pp.preprocess``.
    :param atleastinconditions: forwarded to ``pp.preprocess``.
    :param atleastindatasets: forwarded to ``pp.preprocess``.
    :param absvalue: forwarded to ``pp.preprocess``.
    :param filteringtype: forwarded to ``pp.preprocess``.
    :param filflat: forwarded to ``pp.preprocess`` as ``filterflat``.
    :param smallestClusterSize: forwarded to ``mn.mnplotsgreedy`` and to the
        cluster optimisation step.
    :param ncores: forwarded to ``unc.uncles`` and ``mn.mnplotsgreedy``.
    :param optimisation: when false, step 5 is skipped and the clusters
        selected in step 4 are returned as they are.
    :param Q3s: forwarded to ``ecorr.optimise_tukey_sqrtSCG``.
    :param methods: forwarded to ``unc.uncles``.
    :param deterministic: forwarded to ``unc.uncles``.
    :param showPlots: when true, the clusters are plotted (best effort: a
        plotting failure is logged and does not abort the run).
    :param printToConsole: forwarded to ``glob.set_print_to_console``.
    :return: ``(Bout, Xout, GDM)`` -- the clusters and the processed data as
        produced by ``op.clusters_B_as_dataframes`` and
        ``op.processed_X_as_dataframes``, and the GDM returned by
        ``pp.preprocess``. ``Xout`` is a single item rather than a list when a
        single dataset was given.
    :raises Exception: when ``replicatesIDs`` or ``normalise`` fails validation.
    """
    # A fresh list per call: a list literal as the default would be shared by all calls
    if Ks is None:
        Ks = list(range(4, 21, 4))

    # Set the global objects label
    glob.set_print_to_log_file(False)
    glob.set_print_to_console(printToConsole)
    if Map is None:
        glob.set_object_label_upper('Gene')
        glob.set_object_label_lower('gene')
    else:
        glob.set_object_label_upper('OG')
        glob.set_object_label_lower('OG')

    glob.set_tmpfile('clust_tmp.txt')

    # Output: Print initial message, and record the starting time:
    initialmsg, starttime = op.generateinitialmessage()
    io.log(initialmsg, addextrastick=False)

    # Consider X as a list of arrays or of data frames. Otherwise, make it as such first.
    # If the user entered a single dataset (not a list), remember this so that the result
    # is returned as a single output as well.
    input_is_one_dataset = False
    if isinstance(X, pd.DataFrame):
        input_is_one_dataset = True
        X = [X]
    elif isinstance(X, np.ndarray) and ds.maxDepthOfArray(X) == 2:
        input_is_one_dataset = True
        X = [X]

    # The entries of X are replaced by plain arrays below; work on a shallow copy so
    # that the list object owned by the caller is left untouched.
    X = list(X)

    # Format data (X: list of arrays, Genes: list of arrays of strings, replicates: list of arrays of strings)
    L = len(X)  # Number of datasets
    replicates = [None] * L
    Genes = [None] * L
    io.log('1. Reading dataset(s)')
    for l, dataset in enumerate(X):
        if isinstance(dataset, pd.DataFrame):
            # Gene names as a column vector of objects, replicate names from the column labels
            Genes[l] = np.array(dataset.index, dtype=str, ndmin=2).transpose()
            Genes[l] = np.array(Genes[l], dtype=object)
            replicates[l] = np.array(dataset.columns, dtype=str)
            X[l] = dataset.values
        else:
            # No labels available: generate zero-padded running numbers
            X[l] = np.array(dataset)
            ngenes_digits = int(math.ceil(math.log10(X[l].shape[0])))
            nreps_digits = int(math.ceil(math.log10(X[l].shape[1])))
            Genes[l] = np.array([['{0}'.format(str(g).zfill(ngenes_digits))] for g in range(X[l].shape[0])])
            Genes[l] = np.array(Genes[l], dtype=object)
            replicates[l] = np.array(['X{0}'.format(str(r).zfill(nreps_digits)) for r in range(X[l].shape[1])])

    ndatasets_digits = int(math.ceil(math.log10(L)))
    datafiles = np.array(['D{0}'.format(str(r).zfill(ndatasets_digits)) for r in range(L)])
    datafiles_noext = datafiles

    # Sort out conditions based on replicates structure if given
    if replicatesIDs is None:
        conditions = replicates
    else:
        valresult = val.validate_replicatesIDs(replicatesIDs, X)
        if not valresult[0]:
            io.log(valresult[1])
            io.log("Terminating ...")
            raise Exception("Terminated by an invalid input argument.")

        conditions = [None] * L
        for l in range(L):
            if replicatesIDs[l] is None:
                conditions[l] = np.array(replicates[l])
            else:
                # A condition is labelled by the first replicate that carries its ID
                uniq_reps, cond_indices = np.unique(replicatesIDs[l], return_index=True)
                if -1 in uniq_reps:
                    # Drop the ID -1 by value rather than by assuming it sorts first
                    cond_indices = cond_indices[uniq_reps != -1]
                conditions[l] = replicates[l][cond_indices]

    # Validate normalisation
    valresult = val.validate_normalisation(normalise, X)
    if not valresult[0]:
        io.log(valresult[1])
        io.log("Terminating ...")
        raise Exception("Terminated by an invalid input argument.")

    # Preprocessing (Mapping then top level preprocessing including summarising replicates, filtering
    # low expression genes, and normalisation)
    io.log('2. Data pre-processing')
    (X_OGs, GDM, GDMall, OGs, MapNew, MapSpecies) \
        = pp.calculateGDMandUpdateDatasets(X, Genes, Map, mapheader=True, OGsFirstColMap=True,
                                           delimGenesInMap='\\W+',
                                           OGsIncludedIfAtLeastInDatasets=OGsIncludedIfAtLeastInDatasets)
    (X_summarised_normalised, GDM, Iincluded, params, applied_norms) = \
        pp.preprocess(X_OGs, GDM, normalise, replicatesIDs, flipSamples=None,
                      expressionValueThreshold=expressionValueThreshold, replacementVal=0.0,
                      atleastinconditions=atleastinconditions, atleastindatasets=atleastindatasets,
                      absvalue=absvalue, filteringtype=filteringtype, filterflat=filflat, params=None,
                      datafiles=datafiles)
    OGs = OGs[Iincluded]
    if MapNew is not None:
        MapNew = MapNew[Iincluded]

    # UNCLES and M-N plots
    io.log('3. Seed clusters production (the Bi-CoPaM method)')
    ures = unc.uncles(X_summarised_normalised, type='A', GDM=GDM, Ks=Ks, params=params, methods=methods,
                      Xnames=datafiles_noext, ncores=ncores, deterministic=deterministic)
    io.log('4. Cluster evaluation and selection (the M-N scatter plots technique)')
    mnres = mn.mnplotsgreedy(X_summarised_normalised, ures.B, GDM=GDM, tightnessweight=tightnessweight,
                             params=ures.params, smallestClusterSize=smallestClusterSize,
                             Xnames=datafiles_noext, ncores=ncores)

    # Post-processing (the method is fixed here; the other branch is kept as an alternative)
    ppmethod = 'tukey_sqrtSCG'
    if optimisation:
        io.log('5. Cluster optimisation and completion')
        if len(mnres.I) > 0 and sum(mnres.I) > 0:  # Otherwise, there are no clusters, so nothing to be corrected
            try:
                if ppmethod == 'weighted_outliers':
                    B_corrected = ecorr.correcterrors_weighted_outliers(mnres.B, X_summarised_normalised, GDM,
                                                                        mnres.allDists[mnres.I], stds,
                                                                        smallestClusterSize)
                elif ppmethod == 'tukey_sqrtSCG':
                    B_corrected = ecorr.optimise_tukey_sqrtSCG(mnres.B, X_summarised_normalised, GDM,
                                                               mnres.allDists[mnres.I], smallestClusterSize,
                                                               tails=1, Q3s=Q3s)
                else:
                    raise ValueError('Invalid post processing method (ppmethod): {0}.'.format(ppmethod))
                B_corrected = ecorr.reorderClusters(B_corrected, X_summarised_normalised, GDM)
            except Exception:
                # Best effort: fall back to the unoptimised clusters, but let
                # KeyboardInterrupt / SystemExit propagate.
                io.logerror(sys.exc_info())
                io.log('\n* Failed to perform cluster optimisation and completion!\n'
                       '* Skipped cluster optimisation and completion!\n')
                B_corrected = mnres.B
        else:
            B_corrected = mnres.B
    else:
        io.log('5. Skipping cluster optimisation and completion')
        B_corrected = mnres.B

    # Output: Preparing output parameters as DataFrames
    if Map is None:
        Bout = op.clusters_B_as_dataframes(B_corrected, OGs, None)
    else:
        # With a map, a second value is returned as well; it is not part of this function's output
        Bout, _ = op.clusters_B_as_dataframes(B_corrected, OGs, MapNew)
    Xout = op.processed_X_as_dataframes(X_summarised_normalised, OGs, conditions)
    if input_is_one_dataset:
        Xout = Xout[0]

    # Output: Plot figures
    if showPlots:
        try:
            if np.shape(B_corrected)[1] > 0:
                graph.plotclusters(X_summarised_normalised, B_corrected, datafiles_noext, conditions, GDM=GDM,
                                   Cs='all', setPageToDefault=True, showPlots=showPlots, printToPDF=False)
        except Exception:
            # Plotting is optional; a failure must not lose the computed results
            io.log('Error: could not generate clusters plots. Resuming the rest of steps ...')

    # Output: Prepare message to standard output and the summary then save the summary to a file and print the message
    summarymsg, endtime, timeconsumedtxt = \
        op.generateoutputsummaryparag(X, X_summarised_normalised, MapNew, GDMall, GDM,
                                      ures, mnres, B_corrected, starttime)
    io.log(summarymsg, addextrastick=False)

    io.deletetmpfile()

    return Bout, Xout, GDM