Example #1
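These snippets depend on NumPy and pandas plus several helper modules from the same package (ds, pp, nu, io, unc, mn, ecorr, op, glob, val, graph), imported at module level rather than inside the functions. The standard-library and third-party imports below are certain from usage; the helper-module paths are placeholders, not the package's actual layout:

import math
import sys

import numpy as np
import pandas as pd

# Package-internal helpers (import paths are assumptions; adjust to the real layout):
# import <package>.datastructures as ds   # ds.maxDepthOfArray
# import <package>.preprocess as pp       # pp.normaliseSampleFeatureMat, pp.preprocess, ...
# import <package>.numeric as nu          # nu.subtractaxis, nu.multiplyaxis
# import <package>.io as io               # io.log, io.updateparallelprogress, ...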
def mseclusters(X, B, donormalise=True, GDM=None):
    Xloc = np.array(X)
    Bloc = np.array(B)

    if ds.maxDepthOfArray(Xloc) == 2:
        Xloc = np.expand_dims(Xloc, axis=0)
    Nx = len(Xloc)  # Number of datasets
    if len(Bloc.shape) == 1:
        Bloc = Bloc.reshape(-1, 1)
    M = Bloc.shape[0]  # Number of genes
    K = Bloc.shape[1]  # Number of clusters

    if GDM is None:
        GDMloc = np.ones([Bloc.shape[0], Nx], dtype=bool)
    else:
        GDMloc = np.array(GDM)

    # This check was disabled when GDM support was added, since datasets may
    # legitimately contain different subsets of genes:
    #if any([True if x.shape[0] != M else False for x in Xloc]):
    #    raise ValueError('Unequal number of genes in datasets and partitions')

    mseC = np.zeros([Nx, K], dtype=float)

    Nk = [np.sum(b) for b in Bloc.transpose()]  # Number of genes per cluster
    Nd = [x.shape[1] for x in Xloc]  # Number of dimensions per dataset

    # Normalise if needed
    if donormalise:
        Xloc = [pp.normaliseSampleFeatureMat(x, 4) for x in Xloc]

    # Calculations
    for nx in range(Nx):
        reportedprogress = 0
        for k in range(K):
            # Report progress once every 100 clusters
            if k - reportedprogress == 100:
                io.updateparallelprogress(100)
                reportedprogress = k
            # WORK
            if not any(Bloc[:, k]):
                mseC[nx, k] = float('nan')  # Empty cluster
            else:
                # Profiles of the cluster's genes that are present in this dataset
                Xlocloc = Xloc[nx][Bloc[GDMloc[:, nx], k], :]
                tmp = nu.subtractaxis(Xlocloc,
                                      np.mean(Xlocloc, axis=0),
                                      axis=0)  # Deviations from the cluster mean profile
                tmp = np.sum(np.power(tmp, 2))  # Sum of squared deviations
                mseC[nx, k] = tmp / Nd[nx] / Nk[k]  # Mean squared error
        # Report whatever progress remains for this dataset
        if K > reportedprogress:
            io.updateparallelprogress(K - reportedprogress)

    return np.mean(mseC, axis=0)
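For reference, the per-cluster MSE computed by the loop above reduces, for a single dataset with no GDM, to a few NumPy operations. The following is a minimal self-contained sketch with toy data, using plain broadcasting in place of the nu.subtractaxis helper:

import numpy as np

X = np.array([[1.0, 2.0], [1.2, 2.1], [5.0, 6.0]])           # 3 genes x 2 dimensions
B = np.array([[True, False], [True, False], [False, True]])  # 3 genes x 2 clusters

mse = np.full(B.shape[1], np.nan)
for k in range(B.shape[1]):
    if B[:, k].any():
        Xk = X[B[:, k], :]             # profiles of the cluster's genes
        dev = Xk - Xk.mean(axis=0)     # deviations from the cluster mean profile
        mse[k] = (dev ** 2).sum() / X.shape[1] / B[:, k].sum()

print(mse)  # per-cluster MSE: [0.00625, 0.0]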
Example #2
def mseclustersfuzzy(X, B, donormalise=True, GDM=None):
    Xloc = np.array(X)
    Bloc = np.array(B)

    if ds.maxDepthOfArray(Xloc) == 2:
        Xloc = np.expand_dims(Xloc, axis=0)
    Nx = len(Xloc)  # Number of datasets
    if len(Bloc.shape) == 1:
        Bloc = Bloc.reshape(-1, 1)
    M = Bloc.shape[0]  # Number of genes
    K = Bloc.shape[1]  # Number of clusters

    if GDM is None:
        GDMloc = np.ones([Bloc.shape[0], Nx], dtype=bool)
    else:
        GDMloc = np.array(GDM)

    # This check was disabled when GDM support was added, since datasets may
    # legitimately contain different subsets of genes:
    #if any([True if x.shape[0] != M else False for x in Xloc]):
    #    raise ValueError('Unequal number of genes in datasets and partitions')

    mseC = np.zeros([Nx, K], dtype=float)

    Nk = [np.sum(b) for b in Bloc.transpose()]  # Number of genes per cluster
    Nd = [x.shape[1] for x in Xloc]  # Number of dimensions per dataset

    # Normalise if needed
    if donormalise:
        Xloc = [pp.normaliseSampleFeatureMat(x, 4) for x in Xloc]

    # Calculations
    for nx in range(Nx):
        for k in range(K):
            if Nk[k] == 0:
                mseC[nx, k] = float('nan')  # Cluster with zero total membership
            else:
                # Weighted mean profile of the cluster: weight each gene's
                # profile by its membership, sum over genes, then divide by
                # the cluster's total membership
                Cmeanloc = np.sum(nu.multiplyaxis(Xloc[nx],
                                                  Bloc[GDMloc[:, nx], k],
                                                  axis=1),
                                  axis=0) / Nk[k]
                tmp = nu.subtractaxis(Xloc[nx], Cmeanloc, axis=0)  # Errors
                tmp = nu.multiplyaxis(tmp, Bloc[GDMloc[:, nx], k],
                                      axis=1)  # Weighted errors
                tmp = np.sum(np.power(tmp, 2))  # Squared weighted errors
                mseC[nx, k] = tmp / Nd[nx] / Nk[k]  # Weighted MSE

    return np.mean(mseC, axis=0)
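The fuzzy variant weights each gene's contribution by its membership value instead of a hard assignment. A minimal NumPy-only sketch of the same computation for a single dataset (toy data; plain broadcasting in place of nu.multiplyaxis and nu.subtractaxis):

import numpy as np

X = np.array([[1.0, 2.0], [1.2, 2.1], [5.0, 6.0]])  # 3 genes x 2 dimensions
B = np.array([[0.9, 0.1], [0.8, 0.2], [0.1, 0.9]])  # fuzzy memberships for 2 clusters

Nk = B.sum(axis=0)                         # total membership per cluster
mse = np.full(B.shape[1], np.nan)
for k in range(B.shape[1]):
    if Nk[k] > 0:
        w = B[:, k][:, None]               # membership weights as a column
        cmean = (w * X).sum(axis=0) / Nk[k]  # weighted mean profile
        err = w * (X - cmean)              # weighted deviations from the mean
        mse[k] = (err ** 2).sum() / X.shape[1] / Nk[k]

print(mse)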
Example #3
def runclust(X, Map=None, replicatesIDs=None, normalise=1000,
             Ks=list(range(4, 21, 4)), tightnessweight=1, stds=3.0,
             OGsIncludedIfAtLeastInDatasets=1, expressionValueThreshold=-float("inf"), atleastinconditions=0,
             atleastindatasets=0, absvalue=False, filteringtype='raw', filflat=True, smallestClusterSize=11,
             ncores=1, optimisation=True, Q3s=2, methods=None, deterministic=False, showPlots=True,
             printToConsole=True):

    # Set the global object labels
    glob.set_print_to_log_file(False)
    glob.set_print_to_console(printToConsole)
    if Map is None:
        glob.set_object_label_upper('Gene')
        glob.set_object_label_lower('gene')
    else:
        glob.set_object_label_upper('OG')
        glob.set_object_label_lower('OG')

    glob.set_tmpfile('clust_tmp.txt')
    # Output: Print initial message, and record the starting time:
    initialmsg, starttime = op.generateinitialmessage()
    io.log(initialmsg, addextrastick=False)

    # Treat X as a list of arrays or of DataFrames; if it is not a list already, wrap it in one.
    # If the user passed a single dataset rather than a list, record that in a flag
    # so that the result is returned as a single output
    input_is_one_dataset = False
    if isinstance(X, pd.DataFrame):
        input_is_one_dataset = True
        X = [X]
    elif isinstance(X, np.ndarray) and ds.maxDepthOfArray(X) == 2:
        input_is_one_dataset = True
        X = [X]

    # Format data (X: list of arrays, Genes: list of arrays of strings, replicates: list of arrays of strings)
    L = len(X)  # Number of datasets
    replicates = [None] * L
    Genes = [None] * L
    io.log('1. Reading dataset(s)')
    for l in range(L):
        if isinstance(X[l], pd.DataFrame):
            Genes[l] = np.array(X[l].index, dtype=str, ndmin=2).transpose()
            Genes[l] = np.array(Genes[l], dtype=object)
            replicates[l] = np.array(X[l].columns, dtype=str)
            X[l] = X[l].values
        else:
            X[l] = np.array(X[l])

            # Auto-generate zero-padded numeric labels for genes and replicates
            ngenes_digits = int(math.ceil(math.log10(X[l].shape[0])))
            nreps_digits = int(math.ceil(math.log10(X[l].shape[1])))
            Genes[l] = np.array([['{0}'.format(str(g).zfill(ngenes_digits))] for g in range(X[l].shape[0])])
            Genes[l] = np.array(Genes[l], dtype=object)
            replicates[l] = np.array(['X{0}'.format(str(r).zfill(nreps_digits)) for r in range(X[l].shape[1])])

    # Auto-generate zero-padded dataset labels
    ndatasets_digits = int(math.ceil(math.log10(L)))
    datafiles = np.array(['D{0}'.format(str(r).zfill(ndatasets_digits)) for r in range(L)])
    datafiles_noext = datafiles
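    # For example, with L == 3 datasets and no dataset names provided, the
    # auto-generated labels are datafiles == array(['D0', 'D1', 'D2'])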

    # Derive condition names from the replicate structure, if provided
    if replicatesIDs is None:
        conditions = replicates
    else:
        valresult = val.validate_replicatesIDs(replicatesIDs, X)
        if valresult[0]:
            conditions = [None] * L
            for l in range(L):
                if replicatesIDs[l] is None:
                    conditions[l] = np.array(replicates[l])
                else:
                    # Unique replicate IDs define the conditions; an ID of -1
                    # marks replicates that are excluded
                    uniq_reps, cond_indices = np.unique(replicatesIDs[l], return_index=True)
                    if -1 in uniq_reps:
                        cond_indices = cond_indices[1:]
                    conditions[l] = replicates[l][cond_indices]
        else:
            io.log(valresult[1])
            io.log("Terminating ...")
            raise Exception("Terminated by an invalid input argument.")

    # Validate normalisation
    valresult = val.validate_normalisation(normalise, X)
    if not valresult[0]:
        io.log(valresult[1])
        io.log("Terminating ...")
        raise Exception("Terminated by an invalid input argument.")
    # Preprocessing (Mapping then top level preprocessing including summarising replicates, filtering
    # low expression genes, and normalisation)
    io.log('2. Data pre-processing')
    (X_OGs, GDM, GDMall, OGs, MapNew, MapSpecies) \
        = pp.calculateGDMandUpdateDatasets(X, Genes, Map, mapheader=True, OGsFirstColMap=True, delimGenesInMap='\\W+',
                                           OGsIncludedIfAtLeastInDatasets=OGsIncludedIfAtLeastInDatasets)
    (X_summarised_normalised, GDM, Iincluded, params, applied_norms) = \
        pp.preprocess(X_OGs, GDM, normalise, replicatesIDs, flipSamples=None,
                      expressionValueThreshold=expressionValueThreshold, replacementVal=0.0,
                      atleastinconditions=atleastinconditions, atleastindatasets=atleastindatasets, absvalue=absvalue,
                      filteringtype=filteringtype, filterflat=filflat, params=None, datafiles=datafiles)
    OGs = OGs[Iincluded]
    if MapNew is not None:
        MapNew = MapNew[Iincluded]

    # UNCLES and M-N plots
    io.log('3. Seed clusters production (the Bi-CoPaM method)')
    ures = unc.uncles(X_summarised_normalised, type='A', GDM=GDM, Ks=Ks, params=params, methods=methods,
                      Xnames=datafiles_noext, ncores=ncores, deterministic=deterministic)
    io.log('4. Cluster evaluation and selection (the M-N scatter plots technique)')
    mnres = mn.mnplotsgreedy(X_summarised_normalised, ures.B, GDM=GDM, tightnessweight=tightnessweight,
                             params=ures.params, smallestClusterSize=smallestClusterSize, Xnames=datafiles_noext,
                             ncores=ncores)

    # Post-processing
    ppmethod = 'tukey_sqrtSCG'
    if optimisation:
        io.log('5. Cluster optimisation and completion')
        if len(mnres.I) > 0 and sum(mnres.I) > 0:  # Otherwise, there are no clusters, so nothing to be corrected
            try:
                if ppmethod == 'weighted_outliers':
                    B_corrected = ecorr.correcterrors_weighted_outliers(mnres.B, X_summarised_normalised, GDM,
                                                                        mnres.allDists[mnres.I], stds, smallestClusterSize)
                elif ppmethod == 'tukey_sqrtSCG':
                    B_corrected = ecorr.optimise_tukey_sqrtSCG(mnres.B, X_summarised_normalised, GDM,
                                                               mnres.allDists[mnres.I], smallestClusterSize,
                                                               tails=1, Q3s=Q3s)
                else:
                    raise ValueError('Invalid post processing method (ppmethod): {0}.'.format(ppmethod))
                B_corrected = ecorr.reorderClusters(B_corrected, X_summarised_normalised, GDM)
            except Exception:
                io.logerror(sys.exc_info())
                io.log('\n* Failed to perform cluster optimisation and completion!\n'
                       '* Skipped cluster optimisation and completion!\n')
                B_corrected = mnres.B
        else:
            B_corrected = mnres.B
    else:
        io.log('5. Skipping cluster optimisation and completion')
        B_corrected = mnres.B
    # Output: Preparing output parameters as DataFrames
    if Map is None:
        Bout = op.clusters_B_as_dataframes(B_corrected, OGs, None)
    else:
        Bout, B_species = op.clusters_B_as_dataframes(B_corrected, OGs, MapNew)
    Xout = op.processed_X_as_dataframes(X_summarised_normalised, OGs, conditions)
    if input_is_one_dataset:
        Xout = Xout[0]

    # Output: Plot figures
    if showPlots:
        try:
            if np.shape(B_corrected)[1] > 0:
                graph.plotclusters(X_summarised_normalised, B_corrected, datafiles_noext, conditions, GDM=GDM,
                                   Cs='all', setPageToDefault=True, showPlots=showPlots, printToPDF=False)
        except Exception:
            io.log("Error: could not generate clusters' plots. Resuming the rest of the steps ...")

    # Output: Prepare the summary message, then print it to the log
    summarymsg, endtime, timeconsumedtxt = \
        op.generateoutputsummaryparag(X, X_summarised_normalised, MapNew, GDMall, GDM,
                                      ures, mnres, B_corrected, starttime)
    io.log(summarymsg, addextrastick=False)

    io.deletetmpfile()

    return Bout, Xout, GDM
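A sketch of how runclust might be invoked on a single expression matrix, assuming the package's helper modules are importable; the gene/sample names and the Ks values here are arbitrary illustrations:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 6)),
                 index=['gene{0}'.format(i) for i in range(200)],
                 columns=['S{0}'.format(j) for j in range(6)])

# Returns cluster memberships (Bout), the pre-processed data (Xout), and the
# gene-dataset inclusion matrix (GDM); with a single input dataset, Xout is a
# single DataFrame rather than a list
Bout, Xout, GDM = runclust(X, Ks=[4, 8, 12], showPlots=False,
                           printToConsole=False)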