Пример #1
0
def clustpipeline(datapath, mapfile=None, replicatesfile=None, normalisationfile=['1000'], outpath=None,
                  Ks=[n for n in range(4, 21, 4)], tightnessweight=1, stds=3.0,
                  OGsIncludedIfAtLeastInDatasets=1, expressionValueThreshold=-float("inf"), atleastinconditions=0,
                  atleastindatasets=0, absvalue=False, filteringtype='raw', filflat=True, smallestClusterSize=11,
                  ncores=1, optimisation=True, Q3s=2, methods=None, deterministic=False):
    """Run the file-based clustering pipeline and write all results under ``outpath``.

    The steps, in order, are: read the dataset(s), pre-process them, produce seed
    clusters (``unc.uncles``), evaluate/select clusters (``mn.mnplotsgreedy``),
    optionally optimise the clusters, then save the clusters, plots, eigengenes
    and a summary.

    Args:
        datapath: Path of a single data file or of a directory of data files.
        mapfile: Optional map file. When given, objects are labelled 'OG' rather
            than 'Gene', the file is copied to the output as ``Map.txt`` and
            per-species cluster files are written.
        replicatesfile: Optional replicates file (copied to the output as
            ``Replicates.txt`` and read with ``io.readReplicates``).
        normalisationfile: List passed to ``io.readNormalisation``. If it holds
            exactly one element that ``nu.isint`` rejects, that element is
            treated as a file path and copied to the output as
            ``Normalisation.txt``.
        outpath: Output directory. When None, ``Results_<dd_Mon_yy>`` under the
            current working directory is used, with ``_<n>`` appended if that
            directory already exists.
        Ks, methods, ncores, deterministic: Forwarded to ``unc.uncles``
            (``ncores`` also to ``mn.mnplotsgreedy``).
        tightnessweight, smallestClusterSize: Forwarded to ``mn.mnplotsgreedy``
            (``smallestClusterSize`` also to the optimisation step).
        stds: Only used by the 'weighted_outliers' post-processing method, which
            is not the method selected in this function.
        OGsIncludedIfAtLeastInDatasets: Forwarded to
            ``pp.calculateGDMandUpdateDatasets``.
        expressionValueThreshold, atleastinconditions, atleastindatasets,
        absvalue, filteringtype, filflat: Forwarded to ``pp.preprocess``.
        optimisation: When False, the cluster optimisation step is skipped.
        Q3s: Forwarded to ``ecorr.optimise_tukey_sqrtSCG``.

    Returns:
        None. All results are written to files under ``outpath``.

    Raises:
        ValueError: If ``datapath`` is neither an existing file nor an existing
            directory. This is checked before any output is created.
    """
    # NOTE(review): the list defaults above are shared between calls; this function
    # does not mutate them itself, but Ks is handed to unc.uncles -- confirm it is
    # not modified there.

    # Validate the data path first so that an invalid path does not leave a
    # freshly created (and otherwise empty) results directory behind.
    datapath_is_file = os.path.isfile(datapath)
    datapath_is_dir = os.path.isdir(datapath)
    if not (datapath_is_file or datapath_is_dir):
        raise ValueError('Data path {0} does not exist. Either provide a path '.format(datapath) +
                         'of a data file or a path to a directory including data file(s)')

    # Set the global objects label
    if mapfile is None:
        glob.set_object_label_upper('Gene')
        glob.set_object_label_lower('gene')
    else:
        glob.set_object_label_upper('OG')
        glob.set_object_label_lower('OG')

    # Output: Prepare the output directory and the log file
    if outpath is None:
        outpathbase = '{0}/Results_{1}'.format(os.getcwd(), dt.datetime.now().strftime('%d_%b_%y'))
        outpath = outpathbase
        trial = 0
        # Never reuse an existing default results directory; append _1, _2, ...
        while os.path.exists(outpath):
            trial += 1
            outpath = '{0}_{1}'.format(outpathbase, trial)
    if not os.path.exists(outpath):
        os.makedirs(outpath)

    glob.set_logfile(os.path.join(outpath, 'log.txt'))
    glob.set_tmpfile(os.path.join(outpath, 'tmp.txt'))

    # Output: Copy input files to the output
    in2out_path = outpath + '/Input_files_and_params'
    if not os.path.exists(in2out_path):
        os.makedirs(in2out_path)

    if mapfile is not None:
        shutil.copy(mapfile, os.path.join(in2out_path, 'Map.txt'))
    if replicatesfile is not None:
        shutil.copy(replicatesfile, os.path.join(in2out_path, 'Replicates.txt'))
    if normalisationfile is not None:
        # A single non-integer entry is taken to be a normalisation file path
        if len(normalisationfile) == 1 and not nu.isint(normalisationfile[0]):
            shutil.copy(normalisationfile[0], os.path.join(in2out_path, 'Normalisation.txt'))

    in2out_X_unproc_path = in2out_path + '/Data'
    if not os.path.exists(in2out_X_unproc_path):
        os.makedirs(in2out_X_unproc_path)
    if datapath_is_file:
        shutil.copy(datapath, in2out_X_unproc_path)
    elif datapath_is_dir:
        for df in io.getFilesInDirectory(datapath):
            shutil.copy(os.path.join(datapath, df), in2out_X_unproc_path)

    # Output: Print initial message, and record the starting time:
    initialmsg, starttime = op.generateinitialmessage()
    io.log(initialmsg, addextrastick=False)

    # Read data
    io.log('1. Reading dataset(s)')
    (X, replicates, Genes, datafiles) = io.readDatasetsFromDirectory(datapath, delimiter='\t| |, |; |,|;',
                                                                     skiprows=1, skipcolumns=1,
                                                                     returnSkipped=True)
    datafiles_noext = [os.path.splitext(d)[0] for d in datafiles]

    # Read map, replicates, and normalisation files:
    Map = io.readMap(mapfile)
    (replicatesIDs, conditions) = io.readReplicates(replicatesfile, datapath, datafiles, replicates)
    normalise = io.readNormalisation(normalisationfile, datafiles)

    # Preprocessing (Mapping then top level preprocessing including summarising replicates, filtering
    # low expression genes, and normalisation)
    io.log('2. Data pre-processing')
    (X_OGs, GDM, GDMall, OGs, MapNew, MapSpecies) \
        = pp.calculateGDMandUpdateDatasets(X, Genes, Map, mapheader=True, OGsFirstColMap=True, delimGenesInMap='\\W+',
                                           OGsIncludedIfAtLeastInDatasets=OGsIncludedIfAtLeastInDatasets)
    (X_summarised_normalised, GDM, Iincluded, params, applied_norms) = \
        pp.preprocess(X_OGs, GDM, normalise, replicatesIDs, flipSamples=None,
                      expressionValueThreshold=expressionValueThreshold, replacementVal=0.0,
                      atleastinconditions=atleastinconditions, atleastindatasets=atleastindatasets, absvalue=absvalue,
                      filteringtype=filteringtype, filterflat=filflat, params=None, datafiles=datafiles)
    io.writedic('{0}/Normalisation_actual.txt'.format(outpath), applied_norms, delim='\t')
    # Keep only the objects that pre-processing retained
    OGs = OGs[Iincluded]
    if MapNew is not None:
        MapNew = MapNew[Iincluded]

    # Output: Save processed data
    Xprocessed = op.processed_X(X_summarised_normalised, conditions, GDM, OGs, MapNew, MapSpecies)  # pandas DataFrames
    X_proc_path = outpath + '/Processed_Data'
    if not os.path.exists(X_proc_path):
        os.makedirs(X_proc_path)
    for d_idx, datafile in enumerate(datafiles):
        Xprocessed[d_idx].to_csv('{0}/{1}_processed.tsv'.format(X_proc_path, datafile), sep='\t',
                                 encoding='utf-8', index=None, columns=None, header=False)

    # UNCLES and M-N plots
    io.log('3. Seed clusters production (the Bi-CoPaM method)')
    ures = unc.uncles(X_summarised_normalised, type='A', GDM=GDM, Ks=Ks, params=params, methods=methods,
                      Xnames=datafiles_noext, ncores=ncores, deterministic=deterministic)
    io.log('4. Cluster evaluation and selection (the M-N scatter plots technique)')
    mnres = mn.mnplotsgreedy(X_summarised_normalised, ures.B, GDM=GDM, tightnessweight=tightnessweight,
                             params=ures.params, smallestClusterSize=smallestClusterSize, Xnames=datafiles_noext,
                             ncores=ncores)

    # Post-processing
    ppmethod = 'tukey_sqrtSCG'
    # Default to the unoptimised clusters; only replaced if optimisation fully succeeds
    B_corrected = mnres.B
    if optimisation:
        io.log('5. Cluster optimisation and completion')
        if len(mnres.I) > 0 and sum(mnres.I) > 0:  # Otherwise, there are no clusters, so nothing to be corrected
            try:
                if ppmethod == 'weighted_outliers':
                    B_optimised = ecorr.correcterrors_weighted_outliers(mnres.B, X_summarised_normalised, GDM,
                                                                        mnres.allDists[mnres.I], stds,
                                                                        smallestClusterSize)
                elif ppmethod == 'tukey_sqrtSCG':
                    B_optimised = ecorr.optimise_tukey_sqrtSCG(mnres.B, X_summarised_normalised, GDM,
                                                               mnres.allDists[mnres.I], smallestClusterSize,
                                                               tails=1, Q3s=Q3s)
                else:
                    raise ValueError('Invalid post processing method (ppmethod): {0}.'.format(ppmethod))
                B_corrected = ecorr.reorderClusters(B_optimised, X_summarised_normalised, GDM)
            except Exception:
                # Best effort: keep the unoptimised clusters, but let KeyboardInterrupt /
                # SystemExit propagate instead of being swallowed.
                io.logerror(sys.exc_info())
                io.log('\n* Failed to perform cluster optimisation and completion!\n'
                       '* Skipped cluster optimisation and completion!\n')
    else:
        io.log('5. Skipping cluster optimisation and completion')

    # Output: Write input parameters:
    io.log('6. Saving results in\n{0}'.format(outpath))
    inputparams = op.params(mnres.params, Q3s, OGsIncludedIfAtLeastInDatasets,
                            expressionValueThreshold, atleastinconditions, atleastindatasets,
                            deterministic, ures.params['methods'], MapNew)
    io.writedic('{0}/input_params.tsv'.format(in2out_path), inputparams, delim='\t')

    # Output: Generating and saving clusters
    res_og = op.clusters_genes_OGs(B_corrected, OGs, MapNew, MapSpecies, '; ')  # pandas DataFrame
    if mapfile is None:
        res_og.to_csv('{0}/Clusters_Objects.tsv'.format(outpath), sep='\t',
                      encoding='utf-8', index=None, columns=None, header=False)
    else:
        res_og.to_csv('{0}/Clusters_OGs.tsv'.format(outpath), sep='\t',
                      encoding='utf-8', index=None, columns=None, header=False)
        # One clusters file per species listed in the map
        res_sp = op.clusters_genes_Species(B_corrected, OGs, MapNew, MapSpecies)
        for sp in range(len(res_sp)):
            res_sp[sp].to_csv('{0}/Clusters_{1}.tsv'.format(outpath, MapSpecies[sp]), sep='\t',
                              encoding='utf-8', index=None, columns=None, header=False)

    # Output: Save figures to a PDF (best effort; a failure must not stop the other outputs)
    try:
        if np.shape(B_corrected)[1] > 0:
            clusts_fig_file_name = '{0}/Clusters_profiles.pdf'.format(outpath)
            graph.plotclusters(X_summarised_normalised, B_corrected, datafiles_noext, conditions, clusts_fig_file_name,
                               GDM=GDM, Cs='all', setPageToDefault=True, printToPDF=True, showPlots=False)
    except Exception:
        # Record the cause as well, the same way the optimisation step reports its failures
        io.logerror(sys.exc_info())
        io.log('Error: could not save clusters plots in a PDF file.\n'
               'Resuming producing the other results files ...')

    # Output: Generating and writing eigengenes (best effort, single dataset only)
    try:
        if np.shape(B_corrected)[1] > 0:
            if len(X_summarised_normalised) == 1:
                eigengene_matrix = eig.eigengenes_dataframe(X_summarised_normalised, B_corrected, conditions)
                eigengene_matrix.to_csv('{0}/Eigengenes.tsv'.format(outpath), sep='\t',
                                        encoding='utf-8')
            else:
                io.log('Eigengene computation is currently not supported for multiple datasets.')
    except Exception:
        io.logerror(sys.exc_info())
        io.log('Error: could not save eigengenes into a file.\n'
               'Resuming producing the other results files ...')

    # Output: Prepare message to standard output and the summary then save the summary to a file and print the message
    summarymsg, endtime, timeconsumedtxt = \
        op.generateoutputsummaryparag(X, X_summarised_normalised, MapNew, GDMall, GDM,
                                      ures, mnres, B_corrected, starttime)
    summary = op.summarise_results(X, X_summarised_normalised, MapNew, GDMall, GDM,
                                   ures, mnres, B_corrected, starttime, endtime, timeconsumedtxt)
    io.writedic(outpath + '/Summary.tsv', summary, delim='\t')
    io.log(summarymsg, addextrastick=False)

    io.deletetmpfile()
Пример #2
0
def runclust(X, Map=None, replicatesIDs=None, normalise=1000,
             Ks=[n for n in range(4, 21, 4)], tightnessweight=1, stds=3.0,
             OGsIncludedIfAtLeastInDatasets=1, expressionValueThreshold=-float("inf"), atleastinconditions=0,
             atleastindatasets=0, absvalue=False, filteringtype='raw', filflat=True, smallestClusterSize=11,
             ncores=1, optimisation=True, Q3s=2, methods=None, deterministic=False, showPlots=True,
             printToConsole=True):
    """Run the clustering pipeline on in-memory data and return the results.

    Logging to a log file is switched off; console printing follows
    ``printToConsole``. No result files are written by this function.

    Args:
        X: A single dataset (a pandas DataFrame, or a numpy array for which
            ``ds.maxDepthOfArray`` returns 2) or a sequence of such datasets.
            For DataFrames the index and column labels are used as object and
            replicate names; for other inputs zero-padded numeric names are
            generated. A list or tuple passed here is not modified.
        Map: Optional map forwarded to ``pp.calculateGDMandUpdateDatasets``.
            When given, objects are labelled 'OG' rather than 'Gene'.
        replicatesIDs: Optional per-dataset replicate IDs, checked with
            ``val.validate_replicatesIDs``. When None, every replicate is
            treated as its own condition.
        normalise: Normalisation specification, checked with
            ``val.validate_normalisation`` and forwarded to ``pp.preprocess``.
        Ks, methods, ncores, deterministic: Forwarded to ``unc.uncles``
            (``ncores`` also to ``mn.mnplotsgreedy``).
        tightnessweight, smallestClusterSize: Forwarded to ``mn.mnplotsgreedy``
            (``smallestClusterSize`` also to the optimisation step).
        stds: Only used by the 'weighted_outliers' post-processing method, which
            is not the method selected in this function.
        OGsIncludedIfAtLeastInDatasets: Forwarded to
            ``pp.calculateGDMandUpdateDatasets``.
        expressionValueThreshold, atleastinconditions, atleastindatasets,
        absvalue, filteringtype, filflat: Forwarded to ``pp.preprocess``.
        optimisation: When False, the cluster optimisation step is skipped.
        Q3s: Forwarded to ``ecorr.optimise_tukey_sqrtSCG``.
        showPlots: When True, cluster profiles are plotted (best effort).
        printToConsole: Passed to ``glob.set_print_to_console``.

    Returns:
        Tuple ``(Bout, Xout, GDM)``: the clusters as produced by
        ``op.clusters_B_as_dataframes``, the processed data as produced by
        ``op.processed_X_as_dataframes`` (a single item rather than a list when
        a single dataset was given), and the GDM returned by pre-processing.

    Raises:
        ValueError: If no datasets are given, or a non-DataFrame dataset has no
            rows or no columns.
        Exception: If ``replicatesIDs`` or ``normalise`` fail validation.
    """
    # Set the global objects label
    glob.set_print_to_log_file(False)
    glob.set_print_to_console(printToConsole)
    if Map is None:
        glob.set_object_label_upper('Gene')
        glob.set_object_label_lower('gene')
    else:
        glob.set_object_label_upper('OG')
        glob.set_object_label_lower('OG')

    glob.set_tmpfile('clust_tmp.txt')

    # Output: Print initial message, and record the starting time:
    initialmsg, starttime = op.generateinitialmessage()
    io.log(initialmsg, addextrastick=False)

    # Consider X as a list of arrays or of data frames. Otherwise, make it as such first.
    # If the user entered a single dataset (not a list), remember that so the processed
    # data is returned as a single output as well.
    input_is_one_dataset = False
    if isinstance(X, pd.DataFrame) or (isinstance(X, np.ndarray) and ds.maxDepthOfArray(X) == 2):
        input_is_one_dataset = True
        X = [X]
    elif isinstance(X, (list, tuple)):
        # Work on a shallow copy: the entries are replaced by plain arrays below and
        # the caller's own sequence must not be altered as a side effect.
        X = list(X)

    # Format data (X: list of arrays, Genes: list of arrays of strings, replicates: list of arrays of strings)
    L = len(X)  # Number of datasets
    if L == 0:
        raise ValueError('No datasets were provided (X is empty).')
    replicates = [None] * L
    Genes = [None] * L
    io.log('1. Reading dataset(s)')
    for l in range(L):
        if isinstance(X[l], pd.DataFrame):
            Genes[l] = np.array(X[l].index, dtype=str, ndmin=2).transpose()
            Genes[l] = np.array(Genes[l], dtype=object)
            replicates[l] = np.array(X[l].columns, dtype=str)
            X[l] = X[l].values
        else:
            X[l] = np.array(X[l])

            nrows, ncols = X[l].shape[0], X[l].shape[1]
            if nrows == 0 or ncols == 0:
                raise ValueError('Dataset {0} is empty (it has {1} row(s) and {2} column(s)).'
                                 .format(l, nrows, ncols))
            # Zero-pad generated names to the width of the largest index (n - 1)
            ngenes_digits = len(str(nrows - 1))
            nreps_digits = len(str(ncols - 1))
            Genes[l] = np.array([['{0}'.format(str(g).zfill(ngenes_digits))] for g in range(nrows)])
            Genes[l] = np.array(Genes[l], dtype=object)
            replicates[l] = np.array(['X{0}'.format(str(r).zfill(nreps_digits)) for r in range(ncols)])

    ndatasets_digits = len(str(L - 1))
    datafiles = np.array(['D{0}'.format(str(r).zfill(ndatasets_digits)) for r in range(L)])
    datafiles_noext = datafiles

    # Sort out conditions based on replicates structure if given
    if replicatesIDs is None:
        conditions = replicates
    else:
        valresult = val.validate_replicatesIDs(replicatesIDs, X)
        if valresult[0]:
            conditions = [None] * L
            for l in range(L):
                if replicatesIDs[l] is None:
                    conditions[l] = np.array(replicates[l])
                else:
                    # Name each condition after the first replicate carrying its ID
                    uniq_reps, cond_indices = np.unique(replicatesIDs[l], return_index=True)
                    if -1 in uniq_reps:
                        # np.unique sorts, so the first entry is dropped when -1 is present
                        # NOTE(review): presumably -1 marks replicates to ignore -- confirm
                        cond_indices = cond_indices[1:]
                    conditions[l] = replicates[l][cond_indices]
        else:
            io.log(valresult[1])
            io.log("Terminating ...")
            raise Exception("Terminated by an invalid input argument.")

    # Validate normalisation
    valresult = val.validate_normalisation(normalise, X)
    if not valresult[0]:
        io.log(valresult[1])
        io.log("Terminating ...")
        raise Exception("Terminated by an invalid input argument.")

    # Preprocessing (Mapping then top level preprocessing including summarising replicates, filtering
    # low expression genes, and normalisation)
    io.log('2. Data pre-processing')
    (X_OGs, GDM, GDMall, OGs, MapNew, MapSpecies) \
        = pp.calculateGDMandUpdateDatasets(X, Genes, Map, mapheader=True, OGsFirstColMap=True, delimGenesInMap='\\W+',
                                           OGsIncludedIfAtLeastInDatasets=OGsIncludedIfAtLeastInDatasets)
    (X_summarised_normalised, GDM, Iincluded, params, applied_norms) = \
        pp.preprocess(X_OGs, GDM, normalise, replicatesIDs, flipSamples=None,
                      expressionValueThreshold=expressionValueThreshold, replacementVal=0.0,
                      atleastinconditions=atleastinconditions, atleastindatasets=atleastindatasets, absvalue=absvalue,
                      filteringtype=filteringtype, filterflat=filflat, params=None, datafiles=datafiles)
    # Keep only the objects that pre-processing retained
    OGs = OGs[Iincluded]
    if MapNew is not None:
        MapNew = MapNew[Iincluded]

    # UNCLES and M-N plots
    io.log('3. Seed clusters production (the Bi-CoPaM method)')
    ures = unc.uncles(X_summarised_normalised, type='A', GDM=GDM, Ks=Ks, params=params, methods=methods,
                      Xnames=datafiles_noext, ncores=ncores, deterministic=deterministic)
    io.log('4. Cluster evaluation and selection (the M-N scatter plots technique)')
    mnres = mn.mnplotsgreedy(X_summarised_normalised, ures.B, GDM=GDM, tightnessweight=tightnessweight,
                             params=ures.params, smallestClusterSize=smallestClusterSize, Xnames=datafiles_noext,
                             ncores=ncores)

    # Post-processing
    ppmethod = 'tukey_sqrtSCG'
    # Default to the unoptimised clusters; only replaced if optimisation fully succeeds
    B_corrected = mnres.B
    if optimisation:
        io.log('5. Cluster optimisation and completion')
        if len(mnres.I) > 0 and sum(mnres.I) > 0:  # Otherwise, there are no clusters, so nothing to be corrected
            try:
                if ppmethod == 'weighted_outliers':
                    B_optimised = ecorr.correcterrors_weighted_outliers(mnres.B, X_summarised_normalised, GDM,
                                                                        mnres.allDists[mnres.I], stds,
                                                                        smallestClusterSize)
                elif ppmethod == 'tukey_sqrtSCG':
                    B_optimised = ecorr.optimise_tukey_sqrtSCG(mnres.B, X_summarised_normalised, GDM,
                                                               mnres.allDists[mnres.I], smallestClusterSize,
                                                               tails=1, Q3s=Q3s)
                else:
                    raise ValueError('Invalid post processing method (ppmethod): {0}.'.format(ppmethod))
                B_corrected = ecorr.reorderClusters(B_optimised, X_summarised_normalised, GDM)
            except Exception:
                # Best effort: keep the unoptimised clusters, but let KeyboardInterrupt /
                # SystemExit propagate instead of being swallowed.
                io.logerror(sys.exc_info())
                io.log('\n* Failed to perform cluster optimisation and completion!\n'
                       '* Skipped cluster optimisation and completion!\n')
    else:
        io.log('5. Skipping cluster optimisation and completion')

    # Output: Preparing output parameters as DataFrames
    if Map is None:
        Bout = op.clusters_B_as_dataframes(B_corrected, OGs, None)
    else:
        # With a map, a second (per-species) result is returned as well; it is not used here
        Bout, _unused_B_species = op.clusters_B_as_dataframes(B_corrected, OGs, MapNew)
    Xout = op.processed_X_as_dataframes(X_summarised_normalised, OGs, conditions)
    if input_is_one_dataset:
        Xout = Xout[0]

    # Output: Plot figures (best effort; a plotting failure must not lose the results)
    if showPlots:
        try:
            if np.shape(B_corrected)[1] > 0:
                graph.plotclusters(X_summarised_normalised, B_corrected, datafiles_noext, conditions, GDM=GDM,
                                   Cs='all', setPageToDefault=True, showPlots=showPlots, printToPDF=False)
        except Exception:
            # Record the cause as well, the same way the optimisation step reports its failures
            io.logerror(sys.exc_info())
            io.log('Error: could not generate clusters plots. Resuming the rest of steps ...')

    # Output: Prepare the summary message and print it
    summarymsg, endtime, timeconsumedtxt = \
        op.generateoutputsummaryparag(X, X_summarised_normalised, MapNew, GDMall, GDM,
                                      ures, mnres, B_corrected, starttime)
    io.log(summarymsg, addextrastick=False)

    io.deletetmpfile()

    return Bout, Xout, GDM