示例#1
0
def findPlateReaderFiles(directory, extension=None):
    '''
    Recursively searches a directory for all files with specific extensions.

    Args:
        directory (str): path to data directory
        extension (str, list, or tuple): file-name suffix(es) to accept, matched
            case-sensitively with str.endswith(). If None, defaults to the .txt, .tsv,
            and .asc extensions in lower- and upper-case.

    Returns:
        list_files (list): list of paths to data files
    '''

    # you can modify this to include other extensions, but internal data must still be tab-delimited
    if extension is None:
        # every suffix carries its leading dot so that a name such as 'dataTSV'
        # (no real extension) is not mistaken for a data file
        extension = ('.txt', '.TXT', '.tsv', '.TSV', '.asc', '.ASC')
    elif not isinstance(extension, str):
        # str.endswith() only accepts a str or a tuple of str, so normalize lists/sets
        extension = tuple(extension)

    list_files = []

    # recursively walk through a directory, if nested
    for dirpath, _, filenames in os.walk(directory):

        for filename in filenames:

            # only keep files with acceptable extensions
            if filename.endswith(extension):

                # assemblePath() composes the file path from its directory and name
                list_files.append(assemblePath(dirpath, filename))

    return list_files
示例#2
0
def breakDownFilePath(filepath, copydirectory):
    '''
    Breaks down a file path into several components.

    Args:
        filepath (str): path to a data file
        copydirectory (str): directory where a copy of the file would be stored

    Returns:
        filename (str): basename without path
        filebase (str): basename without path and without its (last) extension
        newfilepath (str): result of assemblePath(copydirectory, filebase, '.tsv'), i.e. the
            location of the copy with its extension replaced by .tsv

    Example input: '/Users/firasmidani/RandomFileName.asc' with copydirectory '/Users/firasmidani'
        filename --> RandomFileName.asc
        filebase --> RandomFileName
        newfilepath --> presumably /Users/firasmidani/RandomFileName.tsv (depends on assemblePath)
    '''

    filename = os.path.basename(filepath)

    # splitext() removes only the final extension, so interior dots are kept
    # ('plate.1.asc' --> 'plate.1') and a name without any dot is returned unchanged
    filebase = os.path.splitext(filename)[0]

    newfilepath = assemblePath(copydirectory, filebase, '.tsv')

    return filename, filebase, newfilepath
示例#3
0
    def initPaths(self):
        '''
        Set up the output folder and the file paths used for saving data and results.

        Action:
            creates the output folder if it does not exist yet and stores a dictionary with
            the keys 'filename', 'dir', 'key', and 'input' as self.paths_dict
        '''

        # selectFileName() picks the output name from the user's 'fout' argument
        # (per the original note, a time stamp is used if no name was passed)
        out_name = selectFileName(self.args['fout'])

        # results are kept in a dedicated folder under the models directory
        out_dir = assemblePath(self.directory['models'], out_name, '')
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        # paths of the text files that record the key and the input of the model
        self.paths_dict = {
            'filename': out_name,
            'dir': out_dir,
            'key': assembleFullName(out_dir, '', out_name, 'key', '.txt'),
            'input': assembleFullName(out_dir, '', out_name, 'input', '.txt'),
        }
示例#4
0
def basicSummaryOnly(data, mapping, directory, args, verbose=False):
    '''
    If the user only requested the basic summary (args['obs']), then for each data file compute
        a basic summary, plot the plate, save the results, and exit the program. Otherwise,
        do nothing and return None.

    Args:
        data (dictionary): keys are plate IDs and values are pandas.DataFrames with size t x (n+1)
            where t is the number of time-points and n is number of wells (i.e. samples),
            the additional 1 is due to the explicit 'Time' column, index is uninformative.
        mapping (dictionary): keys are plate IDs and values are pandas.DataFrames with size n x (p)
            where n is the number of wells (or samples) in plate, and p is the number of
            variables or parameters described in dataframe.
        directory (dictionary): keys are folder names, values are their paths; the 'summary'
            and 'figures' entries are used here
        args (dictionary): user arguments; the 'obs', 'dp', 'merges', and 'fout' entries are
            used here
        verbose (boolean)

    Returns:
        None: if args['obs'] is False. Otherwise the function ends with sys.exit().
    '''

    # guard clause: nothing to do unless only the basic summary was requested
    if not args['obs']:
        return None

    print(tidyMessage('AMiGA is summarizing and plotting data files'))

    collected_keys = []

    for plate_id, plate_data in data.items():

        # where the summary text file and the figure of this plate would be saved
        summary_txt = assemblePath(directory['summary'], plate_id, '.txt')
        figure_pdf = assemblePath(directory['figures'], plate_id, '.pdf')

        # plate-specific samples: index should be well IDs but a column Well should also exist
        #   (per the original note, annotateMappings() in main.py ensures this is the case)
        plate_map = resetNameIndex(mapping[plate_id], 'Well', False)

        # restrict the data to the time column and the wells listed in the mapping
        well_names = list(plate_map.Well.values)
        plate_data = plate_data.loc[:, ['Time'] + well_names]

        # relabel the well columns with the index values of the mapping
        plate_data.columns = ['Time'] + list(plate_map.index.values)

        # build the GrowthPlate object and run the basic summary
        plate = GrowthPlate(data=plate_data, key=plate_map)
        plate.convertTimeUnits(input=getTimeUnits('input'),
                               output=getTimeUnits('output'))
        plate.computeBasicSummary()
        plate.computeFoldChange(subtract_baseline=True)

        # plot as PDF unless the 'dp' argument is set
        if not args['dp']:
            plate.plot(figure_pdf)

        # either keep the key for one merged summary, or save it right away as TXT
        if args['merges']:
            collected_keys.append(plate.key)
        else:
            plate.key.to_csv(summary_txt, sep='\t', header=True, index=False)

        smartPrint(plate_id, verbose=verbose)

    # a single merged summary for all plates, if requested
    if args['merges']:
        merged_name = selectFileName(args['fout'])
        merged_path = assembleFullName(directory['summary'], 'summary',
                                       merged_name, '_basic', '.txt')
        merged_df = pd.concat(collected_keys, sort=False)
        merged_df.to_csv(merged_path, sep='\t', header=True, index=False)

    summary_note = '\nSee {} for summary text file(s).'.format(directory['summary'])
    smartPrint(summary_note, verbose)

    figure_note = 'See {} for figure PDF(s).\n'.format(directory['figures'])
    smartPrint(figure_note, verbose)

    farewell = ('AMiGA completed your request and '
                'wishes you good luck with the analysis!')
    print(tidyMessage(farewell))

    sys.exit()
示例#5
0
    def plotPredictions(self):
        '''
        Visualizes the model tested by a specific hypothesis given the data.

        Takes no arguments; everything is read from the instance:
            self.x_full, self.x_min: inputs for the predictions and for sizing the color list
            self.factor_dict (dictionary): mapping of unique values of variables to integer codes
            self.hypothesis (dictionary): its 'H1' entry is read here
            self.target (list): the first entry is the variable of interest
            self.plate: object holding the real data that is plotted
            self.subtract_control (boolean): passed on to setAxesLabels()
            self.paths_dict (dictionary): 'dir' and 'filename' define where the figure is saved
            self.args (dictionary): 'noise', 'dp', and 'pdo' are read here

        Action:
            saves a plot as PDF file, unless self.args['dp'] is set, in which case it
            returns None without plotting
        '''

        # collect the attributes needed for plotting
        full_design = self.x_full
        min_design = self.x_min
        factors = self.factor_dict
        hypo = self.hypothesis
        target_var = self.target[0]
        growth_plate = self.plate

        control_subtracted = self.subtract_control
        out_dir = self.paths_dict['dir']
        out_name = self.paths_dict['filename']

        # user-accessible plotting parameters from config.py
        params = getHypoPlotParams()  # dict
        spacing = params['tick_spacing']
        legend_position = params['legend']
        font_size = params['fontsize']

        n_posterior = getValue('n_posterior_samples')  # looked up but not used below
        palette = getValue('hypo_colors')  # list of colors

        # turn the confidence level (e.g. 0.95) into 1 - (1 - level) / 2 (e.g. 0.975)
        level = getValue('confidence')
        level = 1 - (1 - level) / 2

        noise = self.args['noise']

        # plotting was disabled by the user
        if self.args['dp']:
            return None

        # mapping of integer codes in design matrix to actual variable labels {codes: values}
        code_to_label = reverseDict(factors[target_var])

        # conditioning variables (computed but not used below)
        cond_variables = list(set(hypo['H1']) - {'Time', target_var})

        # figure aesthetics
        sns.set_style('whitegrid')
        rcParams['font.family'] = 'sans-serif'
        rcParams['font.sans-serif'] = 'Arial'

        # two stacked panels: predictions on top, optional delta OD at the bottom
        fig, ax = plt.subplots(2, 1, figsize=[5, 10.5], sharex=False)

        # one color per row of the minimal design matrix, at most
        coded_values = code_to_label.items()
        line_colors = palette[:min_design.shape[0]]

        # for each unique value of variable of interest, plot real data and MVN prediction
        for (code, label), color in zip(coded_values, line_colors):
            ax[0] = addRealPlotLine(ax[0], growth_plate, {target_var: [label]},
                                    color, params)
            ax[0] = addMVNPlotLine(ax[0], full_design, {target_var: code},
                                   label, level, color, params, noise)
            ax[0].xaxis.set_major_locator(MultipleLocator(spacing))

        # adjust labels and window limits
        ax[0] = setAxesLabels(ax[0], control_subtracted, params)

        # delta OD is only plotted if variable has exactly 2 values and it was requested
        if len(coded_values) == 2 and self.args['pdo']:
            ax[1] = plotDeltaOD(ax[1],
                                self.functional_diff,
                                ylabel=True,
                                xlabel=True,
                                fontsize=font_size)
            ax[1].xaxis.set_major_locator(MultipleLocator(spacing))
            ax[0].set_xlabel('')
        else:
            fig.delaxes(ax[1])

        ax = dynamicWindowAdjustment(ax)

        # save the figure as PDF in the output folder
        figure_pdf = assemblePath(out_dir, out_name, '.pdf')
        plt.subplots_adjust(wspace=0.15, hspace=0.15)
        savePlotWithLegends(ax[0], figure_pdf, legend_position, fontsize=font_size)
示例#6
0
def assembleMappings(data,
                     mapping_path,
                     meta_path=None,
                     save=False,
                     verbose=False):
    '''
    Builds a dictionary of mapping dataframes, one for each data file in the input argument.
        For each data file, in this particular order, it will first (1) check if an individual
        mapping file exists, (2) if not, check if relevant meta-data is provided in meta.txt
        file, (3) if not, infer if plate is a BIOLOG PM based on its file name, and (4) if all
        fail, create a minimalist mapping file.

    Args:
        data (dictionary): keys are file names (i.e. filebases or Plate IDs) and values are
            pandas DataFrames; all columns after the first are treated as well IDs.
        mapping_path (str): path to the mapping folder.
        meta_path (str): path to the meta-data file, passed on to checkMetaText().
        save (boolean): if True, each derived mapping is also written as a tab-delimited file
            at the path given by assembleFullName(mapping_path, '', filebase, '', '.map').
        verbose (boolean)

    Returns:
        df_mapping_dict (dictionary): keys are file names and values are mapping dataframes.
    '''

    # potential mapping file path for every data file to be analyzed
    candidate_files = {
        plate_id: assemblePath(mapping_path, plate_id, '.txt')
        for plate_id in data.keys()
    }

    # read meta.txt and list all plates described by it
    meta_df, meta_df_plates = checkMetaText(meta_path, verbose=verbose)

    df_mapping_dict = {}

    # assemble mapping for one data file at a time
    for plate_id, map_file in candidate_files.items():

        # well IDs of the original data file (may not be A1 ... H12, but most often will be)
        well_ids = data[plate_id].columns[1:]

        # file path for saving derived mapping, if requested
        newfilepath = assembleFullName(mapping_path, '', plate_id, '', '.map')

        if os.path.exists(map_file):

            # (1) user provided a mapping file that corresponds to this data file
            df_mapping = pd.read_csv(map_file,
                                     sep='\t',
                                     header=0,
                                     index_col=0,
                                     dtype={'Plate_ID': str, 'Isolate': str})

            # makes sure Plate_ID is a column
            df_mapping = checkPlateIdColumn(df_mapping, plate_id)

            # strip leading zeros in well names (e.g. A01 --> A1)
            df_mapping.index = [
                well[0] + well[1:].lstrip('0') for well in df_mapping.index
            ]

            note = '{:.<30} Reading {}.'.format(plate_id, map_file)

        elif plate_id in meta_df_plates:

            # (2) user described the file in meta.txt
            meta_info = meta_df[meta_df.Plate_ID == plate_id]
            note = '{:.<30} Found meta-data in meta.txt '.format(plate_id)

            # does meta_df indicate this is a BIOLOG plate
            if isBiologFromMeta(meta_info):
                checkBiologSize(data[plate_id], plate_id)
                df_mapping = expandBiologMetaData(meta_info)
                note += '& seems to be a BIOLOG PM plate.'
            else:
                df_mapping = initKeyFromMeta(meta_info, well_ids)
                note += '& does not seem to be a BIOLOG PM plate.'

        elif isBiologFromName(plate_id):

            # (3) file name suggests a BIOLOG PM plate
            checkBiologSize(data[plate_id], plate_id)
            df_mapping = initBiologPlateKey(plate_id)
            note = ('{:.<30} Did not find mapping file or meta-data '.format(plate_id)
                    + 'BUT seems to be a BIOLOG PM plate.')

        else:

            # (4) fall back on a minimalist mapping
            df_mapping = initMappingDf(plate_id, well_ids)
            note = ('{:.<30} Did not find mapping file or meta-data '.format(plate_id)
                    + '& does not seem to be a BIOLOG PM plate.')

        # report which source was used for this data file
        smartPrint(note, verbose=verbose)

        expanded = expandMappingParams(df_mapping, verbose=verbose)
        df_mapping_dict[plate_id] = expanded

        if save:
            expanded.to_csv(newfilepath, sep='\t', header=True, index=True)

    smartPrint('', verbose=verbose)

    return df_mapping_dict