Пример #1
0
def main(args):
	'''
	Command-line entry point that turns summary file(s) into a clustered heatmap.

	Args:
		args: parsed command-line arguments; this function reads the attributes
			verbose, input, subset and value, and forwards the whole object
			to the helper functions it calls.

	Actions:
		reads the input file(s), passes the table through subsetDf, group, pivot and
			reduceDf (in that order), then hands the result to clusterMap and saveDf.
	'''

	verbose = args.verbose

	# announce the task to the user
	smartPrint('',verbose)
	smartPrint(tidyMessage('AMiGA is peeking inside the summary directory'),verbose)

	# user-requested subsetting criteria (comma-separated on the command line)
	criteria = checkParameterCommand(args.subset,sep=',')

	# the input may be a single file or a folder of files
	directory,filename = isFileOrFolder(args.input,up=1)

	if filename:
		ls_files = ['{}{}{}'.format(directory,os.sep,filename)]
	else:
		ls_files = findPlateReaderFiles(directory,'.txt')

	# full_df is kept untouched because plotting and saving need it alongside the reduced table
	full_df = read(ls_files)

	subsetted = subsetDf(full_df,criteria)
	grouped = group(subsetted,args)
	pivoted = pivot(grouped,args,args.value)
	sub_df = reduceDf(pivoted,args)

	clusterMap(sub_df,full_df,args,directory)
	saveDf(full_df,sub_df,args,directory)
Пример #2
0
def subsetWells(df_mapping_dict, criteria, hypothesis, verbose=False):
    '''
    Mark wells that do not satisfy all user-passed criteria by setting their 'Subset' value to 0.

    Args:
        df_mapping_dict (dictionary): keys are plate IDs and values are mapping pandas.DataFrames,
            which are modified in place
        criteria (dictionary): mapping variables (str) as keys and accepted instances (str) as values
        hypothesis (dictionary): accepted for interface compatibility, not used here
        verbose (boolean)

    Returns:
        df_mapping_dict (dictionary): the same dictionary that was passed in
        msg (str or None): description of the criteria used, or None if no subsetting was requested
    '''

    n_criteria = len(criteria)

    if n_criteria == 0:
        smartPrint('No subsetting was requested.\n', verbose)
        return df_mapping_dict, None

    for mapping_df in df_mapping_dict.values():

        # compare everything as text: number-based criteria would otherwise
        # miss hits because of type mismatch (str vs int/float)
        as_text = mapping_df.astype(str)

        # a well is kept only if every criterion column holds an accepted value
        n_matched = as_text.isin(criteria).sum(axis=1)
        failed = (n_matched != n_criteria).values  # array of booleans
        failed_idx = as_text.index[failed]

        mapping_df.loc[failed_idx, 'Subset'] = [0] * len(failed_idx)

    msg = 'The following criteria were used to subset data:\n' + tidyDictPrint(criteria)

    smartPrint(msg, verbose)

    return df_mapping_dict, msg
Пример #3
0
def pivot(df,args,metric=None):
	'''
	Reshape a long-format table into a y-by-x matrix of a single metric and,
		unless the user asked otherwise, drop rows/columns that have missing values.

	Args:
		df (pandas.DataFrame): long-format table
		args: object with attributes x_variable, y_variable, verbose,
			keep_rows_missing_data and keep_columns_missing_data
		metric (str or None): column holding the values; if None, df is returned unchanged

	Returns:
		df (pandas.DataFrame): pivoted table (or the input table if metric is None)
	'''

	if metric is None:
		return df

	df = pd.pivot(data=df,columns=args.x_variable,index=args.y_variable,values=metric)

	# axis must be passed by keyword: pandas >= 2.0 rejects DataFrame.any(1)
	missing = df.isna()
	rows_todrop = df.index[missing.any(axis=1)]
	cols_todrop = df.columns[missing.any(axis=0)]

	if len(rows_todrop) > 0 or len(cols_todrop) > 0:
		# labels are cast to str because row/column labels are not necessarily strings
		msg = 'User Warning: The heatmap data is missing values. '
		msg += 'Pleae check the data for the following:\n\n'
		msg += 'Columns:\t'
		msg += ', '.join(str(label) for label in cols_todrop) + '\n'
		msg += '\n'
		msg += 'Rows:\t'
		msg += ', '.join(str(label) for label in rows_todrop) + '\n'
		msg += '\nThese variables will be dropped and not plotted unless if you requested that '
		msg += 'they be kept with --keep-rows-missing-data or --keep-columns-missing-data.\n\n'
		smartPrint(msg,args.verbose)

	if not args.keep_rows_missing_data:
		df = df.drop(labels=rows_todrop,axis=0)

	if not args.keep_columns_missing_data:
		df = df.drop(labels=cols_todrop,axis=1)

	return df
Пример #4
0
def main():
    '''
    Command-line entry point that turns summary file(s) into a clustered heatmap.

    Actions:
        parses the command line with parseCommand(), reads the input file(s), passes the
            table through subsetDf, group, pivot and reduceDf (in that order), and hands
            the result to clusterMap.
    '''

    args = parseCommand()
    verbose = args['verbose']

    # announce the task to the user
    smartPrint('', verbose)
    smartPrint(tidyMessage('AMiGA is peeking inside the summary directory'), verbose)

    # user-requested subsetting criteria (comma-separated on the command line)
    criteria = checkParameterCommand(args['s'], sep=',')

    # the input may be a single file or a folder of files
    directory, filename = isFileOrFolder(args['fi'], up=1)

    if filename:
        ls_files = ['{}{}{}'.format(directory, os.sep, filename)]
    else:
        ls_files = findPlateReaderFiles(directory, '.txt')

    # each stage consumes the output of the previous one
    table = read(ls_files)
    table = subsetDf(table, criteria)
    table = group(table, args)
    table = pivot(table, args)
    table = reduceDf(table, args)

    clusterMap(table, args, directory)
Пример #5
0
def expandMappingParams(df,verbose):
    '''
    Expand input data frame to include columns relevant for AMiGA processing of user parameters.
        It will add columns for Group, Control, Flag, and Subset. Note on grouping: plates
        can be divided into multiple groups where each group has its own group-specific
        control wells. A Biolog PM plate has a single group and A1 is its control well.

    Args:
        df (pandas.DataFrame): must have a Plate_ID column; modified in place
        verbose (boolean)

    Returns:
        df (pandas.DataFrame): with Group, Control, Flag, and Subset columns
    '''

    # get dataframe info
    plate_id = df.Plate_ID.unique()[0]
    keys = list(df.keys())
    n_wells = df.shape[0]

    # check if Biolog PM plate
    biolog = isBiologFromName(plate_id)  # True or False

    has_group = 'Group' in keys
    has_control = 'Control' in keys

    if has_control and not has_group:

        df.loc[:,'Control'] = df.Control.fillna(0)
        df.loc[:,'Group'] = [1]*n_wells

    elif has_group and not has_control:

        df.loc[:,'Group'] = df.Group.fillna('NA')
        df.loc[:,'Control'] = [0]*n_wells

    elif not has_group and not has_control:

        # plate can be divided into multiple groups, each group has unique control well(s)
        df.loc[:,'Group'] = [1]*n_wells  # all wells belong to the same group
        df.loc[:,'Control'] = [0]*n_wells  # all wells are treatments

    if biolog:
        df.loc[:,'Control'] = 0
        df.loc['A1','Control'] = 1  # A1 is the control well

    # Missing values are tolerated here (they are filled with 0 below). They are dropped
    # before the check because np.isnan() raises TypeError on non-numeric entries, which
    # used to crash instead of reporting the user error.
    control_values = df.Control.dropna().unique()

    if not all(x in (0,1) for x in control_values):

        msg = '\nUSER ERROR: Values in Control column for mapping '
        msg += 'of {} must be either 0 or 1.\n'.format(plate_id)
        smartPrint(msg,verbose)

        df.loc[:,'Control'] = [0]*n_wells

    # replace na values
    df.loc[:,'Group'] = df.Group.fillna('NA')
    df.loc[:,'Control'] = df.Control.fillna(0)

    # initialize well-specific flag and subset parameters
    df.loc[:,'Flag'] = [0]*n_wells  # by default, no wells are flagged
    df.loc[:,'Subset'] = [1]*n_wells  # by default, all wells are included in analysis

    return df
Пример #6
0
    def model(self, nthin=1, store=False, verbose=False):
        '''
        Fits a GrowthModel to every sample listed in self.key and records the inferred parameters.

        Args:
            nthin (int): forwarded to GrowthModel
            store (boolean): if True, per-sample curve data are kept and saved as self.gp_data
            verbose (boolean): if True, progress is reported for each sample

        Actions:
            modifies self.key (joined with the fitted parameters and merged with the diauxie
                table on 'Sample_ID'), and may create self.gp_data
        '''

        # get user-defined parameters from config.py (value is not used further in this method)
        posterior_n = getValue('n_posterior_samples')

        # containers for per-sample results
        stored_curves = []
        diauxie_by_sample = {}
        gp_params = initParamDf(self.key.index, 0)

        for sample_id in self.key.index:

            pid, well = self.key.loc[sample_id, ['Plate_ID', 'Well']].values

            smartPrint('Fitting {}\t{}'.format(pid, well), verbose)

            # pull out the growth data that belong to this sample
            lookup = self.key.loc[sample_id, ['Well', 'Plate_ID']].to_dict()
            sample = self.extractGrowthData(lookup)

            time_od = sample.time.join(sample.data)
            time_od.columns = ['Time', 'OD']

            # build the model and fit it
            growth_model = GrowthModel(df=time_od,
                                       baseline=sample.key.OD_Baseline.values,
                                       ARD=False,
                                       heteroscedastic=False,
                                       nthin=nthin)

            curve = growth_model.run(name=sample_id)

            # 'df_dx' is split off before the remaining parameters are written as a row
            diauxie_by_sample[sample_id] = curve.params.pop('df_dx')
            gp_params.loc[sample_id, :] = curve.params

            # data are only collected here; they are concatenated after the loop
            if store:
                stored_curves.append(curve.data())

        diauxie_df = mergeDiauxieDfs(diauxie_by_sample)

        # record results in object's key
        self.key = self.key.join(gp_params)
        self.key = pd.merge(self.key, diauxie_df, on='Sample_ID')

        # keep the collected curve data as an object attribute
        if store:
            self.gp_data = pd.concat(stored_curves).reset_index(drop=True)

        return None
Пример #7
0
def readPlateReaderFolder(filename,
                          directory,
                          config,
                          interval_dict={},
                          save=False,
                          verbose=False):
    '''
    Finds, reads, and formats all files in a directory to be AMiGA-compatible.

    Args:
        filename (str or None): str indicates user is interested in a single data file, None otherwise
        directory (dictionary): keys are folder names, values are their paths;
            the 'data' and 'derived' keys are used here
        config (dictionary): variables saved in config.py where key is variable and value is value;
            config['interval'] is the fallback interval
        interval_dict (dictionary): keys are file names (without extension) and values are
            sequences whose first item is the interval for that file. Never mutated here.
        save (boolean): forwarded to readPlateReaderData
        verbose (boolean)

    Returns:
        df_dict (dictionary): keys are file names (without extension), values are the data read
            by readPlateReaderData
    '''

    folderpath = directory['data']
    copydirectory = directory['derived']

    # user may have passed a specific file or a directory to the input argument
    if filename:
        filepaths = ['{}{}{}'.format(folderpath, os.sep, filename)]
    else:
        filepaths = findPlateReaderFiles(folderpath)
    # either way, filepaths must be an iterable list or array

    # read one data file at a time
    df_dict = {}
    for filepath in sorted(filepaths):

        # communicate with user
        smartPrint('Reading {}'.format(filepath), verbose)

        # get extension-free file name (the derived-copy path is not needed here)
        _, filebase, _ = breakDownFilePath(filepath,
                                           copydirectory=copydirectory)

        # set the interval time: a file-specific value wins, otherwise use the configured
        # default. Previously a dictionary lacking this file's key left plate_interval
        # undefined (or silently reused the previous file's value).
        if isinstance(interval_dict, dict) and filebase in interval_dict:
            plate_interval = interval_dict[filebase][0]
        else:
            plate_interval = config['interval']

        # read and adjust file to format: time by wells where first column is time and rest are ODs
        df_dict[filebase] = readPlateReaderData(filepath,
                                                plate_interval,
                                                copydirectory,
                                                save=save)

    smartPrint('', verbose)  # print empty newline, for visual aesthetics only

    return df_dict
Пример #8
0
def main(args):
	'''
	Command-line entry point that normalizes the parameters in each input file.

	Args:
		args: parsed command-line arguments; this function reads the attributes
			verbose, input and over_write, and forwards the whole object
			to normalizeParameters.

	Actions:
		for every input file, reads it, applies normalizeParameters, and writes a
			tab-separated file. The input file is overwritten if over_write is set;
			otherwise the output name gets a 'normalized' suffix.
	'''

	verbose = args.verbose
	overwrite = args.over_write

	smartPrint('',verbose)
	smartPrint(tidyMessage('AMiGA is parsing your file(s)'),verbose)

	# the input may be a single file or a folder of files
	directory,filename = isFileOrFolder(args.input,up=1)

	if filename:
		ls_files = ['{}{}{}'.format(directory,os.sep,filename)]
	else:
		ls_files = findPlateReaderFiles(directory,'.txt')

	for lf in ls_files:

		# read only the current file: passing the whole list here made every
		# output file contain the data of all input files
		df = read([lf])

		if overwrite:
			new_name = lf
		elif lf.endswith('.txt'):
			new_name = '{}_normalized.txt'.format(lf[:-4])
		else:
			new_name = '{}.normalized.txt'.format(lf)

		df = normalizeParameters(args,df)
		df.to_csv(new_name,sep='\t',header=True,index=True)

	smartPrint('',verbose)
	smartPrint(tidyMessage('AMiGA compelted your request'),verbose)
Пример #9
0
def checkMetaText(filepath, verbose=False):
    '''
    Parses meta.txt file into a pandas.DataFrame.

    Args:
        filepath (str or None): path to meta.txt, must be tab-delimited with a "Plate_ID"
            column (i.e. file name)
        verbose (boolean)

    Returns:
        df_meta (pandas.DataFrame): contents of meta.txt, or an empty DataFrame if the file
            is missing
        df_meta_plates (numpy.ndarray or list): values of the Plate_ID column, or an empty list
            if the file is missing or has no Plate_ID column
    '''

    exists = filepath is not None and os.path.exists(filepath)

    if exists:
        df_meta = pd.read_csv(filepath,
                              sep='\t',
                              header=0,
                              index_col=None,
                              dtype={
                                  'Plate_ID': str,
                                  'Isolate': str
                              })
    else:
        # return an actual (empty) DataFrame instance, not the DataFrame class
        df_meta = pd.DataFrame()

    # which plates were characterized in meta.txt?
    if 'Plate_ID' in df_meta.columns:
        df_meta_plates = df_meta['Plate_ID'].values
    else:
        df_meta_plates = []

    # neatly prints meta.txt to terminal
    if exists:
        tab = tabulate(df_meta, headers='keys', tablefmt='psql')
        msg = '{:.<21}{}\n{}\n'.format('Meta-Data file is', filepath, tab)
        smartPrint(msg, verbose)
    else:
        smartPrint('No meta.txt file found\n', verbose)

    return df_meta, df_meta_plates
Пример #10
0
def interpretParameters(files, args, verbose=False):
    '''
    Initializes the 'interval', 'subset', 'flag', and 'hypo' parameters from their
        parameter files and/or command-line arguments, and reports them to the user.

    Args:
        files (dict): keys are parameter names and values are file paths
        args (dict): keys are parameter names and values are corresponding command-line
            arguments; args['merges'] is set to True in place if subsetting was requested
        verbose (boolean)

    Returns:
        params_dict (dict): keys are parameter names and values are what
            initializeParameter returned for them
        args (dict): the (possibly updated) input dictionary
    '''

    # params_settings defines parameters for interpreting parameter files or argument calls
    #
    #     each item is a 3-tuple where:
    #     1. parameter name which matches both input argument and text file name (str)
    #     2. delimiter that separates values of a variable of the parameter (str)
    #     3. integerize: whether to convert values of a variable to integers (boolean)
    #
    # the 'hypo' delimiter is a regular expression, hence the raw string
    # ('\+' in a plain string is an invalid escape sequence)

    params_settings = [('interval', ',', True), ('subset', ',', False),
                       ('flag', ',', False), ('hypo', r'\+|,', False)]

    # initialize all parameters based on their settings
    params_dict = {
        pp: initializeParameter(files[pp],
                                args[pp],
                                sep=sep,
                                integerize=integerize)
        for pp, sep, integerize in params_settings
    }

    smartPrint(tidyDictPrint(params_dict), verbose)

    # if user requests any subsetting, summary results must be merged
    if params_dict['subset']:

        args['merges'] = True

        msg = 'WARNING: Because user has requested subsetting of data, '
        msg += 'results will be merged into single summary and/or data file.\n'
        smartPrint(msg, verbose)

    return params_dict, args
Пример #11
0
def flagWells(df, flags, verbose=False, drop=False):
    '''
    Transfers plate-well-specific flags from the user into the mapping dataframes.

    Args:
        df (dictionary): keys are Plate_IDs and values are mapping pandas.DataFrames
            with a 'Flag' column; modified in place
        flags (dictionary): Plate_IDs (str) as keys and lists of wells (str) as values
        verbose (boolean)
        drop (boolean): if True, flagged wells are removed from each affected mapping

    Returns:
        df (dictionary): the same dictionary that was passed in
    '''

    if len(flags) == 0:
        smartPrint('No wells were flagged.\n', verbose)
        return df

    for plate, wells in flags.items():

        # flags for plates that are not being analyzed are ignored
        if plate not in df.keys():
            continue

        mapping = df[plate]
        mapping.loc[wells, 'Flag'] = [1] * len(wells)

        if drop:
            df[plate] = mapping[mapping.Flag == 0]

    smartPrint('The following flags were detected:\n', verbose)
    smartPrint(tidyDictPrint(flags), verbose)

    return df
Пример #12
0
def main(args):
    '''
    Command-line entry point that adds confidence intervals to each input file.

    Args:
        args: parsed command-line arguments; this function reads the attributes verbose,
            input, over_write, confidence (a percentage), include_noise, and
            type ('Parameters' or 'Curves').

    Actions:
        for every input file, reads it as a tab-separated table, computes confidence
            intervals with get_parameter_confidence or get_curve_confidence, and writes
            the result. The input file is overwritten if over_write is set; otherwise
            the output name gets a '_confidence.txt' suffix.
    '''

    verbose = args.verbose
    overwrite = args.over_write
    add_noise = args.include_noise

    # convert the two-sided confidence level (in percent) into an upper-tail probability,
    # e.g. 95 --> 0.975
    confidence = float(args.confidence) / 100.0
    z_value = 1 - (1 - confidence) / 2

    smartPrint('', verbose)
    smartPrint(tidyMessage('AMiGA is parsing your file(s)'), verbose)

    directory, filename = isFileOrFolder(args.input, up=1)

    # package filename(s) into a list. For a folder input, look up the files inside it
    # as the sibling entry points do; iterating over the empty filename processed nothing.
    if filename:
        ls_files = ['{}{}{}'.format(directory, os.sep, filename)]
    else:
        ls_files = findPlateReaderFiles(directory, '.txt')

    for lf in ls_files:

        df = pd.read_csv(lf, sep='\t', header=0, index_col=0)

        # define file name for the updated dataframe
        if overwrite:
            new_name = lf
        elif lf.endswith('.txt'):
            new_name = '{}_confidence.txt'.format(lf[:-4])
        else:
            new_name = '{}_confidence.txt'.format(lf)

        # compute confidence intervals and save results
        if args.type == 'Parameters':
            df = get_parameter_confidence(df, z_value)
            df.to_csv(new_name, sep='\t', header=True, index=True)
        elif args.type == 'Curves':
            df = get_curve_confidence(df, z_value, add_noise)
            df.to_csv(new_name, sep='\t', header=True, index=False)
Пример #13
0
def assembleMappings(data,
                     mapping_path,
                     meta_path=None,
                     save=False,
                     verbose=False):
    '''
    Builds one mapping table per data file. For each data file, in this particular order,
        it will (1) read an individual mapping file if one exists, (2) otherwise use the
        meta-data in meta.txt if the plate is described there, (3) otherwise infer from the
        file name whether the plate is a BIOLOG PM plate, and (4) if all fail, create a
        minimalist mapping.

    Args:
        data (dictionary): keys are file names (i.e. filebases or Plate IDs) and values are
            pandas DataFrames; all column labels except the first are used as well IDs.
        mapping_path (str): path to the mapping folder.
        meta_path (str): path to the meta.txt file.
        save (boolean): if True, each assembled mapping is written as a tab-separated file.
        verbose (boolean)

    Returns:
        df_mapping_dict (dictionary): keys are file names and values are mapping DataFrames
            as returned by expandMappingParams.
    '''

    # all data files to be analyzed, and where their mapping files would be
    filebases = data.keys()
    candidate_paths = [assemblePath(mapping_path, fb, '.txt') for fb in filebases]

    # read meta.txt and list all plates described by it
    meta_df, meta_df_plates = checkMetaText(meta_path, verbose=verbose)

    df_mapping_dict = {}

    # assemble mapping for one data file at a time
    for filebase, candidate in zip(filebases, candidate_paths):

        # well IDs come from the data file; may not be A1 ... H12, but most often will be
        well_ids = data[filebase].columns[1:]

        # file path for saving derived mapping, if requested
        derived_path = assembleFullName(mapping_path, '', filebase, '', '.map')

        if os.path.exists(candidate):

            # user provided a mapping file that corresponds to this data file
            df_mapping = pd.read_csv(candidate,
                                     sep='\t',
                                     header=0,
                                     index_col=0,
                                     dtype={'Plate_ID': str, 'Isolate': str})

            # makes sure Plate_ID is a column
            df_mapping = checkPlateIdColumn(df_mapping, filebase)

            # strip leading zeros in well names
            df_mapping.index = [
                well[0] + well[1:].lstrip('0') for well in df_mapping.index
            ]

            msg = '{:.<30} Reading {}.'.format(filebase, candidate)

        elif filebase in meta_df_plates:

            # user described the file in meta.txt
            meta_info = meta_df[meta_df.Plate_ID == filebase]
            msg = '{:.<30} Found meta-data in meta.txt '.format(filebase)

            # does meta_df indicate this is a BIOLOG plate
            if isBiologFromMeta(meta_info):
                checkBiologSize(data[filebase], filebase)
                df_mapping = expandBiologMetaData(meta_info)
                msg += '& seems to be a BIOLOG PM plate.'
            else:
                df_mapping = initKeyFromMeta(meta_info, well_ids)
                msg += '& does not seem to be a BIOLOG PM plate.'

        elif isBiologFromName(filebase):

            checkBiologSize(data[filebase], filebase)
            df_mapping = initBiologPlateKey(filebase)
            msg = '{:.<30} Did not find mapping file or meta-data '.format(filebase)
            msg += 'BUT seems to be a BIOLOG PM plate.'

        else:

            df_mapping = initMappingDf(filebase, well_ids)
            msg = '{:.<30} Did not find mapping file or meta-data '.format(filebase)
            msg += '& does not seem to be a BIOLOG PM plate.'

        # tell the user which source was used for this plate
        smartPrint(msg, verbose=verbose)

        expanded = expandMappingParams(df_mapping, verbose=verbose)
        df_mapping_dict[filebase] = expanded

        if save:
            expanded.to_csv(derived_path, sep='\t', header=True, index=True)

    smartPrint('', verbose=verbose)

    return df_mapping_dict
Пример #14
0
def readPlateReaderFolder(filename=None,
                          directory=None,
                          interval=dict(),
                          save=False,
                          verbose=False):
    '''
    Finds, reads, and formats all files in a directory to be AMiGA-compatible.

    Args:
        filename (str or None):
            if str and directory is None: path to a single data file to be read.
            if str and directory is given: name of a single data file inside the data folder.
            if None: user is interested in reading one or more data files (so user must pass directory argument).
        directory (dictionary or str or None):
            if dictionary: Keys are folder names, values are their paths. Keys must include 'data' and 'derived'
                'data' sub-folder must exist and house one or more data files to be read.
            if str: path to folder that houses one or more data files to be read.
            if None: user is interested in reading a single data file (so user must pass filename argument).
        interval (dictionary or numeric):
            if numeric: must be int or float.
            if dictionary: Keys are file names, values are their respective interval parameter, e.g.
                {'CD2015_PM1-1':600,'CD2048_PM1-1':900}). If a filename does not have a corresponding key in the
                dictionary, the default parameter for 'interval' in the 'config' dictionary will be used.
                The dictionary is never mutated here.
        save (boolean): forwarded to readPlateReaderData.
        verbose (boolean)

    Returns:
        df_dict (dictionary): keys are file names (without extension), values are the data read
            by readPlateReaderData
    '''

    if (filename is None) and (directory is None):
        sys.exit(
            'FATAL USER ERROR: User must pass either a filename or a directory argument'
        )

    # what is the data folder (folderpath) and where to save formatted data (copydirectory)?
    if isinstance(directory, dict):
        folderpath = directory['data']
        copydirectory = directory['derived']
    elif isinstance(directory, str):
        copydirectory = folderpath = directory
    elif directory is None:
        copydirectory = folderpath = os.path.dirname(filename)

    # user may have passed a specific file or a directory to the input argument
    if filename and directory is None:
        # filename is already a path; prepending its own folder again would
        # produce a doubled (or root-anchored) path that does not exist
        filepaths = [filename]
    elif filename:
        filepaths = ['{}{}{}'.format(folderpath, os.sep, filename)]
    else:
        filepaths = findPlateReaderFiles(folderpath)
    # either way, filepaths must be an iterable list or array

    # read one data file at a time
    df_dict = {}
    for filepath in sorted(filepaths):

        # communicate with user
        smartPrint('Reading {}'.format(filepath), verbose)

        # get extension-free file name (the derived-copy path is not needed here)
        _, filebase, _ = breakDownFilePath(filepath,
                                           copydirectory=copydirectory)

        # set the interval time
        if isinstance(interval, (int, float)):
            plate_interval = float(interval)
        elif filebase in interval.keys():
            plate_interval = interval[filebase]
        else:
            # NOTE(review): 'config' is not a parameter of this function, so it must be
            # available at module level; confirm it is defined in the enclosing file
            plate_interval = config['interval']

        # read and adjust file to format: time by wells where first column is time and rest are ODs
        df_dict[filebase] = readPlateReaderData(filepath,
                                                plate_interval,
                                                copydirectory,
                                                save=save)

    smartPrint('', verbose)  # print empty newline, for visual aesthetics only

    return df_dict