def main(args):
    '''Entry point: read summary file(s), then subset, group, pivot, and
    reduce the data before rendering a clustered heatmap and saving results.

    Args:
        args (argparse.Namespace): must provide verbose, input, subset,
            value, and the options consumed by group/pivot/reduceDf/clusterMap.
    '''

    verbose = args.verbose
    directory = args.input

    # greet the user
    smartPrint('', verbose)
    smartPrint(tidyMessage('AMiGA is peeking inside the summary directory'), verbose)

    # parse user-requested subsetting criteria, e.g. variable:value pairs
    criteria = checkParameterCommand(args.subset, sep=',')

    # input may point at a single file or at a folder of files
    directory, filename = isFileOrFolder(directory, up=1)
    if filename:
        ls_files = ['{}{}{}'.format(directory, os.sep, filename)]
    else:
        ls_files = findPlateReaderFiles(directory, '.txt')

    full_df = read(ls_files)

    # subset, group, reshape, and trim the data before plotting
    sub_df = subsetDf(full_df, criteria)
    sub_df = group(sub_df, args)
    sub_df = pivot(sub_df, args, args.value)
    sub_df = reduceDf(sub_df, args)

    clusterMap(sub_df, full_df, args, directory)
    saveDf(full_df, sub_df, args, directory)
def subsetWells(df_mapping_dict, criteria, hypothesis, verbose=False):
    '''
    Tag wells that do not meet user-passed criteria with Subset = 0.

    Args:
        df_mapping_dict (dict): Plate_IDs (str) as keys, mapping
            pandas.DataFrames (indexed by Well) as values.
        criteria (dict): mapping variables (str) as keys and accepted
            instances (str) as values.
        hypothesis (dict): accepted for interface compatibility; not read here.
        verbose (boolean)

    Returns:
        (dict, str or None): updated mapping dictionary and a message
            describing the applied criteria (None when nothing was requested).
    '''

    if len(criteria) == 0:
        smartPrint('No subsetting was requested.\n', verbose)
        return df_mapping_dict, None

    for plate_id, mapping_df in df_mapping_dict.items():
        # compare everything as strings: number-based criteria would otherwise
        # miss hits due to type mismatch (str vs int/float)
        as_str = mapping_df.astype(str)
        keep = (as_str.isin(criteria).sum(1) == len(criteria)).values
        dropped_idx = as_str.index[~keep]
        mapping_df.loc[dropped_idx, 'Subset'] = [0] * len(dropped_idx)

    msg = 'The following criteria were used to subset data:\n'
    msg += tidyDictPrint(criteria)
    smartPrint(msg, verbose)

    return df_mapping_dict, msg
def pivot(df, args, metric=None):
    '''
    Reshape a long-format summary DataFrame into a wide x-by-y grid.

    Args:
        df (pandas.DataFrame): long-format data.
        args: must provide x_variable, y_variable, verbose,
            keep_rows_missing_data, and keep_columns_missing_data attributes.
        metric (str or None): column whose values fill the grid; if None,
            df is returned unchanged.

    Returns:
        pandas.DataFrame: pivoted table; rows/columns containing missing
            values are dropped unless the user asked to keep them.
    '''

    if metric is None:
        return df

    df = pd.pivot(data=df, columns=args.x_variable, index=args.y_variable, values=metric)

    # locate rows and columns that contain any missing entries
    # (axis passed by keyword: the positional form was removed in pandas 2.0)
    rows_todrop = df.index.values[np.where(df.isna().any(axis=1))[0]]
    cols_todrop = df.keys().values[np.where(df.isna().any(axis=0))[0]]

    if len(rows_todrop) > 0 or len(cols_todrop) > 0:
        # labels may be numeric, so stringify before joining into the warning
        msg = 'User Warning: The heatmap data is missing values. '
        msg += 'Please check the data for the following:\n\n'
        msg += 'Columns:\t'
        msg += ', '.join(map(str, cols_todrop)) + '\n'
        msg += '\n'
        msg += 'Rows:\t'
        msg += ', '.join(map(str, rows_todrop)) + '\n'
        msg += '\nThese variables will be dropped and not plotted unless if you requested that '
        msg += 'they be kept with --keep-rows-missing-data or --keep-columns-missing-data.\n\n'
        smartPrint(msg, args.verbose)

    # dropping an empty label list is a no-op, so these are safe unconditionally
    if not args.keep_rows_missing_data:
        df = df.drop(labels=rows_todrop, axis=0)
    if not args.keep_columns_missing_data:
        df = df.drop(labels=cols_todrop, axis=1)

    return df
def main():
    '''Entry point: read summary file(s), subset, group, and reduce the data,
    then render a clustered heatmap.

    Reads its options from parseCommand() as a dictionary ('verbose', 'fi',
    's', plus whatever group/reduceDf/clusterMap consume).
    '''

    args = parseCommand()
    verbose = args['verbose']
    directory = args['fi']

    # greet the user
    smartPrint('', verbose)
    smartPrint(tidyMessage('AMiGA is peeking inside the summary directory'), verbose)

    # parse user-requested subsetting criteria
    criteria = checkParameterCommand(args['s'], sep=',')

    # input may point at a single file or at a folder of files
    directory, filename = isFileOrFolder(directory, up=1)
    if filename:
        ls_files = ['{}{}{}'.format(directory, os.sep, filename)]
    else:
        ls_files = findPlateReaderFiles(directory, '.txt')

    df = read(ls_files)
    df = subsetDf(df, criteria)
    df = group(df, args)
    df = pivot(df, args)  # no metric argument: pivot returns df unchanged
    df = reduceDf(df, args)
    clusterMap(df, args, directory)
def expandMappingParams(df, verbose):
    '''
    Expand input data frame to include columns relevant for AMiGA processing
    of user parameters. It will add columns for Group, Control, Flag, and Subset.

    Note on grouping: plates can be divided into multiple groups where each
    group has its own group-specific control wells. A Biolog PM plate has a
    single group and A1 is its control well.

    Args:
        df (pandas.DataFrame): mapping for a single plate, indexed by Well.
        verbose (boolean)

    Returns:
        df (pandas.DataFrame): with four additional columns
            (Group, Control, Flag, Subset).
    '''

    # get dataframe info (fixed: the assignment was accidentally duplicated)
    Plate_ID = df.Plate_ID.unique()[0]
    keys = list(df.keys())

    # check if Biolog PM plate
    biolog = isBiologFromName(Plate_ID)  # True or False

    if ('Control' in keys) and ('Group' not in keys):
        df.loc[:, 'Control'] = df.Control.fillna(0)
        df.loc[:, 'Group'] = [1] * df.shape[0]

    if ('Group' in keys) and ('Control' not in keys):
        df.loc[:, 'Group'] = df.Group.fillna('NA')
        df.loc[:, 'Control'] = [0] * df.shape[0]

    if ('Group' not in keys) and ('Control' not in keys):
        # plate can be divided into multiple groups, each group has its own
        # unique control well(s)
        df.loc[:, 'Group'] = [1] * df.shape[0]    # all wells in a BIOLOG PM plate belong to same group
        df.loc[:, 'Control'] = [0] * df.shape[0]  # all wells except A1 are treatments

        if biolog:
            df.loc[:, 'Control'] = 0     # default every well to treatment ...
            df.loc['A1', 'Control'] = 1  # ... then mark A1 as the control well

    # Control entries must be binary (0/1); otherwise warn and reset the column.
    # NOTE(review): np.isnan raises on non-numeric values — assumes Control is
    # numeric or NaN at this point; confirm upstream parsing guarantees this.
    if not all(x in [0., 1.] or np.isnan(x) for x in df.Control.unique()):
        msg = '\nUSER ERROR: Values in Control column for mapping '
        msg += 'of {} must be either 0 or 1.\n'.format(Plate_ID)
        smartPrint(msg, verbose)
        df.loc[:, 'Control'] = [0] * df.shape[0]

    # replace na values
    df.loc[:, 'Group'] = df.Group.fillna('NA')
    df.loc[:, 'Control'] = df.Control.fillna(0)

    # initialize well-specific flag and subset parameters
    df.loc[:, 'Flag'] = [0] * df.shape[0]    # by default, no wells are flagged
    df.loc[:, 'Subset'] = [1] * df.shape[0]  # by default, all wells are included in analysis

    return df
def model(self, nthin=1, store=False, verbose=False): ''' Infers growth parameters of interest (including diauxic shifts) by Gaussian Process fitting of data. Args: store (boolean): if True, certain data will be store as object's attributes diauxie (float): ratio of peak height (relative to maximum) used to call if diauxie occured or not Actions: modifies self.key, and may create self.latent and self.dlatent_dt objects ''' # get user-defined parameters from config.py posterior_n = getValue('n_posterior_samples') # initialize variables for storing parameters and data data_ls, diauxie_dict = [], {} gp_params = initParamDf(self.key.index, 0) for sample_id in self.key.index: pid, well = self.key.loc[sample_id, ['Plate_ID', 'Well']].values smartPrint('Fitting {}\t{}'.format(pid, well), verbose) # extract sample args_dict = self.key.loc[sample_id, ['Well', 'Plate_ID']].to_dict() sample = self.extractGrowthData(args_dict) df = sample.time.join(sample.data) df.columns = ['Time', 'OD'] # create GP object and analyze gm = GrowthModel(df=df, baseline=sample.key.OD_Baseline.values, ARD=False, heteroscedastic=False, nthin=nthin) curve = gm.run(name=sample_id) diauxie_dict[sample_id] = curve.params.pop('df_dx') gp_params.loc[sample_id, :] = curve.params # passively save data, manipulation occurs below (input OD, GP fit, & GP derivative) if store: data_ls.append(curve.data()) diauxie_df = mergeDiauxieDfs(diauxie_dict) # record results in object's key self.key = self.key.join(gp_params) self.key = pd.merge(self.key, diauxie_df, on='Sample_ID') # plotting needs transformed (or real) OD & GP fit, & may need GP derivative, save all as obejct attributes if store: self.gp_data = pd.concat(data_ls).reset_index(drop=True) return None
def readPlateReaderFolder(filename, directory, config, interval_dict=None, save=False, verbose=False):
    '''
    Finds, reads, and formats all files in a directory to be AMiGA-compatible.

    Args:
        filename (str or None): str indicates user is interested in a single data file, None otherwise
        directory (dictionary): keys are folder names, values are their paths
        config (dictionary): variables saved in config.py where key is variable and value is value
        interval_dict (dictionary or None): maps file base names to their
            measurement interval; None is treated as an empty mapping.
        save (boolean): save an AMiGA-formatted copy of each file
        verbose (boolean)

    Returns:
        df_dict (dictionary): keys are file base names, values are pandas.DataFrames
    '''

    # avoid the mutable-default-argument pitfall: None means "no user intervals"
    if interval_dict is None:
        interval_dict = {}

    folderpath = directory['data']
    copydirectory = directory['derived']

    # user may have passed a specific file or a directory to the input argument
    if filename:
        filepaths = ['{}{}{}'.format(folderpath, os.sep, filename)]
    else:
        filepaths = findPlateReaderFiles(folderpath)
    # either way, filepaths must be an iterable list or array

    # read one data file at a time
    df_dict = {}
    for filepath in sorted(filepaths):

        # communicate with user
        smartPrint('Reading {}'.format(filepath), verbose)

        # get extension-free file name and path for derived copy
        _, filebase, newfilepath = breakDownFilePath(filepath, copydirectory=copydirectory)

        # set the interval time: a user-defined per-plate value wins, otherwise
        # fall back to the config default (previously, a non-dict interval_dict
        # left plate_interval undefined and crashed below)
        if isinstance(interval_dict, dict) and filebase in interval_dict:
            plate_interval = interval_dict[filebase][0]
        else:
            plate_interval = config['interval']

        # read and adjust file to format: time by wells where first column is time and rest are ODs
        df = readPlateReaderData(filepath, plate_interval, copydirectory, save=save)
        df_dict[filebase] = df  # ..iloc[nskip:,:]

    smartPrint('', verbose)  # print empty newline, for visual aesthetics only

    return df_dict
def main(args):
    '''Entry point: normalize growth parameters in each summary file.

    Reads each input file individually, normalizes its parameters, and writes
    the result in place (when --over-write is set) or to a *_normalized.txt
    sibling file.

    Args:
        args (argparse.Namespace): must provide verbose, input, over_write,
            and the options consumed by normalizeParameters.
    '''

    verbose = args.verbose
    directory = args.input
    overwrite = args.over_write

    # greet the user
    smartPrint('', verbose)
    smartPrint(tidyMessage('AMiGA is parsing your file(s)'), verbose)

    # input may point at a single file or at a folder of files
    directory, filename = isFileOrFolder(directory, up=1)
    if filename:
        ls_files = ['{}{}{}'.format(directory, os.sep, filename)]
    else:
        ls_files = findPlateReaderFiles(directory, '.txt')

    for lf in ls_files:

        # read only the current file (previously re-read ALL files on every
        # pass, so each output was named after one file but held merged data)
        df = read([lf])

        # define file name for the updated dataframe
        if overwrite:
            new_name = lf
        elif lf.endswith('.txt'):
            new_name = '{}_normalized.txt'.format(lf[:-4])
        else:
            new_name = '{}.normalized.txt'.format(lf)

        df = normalizeParameters(args, df)
        df.to_csv(new_name, sep='\t', header=True, index=True)

    # say good-bye to the user (fixed 'compelted' typo in the message)
    smartPrint('', verbose)
    smartPrint(tidyMessage('AMiGA completed your request'), verbose)
def checkMetaText(filepath, verbose=False):
    '''
    Parses meta.txt file into a pandas.DataFrame.

    Args:
        filepath (str or None): path to meta.txt, must be tab-delimited with
            first column as "Plate_ID" (i.e. file name); None if no file.
        verbose (boolean)

    Returns:
        df_meta (pandas.DataFrame): empty DataFrame when meta.txt is missing.
        df_meta_plates (list): Plate_IDs described in meta.txt.
    '''

    exists = False if filepath is None else os.path.exists(filepath)

    if not exists:
        # bug fix: previously bound the DataFrame CLASS (pd.DataFrame) rather
        # than an empty instance, which broke any caller using it as a frame
        df_meta = pd.DataFrame()
    else:
        df_meta = pd.read_csv(filepath, sep='\t', header=0, index_col=None,
                              dtype={'Plate_ID': str, 'Isolate': str})

    # which plates were characterized in meta.txt?
    try:
        df_meta_plates = df_meta.Plate_ID.values
    except AttributeError:  # e.g. file lacks a Plate_ID column, or no file
        df_meta_plates = []

    # neatly prints meta.txt to terminal
    if exists:
        tab = tabulate(df_meta, headers='keys', tablefmt='psql')
        msg = '{:.<21}{}\n{}\n'.format('Meta-Data file is', filepath, tab)
        smartPrint(msg, verbose)
    else:
        smartPrint('No meta.txt file found\n', verbose)

    return df_meta, df_meta_plates
def interpretParameters(files, args, verbose=False):
    '''
    Checks specific directories for their existence and makes sure data was
    provided by user. It will also compose and can print a message that can
    communicate with user the results of this validation.

    Args:
        files (dict): keys are parameter names and values are file paths
        args (dict): keys are parameter names and values are corresponding command-line arguments

    Returns:
        params_dict (dict): keys are variables and values are instances
        args (dict): possibly updated with 'merges' set to True
    '''

    # each setting is a 3-tuple: (parameter name, value delimiter, cast to int?)
    # the parameter name matches both the input argument and the text file name
    params_settings = [
        ('interval', ',', True),
        ('subset', ',', False),
        ('flag', ',', False),
        ('hypo', '\+|,', False),
    ]

    # initialize all parameters based on their settings
    params_dict = {
        name: initializeParameter(files[name], args[name], sep=sep, integerize=to_int)
        for name, sep, to_int in params_settings
    }

    smartPrint(tidyDictPrint(params_dict), verbose)

    # if user requests any subsetting, summary results must be merged
    if params_dict['subset']:
        args['merges'] = True
        msg = 'WARNING: Because user has requested subsetting of data, '
        msg += 'results will be merged into single summary and/or data file.\n'
        smartPrint(msg, verbose)

    return params_dict, args
def flagWells(df, flags, verbose=False, drop=False):
    '''
    Passes plate-well-specific flags from user into mapping dataframes.

    Args:
        df (dictionary of pandas.DataFrame): must have Plate_IDs and Well as columns
        flags (dictionary): Plate_IDs (str) as keys and Wells (str) as values
        verbose (boolean)
        drop (boolean): if True, drop flagged wells from the mapping

    Returns:
        df (dictionary of pandas.DataFrame)
    '''

    if len(flags) == 0:
        smartPrint('No wells were flagged.\n', verbose)
        return df

    for plate_id, well_ids in flags.items():
        # silently skip plates that are not part of the current analysis
        if plate_id not in df.keys():
            continue
        df[plate_id].loc[well_ids, 'Flag'] = [1] * len(well_ids)
        if drop:
            df[plate_id] = df[plate_id][df[plate_id].Flag == 0]

    smartPrint('The following flags were detected:\n', verbose)
    smartPrint(tidyDictPrint(flags), verbose)

    return df
def main(args):
    '''Entry point: compute confidence intervals for parameters or curves.

    Reads each input file and writes the result to a *_confidence.txt sibling
    file (or overwrites in place when --over-write is set).

    Args:
        args (argparse.Namespace): must provide verbose, input, over_write,
            confidence, include_noise, and type ('Parameters' or 'Curves').
    '''

    verbose = args.verbose
    directory = args.input
    overwrite = args.over_write

    # convert the user's confidence level (e.g. 95) into the two-sided
    # quantile probability used downstream (e.g. 0.975)
    confidence = float(args.confidence) / 100.0
    z_value = (1 - (1 - confidence) / 2)

    add_noise = args.include_noise

    msg = 'AMiGA is parsing your file(s)'
    smartPrint('', verbose)
    smartPrint(tidyMessage(msg), verbose)

    directory, filename = isFileOrFolder(directory, up=1)

    # package filename(s) into a list
    if filename:
        ls_files = ['{}{}{}'.format(directory, os.sep, filename)]
    else:
        # bug fix: ls_files was previously set to filename (None here), which
        # crashed the loop below whenever a folder was passed as input
        ls_files = findPlateReaderFiles(directory, '.txt')

    for lf in ls_files:

        df = pd.read_csv(lf, sep='\t', header=0, index_col=0)

        # define file name for the updated dataframe
        if overwrite:
            new_name = lf
        elif lf.endswith('.txt'):
            new_name = '{}_confidence.txt'.format(lf[:-4])
        else:
            new_name = '{}_confidence.txt'.format(lf)

        # compute confidence intervals and save results
        if args.type == 'Parameters':
            df = get_parameter_confidence(df, z_value)
            df.to_csv(new_name, sep='\t', header=True, index=True)
        elif args.type == 'Curves':
            df = get_curve_confidence(df, z_value, add_noise)
            df.to_csv(new_name, sep='\t', header=True, index=False)
def assembleMappings(data, mapping_path, meta_path=None, save=False, verbose=False):
    '''
    Creates a master mapping dictionary for all data files in the input
    argument. For each data file, in this particular order, it will first
    (1) check if an individual mapping file exists, (2) if not, check if
    relevant meta-data is provided in meta.txt file, (3) if not, infer if
    plate is a BIOLOG PM based on its file name, and (4) if all fail, create
    a minimalist mapping file.

    Args:
        data (dictionary): keys are file names (i.e. filebases or Plate IDs)
            and values are pandas DataFrames where index column (row names)
            are well IDs.
        mapping_path (str): path to the mapping folder.
        meta_path (str): path to the meta.txt file.
        save (boolean): if True, write each derived mapping to a .map file.
        verbose (boolean)

    Returns:
        df_mapping_dict (dictionary): keys are file names and values are
            mapping DataFrames (one per plate).
    '''

    df_mapping_dict = {}

    # list all data files to be analyzed
    list_filebases = data.keys()

    # list all potential mapping file paths
    list_mapping_files = [
        assemblePath(mapping_path, ii, '.txt') for ii in list_filebases
    ]

    # read meta.txt and list all plates described by it
    meta_df, meta_df_plates = checkMetaText(meta_path, verbose=verbose)

    # assemble mapping for one data file at a time
    for filebase, mapping_file_path in zip(list_filebases, list_mapping_files):

        # what are the row names from the original data file
        # (may not be A1 ... H12, but most often will be)
        well_ids = data[filebase].columns[1:]

        # create file path for saving derived mapping, if requested
        newfilepath = assembleFullName(mapping_path, '', filebase, '', '.map')

        # (1) see if user provided a mapping file that corresponds to this data file (filebase)
        if os.path.exists(mapping_file_path):

            df_mapping = pd.read_csv(mapping_file_path, sep='\t', header=0,
                                     index_col=0, dtype={
                                         'Plate_ID': str,
                                         'Isolate': str
                                     })
            df_mapping = checkPlateIdColumn(
                df_mapping, filebase)  # makes sure Plate_ID is a column
            # strip leading zeros in well names (e.g. A01 -> A1)
            df_mapping.index = [
                ii[0] + ii[1:].lstrip('0') for ii in df_mapping.index
            ]
            smartPrint('{:.<30} Reading {}.'.format(filebase, mapping_file_path),
                       verbose=verbose)

        # (2) see if user described the file in meta.txt
        elif filebase in meta_df_plates:

            meta_info = meta_df[meta_df.Plate_ID == filebase]
            msg = '{:.<30} Found meta-data in meta.txt '.format(filebase)

            biolog = isBiologFromMeta(
                meta_info)  # does meta_df indicate this is a BIOLOG plate

            if biolog:
                checkBiologSize(data[filebase], filebase)
                df_mapping = expandBiologMetaData(meta_info)
                msg += '& seems to be a BIOLOG PM plate.'
                smartPrint(msg, verbose=verbose)
            else:
                df_mapping = initKeyFromMeta(meta_info, well_ids)
                msg += '& does not seem to be a BIOLOG PM plate.'
                smartPrint(msg, verbose=verbose)

        # (3) infer BIOLOG PM plate from the file name alone
        elif isBiologFromName(filebase):
            checkBiologSize(data[filebase], filebase)
            df_mapping = initBiologPlateKey(filebase)
            msg = '{:.<30} Did not find mapping file or meta-data '.format(
                filebase)
            msg += 'BUT seems to be a BIOLOG PM plate.'
            smartPrint(msg, verbose=verbose)

        # (4) fall back to a minimalist mapping
        else:
            df_mapping = initMappingDf(filebase, well_ids)
            msg = '{:.<30} Did not find mapping file or meta-data '.format(
                filebase)
            msg += '& does not seem to be a BIOLOG PM plate.'
            smartPrint(msg, verbose=verbose)

        # add Group/Control/Flag/Subset columns expected downstream
        df_mapping_dict[filebase] = expandMappingParams(df_mapping,
                                                        verbose=verbose)

        if save:
            df_mapping_dict[filebase].to_csv(newfilepath, sep='\t',
                                             header=True, index=True)

    #df_mapping = df_mapping.reset_index(drop=False)
    smartPrint('', verbose=verbose)

    return df_mapping_dict
def readPlateReaderFolder(filename=None,
                          directory=None,
                          interval=None,
                          save=False,
                          verbose=False):
    '''
    Finds, reads, and formats all files in a directory to be AMiGA-compatible.

    Args:
        filename (str or None):
            if str: path to a single data file to be read.
            if None: user is interested in reading one or more data files
                (so user must pass directory argument).
        directory (dictionary or str or None):
            if dictionary: keys are folder names, values are their paths.
                Keys must include 'data' and 'derived'. The 'data' sub-folder
                must exist and house one or more data files to be read.
            if str: path to folder that houses one or more data files to be read.
            if None: user is interested in reading a single data file
                (so user must pass filename argument).
        interval (dictionary or numeric or None):
            if numeric: int or float applied to all plates.
            if dictionary: keys are file names, values are their respective
                interval parameters, e.g. {'CD2015_PM1-1':600,'CD2048_PM1-1':900}.
                If a filename does not have a corresponding key, the default
                'interval' parameter in the 'config' dictionary is used.
            if None: treated as an empty dictionary (config default for all).
        save (boolean): will save AMiGA-formatted file in the 'derived' or
            input folder as a TSV file.
        verbose (boolean)

    Returns:
        df_dict (dictionary): keys are file base names, values are pandas.DataFrames.
    '''

    if (filename is None) and (directory is None):
        sys.exit(
            'FATAL USER ERROR: User must pass either a filename or a directory argument'
        )

    # avoid the mutable-default-argument pitfall: None means "no user intervals"
    if interval is None:
        interval = dict()

    # what is the data folder (folderpath) and where to save formatted data (copydirectory)?
    if isinstance(directory, dict):
        folderpath = directory['data']
        copydirectory = directory['derived']
    elif isinstance(directory, str):
        copydirectory = folderpath = directory
    elif directory is None:
        copydirectory = folderpath = os.path.dirname(filename)

    # user may have passed a specific file or a directory to the input argument
    if filename:
        filepaths = ['{}{}{}'.format(folderpath, os.sep, filename)]
    else:
        filepaths = findPlateReaderFiles(folderpath)
    # either way, filepaths must be an iterable list or array

    # read one data file at a time
    df_dict = {}
    for filepath in sorted(filepaths):

        # communicate with user
        smartPrint('Reading {}'.format(filepath), verbose)

        # get extension-free file name and path for derived copy
        _, filebase, newfilepath = breakDownFilePath(
            filepath, copydirectory=copydirectory)

        # set the interval time: numeric applies to all plates, dict is per-plate
        if isinstance(interval, (int, float)):
            plate_interval = float(interval)
        elif filebase in interval:
            plate_interval = interval[filebase]
        else:
            plate_interval = config['interval']

        # read and adjust file to format: time by wells where first column is time and rest are ODs
        df = readPlateReaderData(filepath, plate_interval, copydirectory, save=save)
        df_dict[filebase] = df  # ..iloc[nskip:,:]

    smartPrint('', verbose)  # print empty newline, for visual aesthetics only

    return df_dict