import os
import sys

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from matplotlib import rcParams
from matplotlib.ticker import MultipleLocator

# NOTE: helper functions used below (assemblePath, assembleFullName, selectFileName,
# smartPrint, tidyMessage, resetNameIndex, getTimeUnits, getValue, getHypoPlotParams,
# reverseDict, checkMetaText, and the GrowthPlate class, among others) are defined
# elsewhere in the AMiGA codebase and are assumed to be imported at the module level.


def findPlateReaderFiles(directory, extension=None):
    '''
    Recursively searches a directory for all files with specific extensions.

    Args:
        directory (str): path to data directory
        extension (tuple): file extensions to keep; defaults to
            ('.txt', 'TXT', 'tsv', 'TSV', 'asc', 'ASC')

    Returns:
        list_files (list): list of paths to data files
    '''

    # you can modify this to include other extensions, but internal data must still be tab-delimited
    if extension is None:
        extension = ('.txt', 'TXT', 'tsv', 'TSV', 'asc', 'ASC')

    # recursively walk through a directory, if nested
    list_files = []

    for (dirpath, dirnames, filenames) in os.walk(directory):

        # only keep files with acceptable extensions
        filenames = [ii for ii in filenames if ii.endswith(extension)]

        # compose and store filepaths but avoid double slashes (i.e. //) between directory names
        for filename in filenames:
            list_files.append(assemblePath(dirpath, filename))

    return list_files
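
# Illustrative usage (a minimal sketch; the 'my_experiment/data' layout and file names
# are hypothetical, not part of the AMiGA codebase):
#
#   files = findPlateReaderFiles('my_experiment/data')
#   # e.g. ['my_experiment/data/plate_01.txt', 'my_experiment/data/nested/plate_02.asc']
#
#   # restrict the search to ASCII exports only
#   asc_files = findPlateReaderFiles('my_experiment/data', extension=('.asc', '.ASC'))
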
def breakDownFilePath(filepath, copydirectory):
    '''
    Breaks down a file path into several components.

    Args:
        filepath (str)
        copydirectory (str): directory where a copy of the file would be stored

    Returns:
        filename (str): basename without path
        filebase (str): basename without path and without extension
        newfilepath (str): filename with path and with extension replaced to .tsv

    Example input: '/Users/firasmidani/RandomFileName.asc' will return
        filename    --> RandomFileName.asc
        filebase    --> RandomFileName
        newfilepath --> copydirectory joined with RandomFileName.tsv
    '''

    filename = os.path.basename(filepath)
    filebase = ''.join(filename.split('.')[:-1])
    dirname = os.path.dirname(filepath)

    newfilepath = assemblePath(copydirectory, filebase, '.tsv')

    return filename, filebase, newfilepath
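
# Illustrative usage (paths are hypothetical):
#
#   filename, filebase, newfilepath = breakDownFilePath(
#       '/Users/firasmidani/RandomFileName.asc',
#       copydirectory='/Users/firasmidani/derived')
#   # filename    --> 'RandomFileName.asc'
#   # filebase    --> 'RandomFileName'
#   # newfilepath --> '/Users/firasmidani/derived/RandomFileName.tsv'
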
def initPaths(self):
    '''
    Initialize paths for saving data and results.
    '''

    # if user did not pass file name for output, use time stamp
    file_name = selectFileName(self.args['fout'])
    dir_path = assemblePath(self.directory['models'], file_name, '')

    if not os.path.exists(dir_path):
        os.mkdir(dir_path)

    # running model on transformed results and recording results
    file_path_key = assembleFullName(dir_path, '', file_name, 'key', '.txt')
    file_path_input = assembleFullName(dir_path, '', file_name, 'input', '.txt')

    paths_dict = {}
    paths_dict['filename'] = file_name
    paths_dict['dir'] = dir_path
    paths_dict['key'] = file_path_key
    paths_dict['input'] = file_path_input

    self.paths_dict = paths_dict
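
# After initPaths() runs, self.paths_dict holds the four derived paths. A sketch of
# the expected layout, assuming fout='run1' and directory['models']='models' (both
# hypothetical; the exact file names depend on how assembleFullName() composes them):
#
#   self.paths_dict == {
#       'filename': 'run1',
#       'dir':      'models/run1',
#       'key':      'models/run1/run1_key.txt',    # assumed naming
#       'input':    'models/run1/run1_input.txt',  # assumed naming
#   }
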
def basicSummaryOnly(data, mapping, directory, args, verbose=False):
    '''
    If user only requested plotting, then for each data file, perform a basic
    algebraic summary and plot data. Once completed, exit system. Otherwise,
    return None.

    Args:
        data (dictionary): keys are plate IDs and values are pandas.DataFrames
            with size t x (n+1) where t is the number of time-points and n is
            the number of wells (i.e. samples); the additional 1 is due to the
            explicit 'Time' column; index is uninformative.
        mapping (dictionary): keys are plate IDs and values are pandas.DataFrames
            with size n x p where n is the number of wells (or samples) in the
            plate and p is the number of variables or parameters described in
            the dataframe.
        directory (dictionary): keys are folder names, values are their paths
        args (dictionary)
        verbose (boolean)

    Returns:
        None: if the only-basic-summary ('obs') argument is False.
    '''

    if not args['obs']:  # if not only_basic_summary
        return None

    print(tidyMessage('AMiGA is summarizing and plotting data files'))

    list_keys = []

    for pid, data_df in data.items():

        # define paths where summary and plot will be saved
        key_file_path = assemblePath(directory['summary'], pid, '.txt')
        key_fig_path = assemblePath(directory['figures'], pid, '.pdf')

        # grab plate-specific samples
        #   index should be well IDs but a column Well should also exist
        #   in main.py, annotateMappings() is called which ensures the above is the case
        mapping_df = mapping[pid]
        mapping_df = resetNameIndex(mapping_df, 'Well', False)

        # grab plate-specific data
        wells = list(mapping_df.Well.values)
        data_df = data_df.loc[:, ['Time'] + wells]

        # update plate-specific data with unique Sample Identifiers
        sample_ids = list(mapping_df.index.values)
        data_df.columns = ['Time'] + sample_ids

        # create GrowthPlate object, perform basic summary
        plate = GrowthPlate(data=data_df, key=mapping_df)
        plate.convertTimeUnits(input=getTimeUnits('input'), output=getTimeUnits('output'))
        plate.computeBasicSummary()
        plate.computeFoldChange(subtract_baseline=True)

        # plot and save as PDF, also save key as TXT
        if not args['dp']:
            plate.plot(key_fig_path)

        if args['merges']:
            list_keys.append(plate.key)
        else:
            plate.key.to_csv(key_file_path, sep='\t', header=True, index=False)

        smartPrint(pid, verbose=verbose)

    if args['merges']:
        filename = selectFileName(args['fout'])
        summary_path = assembleFullName(directory['summary'], 'summary', filename, '_basic', '.txt')
        summary_df = pd.concat(list_keys, sort=False)
        summary_df.to_csv(summary_path, sep='\t', header=True, index=False)

    smartPrint('\nSee {} for summary text file(s).'.format(directory['summary']), verbose)
    smartPrint('See {} for figure PDF(s).\n'.format(directory['figures']), verbose)

    msg = 'AMiGA completed your request and '
    msg += 'wishes you good luck with the analysis!'
    print(tidyMessage(msg))

    sys.exit()
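
# Illustrative call from a driver script (argument values are hypothetical; in AMiGA
# the data, mapping, and directory dictionaries are normally built upstream in main.py):
#
#   args = {'obs': True, 'dp': False, 'merges': False, 'fout': None}
#   basicSummaryOnly(data, mapping, directory, args, verbose=True)
#   # prints each plate ID as it is summarized, writes <plate>.txt and <plate>.pdf,
#   # then exits via sys.exit()
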
def plotPredictions(self):
    '''
    Visualizes the model tested by a specific hypothesis given the data.

    Args:
        x_full (pandas.DataFrame)
        x_min (pandas.DataFrame)
        hypothesis (dictionary): keys are str(H0) and str(H1), values are lists of str
        plate (growth.GrowthPlate obj)
        variable (list): variables of interest
        factor_dict (dictionary): mapping of unique values of variables to numerical integers
        subtract_control (boolean): whether control sample curves are subtracted from
            treatment sample curves
        file_name (str)
        directory (str): path where files/figures should be stored
        args_dict (dictionary): must at least include 'nperm', 'nthin', and 'fdr' as
            keys and their values

    Action:
        saves a plot as PDF file
    '''

    # get necessary attributes
    x_full = self.x_full
    x_min = self.x_min
    factor_dict = self.factor_dict
    hypothesis = self.hypothesis
    variable = self.target[0]
    plate = self.plate
    subtract_control = self.subtract_control
    directory = self.paths_dict['dir']
    file_name = self.paths_dict['filename']

    # get and modify user-accessible parameters from config.py
    plot_params = getHypoPlotParams()  # dict
    tick_spacing = plot_params['tick_spacing']
    legend_loc = plot_params['legend']
    fontsize = plot_params['fontsize']

    posterior_n = getValue('n_posterior_samples')

    colors = getValue('hypo_colors')  # list of colors
    confidence = getValue('confidence')  # confidence interval, e.g. 0.95
    confidence = 1 - (1 - confidence) / 2

    noise = self.args['noise']

    if self.args['dp']:
        return None

    # grab mapping of integer codes in design matrix to actual variable labels
    varb_codes_map = reverseDict(factor_dict[variable])  # {codes: values}
    cond_variables = list(set(hypothesis['H1']).difference(set(['Time', variable])))  # conditioning variables

    # set figure aesthetics
    sns.set_style('whitegrid')
    rcParams['font.family'] = 'sans-serif'
    rcParams['font.sans-serif'] = 'Arial'

    # initialize grid
    fig, ax = plt.subplots(2, 1, figsize=[5, 10.5], sharex=False)

    # for each unique value of variable of interest, plot MVN prediction
    list_values = varb_codes_map.items()
    list_colors = colors[0:x_min.shape[0]]

    # plot MVN predictions
    for v_map, color in zip(list_values, list_colors):
        code, label = v_map
        criteria_real = {variable: [label]}
        criteria_mvn = {variable: code}
        ax[0] = addRealPlotLine(ax[0], plate, criteria_real, color, plot_params)
        ax[0] = addMVNPlotLine(ax[0], x_full, criteria_mvn, label, confidence, color, plot_params, noise)
        ax[0].xaxis.set_major_locator(MultipleLocator(tick_spacing))

    # adjust labels and window limits
    ax[0] = setAxesLabels(ax[0], subtract_control, plot_params)

    # if variable has only 2 values and if requested, plot delta OD
    if (len(list_values) != 2) or (not self.args['pdo']):
        fig.delaxes(ax[1])
        dos = None
    else:
        ax[1] = plotDeltaOD(ax[1], self.functional_diff, ylabel=True, xlabel=True, fontsize=fontsize)
        ax[1].xaxis.set_major_locator(MultipleLocator(tick_spacing))
        ax[0].set_xlabel('')

    ax = dynamicWindowAdjustment(ax)

    # if user did not pass file name for output, use time stamp
    fig_path = assemblePath(directory, file_name, '.pdf')

    plt.subplots_adjust(wspace=0.15, hspace=0.15)
    savePlotWithLegends(ax[0], fig_path, legend_loc, fontsize=fontsize)
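
# Sketch of how this method is typically reached (the receiver name is an assumption
# based on the attributes read above; the actual driver lives elsewhere in the
# hypothesis-testing module):
#
#   model.plotPredictions()
#   # -> saves <paths_dict['dir']>/<paths_dict['filename']>.pdf with MVN predictions
#   #    overlaid on the real curves in the top axis and, if 'pdo' was requested for
#   #    a two-level variable, the delta-OD panel in the bottom axis
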
def assembleMappings(data, mapping_path, meta_path=None, save=False, verbose=False):
    '''
    Creates a master mapping file (or dictionary?) for all data files in the input
    argument. For each data file, in this particular order, it will first (1) check
    if an individual mapping file exists, (2) if not, check if relevant meta-data is
    provided in meta.txt file, (3) if not, infer if plate is a BIOLOG PM based on its
    file name, and (4) if all fail, create a minimalist mapping file.

    Args:
        data (dictionary): keys are file names (i.e. filebases or Plate IDs) and
            values are pandas DataFrames where the index column (row names) are well IDs.
        mapping_path (str): path to the mapping folder.
        meta_path (str): path to the meta.txt file.
        save (boolean): whether to save derived mapping files to disk.
        verbose (boolean)

    Returns:
        df_mapping_dict (dictionary): keys are file names and values are mapping files.
    '''

    df_mapping_dict = {}

    # list all data files to be analyzed
    list_filebases = data.keys()

    # list all potential mapping file paths
    list_mapping_files = [assemblePath(mapping_path, ii, '.txt') for ii in list_filebases]

    # read meta.txt and list all plates described by it
    meta_df, meta_df_plates = checkMetaText(meta_path, verbose=verbose)

    # assemble mapping for one data file at a time
    for filebase, mapping_file_path in zip(list_filebases, list_mapping_files):

        # what are the row names from the original data file
        well_ids = data[filebase].columns[1:]  # this may not be A1 ... H12, but most often will be

        # create file path for saving derived mapping, if requested
        newfilepath = assembleFullName(mapping_path, '', filebase, '', '.map')

        # see if user provided a mapping file that corresponds to this data file (filebase)
        if os.path.exists(mapping_file_path):

            df_mapping = pd.read_csv(mapping_file_path, sep='\t', header=0, index_col=0,
                                     dtype={'Plate_ID': str, 'Isolate': str})

            # makes sure Plate_ID is a column
            df_mapping = checkPlateIdColumn(df_mapping, filebase)

            # strip leading zeros in well names
            df_mapping.index = [ii[0] + ii[1:].lstrip('0') for ii in df_mapping.index]

            smartPrint('{:.<30} Reading {}.'.format(filebase, mapping_file_path), verbose=verbose)

        # see if user described the file in meta.txt
        elif filebase in meta_df_plates:

            meta_info = meta_df[meta_df.Plate_ID == filebase]
            msg = '{:.<30} Found meta-data in meta.txt '.format(filebase)

            # does meta_df indicate this is a BIOLOG plate
            biolog = isBiologFromMeta(meta_info)

            if biolog:
                checkBiologSize(data[filebase], filebase)
                df_mapping = expandBiologMetaData(meta_info)
                msg += '& seems to be a BIOLOG PM plate.'
                smartPrint(msg, verbose=verbose)
            else:
                df_mapping = initKeyFromMeta(meta_info, well_ids)
                msg += '& does not seem to be a BIOLOG PM plate.'
                smartPrint(msg, verbose=verbose)

        elif isBiologFromName(filebase):

            checkBiologSize(data[filebase], filebase)
            df_mapping = initBiologPlateKey(filebase)
            msg = '{:.<30} Did not find mapping file or meta-data '.format(filebase)
            msg += 'BUT seems to be a BIOLOG PM plate.'
            smartPrint(msg, verbose=verbose)

        else:

            df_mapping = initMappingDf(filebase, well_ids)
            msg = '{:.<30} Did not find mapping file or meta-data '.format(filebase)
            msg += '& does not seem to be a BIOLOG PM plate.'
            smartPrint(msg, verbose=verbose)

        df_mapping_dict[filebase] = expandMappingParams(df_mapping, verbose=verbose)

        if save:
            df_mapping_dict[filebase].to_csv(newfilepath, sep='\t', header=True, index=True)

    #df_mapping = df_mapping.reset_index(drop=False)

    smartPrint('', verbose=verbose)

    return df_mapping_dict
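
# Illustrative usage (folder layout and the data-reading step are hypothetical; in
# AMiGA the data dictionary is normally assembled by the upstream file-reading routines):
#
#   # data = {'PM1-1': DataFrame, ...}  # keyed by filebase, 'Time' as first column
#   mappings = assembleMappings(data, mapping_path='mapping',
#                               meta_path='mapping/meta.txt',
#                               save=True, verbose=True)
#   # mappings['PM1-1'] is a DataFrame keyed by well ID with plate meta-data columns
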