def subtractControl(self, to_do=False, drop=True):
    '''
    Subtract from each treatment sample's growth curve the growth curve of
        its corresponding control sample.

    Args:
        to_do (boolean): if False, do not subtract control wells and return None.
        drop (boolean): if True, drop control samples from data.
    '''

    if not to_do:
        return None

    data = self.data.copy()
    mapping = self.key

    # find all unique groups
    plate_groups = mapping.loc[:, ['Plate_ID', 'Group']].drop_duplicates()
    plate_groups = [tuple(x) for x in plate_groups.values]

    for plate_group in plate_groups:
        pid, group = plate_group

        # grab lists of Sample_ID of wells corresponding to controls and cases
        controls = subsetDf(mapping, {
            'Plate_ID': [pid],
            'Group': [group],
            'Control': [1]
        }).index.values
        cases = subsetDf(mapping, {
            'Plate_ID': [pid],
            'Group': [group]
        }).index.values  # includes controls

        if len(controls) == 0:
            msg = '\nFATAL ERROR: User requested subtraction of control samples. However, '
            msg += 'samples belonging to group {} of plate {} lack '.format(group, pid)
            msg += 'any corresponding control samples in the current working directory.\n'
            sys.exit(msg)

        data_controls = data.loc[:, controls]
        data_cases = data.loc[:, cases]

        # subtract the mean control curve from each case; because data are
        #   log-transformed, this is equivalent to division on the linear scale
        data_controls = data_controls.mean(1)
        data_cases = (data_cases.T - data_controls).T
        data.loc[:, cases] = data_cases.values

        if drop:
            data = data.drop(controls, axis=1)

    self.data = data
    self.mods.controlled = True
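
# Illustrative sketch, not part of AMiGA: the control-subtraction arithmetic
#   above on a toy dataframe. The two-well layout and column names ('W1' as
#   control, 'W2' as case) are hypothetical.
def _example_subtract_control():
    import pandas as pd
    # two wells measured at three time points; W1 is the control, W2 the case
    data = pd.DataFrame({'W1': [0.0, 0.5, 1.0], 'W2': [0.0, 1.0, 2.0]})
    controls, cases = ['W1'], ['W1', 'W2']
    # subtract the mean control curve from every case curve (row-wise)
    corrected = (data[cases].T - data[controls].mean(1)).T
    return corrected.drop(controls, axis=1)  # W2 becomes [0.0, 0.5, 1.0]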
def normalizePooledParameters(args, df):
    '''
    Normalizes growth parameters to control samples for pooled parameters.

    Args:
        args (dictionary): keys are arguments and values are user/default choices
        df (pandas.DataFrame): rows are samples, columns are experimental variables.
            Must include Plate_ID, Group, Control, auc, k, gr, dr, td, lag.

    Returns:
        df (pandas.DataFrame): input but with an additional 6 columns.
    '''

    if (not args['norm']) or (not args['pool']):
        return df

    df_orig = df.copy()
    df_orig_keys = df_orig.columns

    poolby = args['pb'].split(',')
    normalizeby = checkParameterCommand(args['nb'])

    params_1 = initParamList(0)
    params_1.remove('diauxie')
    params_2 = ['mean({})'.format(ii) for ii in params_1]

    if any([ii in df_orig_keys for ii in params_2]):
        params = params_2
    else:
        params = params_1

    params_norm = initParamList(2)
    params_keep = ['Sample_ID'] + poolby + params

    df = df.loc[:, params_keep]
    controls = subsetDf(df, normalizeby)
    variable = list(set(poolby).difference(set(normalizeby.keys())))

    norm_df = []
    for _, row in df[variable].drop_duplicates().iterrows():
        sub_df = subsetDf(df, row.to_dict()).set_index(['Sample_ID'] + poolby)
        sub_ctrl = subsetDf(controls, row.to_dict()).set_index(['Sample_ID'] + poolby)
        norm_df.append(sub_df / sub_ctrl.values)  # .reset_index()

    norm_df = pd.concat(norm_df, axis=0)
    norm_df.columns = params_norm
    norm_df = norm_df.reset_index(drop=False)

    df = pd.merge(df_orig, norm_df, on=['Sample_ID'] + poolby)
    # df = df.drop(['Group','Control'], axis=1)

    return df
def main(args):

    verbose = args.verbose
    # directory = assemblePath(args.input,'summary')
    directory = args.input

    msg = 'AMiGA is peeking inside the summary directory'
    smartPrint('', verbose)
    smartPrint(tidyMessage(msg), verbose)
    # smartPrint(checkDirectoryNotEmpty(directory,'Summary')[1],verbose)

    criteria = checkParameterCommand(args.subset, sep=',')

    directory, filename = isFileOrFolder(directory, up=1)

    if filename:
        ls_files = ['{}{}{}'.format(directory, os.sep, filename)]
    else:
        ls_files = findPlateReaderFiles(directory, '.txt')

    full_df = read(ls_files)
    sub_df = subsetDf(full_df, criteria)
    sub_df = group(sub_df, args)
    sub_df = pivot(sub_df, args, args.value)
    sub_df = reduceDf(sub_df, args)

    clusterMap(sub_df, full_df, args, directory)
    saveDf(full_df, sub_df, args, directory)
def addRealPlotLine(ax, plate, criteria, color, plot_params):
    '''
    Given data (plate) and criteria, find relevant sample IDs and plot them on axis.

    Args:
        ax (matplotlib.axes._subplots.AxesSubplot)
        plate (GrowthPlate object)
        criteria (dictionary): keys must be column headers in plate.key,
            values must be values in plate.key.
        color (str or (R,G,B,A)) where R,G,B,A are floats [0,1]
        plot_params (dictionary)

    Returns:
        ax (matplotlib.axes._subplots.AxesSubplot)
    '''

    if plot_params['overlay_actual_data']:

        samples = list(subsetDf(plate.key, criteria).index)

        if len(samples) == 0:
            return ax

        time = plate.time.copy()
        data = plate.data.copy()

        # if plot_params['plot_linear_od']:
        #     data = data.apply(np.exp).copy()

        wide_df = time.join(data)
        wide_df = wide_df.reindex(['Time'] + samples, axis=1).set_index('Time')
        wide_df = wide_df.dropna(axis=1)  # necessary to get rid of controls

        ax.plot(wide_df, color=color, alpha=0.5, lw=1, zorder=1)

    return ax
def subset(args, df):

    ls_df, ls_varbs = [], []

    for ii in args['s']:
        criteria = checkParameterCommand(ii, sep=',')
        ls_df.append(subsetDf(df, criteria))
        ls_varbs.append(list(criteria.keys()))

    df = pd.concat(ls_df, sort=False).reset_index(drop=True).drop_duplicates()

    if df.shape[0] > 2:
        msg = '\nFATAL USER ERROR: User-provided summary files and subsetting criteria '
        msg += 'selected for more than two conditions. AMiGA cannot perform comparison '
        msg += 'on more than two conditions. Please check your arguments and try again. '
        msg += 'Below are the currently selected conditions.\n\n'
        keys = [ii for ii in df.keys() if ('(' not in ii) & (ii != 'diauxie')]
        print(msg)
        print(df.loc[:, keys], '\n\n')
        sys.exit()
    else:
        ls_varbs = [item for sublist in ls_varbs for item in sublist]
        ls_varbs = list(set(ls_varbs))

    return df, ls_varbs
def main():

    args = parseCommand()
    verbose = args['verbose']
    # directory = assemblePath(args['fi'],'summary')
    directory = args['fi']

    msg = 'AMiGA is peeking inside the summary directory'
    smartPrint('', verbose)
    smartPrint(tidyMessage(msg), verbose)
    # smartPrint(checkDirectoryNotEmpty(directory,'Summary')[1],verbose)

    criteria = checkParameterCommand(args['s'], sep=',')

    directory, filename = isFileOrFolder(directory, up=1)

    if filename:
        ls_files = ['{}{}{}'.format(directory, os.sep, filename)]
    else:
        ls_files = findPlateReaderFiles(directory, '.txt')

    df = read(ls_files)
    df = subsetDf(df, criteria)
    df = group(df, args)
    df = pivot(df, args)
    df = reduceDf(df, args)

    # plot(df,args,directory)
    clusterMap(df, args, directory)
def shouldYouSubtractControl(mapping, variables):
    '''
    Checks if control samples must be subtracted from treatment samples for
        proper hypothesis testing. In particular, make sure that the variable
        of interest is binary (i.e. it has only two possible values in the
        mapping dataframe). This makes sure that GP regression on the variable
        of interest is performing a test on a binary variable.

    Args:
        mapping (pandas.DataFrame): samples (n) by variables (k)
        variables (list): each item must be a column header in the mapping argument

    Returns:
        (boolean)
    '''

    unique_values = mapping.loc[:, variables].drop_duplicates().reset_index()

    # subtract control curves only if none of the unique values corresponds
    #   entirely to control wells; every value must be checked (rather than
    #   returning on the first non-control value) for this test to be meaningful
    for _, row in unique_values.iterrows():
        criteria = row.to_dict()
        sub_map = subsetDf(mapping, criteria)
        sub_map_controls_n = sub_map[sub_map.Control == 1].shape[0]
        sub_map_total_n = sub_map.shape[0]
        if (sub_map_controls_n == sub_map_total_n) and (sub_map_controls_n > 0):
            return False  # found a value whose samples all correspond to control wells

    return True
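
# Illustrative sketch, not part of AMiGA: the control-subtraction decision
#   above on a toy mapping. The 'Substrate' values and two-well-per-condition
#   layout are hypothetical; the 'Control' column mirrors the one assumed above.
def _example_should_subtract_control():
    import pandas as pd
    mapping = pd.DataFrame({
        'Substrate': ['Negative Control', 'Negative Control', 'D-Glucose', 'D-Glucose'],
        'Control': [1, 1, 0, 0],
    })
    # 'Negative Control' wells are all controls, so subtraction is unnecessary:
    #   the control curves themselves are one of the two values being compared
    for value, sub in mapping.groupby('Substrate'):
        if (sub.Control == 1).all():
            return False
    return True  # would subtract controls otherwise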
def trimMergeMapping(mapping_dict, verbose=False):
    '''
    Trims and merges mapping dataframes into one master mapping dataframe.

    Args:
        mapping_dict (dictionary): keys are plate IDs and values are
            pandas.DataFrames with size n x p, where n is the number of wells
            (or samples) in the plate, and p is the number of variables or
            parameters described in the dataframe.
        verbose (boolean)

    Returns:
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
    '''

    # merge mapping dataframes
    # sort will force shared (inner) keys to the lead and unshared (outer) keys to the caboose
    # useful because individual mapping files may lack certain columns, some may even be empty
    master_mapping = pd.concat(mapping_dict.values(), ignore_index=True,
                               join='outer', sort=False)

    # trim mapping based on Subset and Flag columns
    master_mapping = subsetDf(master_mapping, {'Subset': [1], 'Flag': [0]})

    # reset index and set as Sample_ID
    master_mapping = resetNameIndex(master_mapping, 'Sample_ID', True)

    return master_mapping
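
# Illustrative sketch, not part of AMiGA: how the outer join behaves when
#   plates document different columns, followed by the Subset/Flag trim.
#   Plate IDs ('PM1', 'PM2') and the 'Strain' column are hypothetical.
def _example_trim_merge_mapping():
    import pandas as pd
    mapping_dict = {
        'PM1': pd.DataFrame({'Well': ['A1'], 'Subset': [1], 'Flag': [0]}),
        'PM2': pd.DataFrame({'Well': ['A1'], 'Subset': [1], 'Flag': [1],
                             'Strain': ['WT']}),
    }
    # the outer join keeps 'Strain', filled with NaN for PM1's rows
    merged = pd.concat(mapping_dict.values(), ignore_index=True,
                       join='outer', sort=False)
    # keep rows marked for analysis (Subset == 1) and not flagged (Flag == 0)
    return merged[(merged.Subset == 1) & (merged.Flag == 0)]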
def saveDf(full_df, sub_df, args, directory):

    if not args.save_filtered_table:
        return None

    sub_df = subsetDf(full_df, {args.y_variable: list(sub_df.index.values),
                                args.x_variable: list(sub_df.keys().values)})

    fpath = assembleFullName(directory, '', args.output, 'filtered', '.txt')
    sub_df.to_csv(fpath, sep='\t', header=True, index=True)
def updateMappingControls(master_mapping, mapping_dict, to_do=False):
    '''
    For all samples in master mapping, find relevant controls and add these
        controls to the master mapping dataframe.

    Args:
        master_mapping (pandas.DataFrame)
        mapping_dict (dictionary)
        to_do (boolean)

    Returns:
        master_mapping (pandas.DataFrame): will have more rows (i.e. samples) than input
    '''

    # check first if you need to do this
    if not to_do:
        return master_mapping

    # find all unique groups
    plate_groups = master_mapping.loc[:, ['Plate_ID', 'Group']].drop_duplicates()
    plate_groups = [tuple(x) for x in plate_groups.values]

    # grab all relevant control samples
    df_controls = []
    for plate_group in plate_groups:
        pid, group = plate_group
        pid_mapping = mapping_dict[pid]
        df_controls.append(subsetDf(pid_mapping, {
            'Plate_ID': [pid],
            'Group': [group],
            'Control': [1]
        }))

    # re-assemble the master mapping dataframe, including the proper controls
    df_controls = pd.concat(df_controls)
    master_mapping = pd.concat([master_mapping.copy(), df_controls.copy()], sort=True)
    master_mapping = master_mapping.reset_index(drop=True)
    master_mapping.index.name = 'Sample_ID'
    master_mapping = master_mapping.sort_values(['Plate_ID', 'Group', 'Control'])

    # if mapping has an interaction column, replace NaN with 0 (so it won't be dropped later)
    # because you are (above) adding control samples to master_mapping,
    #   they will not have the interaction column and their values will default to NaN
    variable = [v for v in master_mapping.keys() if '*' in v]
    master_mapping.loc[:, variable] = master_mapping.loc[:, variable].fillna(0)

    return master_mapping
def addInteractionTerm(self):
    '''
    If the user passed a hypothesis with an interaction term (identified by an
        asterisk), create the corresponding interaction variable in the master
        mapping dataframe.
    '''

    # add interaction term, if needed
    mapping = self.master_mapping

    for variable in self.target:

        if '*' in variable:

            pairs = variable.split('*')
            var_dict = {}

            if ('(' in variable) and (')' in variable):
                for pair in pairs:
                    var, ctrl = pair.split('(')
                    var_dict[var] = ctrl[:-1]
                intx = subsetDf(mapping, var_dict).index.values
                mapping.loc[:, variable] = [0] * mapping.shape[0]
                mapping.loc[intx, variable] = [1] * len(intx)
            else:
                df = mapping.loc[:, pairs].drop_duplicates()
                df.loc[:, variable] = df.apply(
                    lambda x: '{} x {}'.format(x[pairs[0]], x[pairs[1]]), axis=1)
                mapping = pd.merge(mapping.reset_index(), df, on=pairs, how='left')
                mapping = mapping.set_index('Sample_ID')

    self.master_mapping = mapping
def savePredictions(self):
    '''
    Given model predictions of growth curves (for each unique set of conditions
        tested), describe the latent function and its derivative in terms of
        growth parameters. Reports results in a file named {file_name}_params
        in the dir_path directory.

    Uses (object attributes):
        model (GPy.models.gp_regression.GPRegression)
        data (pandas.DataFrame)
        hypothesis (dictionary): e.g. {'H0':['Time'],'H1':['Time','Substrate']}
        factor_dict (dictionary): mapping of unique values of variables to numerical integers
        posterior (boolean)
        save_latent (boolean)
        dir_path (str): path to directory
        file_name (str): file name

    Returns:
        x_full (pandas.DataFrame)
        x_min (pandas.DataFrame)
    '''

    data = self.data
    model = self.model
    hypothesis = self.hypothesis
    factor_dict = self.factor_dict
    variable = self.target[0]
    confidence = getValue('confidence')  # confidence interval, e.g. 0.95

    posterior = self.args['slf']
    save_latent = self.args['sgd']
    fix_noise = self.args['fn']

    dir_path = self.paths_dict['dir']
    file_name = self.paths_dict['filename']

    # define hypothesis parameters
    model_input = hypothesis['H1']

    # grab minimal input data for prediction
    x_full = self.x_full
    x_min = self.x_min

    diauxie_dict = {}
    params_latent = initParamDf(x_min.index, complexity=0)
    params_sample = initParamDf(x_min.index, complexity=1)

    for idx, row in x_min.iterrows():

        # get x and y data
        df = subsetDf(x_full.drop(['mu', 'Sigma', 'Noise'], axis=1), row.to_dict())

        # get curve based on model predictions
        gm = GrowthModel(model=model.model, x_new=df.values, ARD=True)
        curve = gm.run()

        # get parameter estimates using predicted curve
        diauxie_dict[idx] = curve.params.pop('df_dx')
        params_latent.loc[idx, :] = curve.params

        if posterior:
            params_sample.loc[idx, :] = curve.sample().posterior

    # summarize diauxie results
    diauxie_df = mergeDiauxieDfs(diauxie_dict)

    if posterior:
        gp_params = params_sample.join(params_latent['diauxie'])
    else:
        gp_params = params_latent

    gp_params = x_min.join(gp_params)
    gp_params.index.name = 'Sample_ID'
    gp_params = gp_params.reset_index(drop=False)
    gp_params = pd.merge(gp_params, diauxie_df, on='Sample_ID')

    # save gp_data fit
    x_out = x_full.copy()
    for key, mapping in factor_dict.items():
        if key in x_out.keys():
            x_out.loc[:, key] = x_out.loc[:, key].replace(reverseDict(mapping))
        if key in gp_params.keys():
            gp_params.loc[:, key] = gp_params.loc[:, key].replace(reverseDict(mapping))

    # params = initParamList(0)
    diauxie = initDiauxieList()
    params = initParamList(0) + initParamList(1)
    params = list(set(params).intersection(set(gp_params.keys())))

    df_params = gp_params.drop(diauxie, axis=1).drop_duplicates()
    df_params = minimizeParameterReport(df_params)

    df_diauxie = gp_params[gp_params.diauxie == 1].drop(params, axis=1)
    df_diauxie = minimizeDiauxieReport(df_diauxie)

    if posterior:
        df_params = prettyifyParameterReport(df_params, variable, confidence)
        df_params = articulateParameters(df_params, axis=0)

    summ_path = assembleFullName(dir_path, '', file_name, 'params', '.txt')
    diux_path = assembleFullName(dir_path, '', file_name, 'diauxie', '.txt')

    # plate_cond.to_csv(file_path,sep='\t',header=True,index=True)
    df_params.to_csv(summ_path, sep='\t', header=True, index=posterior)
    if df_diauxie.shape[0] > 0:
        df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

    if save_latent:
        file_path = assembleFullName(dir_path, '', file_name, 'output', '.txt')
        x_out.to_csv(file_path, sep='\t', header=True, index=True)
def __init__(self, df=None, model=None, x_new=None, baseline=1.0,
             ARD=False, heteroscedastic=False, nthin=1):
    '''
    Data structure for Gaussian Process regression and related parameter inference.

    Attributes:
        x (numpy.ndarray): independent variables (N x D), where N is the number
            of observations, and D is the number of dimensions (or variables).
        y (numpy.ndarray): dependent variables (N x 1), where N is the number
            of observations, and the only column is the dependent or observed
            variable (often Optical Density or OD).
        key (dict or pandas.DataFrame): dictionary (k) or pandas.DataFrame (1 x k)
            that describes k experimental variables about the sample. Must
            include 'OD_Baseline' and 'Fold_Change' variables.

    Notes: for growth curve analysis, it is assumed that y was log-transformed
        and baseline-corrected.
    '''

    if model:
        self.model = model
        self.x_new = x_new
        self.ARD = ARD
        self.baseline = baseline
        self.y = None
        self.df = None
        return None

    self.df = df.copy()

    # create a dummy non-unique variable/column
    foo = uniqueRandomString(avoid=df.keys())
    df[foo] = [1] * df.shape[0]

    varbs = df.drop(['Time', 'OD'], axis=1).drop_duplicates()

    # for each unique non-time variable, estimate variance
    new_df = []
    for idx, row in varbs.iterrows():
        sub_df = subsetDf(df, row.to_dict())
        sub_df = describeVariance(sub_df, time='Time', od='OD')
        new_df.append(sub_df)
    new_df = pd.concat(new_df, axis=0)
    new_df = new_df.drop(['SID', foo], axis=1)

    # construct a thinner dataframe to speed up regression
    time = new_df.Time.sort_values().unique()
    time = time[::int(nthin)]
    thin_df = new_df[new_df.Time.isin(time)]

    # predictions of error and new x are based on the full dataframe
    tmp = new_df.drop(['OD'], axis=1).drop_duplicates()
    error_new = tmp.loc[:, ['error']].values
    x_new = tmp.drop(['error'], axis=1).values

    # regression is based on input/output/error from the thinned dataframe
    x = thin_df.drop(['OD', 'error'], axis=1).values
    y = thin_df.loc[:, ['OD']].values
    error = thin_df.loc[:, ['error']].values
    x_keys = thin_df.drop(['OD', 'error'], axis=1).keys()

    # save attributes
    self.x_keys = x_keys
    self.x_new = x_new
    self.x = x
    self.y = y
    self.error = error
    self.error_new = error_new
    self.baseline = baseline
    self.model = model
    self.ARD = ARD
    self.heteroscedastic = heteroscedastic
    self.noise = None
def computeFoldChange(self, subtract_baseline=True):
    '''
    Computes the fold change for all wells using the object's unmodified raw data.
        The object's key must have the following columns ['Plate_ID','Group','Control'].
        Control values must be {0,1}. The fold change is computed using measurements
        that have had the first measurement (i.e. the first time point) subtracted.
        The maximum measurements of control wells are averaged to get the denominator
        (i.e. the average maximum OD of control wells), which divides the maximum OD
        of all cases. Fold changes are normalized to controls belonging to the same
        group; all wells in a Biolog plate belong to the same group and have the
        same control (the A1 well).
    '''

    mapping = self.key.copy()

    # if mapping lacks Group and Control columns, skip
    if ('Group' not in mapping.keys()) or ('Control' not in mapping.keys()):
        mapping.loc[:, 'Fold_Change'] = [np.nan] * mapping.shape[0]
        self.key = mapping
        return None

    df = self.input_data.copy()  # time points by wells, input data that remains unmodified

    # subtract first time point from each column (i.e. well)
    if subtract_baseline:
        baseline = df.iloc[0, :]
        df = df.apply(lambda row: row - baseline, axis=1)

    # find all unique groups
    plate_groups = mapping.loc[:, ['Plate_ID', 'Group']].drop_duplicates()
    plate_groups = [tuple(x) for x in plate_groups.values]

    for plate_group in plate_groups:
        pid, group = plate_group

        # grab lists of Sample_ID of wells corresponding to controls and cases
        controls = subsetDf(mapping, {
            'Plate_ID': [pid],
            'Group': [group],
            'Control': [1]
        }).index.values
        cases = subsetDf(mapping, {
            'Plate_ID': [pid],
            'Group': [group],
            'Control': [0]
        }).index.values

        # if group does not have a control, skip
        if len(controls) == 0:
            mapping.loc[cases, 'Fold_Change'] = [np.nan] * len(cases)
            continue

        df_controls = df.loc[:, controls]
        df_cases = df.loc[:, cases]

        # for the denominator, take the max of each control column (i.e. well),
        #   then average across all controls
        df_controls_fc = df_controls.max(0) / df_controls.max(0).mean(0)
        df_cases_fc = df_cases.max(0) / df_controls.max(0).mean(0)

        mapping.loc[controls, 'Fold_Change'] = df_controls_fc
        mapping.loc[cases, 'Fold_Change'] = df_cases_fc

    self.key = mapping
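
# Illustrative sketch, not part of AMiGA: the fold-change arithmetic above on
#   toy baseline-corrected OD curves. Well names ('C1', 'C2' as controls,
#   'X1' as case) and values are hypothetical.
def _example_fold_change():
    import pandas as pd
    # two control wells and one case well, already baseline-corrected
    df = pd.DataFrame({'C1': [0.0, 0.4, 0.8], 'C2': [0.0, 0.6, 1.2],
                       'X1': [0.0, 1.0, 2.0]})
    ctrl_max_mean = df[['C1', 'C2']].max(0).mean()  # (0.8 + 1.2) / 2 = 1.0
    return df['X1'].max() / ctrl_max_mean           # 2.0 / 1.0 = 2.0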
def runCombinedGrowthFitting(data, mapping, directory, args, verbose=False):
    '''
    Uses Gaussian Processes to fit growth curves and infer parameters of growth
        kinetics. While runGrowthFitting() analyzes data one plate at a time,
        runCombinedGrowthFitting() can pool experimental replicates across
        different plates. The downside is that data summaries must be merged
        and no 96-well plate grid figure can be produced.

    Args:
        data (pandas.DataFrame): number of time points (t) x number of variables
            plus-one (p+1); plus-one because Time is not an index but rather a column.
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
        directory (dictionary): keys are folder names, values are their paths
        args (dictionary): keys are arguments and values are user/default choices
        verbose (boolean)

    Action:
        saves summary text file(s) in summary folder in the parent directory.
        saves figures (PDFs) in figures folder in the parent directory.
        saves data text file(s) in derived folder in the parent directory.
    '''

    # if user did not pass file name for output, use time stamp, see selectFileName()
    filename = selectFileName(args['fout'])

    # pre-process data
    plate = prepDataForFitting(data, mapping, subtract_baseline=False)

    # which meta-data variables do you use to group replicates?
    combine_keys = args['pb'].split(',')
    missing_keys = [ii for ii in combine_keys if ii not in plate.key.columns]

    if missing_keys:
        msg = 'FATAL USER ERROR: The following keys {} are '.format(missing_keys)
        msg += 'missing from mapping files.'
        sys.exit(msg)

    # continue processing data
    plate.subtractBaseline(to_do=True, poly=getValue('PolyFit'), groupby=combine_keys)
    plate_key = plate.key.copy()
    plate_data = plate.data.copy()
    plate_time = plate.time.copy()
    plate_cond = plate_key.loc[:, combine_keys + ['Group', 'Control']]
    plate_cond = plate_cond.drop_duplicates(combine_keys).reset_index(drop=True)

    smartPrint('AMiGA detected {} unique conditions.\n'.format(plate_cond.shape[0]),
               verbose)

    data_ls, diauxie_dict = [], {}

    # get user-defined values from config.py
    dx_ratio_varb = getValue('diauxie_ratio_varb')
    dx_ratio_min = getValue('diauxie_ratio_min')
    posterior_n = getValue('n_posterior_samples')
    scale = getValue('params_scale')
    posterior = args['slf']
    fix_noise = args['fn']
    nthin = args['nthin']

    # initialize empty dataframes for storing growth parameters
    params_latent = initParamDf(plate_cond.index, complexity=0)
    params_sample = initParamDf(plate_cond.index, complexity=1)

    # for each unique condition based on user request
    for idx, condition in plate_cond.iterrows():

        # get list of sample IDs
        cond_dict = condition.drop(['Group', 'Control'])
        cond_dict = cond_dict.to_dict()  # e.g. {'Substrate':['D-Trehalose'],'PM':[1]}
        cond_idx = subsetDf(plate_key, cond_dict).index.values  # index values for N samples

        smartPrint('Fitting\n{}'.format(tidyDictPrint(cond_dict)), verbose)

        # get data and format for GP instance
        cond_data = plate_data.loc[:, list(cond_idx)]  # T x N
        cond_data = plate_time.join(cond_data)  # T x N+1
        cond_data = cond_data.melt(id_vars='Time', var_name='Sample_ID', value_name='OD')
        cond_data = cond_data.drop(['Sample_ID'], axis=1)  # T*R x 2 (R is number of replicates)
        cond_data = cond_data.dropna()

        gm = GrowthModel(df=cond_data, ARD=True, heteroscedastic=fix_noise, nthin=nthin)
        curve = gm.run(name=idx)

        # get parameter estimates using latent function
        diauxie_dict[idx] = curve.params.pop('df_dx')
        params_latent.loc[idx, :] = curve.params

        # get parameter estimates using samples from the posterior distribution
        if posterior:
            params_sample.loc[idx, :] = curve.sample().posterior

        # passively save data, manipulation occurs below (input OD, GP fit, & GP derivative)
        if args['sgd']:
            time = pd.DataFrame(gm.x_new, columns=['Time'])
            mu0, var0 = np.ravel(gm.y0), np.ravel(np.diag(gm.cov0))
            mu1, var1 = np.ravel(gm.y1), np.ravel(np.diag(gm.cov1))
            if fix_noise:
                sigma_noise = np.ravel(gm.error_new) + gm.noise
            else:
                sigma_noise = np.ravel([gm.noise] * time.shape[0])
            mu_var = pd.DataFrame([mu0, var0, mu1, var1, sigma_noise],
                                  index=['mu', 'Sigma', 'mu1', 'Sigma1', 'Noise']).T
            gp_data = pd.DataFrame([list(condition.values)] * len(mu0),
                                   columns=condition.keys())
            gp_data = gp_data.join(time).join(mu_var)
            data_ls.append(gp_data)

    # summarize diauxie results
    diauxie_df = mergeDiauxieDfs(diauxie_dict)

    if posterior:
        gp_params = params_sample.join(params_latent['diauxie'])
    else:
        gp_params = params_latent

    # record results in object's key
    plate_cond = plate_cond.join(gp_params)
    plate_cond.index.name = 'Sample_ID'
    plate_cond = plate_cond.reset_index(drop=False)
    plate_cond = pd.merge(plate_cond, diauxie_df, on='Sample_ID')

    params = initParamList(0) + initParamList(1)
    params = list(set(params).intersection(set(plate_cond.keys())))

    df_params = plate_cond.drop(initDiauxieList(), axis=1).drop_duplicates()
    df_diauxie = plate_cond[plate_cond.diauxie == 1]
    df_diauxie = df_diauxie.drop(params, axis=1)
    df_diauxie = minimizeDiauxieReport(df_diauxie)

    summ_path = assembleFullName(directory['summary'], '', filename, 'summary', '.txt')
    diux_path = assembleFullName(directory['summary'], '', filename, 'diauxie', '.txt')

    # normalize parameters, if requested
    df_params = normalizePooledParameters(args, df_params)
    df_params = df_params.drop(['Group', 'Control'], axis=1)
    df_params = minimizeParameterReport(df_params)

    # save results
    df_params.to_csv(summ_path, sep='\t', header=True, index=False)
    if df_diauxie.shape[0] > 0:
        df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

    # save latent functions
    if args['sgd']:
        file_path = assembleFullName(directory['derived'], '', filename, 'gp_data', '.txt')
        gp_data = pd.concat(data_ls, sort=False).reset_index(drop=True)
        gp_data.to_csv(file_path, sep='\t', header=True, index=True)

    return None
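
# Illustrative sketch, not part of AMiGA: how pooling replicates via melt()
#   produces the T*R x 2 long-format input used above. The well names and
#   OD values are hypothetical toy data.
def _example_pool_replicates():
    import pandas as pd
    time = pd.DataFrame({'Time': [0.0, 0.5, 1.0]})
    reps = pd.DataFrame({'W1': [0.0, 0.3, 0.9], 'W2': [0.0, 0.4, 1.0]})  # 2 replicates
    long_df = time.join(reps).melt(id_vars='Time', var_name='Sample_ID', value_name='OD')
    # dropping Sample_ID leaves 6 (Time, OD) pairs that the GP fits jointly
    return long_df.drop(['Sample_ID'], axis=1)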
def addMVNPlotLine(ax, x, criteria, label, z_value, color, plot_params, noise=False):
    '''
    Given data (x) and criteria, find relevant sample IDs and plot them on axis.

    Args:
        ax (matplotlib.axes._subplots.AxesSubplot)
        x (pandas.DataFrame): must include columns for Time, mu, Sigma
        criteria (dictionary): keys must be column headers in x,
            values must be values in x.
        label (str): used for legend label of plotted line.
        z_value (float): z-value for computing confidence interval
        color (str or (R,G,B,A)) where R,G,B,A are floats [0,1]
        plot_params (dictionary)
        noise (boolean): whether to plot 95-pct credible intervals including
            sample uncertainty

    Returns:
        ax (matplotlib.axes._subplots.AxesSubplot)
    '''

    scaler = norm.ppf(z_value)  # define confidence interval scaler for MVN predictions

    x = subsetDf(x, criteria)  # grab value-specific model predictions

    if noise:
        Sigma = x.Sigma + x.Noise
    else:
        Sigma = x.Sigma

    # compute credible interval
    xtime = x.Time
    y_avg = x.mu
    y_low = x.mu - scaler * np.sqrt(Sigma)
    y_upp = x.mu + scaler * np.sqrt(Sigma)

    # convert from log2 to linear OD
    # if plot_params['plot_linear_od']:
    #     y_avg = np.exp(y_avg)
    #     y_low = np.exp(y_low)
    #     y_upp = np.exp(y_upp)

    ax.plot(xtime, y_avg, color=color, label=label, alpha=0.9, lw=3.0, zorder=10)
    ax.fill_between(x=xtime, y1=y_low, y2=y_upp, color=color, alpha=0.10, zorder=5)
    ax = largeTickLabels(ax, fontsize=plot_params['fontsize'])

    # if plot_params['plot_linear_od']:
    #     ax.axhline(y=1,xmin=0,xmax=xtime.max(),lw=3.0,color=(0,0,0,1))
    # else:
    ax.axhline(y=0, xmin=0, xmax=xtime.max(), lw=3.0, color=(0, 0, 0, 1))

    return ax
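
# Illustrative sketch, not part of AMiGA: the credible-interval math above.
#   Here z_value is the cumulative probability (e.g. 0.975 for a two-sided
#   95% band); the mean and variance are hypothetical toy values.
def _example_credible_band():
    import numpy as np
    from scipy.stats import norm
    mu, Sigma = 1.0, 0.04               # toy posterior mean and variance
    scaler = norm.ppf(0.975)            # ~1.96
    low = mu - scaler * np.sqrt(Sigma)  # ~0.608
    upp = mu + scaler * np.sqrt(Sigma)  # ~1.392
    return low, upp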
def normalizeParameters(args, df):
    '''
    Normalizes growth parameters to control samples.

    Args:
        args (dictionary): keys are arguments and values are user/default choices
        df (pandas.DataFrame): rows are samples, columns are experimental variables.
            Must include Plate_ID, Group, Control, auc, k, gr, dr, td, lag.

    Returns:
        df (pandas.DataFrame): input but with an additional 6 columns.
    '''

    # let's keep the original dataframe
    df_orig = df.copy()
    df_orig_keys = df_orig.columns
    df = df.reset_index()

    # how should you normalize?
    if args.normalize_method == 'division':
        opr = operator.truediv
    elif args.normalize_method == 'subtraction':
        opr = operator.sub

    # how to group samples, and which ones are control samples?
    # if user specifies with command-line arguments
    if args.group_by is not None and args.normalize_by is not None:
        groupby = args.group_by.split(',')
        controlby = checkParameterCommand(args.normalize_by)
    elif args.normalize_by is not None and args.group_by is None:
        controlby = checkParameterCommand(args.normalize_by)
        df.loc[:, 'Group'] = [1] * df.shape[0]
        groupby = ['Group']
    # else check columns for Group and Control variables
    elif 'Group' in df_orig_keys and 'Control' in df_orig_keys:
        groupby = ['Group']
        controlby = {'Control': 1}
        if (len(df.Group.unique()) == 1) and (len(df.Plate_ID.unique()) > 1):
            msg = '\nUSER WARNING: AMiGA detected a single "Group" but multiple Plate_IDs.\n'
            msg += 'Wells from different plates will thus be normalized together as a group.\n'
            msg += 'If this was not your intention, please pass explicit arguments to AMiGA\n'
            msg += 'using "--group-by" and "--control-by" arguments to avoid any ambiguity.\n'
            print(msg)
    # else exit with error message
    else:
        msg = 'FATAL USER ERROR: User must specify groups of samples and '
        msg += 'their corresponding control samples.'
        sys.exit(msg)

    # which parameters to normalize and/or to keep
    params_1 = initParamList(0)
    params_1.remove('diauxie')
    params_2 = ['mean({})'.format(ii) for ii in params_1]
    params_3 = initParamList(2)

    if any([ii in df_orig_keys for ii in params_2]):
        params = params_2
    elif any([ii in df_orig_keys for ii in params_3]):
        params = params_3
    else:
        params = params_1

    # params_norm = initParamList(2)
    params_keep = groupby + list(controlby.keys()) + ['Sample_ID', 'Plate_ID'] + params
    params_keep = list(df.columns[df.columns.isin(params_keep)])
    params_varbs = list(set(params_keep).difference(set(params)))

    df = df.loc[:, params_keep]

    norm_df = []
    for idx, row in df.loc[:, groupby].drop_duplicates().iterrows():
        df_group = subsetDf(df, row.to_dict()).loc[:, params_keep]
        df_group = df_group.sort_values(params_varbs)
        df_control = subsetDf(df_group, controlby)
        df_group.set_index(params_varbs, inplace=True)
        df_control.set_index(params_varbs, inplace=True)
        dgv = df_group.values
        dcv = df_control.mean().values
        df_group.loc[:, :] = opr(dgv, dcv)
        norm_df.append(df_group)

    norm_df = pd.concat(norm_df, axis=0)
    norm_df.columns = ['norm({})'.format(ii) for ii in norm_df.columns]
    norm_df = norm_df.reset_index(drop=False)

    df = pd.merge(df_orig, norm_df, on=params_varbs)
    if 'Sample_ID' in df.columns:
        df = df.set_index('Sample_ID')

    return df
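
# Illustrative sketch, not part of AMiGA: division vs. subtraction
#   normalization against the mean of a group's controls, as selected by
#   args.normalize_method above. The AUC values here are hypothetical.
def _example_normalize():
    import operator
    auc_case, auc_controls = 12.0, [4.0, 6.0]
    ctrl_mean = sum(auc_controls) / len(auc_controls)      # 5.0
    by_division = operator.truediv(auc_case, ctrl_mean)    # 2.4
    by_subtraction = operator.sub(auc_case, ctrl_mean)     # 7.0
    return by_division, by_subtraction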