def model(self, nthin=1, store=False, verbose=False):
    '''
    Infers growth parameters of interest (including diauxic shifts) by
        Gaussian Process fitting of data.

    Args:
        nthin (int): thinning factor passed to GrowthModel; fit every nthin-th time point
        store (boolean): if True, input OD, GP fit, and GP derivative are stored
            as the object's gp_data attribute
        verbose (boolean)

    Actions:
        modifies self.key, and may create a self.gp_data attribute
    '''

    # get user-defined parameters from config.py
    posterior_n = getValue('n_posterior_samples')

    # initialize variables for storing parameters and data
    data_ls, diauxie_dict = [], {}
    gp_params = initParamDf(self.key.index, 0)

    for sample_id in self.key.index:

        pid, well = self.key.loc[sample_id, ['Plate_ID', 'Well']].values
        smartPrint('Fitting {}\t{}'.format(pid, well), verbose)

        # extract sample
        args_dict = self.key.loc[sample_id, ['Well', 'Plate_ID']].to_dict()
        sample = self.extractGrowthData(args_dict)

        df = sample.time.join(sample.data)
        df.columns = ['Time', 'OD']

        # create GP object and analyze
        gm = GrowthModel(df=df, baseline=sample.key.OD_Baseline.values,
                         ARD=False, heteroscedastic=False, nthin=nthin)

        curve = gm.run(name=sample_id)

        diauxie_dict[sample_id] = curve.params.pop('df_dx')
        gp_params.loc[sample_id, :] = curve.params

        # passively save data, manipulation occurs below (input OD, GP fit, & GP derivative)
        if store:
            data_ls.append(curve.data())

    diauxie_df = mergeDiauxieDfs(diauxie_dict)

    # record results in object's key
    self.key = self.key.join(gp_params)
    self.key = pd.merge(self.key, diauxie_df, on='Sample_ID')

    # plotting needs transformed (or real) OD & GP fit, and may need GP derivative; save all as object attributes
    if store:
        self.gp_data = pd.concat(data_ls).reset_index(drop=True)

    return None
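
# Usage sketch (hypothetical, not part of this module): assuming `plate` is an
# object exposing model() along with a populated self.key, calling with
# store=True populates both the per-sample parameter columns on plate.key and
# the plate.gp_data attribute used for plotting:
#
#   plate.model(nthin=2, store=True, verbose=True)
#   plate.key          # growth parameters + diauxie calls, one row per sample
#   plate.gp_data      # input OD, GP fit, and GP derivative
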
def executeRegression(self):
    '''
    Computes the log Bayes Factor and its null distribution (based on
        permutation tests).

    Args (read from object attributes):
        data (pandas.DataFrame): each row is a single measurement (i.e. time
            point in a well), columns are variables and must include 'Time' and 'OD'.
        hypothesis (dictionary): keys must be 'H0' and 'H1', values are lists of
            variables (must match data keys)
        nperm (int): number of permutations used to generate the null distribution

    Actions:
        sets self.log_BF (float): log Bayes Factor = log(P(H1|D) / P(H0|D))
        sets self.log_BF_null_dist (list of floats): null distribution for the
            log Bayes Factor, where the variable of interest was permuted nperm times.
    '''

    verbose = self.verbose
    hypothesis = self.hypothesis
    fix_noise = self.args.fix_noise
    nperm = self.args.number_permutations
    nthin = self.args.time_step_size

    data = self.data
    data0 = data.loc[:, ['OD'] + hypothesis['H0']]
    data1 = data.loc[:, ['OD'] + hypothesis['H1']]

    gm0 = GrowthModel(df=data0, ARD=True, heteroscedastic=fix_noise,
                      nthin=nthin, logged=self.plate.mods.logged)
    gm1 = GrowthModel(df=data1, ARD=True, heteroscedastic=fix_noise,
                      nthin=nthin, logged=self.plate.mods.logged)

    gm0, LL0 = gm0.run(predict=False)
    gm1, LL1 = gm1.run(predict=False)

    log_BF = LL1 - LL0

    self.log_BF = log_BF
    self.model = gm1
    self.LL0 = LL0
    self.LL1 = LL1
    self.log_BF_null_dist = None

    null_distribution = []

    # the variable of interest is the one present in H1 but absent from H0
    to_permute = list(set(hypothesis['H1']).difference(set(hypothesis['H0'])))[0]
    for rep in range(nperm):
        smartPrint('Permutation #{}'.format(rep), verbose)
        null_distribution.append(gm1.permute(to_permute) - LL0)
    smartPrint('', verbose)

    if null_distribution:
        self.log_BF_null_dist = null_distribution
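
# Interpretation sketch (an assumption, not defined in this module): a common
# way to calibrate the observed log Bayes Factor against the permutation null
# is the empirical exceedance fraction, e.g. with a fitted instance `result`:
#
#   null = result.log_BF_null_dist
#   frac = sum(b >= result.log_BF for b in null) / len(null)
#
# A small fraction suggests the observed log_BF is unlikely under H0.
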
def runCombinedGrowthFitting(data, mapping, directory, args, verbose=False):
    '''
    Uses Gaussian Processes to fit growth curves and infer parameters of growth
        kinetics. While runGrowthFitting() analyzes data one plate at a time,
        runCombinedGrowthFitting() can pool experimental replicates across
        different plates. The downside is that data summaries must be merged
        and no 96-well plate grid figure can be produced.

    Args:
        data (pandas.DataFrame): number of time points (t) x number of variables
            plus-one (p+1); plus-one because Time is not an index but rather a column.
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
        directory (dictionary): keys are folder names, values are their paths
        args (dictionary): keys are arguments and values are user/default choices
        verbose (boolean)

    Action:
        saves summary text file(s) in summary folder in the parent directory.
        saves figures (PDFs) in figures folder in the parent directory.
        saves data text file(s) in derived folder in the parent directory.
    '''

    # if user did not pass file name for output, use time stamp, see selectFileName()
    filename = selectFileName(args['fout'])

    # pre-process data
    plate = prepDataForFitting(data, mapping, subtract_baseline=False)

    # which meta-data variables do you use to group replicates?
    combine_keys = args['pb'].split(',')
    missing_keys = [ii for ii in combine_keys if ii not in plate.key.columns]

    if missing_keys:
        msg = 'FATAL USER ERROR: The following keys {} are '.format(missing_keys)
        msg += 'missing from mapping files.'
        sys.exit(msg)

    # continue processing data
    plate.subtractBaseline(to_do=True, poly=getValue('PolyFit'), groupby=combine_keys)
    plate_key = plate.key.copy()
    plate_data = plate.data.copy()
    plate_time = plate.time.copy()

    plate_cond = plate_key.loc[:, combine_keys + ['Group', 'Control']]
    plate_cond = plate_cond.drop_duplicates(combine_keys).reset_index(drop=True)
    smartPrint('AMiGA detected {} unique conditions.\n'.format(plate_cond.shape[0]), verbose)

    data_ls, diauxie_dict = [], {}

    # get user-defined values from config.py
    dx_ratio_varb = getValue('diauxie_ratio_varb')
    dx_ratio_min = getValue('diauxie_ratio_min')
    posterior_n = getValue('n_posterior_samples')
    scale = getValue('params_scale')
    posterior = args['slf']
    fix_noise = args['fn']
    nthin = args['nthin']

    # initialize empty dataframes for storing growth parameters
    params_latent = initParamDf(plate_cond.index, complexity=0)
    params_sample = initParamDf(plate_cond.index, complexity=1)

    # for each unique condition based on user request
    for idx, condition in plate_cond.iterrows():

        # get list of sample IDs
        cond_dict = condition.drop(['Group', 'Control'])
        cond_dict = cond_dict.to_dict()  # e.g. {'Substrate':['D-Trehalose'],'PM':[1]}
        cond_idx = subsetDf(plate_key, cond_dict).index.values  # list of index values for N samples

        smartPrint('Fitting\n{}'.format(tidyDictPrint(cond_dict)), verbose)

        # get data and format for GP instance
        cond_data = plate_data.loc[:, list(cond_idx)]  # T x N
        cond_data = plate_time.join(cond_data)  # T x N+1
        cond_data = cond_data.melt(id_vars='Time', var_name='Sample_ID', value_name='OD')
        cond_data = cond_data.drop(['Sample_ID'], axis=1)  # T*R x 2 (where R is number of replicates)
        cond_data = cond_data.dropna()

        gm = GrowthModel(df=cond_data, ARD=True, heteroscedastic=fix_noise, nthin=nthin)

        curve = gm.run(name=idx)

        # get parameter estimates using latent function
        diauxie_dict[idx] = curve.params.pop('df_dx')
        params_latent.loc[idx, :] = curve.params

        # get parameter estimates using samples from the posterior distribution
        if posterior:
            params_sample.loc[idx, :] = curve.sample().posterior

        # passively save data, manipulation occurs below (input OD, GP fit, & GP derivative)
        if args['sgd']:
            time = pd.DataFrame(gm.x_new, columns=['Time'])
            mu0, var0 = np.ravel(gm.y0), np.ravel(np.diag(gm.cov0))
            mu1, var1 = np.ravel(gm.y1), np.ravel(np.diag(gm.cov1))

            if fix_noise:
                sigma_noise = np.ravel(gm.error_new) + gm.noise
            else:
                sigma_noise = np.ravel([gm.noise] * time.shape[0])

            mu_var = pd.DataFrame([mu0, var0, mu1, var1, sigma_noise],
                                  index=['mu', 'Sigma', 'mu1', 'Sigma1', 'Noise']).T

            gp_data = pd.DataFrame([list(condition.values)] * len(mu0),
                                   columns=condition.keys())
            gp_data = gp_data.join(time).join(mu_var)
            data_ls.append(gp_data)

    # summarize diauxie results
    diauxie_df = mergeDiauxieDfs(diauxie_dict)

    if posterior:
        gp_params = params_sample.join(params_latent['diauxie'])
    else:
        gp_params = params_latent

    # record results in object's key
    plate_cond = plate_cond.join(gp_params)
    plate_cond.index.name = 'Sample_ID'
    plate_cond = plate_cond.reset_index(drop=False)
    plate_cond = pd.merge(plate_cond, diauxie_df, on='Sample_ID')

    params = initParamList(0) + initParamList(1)
    params = list(set(params).intersection(set(plate_cond.keys())))

    df_params = plate_cond.drop(initDiauxieList(), axis=1).drop_duplicates()
    df_diauxie = plate_cond[plate_cond.diauxie == 1]
    df_diauxie = df_diauxie.drop(params, axis=1)
    df_diauxie = minimizeDiauxieReport(df_diauxie)

    summ_path = assembleFullName(directory['summary'], '', filename, 'summary', '.txt')
    diux_path = assembleFullName(directory['summary'], '', filename, 'diauxie', '.txt')

    # normalize parameters, if requested
    df_params = normalizePooledParameters(args, df_params)
    df_params = df_params.drop(['Group', 'Control'], axis=1)
    df_params = minimizeParameterReport(df_params)

    # save results
    df_params.to_csv(summ_path, sep='\t', header=True, index=False)
    if df_diauxie.shape[0] > 0:
        df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

    # save latent functions
    if args['sgd']:
        file_path = assembleFullName(directory['derived'], '', filename, 'gp_data', '.txt')
        gp_data = pd.concat(data_ls, sort=False).reset_index(drop=True)
        gp_data.to_csv(file_path, sep='\t', header=True, index=True)

    return None
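
# Usage sketch (hypothetical values): pooling replicates by Substrate and PM.
# The args keys shown ('fout', 'pb', 'slf', 'fn', 'nthin', 'sgd') are the ones
# this function reads directly; other pipeline keys may be required by helpers
# such as normalizePooledParameters().
#
#   args = {'fout': None, 'pb': 'Substrate,PM', 'slf': False,
#           'fn': False, 'nthin': 1, 'sgd': True}
#   runCombinedGrowthFitting(data, mapping, directory, args, verbose=True)
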
def savePredictions(self):
    '''
    Given model predictions of growth curves (for each unique set of conditions
        tested), describe the latent function and its derivative in terms of
        growth parameters. Reports results in a file named {file_name}_params
        in the dir_path directory.

    Args (read from object attributes):
        model (GPy.models.gp_regression.GPRegression)
        data (pandas.DataFrame)
        hypothesis (dictionary): e.g. {'H0':['Time'],'H1':['Time','Substrate']}
        factor_dict (dictionary): mapping of unique values of variables to numerical integers
        posterior (boolean)
        save_latent (boolean)
        dir_path (str): path to directory
        file_name (str): file name
    '''

    data = self.data
    model = self.model
    hypothesis = self.hypothesis
    factor_dict = self.factor_dict
    variable = self.target[0]
    confidence = getValue('confidence')  # confidence interval, e.g. 0.95

    posterior = self.args['slf']
    save_latent = self.args['sgd']
    fix_noise = self.args['fn']

    dir_path = self.paths_dict['dir']
    file_name = self.paths_dict['filename']

    # define hypothesis parameters
    model_input = hypothesis['H1']

    # grab minimal input data for prediction
    x_full = self.x_full
    x_min = self.x_min

    diauxie_dict = {}
    params_latent = initParamDf(x_min.index, complexity=0)
    params_sample = initParamDf(x_min.index, complexity=1)

    for idx, row in x_min.iterrows():

        # get x and y data
        df = subsetDf(x_full.drop(['mu', 'Sigma', 'Noise'], axis=1), row.to_dict())

        # get curve based on model predictions
        gm = GrowthModel(model=model.model, x_new=df.values, ARD=True)
        curve = gm.run()

        # get parameter estimates using predicted curve
        diauxie_dict[idx] = curve.params.pop('df_dx')
        params_latent.loc[idx, :] = curve.params

        if posterior:
            params_sample.loc[idx, :] = curve.sample().posterior

    # summarize diauxie results
    diauxie_df = mergeDiauxieDfs(diauxie_dict)

    if posterior:
        gp_params = params_sample.join(params_latent['diauxie'])
    else:
        gp_params = params_latent

    gp_params = x_min.join(gp_params)
    gp_params.index.name = 'Sample_ID'
    gp_params = gp_params.reset_index(drop=False)
    gp_params = pd.merge(gp_params, diauxie_df, on='Sample_ID')

    # map encoded factor levels back to their original values
    x_out = x_full.copy()
    for key, mapping in factor_dict.items():
        if key in x_out.keys():
            x_out.loc[:, key] = x_out.loc[:, key].replace(reverseDict(mapping))
        if key in gp_params.keys():
            gp_params.loc[:, key] = gp_params.loc[:, key].replace(reverseDict(mapping))

    diauxie = initDiauxieList()
    params = initParamList(0) + initParamList(1)
    params = list(set(params).intersection(set(gp_params.keys())))

    df_params = gp_params.drop(diauxie, axis=1).drop_duplicates()
    df_params = minimizeParameterReport(df_params)

    df_diauxie = gp_params[gp_params.diauxie == 1].drop(params, axis=1)
    df_diauxie = minimizeDiauxieReport(df_diauxie)

    if posterior:
        df_params = prettyifyParameterReport(df_params, variable, confidence)
        df_params = articulateParameters(df_params, axis=0)

    summ_path = assembleFullName(dir_path, '', file_name, 'params', '.txt')
    diux_path = assembleFullName(dir_path, '', file_name, 'diauxie', '.txt')

    df_params.to_csv(summ_path, sep='\t', header=True, index=posterior)
    if df_diauxie.shape[0] > 0:
        df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

    if save_latent:
        file_path = assembleFullName(dir_path, '', file_name, 'output', '.txt')
        x_out.to_csv(file_path, sep='\t', header=True, index=True)
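
# Output sketch: with the attribute names used above, this method writes up to
# three tab-delimited files (exact naming depends on assembleFullName(), which
# is defined elsewhere):
#
#   {dir_path}/{file_name}_params.txt    growth parameters; indexed if posterior
#   {dir_path}/{file_name}_diauxie.txt   only if any diauxic shifts were called
#   {dir_path}/{file_name}_output.txt    latent function, only if save_latent
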