def subtractControl(self, to_do=False, drop=True):
    '''
    Subtract from each treatment sample's growth curve the growth curve of
        its corresponding control sample.

    Args:
        to_do (boolean): if False, do not subtract control wells and return None.
        drop (boolean): if True, drop control samples from data.
    '''

    if not to_do:
        return None

    data = self.data.copy()
    mapping = self.key

    # find all unique groups
    plate_groups = mapping.loc[:, ['Plate_ID', 'Group']].drop_duplicates()
    plate_groups = [tuple(x) for x in plate_groups.values]

    for plate_group in plate_groups:
        pid, group = plate_group

        # grab lists of Sample_ID of wells corresponding to controls and cases
        controls = subsetDf(mapping, {
            'Plate_ID': [pid],
            'Group': [group],
            'Control': [1]
        }).index.values
        cases = subsetDf(mapping, {
            'Plate_ID': [pid],
            'Group': [group]
        }).index.values  # includes controls

        if len(controls) == 0:
            msg = '\nFATAL ERROR: User requested subtraction of control samples. However, '
            msg += 'samples belonging to group {} of plate {} lack '.format(group, pid)
            msg += 'any corresponding control samples in the current working directory.\n'
            sys.exit(msg)

        data_controls = data.loc[:, controls]
        data_cases = data.loc[:, cases]

        # subtract the mean control curve from each case; because data are
        #   log-transformed, this is equivalent to division on the linear scale
        data_controls = data_controls.mean(1)
        data_cases = (data_cases.T - data_controls).T
        data.loc[:, cases] = data_cases.values

        if drop:
            data = data.drop(controls, axis=1)

    self.data = data
    self.mods.controlled = True
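
# Illustrative sketch, not part of AMiGA: the control-subtraction arithmetic
#   above on a toy dataframe. The two-well layout and column names ('W1' as
#   control, 'W2' as case) are hypothetical.
def _example_subtract_control():
    import pandas as pd
    # two wells measured at three time points; W1 is the control, W2 the case
    data = pd.DataFrame({'W1': [0.0, 0.5, 1.0], 'W2': [0.0, 1.0, 2.0]})
    controls, cases = ['W1'], ['W1', 'W2']
    # subtract the mean control curve from every case curve (row-wise)
    corrected = (data[cases].T - data[controls].mean(1)).T
    return corrected.drop(controls, axis=1)  # W2 becomes [0.0, 0.5, 1.0]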
def normalizePooledParameters(args, df):
    '''
    Normalizes growth parameters to control samples for pooled parameters.

    Args:
        args (dictionary): keys are arguments and values are user/default choices
        df (pandas.DataFrame): rows are samples, columns are experimental variables.
            Must include Plate_ID, Group, Control, auc, k, gr, dr, td, lag.

    Returns:
        df (pandas.DataFrame): input but with an additional 6 columns.
    '''

    if (not args['norm']) or (not args['pool']):
        return df

    df_orig = df.copy()
    df_orig_keys = df_orig.columns

    poolby = args['pb'].split(',')
    normalizeby = checkParameterCommand(args['nb'])

    params_1 = initParamList(0)
    params_1.remove('diauxie')
    params_2 = ['mean({})'.format(ii) for ii in params_1]

    if any([ii in df_orig_keys for ii in params_2]):
        params = params_2
    else:
        params = params_1

    params_norm = initParamList(2)
    params_keep = ['Sample_ID'] + poolby + params

    df = df.loc[:, params_keep]
    controls = subsetDf(df, normalizeby)
    variable = list(set(poolby).difference(set(normalizeby.keys())))

    norm_df = []
    for _, row in df[variable].drop_duplicates().iterrows():
        sub_df = subsetDf(df, row.to_dict()).set_index(['Sample_ID'] + poolby)
        sub_ctrl = subsetDf(controls, row.to_dict()).set_index(['Sample_ID'] + poolby)
        norm_df.append(sub_df / sub_ctrl.values)  # .reset_index()

    norm_df = pd.concat(norm_df, axis=0)
    norm_df.columns = params_norm
    norm_df = norm_df.reset_index(drop=False)

    df = pd.merge(df_orig, norm_df, on=['Sample_ID'] + poolby)
    # df = df.drop(['Group','Control'], axis=1)

    return df
def main(args):

    verbose = args.verbose
    # directory = assemblePath(args.input,'summary')
    directory = args.input

    msg = 'AMiGA is peeking inside the summary directory'
    smartPrint('', verbose)
    smartPrint(tidyMessage(msg), verbose)
    # smartPrint(checkDirectoryNotEmpty(directory,'Summary')[1],verbose)

    criteria = checkParameterCommand(args.subset, sep=',')

    directory, filename = isFileOrFolder(directory, up=1)

    if filename:
        ls_files = ['{}{}{}'.format(directory, os.sep, filename)]
    else:
        ls_files = findPlateReaderFiles(directory, '.txt')

    full_df = read(ls_files)
    sub_df = subsetDf(full_df, criteria)
    sub_df = group(sub_df, args)
    sub_df = pivot(sub_df, args, args.value)
    sub_df = reduceDf(sub_df, args)

    clusterMap(sub_df, full_df, args, directory)
    saveDf(full_df, sub_df, args, directory)
def addRealPlotLine(ax, plate, criteria, color, plot_params):
    '''
    Given data (plate) and criteria, find relevant sample IDs and plot them on axis.

    Args:
        ax (matplotlib.axes._subplots.AxesSubplot)
        plate (GrowthPlate object)
        criteria (dictionary): keys must be column headers in plate.key,
            values must be values in plate.key.
        color (str or (R,G,B,A)) where R,G,B,A are floats [0,1]
        plot_params (dictionary)

    Returns:
        ax (matplotlib.axes._subplots.AxesSubplot)
    '''

    if plot_params['overlay_actual_data']:

        samples = list(subsetDf(plate.key, criteria).index)

        if len(samples) == 0:
            return ax

        time = plate.time.copy()
        data = plate.data.copy()

        # if plot_params['plot_linear_od']:
        #     data = data.apply(np.exp).copy()

        wide_df = time.join(data)
        wide_df = wide_df.reindex(['Time'] + samples, axis=1).set_index('Time')
        wide_df = wide_df.dropna(axis=1)  # necessary to get rid of controls

        ax.plot(wide_df, color=color, alpha=0.5, lw=1, zorder=1)

    return ax
def subset(args, df):

    ls_df, ls_varbs = [], []

    for ii in args['s']:
        criteria = checkParameterCommand(ii, sep=',')
        ls_df.append(subsetDf(df, criteria))
        ls_varbs.append(list(criteria.keys()))

    df = pd.concat(ls_df, sort=False).reset_index(drop=True).drop_duplicates()

    if df.shape[0] > 2:
        msg = '\nFATAL USER ERROR: User-provided summary files and subsetting criteria '
        msg += 'selected for more than two conditions. AMiGA cannot perform comparison '
        msg += 'on more than two conditions. Please check your arguments and try again. '
        msg += 'Below are the currently selected conditions.\n\n'
        keys = [ii for ii in df.keys() if ('(' not in ii) & (ii != 'diauxie')]
        print(msg)
        print(df.loc[:, keys], '\n\n')
        sys.exit()
    else:
        ls_varbs = [item for sublist in ls_varbs for item in sublist]
        ls_varbs = list(set(ls_varbs))

    return df, ls_varbs
def main():

    args = parseCommand()
    verbose = args['verbose']
    # directory = assemblePath(args['fi'],'summary')
    directory = args['fi']

    msg = 'AMiGA is peeking inside the summary directory'
    smartPrint('', verbose)
    smartPrint(tidyMessage(msg), verbose)
    # smartPrint(checkDirectoryNotEmpty(directory,'Summary')[1],verbose)

    criteria = checkParameterCommand(args['s'], sep=',')

    directory, filename = isFileOrFolder(directory, up=1)

    if filename:
        ls_files = ['{}{}{}'.format(directory, os.sep, filename)]
    else:
        ls_files = findPlateReaderFiles(directory, '.txt')

    df = read(ls_files)
    df = subsetDf(df, criteria)
    df = group(df, args)
    df = pivot(df, args)
    df = reduceDf(df, args)

    # plot(df,args,directory)
    clusterMap(df, args, directory)
def shouldYouSubtractControl(mapping, variables):
    '''
    Checks if control samples must be subtracted from treatment samples for
        proper hypothesis testing. In particular, make sure that the variable
        of interest is binary (i.e. it has only two possible values in the
        mapping dataframe). This makes sure that GP regression on the variable
        of interest is performing a test on a binary variable.

    Args:
        mapping (pandas.DataFrame): samples (n) by variables (k)
        variables (list): each item must be a column header in the mapping argument

    Returns:
        (boolean)
    '''

    unique_values = mapping.loc[:, variables].drop_duplicates().reset_index()

    # subtract control curves only if none of the unique values corresponds
    #   entirely to control wells; every value must be checked (rather than
    #   returning on the first non-control value) for this test to be meaningful
    for _, row in unique_values.iterrows():
        criteria = row.to_dict()
        sub_map = subsetDf(mapping, criteria)
        sub_map_controls_n = sub_map[sub_map.Control == 1].shape[0]
        sub_map_total_n = sub_map.shape[0]
        if (sub_map_controls_n == sub_map_total_n) and (sub_map_controls_n > 0):
            return False  # found a value whose samples all correspond to control wells

    return True
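
# Illustrative sketch, not part of AMiGA: the control-subtraction decision
#   above on a toy mapping. The 'Substrate' values and two-well-per-condition
#   layout are hypothetical; the 'Control' column mirrors the one assumed above.
def _example_should_subtract_control():
    import pandas as pd
    mapping = pd.DataFrame({
        'Substrate': ['Negative Control', 'Negative Control', 'D-Glucose', 'D-Glucose'],
        'Control': [1, 1, 0, 0],
    })
    # 'Negative Control' wells are all controls, so subtraction is unnecessary:
    #   the control curves themselves are one of the two values being compared
    for value, sub in mapping.groupby('Substrate'):
        if (sub.Control == 1).all():
            return False
    return True  # would subtract controls otherwise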
def trimMergeMapping(mapping_dict, verbose=False):
    '''
    Trims and merges mapping dataframes into one master mapping dataframe.

    Args:
        mapping_dict (dictionary): keys are plate IDs and values are
            pandas.DataFrames with size n x p, where n is the number of wells
            (or samples) in the plate, and p is the number of variables or
            parameters described in the dataframe.
        verbose (boolean)

    Returns:
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
    '''

    # merge mapping dataframes
    # sort will force shared (inner) keys to the lead and unshared (outer) keys to the caboose
    # useful because individual mapping files may lack certain columns, some may even be empty
    master_mapping = pd.concat(mapping_dict.values(), ignore_index=True,
                               join='outer', sort=False)

    # trim mapping based on Subset and Flag columns
    master_mapping = subsetDf(master_mapping, {'Subset': [1], 'Flag': [0]})

    # reset index and set as Sample_ID
    master_mapping = resetNameIndex(master_mapping, 'Sample_ID', True)

    return master_mapping
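
# Illustrative sketch, not part of AMiGA: how the outer join behaves when
#   plates document different columns, followed by the Subset/Flag trim.
#   Plate IDs ('PM1', 'PM2') and the 'Strain' column are hypothetical.
def _example_trim_merge_mapping():
    import pandas as pd
    mapping_dict = {
        'PM1': pd.DataFrame({'Well': ['A1'], 'Subset': [1], 'Flag': [0]}),
        'PM2': pd.DataFrame({'Well': ['A1'], 'Subset': [1], 'Flag': [1],
                             'Strain': ['WT']}),
    }
    # the outer join keeps 'Strain', filled with NaN for PM1's rows
    merged = pd.concat(mapping_dict.values(), ignore_index=True,
                       join='outer', sort=False)
    # keep rows marked for analysis (Subset == 1) and not flagged (Flag == 0)
    return merged[(merged.Subset == 1) & (merged.Flag == 0)]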
def saveDf(full_df, sub_df, args, directory):

    if not args.save_filtered_table:
        return None

    sub_df = subsetDf(full_df, {args.y_variable: list(sub_df.index.values),
                                args.x_variable: list(sub_df.keys().values)})

    fpath = assembleFullName(directory, '', args.output, 'filtered', '.txt')
    sub_df.to_csv(fpath, sep='\t', header=True, index=True)
def updateMappingControls(master_mapping, mapping_dict, to_do=False):
    '''
    For all samples in master mapping, find relevant controls and add these
        controls to the master mapping dataframe.

    Args:
        master_mapping (pandas.DataFrame)
        mapping_dict (dictionary)
        to_do (boolean)

    Returns:
        master_mapping (pandas.DataFrame): will have more rows (i.e. samples) than input
    '''

    # check first if you need to do this
    if not to_do:
        return master_mapping

    # find all unique groups
    plate_groups = master_mapping.loc[:, ['Plate_ID', 'Group']].drop_duplicates()
    plate_groups = [tuple(x) for x in plate_groups.values]

    # grab all relevant control samples
    df_controls = []
    for plate_group in plate_groups:
        pid, group = plate_group
        pid_mapping = mapping_dict[pid]
        df_controls.append(subsetDf(pid_mapping, {
            'Plate_ID': [pid],
            'Group': [group],
            'Control': [1]
        }))

    # re-assemble the master mapping dataframe, including the proper controls
    df_controls = pd.concat(df_controls)
    master_mapping = pd.concat([master_mapping.copy(), df_controls.copy()], sort=True)
    master_mapping = master_mapping.reset_index(drop=True)
    master_mapping.index.name = 'Sample_ID'
    master_mapping = master_mapping.sort_values(['Plate_ID', 'Group', 'Control'])

    # if mapping has an interaction column, replace NaN with 0 (so it won't be dropped later)
    # because you are (above) adding control samples to master_mapping,
    #   they will not have the interaction column and their values will default to NaN
    variable = [v for v in master_mapping.keys() if '*' in v]
    master_mapping.loc[:, variable] = master_mapping.loc[:, variable].fillna(0)

    return master_mapping
def addInteractionTerm(self):
    '''
    If the user passed a hypothesis with an interaction term (identified by an
        asterisk), create the corresponding interaction variable in the master
        mapping dataframe.
    '''

    # add interaction term, if needed
    mapping = self.master_mapping

    for variable in self.target:

        if '*' in variable:

            pairs = variable.split('*')
            var_dict = {}

            if ('(' in variable) and (')' in variable):
                for pair in pairs:
                    var, ctrl = pair.split('(')
                    var_dict[var] = ctrl[:-1]
                intx = subsetDf(mapping, var_dict).index.values
                mapping.loc[:, variable] = [0] * mapping.shape[0]
                mapping.loc[intx, variable] = [1] * len(intx)
            else:
                df = mapping.loc[:, pairs].drop_duplicates()
                df.loc[:, variable] = df.apply(
                    lambda x: '{} x {}'.format(x[pairs[0]], x[pairs[1]]), axis=1)
                mapping = pd.merge(mapping.reset_index(), df, on=pairs, how='left')
                mapping = mapping.set_index('Sample_ID')

    self.master_mapping = mapping
def savePredictions(self):
    '''
    Given model predictions of growth curves (for each unique set of conditions
        tested), describe the latent function and its derivative in terms of
        growth parameters. Reports results in a file named {file_name}_params
        in the dir_path directory.

    Uses (object attributes):
        model (GPy.models.gp_regression.GPRegression)
        data (pandas.DataFrame)
        hypothesis (dictionary): e.g. {'H0':['Time'],'H1':['Time','Substrate']}
        factor_dict (dictionary): mapping of unique values of variables to numerical integers
        posterior (boolean)
        save_latent (boolean)
        dir_path (str): path to directory
        file_name (str): file name

    Returns:
        x_full (pandas.DataFrame)
        x_min (pandas.DataFrame)
    '''

    data = self.data
    model = self.model
    hypothesis = self.hypothesis
    factor_dict = self.factor_dict
    variable = self.target[0]
    confidence = getValue('confidence')  # confidence interval, e.g. 0.95

    posterior = self.args['slf']
    save_latent = self.args['sgd']
    fix_noise = self.args['fn']

    dir_path = self.paths_dict['dir']
    file_name = self.paths_dict['filename']

    # define hypothesis parameters
    model_input = hypothesis['H1']

    # grab minimal input data for prediction
    x_full = self.x_full
    x_min = self.x_min

    diauxie_dict = {}
    params_latent = initParamDf(x_min.index, complexity=0)
    params_sample = initParamDf(x_min.index, complexity=1)

    for idx, row in x_min.iterrows():

        # get x and y data
        df = subsetDf(x_full.drop(['mu', 'Sigma', 'Noise'], axis=1), row.to_dict())

        # get curve based on model predictions
        gm = GrowthModel(model=model.model, x_new=df.values, ARD=True)
        curve = gm.run()

        # get parameter estimates using predicted curve
        diauxie_dict[idx] = curve.params.pop('df_dx')
        params_latent.loc[idx, :] = curve.params

        if posterior:
            params_sample.loc[idx, :] = curve.sample().posterior

    # summarize diauxie results
    diauxie_df = mergeDiauxieDfs(diauxie_dict)

    if posterior:
        gp_params = params_sample.join(params_latent['diauxie'])
    else:
        gp_params = params_latent

    gp_params = x_min.join(gp_params)
    gp_params.index.name = 'Sample_ID'
    gp_params = gp_params.reset_index(drop=False)
    gp_params = pd.merge(gp_params, diauxie_df, on='Sample_ID')

    # save gp_data fit
    x_out = x_full.copy()
    for key, mapping in factor_dict.items():
        if key in x_out.keys():
            x_out.loc[:, key] = x_out.loc[:, key].replace(reverseDict(mapping))
        if key in gp_params.keys():
            gp_params.loc[:, key] = gp_params.loc[:, key].replace(reverseDict(mapping))

    # params = initParamList(0)
    diauxie = initDiauxieList()
    params = initParamList(0) + initParamList(1)
    params = list(set(params).intersection(set(gp_params.keys())))

    df_params = gp_params.drop(diauxie, axis=1).drop_duplicates()
    df_params = minimizeParameterReport(df_params)

    df_diauxie = gp_params[gp_params.diauxie == 1].drop(params, axis=1)
    df_diauxie = minimizeDiauxieReport(df_diauxie)

    if posterior:
        df_params = prettyifyParameterReport(df_params, variable, confidence)
        df_params = articulateParameters(df_params, axis=0)

    summ_path = assembleFullName(dir_path, '', file_name, 'params', '.txt')
    diux_path = assembleFullName(dir_path, '', file_name, 'diauxie', '.txt')

    # plate_cond.to_csv(file_path,sep='\t',header=True,index=True)
    df_params.to_csv(summ_path, sep='\t', header=True, index=posterior)
    if df_diauxie.shape[0] > 0:
        df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

    if save_latent:
        file_path = assembleFullName(dir_path, '', file_name, 'output', '.txt')
        x_out.to_csv(file_path, sep='\t', header=True, index=True)
def __init__(self, df=None, model=None, x_new=None, baseline=1.0,
             ARD=False, heteroscedastic=False, nthin=1):
    '''
    Data structure for Gaussian Process regression and related parameter inference.

    Attributes:
        x (numpy.ndarray): independent variables (N x D), where N is the number
            of observations, and D is the number of dimensions (or variables).
        y (numpy.ndarray): dependent variables (N x 1), where N is the number
            of observations, and the only column is the dependent or observed
            variable (often Optical Density or OD).
        key (dict or pandas.DataFrame): dictionary (k) or pandas.DataFrame (1 x k)
            that describes k experimental variables about the sample. Must
            include 'OD_Baseline' and 'Fold_Change' variables.

    Notes: for growth curve analysis, it is assumed that y was log-transformed
        and baseline-corrected.
    '''

    if model:
        self.model = model
        self.x_new = x_new
        self.ARD = ARD
        self.baseline = baseline
        self.y = None
        self.df = None
        return None

    self.df = df.copy()

    # create a dummy non-unique variable/column
    foo = uniqueRandomString(avoid=df.keys())
    df[foo] = [1] * df.shape[0]

    varbs = df.drop(['Time', 'OD'], axis=1).drop_duplicates()

    # for each unique non-time variable, estimate variance
    new_df = []
    for idx, row in varbs.iterrows():
        sub_df = subsetDf(df, row.to_dict())
        sub_df = describeVariance(sub_df, time='Time', od='OD')
        new_df.append(sub_df)
    new_df = pd.concat(new_df, axis=0)
    new_df = new_df.drop(['SID', foo], axis=1)

    # construct a thinner dataframe to speed up regression
    time = new_df.Time.sort_values().unique()
    time = time[::int(nthin)]
    thin_df = new_df[new_df.Time.isin(time)]

    # predictions of error and new x are based on the full dataframe
    tmp = new_df.drop(['OD'], axis=1).drop_duplicates()
    error_new = tmp.loc[:, ['error']].values
    x_new = tmp.drop(['error'], axis=1).values

    # regression is based on input/output/error from the thinned dataframe
    x = thin_df.drop(['OD', 'error'], axis=1).values
    y = thin_df.loc[:, ['OD']].values
    error = thin_df.loc[:, ['error']].values
    x_keys = thin_df.drop(['OD', 'error'], axis=1).keys()

    # save attributes
    self.x_keys = x_keys
    self.x_new = x_new
    self.x = x
    self.y = y
    self.error = error
    self.error_new = error_new
    self.baseline = baseline
    self.model = model
    self.ARD = ARD
    self.heteroscedastic = heteroscedastic
    self.noise = None
def computeFoldChange(self, subtract_baseline=True):
    '''
    Computes the fold change for all wells using the object's unmodified raw data.
        The object's key must have the following columns ['Plate_ID','Group','Control'].
        Control values must be {0,1}. The fold change is computed using measurements
        that have had the first measurement (i.e. the first time point) subtracted.
        The maximum measurements of control wells are averaged to get the denominator
        (i.e. the average maximum OD of control wells), which divides the maximum OD
        of all cases. Fold changes are normalized to controls belonging to the same
        group; all wells in a Biolog plate belong to the same group and have the
        same control (the A1 well).
    '''

    mapping = self.key.copy()

    # if mapping lacks Group and Control columns, skip
    if ('Group' not in mapping.keys()) or ('Control' not in mapping.keys()):
        mapping.loc[:, 'Fold_Change'] = [np.nan] * mapping.shape[0]
        self.key = mapping
        return None

    df = self.input_data.copy()  # time points by wells, input data that remains unmodified

    # subtract first time point from each column (i.e. well)
    if subtract_baseline:
        baseline = df.iloc[0, :]
        df = df.apply(lambda row: row - baseline, axis=1)

    # find all unique groups
    plate_groups = mapping.loc[:, ['Plate_ID', 'Group']].drop_duplicates()
    plate_groups = [tuple(x) for x in plate_groups.values]

    for plate_group in plate_groups:
        pid, group = plate_group

        # grab lists of Sample_ID of wells corresponding to controls and cases
        controls = subsetDf(mapping, {
            'Plate_ID': [pid],
            'Group': [group],
            'Control': [1]
        }).index.values
        cases = subsetDf(mapping, {
            'Plate_ID': [pid],
            'Group': [group],
            'Control': [0]
        }).index.values

        # if group does not have a control, skip
        if len(controls) == 0:
            mapping.loc[cases, 'Fold_Change'] = [np.nan] * len(cases)
            continue

        df_controls = df.loc[:, controls]
        df_cases = df.loc[:, cases]

        # for the denominator, take the max of each control column (i.e. well),
        #   then average across all controls
        df_controls_fc = df_controls.max(0) / df_controls.max(0).mean(0)
        df_cases_fc = df_cases.max(0) / df_controls.max(0).mean(0)

        mapping.loc[controls, 'Fold_Change'] = df_controls_fc
        mapping.loc[cases, 'Fold_Change'] = df_cases_fc

    self.key = mapping
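
# Illustrative sketch, not part of AMiGA: the fold-change arithmetic above on
#   toy baseline-corrected OD curves. Well names ('C1', 'C2' as controls,
#   'X1' as case) and values are hypothetical.
def _example_fold_change():
    import pandas as pd
    # two control wells and one case well, already baseline-corrected
    df = pd.DataFrame({'C1': [0.0, 0.4, 0.8], 'C2': [0.0, 0.6, 1.2],
                       'X1': [0.0, 1.0, 2.0]})
    ctrl_max_mean = df[['C1', 'C2']].max(0).mean()  # (0.8 + 1.2) / 2 = 1.0
    return df['X1'].max() / ctrl_max_mean           # 2.0 / 1.0 = 2.0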
def runCombinedGrowthFitting(data, mapping, directory, args, verbose=False):
    '''
    Uses Gaussian Processes to fit growth curves and infer parameters of growth
        kinetics. While runGrowthFitting() analyzes data one plate at a time,
        runCombinedGrowthFitting() can pool experimental replicates across
        different plates. The downside is that data summaries must be merged
        and no 96-well plate grid figure can be produced.

    Args:
        data (pandas.DataFrame): number of time points (t) x number of variables
            plus-one (p+1); plus-one because Time is not an index but rather a column.
        mapping (pandas.DataFrame): number of wells/samples (n) x number of variables (p)
        directory (dictionary): keys are folder names, values are their paths
        args (dictionary): keys are arguments and values are user/default choices
        verbose (boolean)

    Action:
        saves summary text file(s) in summary folder in the parent directory.
        saves figures (PDFs) in figures folder in the parent directory.
        saves data text file(s) in derived folder in the parent directory.
    '''

    # if user did not pass file name for output, use time stamp, see selectFileName()
    filename = selectFileName(args['fout'])

    # pre-process data
    plate = prepDataForFitting(data, mapping, subtract_baseline=False)

    # which meta-data variables do you use to group replicates?
    combine_keys = args['pb'].split(',')
    missing_keys = [ii for ii in combine_keys if ii not in plate.key.columns]

    if missing_keys:
        msg = 'FATAL USER ERROR: The following keys {} are '.format(missing_keys)
        msg += 'missing from mapping files.'
        sys.exit(msg)

    # continue processing data
    plate.subtractBaseline(to_do=True, poly=getValue('PolyFit'), groupby=combine_keys)
    plate_key = plate.key.copy()
    plate_data = plate.data.copy()
    plate_time = plate.time.copy()
    plate_cond = plate_key.loc[:, combine_keys + ['Group', 'Control']]
    plate_cond = plate_cond.drop_duplicates(combine_keys).reset_index(drop=True)

    smartPrint('AMiGA detected {} unique conditions.\n'.format(plate_cond.shape[0]),
               verbose)

    data_ls, diauxie_dict = [], {}

    # get user-defined values from config.py
    dx_ratio_varb = getValue('diauxie_ratio_varb')
    dx_ratio_min = getValue('diauxie_ratio_min')
    posterior_n = getValue('n_posterior_samples')
    scale = getValue('params_scale')
    posterior = args['slf']
    fix_noise = args['fn']
    nthin = args['nthin']

    # initialize empty dataframes for storing growth parameters
    params_latent = initParamDf(plate_cond.index, complexity=0)
    params_sample = initParamDf(plate_cond.index, complexity=1)

    # for each unique condition based on user request
    for idx, condition in plate_cond.iterrows():

        # get list of sample IDs
        cond_dict = condition.drop(['Group', 'Control'])
        cond_dict = cond_dict.to_dict()  # e.g. {'Substrate':['D-Trehalose'],'PM':[1]}
        cond_idx = subsetDf(plate_key, cond_dict).index.values  # index values for N samples

        smartPrint('Fitting\n{}'.format(tidyDictPrint(cond_dict)), verbose)

        # get data and format for GP instance
        cond_data = plate_data.loc[:, list(cond_idx)]  # T x N
        cond_data = plate_time.join(cond_data)  # T x N+1
        cond_data = cond_data.melt(id_vars='Time', var_name='Sample_ID', value_name='OD')
        cond_data = cond_data.drop(['Sample_ID'], axis=1)  # T*R x 2 (R is number of replicates)
        cond_data = cond_data.dropna()

        gm = GrowthModel(df=cond_data, ARD=True, heteroscedastic=fix_noise, nthin=nthin)
        curve = gm.run(name=idx)

        # get parameter estimates using latent function
        diauxie_dict[idx] = curve.params.pop('df_dx')
        params_latent.loc[idx, :] = curve.params

        # get parameter estimates using samples from the posterior distribution
        if posterior:
            params_sample.loc[idx, :] = curve.sample().posterior

        # passively save data, manipulation occurs below (input OD, GP fit, & GP derivative)
        if args['sgd']:
            time = pd.DataFrame(gm.x_new, columns=['Time'])
            mu0, var0 = np.ravel(gm.y0), np.ravel(np.diag(gm.cov0))
            mu1, var1 = np.ravel(gm.y1), np.ravel(np.diag(gm.cov1))
            if fix_noise:
                sigma_noise = np.ravel(gm.error_new) + gm.noise
            else:
                sigma_noise = np.ravel([gm.noise] * time.shape[0])
            mu_var = pd.DataFrame([mu0, var0, mu1, var1, sigma_noise],
                                  index=['mu', 'Sigma', 'mu1', 'Sigma1', 'Noise']).T
            gp_data = pd.DataFrame([list(condition.values)] * len(mu0),
                                   columns=condition.keys())
            gp_data = gp_data.join(time).join(mu_var)
            data_ls.append(gp_data)

    # summarize diauxie results
    diauxie_df = mergeDiauxieDfs(diauxie_dict)

    if posterior:
        gp_params = params_sample.join(params_latent['diauxie'])
    else:
        gp_params = params_latent

    # record results in object's key
    plate_cond = plate_cond.join(gp_params)
    plate_cond.index.name = 'Sample_ID'
    plate_cond = plate_cond.reset_index(drop=False)
    plate_cond = pd.merge(plate_cond, diauxie_df, on='Sample_ID')

    params = initParamList(0) + initParamList(1)
    params = list(set(params).intersection(set(plate_cond.keys())))

    df_params = plate_cond.drop(initDiauxieList(), axis=1).drop_duplicates()
    df_diauxie = plate_cond[plate_cond.diauxie == 1]
    df_diauxie = df_diauxie.drop(params, axis=1)
    df_diauxie = minimizeDiauxieReport(df_diauxie)

    summ_path = assembleFullName(directory['summary'], '', filename, 'summary', '.txt')
    diux_path = assembleFullName(directory['summary'], '', filename, 'diauxie', '.txt')

    # normalize parameters, if requested
    df_params = normalizePooledParameters(args, df_params)
    df_params = df_params.drop(['Group', 'Control'], axis=1)
    df_params = minimizeParameterReport(df_params)

    # save results
    df_params.to_csv(summ_path, sep='\t', header=True, index=False)
    if df_diauxie.shape[0] > 0:
        df_diauxie.to_csv(diux_path, sep='\t', header=True, index=False)

    # save latent functions
    if args['sgd']:
        file_path = assembleFullName(directory['derived'], '', filename, 'gp_data', '.txt')
        gp_data = pd.concat(data_ls, sort=False).reset_index(drop=True)
        gp_data.to_csv(file_path, sep='\t', header=True, index=True)

    return None
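
# Illustrative sketch, not part of AMiGA: how pooling replicates via melt()
#   produces the T*R x 2 long-format input used above. The well names and
#   OD values are hypothetical toy data.
def _example_pool_replicates():
    import pandas as pd
    time = pd.DataFrame({'Time': [0.0, 0.5, 1.0]})
    reps = pd.DataFrame({'W1': [0.0, 0.3, 0.9], 'W2': [0.0, 0.4, 1.0]})  # 2 replicates
    long_df = time.join(reps).melt(id_vars='Time', var_name='Sample_ID', value_name='OD')
    # dropping Sample_ID leaves 6 (Time, OD) pairs that the GP fits jointly
    return long_df.drop(['Sample_ID'], axis=1)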
def addMVNPlotLine(ax, x, criteria, label, z_value, color, plot_params, noise=False):
    '''
    Given data (x) and criteria, find relevant sample IDs and plot them on axis.

    Args:
        ax (matplotlib.axes._subplots.AxesSubplot)
        x (pandas.DataFrame): must include columns for Time, mu, Sigma
        criteria (dictionary): keys must be column headers in x,
            values must be values in x.
        label (str): used for legend label of plotted line.
        z_value (float): z-value for computing confidence interval
        color (str or (R,G,B,A)) where R,G,B,A are floats [0,1]
        plot_params (dictionary)
        noise (boolean): whether to plot 95-pct credible intervals including
            sample uncertainty

    Returns:
        ax (matplotlib.axes._subplots.AxesSubplot)
    '''

    scaler = norm.ppf(z_value)  # define confidence interval scaler for MVN predictions

    x = subsetDf(x, criteria)  # grab value-specific model predictions

    if noise:
        Sigma = x.Sigma + x.Noise
    else:
        Sigma = x.Sigma

    # compute credible interval
    xtime = x.Time
    y_avg = x.mu
    y_low = x.mu - scaler * np.sqrt(Sigma)
    y_upp = x.mu + scaler * np.sqrt(Sigma)

    # convert from log2 to linear OD
    # if plot_params['plot_linear_od']:
    #     y_avg = np.exp(y_avg)
    #     y_low = np.exp(y_low)
    #     y_upp = np.exp(y_upp)

    ax.plot(xtime, y_avg, color=color, label=label, alpha=0.9, lw=3.0, zorder=10)
    ax.fill_between(x=xtime, y1=y_low, y2=y_upp, color=color, alpha=0.10, zorder=5)
    ax = largeTickLabels(ax, fontsize=plot_params['fontsize'])

    # if plot_params['plot_linear_od']:
    #     ax.axhline(y=1,xmin=0,xmax=xtime.max(),lw=3.0,color=(0,0,0,1))
    # else:
    ax.axhline(y=0, xmin=0, xmax=xtime.max(), lw=3.0, color=(0, 0, 0, 1))

    return ax
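
# Illustrative sketch, not part of AMiGA: the credible-interval math above.
#   Here z_value is the cumulative probability (e.g. 0.975 for a two-sided
#   95% band); the mean and variance are hypothetical toy values.
def _example_credible_band():
    import numpy as np
    from scipy.stats import norm
    mu, Sigma = 1.0, 0.04               # toy posterior mean and variance
    scaler = norm.ppf(0.975)            # ~1.96
    low = mu - scaler * np.sqrt(Sigma)  # ~0.608
    upp = mu + scaler * np.sqrt(Sigma)  # ~1.392
    return low, upp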
def normalizeParameters(args, df):
    '''
    Normalizes growth parameters to control samples.

    Args:
        args (dictionary): keys are arguments and values are user/default choices
        df (pandas.DataFrame): rows are samples, columns are experimental variables.
            Must include Plate_ID, Group, Control, auc, k, gr, dr, td, lag.

    Returns:
        df (pandas.DataFrame): input but with an additional 6 columns.
    '''

    # let's keep the original dataframe
    df_orig = df.copy()
    df_orig_keys = df_orig.columns
    df = df.reset_index()

    # how should you normalize?
    if args.normalize_method == 'division':
        opr = operator.truediv
    elif args.normalize_method == 'subtraction':
        opr = operator.sub

    # how to group samples, and which ones are control samples?
    # if user specifies with command-line arguments
    if args.group_by is not None and args.normalize_by is not None:
        groupby = args.group_by.split(',')
        controlby = checkParameterCommand(args.normalize_by)
    elif args.normalize_by is not None and args.group_by is None:
        controlby = checkParameterCommand(args.normalize_by)
        df.loc[:, 'Group'] = [1] * df.shape[0]
        groupby = ['Group']
    # else check columns for Group and Control variables
    elif 'Group' in df_orig_keys and 'Control' in df_orig_keys:
        groupby = ['Group']
        controlby = {'Control': 1}
        if (len(df.Group.unique()) == 1) and (len(df.Plate_ID.unique()) > 1):
            msg = '\nUSER WARNING: AMiGA detected a single "Group" but multiple Plate_IDs.\n'
            msg += 'Wells from different plates will thus be normalized together as a group.\n'
            msg += 'If this was not your intention, please pass explicit arguments to AMiGA\n'
            msg += 'using "--group-by" and "--control-by" arguments to avoid any ambiguity.\n'
            print(msg)
    # else exit with error message
    else:
        msg = 'FATAL USER ERROR: User must specify groups of samples and '
        msg += 'their corresponding control samples.'
        sys.exit(msg)

    # which parameters to normalize and/or to keep
    params_1 = initParamList(0)
    params_1.remove('diauxie')
    params_2 = ['mean({})'.format(ii) for ii in params_1]
    params_3 = initParamList(2)

    if any([ii in df_orig_keys for ii in params_2]):
        params = params_2
    elif any([ii in df_orig_keys for ii in params_3]):
        params = params_3
    else:
        params = params_1

    # params_norm = initParamList(2)
    params_keep = groupby + list(controlby.keys()) + ['Sample_ID', 'Plate_ID'] + params
    params_keep = list(df.columns[df.columns.isin(params_keep)])
    params_varbs = list(set(params_keep).difference(set(params)))

    df = df.loc[:, params_keep]

    norm_df = []
    for idx, row in df.loc[:, groupby].drop_duplicates().iterrows():
        df_group = subsetDf(df, row.to_dict()).loc[:, params_keep]
        df_group = df_group.sort_values(params_varbs)
        df_control = subsetDf(df_group, controlby)
        df_group.set_index(params_varbs, inplace=True)
        df_control.set_index(params_varbs, inplace=True)
        dgv = df_group.values
        dcv = df_control.mean().values
        df_group.loc[:, :] = opr(dgv, dcv)
        norm_df.append(df_group)

    norm_df = pd.concat(norm_df, axis=0)
    norm_df.columns = ['norm({})'.format(ii) for ii in norm_df.columns]
    norm_df = norm_df.reset_index(drop=False)

    df = pd.merge(df_orig, norm_df, on=params_varbs)
    if 'Sample_ID' in df.columns:
        df = df.set_index('Sample_ID')

    return df
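
# Illustrative sketch, not part of AMiGA: division vs. subtraction
#   normalization against the mean of a group's controls, as selected by
#   args.normalize_method above. The AUC values here are hypothetical.
def _example_normalize():
    import operator
    auc_case, auc_controls = 12.0, [4.0, 6.0]
    ctrl_mean = sum(auc_controls) / len(auc_controls)      # 5.0
    by_division = operator.truediv(auc_case, ctrl_mean)    # 2.4
    by_subtraction = operator.sub(auc_case, ctrl_mean)     # 7.0
    return by_division, by_subtraction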