def get_baseline_stats(keys):
    '''
    Get raw baseline values of multiple parameters.

    :param keys: list
        parameters to be included.
    :return: tuple
        (Stats(means, stde), significance DataFrame, list of raw Series);
        the raw Series are kept for inspection.
    '''
    week_end_days = (0, 7, 14, 21, 28)

    # raw per-parameter Series (for inspection)
    raw_sets = []
    # per-parameter statistics, one entry per key
    per_key_means = []
    per_key_errors = []
    per_key_significance = []

    for key in keys:
        # control-treatment raw data, weekend samplings only
        control = raw_data.get_raw_data(key)['c']
        sampled_on_weekends = [day for day in control.index
                               if day in week_end_days]
        control = control.loc[sampled_on_weekends]

        # transpose and stack, then collapse to a Series indexed by soil only
        flattened = control.T.stack().droplevel(['replicate', 'days'])
        flattened.name = key
        raw_sets.append(flattened)

        # per-soil mean and standard error
        by_soil = flattened.groupby(flattened.index)
        per_key_means.append(by_soil.mean())
        per_key_errors.append(by_soil.sem())

        # significance annotations for this parameter
        annotated = significance.annotate(
            significance.get_significance_booleans(flattened))
        annotated.name = key
        per_key_significance.append(annotated)

    # combine stats from all data sets: one column per parameter
    baseline_means = pandas.concat(per_key_means, axis=1)
    baseline_std_errors = pandas.concat(per_key_errors, axis=1)
    baseline_significance = pandas.concat(per_key_significance, axis=1)

    return (
        Stats(means=baseline_means, stde=baseline_std_errors),
        baseline_significance,
        raw_sets,
    )
def stack_data(raw_data, normalize_by=None, treatment: str = None):
    '''
    Stack raw data into a single Series.

    parameters
    ----------
    raw_data: str or DataFrame
        if a string is given it will be used as an argument for
        get_raw_data(); otherwise it is used as the data directly.
    normalize_by: function, optional
        how to normalize the data. takes precedence over *treatment*.
    treatment: str, optional
        can be either "t" (use data from treated samples) or
        "c" (use data from control samples). ignored when
        *normalize_by* is given.

    returns
    -------
    stacked_data_set: Series
        Series.index are the group id's for regression and Series.values
        are the results being regressed. Series.name is the name of the
        data set.
    '''
    # fetch by name when a data-set key was passed instead of a frame;
    # isinstance (not type == str) also accepts str subclasses
    data = get_raw_data(raw_data) if isinstance(raw_data, str) else raw_data

    # normalization takes precedence over treatment selection
    if normalize_by:
        data = normalize_by(data)
    elif treatment:
        data = data[treatment]

    # fold every column level into the row index
    columns_level_names = data.columns.names
    stacked_data_set = data.stack(columns_level_names)
    return stacked_data_set
def combine_raw_data(data_set_name):
    '''
    Combine raw data of treated samples from all LTTs.

    :param data_set_name: str
        which data set to use.
    :return: combined_raw_data: Series
        index is sampling event; Series is named after the data set.
    '''
    # treated-samples raw data
    combined_raw = get_raw_data(data_set_name)['t']

    # fold all column levels into the row index, then drop them so only
    # the sampling event remains as the index
    columns_levels = combined_raw.columns.names
    combined_raw = combined_raw.stack(columns_levels).droplevel(columns_levels)

    # name raw data after its data set
    combined_raw.name = data_set_name

    return combined_raw
def get_carbon_efficiency(treatment):
    '''
    Compute weekly carbon-use efficiency (CUE) with propagated errors.

    :param treatment: str
        which treatment's raw MBC data to use.
    :return: Stats
        weekly assimilation-to-consumption ratio (means) and its
        propagated standard error (stde).
    '''
    # start/finish sampling days of the first three weeks
    week_starts = [0, 7, 14, 21]

    # raw MBC data restricted to those days
    mbc_raw = get_raw_data('MBC')[treatment].loc[week_starts]

    mbc_stats = get_stats(mbc_raw)

    # week-over-week change in MBC, aligned to the start of each week
    mbc_weekly_change = mbc_stats.means.diff().shift(-1).drop(21)

    # propagated error of the change: sqrt(err_i^2 + err_{i-1}^2)
    errors_squared = mbc_stats.stde ** 2
    mbc_change_error = (
        (errors_squared.add(errors_squared.shift(1))) ** 0.5
    ).shift(-1).drop(21)

    # weekly respiration statistics
    respiration_stats = get_weekly_respiration(treatment)
    weekly_respiration = respiration_stats.means
    respiration_error = respiration_stats.stde

    # impose the respiration index on the MBC series so they align
    shared_index = weekly_respiration.index
    mbc_weekly_change.index = shared_index
    mbc_change_error.index = shared_index

    # assimilation-to-consumption ratio (CUE)
    CUE = mbc_weekly_change / (weekly_respiration + mbc_weekly_change)

    # error propagation
    growth_relative_err = mbc_change_error / mbc_weekly_change
    respiration_relative_err = respiration_error / weekly_respiration
    CUE_error = propagate_error(CUE, respiration_relative_err,
                                growth_relative_err)

    return Stats(means=CUE, stde=CUE_error)
def get_raw_ltt(data_set_name, ltt):
    '''
    Get treated-samples raw data for a given LTT.

    :param data_set_name: name of the data set to fetch.
    :param ltt: which LTT to select.
    :return: Series
        stacked raw values with the 'replicate' level dropped,
        named after the data set.
    '''
    # treated samples only
    treated = get_raw_data(data_set_name)['t']

    # select the LTT, stack, and drop the 'replicate' level
    ltt_series = treated[ltt].stack().droplevel('replicate')
    ltt_series.name = data_set_name

    return ltt_series
# drop irrelevant days t = drop_days(t, days_to_drop) if days_to_drop else t data = data.loc[t] # dependent variable abd weights y = data['mean'].values error = data['error'].values # intialize model and perform fit fit = model.fit(y, t=t, weights=error, nan_policy='omit') return fit if __name__ == '__main__': data_set_name = 'MBC' soil = 'MIN' raw = get_raw_data(data_set_name) raw = baseline_normalize(raw)[soil] mean = raw.T.mean() sem = raw.T.sem() mean[0] = 0 sem[0] = 0 mean.name = 'mean' sem.name = 'error' data = pandas.concat([mean, sem], axis=1) data.dropna(how='any', inplace=True) fit = fit(data, weekly_growth_decay)
# todo compute stde from pandas import DataFrame, MultiIndex from data.raw_data import get_raw_data from data.stats import get_stats from data.helpers import * SOILS = Constants.LTTs #raw data RAW_DATA = get_raw_data('Resp') # limits of time intervals between samplings timepoints = RAW_DATA.index.values n_intervals = len(timepoints) - 1 INTERVALS_LIST = [ [timepoints[i], timepoints[i + 1]] for i in range(n_intervals) ] # a list of intervals start and end (i.e [0, 2] for the interval between incubation start and 2 h) intervals_arrayed = numpy.asarray(INTERVALS_LIST) intervals = intervals_arrayed.T # array.shape-->(2, len(SAMPLING_TIMEPOINTS)) BEGININGS = intervals[0] ENDINGS = intervals[1] SAMPLING_INTERVALS = ENDINGS - BEGININGS def get_mean_rates(treatment): ''' get average rate between every two consecutive sampling points. :param treatment: str