def getChis(crosstab):
    """Run a chi-square test of independence on a contingency table.

    Args:
        crosstab: Two-item sequence. ``crosstab[0]`` is a pandas DataFrame
            holding the observed contingency table; ``crosstab[1]`` is
            copied unchanged into the result under ``'explanandum'``.

    Returns:
        dict with the test statistic (``'chi-sq'``), p-value (``'p-val'``),
        degrees of freedom (``'dof'``), the significance verdict at the 5%
        level (``'eval'``), the observed and expected counts as nested
        lists, per-cell ``"+"`` / ``"-"`` / ``" "`` markers (``'z_scores'``)
        and the row / column labels of the table.
    """
    table = crosstab[0]
    chi2, p, dof, ex = sps.chi2_contingency(table)

    # Critical chi-square value for alpha = 0.05 at this number of dof.
    crit = sps.chi2.ppf(q=0.95, df=dof)
    evaluation = bool(crit < chi2)

    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
    # yields the same ndarray on old and new pandas alike.
    obs_list = table.values.tolist()
    ex_list = ex.tolist()

    # Observed counts standardised against the expected counts (zmap with
    # its default arguments), then reduced to a marker per cell using the
    # two-sided 5% normal threshold of 1.96.
    z_list = zmap(obs_list, ex_list).tolist()
    z_indicators = [
        ["+" if i > 1.96 else "-" if i < -1.96 else " " for i in z]
        for z in z_list
    ]

    return {
        'chi-sq': chi2,
        'p-val': p,
        'eval': evaluation,
        'dof': dof,
        'explanandum': crosstab[1],
        'expected': ex_list,
        'observed': obs_list,
        'z_scores': z_indicators,
        'row_lab': table.index.tolist(),
        'col_lab': table.columns.tolist(),
    }
def getChis(crosstab, variable):
    """Run a chi-square test of independence on a contingency table.

    Args:
        crosstab: pandas DataFrame holding the observed contingency table.
        variable: Label copied unchanged into the result under
            ``'explanans'``.

    Returns:
        dict with the test statistic (``'chi-sq'``), p-value (``'p-val'``),
        degrees of freedom (``'dof'``), the significance verdict at the 5%
        level (``'eval'``), the observed and expected counts as nested
        lists, per-cell ``"+"`` / ``"-"`` / ``" "`` markers (``'z_scores'``)
        and the row / column labels of the table. The dict is also printed.
    """
    # The test is run once; the original ran it a second time into an
    # unused variable.
    chi2, p, dof, ex = sps.chi2_contingency(crosstab)

    # Critical chi-square value for alpha = 0.05 at this number of dof.
    crit = sps.chi2.ppf(q=0.95, df=dof)
    evaluation = bool(crit < chi2)

    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
    # yields the same ndarray on old and new pandas alike.
    obs_list = crosstab.values.tolist()
    ex_list = ex.tolist()

    # Observed counts standardised against the expected counts (zmap with
    # its default arguments), then reduced to a marker per cell using the
    # two-sided 5% normal threshold of 1.96.
    z_list = sps.zmap(obs_list, ex_list).tolist()
    z_indicators = [
        ["+" if i > 1.96 else "-" if i < -1.96 else " " for i in z]
        for z in z_list
    ]

    results = {
        'chi-sq': chi2,
        'p-val': p,
        'eval': evaluation,
        'dof': dof,
        'explanans': variable,
        'expected': ex_list,
        'observed': obs_list,
        'z_scores': z_indicators,
        'row_lab': crosstab.index.tolist(),
        'col_lab': crosstab.columns.tolist(),
    }
    # Function-call form is valid on both Python 2 and Python 3.
    print(results)
    return results
def zscore_modified(vals):
    """Z-score ``vals`` against a copy of themselves passed through ``reject_outliers``.

    If any element is a string the input is treated as non-numeric and an
    all-zero array of the same length is returned. ``None`` entries are
    replaced by ``np.nan`` before scoring.
    """
    for item in vals:
        if isinstance(item, str):
            return np.zeros(len(vals))

    cleaned = []
    for item in vals:
        cleaned.append(np.nan if item is None else item)

    # Mean and standard deviation come from whatever reject_outliers keeps.
    reference = reject_outliers(np.array(cleaned)).flatten()
    return zmap(cleaned, reference)
def population_coupling(
    spiketrain: np.ndarray,
    spiketrain_list: list,
    bin_window: float = 0.01,
    num_lags: int = 100,
    as_df: bool = False,
    t_start: float = None,
    t_stop: float = None,
    frac_zscore: float = 0.25,
    return_all: bool = False,
):
    """
    Calculate the population-coupling index between a spiketrain and the population.

    The metric is calculated by computing and standardising cross correlation between
    an individual spiketrain and the "population spiketrain", built by concatenating
    and sorting every spiketrain in `spiketrain_list`. Large Z score cross correlation
    at lag=0 is indicative of high population coupling.

    Args:
        spiketrain: A numpy array of spiketimes
        spiketrain_list: A list of numpy-array spiketrains
        bin_window: The size of the time bin in seconds
        num_lags: The number of lags forward and backwards around lag 0 to return
        as_df: Accepted for interface compatibility; not used by this function
        t_start: Minimum timepoint. Defaults to the first element of `spiketrain`
        t_stop: Maximum timepoint. Defaults to the last element of `spiketrain`
        frac_zscore: Fraction of the `2 * num_lags + 1` lag bins, counted from the
            first bin, used as the baseline for the z-score mean and standard deviation
        return_all: If true, all time bins and cross correlation values are returned
    Returns:
        The zscore at lag=0 between the spiketrain and the population, or the tuple
        `(time_bins, values)` when `return_all` is true
    """
    T_ROUNDING_PRECISION = 5

    # Floor division by a float yields a float (e.g. 201 // 4.0 == 50.0), which is
    # rejected as a slice bound; convert explicitly to int.
    t_cutoff = int(((num_lags * 2) + 1) // (1 / frac_zscore))

    if t_start is None:
        t_start = spiketrain[0]
    if t_stop is None:
        t_stop = spiketrain[-1]

    population_spiketrain = np.sort(np.concatenate(spiketrain_list))
    time_bins, values = cross_corr(
        spiketrain,
        population_spiketrain,
        bin_window=bin_window,
        num_lags=num_lags,
        as_df=False,
        t_start=t_start,
        t_stop=t_stop,
        delete_0_lag=False,
    )

    # Rounding lets the lag-0 bin be selected with an exact `== 0` comparison.
    time_bins = np.round(time_bins, T_ROUNDING_PRECISION)
    values = zmap(values, values[:t_cutoff])

    if return_all:
        return time_bins, values
    return values[time_bins == 0]
def StoreOrDisplayData():
    """Store a posted reading and report its z-score, or render the stored readings.

    POST: the request body is parsed as JSON, a ``TempHumidityHeatIndex`` row
    is added and committed, and a small JSON-like string with the z-score of
    the posted heat index against every stored ``HeatIndex`` is returned.
    GET: ``index.html`` is rendered with all stored rows.
    Any other method falls through and returns ``None``.
    """
    if request.method == "POST":
        payload = json.loads(request.data)
        print(payload)

        reading = TempHumidityHeatIndex(
            payload["temperature"],
            payload["heatindex"],
            payload["humidity"],
        )
        database.session.add(reading)
        database.session.commit()

        # Not production-grade: the whole table is re-read on every request.
        # The ingredients of the z-score would normally be cached and
        # refreshed only when needed.
        history = [row.HeatIndex for row in TempHumidityHeatIndex.query.all()]
        score = zmap(payload["heatindex"], history)
        return '{{ "Z-Score":{} }}'.format(score)

    if request.method == "GET":
        rows = TempHumidityHeatIndex.query.all()
        return render_template("index.html", WeatherData=rows)
def zscore_standardise(to_standardise: np.ndarray, baseline: np.ndarray):
    """
    Express an array as zscores relative to a baseline sample.

    Args:
        to_standardise: A numpy array to be converted to zscores.
        baseline: A numpy array supplying the mean and standard deviation
            used for the conversion. This is usually (but not necessarily)
            a subsection of to_standardise
    Returns:
        A numpy array of zscores
    """
    standardised = zmap(to_standardise, compare=baseline)
    return standardised
def do_cv(cv_dict, is_multi_sess, classifier, x, y, permute=False):
    """
    Loop over all cross validation folds, return area under the curve (AUC) and class probabilities.

    Parameters
    ----------
    cv_dict : dict
        One entry per fold; each value holds 'train_bool' and 'test_bool' entries used to index
        `x` and `y` (presumably boolean masks over observations -- confirm against the caller).
    is_multi_sess : bool
        If True, an AUC is computed per fold and the mean is returned; otherwise a single AUC is
        computed over all held-out predictions.
    classifier
        Model passed to `SubjectClassifierAnalysis.do_fit_model`; the fitted result must provide
        `predict_proba`.
    x, y : array_like
        Features and labels.
    permute : bool
        If True, `y` is randomly permuted once before the folds are run.

    Returns
    -------
    auc : float
    probs : numpy.ndarray
        Predicted probability of the second class for each test observation, same shape as `y`.
    """
    # permute distribution of behavior if desired. Should this be done with each fold?
    if permute:
        y = np.random.permutation(y)

    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    # if leave-one-session-out cross validation, this will hold area under the curve for each hold out
    fold_aucs = np.empty(shape=(len(cv_dict)), dtype=float)

    # will hold the predicted class probability for all the test data
    probs = np.empty(shape=y.shape, dtype=float)

    # now loop over all the cross validation folds
    for cv_num, fold in enumerate(cv_dict.values()):
        train_bool = fold['train_bool']
        test_bool = fold['test_bool']

        # Training and test data for fold
        x_train, y_train = x[train_bool], y[train_bool]
        x_test, y_test = x[test_bool], y[test_bool]

        # Scale the test data by the mean and sd of the *raw* train data, then z-score the train
        # data itself. The order matters: z-mapping against already z-scored train data (mean 0,
        # sd 1) would leave the test data unscaled. The main point is that train and test data
        # should be scaled the same.
        x_test = zmap(x_test, x_train, axis=0)
        x_train = zscore(x_train, axis=0)

        # fit the model for this fold
        classifier = SubjectClassifierAnalysis.do_fit_model(classifier, x_train, y_train)

        # now predict class probability of test data
        test_probs = classifier.predict_proba(x_test)[:, 1]
        probs[test_bool] = test_probs

        # if session level CV, compute the area under the curve for this fold and store
        if is_multi_sess:
            fold_aucs[cv_num] = roc_auc_score(y_test, test_probs)

    # compute AUC based on all CVs, either as the average of the session-level AUCs, or all the
    # cross-validated predictions of the within session CVs
    all_test_bool = np.any(np.stack([fold['test_bool'] for fold in cv_dict.values()]), axis=0)
    if is_multi_sess:
        auc = fold_aucs.mean()
    else:
        auc = roc_auc_score(y[all_test_bool], probs[all_test_bool])
    return auc, probs
def write_specimen_info(wt_wev, mut_wev, outfile):
    """
    Write a csv of the mutant specimens with their staging Z-scores.

    The mutant `staging` values are z-scored against the wild-type `staging`
    values, stored in a new `WEV_zscore` column of `mut_wev`, and `mut_wev`
    is sorted in place by that column before being written to `outfile`.
    Nothing is returned.
    """
    def _echo_key(column):
        # Pass-through sort key that prints the values it is handed.
        print(column)
        return column

    mut_wev['WEV_zscore'] = zmap(mut_wev.staging, wt_wev.staging)
    mut_wev.sort_values('WEV_zscore', key=_echo_key, inplace=True)
    mut_wev.to_csv(outfile)
def do_cv(cv_dict, is_multi_sess, classifier, x, y, permute=False):
    """
    Loop over all cross validation folds, return area under the curve (AUC) and class probabilities.

    Parameters
    ----------
    cv_dict : dict
        One entry per fold; each value holds 'train_bool' and 'test_bool' entries used to index
        `x` and `y` (presumably boolean masks over observations -- confirm against the caller).
    is_multi_sess : bool
        If True, an AUC is computed per fold and the mean is returned; otherwise a single AUC is
        computed over all held-out predictions.
    classifier
        Model passed to `SubjectClassifierAnalysis.do_fit_model`; the fitted result must provide
        `predict_proba`.
    x, y : array_like
        Features and labels.
    permute : bool
        If True, `y` is randomly permuted once before the folds are run.

    Returns
    -------
    auc : float
    probs : numpy.ndarray
        Predicted probability of the second class for each test observation, same shape as `y`.
    """
    # permute distribution of behavior if desired. Should this be done with each fold?
    if permute:
        y = np.random.permutation(y)

    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    # if leave-one-session-out cross validation, this will hold area under the curve for each hold out
    fold_aucs = np.empty(shape=(len(cv_dict)), dtype=float)

    # will hold the predicted class probability for all the test data
    probs = np.empty(shape=y.shape, dtype=float)

    # now loop over all the cross validation folds
    for cv_num, fold in enumerate(cv_dict.values()):
        train_bool = fold['train_bool']
        test_bool = fold['test_bool']

        # Training and test data for fold
        x_train, y_train = x[train_bool], y[train_bool]
        x_test, y_test = x[test_bool], y[test_bool]

        # Scale the test data by the mean and sd of the *raw* train data, then z-score the train
        # data itself. The order matters: z-mapping against already z-scored train data (mean 0,
        # sd 1) would leave the test data unscaled. The main point is that train and test data
        # should be scaled the same.
        x_test = zmap(x_test, x_train, axis=0)
        x_train = zscore(x_train, axis=0)

        # fit the model for this fold
        classifier = SubjectClassifierAnalysis.do_fit_model(classifier, x_train, y_train)

        # now predict class probability of test data
        test_probs = classifier.predict_proba(x_test)[:, 1]
        probs[test_bool] = test_probs

        # if session level CV, compute the area under the curve for this fold and store
        if is_multi_sess:
            fold_aucs[cv_num] = roc_auc_score(y_test, test_probs)

    # compute AUC based on all CVs, either as the average of the session-level AUCs, or all the
    # cross-validated predictions of the within session CVs
    all_test_bool = np.any(np.stack([fold['test_bool'] for fold in cv_dict.values()]), axis=0)
    if is_multi_sess:
        auc = fold_aucs.mean()
    else:
        auc = roc_auc_score(y[all_test_bool], probs[all_test_bool])
    return auc, probs
def normalize(signals, axis=None, groups=None, MP=False, comp=None):
    """
    Z-score signals, optionally against a reference or per group.

    :param signals: 1D, 2D or 3D signals
    :param axis: axis handed to ``stats.zmap`` / ``stats.zscore`` when
                 ``comp`` is given or ``groups`` is None; ignored for
                 per-group scoring, which always uses ``axis=None``
    :param groups: array of group (patient) IDs aligned with the first axis
                   of ``signals``; if given, every group is z-scored on its
                   own over all of its values
    :param MP: if True, the per-group z-scoring is distributed over a
               multiprocessing pool with ``cpu_count()`` workers
    :param comp: if given, ``signals`` are z-mapped against ``comp`` and
                 ``groups`` / ``MP`` are ignored
    :returns: the z-scored signals. For per-group scoring the input is
              expanded to 3D and the rows of the result are stacked in the
              order of ``np.unique(groups)``.
    """
    if comp is not None:
        print('zmapping with axis {}'.format(axis))
        return stats.zmap(signals, comp, axis=axis)
    if groups is None:
        print('zscoring with axis {}'.format(axis))
        return stats.zscore(signals, axis=axis)

    if signals.ndim == 1:
        signals = np.expand_dims(signals, 0)
    if signals.ndim == 2:
        signals = np.expand_dims(signals, 2)

    group_ids = np.unique(groups)
    if MP:
        print('zscoring per patient using {} cores'.format(cpu_count()))
        pool = Pool(cpu_count())
        try:
            jobs = [
                pool.apply_async(stats.zscore, args=(signals[groups == ID], ),
                                 kwds={'axis': None})
                for ID in group_ids
            ]
            res = [job.get(timeout=1200) for job in jobs]
        finally:
            # Always release the worker processes, including on timeout.
            pool.terminate()
            pool.join()
    else:
        print('zscoring per patient')
        res = [stats.zscore(signals[groups == ID], axis=None)
               for ID in group_ids]

    # Both branches assemble the result the same way. The previous MP code
    # never advanced its write offset (`end = start`), so every group
    # overwrote the first rows of the output buffer.
    return np.vstack(res)
def rescale_test(X_train, X_test, Y_train, U, V):
    """
    Predict out-of-sample `Y` values from `X_test`

    `X_test` is standardised with the column means and (ddof=1) standard
    deviations of `X_train`, projected through `U @ V.T`, and shifted by the
    column means of `Y_train`.

    Parameters
    ----------
    X_train : (S1, B) array_like
        Data matrix, where `S1` is observations and `B` is features
    X_test : (S2, B)
        Data matrix, where `S2` is observations and `B` is features
    Y_train : (S1, T) array_like
        Behavioral matrix, where `S1` is observations and `T` is features
    U, V : array_like
        Matrices applied to the rescaled data as `X_resc @ U @ V.T`

    Returns
    -------
    Y_pred : (S2, T) `numpy.ndarray`
        Behavioral matrix, where `S2` is observations and `T` is features
    """
    rescaled = zmap(X_test, compare=X_train, ddof=1)
    projection = rescaled @ U @ V.T
    offset = Y_train.mean(axis=0, keepdims=True)
    return projection + offset
def test_zmap(self):
    """Check that `stats.mstats.zmap` agrees with `stats.zmap` for every sample size."""
    for size in self.get_n():
        plain_x, plain_y, masked_x, masked_y = self.generate_xy_sample(size)
        z_plain = stats.zmap(plain_x, plain_y)
        z_masked = stats.mstats.zmap(masked_x, masked_y)
        # Only the leading len(z_plain) entries of the masked result are compared.
        assert_allclose(z_plain, z_masked[:len(z_plain)], atol=1e-10)
def residualize(X, Y, Xc=None, Yc=None, normalize=True, add_intercept=True):
    """
    Regress `X` out of `Y` and return the residuals

    Parameters
    ----------
    X : (N[, R]) array_like
        Coefficient matrix of `R` variables for `N` subjects
    Y : (N[, F]) array_like
        Dependent variable matrix of `F` variables for `N` subjects
    Xc : (M[, R]) array_like, optional
        Coefficient matrix of `R` variables for `M` subjects. When omitted,
        `X` is used to estimate betas. Default: None
    Yc : (M[, F]) array_like, optional
        Dependent variable matrix of `F` variables for `M` subjects. When
        omitted, `Y` is used to estimate betas. Default: None
    normalize : bool, optional
        Whether to z-score the residuals, taking mean and variance from the
        residuals of `Yc ~ Xc`. Default: True
    add_intercept : bool, optional
        Whether to add an intercept to `X` (and `Xc`) for beta estimation.
        The intercept term is not subtracted when forming residuals.
        Default: True

    Returns
    -------
    Yr : (N, F) numpy.ndarray
        Residuals of `Y ~ X`

    Raises
    ------
    ValueError
        If exactly one of `Xc` and `Yc` is provided.

    Notes
    -----
    When `Xc` and `Yc` are given, betas are fitted on them and then applied
    to `X` and `Y`.
    """
    if (Xc is None) != (Yc is None):
        raise ValueError('If processing against a comparative group, you must '
                         'provide both `Xc` and `Yc`.')

    X = np.asarray(X)
    Y = np.asarray(Y)
    if Yc is not None:
        Xc = np.asarray(Xc)
        Yc = np.asarray(Yc)
    else:
        Xc = X.copy()
        Yc = Y.copy()

    # Fit with an intercept column if requested ...
    if add_intercept:
        X = utils.add_constant(X)
        Xc = utils.add_constant(Xc)
    betas = np.linalg.lstsq(Xc, Yc, rcond=None)[0]

    # ... then drop the last column / beta again before computing residuals.
    if add_intercept:
        betas = betas[:-1]
        X = X[:, :-1]
        Xc = Xc[:, :-1]

    resid = Y - (X @ betas)
    resid_compare = Yc - (Xc @ betas)
    if not normalize:
        return resid
    return sstats.zmap(resid, compare=resid_compare)
## construct dataset ## ####################### breastCancer = load_breast_cancer() D = pd.DataFrame(breastCancer['data']).assign( target=breastCancer['target']).sample(frac=1, random_state=662352).values ## train and test set sizes Ntr = int(np.ceil(0.75 * D.shape[0])) Nte = D.shape[0] - Ntr P = D.shape[1] - 1 ### train and test splits Xtr, Ytr = D[:Ntr, :-1], D[:Ntr, -1] Xva, Yva = D[Ntr:, :-1], D[Ntr:, -1] Xva, Xtr = ss.zmap(Xva, Xtr), ss.zscore(Xtr) Tmax = 1001 M = 2 # number of cadres alpha_d, alpha_W = 0.95, 0.05 # d is more l1, W is more l2 lambda_d, lambda_W = 0.01, 0.05 # regularization strength ################## ## learn models ## ################## cadreModel = sc.kClassCadreModel(M=M, alpha_d=alpha_d, alpha_W=alpha_W, lambda_d=lambda_d, lambda_W=lambda_W,
def test_zmap(self):
    """Check that `stats.mstats.zmap` agrees with `stats.zmap` for every sample size."""
    for size in self.get_n():
        plain_x, plain_y, masked_x, masked_y = self.generate_xy_sample(size)
        z_plain = stats.zmap(plain_x, plain_y)
        z_masked = stats.mstats.zmap(masked_x, masked_y)
        # Only the leading len(z_plain) entries of the masked result are compared.
        assert_allclose(z_plain, z_masked[:len(z_plain)], atol=1e-10)
def estimate_model_quality(bst, *, hmm=None, n_states=None, n_shuffles=1000,
                           k_folds=5, mode='timeswap-pooled', verbose=False):
    """Estimate the HMM 'model quality' associated with the set of events in bst.

    For each of `k_folds` cross-validation folds a PoissonHMM is fit on the
    training events and used to score both the held-out events and
    `n_shuffles` shuffled versions of them. The quality is the z-score of
    the mean true score relative to the per-shuffle mean scores.

    Params
    ======
    bst : event collection; must provide `n_epochs` and support indexing
        with the fold index lists.
    hmm : optional; only consulted for `n_components` when `n_states` is
        not given. A fresh PoissonHMM is trained in every fold.
    n_states : number of HMM components.
    n_shuffles : number of shuffles scored per fold.
    k_folds : number of cross-validation folds.
    mode : 'timeswap-pooled', 'timeswap-within-event' or
        'temporal-within-event'; any other value raises NotImplementedError.
    verbose : if True, print the fold counter.

    Returns
    =======
    quality : z-score of `scores.mean()` against `shuffled.mean(axis=0)`
    scores : array of length `bst.n_epochs` with the held-out scores
    shuffled : array of shape `(bst.n_epochs, n_shuffles)` with the
        shuffled-data scores
    """
    from .decoding import k_fold_cross_validation
    from scipy.stats import zmap

    if hmm and not n_states:
        n_states = hmm.n_components

    event_ids = list(range(bst.n_epochs))
    scores = np.zeros(bst.n_epochs)
    shuffled = np.zeros((bst.n_epochs, n_shuffles))

    # Shuffle strategy per mode; the attribute is resolved lazily so that
    # only the selected function has to exist on `replay`.
    shuffle_names = {
        'timeswap-pooled': 'pooled_time_swap_bst',    # coherent, pooled over all events
        'timeswap-within-event': 'time_swap_bst',     # coherent, within events
        'temporal-within-event': 'incoherent_shuffle_bst',
    }
    if mode not in shuffle_names:
        raise NotImplementedError
    shuffle_func = getattr(replay, shuffle_names[mode])

    folds = k_fold_cross_validation(event_ids, k=k_folds)
    for fold_idx, (training, validation) in enumerate(folds):
        if verbose:
            print(' fold {}/{}'.format(fold_idx + 1, k_folds))

        train_events = bst[training]
        test_events = bst[validation]

        # train HMM on all training events
        hmm = PoissonHMM(n_components=n_states, verbose=False)
        hmm.fit(train_events)

        # log likelihoods of the validation set
        scores[validation] = hmm.score(test_events)

        shuffle_idx = 0
        while shuffle_idx < n_shuffles:
            # score a freshly shuffled copy of the validation set
            shuffled_events = shuffle_func(test_events)
            shuffled[validation, shuffle_idx] = hmm.score(shuffled_events)
            shuffle_idx += 1

    quality = zmap(scores.mean(), shuffled.mean(axis=0))
    return quality, scores, shuffled
def main():
    """Apply one ``scipy.stats`` function to every row of a tabular file.

    Command-line arguments are parsed with argparse. Each line of
    ``--infile`` is split on tabs, the 1-based columns named by
    ``--sample_one_cols`` / ``--sample_two_cols`` (comma separated) are
    collected as the first / second sample, and ``--sample_cols`` (groups
    separated by ``;``) is resolved through ``columns_to_values``. The
    function selected by ``--test_id`` is called on those samples, its
    results are appended to the row, and the row is written tab-separated
    to ``--outfile``. Rows are written unchanged when ``--test_id`` matches
    no known test.

    NOTE(review): several of the functions called here (e.g. ``nanmean``,
    ``signaltonoise``, ``threshold``, ``histogram2``, ``itemfreq``) appear
    to exist only in old SciPy releases -- confirm the pinned SciPy version.
    """

    def floats(values):
        # A real list, not ``map``: on Python 3 ``map`` is a one-shot
        # iterator that the scipy functions cannot consume as a sequence.
        return [float(value) for value in values]

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")

    # Boolean switches. ``--initial_lexsort`` used to default to the string
    # "False", which is truthy; every switch now defaults to the bool False.
    bool_flags = (
        ("--mwu_use_continuity", "Whether a continuity correction (1/2.) should be taken into account."),
        (
            "--equal_var",
            "If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
        ),
        ("--reta", "Whether or not to return the internally computed a values."),
        ("--fisher", "if true then Fisher definition is used"),
        ("--bias", "if false,then the calculations are corrected for statistical bias"),
        ("--inclusive1", "if false,lower_limit will be ignored"),
        ("--inclusive2", "if false,higher_limit will be ignored"),
        ("--inclusive", "if false,limit will be ignored"),
        (
            "--printextras",
            "If True, if there are extra points a warning is raised saying how many of those points there are",
        ),
        (
            "--initial_lexsort",
            "Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
        ),
        ("--correction", "continuity correction "),
    )
    for flag, text in bool_flags:
        parser.add_argument(flag, action="store_true", default=False, help=text)

    int_options = (
        (
            "--axis",
            "Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
        ),
        ("--n", "the number of trials. This is ignored if x gives both the number of successes and failures"),
        ("--b", "The number of bins to use for the histogram"),
        ("--N", "Score that is compared to the elements in a."),
        ("--ddof", "Degrees of freedom correction"),
        ("--score", "Score that is compared to the elements in a."),
    )
    for flag, text in int_options:
        parser.add_argument(flag, type=int, default=0, help=text)

    float_options = (
        ("--m", 0.0, "limits"),
        ("--mf", 2.0, "lower limit"),
        ("--nf", 99.9, "higher_limit"),
        ("--p", 0.5, "The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5"),
        ("--alpha", 0.9, "probability"),
        ("--new", 0.0, "Value to put in place of values in a outside of bounds"),
        ("--proportiontocut", 0.0, "Proportion (in range 0-1) of total data set to trim of each end."),
        ("--lambda_", 1.0, "lambda_ gives the power in the Cressie-Read power divergence statistic"),
        (
            "--imbda",
            0,
            "If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
        ),
        ("--base", 1.6, "The logarithmic base to use, defaults to e"),
    )
    for flag, default, text in float_options:
        parser.add_argument(flag, type=float, default=default, help=text)

    string_options = (
        ("--dtype", "dtype"),
        ("--med", "med"),
        ("--cdf", "cdf"),
        ("--zero_method", "zero_method options"),
        ("--dist", "dist options"),
        ("--ties", "ties options"),
        ("--alternative", "alternative options"),
        ("--mode", "mode options"),
        ("--method", "method options"),
        ("--md", "md options"),
        ("--center", "center options"),
        ("--kind", "kind options"),
        ("--tail", "tail options"),
        ("--interpolation", "interpolation options"),
        ("--statistic", "statistic options"),
    )
    for flag, text in string_options:
        parser.add_argument(flag, help=text)

    args = parser.parse_args()
    infile = args.infile
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2

    # Value comparison, not ``is 0``: identity against an int literal is
    # never true for the floats argparse produces.
    no_limits = nf == 0 and mf == 0

    sample0 = args.sample_cols is not None
    sample1 = args.sample_one_cols is not None
    sample2 = args.sample_two_cols is not None
    if sample0:
        # Lists, so the column groups can be reused for every input line.
        barlett_samples = [
            [int(col) for col in sample.split(",")] for sample in args.sample_cols.split(";")
        ]
    if sample1:
        sample_one_cols = args.sample_one_cols.split(",")
    if sample2:
        sample_two_cols = args.sample_two_cols.split(",")

    # Context managers close both files even if a test raises.
    with open(args.outfile, "w+") as outfile, open(infile) as lines:
        for line in lines:
            cols = line.strip().split("\t")
            if sample0:
                b_samples = columns_to_values(barlett_samples, line)
            # Column numbers on the command line are 1-based.
            sample_one = [cols[int(index) - 1] for index in sample_one_cols] if sample1 else []
            sample_two = [cols[int(index) - 1] for index in sample_two_cols] if sample2 else []

            test = test_id.strip()
            if test == "describe":
                size, min_max, mean, uv, bs, bk = stats.describe(floats(sample_one))
                cols.extend([size, min_max, mean, uv, bs, bk])
            elif test == "mode":
                vals, counts = stats.mode(floats(sample_one))
                cols.extend([vals, counts])
            elif test == "nanmean":
                cols.append(stats.nanmean(floats(sample_one)))
            elif test == "nanmedian":
                cols.append(stats.nanmedian(floats(sample_one)))
            elif test == "kurtosistest":
                z_value, p_value = stats.kurtosistest(floats(sample_one))
                cols.extend([z_value, p_value])
            elif test == "variation":
                cols.append(stats.variation(floats(sample_one)))
            elif test == "itemfreq":
                freq = stats.itemfreq(floats(sample_one))
                for row in freq:
                    cols.append(",".join(map(str, row)))
            elif test == "boxcox_llf":
                cols.append(stats.boxcox_llf(imbda, floats(sample_one)))
            elif test == "tiecorrect":
                cols.append(stats.tiecorrect(floats(sample_one)))
            elif test == "rankdata":
                cols.append(stats.rankdata(floats(sample_one), method=args.md))
            elif test == "nanstd":
                cols.append(stats.nanstd(floats(sample_one), bias=args.bias))
            elif test == "anderson":
                A2, critical, sig = stats.anderson(floats(sample_one), dist=args.dist)
                cols.append(A2)
                cols.extend(critical)
                cols.append(",")
                cols.extend(sig)
            elif test == "binom_test":
                cols.append(stats.binom_test(floats(sample_one), n=args.n, p=args.p))
            elif test == "gmean":
                cols.append(stats.gmean(floats(sample_one), dtype=args.dtype))
            elif test == "hmean":
                cols.append(stats.hmean(floats(sample_one), dtype=args.dtype))
            elif test == "kurtosis":
                cols.append(
                    stats.kurtosis(floats(sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
                )
            elif test == "moment":
                cols.append(stats.moment(floats(sample_one), n=args.n))
            elif test == "normaltest":
                k2, p_value = stats.normaltest(floats(sample_one))
                cols.extend([k2, p_value])
            elif test == "skew":
                cols.append(stats.skew(floats(sample_one), bias=args.bias))
            elif test == "skewtest":
                z_value, p_value = stats.skewtest(floats(sample_one))
                cols.extend([z_value, p_value])
            elif test == "sem":
                cols.append(stats.sem(floats(sample_one), ddof=args.ddof))
            elif test == "zscore":
                cols.extend(stats.zscore(floats(sample_one), ddof=args.ddof))
            elif test == "signaltonoise":
                cols.append(stats.signaltonoise(floats(sample_one), ddof=args.ddof))
            elif test == "percentileofscore":
                cols.append(stats.percentileofscore(floats(sample_one), score=args.score, kind=args.kind))
            elif test == "bayes_mvs":
                c_mean, c_var, c_std = stats.bayes_mvs(floats(sample_one), alpha=args.alpha)
                cols.extend([c_mean, c_var, c_std])
            elif test == "sigmaclip":
                c, c_low, c_up = stats.sigmaclip(floats(sample_one), low=args.m, high=args.n)
                cols.extend([c, c_low, c_up])
            elif test == "kstest":
                d, p_value = stats.kstest(
                    floats(sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
                )
                cols.extend([d, p_value])
            elif test == "chi2_contingency":
                chi2, p, dof, ex = stats.chi2_contingency(
                    floats(sample_one), correction=args.correction, lambda_=args.lambda_
                )
                cols.extend([chi2, p, dof, ex])
            elif test == "tmean":
                if no_limits:
                    mean = stats.tmean(floats(sample_one))
                else:
                    mean = stats.tmean(floats(sample_one), (mf, nf), (inclusive1, inclusive2))
                cols.append(mean)
            elif test == "tmin":
                if mf == 0:
                    trimmed_min = stats.tmin(floats(sample_one))
                else:
                    trimmed_min = stats.tmin(floats(sample_one), lowerlimit=mf, inclusive=args.inclusive)
                cols.append(trimmed_min)
            elif test == "tmax":
                if nf == 0:
                    trimmed_max = stats.tmax(floats(sample_one))
                else:
                    trimmed_max = stats.tmax(floats(sample_one), upperlimit=nf, inclusive=args.inclusive)
                cols.append(trimmed_max)
            elif test == "tvar":
                if no_limits:
                    var = stats.tvar(floats(sample_one))
                else:
                    var = stats.tvar(floats(sample_one), (mf, nf), (inclusive1, inclusive2))
                cols.append(var)
            elif test == "tstd":
                if no_limits:
                    std = stats.tstd(floats(sample_one))
                else:
                    std = stats.tstd(floats(sample_one), (mf, nf), (inclusive1, inclusive2))
                cols.append(std)
            elif test == "tsem":
                if no_limits:
                    s = stats.tsem(floats(sample_one))
                else:
                    s = stats.tsem(floats(sample_one), (mf, nf), (inclusive1, inclusive2))
                cols.append(s)
            elif test == "scoreatpercentile":
                if no_limits:
                    s = stats.scoreatpercentile(
                        floats(sample_one), floats(sample_two), interpolation_method=args.interpolation
                    )
                else:
                    s = stats.scoreatpercentile(
                        floats(sample_one), floats(sample_two), (mf, nf), interpolation_method=args.interpolation
                    )
                cols.extend(s)
            elif test == "relfreq":
                if no_limits:
                    rel, low_range, binsize, ex = stats.relfreq(floats(sample_one), args.b)
                else:
                    rel, low_range, binsize, ex = stats.relfreq(floats(sample_one), args.b, (mf, nf))
                cols.extend(rel)
                cols.extend([low_range, binsize, ex])
            elif test == "binned_statistic":
                if no_limits:
                    st, b_edge, b_n = stats.binned_statistic(
                        floats(sample_one), floats(sample_two), statistic=args.statistic, bins=args.b
                    )
                else:
                    st, b_edge, b_n = stats.binned_statistic(
                        floats(sample_one),
                        floats(sample_two),
                        statistic=args.statistic,
                        bins=args.b,
                        range=(mf, nf),
                    )
                cols.extend([st, b_edge, b_n])
            elif test == "threshold":
                if no_limits:
                    o = stats.threshold(floats(sample_one), newval=args.new)
                else:
                    o = stats.threshold(floats(sample_one), mf, nf, newval=args.new)
                cols.extend(o)
            elif test == "trimboth":
                cols.extend(stats.trimboth(floats(sample_one), proportiontocut=args.proportiontocut))
            elif test == "trim1":
                cols.extend(
                    stats.trim1(floats(sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
                )
            elif test == "histogram":
                if no_limits:
                    hi, low_range, binsize, ex = stats.histogram(floats(sample_one), args.b)
                else:
                    hi, low_range, binsize, ex = stats.histogram(floats(sample_one), args.b, (mf, nf))
                cols.extend([hi, low_range, binsize, ex])
            elif test == "cumfreq":
                if no_limits:
                    cum, low_range, binsize, ex = stats.cumfreq(floats(sample_one), args.b)
                else:
                    cum, low_range, binsize, ex = stats.cumfreq(floats(sample_one), args.b, (mf, nf))
                cols.extend([cum, low_range, binsize, ex])
            elif test == "boxcox_normmax":
                if no_limits:
                    ma = stats.boxcox_normmax(floats(sample_one))
                else:
                    ma = stats.boxcox_normmax(floats(sample_one), (mf, nf), method=args.method)
                cols.append(ma)
            elif test == "boxcox":
                if imbda == 0:
                    box, ma, ci = stats.boxcox(floats(sample_one), alpha=args.alpha)
                    cols.extend([box, ma, ci])
                else:
                    cols.append(stats.boxcox(floats(sample_one), imbda, alpha=args.alpha))
            elif test == "histogram2":
                cols.extend(stats.histogram2(floats(sample_one), floats(sample_two)))
            elif test == "ranksums":
                z_statistic, p_value = stats.ranksums(floats(sample_one), floats(sample_two))
                cols.extend([z_statistic, p_value])
            elif test == "ttest_1samp":
                t, prob = stats.ttest_1samp(floats(sample_one), floats(sample_two))
                cols.extend(t)
                cols.extend(prob)
            elif test == "ansari":
                AB, p_value = stats.ansari(floats(sample_one), floats(sample_two))
                cols.extend([AB, p_value])
            elif test == "linregress":
                slope, intercept, r_value, p_value, stderr = stats.linregress(
                    floats(sample_one), floats(sample_two)
                )
                cols.extend([slope, intercept, r_value, p_value, stderr])
            elif test == "pearsonr":
                cor, p_value = stats.pearsonr(floats(sample_one), floats(sample_two))
                cols.extend([cor, p_value])
            elif test == "pointbiserialr":
                r, p_value = stats.pointbiserialr(floats(sample_one), floats(sample_two))
                cols.extend([r, p_value])
            elif test == "ks_2samp":
                d, p_value = stats.ks_2samp(floats(sample_one), floats(sample_two))
                cols.extend([d, p_value])
            elif test == "mannwhitneyu":
                mw_stats_u, p_value = stats.mannwhitneyu(
                    floats(sample_one), floats(sample_two), use_continuity=args.mwu_use_continuity
                )
                cols.extend([mw_stats_u, p_value])
            elif test == "zmap":
                cols.extend(stats.zmap(floats(sample_one), floats(sample_two), ddof=args.ddof))
            elif test == "ttest_ind":
                mw_stats_u, p_value = stats.ttest_ind(
                    floats(sample_one), floats(sample_two), equal_var=args.equal_var
                )
                cols.extend([mw_stats_u, p_value])
            elif test == "ttest_rel":
                t, prob = stats.ttest_rel(floats(sample_one), floats(sample_two), axis=args.axis)
                cols.extend([t, prob])
            elif test == "mood":
                z, p_value = stats.mood(floats(sample_one), floats(sample_two), axis=args.axis)
                cols.extend([z, p_value])
            elif test == "shapiro":
                W, p_value, a = stats.shapiro(floats(sample_one), floats(sample_two), args.reta)
                cols.extend([W, p_value])
                cols.extend(a)
            elif test == "kendalltau":
                k, p_value = stats.kendalltau(
                    floats(sample_one), floats(sample_two), initial_lexsort=args.initial_lexsort
                )
                cols.extend([k, p_value])
            elif test == "entropy":
                cols.append(stats.entropy(floats(sample_one), floats(sample_two), base=args.base))
            elif test == "spearmanr":
                if sample2:
                    rho, p_value = stats.spearmanr(floats(sample_one), floats(sample_two))
                else:
                    rho, p_value = stats.spearmanr(floats(sample_one))
                cols.extend([rho, p_value])
            elif test == "wilcoxon":
                if sample2:
                    T, p_value = stats.wilcoxon(
                        floats(sample_one),
                        floats(sample_two),
                        zero_method=args.zero_method,
                        correction=args.correction,
                    )
                else:
                    T, p_value = stats.wilcoxon(
                        floats(sample_one), zero_method=args.zero_method, correction=args.correction
                    )
                cols.extend([T, p_value])
            elif test == "chisquare":
                if sample2:
                    rho, p_value = stats.chisquare(floats(sample_one), floats(sample_two), ddof=args.ddof)
                else:
                    rho, p_value = stats.chisquare(floats(sample_one), ddof=args.ddof)
                cols.extend([rho, p_value])
            elif test == "power_divergence":
                if sample2:
                    stat, p_value = stats.power_divergence(
                        floats(sample_one), floats(sample_two), ddof=args.ddof, lambda_=args.lambda_
                    )
                else:
                    stat, p_value = stats.power_divergence(
                        floats(sample_one), ddof=args.ddof, lambda_=args.lambda_
                    )
                cols.extend([stat, p_value])
            elif test == "theilslopes":
                if sample2:
                    mpe, met, lo, up = stats.theilslopes(
                        floats(sample_one), floats(sample_two), alpha=args.alpha
                    )
                else:
                    mpe, met, lo, up = stats.theilslopes(floats(sample_one), alpha=args.alpha)
                cols.extend([mpe, met, lo, up])
            elif test == "combine_pvalues":
                if sample2:
                    stat, p_value = stats.combine_pvalues(
                        floats(sample_one), method=args.med, weights=floats(sample_two)
                    )
                else:
                    stat, p_value = stats.combine_pvalues(floats(sample_one), method=args.med)
                cols.extend([stat, p_value])
            elif test == "obrientransform":
                ob = stats.obrientransform(*b_samples)
                for row in ob:
                    cols.append(",".join(map(str, row)))
            elif test == "f_oneway":
                f_value, p_value = stats.f_oneway(*b_samples)
                cols.extend([f_value, p_value])
            elif test == "kruskal":
                h, p_value = stats.kruskal(*b_samples)
                cols.extend([h, p_value])
            elif test == "friedmanchisquare":
                fr, p_value = stats.friedmanchisquare(*b_samples)
                cols.extend([fr, p_value])
            elif test == "fligner":
                xsq, p_value = stats.fligner(
                    *b_samples, center=args.center, proportiontocut=args.proportiontocut
                )
                cols.extend([xsq, p_value])
            elif test == "bartlett":
                T, p_value = stats.bartlett(*b_samples)
                cols.extend([T, p_value])
            elif test == "levene":
                w, p_value = stats.levene(
                    *b_samples, center=args.center, proportiontocut=args.proportiontocut
                )
                cols.extend([w, p_value])
            elif test == "median_test":
                stat, p_value, m, table = stats.median_test(
                    *b_samples, ties=args.ties, correction=args.correction, lambda_=args.lambda_
                )
                cols.extend([stat, p_value, m, table])
                for row in table:
                    cols.append(",".join(map(str, row)))

            outfile.write("%s\n" % "\t".join(map(str, cols)))
def _as_floats(values):
    """Return ``values`` converted to a concrete list of floats.

    A real list (rather than a lazy ``map`` object) is required because the
    SciPy routines need a sequence, and ``map`` is a one-shot iterator on
    Python 3.
    """
    return [float(value) for value in values]


def main():
    """Run one ``scipy.stats`` routine on every row of a tabular file.

    Command-line arguments select the input/output paths, the columns that
    make up the sample(s) and the routine to run (``--test_id``).  Each input
    line is split on tabs, the selected cells are converted to floats, the
    routine is called, and its results are appended as extra tab-separated
    columns before the line is written to the output file.

    ``--sample_one_cols`` / ``--sample_two_cols`` hold comma-separated,
    1-based column numbers.  ``--sample_cols`` holds several such lists
    separated by ``;`` and is resolved per line by ``columns_to_values``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument(
        "--sample_one_cols",
        help="Comma-separated, 1-based column numbers of the first sample",
    )
    parser.add_argument(
        "--sample_two_cols",
        help="Comma-separated, 1-based column numbers of the second sample",
    )
    parser.add_argument(
        "--sample_cols",
        help="Column numbers of several samples: comma-separated within a sample, samples separated by ;",
    )
    parser.add_argument("--test_id", help="statistical test method")

    # Boolean switches.  Every default is the boolean False; the original
    # gave --initial_lexsort the *string* "False", which is truthy.
    bool_flags = [
        ("--mwu_use_continuity", "Whether a continuity correction (1/2.) should be taken into account."),
        (
            "--equal_var",
            "If set perform a standard independent 2 sample test that assumes equal population variances. "
            "If not set, perform Welch's t-test, which does not assume equal population variance.",
        ),
        ("--reta", "Whether or not to return the internally computed a values."),
        ("--fisher", "if true then Fisher definition is used"),
        ("--bias", "if false,then the calculations are corrected for statistical bias"),
        ("--inclusive1", "if false,lower_limit will be ignored"),
        ("--inclusive2", "if false,higher_limit will be ignored"),
        ("--inclusive", "if false,limit will be ignored"),
        (
            "--printextras",
            "If True, if there are extra points a warning is raised saying how many of those points there are",
        ),
        (
            "--initial_lexsort",
            "Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
        ),
        ("--correction", "continuity correction "),
    ]
    for flag, help_text in bool_flags:
        parser.add_argument(flag, action="store_true", default=False, help=help_text)

    int_options = [
        (
            "--axis",
            "Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
        ),
        ("--n", "the number of trials. This is ignored if x gives both the number of successes and failures"),
        ("--b", "The number of bins to use for the histogram"),
        ("--N", "Score that is compared to the elements in a."),
        ("--ddof", "Degrees of freedom correction"),
        ("--score", "Score that is compared to the elements in a."),
    ]
    for flag, help_text in int_options:
        parser.add_argument(flag, type=int, default=0, help=help_text)

    float_options = [
        ("--m", 0.0, "limits"),
        ("--mf", 2.0, "lower limit"),
        ("--nf", 99.9, "higher_limit"),
        ("--p", 0.5, "The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5"),
        ("--alpha", 0.9, "probability"),
        ("--new", 0.0, "Value to put in place of values in a outside of bounds"),
        ("--proportiontocut", 0.0, "Proportion (in range 0-1) of total data set to trim of each end."),
        ("--lambda_", 1.0, "lambda_ gives the power in the Cressie-Read power divergence statistic"),
        (
            "--imbda",
            0,
            "If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda "
            "that maximizes the log-likelihood function and return it as the second output argument.",
        ),
        # NOTE(review): the help text says the default is e, but the default is 1.6 - confirm intent.
        ("--base", 1.6, "The logarithmic base to use, defaults to e"),
    ]
    for flag, default, help_text in float_options:
        parser.add_argument(flag, type=float, default=default, help=help_text)

    string_options = [
        ("--dtype", "dtype"),
        ("--med", "med"),
        ("--cdf", "cdf"),
        ("--zero_method", "zero_method options"),
        ("--dist", "dist options"),
        ("--ties", "ties options"),
        ("--alternative", "alternative options"),
        ("--mode", "mode options"),
        ("--method", "method options"),
        ("--md", "md options"),
        ("--center", "center options"),
        ("--kind", "kind options"),
        ("--tail", "tail options"),
        ("--interpolation", "interpolation options"),
        ("--statistic", "statistic options"),
    ]
    for flag, help_text in string_options:
        parser.add_argument(flag, help=help_text)

    args = parser.parse_args()
    infile = args.infile
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2

    has_groups = args.sample_cols is not None
    has_one = args.sample_one_cols is not None
    has_two = args.sample_two_cols is not None
    if has_groups:
        # Lists, not map objects: the specs are reused for every input line.
        group_col_specs = [
            [int(col) for col in sample.split(",")]
            for sample in args.sample_cols.split(";")
        ]
    if has_one:
        sample_one_cols = args.sample_one_cols.split(",")
    if has_two:
        sample_two_cols = args.sample_two_cols.split(",")

    # Context managers guarantee both files are closed, even on errors.
    with open(args.outfile, "w+") as outfile, open(infile) as in_handle:
        for line in in_handle:
            sample_one = []
            sample_two = []
            cols = line.strip().split("\t")
            if has_groups:
                b_samples = columns_to_values(group_col_specs, line)
            if has_one:
                sample_one = [cols[int(index) - 1] for index in sample_one_cols]
            if has_two:
                sample_two = [cols[int(index) - 1] for index in sample_two_cols]

            test = test_id.strip()

            # ---- tests on the first sample only -------------------------
            if test == "describe":
                size, min_max, mean, uv, bs, bk = stats.describe(_as_floats(sample_one))
                cols.extend((size, min_max, mean, uv, bs, bk))
            elif test in ("nanmean", "nanmedian", "variation", "tiecorrect"):
                # Single value, no extra options.
                cols.append(getattr(stats, test)(_as_floats(sample_one)))
            elif test in ("mode", "kurtosistest", "normaltest", "skewtest"):
                # Two values, no extra options.
                first, second = getattr(stats, test)(_as_floats(sample_one))
                cols.extend((first, second))
            elif test == "itemfreq":
                freq = stats.itemfreq(_as_floats(sample_one))
                cols.extend(",".join(map(str, row)) for row in freq)
            elif test == "boxcox_llf":
                cols.append(stats.boxcox_llf(imbda, _as_floats(sample_one)))
            elif test == "rankdata":
                cols.append(stats.rankdata(_as_floats(sample_one), method=args.md))
            elif test in ("nanstd", "skew"):
                cols.append(getattr(stats, test)(_as_floats(sample_one), bias=args.bias))
            elif test == "anderson":
                A2, critical, sig = stats.anderson(_as_floats(sample_one), dist=args.dist)
                cols.append(A2)
                cols.extend(critical)
                cols.append(",")
                cols.extend(sig)
            elif test == "binom_test":
                cols.append(stats.binom_test(_as_floats(sample_one), n=args.n, p=args.p))
            elif test in ("gmean", "hmean"):
                cols.append(getattr(stats, test)(_as_floats(sample_one), dtype=args.dtype))
            elif test == "kurtosis":
                k = stats.kurtosis(
                    _as_floats(sample_one),
                    axis=args.axis,
                    fisher=args.fisher,
                    bias=args.bias,
                )
                cols.append(k)
            elif test == "moment":
                cols.append(stats.moment(_as_floats(sample_one), n=args.n))
            elif test in ("sem", "signaltonoise"):
                cols.append(getattr(stats, test)(_as_floats(sample_one), ddof=args.ddof))
            elif test == "zscore":
                cols.extend(stats.zscore(_as_floats(sample_one), ddof=args.ddof))
            elif test == "percentileofscore":
                p = stats.percentileofscore(_as_floats(sample_one), score=args.score, kind=args.kind)
                cols.append(p)
            elif test == "bayes_mvs":
                c_mean, c_var, c_std = stats.bayes_mvs(_as_floats(sample_one), alpha=args.alpha)
                cols.extend((c_mean, c_var, c_std))
            elif test == "sigmaclip":
                c, c_low, c_up = stats.sigmaclip(_as_floats(sample_one), low=args.m, high=args.n)
                cols.extend((c, c_low, c_up))
            elif test == "kstest":
                d, p_value = stats.kstest(
                    _as_floats(sample_one),
                    cdf=args.cdf,
                    N=args.N,
                    alternative=args.alternative,
                    mode=args.mode,
                )
                cols.extend((d, p_value))
            elif test == "chi2_contingency":
                chi2, p, dof, ex = stats.chi2_contingency(
                    _as_floats(sample_one), correction=args.correction, lambda_=args.lambda_
                )
                cols.extend((chi2, p, dof, ex))
            elif test in ("tmean", "tvar", "tstd", "tsem"):
                # Trimmed statistics share one calling convention.
                trimmed = getattr(stats, test)
                if nf == 0 and mf == 0:
                    cols.append(trimmed(_as_floats(sample_one)))
                else:
                    cols.append(trimmed(_as_floats(sample_one), (mf, nf), (inclusive1, inclusive2)))
            elif test == "tmin":
                if mf == 0:
                    lowest = stats.tmin(_as_floats(sample_one))
                else:
                    lowest = stats.tmin(_as_floats(sample_one), lowerlimit=mf, inclusive=args.inclusive)
                cols.append(lowest)
            elif test == "tmax":
                if nf == 0:
                    highest = stats.tmax(_as_floats(sample_one))
                else:
                    highest = stats.tmax(_as_floats(sample_one), upperlimit=nf, inclusive=args.inclusive)
                cols.append(highest)
            elif test == "relfreq":
                if nf == 0 and mf == 0:
                    rel, low_range, binsize, ex = stats.relfreq(_as_floats(sample_one), args.b)
                else:
                    rel, low_range, binsize, ex = stats.relfreq(_as_floats(sample_one), args.b, (mf, nf))
                cols.extend(rel)
                cols.extend((low_range, binsize, ex))
            elif test == "threshold":
                if nf == 0 and mf == 0:
                    o = stats.threshold(_as_floats(sample_one), newval=args.new)
                else:
                    o = stats.threshold(_as_floats(sample_one), mf, nf, newval=args.new)
                cols.extend(o)
            elif test == "trimboth":
                cols.extend(stats.trimboth(_as_floats(sample_one), proportiontocut=args.proportiontocut))
            elif test == "trim1":
                t1 = stats.trim1(
                    _as_floats(sample_one),
                    proportiontocut=args.proportiontocut,
                    tail=args.tail,
                )
                cols.extend(t1)
            elif test in ("histogram", "cumfreq"):
                binned = getattr(stats, test)
                if nf == 0 and mf == 0:
                    counts, low_range, binsize, ex = binned(_as_floats(sample_one), args.b)
                else:
                    counts, low_range, binsize, ex = binned(_as_floats(sample_one), args.b, (mf, nf))
                cols.extend((counts, low_range, binsize, ex))
            elif test == "boxcox_normmax":
                if nf == 0 and mf == 0:
                    ma = stats.boxcox_normmax(_as_floats(sample_one))
                else:
                    ma = stats.boxcox_normmax(_as_floats(sample_one), (mf, nf), method=args.method)
                cols.append(ma)
            elif test == "boxcox":
                if imbda == 0:
                    box, ma, ci = stats.boxcox(_as_floats(sample_one), alpha=args.alpha)
                    cols.extend((box, ma, ci))
                else:
                    box = stats.boxcox(_as_floats(sample_one), imbda, alpha=args.alpha)
                    cols.append(box)

            # ---- tests on the first and second sample -------------------
            elif test == "scoreatpercentile":
                if nf == 0 and mf == 0:
                    s = stats.scoreatpercentile(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        interpolation_method=args.interpolation,
                    )
                else:
                    s = stats.scoreatpercentile(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        (mf, nf),
                        interpolation_method=args.interpolation,
                    )
                cols.extend(s)
            elif test == "binned_statistic":
                if nf == 0 and mf == 0:
                    st, b_edge, b_n = stats.binned_statistic(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        statistic=args.statistic,
                        bins=args.b,
                    )
                else:
                    st, b_edge, b_n = stats.binned_statistic(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        statistic=args.statistic,
                        bins=args.b,
                        range=(mf, nf),
                    )
                cols.extend((st, b_edge, b_n))
            elif test == "histogram2":
                cols.extend(stats.histogram2(_as_floats(sample_one), _as_floats(sample_two)))
            elif test in ("ranksums", "ansari", "pearsonr", "pointbiserialr", "ks_2samp"):
                # Two-sample routines returning (statistic, p-value), no options.
                statistic, p_value = getattr(stats, test)(_as_floats(sample_one), _as_floats(sample_two))
                cols.extend((statistic, p_value))
            elif test == "ttest_1samp":
                t, prob = stats.ttest_1samp(_as_floats(sample_one), _as_floats(sample_two))
                cols.extend(t)
                cols.extend(prob)
            elif test == "linregress":
                slope, intercept, r_value, p_value, stderr = stats.linregress(
                    _as_floats(sample_one), _as_floats(sample_two)
                )
                cols.extend((slope, intercept, r_value, p_value, stderr))
            elif test == "mannwhitneyu":
                mw_stats_u, p_value = stats.mannwhitneyu(
                    _as_floats(sample_one),
                    _as_floats(sample_two),
                    use_continuity=args.mwu_use_continuity,
                )
                cols.extend((mw_stats_u, p_value))
            elif test == "zmap":
                cols.extend(stats.zmap(_as_floats(sample_one), _as_floats(sample_two), ddof=args.ddof))
            elif test == "ttest_ind":
                t_stat, p_value = stats.ttest_ind(
                    _as_floats(sample_one), _as_floats(sample_two), equal_var=args.equal_var
                )
                cols.extend((t_stat, p_value))
            elif test in ("ttest_rel", "mood"):
                statistic, p_value = getattr(stats, test)(
                    _as_floats(sample_one), _as_floats(sample_two), axis=args.axis
                )
                cols.extend((statistic, p_value))
            elif test == "shapiro":
                W, p_value, a = stats.shapiro(_as_floats(sample_one), _as_floats(sample_two), args.reta)
                cols.extend((W, p_value))
                cols.extend(a)
            elif test == "kendalltau":
                k, p_value = stats.kendalltau(
                    _as_floats(sample_one),
                    _as_floats(sample_two),
                    initial_lexsort=args.initial_lexsort,
                )
                cols.extend((k, p_value))
            elif test == "entropy":
                cols.append(stats.entropy(_as_floats(sample_one), _as_floats(sample_two), base=args.base))

            # ---- tests where the second sample is optional --------------
            elif test == "spearmanr":
                if has_two:
                    rho, p_value = stats.spearmanr(_as_floats(sample_one), _as_floats(sample_two))
                else:
                    rho, p_value = stats.spearmanr(_as_floats(sample_one))
                cols.extend((rho, p_value))
            elif test == "wilcoxon":
                if has_two:
                    T, p_value = stats.wilcoxon(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        zero_method=args.zero_method,
                        correction=args.correction,
                    )
                else:
                    T, p_value = stats.wilcoxon(
                        _as_floats(sample_one),
                        zero_method=args.zero_method,
                        correction=args.correction,
                    )
                cols.extend((T, p_value))
            elif test == "chisquare":
                if has_two:
                    rho, p_value = stats.chisquare(
                        _as_floats(sample_one), _as_floats(sample_two), ddof=args.ddof
                    )
                else:
                    rho, p_value = stats.chisquare(_as_floats(sample_one), ddof=args.ddof)
                cols.extend((rho, p_value))
            elif test == "power_divergence":
                if has_two:
                    stat, p_value = stats.power_divergence(
                        _as_floats(sample_one),
                        _as_floats(sample_two),
                        ddof=args.ddof,
                        lambda_=args.lambda_,
                    )
                else:
                    stat, p_value = stats.power_divergence(
                        _as_floats(sample_one), ddof=args.ddof, lambda_=args.lambda_
                    )
                cols.extend((stat, p_value))
            elif test == "theilslopes":
                if has_two:
                    mpe, met, lo, up = stats.theilslopes(
                        _as_floats(sample_one), _as_floats(sample_two), alpha=args.alpha
                    )
                else:
                    mpe, met, lo, up = stats.theilslopes(_as_floats(sample_one), alpha=args.alpha)
                cols.extend((mpe, met, lo, up))
            elif test == "combine_pvalues":
                if has_two:
                    stat, p_value = stats.combine_pvalues(
                        _as_floats(sample_one),
                        method=args.med,
                        weights=_as_floats(sample_two),
                    )
                else:
                    stat, p_value = stats.combine_pvalues(_as_floats(sample_one), method=args.med)
                cols.extend((stat, p_value))

            # ---- tests on the grouped samples from --sample_cols --------
            elif test == "obrientransform":
                ob = stats.obrientransform(*b_samples)
                cols.extend(",".join(map(str, row)) for row in ob)
            elif test in ("f_oneway", "kruskal", "friedmanchisquare", "bartlett"):
                statistic, p_value = getattr(stats, test)(*b_samples)
                cols.extend((statistic, p_value))
            elif test in ("fligner", "levene"):
                statistic, p_value = getattr(stats, test)(
                    *b_samples, center=args.center, proportiontocut=args.proportiontocut
                )
                cols.extend((statistic, p_value))
            elif test == "median_test":
                stat, p_value, m, table = stats.median_test(
                    *b_samples, ties=args.ties, correction=args.correction, lambda_=args.lambda_
                )
                cols.extend((stat, p_value, m, table))
                cols.extend(",".join(map(str, row)) for row in table)

            outfile.write("%s\n" % "\t".join(map(str, cols)))
def population_coupling_df(
    df: pd.core.frame.DataFrame,
    spiketrain_col: str = "spiketrain",
    spiketimes_col: str = "spiketimes",
    binsize: float = 0.01,
    num_lags: int = 100,
    t_start: float = None,
    t_stop: float = None,
    return_all: bool = False,
):
    """
    Compute a population-coupling score for every spiketrain in a DataFrame.

    For each unique value of `spiketrain_col`, the spike times of that train
    are cross-correlated with the sorted spike times of all remaining rows
    (the "population spiketrain"). The cross correlation is then z-scored
    against its own first quarter of lag bins. A large z-score at lag 0
    indicates strong population coupling.

    Args:
        df: A pandas DataFrame with one row per spike
        spiketrain_col: The column containing spiketrain identifiers
        spiketimes_col: The column containing spike times
        binsize: The size of the time bin in seconds
        num_lags: The number of lags forward and backwards around lag 0
        t_start: Minimum timepoint
        t_stop: Maximum timepoint
        return_all: If true, z-scores for all time bins are returned
    Returns:
        If return_all is False, a DataFrame with one row per spiketrain and
        columns {spiketrain_col, 'population_coupling'}. Otherwise a
        DataFrame with columns {'time_sec', 'zscore', spiketrain_col}
        holding every lag bin of every spiketrain.
    """
    ROUNDING_PRECISION = 5
    FRAC_TO_COMPARE = 4
    # Number of leading lag bins used as the z-scoring baseline.
    n_baseline_bins = ((num_lags * 2) + 1) // FRAC_TO_COMPARE

    unit_ids = df[spiketrain_col].unique()
    results: list = []
    for unit_id in unit_ids:
        unit_spikes = df[df[spiketrain_col] == unit_id][spiketimes_col].values
        other_spikes = df[df[spiketrain_col] != unit_id][spiketimes_col].values
        other_spikes = np.sort(other_spikes)

        lags, xcorr = spiketimes.correlate.cross_corr(
            spiketrain_1=unit_spikes,
            spiketrain_2=other_spikes,
            binsize=binsize,
            num_lags=num_lags,
            as_df=False,
            t_start=t_start,
            t_stop=t_stop,
            delete_0_lag=False,
        )
        zscores = stats.zmap(xcorr, xcorr[:n_baseline_bins])
        # Round so that the zero-lag bin compares equal to 0 exactly.
        lags = np.round(lags, ROUNDING_PRECISION)

        if not return_all:
            results.append(zscores[lags == 0][0])
            continue
        results.append(
            pd.DataFrame({
                "time_sec": lags,
                "zscore": zscores,
                spiketrain_col: unit_id
            }))

    if return_all:
        return pd.concat(results, axis=0)
    return pd.DataFrame({
        spiketrain_col: unit_ids,
        "population_coupling": results
    })
## short-example.py
## Smoke-test analysis: fits a binary cadre model on synthetic data to confirm
## that the dependencies are installed correctly.

import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '../cadreModels')
from classificationBinary import binaryCadreModel
from sklearn.datasets import make_classification
from scipy.stats import zscore, zmap
from sklearn.model_selection import train_test_split

# Synthetic classification problem with the label stored in a 'target' column.
X, y = make_classification(n_samples=10000, random_state=2125615)
data = pd.DataFrame(X).assign(target=y)
is_feature = data.columns != 'target'
features = data.columns[is_feature]

D_tr, D_va = train_test_split(data, test_size=0.2, random_state=313616)

# Standardize the validation features with the *training* statistics first,
# then standardize the training features themselves.
train_features = D_tr[features]
D_va[features] = zmap(D_va[features], train_features)
D_tr[features] = zscore(train_features)

scm = binaryCadreModel(Tmax=1001, record=50)
scm.fit(D_tr, 'target', features, features, D_va, progress=True)
def test_zmap(self):
    """Check that stats.zmap and stats.mstats.zmap agree to within 1e-10.

    Only the first len(z) entries of the masked result are compared with the
    plain result, for every sample size yielded by self.get_n().
    """
    tolerance = 1.E-10
    for size in self.get_n():
        x, y, xm, ym = self.generate_xy_sample(size)
        plain = stats.zmap(x, y)
        masked = stats.mstats.zmap(xm, ym)
        deviation = abs(plain - masked[:len(plain)])
        assert np.all(deviation < tolerance)
#######################
## construct dataset ##
#######################

# Load the Boston data, attach the target as the last column and shuffle rows.
boston = load_boston()
shuffled = pd.DataFrame(boston['data']).assign(target=boston['target'])
shuffled = shuffled.sample(frac=1, random_state=662352)
D = shuffled.values

## train and test set sizes (75% of the rows, rounded up, go to training)
Ntr = int(np.ceil(0.75 * D.shape[0]))
Nva = D.shape[0] - Ntr
P = D.shape[1] - 1

## train and test splits
Dtr = D[:Ntr, :]
Dva = D[Ntr:, :]

# Scale the validation rows with the training statistics before the training
# rows themselves are standardized.
Dva = ss.zmap(Dva, Dtr)
Dtr = ss.zscore(Dtr)

# Last column is the target; keep it as an (N, 1) column.
Xtr = Dtr[:, :-1]
Ytr = np.expand_dims(Dtr[:, -1], 1)
Xva = Dva[:, :-1]
Yva = np.expand_dims(Dva[:, -1], 1)

M = 3                 # number of cadres
alpha = [0.95, 0.05]  # d is more l1, W is more l2
lam = [1, 1]          # regularization strength

##################
## learn models ##
##################

cadreModel = sc.regressionCadreModel(lambda_d=lam[0], lambda_W=lam[1], M=M)
cadreModel.fit(Xtr, Ytr)

## learn SVRs
def coerce_levels(image_numpy,
                  levels=255,
                  method="divide",
                  reference_image=None,
                  reference_norm_range=(.075, 1),
                  mask_value=0,
                  coerce_positive=True):
    """Bin the non-masked values of an image into integer levels, in place.

    Voxels equal to `mask_value` are left out of the binning and set to 0 in
    the output; all other voxels receive a level in ``1..levels``.

    Two binning methods are available:

    * ``"divide"``: scale each value by the maximum (of the reference image
      when given, else of the image itself) and round. In volumes with huge
      outliers this will likely push many values into the lowest levels.
      TO-DO from the original author: find a better binning scheme, perhaps
      median-based, at the price of losing the extremeness of extreme values.
    * ``"z_score"``: assign each voxel to the nearest of `levels - 1` evenly
      spaced z-score bins. Without a reference image the z-scores come from
      the masked image itself and the bins span mean +/- one standard
      deviation. With a reference image the z-scores are taken relative to
      the reference voxels whose intensity lies within `reference_norm_range`
      (fractions of the reference maximum), and the bins span the reference
      z-score range.

    Note that there are some dubious +1s and -1s in this function; they make
    the output levels start at 1 so that 0 stays reserved for masked voxels.

    Parameters
    ----------
    image_numpy : numpy.ndarray
        Image to bin. Modified in place and also returned.
    levels : int
        Number of output levels.
    method : {"divide", "z_score"}
        Binning method; any other value skips binning.
    reference_image : array-like or None
        Optional reference used for normalization. ``None`` or an empty
        sequence means "no reference". When the image has negative values
        and `coerce_positive` is set, an ndarray reference is shifted in
        place by the same offset as the image.
    reference_norm_range : sequence of two floats
        Lower and upper fraction of the reference maximum kept for
        normalization in the ``"z_score"`` method.
    mask_value : scalar
        Value marking voxels to exclude.
    coerce_positive : bool
        If true and the image minimum is negative, shift non-masked values
        so the minimum becomes zero before binning.

    Returns
    -------
    numpy.ndarray
        The same `image_numpy` object, holding the level of every voxel.
    """
    # Explicit "is a reference supplied" test. Comparing an ndarray with []
    # is ambiguous, and the old default [] was turned into an empty array by
    # the in-place subtraction below, which then crashed in np.max.
    has_reference = reference_image is not None and np.size(reference_image) > 0

    if np.min(image_numpy) < 0 and coerce_positive:
        if has_reference:
            reference_image -= np.min(image_numpy)
        image_numpy[image_numpy != mask_value] -= np.min(image_numpy)

    levels -= 1

    if method == "divide":
        image_max = np.max(reference_image) if has_reference else np.max(image_numpy)
        foreground = image_numpy != mask_value
        image_numpy[foreground] = np.round(
            (image_numpy[foreground] / image_max) * levels) + 1

    if method == "z_score":
        # NOTE(review): recent SciPy releases may reject masked arrays in
        # stats.zscore / stats.zmap; scipy.stats.mstats provides masked
        # equivalents - confirm against the SciPy version in use.
        masked_image_numpy = np.ma.masked_equal(image_numpy, mask_value)

        if not has_reference:
            z_image_numpy = stats.zscore(masked_image_numpy, axis=None)
            z_mean = np.mean(z_image_numpy)
            z_std = np.std(z_image_numpy)
            image_range = [z_mean - z_std, z_mean + z_std]
        else:
            # Keep only reference voxels inside the requested intensity band.
            masked_reference_image = np.ma.masked_equal(
                reference_image, mask_value)
            masked_reference_image = np.ma.masked_less(
                masked_reference_image,
                reference_norm_range[0] * np.max(reference_image))
            masked_reference_image = np.ma.masked_greater(
                masked_reference_image,
                reference_norm_range[1] * np.max(reference_image))

            z_image_numpy = stats.zmap(masked_image_numpy,
                                       masked_reference_image,
                                       axis=None)
            z_reference_image = stats.zscore(masked_reference_image,
                                             axis=None)
            image_range = [
                np.min(z_reference_image),
                np.max(z_reference_image)
            ]

        bins = np.linspace(image_range[0], image_range[1], levels)

        # Visit every non-masked voxel, whatever the dimensionality, and
        # assign the index (1-based) of the nearest bin.
        for index in map(tuple, np.argwhere(image_numpy != mask_value)):
            image_numpy[index] = (
                np.abs(bins - z_image_numpy[index])).argmin() + 1

    image_numpy[image_numpy == mask_value] = 0
    return image_numpy
def run_prediction_models(hdf, feats=None, verbose=True):
    """
    Runs model using diffusion embedding scores to predict behavioral measures

    The first five embedding dimensions of the holdout data are used as
    predictors in an ordinary least-squares model, evaluated with shuffled
    5-fold cross-validation. Within every fold the test data are standardized
    with the training-set mean and standard deviation.

    Parameters
    ----------
    hdf : structures.Frog
        HDF5 file containing SNF gridsearch outputs
    feats : list of str, optional
        List of behavioral features to use as prediction targets. Default:
        ['pigd', 'tremor']
    verbose : bool, optional
        Whether to print per-fold and summary statistics. Default: True

    Returns
    -------
    Y_corrs : (K, F) numpy.ndarray
        Correlation between predicted and actual behavioral values for `F`
        features across `K` folds
    Y_mses : (N, F) numpy.ndarray
        Squared error between predicted and actual behavioral values for `F`
        features across `N` subjects
    """

    if feats is None:
        feats = ['pigd', 'tremor']

    holdout = hdf.load('/snf/processed/holdout/all/sqeuclidean/gridsearch')
    X_holdout = holdout['embedding'][:, :5]
    behavior = hdf.load('/processed/pd_behavioral_measures')
    Y_holdout = np.asarray(behavior[feats])
    consensus = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/consensus')

    # to store out-of-sample correlations and MSE scores
    n_splits = 5
    # one column per requested feature (was hard-coded to 2)
    Y_corrs = np.zeros((n_splits, Y_holdout.shape[1]))
    # force float storage so squared errors are never truncated when the
    # behavioral measures happen to be stored as integers
    Y_mses = np.zeros(Y_holdout.shape, dtype=float)

    # 5-fold CV
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for n, (train_index, test_index) in enumerate(kf.split(X_holdout)):
        # split X and Y into train/test
        X_train, X_test = X_holdout[train_index], X_holdout[test_index]
        Y_train, Y_test = Y_holdout[train_index], Y_holdout[test_index]

        # zscore / zmap and add constant to X matrix; the test split must be
        # mapped before the training split is overwritten
        X_test = add_constant(sstats.zmap(X_test, X_train, ddof=1))
        X_train = add_constant(sstats.zscore(X_train, ddof=1))
        Y_test = sstats.zmap(Y_test, Y_train, ddof=1)
        Y_train = sstats.zscore(Y_train, ddof=1)

        # fit model and predict out-of-sample
        betas = np.linalg.lstsq(X_train, Y_train, rcond=None)[0]
        Y_pred = X_test @ betas

        # get correlation and MSE
        Y_corrs[n] = utils.efficient_corr(Y_pred, Y_test)
        Y_mses[test_index] = (Y_test - Y_pred)**2
        Y_mse_mean = np.mean(Y_mses[test_index], axis=0)

        if verbose:
            print(f'Fold {n + 1}: r = {Y_corrs[n]:}, mse = {Y_mse_mean:}')

    if verbose:
        print('\nAverage correlations across folds:')
        corrs_mean, corrs_std = Y_corrs.mean(0), Y_corrs.std(0, ddof=1)
        for n, t in enumerate(feats):
            print(r'{:<9}: r = {:.3f} $\pm$ {:.3f}'.format(
                t, corrs_mean[n], corrs_std[n]))

        # one-way ANOVA of the squared errors across consensus clusters
        print('\nGroups differences in MSE:')
        f_hold, p_hold = sstats.f_oneway(*(Y_mses[consensus == cl]
                                           for cl in np.unique(consensus)))
        for n, t in enumerate(feats):
            print('{:<9}: F = {:.2f}, p = {:.3f}'.format(
                t, f_hold[n], p_hold[n]))

    return Y_corrs, Y_mses