Example #1
        def getChis(crosstab):
            chi2, p, dof, ex = sps.chi2_contingency(crosstab[0])
            crit = sps.chi2.ppf(q=0.95, df=dof)
            evaluation = chi2 > crit  # True when the statistic exceeds the 95% critical value

            obs = crosstab[0].to_numpy()  # DataFrame.as_matrix() was removed from pandas; to_numpy() is the current API
            obs_list = obs.tolist()
            ex_list = ex.tolist()
            z_scores = sps.zmap(obs_list, ex_list)
            z_list = z_scores.tolist()
            z_indicators = []
            for z in z_list:
                z_sig = [
                    "+" if i > 1.96 else "-" if i < -1.96 else " " for i in z
                ]
                z_indicators.append(z_sig)

            results = {
                'chi-sq': chi2,
                'p-val': p,
                'eval': evaluation,
                'dof': dof,
                'explanandum': crosstab[1],
                'expected': ex_list,
                'observed': obs_list,
                'z_scores': z_indicators,
                'row_lab': crosstab[0].index.tolist(),
                'col_lab': crosstab[0].columns.tolist()
            }
            return results
Example #2
def getChis(crosstab, variable):
    chi2, p, dof, ex = sps.chi2_contingency(crosstab)

    crit = sps.chi2.ppf(q=0.95, df=dof)
    evaluation = chi2 > crit  # True when the statistic exceeds the 95% critical value

    obs = crosstab.to_numpy()  # as_matrix() was removed from pandas; to_numpy() is the current API
    obs_list = obs.tolist()
    ex_list = ex.tolist()
    z_scores = sps.zmap(obs_list, ex_list)
    z_list = z_scores.tolist()
    z_indicators = []
    for z in z_list:
        z_sig = ["+" if i > 1.96 else "-" if i < -1.96 else " " for i in z]
        z_indicators.append(z_sig)

    results = {'chi-sq': chi2,
               'p-val': p,
               'eval': evaluation,
               'dof': dof,
               'explanans': variable,
               'expected': ex_list,
               'observed': obs_list,
               'z_scores': z_indicators,
               'row_lab': crosstab.index.tolist(),
               'col_lab': crosstab.columns.tolist()
               }
    print(results)
    return results
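A minimal usage sketch for the helper above, assuming sps is the usual import scipy.stats as sps alias and the contingency table comes from pandas.crosstab; the DataFrame and column names below are made up purely for illustration.

import pandas as pd
import scipy.stats as sps

df = pd.DataFrame({
    "gender": ["m", "m", "m", "m", "m", "f", "f", "f"],
    "choice": ["a", "a", "b", "a", "b", "b", "a", "b"],
})
table = pd.crosstab(df["gender"], df["choice"])
results = getChis(table, "choice")  # signature of the second variant
print(results["chi-sq"], results["p-val"], results["z_scores"])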
Example #3
def zscore_modified(vals):
    if any(isinstance(v, str) for v in vals):
        return np.zeros(len(vals))
    vals = [np.nan if v is None else v
            for v in vals]  # replace None with np.nan
    # print(type(vals[0]))
    # print(vals[:10])
    vals_no_outliers = reject_outliers(np.array(vals)).flatten()
    return zmap(vals, vals_no_outliers)
Example #4
def population_coupling(
    spiketrain: np.ndarray,
    spiketrain_list: list,
    bin_window: float = 0.01,
    num_lags: int = 100,
    as_df: bool = False,
    t_start: float = None,
    t_stop: float = None,
    frac_zscore: float = 0.25,
    return_all: bool = False,
):
    """
    Calculate the population-coupling index between a spiketrain and the population.

    The metric is calculated by computing and standardising cross correlation
    between an individual spiketrain and the "population spiketrain", consisting of all other neurons.
    Large Z score cross correlation at lag=0 is indicative of high population coupling.

    Args:
        spiketrain: A numpy array of spiketimes
        spiketrain_list: A list of numpy-array spiketrains
        bin_window: The size of the time bin in seconds
        num_lags: The number of lags forward and backwards around lag 0 to return
        as_df: Whether to return results as pandas DataFrame
        t_start: Minimum timepoint
        t_stop: Maximum timepoint
        frac_zscore: Fraction of the correlogram (from its start) used as the baseline for z-scoring
        return_all: If true, all time bins and cross correlation values are returned
    Returns:
        The zscore at lag=0 between the spiketrain and the population
    """
    T_ROUNDING_PRECISION = 5

    t_cutoff = int(((num_lags * 2) + 1) // (1 / frac_zscore))  # leading lag bins used as the z-scoring baseline
    if t_start is None:
        t_start = spiketrain[0]
    if t_stop is None:
        t_stop = spiketrain[-1]
    population_spiketrain = np.sort(np.concatenate(spiketrain_list))
    time_bins, values = cross_corr(
        spiketrain,
        population_spiketrain,
        bin_window=bin_window,
        num_lags=num_lags,
        as_df=False,
        t_start=t_start,
        t_stop=t_stop,
        delete_0_lag=False,
    )
    time_bins = np.round(time_bins, T_ROUNDING_PRECISION)
    values = zmap(values, values[:t_cutoff])
    if not return_all:
        return values[time_bins == 0]
    else:
        return time_bins, values
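The core standardisation step above, zmap of the full cross-correlogram against its leading fraction of lag bins, can be shown in isolation. A small sketch with a synthetic correlogram, assuming only numpy and scipy.stats:

import numpy as np
from scipy.stats import zmap

num_lags, frac_zscore = 100, 0.25
values = np.random.poisson(5.0, size=2 * num_lags + 1).astype(float)
values[num_lags] += 20  # an excess at lag 0 mimics strong population coupling

t_cutoff = int(((num_lags * 2) + 1) // (1 / frac_zscore))  # first 25% of lag bins as baseline
z_values = zmap(values, values[:t_cutoff])
print(z_values[num_lags])  # z-score at lag 0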
Example #5
def StoreOrDisplayData():
    if request.method == "POST":
        JsonData = json.loads(request.data)
        print(JsonData)
        database.session.add(TempHumidityHeatIndex(JsonData["temperature"], JsonData["heatindex"], JsonData["humidity"]))
        database.session.commit()

        # You would not do the following in production:
        # You would have all the elements of the z-score calculation "cached" and only change it when needed
        return '{{ "Z-Score":{} }}'.format(zmap(JsonData["heatindex"],[RowObj.HeatIndex for RowObj in TempHumidityHeatIndex.query.all()]))
    elif request.method == "GET":
        return render_template("index.html",WeatherData = TempHumidityHeatIndex.query.all())
Example #6
def zscore_standardise(to_standardise: np.ndarray, baseline: np.ndarray):
    """
    Convert an array to zscores calculated on a baseline period.

    Args:
        to_standardise: A numpy array to be converted to zscores.
        baseline: A numpy array containing data used to calculate the mean and standard deviation
                  for zscore conversions. This is usually (but not necessarily) a subsection of to_standardise
    Returns:
        A numpy array of zscores
    """
    return zmap(to_standardise, baseline)
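A minimal call of the helper above, assuming zmap is imported from scipy.stats as in the surrounding module; the recording and baseline window here are made up:

import numpy as np

signal = np.random.normal(loc=2.0, scale=1.5, size=1000)
baseline = signal[:100]                    # baseline period (here simply the first 100 samples)
z = zscore_standardise(signal, baseline)   # z-scores relative to the baseline mean and SD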
Example #7
def do_cv(cv_dict, is_multi_sess, classifier, x, y, permute=False):
    """
    Loop over all cross validation folds, return area under the curve (AUC) and class probabilities.
    """

    # permute distribution of behavior if desired. Should this be done with each fold?
    if permute:
        y = np.random.permutation(y)

    # if leave-one-session-out cross validation, this will hold area under the curve for each hold out
    fold_aucs = np.empty(shape=(len(cv_dict)), dtype=float)  # np.float was removed from NumPy; plain float is equivalent

    # will hold the predicted class probability for all the test data
    probs = np.empty(shape=y.shape, dtype=float)

    # now loop over all the cross validation folds
    for cv_num, cv in enumerate(cv_dict.keys()):

        # Training data for fold
        x_train = x[cv_dict[cv]['train_bool']]
        y_train = y[cv_dict[cv]['train_bool']]

        # Test data for fold
        x_test = x[cv_dict[cv]['test_bool']]
        y_test = y[cv_dict[cv]['test_bool']]

        # normalize the train data, and then normalize the test data by the mean and sd of the train data
        # this is a little silly because the data are already zscored by session, but it could presumably
        # have an effect for within-session leave-out-trial-out cross validation. The main point is that train
        # and test data should be scaled the same
        x_train = zscore(x_train, axis=0)
        x_test = zmap(x_test, x_train, axis=0)

        # fit the model for this fold
        classifier = SubjectClassifierAnalysis.do_fit_model(
            classifier, x_train, y_train)

        # now predict class probability of test data
        test_probs = classifier.predict_proba(x_test)[:, 1]
        probs[cv_dict[cv]['test_bool']] = test_probs

        # if session level CV, compute the area under the curve for this fold and store
        if is_multi_sess:
            fold_aucs[cv_num] = roc_auc_score(y_test, test_probs)

    # compute AUC based on all CVs, either as the average of the session-level AUCs, or all the cross-validated
    # predictions of the within session CVs
    all_test_bool = np.any(np.stack([cv_dict[x]['test_bool']
                                     for x in cv_dict]),
                           axis=0)
    auc = fold_aucs.mean() if is_multi_sess else roc_auc_score(
        y[all_test_bool], probs[all_test_bool])
    return auc, probs
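The train/test scaling used inside each fold above can be checked on its own: zscore standardises the training data with its own statistics, while zmap scores the test data against the training mean and SD, so both end up on the same scale. A small self-contained sketch:

import numpy as np
from scipy.stats import zscore, zmap

x_train = np.random.normal(5.0, 2.0, size=(80, 3))
x_test = np.random.normal(5.0, 2.0, size=(20, 3))

x_train_z = zscore(x_train, axis=0)       # train scaled by its own mean/SD
x_test_z = zmap(x_test, x_train, axis=0)  # test scaled by the *train* mean/SD

# zmap(a, a) is identical to zscore(a), so the two calls differ only in
# which distribution supplies the mean and SD.
assert np.allclose(zmap(x_train, x_train, axis=0), x_train_z)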
Example #8
def write_specimen_info(wt_wev, mut_wev, outfile):
    """
    Write a csv with some summary info on specimens
    currently only returns Z-score of mutants
    """
    def sortwev(x):
        print(x)
        return x

    wev_z = zmap(mut_wev.staging, wt_wev.staging)
    mut_wev['WEV_zscore'] = wev_z
    mut_wev.sort_values('WEV_zscore', key=sortwev, inplace=True)
    mut_wev.to_csv(outfile)
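A tiny illustration of calling the function above with made-up wild-type and mutant staging tables; zmap is assumed to be imported from scipy.stats as in the original module, and pandas >= 1.1 is needed for the key argument of sort_values:

import pandas as pd

wt_wev = pd.DataFrame({"staging": [1.00, 1.10, 0.90, 1.05]})
mut_wev = pd.DataFrame({"staging": [1.40, 0.70]}, index=["mut_a", "mut_b"])
write_specimen_info(wt_wev, mut_wev, "specimen_info.csv")  # adds WEV_zscore and writes the csv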
Example #9
def do_cv(cv_dict, is_multi_sess, classifier, x, y, permute=False):
    """
    Loop over all cross validation folds, return area under the curve (AUC) and class probabilities.
    """

    # permute distribution of behavior if desired. Should this be done with each fold?
    if permute:
        y = np.random.permutation(y)

    # if leave-one-session-out cross validation, this will hold area under the curve for each hold out
    fold_aucs = np.empty(shape=(len(cv_dict)), dtype=float)

    # will hold the predicted class probability for all the test data
    probs = np.empty(shape=y.shape, dtype=float)

    # now loop over all the cross validation folds
    for cv_num, cv in enumerate(cv_dict.keys()):

        # Training data for fold
        x_train = x[cv_dict[cv]['train_bool']]
        y_train = y[cv_dict[cv]['train_bool']]

        # Test data for fold
        x_test = x[cv_dict[cv]['test_bool']]
        y_test = y[cv_dict[cv]['test_bool']]

        # normalize the train data, and then normalize the test data by the mean and sd of the train data
        # this is a little silly because the data are already zscored by session, but it could presumably
        # have an effect for within-session leave-out-trial-out cross validation. The main point is that train
        # and test data should be scaled the same
        x_train = zscore(x_train, axis=0)
        x_test = zmap(x_test, x_train, axis=0)

        # fit the model for this fold
        classifier = SubjectClassifierAnalysis.do_fit_model(classifier, x_train, y_train)

        # now predict class probability of test data
        test_probs = classifier.predict_proba(x_test)[:, 1]
        probs[cv_dict[cv]['test_bool']] = test_probs

        # if session level CV, compute the area under the curve for this fold and store
        if is_multi_sess:
            fold_aucs[cv_num] = roc_auc_score(y_test, test_probs)

    # compute AUC based on all CVs, either as the average of the session-level AUCs, or all the cross-validated
    # predictions of the within session CVs
    all_test_bool = np.any(np.stack([cv_dict[x]['test_bool'] for x in cv_dict]), axis=0)
    auc = fold_aucs.mean() if is_multi_sess else roc_auc_score(y[all_test_bool], probs[all_test_bool])
    return auc, probs
Example #10
def normalize(signals, axis=None, groups=None, MP=False, comp=None):
    """
    :param signals: 1D, 2D or 3D signals
    returns zscored per patient
    """

    if comp is not None:
        print('zmapping with axis {}'.format(axis))
        return stats.zmap(signals, comp, axis=axis)

    if groups is None:
        print('zscoring with axis {}'.format(axis))
        return stats.zscore(signals, axis=axis)

    if signals.ndim == 1: signals = np.expand_dims(signals, 0)
    if signals.ndim == 2: signals = np.expand_dims(signals, 2)

    if MP:
        print('zscoring per patient using {} cores'.format(cpu_count()))
        p = Pool(cpu_count())  # use all available cores
        res = []
        new_signals = np.zeros_like(signals)
        for ID in np.unique(groups):
            idx = groups == ID
            job = p.apply_async(stats.zscore,
                                args=(signals[idx], ),
                                kwds={'axis': None})
            res.append(job)
        start = 0
        for r in res:
            values = r.get(timeout=1200)
            end = start + len(values)
            new_signals[start:end] = values
            start = end  # advance the write offset to the next group's block
        return new_signals
    else:
        print('zscoring per patient')
        res = []
        for ID in np.unique(groups):
            idx = groups == ID
            job = stats.zscore(signals[idx], axis=None)
            res.append(job)
        new_signals = np.vstack(res)
        return new_signals
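A minimal call of the per-patient branch above (MP=False), with synthetic recordings and patient IDs; stats and np are assumed to be the module-level scipy.stats and numpy imports:

import numpy as np

signals = np.random.normal(size=(6, 100, 4))   # 6 recordings x 100 samples x 4 channels
groups = np.array([0, 0, 1, 1, 2, 2])          # patient ID for each recording
per_patient = normalize(signals, groups=groups)           # z-scored within each patient
comp = np.random.normal(size=(6, 100, 4))
against_comp = normalize(signals, axis=None, comp=comp)   # zmap against a comparison array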
Example #11
def rescale_test(X_train, X_test, Y_train, U, V):
    """
    Generates out-of-sample predicted `Y` values

    Parameters
    ----------
    X_train : (S1, B) array_like
        Data matrix, where `S1` is observations and `B` is features
    X_test : (S2, B) array_like
        Data matrix, where `S2` is observations and `B` is features
    Y_train : (S1, T) array_like
        Behavioral matrix, where `S1` is observations and `T` is features
    U : (B, L) array_like
        Left weight (singular-vector) matrix from the training decomposition,
        with `B` features and `L` latent dimensions
    V : (T, L) array_like
        Right weight (singular-vector) matrix from the training decomposition,
        with `T` behavioral features and `L` latent dimensions

    Returns
    -------
    Y_pred : (S2, T) `numpy.ndarray`
        Behavioral matrix, where `S2` is observations and `T` is features
    """

    X_resc = zmap(X_test, compare=X_train, ddof=1)
    Y_pred = (X_resc @ U @ V.T) + Y_train.mean(axis=0, keepdims=True)

    return Y_pred
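A short end-to-end sketch of how the helper above might be used; building U and V from an SVD of the standardised training cross-covariance is only an assumption for illustration (the shapes just have to satisfy X_resc @ U @ V.T), not necessarily how the surrounding library constructs them, and zmap is assumed to be imported from scipy.stats in the module defining rescale_test:

import numpy as np
from scipy.stats import zscore

rng = np.random.default_rng(0)
X_train, X_test = rng.normal(size=(50, 10)), rng.normal(size=(20, 10))
Y_train = rng.normal(size=(50, 3))

R = zscore(X_train, ddof=1).T @ zscore(Y_train, ddof=1)  # (B, T) cross-covariance
U, s, Vt = np.linalg.svd(R, full_matrices=False)         # U: (B, L), Vt: (L, T)
V = Vt.T                                                 # (T, L)

Y_pred = rescale_test(X_train, X_test, Y_train, U, V)    # (20, 3) predicted behaviour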
Example #12
 def test_zmap(self):
     for n in self.get_n():
         x, y, xm, ym = self.generate_xy_sample(n)
         z = stats.zmap(x, y)
         zm = stats.mstats.zmap(xm, ym)
         assert_allclose(z, zm[0:len(z)], atol=1e-10)
Example #13
def residualize(X, Y, Xc=None, Yc=None, normalize=True, add_intercept=True):
    """
    Returns residuals of regression equation from `Y ~ X`

    Parameters
    ----------
    X : (N[, R]) array_like
        Coefficient matrix of `R` variables for `N` subjects
    Y : (N[, F]) array_like
        Dependent variable matrix of `F` variables for `N` subjects
    Xc : (M[, R]) array_like, optional
        Coefficient matrix of `R` variables for `M` subjects. If not specified
        then `X` is used to estimate betas. Default: None
    Yc : (M[, F]) array_like, optional
        Dependent variable matrix of `F` variables for `M` subjects. If not
        specified then `Y` is used to estimate betas. Default: None
    normalize : bool, optional
        Whether to normalize (i.e., z-score) residuals. Will use residuals from
        `Yc ~ Xc` for generating mean and variance. Default: True
    add_intercept : bool, optional
        Whether to add intercept to `X` (and `Xc`, if provided). The intercept
        will not be removed, just used in beta estimation. Default: True

    Returns
    -------
    Yr : (N, F) numpy.ndarray
        Residuals of `Y ~ X`

    Notes
    -----
    If both `Xc` and `Yc` are provided, these are used to calculate betas which
    are then applied to `X` and `Y`.
    """

    if ((Yc is None and Xc is not None) or (Yc is not None and Xc is None)):
        raise ValueError('If processing against a comparative group, you must '
                         'provide both `Xc` and `Yc`.')

    X, Y = np.asarray(X), np.asarray(Y)

    if Yc is None:
        Xc, Yc = X.copy(), Y.copy()
    else:
        Xc, Yc = np.asarray(Xc), np.asarray(Yc)

    # add intercept to regressors if requested and calculate fit
    if add_intercept:
        X, Xc = utils.add_constant(X), utils.add_constant(Xc)
    betas, *rest = np.linalg.lstsq(Xc, Yc, rcond=None)

    # remove intercept from regressors and betas for calculation of residuals
    if add_intercept:
        betas = betas[:-1]
        X, Xc = X[:, :-1], Xc[:, :-1]

    # calculate residuals
    Yr = Y - (X @ betas)
    Ycr = Yc - (Xc @ betas)

    if normalize:
        Yr = sstats.zmap(Yr, compare=Ycr)

    return Yr
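A short sketch of calling the function above to regress a single nuisance covariate out of a data matrix; the age confound and data here are random, purely for illustration, and utils.add_constant and sstats are assumed to be available as in the module above:

import numpy as np

rng = np.random.default_rng(1)
age = rng.uniform(20, 80, size=(100, 1))                       # confound, shape (N, 1)
Y = rng.normal(size=(100, 5)) + age * rng.normal(size=(1, 5)) * 0.05

Yr = residualize(age, Y)                        # residuals of Y ~ age, z-scored
Yr_raw = residualize(age, Y, normalize=False)   # same residuals without the z-scoring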
Example #14
#######################
## construct dataset ##
#######################

breastCancer = load_breast_cancer()
D = pd.DataFrame(breastCancer['data']).assign(
    target=breastCancer['target']).sample(frac=1, random_state=662352).values

## train and test set sizes
Ntr = int(np.ceil(0.75 * D.shape[0]))
Nte = D.shape[0] - Ntr
P = D.shape[1] - 1

### train and test splits
Xtr, Ytr = D[:Ntr, :-1], D[:Ntr, -1]
Xva, Yva = D[Ntr:, :-1], D[Ntr:, -1]
Xva, Xtr = ss.zmap(Xva, Xtr), ss.zscore(Xtr)

Tmax = 1001
M = 2  # number of cadres
alpha_d, alpha_W = 0.95, 0.05  # d is more l1, W is more l2
lambda_d, lambda_W = 0.01, 0.05  # regularization strength

##################
## learn models ##
##################

cadreModel = sc.kClassCadreModel(M=M,
                                 alpha_d=alpha_d,
                                 alpha_W=alpha_W,
                                 lambda_d=lambda_d,
                                 lambda_W=lambda_W,
Example #15
 def test_zmap(self):
     for n in self.get_n():
         x, y, xm, ym = self.generate_xy_sample(n)
         z = stats.zmap(x, y)
         zm = stats.mstats.zmap(xm, ym)
         assert_allclose(z, zm[0:len(z)], atol=1e-10)
Example #16
def estimate_model_quality(bst,
                           *,
                           hmm=None,
                           n_states=None,
                           n_shuffles=1000,
                           k_folds=5,
                           mode='timeswap-pooled',
                           verbose=False):
    """Estimate the HMM 'model quality' associated with the set of events in bst.

    TODO: finish docstring, and do some more consistency checking...
    TODO: add other modes of shuffling

    Params
    ======

    Returns
    =======

    quality :
    scores :
    shuffled :

    """
    from .decoding import k_fold_cross_validation
    from scipy.stats import zmap

    if hmm:
        if not n_states:
            n_states = hmm.n_components

    X = [ii for ii in range(bst.n_epochs)]

    scores = np.zeros(bst.n_epochs)
    shuffled = np.zeros((bst.n_epochs, n_shuffles))

    if mode == 'timeswap-pooled':
        # shuffle data coherently, pooled over all events:
        shuffle_func = replay.pooled_time_swap_bst
    elif mode == 'timeswap-within-event':
        # shuffle data coherently within events:
        shuffle_func = replay.time_swap_bst
    elif mode == 'temporal-within-event':
        shuffle_func = replay.incoherent_shuffle_bst
    else:
        raise NotImplementedError

    for kk, (training,
             validation) in enumerate(k_fold_cross_validation(X, k=k_folds)):
        if verbose:
            print('  fold {}/{}'.format(kk + 1, k_folds))

        PBEs_train = bst[training]
        PBEs_test = bst[validation]

        # train HMM on all training PBEs
        hmm = PoissonHMM(n_components=n_states, verbose=False)
        hmm.fit(PBEs_train)

        # compute scores_hmm (log likelihoods) of validation set:
        scores[validation] = hmm.score(PBEs_test)

        for nn in range(n_shuffles):
            # shuffle data:
            bst_test_shuffled = shuffle_func(PBEs_test)

            # score validation set with shuffled-data HMM
            shuffled[validation, nn] = hmm.score(bst_test_shuffled)

    quality = zmap(scores.mean(), shuffled.mean(axis=0))

    return quality, scores, shuffled
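The last line above is just a z-score of the observed mean log-likelihood against the distribution of shuffle scores; the same pattern in isolation, with made-up numbers:

import numpy as np
from scipy.stats import zmap

observed = -120.0                                               # e.g. mean cross-validated log-likelihood
shuffled = np.random.normal(loc=-150.0, scale=10.0, size=1000)  # scores from shuffled data
quality = zmap(observed, shuffled)  # how many SDs the real score lies above the shuffle distribution
print(float(quality))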
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
Example #18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o",
                        "--outfile",
                        required=True,
                        help="Path to the output file.")
    parser.add_argument("--sample_one_cols",
                        help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols",
                        help="Input format, like smi, sdf, inchi")
    parser.add_argument(
        "--sample_cols",
        help="Input format, like smi, sdf, inchi,separate arrays using ;",
    )
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help=
        "Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help=
        "If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta",
        action="store_true",
        default=False,
        help="Whether or not to return the internally computed a values.",
    )
    parser.add_argument(
        "--fisher",
        action="store_true",
        default=False,
        help="if true then Fisher definition is used",
    )
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help=
        "if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument(
        "--inclusive1",
        action="store_true",
        default=False,
        help="if false,lower_limit will be ignored",
    )
    parser.add_argument(
        "--inclusive2",
        action="store_true",
        default=False,
        help="if false,higher_limit will be ignored",
    )
    parser.add_argument(
        "--inclusive",
        action="store_true",
        default=False,
        help="if false,limit will be ignored",
    )
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help=
        "If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help=
        "Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument(
        "--correction",
        action="store_true",
        default=False,
        help="continuity correction ",
    )
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help=
        "Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help=
        "the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b",
                        type=int,
                        default=0,
                        help="The number of bins to use for the histogram")
    parser.add_argument("--N",
                        type=int,
                        default=0,
                        help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof",
                        type=int,
                        default=0,
                        help="Degrees of freedom correction")
    parser.add_argument(
        "--score",
        type=int,
        default=0,
        help="Score that is compared to the elements in a.",
    )
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help=
        "The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument(
        "--new",
        type=float,
        default=0.0,
        help="Value to put in place of values in a outside of bounds",
    )
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help=
        "lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help=
        "If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument(
        "--base",
        type=float,
        default=1.6,
        help="The logarithmic base to use, defaults to e",
    )
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(
                map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one),
                                               dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one),
                                       n=args.n,
                                       p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(
                map(float, sample_one),
                axis=args.axis,
                fisher=args.fisher,
                bias=args.bias,
            )
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one),
                                        score=args.score,
                                        kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one),
                                                   alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one),
                                             low=args.m,
                                             high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one),
                cdf=args.cdf,
                N=args.N,
                alternative=args.alternative,
                mode=args.mode,
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one),
                correction=args.correction,
                lambda_=args.lambda_)
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf),
                                   (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min_value = stats.tmin(map(float, sample_one))
            else:
                min_value = stats.tmin(map(float, sample_one),
                                       lowerlimit=mf,
                                       inclusive=args.inclusive)
            cols.append(min_value)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max_value = stats.tmax(map(float, sample_one))
            else:
                max_value = stats.tmax(map(float, sample_one),
                                       upperlimit=nf,
                                       inclusive=args.inclusive)
            cols.append(max_value)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf),
                                 (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf),
                                 (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf),
                               (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one),
                    map(float, sample_two),
                    interpolation_method=args.interpolation,
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one),
                    map(float, sample_two),
                    (mf, nf),
                    interpolation_method=args.interpolation,
                )
            for value in s:
                cols.append(value)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(
                    map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(
                    map(float, sample_one), args.b, (mf, nf))
            for value in rel:
                cols.append(value)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one),
                                    mf,
                                    nf,
                                    newval=args.new)
            for value in o:
                cols.append(value)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one),
                               proportiontocut=args.proportiontocut)
            for value in o:
                cols.append(value)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(
                map(float, sample_one),
                proportiontocut=args.proportiontocut,
                tail=args.tail,
            )
            for value in t1:
                cols.append(value)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(
                    map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(
                    map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(
                    map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(
                    map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf),
                                          method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one),
                                           alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one),
                                   imbda,
                                   alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one),
                                  map(float, sample_two))
            for value in h2:
                cols.append(value)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one),
                                                  map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one),
                                        map(float, sample_two))
            for value in t:
                cols.append(value)
            for value in prob:
                cols.append(value)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one),
                                       map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two))
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one),
                                          map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one),
                                              map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one),
                                        map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one),
                map(float, sample_two),
                use_continuity=args.mwu_use_continuity,
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one),
                           map(float, sample_two),
                           ddof=args.ddof)
            for value in z:
                cols.append(value)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(map(float, sample_one),
                                                  map(float, sample_two),
                                                  equal_var=args.equal_var)
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one),
                                      map(float, sample_two),
                                      axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one),
                                    map(float, sample_two),
                                    axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one),
                                          map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for value in a:
                cols.append(value)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one),
                map(float, sample_two),
                initial_lexsort=args.initial_lexsort,
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one),
                              map(float, sample_two),
                              base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one),
                                               map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one),
                                               map(float, sample_two),
                                               ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one),
                                               ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one),
                    map(float, sample_two),
                    ddof=args.ddof,
                    lambda_=args.lambda_,
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one),
                                                       ddof=args.ddof,
                                                       lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                     map(float, sample_two),
                                                     alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                     alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one),
                    method=args.med,
                    weights=map(float, sample_two),
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one),
                                                      method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for row in ob:
                elements = ",".join(map(str, row))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center,
                                         proportiontocut=args.proportiontocut,
                                         *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center,
                                      proportiontocut=args.proportiontocut,
                                      *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties,
                correction=args.correction,
                lambda_=args.lambda_,
                *b_samples)
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for row in table:
                elements = ",".join(map(str, row))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
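For reference, the "zmap" branch above standardizes the first sample against the second: each value of sample_one is expressed as a z-score computed from the mean and standard deviation of sample_two. A minimal sketch of the equivalent computation, on made-up inputs rather than the tool's tab-separated samples:

import numpy as np
from scipy import stats

sample_one = [1.0, 2.0, 3.0, 4.0]   # values to standardize
sample_two = [0.0, 2.0, 4.0, 6.0]   # reference distribution

# stats.zmap scores sample_one using the mean/std of sample_two
z = stats.zmap(sample_one, sample_two, ddof=0)

# same thing by hand: (x - mean(reference)) / std(reference)
ref = np.asarray(sample_two)
z_by_hand = (np.asarray(sample_one) - ref.mean()) / ref.std(ddof=0)
assert np.allclose(z, z_by_hand)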
Example #19
def population_coupling_df(
    df: pd.core.frame.DataFrame,
    spiketrain_col: str = "spiketrain",
    spiketimes_col: str = "spiketimes",
    binsize: float = 0.01,
    num_lags: int = 100,
    t_start: float = None,
    t_stop: float = None,
    return_all: bool = False,
):
    """
    Calculate the population-coupling index between each spiketrain and all others in a DataFrame.

    The metric is calculated by computing and standardising cross correlation
    between an individual spiketrain and the "population spiketrain", consisting of all other neurons.
    Large Z score cross correlation at lag=0 is indicative of high population coupling.

    Args:
        df: A pandas DataFrame containing spiketimes indexed by spiketrain
        spiketrain_col: The column containing spiketrain identifiers
        spiketimes_col: The column containing spiketimes
        binsize: The size of the time bin in seconds
        num_lags: The number of lags forward and backwards around lag 0 to return
        t_start: Minimum timepoint
        t_stop: Maximum timepoint
        return_all: If true, all time bins and cross correlation values are returned
    Returns:
        A pandas DataFrame containing one row per spiketrain with columns {spiketrain_col, 'population_coupling'}
    """
    ROUNDING_PRECISION = 5
    FRAC_TO_COMPARE = 4

    bin_idx_to_start = ((num_lags * 2) + 1) // FRAC_TO_COMPARE
    out: list = []
    spiketrains = df[spiketrain_col].unique()
    for spiketrain in spiketrains:
        spiketrain_oi = df[df[spiketrain_col] ==
                           spiketrain][spiketimes_col].values
        population_spiketrain = np.sort(
            df[df[spiketrain_col] != spiketrain][spiketimes_col].values)
        t, cc = spiketimes.correlate.cross_corr(
            spiketrain_1=spiketrain_oi,
            spiketrain_2=population_spiketrain,
            binsize=binsize,
            num_lags=num_lags,
            as_df=False,
            t_start=t_start,
            t_stop=t_stop,
            delete_0_lag=False,
        )
        z = stats.zmap(cc, cc[:bin_idx_to_start])
        t = np.round(t, ROUNDING_PRECISION)
        if return_all:
            out.append(
                pd.DataFrame({
                    "time_sec": t,
                    "zscore": z,
                    spiketrain_col: spiketrain
                }))
        else:
            out.append(z[t == 0][0])
    if return_all:
        df = pd.concat(out, axis=0)
    else:
        df = pd.DataFrame({
            spiketrain_col: spiketrains,
            "population_coupling": out
        })
    return df
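The coupling index above hinges on the `stats.zmap` call: the cross-correlogram is z-scored against its first quarter of bins (the far-lag baseline, via FRAC_TO_COMPARE), and the standardized value at lag 0 is kept as the score. A minimal sketch of that standardization step, using a made-up correlogram array in place of `spiketimes.correlate.cross_corr` output:

import numpy as np
from scipy import stats

num_lags = 100
binsize = 0.01
t = np.arange(-num_lags, num_lags + 1) * binsize          # lag times in seconds
cc = np.random.poisson(5, size=t.size).astype(float)      # stand-in correlogram counts
cc[num_lags] += 20                                         # exaggerate the lag-0 bin

# z-score the whole correlogram against its first quarter (baseline lags)
baseline = cc[: ((num_lags * 2) + 1) // 4]
z = stats.zmap(cc, baseline)

population_coupling = z[t == 0][0]                         # large z at lag 0 => strong coupling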
## short-example.py
## short example analysis, just to make sure dependencies are installed correctly

import numpy as np
import pandas as pd
import sys

sys.path.insert(0, '../cadreModels')

from classificationBinary import binaryCadreModel
from sklearn.datasets import make_classification
from scipy.stats import zscore, zmap

from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=10000, random_state=2125615)

data = pd.DataFrame(X).assign(target=y)
features = data.columns[data.columns != 'target']

D_tr, D_va = train_test_split(data, test_size=0.2, random_state=313616)

# standardize: validation features use the training set's mean/std (zmap); training features use their own (zscore)
D_va[features] = zmap(D_va[features], D_tr[features])
D_tr[features] = zscore(D_tr[features])

scm = binaryCadreModel(Tmax=1001, record=50)
scm.fit(D_tr, 'target', features, features, D_va, progress=True)
Example #21
 def test_zmap(self):
     for n in self.get_n():
         x, y, xm, ym = self.generate_xy_sample(n)
         z = stats.zmap(x, y)
         zm = stats.mstats.zmap(xm, ym)
         assert np.all(abs(z - zm[0:len(z)]) < 1.E-10)
Example #22
#######################
## construct dataset ##
#######################

import numpy as np
import pandas as pd
import scipy.stats as ss
from sklearn.datasets import load_boston
# NOTE: `sc` (the cadre-model regression module used below) is assumed to be
# imported elsewhere in the original script.

boston = load_boston()
D = pd.DataFrame(boston['data']).assign(target=boston['target']).sample(frac=1, random_state=662352).values

## train and test set sizes
Ntr = int(np.ceil(0.75*D.shape[0]))
Nva = D.shape[0] - Ntr
P = D.shape[1]-1

## train and test splits
Dtr, Dva = D[:Ntr,:], D[Ntr:,:]
## standardize: the validation split uses the training split's mean/std (zmap); the training split uses its own (zscore)
Dva, Dtr = ss.zmap(Dva, Dtr), ss.zscore(Dtr)
Xtr, Ytr = Dtr[:,:-1], np.expand_dims(Dtr[:,-1], 1)
Xva, Yva = Dva[:,:-1], np.expand_dims(Dva[:,-1], 1)

M = 3                # number of cadres
alpha = [0.95, 0.05] # d is more l1, W is more l2
lam = [1, 1]         # regularization strength

##################
## learn models ##
##################

cadreModel = sc.regressionCadreModel(lambda_d=lam[0], lambda_W=lam[1], M=M)
cadreModel.fit(Xtr, Ytr)

## learn SVRs
Example #23
def coerce_levels(image_numpy,
                  levels=255,
                  method="divide",
                  reference_image=[],
                  reference_norm_range=[.075, 1],
                  mask_value=0,
                  coerce_positive=True):
    """ In volumes with huge outliers, the divide method will
        likely result in many zero values. This happens in practice
        quite often. TO-DO: find a better method to bin image values.
        I'm sure there are a thousand such algorithms out there to do
        so. Maybe something based on median's, rather than means. This,
        of course, loses the 'Extremeness' of extreme values. An open
        question of how to reconcile this -- maybe best left to the user.
        Note that there is some dubious +1s and -1s in this function. It
        may be better to clean these up in the future. I have also built-in
        the coerce-positive function into this function. The other one
        was not working for mysterious reasons.
    """

    if np.min(image_numpy) < 0 and coerce_positive:
        reference_image -= np.min(image_numpy)
        image_numpy[image_numpy != mask_value] -= np.min(image_numpy)

    levels -= 1
    if method == "divide":
        if reference_image == []:
            image_max = np.max(image_numpy)
        else:
            image_max = np.max(reference_image)
        for x in xrange(image_numpy.shape[0]):
            for y in xrange(image_numpy.shape[1]):
                for z in xrange(image_numpy.shape[2]):
                    if image_numpy[x, y, z] != mask_value:
                        image_numpy[x, y, z] = np.round(
                            (image_numpy[x, y, z] / image_max) * levels) + 1
    """ Another method is to bin values based on their z-score. I provide
        two options: within-ROI normalization, and whole-image normalization.
        The output is always the ROI image, but in the latter option z-scores
        are generated from the range of intensities across the entire image
        within some range of percentages. This range is currently determined
        from the mean, but it may make more sense to do it from the median;
        this protects the algorithm from extreme values. On the other hand,
        using the median could white out an otherwise heterogeneous hotspot.
    """

    if method == "z_score":

        # check_image(image_numpy, mode="maximal_slice", mask_value=mask_value)

        ## Note that this is a bad way to check this variable.
        if reference_image == []:
            masked_image_numpy = np.ma.masked_equal(image_numpy, mask_value)
            z_image_numpy = stats.zscore(masked_image_numpy, axis=None)

            # image_range = [np.min(z_image_numpy), np.max(z_image_numpy)]
            image_range = [
                np.mean(z_image_numpy) - np.std(z_image_numpy),
                np.mean(z_image_numpy) + np.std(z_image_numpy)
            ]
            bins = np.linspace(image_range[0], image_range[1], levels)

            # distribution = stats.norm(loc=np.mean(z_image_numpy), scale=np.var(z_image_numpy))

            # # percentile point, the range for the inverse cumulative distribution function:
            # bounds_for_range = distribution.cdf([0, 100])

            # # Linspace for the inverse cdf:
            # pp = np.linspace(*bounds_for_range, num=levels)

            # bins = distribution.ppf(pp)
            # print bins
        else:
            masked_reference_image = np.ma.masked_equal(
                reference_image, mask_value)
            masked_reference_image = np.ma.masked_less(
                masked_reference_image,
                reference_norm_range[0] * np.max(reference_image))
            masked_reference_image = np.ma.masked_greater(
                masked_reference_image,
                reference_norm_range[1] * np.max(reference_image))
            masked_image_numpy = np.ma.masked_equal(image_numpy, mask_value)
            z_image_numpy = stats.zmap(masked_image_numpy,
                                       masked_reference_image,
                                       axis=None)

            z_reference_image = stats.zscore(masked_reference_image, axis=None)

            # distribution = stats.norm(loc=np.mean(z_reference_image), scale=np.var(z_reference_image))

            # # percentile point, the range for the inverse cumulative distribution function:
            # bounds_for_range = distribution.cdf([0, 100])

            # # Linspace for the inverse cdf:
            # pp = np.linspace(*bounds_for_range, num=levels)

            # bins = distribution.ppf(pp)

            # image_range = [np.mean(z_reference_image) - np.std(z_reference_image), np.mean(z_reference_image) + np.std(z_reference_image)]
            image_range = [
                np.min(z_reference_image),
                np.max(z_reference_image)
            ]
            bins = np.linspace(image_range[0], image_range[1], levels)

        for x in xrange(image_numpy.shape[0]):
            for y in xrange(image_numpy.shape[1]):
                for z in xrange(image_numpy.shape[2]):
                    if image_numpy[x, y, z] != mask_value:
                        image_numpy[x, y, z] = (
                            np.abs(bins - z_image_numpy[x, y, z])).argmin() + 1

        # check_image(image_numpy, mode="maximal_slice", mask_value=mask_value)
    image_numpy[image_numpy == mask_value] = 0
    return image_numpy
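In short, the z_score branch above z-scores the masked image (with `stats.zscore` against itself, or with `stats.zmap` against a masked reference image), builds evenly spaced bins over the resulting z range, and assigns each non-mask voxel the index of its nearest bin. A minimal 1-D sketch of that binning step, on random stand-in arrays rather than image volumes:

import numpy as np
from scipy import stats

levels = 255
image = np.random.normal(100.0, 15.0, size=1000)       # stand-in voxel intensities
reference = np.random.normal(100.0, 15.0, size=5000)   # stand-in reference intensities

# z-score the image against the reference distribution
z_image = stats.zmap(image, reference)

# bin edges span the reference's own z range; each value gets its nearest bin index
z_reference = stats.zscore(reference)
bins = np.linspace(z_reference.min(), z_reference.max(), levels - 1)
binned = np.abs(bins[None, :] - z_image[:, None]).argmin(axis=1) + 1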
Example #24
def run_prediction_models(hdf, feats=None, verbose=True):
    """
    Runs model using diffusion embedding scores to predict behavioral measures

    Parameters
    ----------
    hdf : structures.Frog
        HDF5 file containing SNF gridsearch outputs
    feats : list of str, optional
        List of behavioral features to use as prediction targets

    Returns
    -------
    Y_corrs : (K, F) numpy.ndarray
        Correlation between predicted and actual behavioral values for `F`
        features across `K` folds
    Y_mses : (N, F) numpy.ndarray
        Mean-squared error of predicted and actual behavioral values for `F`
        features across `N` subjects
    """

    if feats is None:
        feats = ['pigd', 'tremor']

    holdout = hdf.load('/snf/processed/holdout/all/sqeuclidean/gridsearch')
    X_holdout = holdout['embedding'][:, :5]
    behavior = hdf.load('/processed/pd_behavioral_measures')
    Y_holdout = np.asarray(behavior[feats])
    consensus = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/consensus')

    # to store out-of-sample correlations and MSE scores
    n_splits = 5
    Y_corrs = np.zeros((n_splits, len(feats)))
    Y_mses = np.zeros_like(Y_holdout)

    # 5-fold CV
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for n, (train_index, test_index) in enumerate(kf.split(X_holdout)):
        # split X and Y into train/test
        X_train, X_test = X_holdout[train_index], X_holdout[test_index]
        Y_train, Y_test = Y_holdout[train_index], Y_holdout[test_index]

        # zscore / zmap and add constant to X matrix
        X_test = add_constant(sstats.zmap(X_test, X_train, ddof=1))
        X_train = add_constant(sstats.zscore(X_train, ddof=1))
        Y_test = sstats.zmap(Y_test, Y_train, ddof=1)
        Y_train = sstats.zscore(Y_train, ddof=1)

        # fit model and predict out-of-sample
        betas = np.linalg.lstsq(X_train, Y_train, rcond=None)[0]
        Y_pred = X_test @ betas

        # get correlation and MSE
        Y_corrs[n] = utils.efficient_corr(Y_pred, Y_test)
        Y_mses[test_index] = (Y_test - Y_pred)**2
        Y_mse_mean = np.mean(Y_mses[test_index], axis=0)

        if verbose:
            print(f'Fold {n + 1}: r = {Y_corrs[n]:}, mse = {Y_mse_mean:}')

    if verbose:
        print('\nAverage correlations across folds:')
        corrs_mean, corrs_std = Y_corrs.mean(0), Y_corrs.std(0, ddof=1)
        for n, t in enumerate(feats):
            print(r'{:<9}: r = {:.3f} $\pm$ {:.3f}'.format(
                t, corrs_mean[n], corrs_std[n]))

        print('\nGroups differences in MSE:')
        f_hold, p_hold = sstats.f_oneway(*(Y_mses[consensus == cl]
                                           for cl in np.unique(consensus)))
        for n, t in enumerate(feats):
            print('{:<9}: F = {:.2f}, p = {:.3f}'.format(
                t, f_hold[n], p_hold[n]))

    return Y_corrs, Y_mses
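Each fold above is ordinary least squares with leakage-free standardization: the training split is z-scored against itself and the test split is z-scored against the training split (zmap), so no test-set statistics enter the fit. A minimal self-contained sketch of one such fold, with random arrays standing in for the embedding and behavioral scores (`add_constant` is assumed to come from statsmodels, as in the original script):

import numpy as np
from scipy import stats as sstats
from statsmodels.api import add_constant

rng = np.random.default_rng(0)
X_train, X_test = rng.normal(size=(80, 5)), rng.normal(size=(20, 5))
Y_train, Y_test = rng.normal(size=(80, 2)), rng.normal(size=(20, 2))

# standardize the test split with the training split's mean/std, then add an intercept column
X_test = add_constant(sstats.zmap(X_test, X_train, ddof=1))
X_train = add_constant(sstats.zscore(X_train, ddof=1))
Y_test = sstats.zmap(Y_test, Y_train, ddof=1)
Y_train = sstats.zscore(Y_train, ddof=1)

# least-squares fit on the training split, out-of-sample prediction on the test split
betas = np.linalg.lstsq(X_train, Y_train, rcond=None)[0]
Y_pred = X_test @ betas
mse = ((Y_test - Y_pred) ** 2).mean(axis=0)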