Example #1
def get_df_stats(scores_df, analyspar):
    """
    get_df_stats(scores_df, analyspar)

    Returns statistics (mean/median and error) for each data column.

    Required args:
        - scores_df (pd.DataFrame):
            dataframe where each column contains data for which statistics 
            should be measured
        - analyspar (AnalysPar): 
            named tuple containing analysis parameters
    
    Returns:
        - stats_df (pd.DataFrame):
            dataframe with only one data row containing data stats for each 
            original column under "{col}_stat" and "{col}_err"
    """

    # take statistics
    stats_df = pd.DataFrame()
    for col in scores_df.columns:

        # get stats
        stat = math_util.mean_med(scores_df[col].to_numpy(),
                                  stats=analyspar.stats,
                                  nanpol="omit")
        err = math_util.error_stat(scores_df[col].to_numpy(),
                                   stats=analyspar.stats,
                                   error=analyspar.error,
                                   nanpol="omit")

        if isinstance(err, np.ndarray):
            err = err.tolist()
            stats_df = gen_util.set_object_columns(stats_df, [f"{col}_err"],
                                                   in_place=True)

        stats_df.loc[0, f"{col}_stat"] = stat
        stats_df.at[0, f"{col}_err"] = err

    return stats_df
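For orientation, a minimal standalone sketch of the same per-column computation, assuming "mean" statistics and standard-error-of-the-mean error (math_util and gen_util are project modules, so plain numpy/pandas stand in for them here):

import numpy as np
import pandas as pd

# toy input: one column contains a NaN to exercise the "omit" policy
scores_df = pd.DataFrame({"train": [0.9, 0.8, np.nan], "test": [0.7, 0.6, 0.5]})

stats_df = pd.DataFrame()
for col in scores_df.columns:
    data = scores_df[col].to_numpy()
    n_valid = np.sum(~np.isnan(data))
    stats_df.loc[0, f"{col}_stat"] = np.nanmean(data)  # stat (mean, NaNs omitted)
    stats_df.loc[0, f"{col}_err"] = np.nanstd(data, ddof=1) / np.sqrt(n_valid)  # SEM

print(stats_df)  # one row: train_stat, train_err, test_stat, test_err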
Example #2
def add_CI_p_vals(shuffle_df, stats_data_df, permpar):
    """
    add_CI_p_vals(shuffle_df, stats_data_df, permpar)

    Returns confidence intervals from shuffled data, and p-values for real data.

    Required args:
        - shuffle_df (pd.DataFrame):
            dataframe where each row contains data for different data 
            shuffles, and each column contains data to use to construct null 
            distributions.
        - stats_data_df (pd.DataFrame):
            dataframe with only one data row containing real data stats for 
            each shuffle_df column. Columns should have the same names as 
            shuffle_df, as "{col}_stat" and "{col}_err".
        - permpar (PermPar): 
            named tuple containing permutation parameters

    Returns:
        - stats_df (pd.DataFrame):
            dataframe with only one data row containing real data stats, 
            shuffled data stats, and p-values for real data test set results.
    """

    if len(stats_data_df) != 1:
        raise ValueError("Expected stats_data_df to have length 1.")

    multcomp = 1 if not permpar.multcomp else permpar.multcomp
    p_thresh_corr = permpar.p_val / multcomp
    percs = math_util.get_percentiles(CI=(1 - p_thresh_corr),
                                      tails=permpar.tails)[0]
    percs = [percs[0], 50, percs[1]]

    stats_df = pd.DataFrame()
    for col in shuffle_df.columns:
        # add real data
        stat_key = f"{col}_stat"
        err_key = f"{col}_err"
        if (stat_key not in stats_data_df.columns
                or err_key not in stats_data_df.columns):
            raise KeyError(
                f"{stat_key} and {err_key} not found in stats_data_df.")
        stats_df[stat_key] = stats_data_df[stat_key]
        stats_df[err_key] = stats_data_df[err_key]

        # get and add null CI data
        shuffle_data = shuffle_df[col].to_numpy()
        shuffle_data = shuffle_data[~np.isnan(shuffle_data)]  # remove NaN data

        rand_util.check_n_rand(len(shuffle_data), p_thresh_corr)
        null_CI = [np.percentile(shuffle_data, p) for p in percs]

        null_key = f"{col}_null_CIs"
        stats_df = gen_util.set_object_columns(stats_df, [null_key],
                                               in_place=True)
        stats_df.at[0, null_key] = null_CI

        # get and add p-value
        if "test" in col:
            perc = scist.percentileofscore(shuffle_data,
                                           stats_data_df.loc[0, stat_key],
                                           kind='mean')
            if perc > 50:
                perc = 100 - perc

            p_val = perc / 100
            stats_df.loc[0, f"{col}_p_vals"] = p_val

    return stats_df
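The null-CI and p-value logic above can be exercised standalone; this sketch uses fabricated shuffle data and an assumed two-tailed 0.05 threshold with no multiple-comparison correction, with plain numpy/scipy in place of the project's math_util and rand_util helpers:

import numpy as np
import scipy.stats as scist

rng = np.random.default_rng(0)
shuffle_data = rng.normal(size=10_000)  # null distribution from shuffles
real_stat = 2.1                         # observed statistic

p_thresh = 0.05                         # assumed two-tailed threshold
percs = [100 * p_thresh / 2, 50, 100 * (1 - p_thresh / 2)]
null_CI = [np.percentile(shuffle_data, p) for p in percs]

perc = scist.percentileofscore(shuffle_data, real_stat, kind="mean")
if perc > 50:  # fold onto the nearer tail, as in the function above
    perc = 100 - perc
p_val = perc / 100

print(null_CI, p_val)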
Example #3
def add_relative_resp_data(resp_data_df,
                           analyspar,
                           rel_sess=1,
                           in_place=False):
    """
    add_relative_resp_data(resp_data_df, analyspar)

    Adds relative response data to input dataframe for any column with "exp" 
    in the name, optionally in place.

    Required args:
        - resp_data_df (pd.DataFrame):
            dataframe with one row per session, and response stats 
            (2D array, ROI x stats) under keys for expected ("exp") and 
            unexpected ("unexp") data, separated by Gabor frame 
            (e.g., "exp_3", "unexp_G") if applicable.
        - analyspar (AnalysPar): 
            named tuple containing analysis parameters

    Optional args:
        - rel_sess (int):
            number of session relative to which data should be scaled, for each 
            mouse
            default: 1
        - in_place (bool):
            if True, dataframe is modified in place
            default: False

    Returns:
        - resp_data_df (pd.DataFrame):
            input dataframe, with "rel_{}" columns added for each input column 
            with "exp" in its name
    """

    if not in_place:
        resp_data_df = resp_data_df.copy(deep=True)

    nanpol = None if analyspar.rem_bad else "omit"

    source_columns = [col for col in resp_data_df.columns if "exp" in col]
    rel_columns = [f"rel_{col}" for col in source_columns]
    resp_data_df = gen_util.set_object_columns(resp_data_df,
                                               rel_columns,
                                               in_place=True)

    # calculate relative values for each column
    resp_data_df = resp_data_df.sort_values("sess_ns")
    for mouse_n, resp_mouse_df in resp_data_df.groupby("mouse_ns"):
        # find the reference session, and check that there is exactly one
        rel_sess_idx = resp_mouse_df.loc[resp_mouse_df["sess_ns"] ==
                                         rel_sess].index
        mouse_n_idxs = resp_mouse_df.index
        if len(rel_sess_idx) != 1:
            raise RuntimeError(
                f"Expected to find session {rel_sess} data for each mouse, "
                f"but if is missing for mouse {mouse_n}.")

        mouse_row = resp_mouse_df.loc[rel_sess_idx[0]]
        for source_col in source_columns:
            rel_col = source_col.replace("unexp", "exp")
            rel_data = math_util.mean_med(mouse_row[rel_col],
                                          analyspar.stats,
                                          nanpol=nanpol)
            for mouse_n_idx in mouse_n_idxs:
                resp_data_df.at[mouse_n_idx, f"rel_{source_col}"] = \
                    resp_data_df.loc[mouse_n_idx, source_col] / rel_data

    return resp_data_df
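A toy version of the per-mouse relative scaling, with fabricated values and a single "exp" column (the real function also scales "unexp" columns by the matching "exp" column stat):

import pandas as pd

df = pd.DataFrame({
    "mouse_ns": [1, 1, 2, 2],
    "sess_ns":  [1, 2, 1, 2],
    "exp":      [2.0, 3.0, 4.0, 6.0],
})

rel_sess = 1
for mouse_n, mouse_df in df.groupby("mouse_ns"):
    # scale each mouse's values by its reference-session value
    base = mouse_df.loc[mouse_df["sess_ns"] == rel_sess, "exp"].item()
    df.loc[mouse_df.index, "rel_exp"] = mouse_df["exp"] / base

print(df)  # rel_exp is 1.0 for each mouse's reference session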
Example #4
def get_resp_df(sessions, analyspar, stimpar, rel_sess=1, parallel=False):
    """
    get_resp_df(sessions, analyspar, stimpar)

    Returns relative response dataframe for requested sessions.

    Required args:
        - sessions (list): 
            session objects
        - analyspar (AnalysPar): 
            named tuple containing analysis parameters
        - stimpar (StimPar): 
            named tuple containing stimulus parameters

    Optional args:
        - rel_sess (int):
            number of session relative to which data should be scaled, for each 
            mouse. If None, relative data is not added.
            default: 1
        - parallel (bool): 
            if True, some of the analysis is run in parallel across CPU cores 
            default: False

    Returns:
        - resp_data_df (pd.DataFrame):
            dataframe with response stats (2D array, ROI x stats) under 
            keys for expected ("exp") and unexpected ("unexp") data, 
            separated by Gabor frame (e.g., "exp_3", "unexp_G") 
            if stimpar.stimtype == "gabors", and 
            with "rel_{}" columns added for each input column with "exp" in its 
            name if rel_sess is not None.
    """

    if analyspar.tracked:
        misc_analys.check_sessions_complete(sessions, raise_err=True)

    sessids = [sess.sessid for sess in sessions]
    resp_data_df = misc_analys.get_check_sess_df(sessions, analyspar=analyspar)

    # double check that sessions are in correct order
    if resp_data_df["sessids"].tolist() != sessids:
        raise NotImplementedError(
            "Implementation error. Sessions must appear in correct order in "
            "resp_data_df.")

    logger.info(f"Loading data for each session...", extra={"spacing": TAB})
    data_dicts = gen_util.parallel_wrap(get_sess_integ_resp_dict,
                                        sessions,
                                        args_list=[analyspar, stimpar],
                                        parallel=parallel)

    # add data to df
    misc_analys.get_check_sess_df(sessions, resp_data_df)
    for i, idx in enumerate(resp_data_df.index):
        for key, value in data_dicts[i].items():
            if i == 0:
                resp_data_df = gen_util.set_object_columns(resp_data_df, [key],
                                                           in_place=True)
            resp_data_df.at[idx, key] = value[:, 0]  # retain stat only, not error

    # add relative data
    if rel_sess is not None:
        resp_data_df = add_relative_resp_data(resp_data_df,
                                              analyspar,
                                              rel_sess=rel_sess,
                                              in_place=True)

    return resp_data_df
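The merge step in the middle of the function (writing per-session dicts into object columns) can be illustrated on its own; data_dicts below stands in for the parallel_wrap output:

import pandas as pd

resp_data_df = pd.DataFrame({"sessids": [101, 102]})
data_dicts = [{"exp": [0.1, 0.2]}, {"exp": [0.3, 0.4]}]  # one dict per session

for i, idx in enumerate(resp_data_df.index):
    for key, value in data_dicts[i].items():
        if key not in resp_data_df.columns:
            resp_data_df[key] = None  # object column, so cells can hold lists
        resp_data_df.at[idx, key] = value

print(resp_data_df)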
Example #5
def corr_scatterplots(sessions, analyspar, stimpar, basepar, idxpar, permpar, 
                      permute="sess", sig_only=False, randst=None, n_bins=200, 
                      parallel=False):
    """
    corr_scatterplots(sessions, analyspar, stimpar, basepar, idxpar, permpar)

    Returns ROI index correlation scatterplot data for each line/plane/session 
    comparison.

    Required args:
        - sessions (list): 
            Session objects
        - analyspar (AnalysPar): 
            named tuple containing analysis parameters
        - stimpar (StimPar): 
            named tuple containing stimulus parameters
        - basepar (BasePar): 
            named tuple containing baseline parameters
        - idxpar (IdxPar): 
            named tuple containing index parameters
        - permpar (PermPar): 
            named tuple containing permutation parameters.
    
    Optional args:
        - permute (str):
            type of permutation to use ("tracking", "sess" or "all")
            default: "sess"
        - sig_only (bool):
            if True, ROIs with significant USIs are included 
            (only possible if analyspar.tracked is True)
            default: False
        - randst (int or np.random.RandomState): 
            seed value to use. (-1 treated as None)
            default: None
        - n_bins (int): 
            number of bins for random data
            default: 200
        - parallel (bool): 
            if True, some of the analysis is run in parallel across CPU cores 
            default: False
   
    Returns:
        - idx_corr_df (pd.DataFrame):
            dataframe with one row per line/plane, and the 
            following columns, in addition to the basic sess_df columns:

            for correlation data (normalized if corr_type is "diff_corr") for 
            session comparisons (x, y), e.g. 1v2
            - binned_rand_stats (list): number of random correlation values per 
                bin (xs x ys)
            - corr_data_xs (list): USI values for x
            - corr_data_ys (list): USI values for y
            - corrs (float): correlation between session data (x and y)
            - p_vals (float): p-value for correlation, corrected for 
                multiple comparisons and tails
            - rand_corr_meds (float): median of the random correlations
            - raw_p_vals (float): p-value for intersession correlations
            - regr_coefs (float): regression correlation coefficient (slope)
            - regr_intercepts (float): regression correlation intercept
            - x_bin_mids (list): x mid point for each random correlation bin
            - y_bin_mids (list): y mid point for each random correlation bin
    """
    
    lp_idx_df = get_lp_idx_df(
        sessions, 
        analyspar=analyspar, 
        stimpar=stimpar, 
        basepar=basepar, 
        idxpar=idxpar,
        permpar=permpar,
        sig_only=sig_only,
        randst=randst,
        parallel=parallel,
        )

    idx_corr_df = get_basic_idx_corr_df(
        lp_idx_df, consec_only=False, null_CI_cols=False
        )

    # get correlation pairs
    corr_ns = get_corr_pairs(lp_idx_df)
    if len(corr_ns) != 1:
        raise ValueError("Expected only 1 session correlation pair.")
    sess_pair = corr_ns[0]

    # get norm information
    norm = False
    corr_type = "corr"
    if permute in ["sess", "all"]:
        corr_type = "diff_corr"
        norm = True

    # add array columns
    columns = ["corr_data_xs", "corr_data_ys", "binned_rand_stats", 
        "x_bin_mids", "y_bin_mids"]
    idx_corr_df = gen_util.set_object_columns(idx_corr_df, columns)

    logger.info(
        ("Calculating ROI USI correlations across sessions..."), 
        extra={"spacing": TAB}
        )
    group_columns = ["lines", "planes"]
    for grp_vals, grp_df in lp_idx_df.groupby(group_columns):
        grp_df = grp_df.sort_values("sess_ns") # mice already aggregated
        line, plane = grp_vals
        row_idx = idx_corr_df.loc[
            (idx_corr_df["lines"] == line) &
            (idx_corr_df["planes"] == plane)
        ].index

        if len(row_idx) != 1:
            raise RuntimeError("Expected exactly one row to match.")
        row_idx = row_idx[0]

        if len(grp_df) > 2:
            raise RuntimeError("Expected no more than 2 rows to correlate.")
        if len(grp_df) < 2:
            continue # no pair
    
        use_randst = copy.deepcopy(randst) # reset each time

        # obtain correlation data
        args_dict = {
            "data_df"    : grp_df,
            "analyspar"  : analyspar,
            "permpar"    : permpar,
            "permute"    : permute,
            "corr_type"  : corr_type,
            "absolute"   : False,
            "norm"       : norm,
            "randst"     : use_randst,
            "return_data": True,
            "return_rand": True,
            "n_rand_ex"  : 1000,
        }

        all_corr_data = get_corr_data(sess_pair, **args_dict)
        [roi_corr, _, null_CI, p_val, corr_data, _, rand_exs, _] = all_corr_data

        regr = LinearRegression().fit(corr_data[0].reshape(-1, 1), corr_data[1])

        # bin data
        rand_stats, x_edge, y_edge = np.histogram2d(
            rand_exs[0].reshape(-1), rand_exs[1].reshape(-1), bins=n_bins, 
            density=False
            )
        x_mids = np.diff(x_edge) / 2 + x_edge[:-1]
        y_mids = np.diff(y_edge) / 2 + y_edge[:-1]

        rand_binned = scind.gaussian_filter(
            rand_stats, n_bins / 20, mode="constant"
            )

        idx_corr_df.loc[row_idx, "corrs"] = roi_corr
        idx_corr_df.loc[row_idx, "rand_corr_meds"] = null_CI[1]
        idx_corr_df.loc[row_idx, "regr_coefs"] = regr.coef_
        idx_corr_df.loc[row_idx, "regr_intercepts"] = regr.intercept_

        idx_corr_df.at[row_idx, "corr_data_xs"] = corr_data[0].tolist()
        idx_corr_df.at[row_idx, "corr_data_ys"] = corr_data[1].tolist()

        idx_corr_df.at[row_idx, "binned_rand_stats"] = rand_binned.tolist()
        idx_corr_df.at[row_idx, "x_bin_mids"] = x_mids.tolist()
        idx_corr_df.at[row_idx, "y_bin_mids"] = y_mids.tolist()

        idx_corr_df.loc[row_idx, "p_vals"] = p_val

    # corrected p-values
    idx_corr_df = misc_analys.add_corr_p_vals(idx_corr_df, permpar)

    return idx_corr_df
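The binning and smoothing of the random scatter data is standard numpy/scipy; a self-contained sketch with fabricated correlation values (bin count and smoothing width mirror the function above):

import numpy as np
import scipy.ndimage as scind

rng = np.random.default_rng(0)
xs, ys = rng.normal(size=(2, 5_000))  # stand-ins for rand_exs[0], rand_exs[1]
n_bins = 200

rand_stats, x_edge, y_edge = np.histogram2d(xs, ys, bins=n_bins, density=False)
x_mids = np.diff(x_edge) / 2 + x_edge[:-1]  # bin midpoints from edges
y_mids = np.diff(y_edge) / 2 + y_edge[:-1]
rand_binned = scind.gaussian_filter(rand_stats, n_bins / 20, mode="constant")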
Example #6
def get_ex_traces_df(sessions,
                     analyspar,
                     stimpar,
                     basepar,
                     n_ex=6,
                     rolling_win=4,
                     randst=None,
                     parallel=False):
    """
    get_ex_traces_df(sessions, analyspar, stimpar, basepar)

    Returns example ROI traces dataframe.

    Required args:
        - sessions (list):
            Session objects
        - analyspar (AnalysPar): 
            named tuple containing analysis parameters
        - stimpar (StimPar): 
            named tuple containing stimulus parameters
        - basepar (BasePar): 
            named tuple containing baseline parameters
    
    Optional args:
        - n_ex (int):
            number of example traces to retain
            default: 6
        - rolling_win (int):
            window to use in rolling mean over individual trial traces
            default: 4 
        - randst (int or np.random.RandomState): 
            random state or seed value to use. (-1 treated as None)
            default: None
        - parallel (bool): 
            if True, some of the analysis is run in parallel across CPU cores 
            default: False

    Returns:
        - selected_roi_data (pd.DataFrame):
            dataframe with a row for each ROI, and the following columns, 
            in addition to the basic sess_df columns: 
            - time_values (list): values for each frame, in seconds
                (only 0 to stimpar.post, unless split is "by_exp")
            - roi_ns (list): selected ROI number
            - traces_sm (list): selected ROI sequence traces, smoothed, with 
                dims: seq x frames
            - trace_stats (list): selected ROI trace mean or median
    """

    retained_traces_df = misc_analys.get_check_sess_df(sessions, None,
                                                       analyspar)
    initial_columns = retained_traces_df.columns

    logger.info(f"Identifying example ROIs for each session...",
                extra={"spacing": TAB})

    retained_roi_data = gen_util.parallel_wrap(
        get_sess_ex_traces,
        sessions, [analyspar, stimpar, basepar, rolling_win],
        parallel=parallel)

    randst = rand_util.get_np_rand_state(randst, set_none=True)

    # add data to dataframe
    new_columns = list(retained_roi_data[0])
    retained_traces_df = gen_util.set_object_columns(retained_traces_df,
                                                     new_columns,
                                                     in_place=True)

    for i, sess in enumerate(sessions):
        row_idx = retained_traces_df.loc[retained_traces_df["sessids"] ==
                                         sess.sessid].index

        if len(row_idx) != 1:
            raise RuntimeError(
                "Expected exactly one dataframe row to match session ID.")
        row_idx = row_idx[0]

        for column, value in retained_roi_data[i].items():
            retained_traces_df.at[row_idx, column] = value

    # select a few ROIs per line/plane/session
    columns = retained_traces_df.columns.tolist()
    columns = [column.replace("roi_trace", "trace") for column in columns]
    selected_traces_df = pd.DataFrame(columns=columns)

    group_columns = ["lines", "planes", "sess_ns"]
    for _, trace_grp_df in retained_traces_df.groupby(group_columns):
        trace_grp_df = trace_grp_df.sort_values("mouse_ns")
        grp_indices = trace_grp_df.index
        n_per = np.asarray([len(roi_ns) for roi_ns in trace_grp_df["roi_ns"]])
        roi_ns = np.concatenate(trace_grp_df["roi_ns"].tolist())
        concat_idxs = np.sort(randst.choice(len(roi_ns), n_ex, replace=False))

        for concat_idx in concat_idxs:
            row_idx = len(selected_traces_df)
            sess_idx = np.where(concat_idx < np.cumsum(n_per))[0][0]
            source_row = trace_grp_df.loc[grp_indices[sess_idx]]
            for column in initial_columns:
                selected_traces_df.at[row_idx, column] = source_row[column]

            selected_traces_df.at[row_idx, "time_values"] = \
                source_row["time_values"].tolist()

            roi_idx = concat_idx - n_per[:sess_idx].sum()
            for col in ["roi_ns", "traces_sm", "trace_stats"]:
                source_col = col.replace("trace", "roi_trace")
                selected_traces_df.at[row_idx, col] = \
                    source_row[source_col][roi_idx].tolist()

    for column in [
            "mouse_ns", "mouseids", "sess_ns", "sessids", "nrois", "roi_ns"
    ]:
        selected_traces_df[column] = selected_traces_df[column].astype(int)

    return selected_traces_df
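The index arithmetic used to trace a sampled ROI back to its source session is worth seeing in isolation; n_per and concat_idx below are illustrative values:

import numpy as np

n_per = np.asarray([3, 5, 2])  # ROIs per session, in concatenation order
concat_idx = 6                 # index into the concatenated ROI list

sess_idx = np.where(concat_idx < np.cumsum(n_per))[0][0]  # first bin it falls in
roi_idx = concat_idx - n_per[:sess_idx].sum()             # offset within session

print(sess_idx, roi_idx)  # -> 1 3 (4th ROI of the 2nd session)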
Example #7
def get_ex_idx_corr_norm_df(sessions, analyspar, stimpar, basepar, idxpar, 
                            permpar, permute="sess", sig_only=False, n_bins=40, 
                            randst=None, parallel=False):
    """
    get_ex_idx_corr_norm_df(sessions, analyspar, stimpar, basepar, idxpar, 
                            permpar)

    Returns example correlation normalization data.

    Required args:
        - sessions (list): 
            Session objects
        - analyspar (AnalysPar): 
            named tuple containing analysis parameters
        - stimpar (StimPar): 
            named tuple containing stimulus parameters
        - basepar (BasePar): 
            named tuple containing baseline parameters
        - idxpar (IdxPar): 
            named tuple containing index parameters
        - permpar (PermPar): 
            named tuple containing permutation parameters.
    
    Optional args:
        - permute (str):
            type of permutation to use ("tracking", "sess" or "all")
            default: "sess"
        - sig_only (bool):
            if True, ROIs with significant USIs are included 
            (only possible if analyspar.tracked is True)
            default: False
        - n_bins (int):
            number of bins
            default: 40
        - randst (int or np.random.RandomState): 
            seed value to use. (-1 treated as None)
            default: None
        - parallel (bool): 
            if True, some of the analysis is run in parallel across CPU cores 
            default: False

    Returns:
        - idx_corr_norm_df (pd.DataFrame):
            dataframe with one row for a line/plane, and the 
            following columns, in addition to the basic sess_df columns:

            for a specific session comparison, e.g. 1v2
            - {}v{}_corrs (float): unnormalized intersession ROI index 
                correlations
            - {}v{}_norm_corrs (float): normalized intersession ROI index 
                correlations
            - {}v{}_rand_ex_corrs (float): unnormalized intersession 
                ROI index correlations for an example of randomized data
            - {}v{}_rand_corr_meds (float): median of randomized correlations

            - {}v{}_corr_data (list): intersession values to correlate
            - {}v{}_rand_ex (list): intersession values for an example of 
                randomized data
            - {}v{}_rand_corrs_binned (list): binned random unnormalized 
                intersession ROI index correlations
            - {}v{}_rand_corrs_bin_edges (list): bin edges
    """

    nanpol = None if analyspar.rem_bad else "omit"

    initial_columns = misc_analys.get_sess_df_columns(sessions[0], analyspar)
    
    lp_idx_df = get_lp_idx_df(
        sessions, 
        analyspar=analyspar, 
        stimpar=stimpar, 
        basepar=basepar, 
        idxpar=idxpar,
        permpar=permpar,
        sig_only=sig_only,
        randst=randst,
        parallel=parallel,
        )

    idx_corr_norm_df = get_basic_idx_corr_df(lp_idx_df, consec_only=False)
    if len(idx_corr_norm_df) != 1:
        raise ValueError("sessions should be from the same line/plane.")

    # get correlation pairs
    corr_ns = get_corr_pairs(lp_idx_df)

    if len(corr_ns) != 1:
        raise ValueError("Sessions should allow only one pair.")
    sess_pair = corr_ns[0]
    corr_name = f"{sess_pair[0]}v{sess_pair[1]}"

    drop_columns = [
        col for col in idx_corr_norm_df.columns if col not in initial_columns
        ]
    idx_corr_norm_df = idx_corr_norm_df.drop(columns=drop_columns)

    logger.info(
        ("Calculating ROI USI correlations for a single session pair..."), 
        extra={"spacing": TAB}
        )

    corr_type = "diff_corr"
    returns = get_corr_data(
        sess_pair, 
        data_df=lp_idx_df, 
        analyspar=analyspar, 
        permpar=permpar, 
        permute=permute, 
        corr_type=corr_type,
        absolute=False,
        norm=False,
        return_data=True,
        return_rand=True,
        n_rand_ex=1, 
        randst=randst
        )

    roi_corr, _, _, _, corr_data, rand_corrs, rand_exs, rand_ex_corrs = returns
    rand_ex = rand_exs[..., 0]
    rand_ex_corr = rand_ex_corrs[0]

    rand_corr_med = math_util.mean_med(
        rand_corrs, stats="median", nanpol=nanpol
        )
    norm_roi_corr = float(
        get_norm_corrs(roi_corr, med=rand_corr_med, corr_type=corr_type)
        )

    row_idx = idx_corr_norm_df.index[0]

    idx_corr_norm_df.loc[row_idx, f"{corr_name}_corrs"] = roi_corr
    idx_corr_norm_df.loc[row_idx, f"{corr_name}_rand_ex_corrs"] = rand_ex_corr
    idx_corr_norm_df.loc[row_idx, f"{corr_name}_rand_corr_meds"] = rand_corr_med
    idx_corr_norm_df.loc[row_idx, f"{corr_name}_norm_corrs"] = norm_roi_corr

    cols = [
        f"{corr_name}_{col_name}" 
        for col_name in 
        ["corr_data", "rand_ex", "rand_corrs_binned", "rand_corrs_bin_edges"]
        ]
    idx_corr_norm_df = gen_util.set_object_columns(
        idx_corr_norm_df, cols, in_place=True
        )

    idx_corr_norm_df.at[row_idx, f"{corr_name}_corr_data"] = corr_data.tolist()
    idx_corr_norm_df.at[row_idx, f"{corr_name}_rand_ex"] = rand_ex.tolist()

    fcts = [np.min, np.max] if nanpol is None else [np.nanmin, np.nanmax]
    bounds = [fct(rand_corrs) for fct in fcts]
    bins = np.linspace(*bounds, n_bins + 1)
    rand_corrs_binned = np.histogram(rand_corrs, bins=bins)[0]

    idx_corr_norm_df.at[row_idx, f"{corr_name}_rand_corrs_bin_edges"] = \
        [bounds[0], bounds[-1]]
    idx_corr_norm_df.at[row_idx, f"{corr_name}_rand_corrs_binned"] = \
        rand_corrs_binned.tolist()

    return idx_corr_norm_df
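The final binning step, standalone, with fabricated null correlations (n_bins as in the default above):

import numpy as np

rng = np.random.default_rng(0)
rand_corrs = rng.uniform(-0.4, 0.4, size=2_000)  # stand-in null correlations
n_bins = 40

bounds = [np.min(rand_corrs), np.max(rand_corrs)]
bins = np.linspace(*bounds, n_bins + 1)
rand_corrs_binned = np.histogram(rand_corrs, bins=bins)[0]

print(bounds, rand_corrs_binned.sum())  # all values fall within the bounds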
Example #8
def get_roi_tracking_ex_df(sessions, analyspar, parallel=False):
    """
    get_roi_tracking_ex_df(sessions, analyspar)

    Returns ROI tracking example information for the requested sessions, 
    showing the different ROI matches identified depending on the order in 
    which the sessions are matched.

    Only sessions from certain mice have the requisite data stored in their 
    nway-match files.

    Required args:
        - sessions (list): 
            Session objects
        - analyspar (AnalysPar): 
            named tuple containing analysis parameters

    Optional args:
        - parallel (bool): 
            if True, some of the analysis is run in parallel across CPU cores 
            default: False
    
    Returns:
        - roi_mask_df (pd.DataFrame in dict format):
            dataframe with a row for each mouse, and the following 
            columns, in addition to the basic sess_df columns: 
            - "roi_mask_shapes" (list): shape into which ROI mask indices index 
                (sess x hei x wid)
            - "union_n_conflicts" (int): number of conflicts after union
            for "union", "fewest" and "most" tracked ROIs:
            - "{}_registered_roi_mask_idxs" (list): list of mask indices, 
                registered across sessions, for each session 
                (flattened across ROIs) ((sess, hei, wid) x val), 
                ordered by {}_sess_ns if "fewest" or "most"
            - "{}_n_tracked" (int): number of tracked ROIs
            for "fewest", "most" tracked ROIs:
            - "{}_sess_ns" (list): ordered session number 
    """

    perm_types = ["fewest", "most"]
    add_cols = ["union_n_conflicts"]
    for perm_type in perm_types:
        add_cols.append(f"{perm_type}_registered_roi_mask_idxs")
        add_cols.append(f"{perm_type}_n_tracked")
        add_cols.append(f"{perm_type}_sess_ns")

    # collect ROI mask information
    roi_mask_df = get_roi_tracking_df(sessions,
                                      analyspar,
                                      reg_only=True,
                                      parallel=parallel)
    roi_mask_df = gen_util.set_object_columns(roi_mask_df,
                                              add_cols,
                                              in_place=True)
    roi_mask_df = roi_mask_df.rename(
        columns={"registered_roi_mask_idxs": "union_registered_roi_mask_idxs"})

    all_sessids = [sess.sessid for sess in sessions]
    for row_idx in roi_mask_df.index:
        sess_ns = roi_mask_df.loc[row_idx, "sess_ns"]
        sessids = roi_mask_df.loc[row_idx, "sessids"]

        mouse_sessions = [
            sessions[all_sessids.index(sessid)] for sessid in sessids
        ]

        masks, ordered_sess_ns, n_tracked = collect_roi_tracking_example_data(
            mouse_sessions)

        roi_mask_df.loc[row_idx, "union_n_tracked"] = n_tracked["union"]
        roi_mask_df.loc[row_idx, "union_n_conflicts"] = n_tracked["conflict"]
        for perm_type in perm_types:
            if set(ordered_sess_ns[perm_type]) != set(sess_ns):
                raise RuntimeError("Session number do not match.")

            roi_mask_df.at[row_idx,
                           f"{perm_type}_registered_roi_mask_idxs"] = [
                               idxs.tolist()
                               for idxs in np.where(masks[perm_type])
                           ]
            roi_mask_df.at[row_idx, f"{perm_type}_sess_ns"] = \
                ordered_sess_ns[perm_type]
            roi_mask_df.loc[row_idx, f"{perm_type}_n_tracked"] = \
                n_tracked[perm_type]

    int_cols = [col for col in roi_mask_df.columns if "_n_" in col]
    for col in int_cols:
        roi_mask_df[col] = roi_mask_df[col].astype(int)

    return roi_mask_df
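The mask storage convention (flattening boolean masks to np.where index lists) round-trips as follows; the mask shape mirrors the (sess, hei, wid) convention in the docstring:

import numpy as np

masks = np.zeros((2, 4, 4), dtype=bool)  # sess x hei x wid
masks[0, 1, 2] = masks[1, 3, 0] = True

idx_lists = [idxs.tolist() for idxs in np.where(masks)]  # storable lists
restored = np.zeros_like(masks)
restored[tuple(np.asarray(idx_lists))] = True

assert (restored == masks).all()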
Example #9
def add_stim_roi_stats(stim_stats_df,
                       sessions,
                       analyspar,
                       stimpar,
                       permpar,
                       comp_sess=[1, 3],
                       in_place=False,
                       randst=None):
    """
    add_stim_roi_stats(stim_stats_df, sessions, analyspar, stimpar, permpar)

    Adds to dataframe comparison of absolute fractional data changes 
    between sessions for different stimuli, calculated for individual ROIs.

    Required args:
        - stim_stats_df (pd.DataFrame):
            dataframe with one row per line/plane, and the basic sess_df 
            columns, as well as stimulus columns for each comp_sess:
            - {stimpar.stimtype}_s{comp_sess[0]}: 
                first comp_sess data for each ROI
            - {stimpar.stimtype}_s{comp_sess[1]}: 
                second comp_sess data for each ROI
        - sessions (list): 
            session objects
        - analyspar (AnalysPar): 
            named tuple containing analysis parameters
        - stimpar (StimPar): 
            named tuple containing stimulus parameters
        - permpar (PermPar): 
            named tuple containing permutation parameters

    Optional args:
        - comp_sess (list):
            sessions for which to obtain absolute fractional change 
            [x, y] => |(y - x) / x|
            default: [1, 3]
        - in_place (bool):
            if True, targ_df is modified in place. Otherwise, a deep copy is 
            modified. targ_df is returned in either case.
            default: False
        - randst (int or np.random.RandomState): 
            random state or seed value to use. (-1 treated as None)
            default: None

    Returns:
        - stim_stats_df (pd.DataFrame):
            dataframe with one row per line/plane and one for all line/planes 
            together, and the basic sess_df columns, in addition to the input 
            columns, and for each stimtype:
            - {stimtype} (list): absolute fractional change statistics (me, err)
            - p_vals (float): p-value for data differences between stimulus 
                types, corrected for multiple comparisons and tails
    """

    nanpol = None if analyspar.rem_bad else "omit"

    if analyspar.tracked:
        misc_analys.check_sessions_complete(sessions, raise_err=True)
    else:
        raise ValueError(
            "If analysis is run for individual ROIs and not population "
            "statistics, analyspar.tracked must be set to True.")

    if not in_place:
        stim_stats_df = stim_stats_df.copy(deep=True)

    stimtypes = gen_util.list_if_not(stimpar.stimtype)
    stim_stats_df = gen_util.set_object_columns(stim_stats_df,
                                                stimtypes,
                                                in_place=True)

    # compile all data
    full_data = dict()
    for stimtype in stimtypes:
        for n in comp_sess:
            stim_col = f"{stimtype}_s{n}"
            full_data[stim_col] = np.concatenate(stim_stats_df[stim_col])

    row_idx = len(stim_stats_df)
    for col in stim_stats_df.columns:
        stim_stats_df.loc[row_idx, col] = "all"
        if col in full_data.keys():
            stim_stats_df.loc[row_idx, col] = full_data[col]

    # take statistics
    for row_idx in stim_stats_df.index:
        comp_data = [None, None]
        for s, stimtype in enumerate(stimtypes):
            stim_data = []
            for n in comp_sess:
                data_col = f"{stimtype}_s{n}"
                stim_data.append(stim_stats_df.loc[row_idx, data_col])
            comp_data[s] = abs_fractional_diff(stim_data)

            # get stats and add to dataframe
            stim_stats_df.at[row_idx, stimtype] = \
                math_util.get_stats(
                    comp_data[s], analyspar.stats, analyspar.error,
                    nanpol=nanpol
                    ).tolist()

        # obtain p-values
        stim_stats_df.loc[row_idx, "p_vals"] = rand_util.get_op_p_val(
            comp_data,
            permpar.n_perms,
            stats=analyspar.stats,
            paired=True,
            nanpol=nanpol,
            randst=randst)

    # remove full data columns
    data_cols = []
    for stimtype in stimtypes:
        for n in comp_sess:
            data_cols.append(f"{stimtype}_s{n}")
    stim_stats_df = stim_stats_df.drop(data_cols, axis=1)

    return stim_stats_df
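abs_fractional_diff is a project helper; per the docstring formula [x, y] => |(y - x) / x|, a plain-numpy re-implementation (an assumption, not the project code) behaves as follows:

import numpy as np

def abs_fractional_diff(data):
    # [x, y] => |(y - x) / x|, elementwise over ROI arrays
    x, y = (np.asarray(d) for d in data)
    return np.abs((y - x) / x)

print(abs_fractional_diff([np.array([1.0, 2.0]), np.array([1.5, 1.0])]))
# -> [0.5 0.5]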
Example #10
def add_stim_pop_stats(stim_stats_df,
                       sessions,
                       analyspar,
                       stimpar,
                       permpar,
                       comp_sess=[1, 3],
                       in_place=False,
                       randst=None):
    """
    add_stim_pop_stats(stim_stats_df, sessions, analyspar, stimpar, permpar)

    Adds to dataframe comparison of absolute fractional data changes 
    between sessions for different stimuli, calculated for population 
    statistics.

    Required args:
        - stim_stats_df (pd.DataFrame):
            dataframe with one row per line/plane, and the basic sess_df 
            columns, as well as stimulus columns for each comp_sess:
            - {stimpar.stimtype}_s{comp_sess[0]}: 
                first comp_sess data for each ROI
            - {stimpar.stimtype}_s{comp_sess[1]}: 
                second comp_sess data for each ROI
        - sessions (list): 
            session objects
        - analyspar (AnalysPar): 
            named tuple containing analysis parameters
        - stimpar (StimPar): 
            named tuple containing stimulus parameters
        - permpar (PermPar): 
            named tuple containing permutation parameters

    Optional args:
        - comp_sess (list):
            sessions for which to obtain absolute fractional change 
            [x, y] => |(y - x) / x|
            default: [1, 3]
        - in_place (bool):
            if True, targ_df is modified in place. Otherwise, a deep copy is 
            modified. targ_df is returned in either case.
            default: False
        - randst (int or np.random.RandomState): 
            random state or seed value to use. (-1 treated as None)
            default: None

    Returns:
        - stim_stats_df (pd.DataFrame):
            dataframe with one row per line/plane and one for all line/planes 
            together, and the basic sess_df columns, in addition to the input 
            columns, and for each stimtype:
            - {stimtype} (list): absolute fractional change statistics (me, err)
            - p_vals (float): p-value for data differences between stimulus 
                types, corrected for multiple comparisons and tails
    """

    nanpol = None if analyspar.rem_bad else "omit"

    if analyspar.tracked:
        misc_analys.check_sessions_complete(sessions, raise_err=False)

    if not in_place:
        stim_stats_df = stim_stats_df.copy(deep=True)

    stimtypes = gen_util.list_if_not(stimpar.stimtype)
    stim_stats_df = gen_util.set_object_columns(stim_stats_df,
                                                stimtypes,
                                                in_place=True)

    if analyspar.stats != "mean" or analyspar.error != "std":
        raise NotImplementedError("For population statistics analysis, "
                                  "analyspar.stats must be set to 'mean', and "
                                  "analyspar.error must be set to 'std'.")

    # initialize arrays for all data
    n_linpla = len(stim_stats_df)
    n_stims = len(stimtypes)
    n_bootstrp = misc_analys.N_BOOTSTRP

    all_stats = np.full((n_linpla, n_stims), np.nan)
    all_btstrap_stats = np.full((n_linpla, n_stims, n_bootstrp), np.nan)
    all_rand_stat_diffs = np.full((n_linpla, permpar.n_perms), np.nan)

    for i, row_idx in enumerate(stim_stats_df.index):
        full_comp_data = [[], []]
        for s, stimtype in enumerate(stimtypes):
            comp_data, btstrap_comp_data = [], []
            choices = None
            for n in comp_sess:
                data_col = f"{stimtype}_s{n}"

                # get data
                data = stim_stats_df.loc[row_idx, data_col]

                # get session stats
                comp_data.append(
                    math_util.mean_med(data, analyspar.stats, nanpol=nanpol))

                # get bootstrapped data
                returns = rand_util.bootstrapped_std(
                    data,
                    randst=randst,
                    n_samples=n_bootstrp,
                    return_rand=True,
                    return_choices=analyspar.tracked,
                    choices=choices,
                    nanpol=nanpol)

                btstrap_data = returns[1]
                if analyspar.tracked:
                    choices = returns[-1]  # use same choices across sessions

                btstrap_comp_data.append(btstrap_data)
                full_comp_data[s].append(data)  # retain full data

            # compute absolute fractional change stats (bootstrapped std)
            all_stats[i, s] = abs_fractional_diff(comp_data)
            all_btstrap_stats[i, s] = abs_fractional_diff(btstrap_comp_data)
            error = np.std(all_btstrap_stats[i, s])

            # add to dataframe
            stim_stats_df.at[row_idx, stimtype] = [all_stats[i, s], error]

        # obtain p-values for real data wrt random data
        stim_stat_diff = all_stats[i, 1] - all_stats[i, 0]

        # permute data for each session across stimtypes
        sess_rand_stats = []  # sess x stim
        for j in range(len(comp_sess)):
            rand_concat = [stim_data[j] for stim_data in full_comp_data]
            rand_concat = np.stack(rand_concat).T
            rand_stats = rand_util.permute_diff_ratio(
                rand_concat,
                div=None,
                n_perms=permpar.n_perms,
                stats=analyspar.stats,
                op="none",
                paired=True,  # pair stimuli
                nanpol=nanpol,
                randst=randst)
            sess_rand_stats.append(rand_stats)

        # obtain stats per stimtypes, then differences between stimtypes
        stim_rand_stats = list(zip(*sess_rand_stats))  # stim x sess
        all_rand_stats = []
        for rand_stats in stim_rand_stats:
            all_rand_stats.append(abs_fractional_diff(rand_stats))
        all_rand_stat_diffs[i] = all_rand_stats[1] - all_rand_stats[0]

        # calculate p-value
        p_val = rand_util.get_p_val_from_rand(stim_stat_diff,
                                              all_rand_stat_diffs[i],
                                              tails=permpar.tails,
                                              nanpol=nanpol)
        stim_stats_df.loc[row_idx, "p_vals"] = p_val

    # collect stats for all line/planes
    row_idx = len(stim_stats_df)
    for col in stim_stats_df.columns:
        stim_stats_df.loc[row_idx, col] = "all"

    # average across line/planes
    all_data = []
    for data in [all_stats, all_btstrap_stats, all_rand_stat_diffs]:
        all_data.append(
            math_util.mean_med(data, analyspar.stats, nanpol=nanpol, axis=0))
    stat, btstrap_stats, rand_stat_diffs = all_data

    for s, stimtype in enumerate(stimtypes):
        error = np.std(btstrap_stats[s])
        stim_stats_df.at[row_idx, stimtype] = [stat[s], error]

    p_val = rand_util.get_p_val_from_rand(stat[1] - stat[0],
                                          rand_stat_diffs,
                                          tails=permpar.tails,
                                          nanpol=nanpol)
    stim_stats_df.loc[row_idx, "p_vals"] = p_val

    return stim_stats_df
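The bootstrapped error used above can be sketched with plain numpy (the resampling logic is an assumption standing in for rand_util.bootstrapped_std):

import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(loc=1.0, size=50)  # one session's ROI values
n_samples = 1_000

# resample with replacement; reusing `choices` across sessions pairs tracked ROIs
choices = rng.integers(0, len(data), size=(n_samples, len(data)))
btstrap_data = data[choices].mean(axis=1)  # bootstrap distribution of the mean
error = np.std(btstrap_data)               # bootstrapped std as the error

print(error)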
Example #11
def check_init_stim_data_df(data_df,
                            sessions,
                            stimpar,
                            comp_sess=[1, 3],
                            stim_data_df=None,
                            analyspar=None):
    """
    check_init_stim_data_df(data_df, sessions, stimpar)

    Checks existing stimulus dataframe or creates one for each line/plane.

    Required args:
        - data_df (pd.DataFrame):
            dataframe with one row per session, and the basic sess_df columns
        - sessions (list): 
            session objects
        - stimpar (StimPar): 
            named tuple containing stimulus parameters

    Optional args:
        - comp_sess (list):
            sessions for which to obtain absolute fractional change 
            [x, y] => |(y - x) / x|
            default: [1, 3]
        - stim_data_df (pd.DataFrame):
            dataframe with one row per line/plane, and the basic sess_df 
            columns
            default: None
        - analyspar (AnalysPar): 
            named tuple containing analysis parameters
            default: None

    Returns:
        - stim_data_df (pd.DataFrame):
            dataframe with one row per line/plane, and the basic sess_df 
            columns, as well as stimulus columns for each comp_sess:
            - {stimpar.stimtype}_s{comp_sess[0]}: for first comp_sess data
            - {stimpar.stimtype}_s{comp_sess[1]}: for second comp_sess data
    """

    initial_columns = misc_analys.get_sess_df_columns(sessions[0], analyspar)

    stimtype_cols = [f"{stimpar.stimtype}_s{i}" for i in comp_sess]
    if stim_data_df is None:
        new_df = True
        if analyspar is None:
            raise ValueError(
                "If stim_data_df is None, analyspar must be provided.")
        columns = initial_columns + stimtype_cols
        stim_data_df = pd.DataFrame(columns=columns)
    else:
        new_df = False
        if stimpar.stimtype in stim_data_df:
            raise KeyError(
                f"{stimpar.stimtype} should not already be in stim_data_df.")
        stim_data_df = gen_util.set_object_columns(stim_data_df,
                                                   stimtype_cols,
                                                   in_place=True)

    group_columns = ["lines", "planes"]
    aggreg_cols = [
        col for col in initial_columns
        if col not in group_columns + ["sess_ns"]
    ]

    # populate dataframe
    for grp_vals, grp_df in data_df.groupby(group_columns):
        grp_df = grp_df.sort_values(["sess_ns", "mouse_ns"])
        line, plane = grp_vals
        if new_df:
            row_idx = len(stim_data_df)
            for g, group_column in enumerate(group_columns):
                stim_data_df.loc[row_idx, group_column] = grp_vals[g]
        else:
            row_idxs = stim_data_df.loc[(stim_data_df["lines"] == line) & (
                stim_data_df["planes"] == plane)].index
            if len(row_idxs) != 1:
                raise ValueError(
                    "Expected exactly one row to match line/plane.")
            row_idx = row_idxs[0]

        # add aggregated values for initial columns
        ext_stim_data_df = misc_analys.aggreg_columns(grp_df,
                                                      stim_data_df,
                                                      aggreg_cols,
                                                      row_idx=row_idx,
                                                      in_place=new_df)

        # check data was added correctly
        if not new_df:
            for col in aggreg_cols:
                if (ext_stim_data_df.loc[row_idx, col] !=
                        stim_data_df.loc[row_idx, col]):
                    raise RuntimeError(
                        "If stim_data_df is not None, it must contain columns "
                        "generated from data_df. This does not appear to be "
                        f"the case, as the values in {col} do not match the "
                        "values that would be added if stim_data_df was None.")

    if new_df:
        stim_data_df = ext_stim_data_df

    return stim_data_df
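A toy version of the grouping step (one output row per line/plane, with session-level values aggregated into lists); the column values are fabricated:

import pandas as pd

data_df = pd.DataFrame({
    "lines": ["L23", "L23"], "planes": ["soma", "soma"],
    "sess_ns": [1, 3], "mouse_ns": [1, 1],
})

stim_data_df = pd.DataFrame(columns=["lines", "planes", "mouse_ns"])
for (line, plane), grp_df in data_df.groupby(["lines", "planes"]):
    row_idx = len(stim_data_df)
    stim_data_df.loc[row_idx, "lines"] = line
    stim_data_df.loc[row_idx, "planes"] = plane
    stim_data_df.at[row_idx, "mouse_ns"] = sorted(grp_df["mouse_ns"].unique().tolist())

print(stim_data_df)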