예제 #1
0
파일: pca.py 프로젝트: schudds/tedana
def tedpca(data_cat,
           data_oc,
           combmode,
           mask,
           adaptive_mask,
           t2sG,
           ref_img,
           tes,
           algorithm='mdl',
           kdaw=10.,
           rdaw=1.,
           out_dir='.',
           verbose=False,
           low_mem=False):
    """
    Use principal components analysis (PCA) to identify and remove thermal
    noise from multi-echo data.

    Parameters
    ----------
    data_cat : (S x E x T) array_like
        Input functional data
    data_oc : (S x T) array_like
        Optimally combined time series data
    combmode : {'t2s', 'paid'} str
        How optimal combination of echos should be made, where 't2s' indicates
        using the method of Posse 1999 and 'paid' indicates using the method of
        Poser 2006.
        Not referenced in the body of this function.
    mask : (S,) array_like
        Boolean mask array
    adaptive_mask : (S,) array_like
        Array where each value indicates the number of echoes with good signal
        for that voxel. This mask may be thresholded; for example, with values
        less than 3 set to 0.
        For more information on thresholding, see `make_adaptive_mask`.
    t2sG : (S,) array_like
        Map of voxel-wise T2* estimates.
        Not referenced in the body of this function.
    ref_img : :obj:`str` or img_like
        Reference image to dictate how outputs are saved to disk
    tes : :obj:`list`
        List of echo times associated with `data_cat`, in milliseconds
    algorithm : {'kundu', 'kundu-stabilize', 'mdl', 'aic', 'kic', float}, optional
        Method with which to select components in TEDPCA. PCA
        decomposition with the mdl, kic and aic options are based on a Moving Average
        (stationary Gaussian) process and are ordered from most to least aggressive
        (see Li et al., 2007).
        If a float is provided, then it is assumed to represent percentage of variance
        explained (0-1) to retain from PCA.
        Default is 'mdl'.
    kdaw : :obj:`float`, optional
        Dimensionality augmentation weight for Kappa calculations. Must be a
        non-negative float, or -1 (a special value). Default is 10.
    rdaw : :obj:`float`, optional
        Dimensionality augmentation weight for Rho calculations. Must be a
        non-negative float, or -1 (a special value). Default is 1.
    out_dir : :obj:`str`, optional
        Output directory.
    verbose : :obj:`bool`, optional
        Whether to output files from fitmodels_direct or not. Default: False
    low_mem : :obj:`bool`, optional
        Whether to use incremental PCA (for low-memory systems) or not.
        This is only compatible with the "kundu" or "kundu-stabilize" algorithms;
        for the other algorithms a warning is logged and the flag is ignored.
        Default: False

    Returns
    -------
    kept_data : (S x T) :obj:`numpy.ndarray`
        Dimensionally reduced optimally combined functional data
    n_components : :obj:`int`
        Number of components retained from PCA decomposition

    Notes
    -----
    ======================    =================================================
    Notation                  Meaning
    ======================    =================================================
    :math:`\\kappa`            Component pseudo-F statistic for TE-dependent
                              (BOLD) model.
    :math:`\\rho`              Component pseudo-F statistic for TE-independent
                              (artifact) model.
    :math:`v`                 Voxel
    :math:`V`                 Total number of voxels in mask
    :math:`\\zeta`             Something
    :math:`c`                 Component
    :math:`p`                 Something else
    ======================    =================================================

    Steps:

    1.  Variance normalize either multi-echo or optimally combined data,
        depending on settings.
    2.  Decompose normalized data using PCA or SVD.
    3.  Compute :math:`{\\kappa}` and :math:`{\\rho}`:

            .. math::
                {\\kappa}_c = \\frac{\\sum_{v}^V {\\zeta}_{c,v}^p * \
                      F_{c,v,R_2^*}}{\\sum {\\zeta}_{c,v}^p}

                {\\rho}_c = \\frac{\\sum_{v}^V {\\zeta}_{c,v}^p * \
                      F_{c,v,S_0}}{\\sum {\\zeta}_{c,v}^p}

    4.  Some other stuff. Something about elbows.
    5.  Classify components as thermal noise if they meet both of the
        following criteria:

            - Nonsignificant :math:`{\\kappa}` and :math:`{\\rho}`.
            - Nonsignificant variance explained.

    Outputs:

    This function writes out several files:

    ======================    =================================================
    Filename                  Content
    ======================    =================================================
    pca_decomposition.json    PCA component table.
    pca_mixing.tsv            PCA mixing matrix.
    pca_components.nii.gz     Component weight maps.
    ======================    =================================================

    See Also
    --------
    :func:`tedana.utils.make_adaptive_mask` : The function used to create the ``adaptive_mask``
                                              parameter.
    """
    # The decomposition branches below only reach ``low_mem_pca`` when
    # ``algorithm`` is neither a moving-average criterion nor a number, so
    # tell the user when the flag would otherwise be silently ignored.
    if low_mem and (algorithm in ['mdl', 'aic', 'kic'] or
                    isinstance(algorithm, Number)):
        LGR.warning('Low memory option is only compatible with the "kundu" '
                    'and "kundu-stabilize" algorithms. Ignoring it for '
                    'algorithm "{0}".'.format(algorithm))

    # Both Kundu variants cite the same paper.
    kundu_ref = ("Kundu, P., Brenowitz, N. D., Voon, V., Worbe, Y., "
                 "Vértes, P. E., Inati, S. J., ... & Bullmore, E. T. "
                 "(2013). Integrated strategy for improving functional "
                 "connectivity mapping using multiecho fMRI. Proceedings "
                 "of the National Academy of Sciences, 110(40), "
                 "16187-16192.")

    if algorithm == 'kundu':
        alg_str = ("followed by the Kundu component selection decision "
                   "tree (Kundu et al., 2013)")
        RefLGR.info(kundu_ref)
    elif algorithm == 'kundu-stabilize':
        alg_str = ("followed by the 'stabilized' Kundu component "
                   "selection decision tree (Kundu et al., 2013)")
        RefLGR.info(kundu_ref)
    elif isinstance(algorithm, Number):
        alg_str = (
            "in which the number of components was determined based on a "
            "variance explained threshold")
    else:
        # The trailing space after "Average" was missing, which glued the two
        # literals together in the generated report text.
        alg_str = (
            "based on the PCA component estimation with a Moving Average "
            "(stationary Gaussian) process (Li et al., 2007)")
        RefLGR.info("Li, Y.O., Adalı, T. and Calhoun, V.D., (2007). "
                    "Estimating the number of independent components for "
                    "functional magnetic resonance imaging data. "
                    "Human brain mapping, 28(11), pp.1251-1266.")

    RepLGR.info("Principal component analysis {0} was applied to "
                "the optimally combined data for dimensionality "
                "reduction.".format(alg_str))

    # Unpacking doubles as a check that data_cat is three-dimensional.
    _, n_echos, n_vols = data_cat.shape

    LGR.info('Computing PCA of optimally combined multi-echo data')
    data = data_oc[mask, :]

    data_z = ((data.T - data.T.mean(axis=0)) /
              data.T.std(axis=0)).T  # var normalize ts
    data_z = (data_z -
              data_z.mean()) / data_z.std()  # var normalize everything

    if algorithm in ['mdl', 'aic', 'kic']:
        # ma_pca works on images, so the (non-normalized) masked data are put
        # back into image space first.
        data_img = io.new_nii_like(ref_img, utils.unmask(data, mask))
        mask_img = io.new_nii_like(ref_img, mask.astype(int))
        voxel_comp_weights, varex, varex_norm, comp_ts = ma_pca.ma_pca(
            data_img, mask_img, algorithm)
    elif isinstance(algorithm, Number):
        ppca = PCA(copy=False, n_components=algorithm, svd_solver="full")
        ppca.fit(data_z)
        comp_ts = ppca.components_.T
        varex = ppca.explained_variance_
        voxel_comp_weights = np.dot(np.dot(data_z, comp_ts),
                                    np.diag(1. / varex))
        varex_norm = varex / varex.sum()
    elif low_mem:
        voxel_comp_weights, varex, comp_ts = low_mem_pca(data_z)
        varex_norm = varex / varex.sum()
    else:
        ppca = PCA(copy=False, n_components=(n_vols - 1))
        ppca.fit(data_z)
        comp_ts = ppca.components_.T
        varex = ppca.explained_variance_
        voxel_comp_weights = np.dot(np.dot(data_z, comp_ts),
                                    np.diag(1. / varex))
        varex_norm = varex / varex.sum()

    # Compute Kappa and Rho for PCA comps
    # Normalize each component's time series
    vTmixN = stats.zscore(comp_ts, axis=0)
    comptable, _, _, _ = metrics.dependence_metrics(data_cat,
                                                    data_oc,
                                                    comp_ts,
                                                    adaptive_mask,
                                                    tes,
                                                    ref_img,
                                                    reindex=False,
                                                    mmixN=vTmixN,
                                                    algorithm=None,
                                                    label='mepca_',
                                                    out_dir=out_dir,
                                                    verbose=verbose)

    # varex_norm from PCA overrides varex_norm from dependence_metrics,
    # but we retain the original
    comptable['estimated normalized variance explained'] = \
        comptable['normalized variance explained']
    comptable['normalized variance explained'] = varex_norm

    # write component maps to 4D image
    comp_ts_z = stats.zscore(comp_ts, axis=0)
    comp_maps = utils.unmask(computefeats2(data_oc, comp_ts_z, mask), mask)
    io.filewrite(comp_maps, op.join(out_dir, 'pca_components.nii.gz'), ref_img)

    # Select components using decision tree
    if algorithm == 'kundu':
        comptable = kundu_tedpca(comptable,
                                 n_echos,
                                 kdaw,
                                 rdaw,
                                 stabilize=False)
    elif algorithm == 'kundu-stabilize':
        comptable = kundu_tedpca(comptable,
                                 n_echos,
                                 kdaw,
                                 rdaw,
                                 stabilize=True)
    else:
        # The decomposition itself already fixed the dimensionality, so every
        # returned component is accepted.
        alg_str = "variance explained-based" if isinstance(
            algorithm, Number) else algorithm
        LGR.info('Selected {0} components with {1} dimensionality '
                 'detection'.format(comptable.shape[0], alg_str))
        comptable['classification'] = 'accepted'
        comptable['rationale'] = ''

    # Save decomposition
    comp_names = [
        io.add_decomp_prefix(comp,
                             prefix='pca',
                             max_value=comptable.index.max())
        for comp in comptable.index.values
    ]

    mixing_df = pd.DataFrame(data=comp_ts, columns=comp_names)
    mixing_df.to_csv(op.join(out_dir, 'pca_mixing.tsv'), sep='\t', index=False)

    comptable['Description'] = 'PCA fit to optimally combined data.'
    mmix_dict = {}
    mmix_dict['Method'] = ('Principal components analysis implemented by '
                           'sklearn. Components are sorted by variance '
                           'explained in descending order. '
                           'Component signs are flipped to best match the '
                           'data.')
    io.save_comptable(comptable,
                      op.join(out_dir, 'pca_decomposition.json'),
                      label='pca',
                      metadata=mmix_dict)

    # Rebuild the data from the accepted components only.
    acc = comptable[comptable.classification == 'accepted'].index.values
    n_components = acc.size
    voxel_kept_comp_weighted = (voxel_comp_weights[:, acc] * varex[None, acc])
    kept_data = np.dot(voxel_kept_comp_weighted, comp_ts[:, acc].T)

    kept_data = stats.zscore(kept_data,
                             axis=1)  # variance normalize time series
    kept_data = stats.zscore(kept_data,
                             axis=None)  # variance normalize everything

    return kept_data, n_components
예제 #2
0
파일: pca.py 프로젝트: azitennis50/tedana
def tedpca(data_cat, data_oc, combmode, mask, t2s, t2sG,
           ref_img, tes, algorithm='mdl', source_tes=-1, kdaw=10., rdaw=1.,
           out_dir='.', verbose=False, low_mem=False):
    """
    Use principal components analysis (PCA) to identify and remove thermal
    noise from multi-echo data.

    Parameters
    ----------
    data_cat : (S x E x T) array_like
        Input functional data
    data_oc : (S x T) array_like
        Optimally combined time series data
    combmode : {'t2s', 'paid'} str
        How optimal combination of echos should be made, where 't2s' indicates
        using the method of Posse 1999 and 'paid' indicates using the method of
        Poser 2006.
        Not referenced in the body of this function.
    mask : (S,) array_like
        Boolean mask array
    t2s : (S,) array_like
        Map of voxel-wise T2* estimates. Forwarded to
        ``metrics.dependence_metrics``.
    t2sG : (S,) array_like
        Map of voxel-wise T2* estimates.
        Not referenced in the body of this function.
    ref_img : :obj:`str` or img_like
        Reference image to dictate how outputs are saved to disk
    tes : :obj:`list`
        List of echo times associated with `data_cat`, in milliseconds
    algorithm : {'mle', 'kundu', 'kundu-stabilize', 'mdl', 'aic', 'kic'}, optional
        Method with which to select components in TEDPCA. Default is 'mdl'. PCA
        decomposition with the mdl, kic and aic options are based on a Moving Average
        (stationary Gaussian) process and are ordered from most to least aggressive.
        See (Li et al., 2007).
    source_tes : :obj:`int`, :obj:`str` or :obj:`list` of :obj:`int`, optional
        Which echos to use in PCA. Values -1 and 0 are special, where a value
        of -1 will indicate using the optimal combination of the echos
        and 0  will indicate using all the echos. A list (or a
        comma-separated string such as ``'1,2'``) can be provided
        to indicate a subset of echos, numbered from 1.
        Default: -1
    kdaw : :obj:`float`, optional
        Dimensionality augmentation weight for Kappa calculations. Must be a
        non-negative float, or -1 (a special value). Default is 10.
    rdaw : :obj:`float`, optional
        Dimensionality augmentation weight for Rho calculations. Must be a
        non-negative float, or -1 (a special value). Default is 1.
    out_dir : :obj:`str`, optional
        Output directory.
    verbose : :obj:`bool`, optional
        Whether to output files from fitmodels_direct or not. Default: False
    low_mem : :obj:`bool`, optional
        Whether to use incremental PCA (for low-memory systems) or not.
        If combined with ``algorithm='mle'``, a warning is logged and the
        algorithm is switched to 'kundu'.
        Default: False

    Returns
    -------
    kept_data : (S x T) :obj:`numpy.ndarray`
        Dimensionally reduced optimally combined functional data
    n_components : :obj:`int`
        Number of components retained from PCA decomposition

    Notes
    -----
    ======================    =================================================
    Notation                  Meaning
    ======================    =================================================
    :math:`\\kappa`            Component pseudo-F statistic for TE-dependent
                              (BOLD) model.
    :math:`\\rho`              Component pseudo-F statistic for TE-independent
                              (artifact) model.
    :math:`v`                 Voxel
    :math:`V`                 Total number of voxels in mask
    :math:`\\zeta`             Something
    :math:`c`                 Component
    :math:`p`                 Something else
    ======================    =================================================

    Steps:

    1.  Variance normalize either multi-echo or optimally combined data,
        depending on settings.
    2.  Decompose normalized data using PCA or SVD.
    3.  Compute :math:`{\\kappa}` and :math:`{\\rho}`:

            .. math::
                {\\kappa}_c = \\frac{\\sum_{v}^V {\\zeta}_{c,v}^p * \
                      F_{c,v,R_2^*}}{\\sum {\\zeta}_{c,v}^p}

                {\\rho}_c = \\frac{\\sum_{v}^V {\\zeta}_{c,v}^p * \
                      F_{c,v,S_0}}{\\sum {\\zeta}_{c,v}^p}

    4.  Some other stuff. Something about elbows.
    5.  Classify components as thermal noise if they meet both of the
        following criteria:

            - Nonsignificant :math:`{\\kappa}` and :math:`{\\rho}`.
            - Nonsignificant variance explained.

    Outputs:

    This function writes out several files:

    ======================    =================================================
    Filename                  Content
    ======================    =================================================
    pca_decomposition.json    PCA component table.
    pca_mixing.tsv            PCA mixing matrix.
    pca_components.nii.gz     Component weight maps.
    ======================    =================================================
    """
    if low_mem and algorithm == 'mle':
        LGR.warning('Low memory option is not compatible with MLE '
                    'dimensionality estimation. Switching to Kundu decision '
                    'tree.')
        algorithm = 'kundu'

    # Both Kundu variants cite the same paper.
    kundu_ref = ("Kundu, P., Brenowitz, N. D., Voon, V., Worbe, Y., "
                 "Vértes, P. E., Inati, S. J., ... & Bullmore, E. T. "
                 "(2013). Integrated strategy for improving functional "
                 "connectivity mapping using multiecho fMRI. Proceedings "
                 "of the National Academy of Sciences, 110(40), "
                 "16187-16192.")

    if algorithm == 'mle':
        alg_str = "using MLE dimensionality estimation (Minka, 2001)"
        RefLGR.info("Minka, T. P. (2001). Automatic choice of dimensionality "
                    "for PCA. In Advances in neural information processing "
                    "systems (pp. 598-604).")
    elif algorithm == 'kundu':
        alg_str = ("followed by the Kundu component selection decision "
                   "tree (Kundu et al., 2013)")
        RefLGR.info(kundu_ref)
    elif algorithm == 'kundu-stabilize':
        alg_str = ("followed by the 'stabilized' Kundu component "
                   "selection decision tree (Kundu et al., 2013)")
        RefLGR.info(kundu_ref)
    else:
        # The trailing space after "Average" was missing, which glued the two
        # literals together in the generated report text.
        alg_str = ("based on the PCA component estimation with a Moving Average "
                   "(stationary Gaussian) process (Li et al., 2007)")
        RefLGR.info("Li, Y.O., Adalı, T. and Calhoun, V.D., (2007). "
                    "Estimating the number of independent components for "
                    "functional magnetic resonance imaging data. "
                    "Human brain mapping, 28(11), pp.1251-1266.")

    # Normalize source_tes to a 1D integer array *before* inspecting it, so
    # that an int, a comma-separated string and a list are all handled the
    # same way and the report text agrees with the data actually decomposed.
    if isinstance(source_tes, str):
        source_tes = np.array([int(ee) for ee in source_tes.split(',')])
    else:
        source_tes = np.atleast_1d(source_tes).astype(int)
    use_optcom = len(source_tes) == 1 and source_tes[0] == -1
    use_all_echoes = len(source_tes) == 1 and source_tes[0] == 0

    if use_optcom:
        dat_str = "the optimally combined data"
    elif use_all_echoes:
        dat_str = "the z-concatenated multi-echo data"
    else:
        dat_str = "a z-concatenated subset of echoes from the input data"

    RepLGR.info("Principal component analysis {0} was applied to "
                "{1} for dimensionality reduction.".format(alg_str, dat_str))

    # Unpacking doubles as a check that data_cat is three-dimensional.
    _, n_echos, n_vols = data_cat.shape

    if use_optcom:
        LGR.info('Computing PCA of optimally combined multi-echo data')
        data = data_oc[mask, :][:, np.newaxis, :]
    elif use_all_echoes:
        LGR.info('Computing PCA of spatially concatenated multi-echo data')
        data = data_cat[mask, ...]
    else:
        LGR.info('Computing PCA of echo #{0}'.format(','.join([str(ee) for ee in source_tes])))
        # Echo numbers are 1-based; shift them to array indices.
        data = np.stack([data_cat[mask, ee, :] for ee in source_tes - 1], axis=1)

    eim = np.squeeze(_utils.eimask(data))
    data = np.squeeze(data[eim])

    data_z = ((data.T - data.T.mean(axis=0)) / data.T.std(axis=0)).T  # var normalize ts
    data_z = (data_z - data_z.mean()) / data_z.std()  # var normalize everything

    if algorithm in ['mdl', 'aic', 'kic']:
        # ma_pca works on images, so the (non-normalized) data and the
        # eimask-derived mask are put back into image space first.
        data_img = io.new_nii_like(
            ref_img, utils.unmask(utils.unmask(data, eim), mask))
        mask_img = io.new_nii_like(ref_img,
                                   utils.unmask(eim, mask).astype(int))
        voxel_comp_weights, varex, varex_norm, comp_ts = ma_pca.ma_pca(
            data_img, mask_img, algorithm)
    elif algorithm == 'mle':
        voxel_comp_weights, varex, varex_norm, comp_ts = run_mlepca(data_z)
    elif low_mem:
        voxel_comp_weights, varex, comp_ts = low_mem_pca(data_z)
        varex_norm = varex / varex.sum()
    else:
        ppca = PCA(copy=False, n_components=(n_vols - 1))
        ppca.fit(data_z)
        comp_ts = ppca.components_.T
        varex = ppca.explained_variance_
        voxel_comp_weights = np.dot(np.dot(data_z, comp_ts),
                                    np.diag(1. / varex))
        varex_norm = varex / varex.sum()

    # Compute Kappa and Rho for PCA comps
    # Normalize each component's time series
    vTmixN = stats.zscore(comp_ts, axis=0)
    comptable, _, _, _ = metrics.dependence_metrics(data_cat,
                                                    data_oc,
                                                    comp_ts,
                                                    t2s,
                                                    tes,
                                                    ref_img,
                                                    reindex=False,
                                                    mmixN=vTmixN,
                                                    algorithm=None,
                                                    label='mepca_',
                                                    out_dir=out_dir,
                                                    verbose=verbose)

    # varex_norm from PCA overrides varex_norm from dependence_metrics,
    # but we retain the original
    comptable['estimated normalized variance explained'] = \
        comptable['normalized variance explained']
    comptable['normalized variance explained'] = varex_norm

    # write component maps to 4D image
    comp_ts_z = stats.zscore(comp_ts, axis=0)
    comp_maps = utils.unmask(computefeats2(data_oc, comp_ts_z, mask), mask)
    io.filewrite(comp_maps, op.join(out_dir, 'pca_components.nii.gz'), ref_img)

    # Select components using decision tree
    if algorithm == 'kundu':
        comptable = kundu_tedpca(comptable, n_echos, kdaw, rdaw, stabilize=False)
    elif algorithm == 'kundu-stabilize':
        comptable = kundu_tedpca(comptable, n_echos, kdaw, rdaw, stabilize=True)
    elif algorithm == 'mle':
        LGR.info('Selected {0} components with MLE dimensionality '
                 'detection'.format(comptable.shape[0]))
        comptable['classification'] = 'accepted'
        comptable['rationale'] = ''

    elif algorithm in ['mdl', 'aic', 'kic']:
        LGR.info('Selected {0} components with {1} dimensionality '
                 'detection'.format(comptable.shape[0], algorithm))
        comptable['classification'] = 'accepted'
        comptable['rationale'] = ''

    # Save decomposition
    comp_names = [io.add_decomp_prefix(comp, prefix='pca', max_value=comptable.index.max())
                  for comp in comptable.index.values]

    mixing_df = pd.DataFrame(data=comp_ts, columns=comp_names)
    mixing_df.to_csv(op.join(out_dir, 'pca_mixing.tsv'), sep='\t', index=False)

    # source_tes is an array at this point; comparing it to -1 directly would
    # raise for a multi-echo subset, so reuse the flag computed above.
    data_type = 'optimally combined data' if use_optcom else 'z-concatenated data'
    comptable['Description'] = 'PCA fit to {0}.'.format(data_type)
    mmix_dict = {}
    mmix_dict['Method'] = ('Principal components analysis implemented by '
                           'sklearn. Components are sorted by variance '
                           'explained in descending order. '
                           'Component signs are flipped to best match the '
                           'data.')
    io.save_comptable(comptable, op.join(out_dir, 'pca_decomposition.json'),
                      label='pca', metadata=mmix_dict)

    # Rebuild the data from the accepted components only.
    acc = comptable[comptable.classification == 'accepted'].index.values
    n_components = acc.size
    voxel_kept_comp_weighted = (voxel_comp_weights[:, acc] * varex[None, acc])
    kept_data = np.dot(voxel_kept_comp_weighted, comp_ts[:, acc].T)

    kept_data = stats.zscore(kept_data, axis=1)  # variance normalize time series
    kept_data = stats.zscore(kept_data, axis=None)  # variance normalize everything

    return kept_data, n_components
예제 #3
0
파일: pca.py 프로젝트: thinkingg/tedana
def tedpca(data_cat, data_oc, combmode, mask, t2s, t2sG,
           ref_img, tes, algorithm='mle', source_tes=-1, kdaw=10., rdaw=1.,
           out_dir='.', verbose=False, low_mem=False):
    """
    Use principal components analysis (PCA) to identify and remove thermal
    noise from multi-echo data.

    Parameters
    ----------
    data_cat : (S x E x T) array_like
        Input functional data
    data_oc : (S x T) array_like
        Optimally combined time series data
    combmode : {'t2s', 'paid'} str
        How optimal combination of echos should be made, where 't2s' indicates
        using the method of Posse 1999 and 'paid' indicates using the method of
        Poser 2006.
        Not referenced in the body of this function.
    mask : (S,) array_like
        Boolean mask array
    t2s : (S,) array_like
        Map of voxel-wise T2* estimates. Forwarded to
        ``metrics.dependence_metrics``.
    t2sG : (S,) array_like
        Map of voxel-wise T2* estimates.
        Not referenced in the body of this function.
    ref_img : :obj:`str` or img_like
        Reference image to dictate how outputs are saved to disk
    tes : :obj:`list`
        List of echo times associated with `data_cat`, in milliseconds
    algorithm : {'mle', 'kundu', 'kundu-stabilize'}, optional
        Method with which to select components in TEDPCA. Default is 'mle'.
    source_tes : :obj:`int`, :obj:`str` or :obj:`list` of :obj:`int`, optional
        Which echos to use in PCA. Values -1 and 0 are special, where a value
        of -1 will indicate using the optimal combination of the echos
        and 0  will indicate using all the echos. A list (or a
        comma-separated string such as ``'1,2'``) can be provided
        to indicate a subset of echos, numbered from 1.
        Default: -1
    kdaw : :obj:`float`, optional
        Dimensionality augmentation weight for Kappa calculations. Must be a
        non-negative float, or -1 (a special value). Default is 10.
    rdaw : :obj:`float`, optional
        Dimensionality augmentation weight for Rho calculations. Must be a
        non-negative float, or -1 (a special value). Default is 1.
    out_dir : :obj:`str`, optional
        Output directory. The files listed under "Outputs" are written here,
        and the value is also forwarded to ``metrics.dependence_metrics``.
        Default: '.'
    verbose : :obj:`bool`, optional
        Whether to output files from fitmodels_direct or not. Default: False
    low_mem : :obj:`bool`, optional
        Whether to use incremental PCA (for low-memory systems) or not.
        If combined with ``algorithm='mle'``, a warning is logged and the
        algorithm is switched to 'kundu'.
        Default: False

    Returns
    -------
    kept_data : (S x T) :obj:`numpy.ndarray`
        Dimensionally reduced optimally combined functional data
    n_components : :obj:`int`
        Number of components retained from PCA decomposition

    Notes
    -----
    ======================    =================================================
    Notation                  Meaning
    ======================    =================================================
    :math:`\\kappa`            Component pseudo-F statistic for TE-dependent
                              (BOLD) model.
    :math:`\\rho`              Component pseudo-F statistic for TE-independent
                              (artifact) model.
    :math:`v`                 Voxel
    :math:`V`                 Total number of voxels in mask
    :math:`\\zeta`             Something
    :math:`c`                 Component
    :math:`p`                 Something else
    ======================    =================================================

    Steps:

    1.  Variance normalize either multi-echo or optimally combined data,
        depending on settings.
    2.  Decompose normalized data using PCA or SVD.
    3.  Compute :math:`{\\kappa}` and :math:`{\\rho}`:

            .. math::
                {\\kappa}_c = \\frac{\\sum_{v}^V {\\zeta}_{c,v}^p * \
                      F_{c,v,R_2^*}}{\\sum {\\zeta}_{c,v}^p}

                {\\rho}_c = \\frac{\\sum_{v}^V {\\zeta}_{c,v}^p * \
                      F_{c,v,S_0}}{\\sum {\\zeta}_{c,v}^p}

    4.  Some other stuff. Something about elbows.
    5.  Classify components as thermal noise if they meet both of the
        following criteria:

            - Nonsignificant :math:`{\\kappa}` and :math:`{\\rho}`.
            - Nonsignificant variance explained.

    Outputs:

    This function writes out several files to ``out_dir``:

    ==========================    =============================================
    Filename                      Content
    ==========================    =============================================
    mepca_mix.1D                  PCA mixing matrix.
    mepca_OC_components.nii       Component weight maps.
    comp_table_pca.txt            PCA component table.
    ==========================    =============================================
    """
    # Imported locally so the block does not depend on how (or whether) the
    # enclosing module imports os.path.
    import os.path as op

    if low_mem and algorithm == 'mle':
        LGR.warning('Low memory option is not compatible with MLE '
                    'dimensionality estimation. Switching to Kundu decision '
                    'tree.')
        algorithm = 'kundu'

    # Unpacking doubles as a check that data_cat is three-dimensional.
    _, n_echos, n_vols = data_cat.shape

    # Normalize source_tes to a 1D integer array so that an int, a
    # comma-separated string and a list are all handled the same way.
    if isinstance(source_tes, str):
        source_tes = np.array([int(ee) for ee in source_tes.split(',')])
    else:
        source_tes = np.atleast_1d(source_tes).astype(int)

    if len(source_tes) == 1 and source_tes[0] == -1:
        LGR.info('Computing PCA of optimally combined multi-echo data')
        data = data_oc[mask, :][:, np.newaxis, :]
    elif len(source_tes) == 1 and source_tes[0] == 0:
        LGR.info('Computing PCA of spatially concatenated multi-echo data')
        data = data_cat[mask, ...]
    else:
        LGR.info('Computing PCA of echo #{0}'.format(','.join([str(ee) for ee in source_tes])))
        # Echo numbers are 1-based; shift them to array indices.
        data = np.stack([data_cat[mask, ee, :] for ee in source_tes - 1], axis=1)

    eim = np.squeeze(eimask(data))
    data = np.squeeze(data[eim])

    data_z = ((data.T - data.T.mean(axis=0)) / data.T.std(axis=0)).T  # var normalize ts
    data_z = (data_z - data_z.mean()) / data_z.std()  # var normalize everything

    if algorithm == 'mle':
        voxel_comp_weights, varex, varex_norm, comp_ts = run_mlepca(data_z)
    elif low_mem:
        voxel_comp_weights, varex, comp_ts = low_mem_pca(data_z)
        varex_norm = varex / varex.sum()
    else:
        ppca = PCA(copy=False, n_components=(n_vols - 1))
        ppca.fit(data_z)
        comp_ts = ppca.components_.T
        varex = ppca.explained_variance_
        voxel_comp_weights = np.dot(np.dot(data_z, comp_ts),
                                    np.diag(1. / varex))
        varex_norm = varex / varex.sum()

    # Compute Kappa and Rho for PCA comps
    # Normalize each component's time series
    vTmixN = stats.zscore(comp_ts, axis=0)
    comptable, _, _, _ = metrics.dependence_metrics(
                data_cat, data_oc, comp_ts, t2s, tes, ref_img,
                reindex=False, mmixN=vTmixN, algorithm=None,
                label='mepca_', out_dir=out_dir, verbose=verbose)

    # varex_norm from PCA overrides varex_norm from dependence_metrics,
    # but we retain the original
    comptable['estimated normalized variance explained'] = \
        comptable['normalized variance explained']
    comptable['normalized variance explained'] = varex_norm

    # Honor out_dir for the files written here, matching what is passed to
    # dependence_metrics above (identical to the old behavior for the
    # default out_dir='.').
    np.savetxt(op.join(out_dir, 'mepca_mix.1D'), comp_ts)

    # write component maps to 4D image
    # Each component is fit on its own, one column at a time.
    comp_maps = np.zeros((data_oc.shape[0], comp_ts.shape[1]))
    for i_comp in range(comp_ts.shape[1]):
        temp_comp_ts = comp_ts[:, i_comp][:, None]
        comp_map = utils.unmask(computefeats2(data_oc, temp_comp_ts, mask), mask)
        comp_maps[:, i_comp] = np.squeeze(comp_map)
    io.filewrite(comp_maps, op.join(out_dir, 'mepca_OC_components.nii'), ref_img)

    # Select components using decision tree
    if algorithm == 'kundu':
        comptable = kundu_tedpca(comptable, n_echos, kdaw, rdaw, stabilize=False)
    elif algorithm == 'kundu-stabilize':
        comptable = kundu_tedpca(comptable, n_echos, kdaw, rdaw, stabilize=True)
    elif algorithm == 'mle':
        LGR.info('Selected {0} components with MLE dimensionality '
                 'detection'.format(comptable.shape[0]))
        comptable['classification'] = 'accepted'
        comptable['rationale'] = ''

    comptable.to_csv(op.join(out_dir, 'comp_table_pca.txt'), sep='\t', index=True,
                     index_label='component', float_format='%.6f')

    # Rebuild the data from the accepted components only.
    acc = comptable[comptable.classification == 'accepted'].index.values
    n_components = acc.size
    voxel_kept_comp_weighted = (voxel_comp_weights[:, acc] *
                                varex[None, acc])
    kept_data = np.dot(voxel_kept_comp_weighted, comp_ts[:, acc].T)

    kept_data = stats.zscore(kept_data, axis=1)  # variance normalize time series
    kept_data = stats.zscore(kept_data, axis=None)  # variance normalize everything

    return kept_data, n_components
예제 #4
0
파일: pca.py 프로젝트: handwerkerd/tedana
def tedpca(
    data_cat,
    data_oc,
    combmode,
    mask,
    adaptive_mask,
    t2sG,
    io_generator,
    tes,
    algorithm="aic",
    kdaw=10.0,
    rdaw=1.0,
    verbose=False,
    low_mem=False,
):
    """
    Use principal components analysis (PCA) to identify and remove thermal
    noise from multi-echo data.

    Parameters
    ----------
    data_cat : (S x E x T) array_like
        Input functional data
    data_oc : (S x T) array_like
        Optimally combined time series data
    combmode : {'t2s', 'paid'} str
        How optimal combination of echos should be made, where 't2s' indicates
        using the method of Posse 1999 and 'paid' indicates using the method of
        Poser 2006.
        Currently not read by this function; kept for interface compatibility.
    mask : (S,) array_like
        Boolean mask array
    adaptive_mask : (S,) array_like
        Array where each value indicates the number of echoes with good signal
        for that voxel. This mask may be thresholded; for example, with values
        less than 3 set to 0.
        For more information on thresholding, see `make_adaptive_mask`.
    t2sG : (S,) array_like
        Map of voxel-wise T2* estimates.
        Currently not read by this function; kept for interface compatibility.
    io_generator : :obj:`tedana.io.OutputGenerator`
        The output generation object for this workflow
    tes : :obj:`list`
        List of echo times associated with `data_cat`, in milliseconds
    algorithm : {'kundu', 'kundu-stabilize', 'mdl', 'aic', 'kic', float, int}, optional
        Method with which to select components in TEDPCA. PCA
        decomposition with the mdl, kic and aic options are based on a Moving Average
        (stationary Gaussian) process and are ordered from most to least aggressive
        (see Li et al., 2007).
        If a float is provided, then it is assumed to represent percentage of variance
        explained (0-1) to retain from PCA.
        If an int is provided, then it is assumed to be the number of components
        to select
        Default is 'aic'.
    kdaw : :obj:`float`, optional
        Dimensionality augmentation weight for Kappa calculations. Must be a
        non-negative float, or -1 (a special value). Default is 10.
    rdaw : :obj:`float`, optional
        Dimensionality augmentation weight for Rho calculations. Must be a
        non-negative float, or -1 (a special value). Default is 1.
    verbose : :obj:`bool`, optional
        Whether to output files from fitmodels_direct or not. Default: False.
        Currently not read by this function; kept for interface compatibility.
    low_mem : :obj:`bool`, optional
        Whether to use incremental PCA (for low-memory systems) or not.
        This is only compatible with the "kundu" or "kundu-stabilize" algorithms;
        it is ignored when `algorithm` is 'mdl', 'aic', 'kic' or a number.
        Default: False

    Returns
    -------
    kept_data : (S x T) :obj:`numpy.ndarray`
        Dimensionally reduced optimally combined functional data
    n_components : :obj:`int`
        Number of components retained from PCA decomposition

    Notes
    -----
    ======================    =================================================
    Notation                  Meaning
    ======================    =================================================
    :math:`\\kappa`            Component pseudo-F statistic for TE-dependent
                              (BOLD) model.
    :math:`\\rho`              Component pseudo-F statistic for TE-independent
                              (artifact) model.
    :math:`v`                 Voxel
    :math:`V`                 Total number of voxels in mask
    :math:`\\zeta`             Something
    :math:`c`                 Component
    :math:`p`                 Something else
    ======================    =================================================

    Steps:

    1.  Variance normalize the masked optimally combined data.
    2.  Decompose the data using moving-average PCA ('mdl', 'aic', 'kic'),
        PCA with a requested number of components / variance threshold
        (numeric `algorithm`), or a full PCA (optionally the low-memory
        variant) for the Kundu decision trees.
    3.  Compute :math:`{\\kappa}` and :math:`{\\rho}`:

            .. math::
                {\\kappa}_c = \\frac{\\sum_{v}^V {\\zeta}_{c,v}^p * \
                      F_{c,v,R_2^*}}{\\sum {\\zeta}_{c,v}^p}

                {\\rho}_c = \\frac{\\sum_{v}^V {\\zeta}_{c,v}^p * \
                      F_{c,v,S_0}}{\\sum {\\zeta}_{c,v}^p}

    4.  Some other stuff. Something about elbows.
    5.  Classify components as thermal noise if they meet both of the
        following criteria:

            - Nonsignificant :math:`{\\kappa}` and :math:`{\\rho}`.
            - Nonsignificant variance explained.

        This classification is only performed for the 'kundu' and
        'kundu-stabilize' algorithms; for every other algorithm all
        components returned by the decomposition are accepted.

    Outputs:

    This function writes out several files:

    ===========================    =============================================
    Default Filename               Content
    ===========================    =============================================
    desc-PCA_metrics.tsv           PCA component table
    desc-PCA_metrics.json          Metadata sidecar file describing the
                                   component table
    desc-PCA_mixing.tsv            PCA mixing matrix
    desc-PCA_components.nii.gz     Component weight maps
    desc-PCA_decomposition.json    Metadata sidecar file describing the PCA
                                   decomposition
    ===========================    =============================================

    See Also
    --------
    :func:`tedana.utils.make_adaptive_mask` : The function used to create
        the ``adaptive_mask`` parameter.
    :py:mod:`tedana.constants` : The module describing the filenames for
        various naming conventions
    """
    # Both Kundu variants cite the same paper; keep the citation in one place.
    kundu_reference = (
        "Kundu, P., Brenowitz, N. D., Voon, V., Worbe, Y., "
        "Vértes, P. E., Inati, S. J., ... & Bullmore, E. T. "
        "(2013). Integrated strategy for improving functional "
        "connectivity mapping using multiecho fMRI. Proceedings "
        "of the National Academy of Sciences, 110(40), "
        "16187-16192."
    )

    # Build the fragment that describes the selection method in the report.
    if algorithm == "kundu":
        alg_str = "followed by the Kundu component selection decision tree (Kundu et al., 2013)"
        RefLGR.info(kundu_reference)
    elif algorithm == "kundu-stabilize":
        alg_str = (
            "followed by the 'stabilized' Kundu component "
            "selection decision tree (Kundu et al., 2013)"
        )
        RefLGR.info(kundu_reference)
    elif isinstance(algorithm, Number):
        if isinstance(algorithm, float):
            alg_str = (
                "in which the number of components was determined based on a "
                "variance explained threshold"
            )
        else:
            alg_str = "in which the number of components is pre-defined"
    else:
        # The trailing space after "Average" matters: the two literals are
        # concatenated into a single sentence of the generated report.
        alg_str = (
            "based on the PCA component estimation with a Moving Average "
            "(stationary Gaussian) process (Li et al., 2007)"
        )
        RefLGR.info(
            "Li, Y.O., Adalı, T. and Calhoun, V.D., (2007). "
            "Estimating the number of independent components for "
            "functional magnetic resonance imaging data. "
            "Human brain mapping, 28(11), pp.1251-1266."
        )

    RepLGR.info(
        f"Principal component analysis {alg_str} was applied to "
        "the optimally combined data for dimensionality "
        "reduction."
    )

    # Only the echo and volume counts are needed below.
    _, n_echos, n_vols = data_cat.shape

    LGR.info(
        f"Computing PCA of optimally combined multi-echo data with selection criteria: {algorithm}"
    )
    data = data_oc[mask, :]

    data_z = ((data.T - data.T.mean(axis=0)) / data.T.std(axis=0)).T  # var normalize ts
    data_z = (data_z - data_z.mean()) / data_z.std()  # var normalize everything

    if algorithm in ["mdl", "aic", "kic"]:
        # Moving-average PCA receives the un-normalized masked data as images
        # and is asked to normalize internally (normalize=True).
        data_img = io.new_nii_like(io_generator.reference_img, utils.unmask(data, mask))
        mask_img = io.new_nii_like(io_generator.reference_img, mask.astype(int))
        ma_pca = MovingAveragePCA(criterion=algorithm, normalize=True)
        _ = ma_pca.fit_transform(data_img, mask_img)

        # Extract results from maPCA
        voxel_comp_weights = ma_pca.u_
        varex = ma_pca.explained_variance_
        varex_norm = ma_pca.explained_variance_ratio_
        comp_ts = ma_pca.components_.T
        aic = ma_pca.aic_
        kic = ma_pca.kic_
        mdl = ma_pca.mdl_
        varex_90 = ma_pca.varexp_90_
        varex_95 = ma_pca.varexp_95_
        all_comps = ma_pca.all_

        # Extract number of components and variance explained for logging and plotting
        n_aic = aic["n_components"]
        aic_varexp = np.round(aic["explained_variance_total"], 3)
        n_kic = kic["n_components"]
        kic_varexp = np.round(kic["explained_variance_total"], 3)
        n_mdl = mdl["n_components"]
        mdl_varexp = np.round(mdl["explained_variance_total"], 3)
        n_varex_90 = varex_90["n_components"]
        varex_90_varexp = np.round(varex_90["explained_variance_total"], 3)
        n_varex_95 = varex_95["n_components"]
        varex_95_varexp = np.round(varex_95["explained_variance_total"], 3)
        all_varex = np.round(all_comps["explained_variance_total"], 3)

        # Print out the results
        LGR.info("Optimal number of components based on different criteria:")
        LGR.info(
            f"AIC: {n_aic} | KIC: {n_kic} | MDL: {n_mdl} | 90% varexp: {n_varex_90} "
            f"| 95% varexp: {n_varex_95}"
        )

        LGR.info("Explained variance based on different criteria:")
        LGR.info(
            f"AIC: {aic_varexp}% | KIC: {kic_varexp}% | MDL: {mdl_varexp}% | "
            f"90% varexp: {varex_90_varexp}% | 95% varexp: {varex_95_varexp}%"
        )

        pca_optimization_curves = np.array([aic["value"], kic["value"], mdl["value"]])
        pca_criteria_components = np.array(
            [
                n_aic,
                n_kic,
                n_mdl,
                n_varex_90,
                n_varex_95,
            ]
        )

        # Plot maPCA optimization curves
        LGR.info("Plotting maPCA optimization curves")
        plot_pca_results(pca_optimization_curves, pca_criteria_components, all_varex, io_generator)

        # Save maPCA results into a dictionary
        mapca_results = {
            "aic": {
                "n_components": n_aic,
                "explained_variance_total": aic_varexp,
                "curve": aic["value"],
            },
            "kic": {
                "n_components": n_kic,
                "explained_variance_total": kic_varexp,
                "curve": kic["value"],
            },
            "mdl": {
                "n_components": n_mdl,
                "explained_variance_total": mdl_varexp,
                "curve": mdl["value"],
            },
            "varex_90": {
                "n_components": n_varex_90,
                "explained_variance_total": varex_90_varexp,
            },
            "varex_95": {
                "n_components": n_varex_95,
                "explained_variance_total": varex_95_varexp,
            },
        }

        # Save dictionary
        io_generator.save_file(mapca_results, "PCA cross component metrics json")

    elif isinstance(algorithm, Number):
        # Numeric selection: `algorithm` is handed straight to PCA as n_components.
        # NOTE(review): copy=False presumably lets the estimator overwrite data_z
        # in place, so the projection below uses that modified array — confirm
        # this is intended.
        ppca = PCA(copy=False, n_components=algorithm, svd_solver="full")
        ppca.fit(data_z)
        comp_ts = ppca.components_.T
        varex = ppca.explained_variance_
        # Project the data onto the components and scale each column by 1/varex;
        # the reconstruction at the end of the function multiplies varex back in.
        voxel_comp_weights = np.dot(np.dot(data_z, comp_ts), np.diag(1.0 / varex))
        varex_norm = ppca.explained_variance_ratio_
    elif low_mem:
        voxel_comp_weights, varex, varex_norm, comp_ts = low_mem_pca(data_z)
    else:
        # Full decomposition (n_vols - 1 components); selection happens later.
        ppca = PCA(copy=False, n_components=(n_vols - 1))
        ppca.fit(data_z)
        comp_ts = ppca.components_.T
        varex = ppca.explained_variance_
        voxel_comp_weights = np.dot(np.dot(data_z, comp_ts), np.diag(1.0 / varex))
        varex_norm = ppca.explained_variance_ratio_

    # Compute Kappa and Rho for PCA comps
    required_metrics = [
        "kappa",
        "rho",
        "countnoise",
        "countsigFT2",
        "countsigFS0",
        "dice_FT2",
        "dice_FS0",
        "signal-noise_t",
        "variance explained",
        "normalized variance explained",
        "d_table_score",
    ]
    comptable = metrics.collect.generate_metrics(
        data_cat,
        data_oc,
        comp_ts,
        adaptive_mask,
        tes,
        io_generator,
        "PCA",
        metrics=required_metrics,
    )

    # varex_norm from PCA overrides varex_norm from dependence_metrics,
    # but we retain the original
    comptable["estimated normalized variance explained"] = comptable[
        "normalized variance explained"
    ]
    comptable["normalized variance explained"] = varex_norm

    # write component maps to 4D image
    comp_maps = utils.unmask(computefeats2(data_oc, comp_ts, mask), mask)
    io_generator.save_file(comp_maps, "z-scored PCA components img")

    # Select components using decision tree
    if algorithm in ("kundu", "kundu-stabilize"):
        # The metadata returned here is discarded on purpose: it is regenerated
        # from the final component table before being saved below.
        comptable, _ = kundu_tedpca(
            comptable,
            n_echos,
            kdaw,
            rdaw,
            stabilize=(algorithm == "kundu-stabilize"),
        )
    else:
        # Mirror the Number/float dispatch used for the report text above so
        # that every numeric type gets a sensible log message.
        if isinstance(algorithm, float):
            selection_str = "variance explained-based"
        elif isinstance(algorithm, Number):
            selection_str = "a fixed number of components and no"
        else:
            selection_str = algorithm
        LGR.info(
            f"Selected {comptable.shape[0]} components with {round(100*varex_norm.sum(),2)}% "
            f"normalized variance explained using {selection_str} dimensionality detection"
        )
        # No decision tree is applied: every component is kept.
        comptable["classification"] = "accepted"
        comptable["rationale"] = ""

    # Save decomposition files
    comp_names = [
        io.add_decomp_prefix(comp, prefix="pca", max_value=comptable.index.max())
        for comp in comptable.index.values
    ]

    mixing_df = pd.DataFrame(data=comp_ts, columns=comp_names)
    io_generator.save_file(mixing_df, "PCA mixing tsv")

    # Save component table and associated json
    io_generator.save_file(comptable, "PCA metrics tsv")

    metric_metadata = metrics.collect.get_metadata(comptable)
    io_generator.save_file(metric_metadata, "PCA metrics json")

    decomp_metadata = {
        "Method": (
            "Principal components analysis implemented by sklearn. "
            "Components are sorted by variance explained in descending order. "
        ),
    }
    for comp_name in comp_names:
        decomp_metadata[comp_name] = {
            "Description": "PCA fit to optimally combined data.",
            "Method": "tedana",
        }
    io_generator.save_file(decomp_metadata, "PCA decomposition json")

    # Rebuild the data from the accepted components only.
    acc = comptable[comptable.classification == "accepted"].index.values
    n_components = acc.size
    voxel_kept_comp_weighted = voxel_comp_weights[:, acc] * varex[None, acc]
    kept_data = np.dot(voxel_kept_comp_weighted, comp_ts[:, acc].T)

    kept_data = stats.zscore(kept_data, axis=1)  # variance normalize time series
    kept_data = stats.zscore(kept_data, axis=None)  # variance normalize everything

    return kept_data, n_components