Exemplo n.º 1
0
def filter_probes(pacall, probes, threshold=0.5):
    """
    Performs intensity based filtering (IBF) of expression probes

    Uses binary indicator for expression levels in `pacall` to determine which
    probes have expression levels above background noise in `threshold` of
    samples across donors.

    Parameters
    ----------
    pacall : list
        List of PACall files from Allen Brain Institute. Optimally obtained by
        calling `abagen.fetch_microarray()` and accessing the `pacall`
        attribute on the resulting object
    probes : pandas.DataFrame
        Dataframe containing information on genetic probes that should be
        considered in filtering
    threshold : (0, 1) float, optional
        Threshold for filtering probes. Specifies the proportion of samples for
        which a given probe must have expression levels above background noise.
        Default: 0.5

    Returns
    -------
    filtered : pandas.DataFrame
        Dataframe containing information on probes that should be retained
        according to intensity-based filtering
    """

    probes = io.read_probes(probes)
    signal, samples = [], 0
    for fname in pacall:
        data = io.read_pacall(fname).loc[probes.index]
        samples += data.shape[-1]
        # sum binary expression indicator across samples for current subject
        signal.append(data.sum(axis=1).values)

    # calculate proportion of signal to noise for given probe across samples
    keep = (np.sum(signal, axis=0) / samples) > threshold

    # drop "bad" probes
    filtered = probes[keep]

    return filtered
Exemplo n.º 2
0
def reannotate_probes(probes):
    """
    Replaces gene symbols in `probes` with reannotated data

    Uses annotations from [1]_ to replace probe annotations shipped with AHBA
    data. Any probes that were unable to be matched to a gene in reannotation
    procedure are not retained.

    Parameters
    ----------
    probes : str or pandas.DataFrame
        Probe file or loaded probe dataframe from Allen Brain Institute.
        Optimally obtained by calling `abagen.fetch_microarray()` and accessing
        the `probes` attribute on the resulting object

    Returns
    -------
    reannotated : pandas.DataFrame


    References
    ----------
    .. [1] Arnatkeviciute, A., Fulcher, B. D., & Fornito, A. (2018). A
       practical guide to linking brain-wide gene expression and neuroimaging
       data. bioRxiv, 380089.
    """

    # load in reannotated probes
    reannot = resource_filename('abagen', 'data/reannotated.csv.gz')
    with gzip.open(reannot, 'r') as src:
        reannot = pd.read_csv(StringIO(src.read().decode('utf-8'))).iloc[:-1]
    reannot = reannot[['probe_name', 'gene_symbol', 'entrez_id']]

    # merge reannotated with original, keeping only reannotated
    probes = io.read_probes(probes).reset_index()[['probe_name', 'probe_id']]
    merged = pd.merge(reannot, probes, on='probe_name', how='left')

    # reset index as probe_id and sort
    reannotated = merged.set_index('probe_id').sort_index()

    return reannotated
Exemplo n.º 3
0
def get_expression_data(atlas,
                        atlas_info=None,
                        *,
                        exact=True,
                        tolerance=2,
                        metric='mean',
                        ibf_threshold=0.5,
                        corrected_mni=True,
                        reannotated=True,
                        return_counts=False,
                        return_donors=False,
                        donors='all',
                        data_dir=None):
    """
    Assigns microarray expression data to ROIs defined in `atlas`

    This function aims to provide a workflow for generating pre-processed,
    microarray expression data for abitrary `atlas` designations. First, some
    basic filtering of genetic probes is performed, including:

        1. Intensity-based filtering of microarray probes to remove probes that
           do not exceed a certain level of background noise (specified via the
           `ibf_threshold` parameter), and
        2. Selection of a single, representative probe for each gene via a
           differential stability metric, wherein the probe that has the most
           consistent regional variation across donors is retained.

    Tissue samples are then matched to parcels in the defined `atlas` for each
    donor. If `atlas_info` is provided then this matching is constrained by
    both hemisphere and tissue class designation (e.g., cortical samples from
    the left hemisphere are only matched to ROIs in the left cortex,
    subcortical samples from the right hemisphere are only matched to ROIs in
    the left subcortex); see the `atlas_info` parameter description for more
    information.

    Matching of microarray samples to parcels in `atlas` is done via a multi-
    step process:

        1. Determine if the sample falls directly within a parcel,
        2. Check to see if there are nearby parcels by slowly expanding the
           search space to include nearby voxels, up to a specified distance
           (specified via the `tolerance` parameter),
        3. If there are multiple nearby parcels, the sample is assigned to the
           closest parcel, as determined by the parcel centroid.

    If at any step a sample can be assigned to a parcel the matching process is
    terminated. If multiple sample are assigned to the same parcel they are
    aggregated with the metric specified via the `metric` parameter. More
    control over the sample matching can be obtained by setting the `exact`
    parameter; see the parameter description for more information.

    Once all samples have been matched to parcels for all supplied donors, the
    microarray expression data are normalized within-donor via a scaled robust
    sigmoid (SRS) procedure before being combined across donors via the
    supplied `metric`.

    Parameters
    ----------
    atlas : niimg-like object
        A parcellation image in MNI space, where each parcel is identified by a
        unique integer ID
    atlas_info : str or :class:`pandas.DataFrame`, optional
        Filepath to or pre-loaded dataframe containing information about
        `atlas`. Must have at least columns 'id', 'hemisphere', and 'structure'
        containing information mapping atlas IDs to hemisphere (i.e, "L", "R")
        and broad structural class (i.e., "cortex", "subcortex", "cerebellum").
        Default: None
    exact : bool, optional
        Whether to use exact matching of donor tissue samples to parcels in
        `atlas`. If True, this function will match tissue samples to parcels
        within `threshold` mm of the sample; any samples that are beyond
        `threshold` mm of a parcel will be discarded. This may result in some
        parcels having no assigned sample / expression data. If False, the
        default matching procedure will be performed and followed by a check
        for parcels with no assigned samples; any such parcels will be matched
        to the nearest sample (nearest defined as the sample with the closest
        Euclidean distance to the parcel centroid). Default: True
    tolerance : int, optional
        Distance (in mm) that a sample must be from a parcel for it to be
        matched to that parcel. This is only considered if the sample is not
        directly within a parcel. Default: 2
    metric : str or func, optional
        Mechanism by which to collapse across donors, if input `files` provides
        multiple donor datasets. If a str, should be in ['mean', 'median']; if
        a function, should be able to accept an `N`-dimensional input and the
        `axis` keyword argument and return an `N-1`-dimensional output.
        Default: 'mean'
    ibf_threshold : [0, 1] float, optional
        Threshold for intensity-based filtering specifying. This number should
        specify the ratio of samples, across all supplied donors, for which a
        probe must have signal above background noise in order to be retained.
        Default: 0.5
    corrected_mni : bool, optional
        Whether to use the "corrected" MNI coordinates shipped with the
        `alleninf` package instead of the coordinates provided with the AHBA
        data when matching tissue samples to anatomical regions. Default: True
    reannotated : bool, optional
        Whether to use reannotated probe information provided by [1]_ instead
        of the default probe information from the AHBA dataset. Using
        reannotated information will discard probes that could not be reliably
        matched to genes. Default: True
    return_counts : bool, optional
        Whether to return how many samples were assigned to each parcel in
        `atlas` for each donor. Default: False
    return_donors : bool, optional
        Whether to return donor-level expression arrays instead of aggregating
        expression across donors with provided `metric`. Default: False
    donors : list, optional
        List of donors to use as sources of expression data. Can be either
        donor numbers or UID. If not specified will use all available donors.
        Default: 'all'
    data_dir : str, optional
        Directory where expression data should be downloaded (if it does not
        already exist) / loaded. If not specified will use the current
        directory. Default: None

    Returns
    -------
    expression : (R, G) :class:`pandas.DataFrame`
        Microarray expression for `R` regions in `atlas` for `G` genes,
        aggregated across donors, where the index corresponds to the unique
        integer IDs of `atlas` and the columns are gene names.
    counts : (R, D) :class:`pandas.DataFrame`
        Number of samples assigned to each of `R` regions in `atlas` for each
        of `D` donors (if multiple donors were specified); only returned if
        `return_counts=True`.

    References
    ----------
    .. [1] Arnatkevic̆iūtė, A., Fulcher, B. D., & Fornito, A. (2019). A
       practical guide to linking brain-wide gene expression and neuroimaging
       data. NeuroImage, 189, 353-367.
    .. [2] Hawrylycz, M.J. et al. (2012) An anatomically comprehensive atlas of
       the adult human transcriptome. Nature, 489, 391-399.
    """

    # fetch files
    files = datasets.fetch_microarray(data_dir=data_dir, donors=donors)
    for key in ['microarray', 'probes', 'annotation', 'pacall', 'ontology']:
        if key not in files:
            raise KeyError('Provided `files` dictionary is missing {}. '
                           'Please check inputs.'.format(key))

    # load atlas_info, if provided
    atlas = check_niimg_3d(atlas)
    if atlas_info is not None:
        atlas_info = utils.check_atlas_info(atlas, atlas_info)

    # get combination functions
    metric = utils.check_metric(metric)

    # get some info on the number of subjects, labels in `atlas_img`
    num_subj = len(files.microarray)
    all_labels = utils.get_unique_labels(atlas)
    if not exact:
        centroids = utils.get_centroids(atlas, labels=all_labels)

    # reannotate probes based on updates from Arnatkeviciute et al., 2018 then
    # perform intensity-based filter of probes and select probe with highest
    # differential stability for each gene amongst remaining probes
    if reannotated:
        probes = process.reannotate_probes(files.probes[0])
    else:
        probes = io.read_probes(files.probes[0])
    probes = process.filter_probes(files.pacall,
                                   probes,
                                   threshold=ibf_threshold)
    probes = process.get_stable_probes(files.microarray, files.annotation,
                                       probes)

    expression, missing = [], []
    counts = pd.DataFrame(np.zeros((len(all_labels) + 1, num_subj)),
                          index=np.append([0], all_labels))
    for subj in range(num_subj):
        # get rid of samples whose coordinates don't match ontological profile
        annotation = process.drop_mismatch_samples(files.annotation[subj],
                                                   files.ontology[subj],
                                                   corrected=corrected_mni)

        # subset representative probes + samples from microarray data
        microarray = io.read_microarray(files.microarray[subj])
        samples = microarray.loc[probes.index, annotation.index].T
        samples.columns = probes.gene_symbol

        # assign samples to regions and aggregate samples w/i the same region
        sample_labels = label_samples(annotation,
                                      atlas,
                                      atlas_info=atlas_info,
                                      tolerance=tolerance)
        expression += [
            group_by_label(samples, sample_labels, all_labels, metric=metric)
        ]

        # get counts of samples collapsed into each ROI
        labs, num = np.unique(sample_labels, return_counts=True)
        counts.loc[labs, subj] = num

        # if we don't want to do exact matching then cache which parcels are
        # missing data and the expression data for the closest sample to that
        # parcel; we'll use this once we've iterated through all donors
        if not exact:
            coords = utils.xyz_to_ijk(annotation[['mni_x', 'mni_y', 'mni_z']],
                                      atlas.affine)
            empty = ~np.in1d(all_labels, labs)
            closest, dist = utils.closest_centroid(coords,
                                                   centroids[empty],
                                                   return_dist=True)
            closest = samples.loc[annotation.iloc[closest].index]
            empty = all_labels[empty]
            closest.index = pd.Series(empty, name='label')
            missing += [(closest, dict(zip(empty, np.diag(dist))))]

    # check for missing ROIs and fill in, as needed
    if not exact:
        # find labels that are missing across all donors
        empty = reduce(set.intersection, [set(f.index) for f, d in missing])
        for roi in empty:
            # find donor with sample closest to centroid of empty parcel
            ind = np.argmin([d.get(roi) for f, d in missing])
            # assign expression data from that sample and add to count
            expression[ind].loc[roi] = missing[ind][0].loc[roi]
            counts.loc[roi, ind] += 1

    # normalize data with SRS and aggregate across donors
    expression = [process.normalize_expression(e) for e in expression]
    if not return_donors:
        expression = process.aggregate_donors(expression, metric)

    if return_counts:
        return expression, counts.iloc[1:]

    return expression