def test_fetch_datasets(testdir):
    """Check `datasets.fetch_microarray` with valid, invalid and no donors."""
    target_dir = str(testdir)

    # requesting one donor: every key in KEYS must map to exactly one entry
    subset = datasets.fetch_microarray(data_dir=target_dir, donors=['12876'])
    assert isinstance(subset, dict)
    for key in KEYS:
        entries = subset.get(key)
        assert len(entries) == 1

    # an unrecognised donor identifier must be rejected with a ValueError
    with pytest.raises(ValueError):
        datasets.fetch_microarray(donors=['notadonor'])

    # donors=None only has to complete without raising; result is not checked
    files = datasets.fetch_microarray(data_dir=target_dir, donors=None)
def testfiles(datadir):
    """Return microarray files for two donors, fetched with ``n_proc=2``."""
    donor_ids = ['12876', '15496']
    fetched = fetch_microarray(data_dir=datadir, donors=donor_ids, n_proc=2)
    return fetched
def testfiles(testdir):
    """Return microarray files for two donors, fetched with ``convert=True``."""
    # the fetcher is handed the directory as a plain string
    target_dir = str(testdir)
    return fetch_microarray(
        data_dir=target_dir,
        donors=['12876', '15496'],
        convert=True,
    )
def testfiles():
    """Return microarray files for donors 12876 and 15496 (default options)."""
    donor_ids = ['12876', '15496']
    return fetch_microarray(donors=donor_ids)
def get_expression_data(atlas, atlas_info=None, *, exact=True,
                        tolerance=2, metric='mean', ibf_threshold=0.5,
                        corrected_mni=True, reannotated=True,
                        return_counts=False, return_donors=False,
                        donors='all', data_dir=None):
    """
    Assigns microarray expression data to ROIs defined in `atlas`

    This function aims to provide a workflow for generating pre-processed,
    microarray expression data for arbitrary `atlas` designations. First, some
    basic filtering of genetic probes is performed, including:

        1. Intensity-based filtering of microarray probes to remove probes that
           do not exceed a certain level of background noise (specified via the
           `ibf_threshold` parameter), and
        2. Selection of a single, representative probe for each gene via a
           differential stability metric, wherein the probe that has the most
           consistent regional variation across donors is retained.

    Tissue samples are then matched to parcels in the defined `atlas` for each
    donor. If `atlas_info` is provided then this matching is constrained by
    both hemisphere and tissue class designation (e.g., cortical samples from
    the left hemisphere are only matched to ROIs in the left cortex,
    subcortical samples from the right hemisphere are only matched to ROIs in
    the right subcortex); see the `atlas_info` parameter description for more
    information.

    Matching of microarray samples to parcels in `atlas` is done via a multi-
    step process:

        1. Determine if the sample falls directly within a parcel,
        2. Check to see if there are nearby parcels by slowly expanding the
           search space to include nearby voxels, up to a specified distance
           (specified via the `tolerance` parameter),
        3. If there are multiple nearby parcels, the sample is assigned to the
           closest parcel, as determined by the parcel centroid.

    If at any step a sample can be assigned to a parcel the matching process is
    terminated. If multiple samples are assigned to the same parcel they are
    aggregated with the metric specified via the `metric` parameter. More
    control over the sample matching can be obtained by setting the `exact`
    parameter; see the parameter description for more information.

    Once all samples have been matched to parcels for all supplied donors, the
    microarray expression data are normalized within-donor via a scaled robust
    sigmoid (SRS) procedure before being combined across donors via the
    supplied `metric`.

    Parameters
    ----------
    atlas : niimg-like object
        A parcellation image in MNI space, where each parcel is identified by a
        unique integer ID
    atlas_info : str or :class:`pandas.DataFrame`, optional
        Filepath to or pre-loaded dataframe containing information about
        `atlas`. Must have at least columns 'id', 'hemisphere', and 'structure'
        containing information mapping atlas IDs to hemisphere (i.e, "L", "R")
        and broad structural class (i.e., "cortex", "subcortex", "cerebellum").
        Default: None
    exact : bool, optional
        Whether to use exact matching of donor tissue samples to parcels in
        `atlas`. If True, this function will match tissue samples to parcels
        within `tolerance` mm of the sample; any samples that are beyond
        `tolerance` mm of a parcel will be discarded. This may result in some
        parcels having no assigned sample / expression data. If False, the
        default matching procedure will be performed and followed by a check
        for parcels with no assigned samples; any such parcels will be matched
        to the nearest sample (nearest defined as the sample with the closest
        Euclidean distance to the parcel centroid). Default: True
    tolerance : int, optional
        Distance (in mm) that a sample must be from a parcel for it to be
        matched to that parcel. This is only considered if the sample is not
        directly within a parcel. Default: 2
    metric : str or func, optional
        Mechanism by which to aggregate samples assigned to the same parcel
        and to collapse across donors, if multiple donors are used. If a str,
        should be in ['mean', 'median']; if a function, should be able to
        accept an `N`-dimensional input and the `axis` keyword argument and
        return an `N-1`-dimensional output. Default: 'mean'
    ibf_threshold : [0, 1] float, optional
        Threshold for intensity-based filtering. This number should specify
        the ratio of samples, across all supplied donors, for which a probe
        must have signal above background noise in order to be retained.
        Default: 0.5
    corrected_mni : bool, optional
        Whether to use the "corrected" MNI coordinates shipped with the
        `alleninf` package instead of the coordinates provided with the AHBA
        data when matching tissue samples to anatomical regions. Default: True
    reannotated : bool, optional
        Whether to use reannotated probe information provided by [1]_ instead
        of the default probe information from the AHBA dataset. Using
        reannotated information will discard probes that could not be reliably
        matched to genes. Default: True
    return_counts : bool, optional
        Whether to return how many samples were assigned to each parcel in
        `atlas` for each donor. Default: False
    return_donors : bool, optional
        Whether to return donor-level expression arrays instead of aggregating
        expression across donors with provided `metric`. Default: False
    donors : list, optional
        List of donors to use as sources of expression data. Can be either
        donor numbers or UID. If not specified will use all available donors.
        Default: 'all'
    data_dir : str, optional
        Directory where expression data should be downloaded (if it does not
        already exist) / loaded. If not specified will use the current
        directory. Default: None

    Returns
    -------
    expression : (R, G) :class:`pandas.DataFrame`
        Microarray expression for `R` regions in `atlas` for `G` genes,
        aggregated across donors, where the index corresponds to the unique
        integer IDs of `atlas` and the columns are gene names. If
        `return_donors=True` this is instead a list with one normalized,
        un-aggregated entry per donor.
    counts : (R, D) :class:`pandas.DataFrame`
        Number of samples assigned to each of `R` regions in `atlas` for each
        of `D` donors (if multiple donors were specified); only returned if
        `return_counts=True`.

    Raises
    ------
    KeyError
        If the fetched files are missing any of the 'microarray', 'probes',
        'annotation', 'pacall', or 'ontology' entries.

    References
    ----------
    .. [1] Arnatkevic̆iūtė, A., Fulcher, B. D., & Fornito, A. (2019). A
       practical guide to linking brain-wide gene expression and neuroimaging
       data. NeuroImage, 189, 353-367.
    .. [2] Hawrylycz, M.J. et al. (2012) An anatomically comprehensive atlas of
       the adult human transcriptome. Nature, 489, 391-399.
    """

    # fetch files
    files = datasets.fetch_microarray(data_dir=data_dir, donors=donors)
    for key in ['microarray', 'probes', 'annotation', 'pacall', 'ontology']:
        if key not in files:
            raise KeyError('Provided `files` dictionary is missing {}. '
                           'Please check inputs.'.format(key))

    # load atlas_info, if provided
    atlas = check_niimg_3d(atlas)
    if atlas_info is not None:
        atlas_info = utils.check_atlas_info(atlas, atlas_info)

    # get combination functions
    metric = utils.check_metric(metric)

    # get some info on the number of subjects, labels in `atlas`
    num_subj = len(files.microarray)
    all_labels = utils.get_unique_labels(atlas)
    if not exact:
        # centroids are only needed for the nearest-sample fallback below
        centroids = utils.get_centroids(atlas, labels=all_labels)

    # reannotate probes based on updates from Arnatkeviciute et al., 2019 then
    # perform intensity-based filter of probes and select probe with highest
    # differential stability for each gene amongst remaining probes
    if reannotated:
        probes = process.reannotate_probes(files.probes[0])
    else:
        probes = io.read_probes(files.probes[0])
    probes = process.filter_probes(files.pacall, probes,
                                   threshold=ibf_threshold)
    probes = process.get_stable_probes(files.microarray, files.annotation,
                                       probes)

    expression, missing = [], []
    # the extra leading row (index 0) absorbs counts for label 0; it is
    # dropped again via `.iloc[1:]` when the counts are returned
    counts = pd.DataFrame(np.zeros((len(all_labels) + 1, num_subj)),
                          index=np.append([0], all_labels))
    for subj in range(num_subj):
        # get rid of samples whose coordinates don't match ontological profile
        annotation = process.drop_mismatch_samples(files.annotation[subj],
                                                   files.ontology[subj],
                                                   corrected=corrected_mni)

        # subset representative probes + samples from microarray data
        microarray = io.read_microarray(files.microarray[subj])
        samples = microarray.loc[probes.index, annotation.index].T
        samples.columns = probes.gene_symbol

        # assign samples to regions and aggregate samples w/i the same region
        sample_labels = label_samples(annotation, atlas,
                                      atlas_info=atlas_info,
                                      tolerance=tolerance)
        expression.append(
            group_by_label(samples, sample_labels, all_labels, metric=metric)
        )

        # get counts of samples collapsed into each ROI
        labs, num = np.unique(sample_labels, return_counts=True)
        counts.loc[labs, subj] = num

        # if we don't want to do exact matching then cache which parcels are
        # missing data and the expression data for the closest sample to that
        # parcel; we'll use this once we've iterated through all donors
        if not exact:
            coords = utils.xyz_to_ijk(annotation[['mni_x', 'mni_y', 'mni_z']],
                                      atlas.affine)
            # `np.isin` replaces `np.in1d`, which is deprecated in NumPy 2.0
            empty = ~np.isin(all_labels, labs)
            closest, dist = utils.closest_centroid(coords, centroids[empty],
                                                   return_dist=True)
            closest = samples.loc[annotation.iloc[closest].index]
            empty = all_labels[empty]
            closest.index = pd.Series(empty, name='label')
            missing.append((closest, dict(zip(empty, np.diag(dist)))))

    # check for missing ROIs and fill in, as needed
    if not exact:
        # find labels that are missing across all donors
        empty = reduce(set.intersection,
                       [set(frame.index) for frame, _ in missing])
        for roi in empty:
            # find donor with sample closest to centroid of empty parcel
            ind = np.argmin([dists.get(roi) for _, dists in missing])
            # assign expression data from that sample and add to count
            expression[ind].loc[roi] = missing[ind][0].loc[roi]
            counts.loc[roi, ind] += 1

    # normalize data with SRS and aggregate across donors
    expression = [process.normalize_expression(e) for e in expression]
    if not return_donors:
        expression = process.aggregate_donors(expression, metric)

    if return_counts:
        return expression, counts.iloc[1:]

    return expression