Example #1
def download_dataset():
    coll = '1425'
    url = 'https://neurovault.org/collections/{0}/download'.format(coll)
    out_dir = op.join(get_resource_path(),
                      'data/neurovault-data/collection-{0}'.format(coll))

    os.makedirs(out_dir, exist_ok=True)

    # Download
    fname = download_file(url)

    # Unzip
    with zipfile.ZipFile(fname, 'r') as zip_ref:
        zip_ref.extractall(out_dir)

    collection_folders = [f for f in glob(op.join(out_dir, '*'))
                          if '.nidm' not in f]
    collection_folders = [f for f in collection_folders if op.isdir(f)]
    if len(collection_folders) > 1:
        raise Exception('More than one folder found: '
                        '{0}'.format(', '.join(collection_folders)))
    else:
        folder = collection_folders[0]
    zip_files = glob(op.join(folder, '*.zip'))
    for zf in zip_files:
        fn = op.splitext(op.basename(zf))[0]
        with zipfile.ZipFile(zf, 'r') as zip_ref:
            zip_ref.extractall(op.join(out_dir, fn))

    os.remove(fname)
    shutil.rmtree(folder)
Example #2
def test_CogAtLemmatizer():
    """A smoke test for CogAtLemmatizer."""
    cogat = extract.download_cognitive_atlas(data_dir=utils.get_resource_path(), overwrite=False)
    id_df = pd.read_csv(cogat["ids"])
    id_df = id_df.loc[id_df["id"] == "trm_4aae62e4ad209"]
    lem = annotate.cogat.CogAtLemmatizer(id_df)
    true_text = "trm_4aae62e4ad209 is great"
    test_text = "Cognitive control is great"
    assert lem.transform(test_text) == true_text
Example #3
def test_cogat(testdata_laird):
    """A smoke test for CogAt-related functions."""
    # A small test dataset with abstracts
    ns_dset_laird = testdata_laird.copy()
    cogat = extract.download_cognitive_atlas(data_dir=utils.get_resource_path(), overwrite=False)
    id_df = pd.read_csv(cogat["ids"])
    rel_df = pd.read_csv(cogat["relationships"])
    weights = {"isKindOf": 1, "isPartOf": 1, "inCategory": 1}
    counts_df, rep_text_df = annotate.cogat.extract_cogat(
        ns_dset_laird.texts, id_df, text_column="abstract"
    )
    assert "id" in ns_dset_laird.texts.columns
    expanded_df = annotate.cogat.expand_counts(counts_df, rel_df, weights)
    assert isinstance(expanded_df, pd.DataFrame)
Example #4
import os

import nibabel as nib
import numpy as np
from nilearn import image, masking, plotting

from nimare import annotate, decode
from nimare.dataset import Dataset
from nimare.utils import get_resource_path

###############################################################################
# Load dataset with abstracts
# -----------------------------------------------------------------------------
# We'll load a small dataset composed only of studies in Neurosynth with
# Angela Laird as a coauthor, for the sake of speed.
dset = Dataset(os.path.join(get_resource_path(), "neurosynth_laird_studies.json"))
dset.texts.head(2)

###############################################################################
# Generate term counts
# -----------------------------------------------------------------------------
# GCLDA uses raw word counts instead of the tf-idf values generated by
# Neurosynth.
counts_df = annotate.text.generate_counts(
    dset.texts,
    text_column="abstract",
    tfidf=False,
    max_df=0.99,
    min_df=0.01,
)
counts_df.head(5)
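###############################################################################
# Fit a small GCLDA model (hedged sketch)
# -----------------------------------------------------------------------------
# The snippet above stops at the counts. The lines below are an illustrative
# sketch of feeding those counts into ``annotate.gclda.GCLDAModel``; the
# constructor arguments and ``fit`` parameters are assumptions based on
# NiMARE's documented GCLDA API and may differ across versions.
model = annotate.gclda.GCLDAModel(
    counts_df,
    dset.coordinates,
    mask=dset.masker.mask_img,
    n_topics=10,
    n_regions=2,
)
model.fit(n_iters=100, loglikely_freq=20)  # deliberately short run, for illustration only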
Example #5
def test_get_resource_path():
    """
    Test nimare.utils.get_resource_path
    """
    print(utils.get_resource_path())
    assert op.isdir(utils.get_resource_path())
Example #6
"""
###############################################################################
# Start with the necessary imports
# -----------------------------------------------------------------------------
import os

from nilearn.plotting import plot_glass_brain

from nimare.dataset import Dataset
from nimare.meta.kernel import Peaks2MapsKernel
from nimare.utils import get_resource_path

###############################################################################
# Load Dataset
# -----------------------------------------------------------------------------
dset_file = os.path.join(get_resource_path(), "nidm_pain_dset.json")
dset = Dataset(dset_file)

###############################################################################
# Run peaks2maps
# -----------------------------------------------------------------------------
k = Peaks2MapsKernel()
imgs = k.transform(dset, return_type="image")

###############################################################################
# Plot modeled activation maps
# -----------------------------------------------------------------------------
for img in imgs:
    display = plot_glass_brain(img,
                               display_mode="lyrz",
                               plot_abs=False)
Example #7
import os

import matplotlib.pyplot as plt
from nilearn.plotting import plot_stat_map

###############################################################################
# Load Sleuth text files into Datasets
# -----------------------------------------------------------------------------
# The data for this example are a subset of studies from a meta-analysis on
# semantic cognition in children :footcite:p:`enge2021meta`.
# A first group of studies probed children's semantic world knowledge
# (e.g., correctly naming an object after hearing its auditory description)
# while a second group of studies asked children to decide if two (or more)
# words were semantically related to one another or not.
from nimare.io import convert_sleuth_to_dataset
from nimare.utils import get_resource_path

knowledge_file = os.path.join(get_resource_path(),
                              "semantic_knowledge_children.txt")
related_file = os.path.join(get_resource_path(),
                            "semantic_relatedness_children.txt")

knowledge_dset = convert_sleuth_to_dataset(knowledge_file)
related_dset = convert_sleuth_to_dataset(related_file)

###############################################################################
# Individual group ALEs
# -----------------------------------------------------------------------------
# Computing separate ALE analyses for each group is not strictly necessary for
# performing the subtraction analysis but will help the experimenter to appreciate the
# similarities and differences between the groups.
from nimare.correct import FWECorrector
from nimare.meta.cbma import ALE
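
###############################################################################
# Hedged sketch: the snippet is truncated here, so the lines below illustrate
# how the per-group ALEs and FWE correction might be run with the classes
# imported above. The correction parameters are illustrative, not prescriptive.
knowledge_ale = ALE(null_method="approximate")
knowledge_results = knowledge_ale.fit(knowledge_dset)

related_ale = ALE(null_method="approximate")
related_results = related_ale.fit(related_dset)

corr = FWECorrector(method="montecarlo", n_iters=100, n_cores=1)
knowledge_corrected = corr.transform(knowledge_results)
related_corrected = corr.transform(related_results)

# Quick look at one group's uncorrected z-statistic map
plot_stat_map(knowledge_results.get_map("z"), title="Semantic knowledge (uncorrected z)")
plt.show()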
Example #8
"""
===========================
Simple annotation from text
===========================

Perform simple term count or tf-idf value extraction from texts stored in a Dataset.
"""
import os

from nimare import annotate, dataset, utils

###############################################################################
# Load dataset with abstracts
# -----------------------------------------------------------------------------
# We'll load a small dataset composed only of studies in Neurosynth with
# Angela Laird as a coauthor, for the sake of speed.
dset = dataset.Dataset(os.path.join(utils.get_resource_path(), "neurosynth_laird_studies.json"))
dset.texts.head(2)

###############################################################################
# Generate term counts
# -----------------------------------------------------------------------------
# Let's start by extracting terms and their associated counts from article
# abstracts.
counts_df = annotate.text.generate_counts(
    dset.texts,
    text_column="abstract",
    tfidf=False,
    max_df=0.99,
    min_df=0.01,
)
counts_df.head(5)
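
###############################################################################
# Generate tf-idf values (hedged sketch)
# -----------------------------------------------------------------------------
# The same function can return tf-idf weights instead of raw counts; the call
# below simply flips the ``tfidf`` flag used above.
tfidf_df = annotate.text.generate_counts(
    dset.texts,
    text_column="abstract",
    tfidf=True,
    max_df=0.99,
    min_df=0.01,
)
tfidf_df.head(5)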
Example #9
"""
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from nimare import annotate, extract
from nimare.dataset import Dataset
from nimare.utils import get_resource_path

###############################################################################
# Load dataset with abstracts
# -----------------------------------------------------------------------------
dset = Dataset(
    os.path.join(get_resource_path(), "neurosynth_laird_studies.json"))

###############################################################################
# Download Cognitive Atlas
# -----------------------------------------------------------------------------
cogatlas = extract.download_cognitive_atlas(data_dir=get_resource_path(),
                                            overwrite=False)
id_df = pd.read_csv(cogatlas["ids"])
rel_df = pd.read_csv(cogatlas["relationships"])

###############################################################################
# ID DataFrame
id_df.head()

###############################################################################
# Relationships DataFrame
rel_df.head()
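
###############################################################################
# Extract Cognitive Atlas terms from the abstracts (hedged sketch)
# -----------------------------------------------------------------------------
# The snippet ends here, so the following lines sketch how the downloaded ID
# and relationship tables might be used, mirroring the extract_cogat and
# expand_counts calls shown in the test examples above. The relationship
# weights are illustrative.
counts_df, rep_text_df = annotate.cogat.extract_cogat(
    dset.texts, id_df, text_column="abstract"
)
weights = {"isKindOf": 1, "isPartOf": 1, "inCategory": 1}
expanded_df = annotate.cogat.expand_counts(counts_df, rel_df, weights)
expanded_df.head()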
Example #10
def make_json():
    dset_file = 'nimare/resources/nidm_pain_dset_with_subpeaks_docker.json'

    ddict = {}
    folders = sorted(glob(op.join(
        get_resource_path(),
        'data/neurovault-data/collection-1425/pain_*.nidm')))
    for folder in folders:
        name = op.basename(folder)
        ddict[name] = {}
        ddict[name]['contrasts'] = {}
        ddict[name]['contrasts']['1'] = {}
        ddict[name]['contrasts']['1']['coords'] = {}
        ddict[name]['contrasts']['1']['coords']['space'] = 'MNI'
        ddict[name]['contrasts']['1']['images'] = {}
        ddict[name]['contrasts']['1']['images']['space'] = 'MNI_2mm'
        # con file
        files = glob(op.join(folder, 'Contrast*.nii.gz'))
        files = [f for f in files if 'StandardError' not in op.basename(f)]
        if files:
            f = sorted(files)[0]
        else:
            f = None
        ddict[name]['contrasts']['1']['images']['con'] = f
        # se file
        files = glob(op.join(folder, 'ContrastStandardError*.nii.gz'))
        if files:
            f = sorted(files)[0]
        else:
            f = None
        ddict[name]['contrasts']['1']['images']['se'] = f
        # z file
        files = glob(op.join(folder, 'ZStatistic*.nii.gz'))
        if files:
            f = sorted(files)[0]
        else:
            f = None
        ddict[name]['contrasts']['1']['images']['z'] = f
        # t file
        files = glob(op.join(folder, 'TStatistic*.nii.gz'))
        if files:
            f = sorted(files)[0]
        else:
            f = None
        ddict[name]['contrasts']['1']['images']['t'] = f
        # sample size
        f = op.join(folder, 'DesignMatrix.csv')
        if op.isfile(f):
            df = pd.read_csv(f, header=None)
            n = [df.shape[0]]
        else:
            n = None
        ddict[name]['contrasts']['1']['sample_sizes'] = n
        # foci
        files = glob(op.join(folder, 'ExcursionSet*.nii.gz'))
        f = sorted(files)[0]
        img = nib.load(f)
        data = np.nan_to_num(img.get_fdata())
        # positive clusters
        binarized = np.copy(data)
        binarized[binarized > 0] = 1
        binarized[binarized < 0] = 0
        binarized = binarized.astype(int)
        labeled = ndimage.label(binarized, np.ones((3, 3, 3)))[0]
        clust_ids = sorted(list(np.unique(labeled)[1:]))

        peak_vals = np.array([np.max(data * (labeled == c)) for c in clust_ids])
        clust_ids = [clust_ids[c] for c in (-peak_vals).argsort()]  # Sort by descending max value

        ijk = []
        for c_id, c_val in enumerate(clust_ids):
            cluster_mask = labeled == c_val
            masked_data = data * cluster_mask

            # Get peaks, subpeaks and associated statistics
            subpeak_ijk, subpeak_vals = _local_max(masked_data, img.affine,
                                                   min_distance=8)

            # Only report peak and, at most, top 3 subpeaks.
            n_subpeaks = np.min((len(subpeak_vals), 4))
            subpeak_ijk = subpeak_ijk[:n_subpeaks, :]
            ijk.append(subpeak_ijk)
        ijk = np.vstack(ijk)
        xyz = nib.affines.apply_affine(img.affine, ijk)
        ddict[name]['contrasts']['1']['coords']['x'] = list(xyz[:, 0])
        ddict[name]['contrasts']['1']['coords']['y'] = list(xyz[:, 1])
        ddict[name]['contrasts']['1']['coords']['z'] = list(xyz[:, 2])

    with open(dset_file, 'w') as fo:
        json.dump(ddict, fo, sort_keys=True, indent=4)
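

def load_generated_dataset():
    """Hedged sketch: load the JSON written by make_json() as a NiMARE Dataset.

    This mirrors the Dataset usage in the other examples on this page; the
    path is the one hard-coded in make_json() above.
    """
    from nimare.dataset import Dataset

    dset_file = 'nimare/resources/nidm_pain_dset_with_subpeaks_docker.json'
    return Dataset(dset_file)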
Example #11
def _fetch_database(search_pairs, database_url, out_dir, overwrite=False):
    """Fetch generic database."""
    res_dir = get_resource_path()
    with open(op.join(res_dir, "database_file_manifest.json"), "r") as fo:
        database_file_manifest = json.load(fo)

    out_dir = op.abspath(out_dir)
    os.makedirs(out_dir, exist_ok=True)

    found_databases = []
    found_files = []
    log = True
    for database in database_file_manifest:
        coordinates_file = database["coordinates"]
        metadata_file = database["metadata"]
        if not _find_entities(coordinates_file, search_pairs, log=log):
            log = False
            continue

        log = False

        feature_dicts = database["features"]
        for feature_dict in feature_dicts:
            features_file = feature_dict["features"]
            # Other files associated with features have subset of entities,
            # so unnecessary to search them if we assume that the hard-coded manifest is valid.
            if not _find_entities(features_file, search_pairs):
                continue
            else:
                out_coordinates_file = op.join(out_dir, coordinates_file)
                out_metadata_file = op.join(out_dir, metadata_file)
                out_feature_dict = {k: op.join(out_dir, v) for k, v in feature_dict.items()}

                db_found = [
                    i_db
                    for i_db, db_dct in enumerate(found_databases)
                    if db_dct["coordinates"] == out_coordinates_file
                ]
                if len(db_found):
                    assert len(db_found) == 1

                    found_databases[db_found[0]]["features"].append(out_feature_dict)
                else:
                    found_databases.append(
                        {
                            "coordinates": out_coordinates_file,
                            "metadata": out_metadata_file,
                            "features": [out_feature_dict],
                        }
                    )
                found_files += [coordinates_file, metadata_file, *feature_dict.values()]

    found_files = sorted(list(set(found_files)))
    for found_file in found_files:
        print(f"Downloading {found_file}", flush=True)

        url = op.join(database_url, found_file + "?raw=true")
        out_file = op.join(out_dir, found_file)

        if op.isfile(out_file) and not overwrite:
            print("File exists and overwrite is False. Skipping.")
            continue

        with open(out_file, "wb") as fo:
            u = urlopen(url)

            block_size = 8192
            while True:
                buffer = u.read(block_size)
                if not buffer:
                    break
                fo.write(buffer)

    return found_databases
Example #12
def mni_mask():
    """Load MNI mask for testing."""
    return nib.load(
        os.path.join(get_resource_path(), "templates",
                     "MNI152_2x2x2_brainmask.nii.gz"))
Example #13
def generate_counts(text_df,
                    text_column="abstract",
                    tfidf=True,
                    min_df=50,
                    max_df=0.5):
    """Generate tf-idf weights for unigrams/bigrams derived from textual data.

    Parameters
    ----------
    text_df : (D x 2) :obj:`pandas.DataFrame`
        A DataFrame with two columns ('id' and 'text'). D = document.

    Returns
    -------
    weights_df : (D x T) :obj:`pandas.DataFrame`
        A DataFrame where the index is 'id' and the columns are the
        unigrams/bigrams derived from the data. D = document. T = term.
    """
    if text_column not in text_df.columns:
        raise ValueError(f"Column '{text_column}' not found in DataFrame")

    # Remove rows with empty text cells
    orig_ids = text_df["id"].tolist()
    text_df = text_df.fillna("")
    keep_ids = text_df.loc[text_df[text_column] != "", "id"]
    text_df = text_df.loc[text_df["id"].isin(keep_ids)]

    if len(keep_ids) != len(orig_ids):
        LGR.info(f"Retaining {len(keep_ids)}/{len(orig_ids)} studies")

    ids = text_df["id"].tolist()
    text = text_df[text_column].tolist()
    stoplist = op.join(get_resource_path(), "neurosynth_stoplist.txt")
    with open(stoplist, "r") as fo:
        stop_words = fo.read().splitlines()

    if tfidf:
        vectorizer = TfidfVectorizer(
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2),
            vocabulary=None,
            stop_words=stop_words,
        )
    else:
        vectorizer = CountVectorizer(
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2),
            vocabulary=None,
            stop_words=stop_words,
        )
    weights = vectorizer.fit_transform(text).toarray()

    if hasattr(vectorizer, "get_feature_names_out"):
        # scikit-learn >= 1.0.0
        names = vectorizer.get_feature_names_out()
    else:
        # scikit-learn < 1.0.0
        # To remove when we drop support for 3.6 and increase minimum sklearn version to 1.0.0.
        names = vectorizer.get_feature_names()

    names = [str(name) for name in names]
    weights_df = pd.DataFrame(weights, columns=names, index=ids)
    weights_df.index.name = "id"
    return weights_df
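

if __name__ == "__main__":
    # Hedged usage sketch (not part of generate_counts itself), mirroring the
    # gallery examples earlier on this page. Paths and thresholds are
    # illustrative.
    import os

    from nimare.dataset import Dataset
    from nimare.utils import get_resource_path

    dset = Dataset(os.path.join(get_resource_path(), "neurosynth_laird_studies.json"))

    # Raw term counts from article abstracts.
    counts_df = generate_counts(
        dset.texts,
        text_column="abstract",
        tfidf=False,
        max_df=0.99,
        min_df=0.01,
    )
    print(counts_df.head())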