def download_dataset():
    """Download and extract NeuroVault collection 1425."""
    coll = '1425'
    url = 'https://neurovault.org/collections/{0}/download'.format(coll)
    out_dir = op.join(get_resource_path(),
                      'data/neurovault-data/collection-{0}'.format(coll))
    os.makedirs(out_dir, exist_ok=True)

    # Download
    fname = download_file(url)

    # Unzip
    with zipfile.ZipFile(fname, 'r') as zip_ref:
        zip_ref.extractall(out_dir)

    collection_folders = [f for f in glob(op.join(out_dir, '*')) if '.nidm' not in f]
    collection_folders = [f for f in collection_folders if op.isdir(f)]
    if len(collection_folders) > 1:
        raise Exception('More than one folder found: '
                        '{0}'.format(', '.join(collection_folders)))
    else:
        folder = collection_folders[0]

    zip_files = glob(op.join(folder, '*.zip'))
    for zf in zip_files:
        fn = op.splitext(op.basename(zf))[0]
        with zipfile.ZipFile(zf, 'r') as zip_ref:
            zip_ref.extractall(op.join(out_dir, fn))

    os.remove(fname)
    shutil.rmtree(folder)
def test_CogAtLemmatizer():
    """A smoke test for CogAtLemmatizer."""
    cogat = extract.download_cognitive_atlas(data_dir=utils.get_resource_path(), overwrite=False)
    id_df = pd.read_csv(cogat["ids"])
    id_df = id_df.loc[id_df["id"] == "trm_4aae62e4ad209"]
    lem = annotate.cogat.CogAtLemmatizer(id_df)
    true_text = "trm_4aae62e4ad209 is great"
    test_text = "Cognitive control is great"
    assert lem.transform(test_text) == true_text
def test_cogat(testdata_laird):
    """A smoke test for CogAt-related functions."""
    # A small test dataset with abstracts
    ns_dset_laird = testdata_laird.copy()
    cogat = extract.download_cognitive_atlas(data_dir=utils.get_resource_path(), overwrite=False)
    id_df = pd.read_csv(cogat["ids"])
    rel_df = pd.read_csv(cogat["relationships"])
    weights = {"isKindOf": 1, "isPartOf": 1, "inCategory": 1}

    counts_df, rep_text_df = annotate.cogat.extract_cogat(
        ns_dset_laird.texts, id_df, text_column="abstract"
    )
    assert "id" in ns_dset_laird.texts.columns

    expanded_df = annotate.cogat.expand_counts(counts_df, rel_df, weights)
    assert isinstance(expanded_df, pd.DataFrame)
import os

import nibabel as nib
import numpy as np
from nilearn import image, masking, plotting

from nimare import annotate, decode
from nimare.dataset import Dataset
from nimare.utils import get_resource_path

###############################################################################
# Load dataset with abstracts
# -----------------------------------------------------------------------------
# We'll load a small dataset composed only of studies in Neurosynth with
# Angela Laird as a coauthor, for the sake of speed.
dset = Dataset(os.path.join(get_resource_path(), "neurosynth_laird_studies.json"))
dset.texts.head(2)

###############################################################################
# Generate term counts
# -----------------------------------------------------------------------------
# GCLDA uses raw word counts instead of the tf-idf values generated by
# Neurosynth.
counts_df = annotate.text.generate_counts(
    dset.texts,
    text_column="abstract",
    tfidf=False,
    max_df=0.99,
    min_df=0.01,
)
counts_df.head(5)
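###############################################################################
# Train a GCLDA model
# -----------------------------------------------------------------------------
# A minimal sketch of a possible next step: fit a GCLDA model to the term
# counts and the Dataset's coordinates. This assumes the Dataset's masker
# exposes its mask image as ``mask_img``; the topic/region counts and the
# iteration numbers are illustrative values kept small for speed.
model = annotate.gclda.GCLDAModel(
    counts_df,
    dset.coordinates,
    mask=dset.masker.mask_img,
    n_topics=10,
    n_regions=4,
    symmetric=True,
)
model.fit(n_iters=100, loglikely_freq=20)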
def test_get_resource_path():
    """Test nimare.utils.get_resource_path."""
    print(utils.get_resource_path())
    assert op.isdir(utils.get_resource_path())
""" ############################################################################### # Start with the necessary imports # ----------------------------------------------------------------------------- import os from nilearn.plotting import plot_glass_brain from nimare.dataset import Dataset from nimare.meta.kernel import Peaks2MapsKernel from nimare.utils import get_resource_path ############################################################################### # Load Dataset # ----------------------------------------------------------------------------- dset_file = os.path.join(get_resource_path(), "nidm_pain_dset.json") dset = Dataset(dset_file) ############################################################################### # Run peaks2maps # ----------------------------------------------------------------------------- k = Peaks2MapsKernel() imgs = k.transform(dset, return_type="image") ############################################################################### # Plot modeled activation maps # ----------------------------------------------------------------------------- for img in imgs: display = plot_glass_brain(img, display_mode="lyrz", plot_abs=False,
import os

import matplotlib.pyplot as plt
from nilearn.plotting import plot_stat_map

###############################################################################
# Load Sleuth text files into Datasets
# -----------------------------------------------------------------------------
# The data for this example are a subset of studies from a meta-analysis on
# semantic cognition in children :footcite:p:`enge2021meta`.
# A first group of studies probed children's semantic world knowledge
# (e.g., correctly naming an object after hearing its auditory description),
# while a second group of studies asked children to decide whether two (or
# more) words were semantically related to one another.
from nimare.io import convert_sleuth_to_dataset
from nimare.utils import get_resource_path

knowledge_file = os.path.join(get_resource_path(), "semantic_knowledge_children.txt")
related_file = os.path.join(get_resource_path(), "semantic_relatedness_children.txt")

knowledge_dset = convert_sleuth_to_dataset(knowledge_file)
related_dset = convert_sleuth_to_dataset(related_file)

###############################################################################
# Individual group ALEs
# -----------------------------------------------------------------------------
# Computing separate ALE analyses for each group is not strictly necessary for
# performing the subtraction analysis, but it helps the experimenter appreciate
# the similarities and differences between the groups.
from nimare.correct import FWECorrector
from nimare.meta.cbma import ALE
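# The block below sketches those per-group analyses; the correction settings
# (voxel threshold, iteration count, cores) are illustrative values rather
# than ones prescribed by the original example.
ale = ALE()
knowledge_results = ale.fit(knowledge_dset)
related_results = ale.fit(related_dset)

corr = FWECorrector(method="montecarlo", voxel_thresh=0.001, n_iters=100, n_cores=1)
knowledge_corrected_results = corr.transform(knowledge_results)
related_corrected_results = corr.transform(related_results)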
===========================
Simple annotation from text
===========================

Perform simple term count or tf-idf value extraction from texts stored in a
Dataset.
"""
import os

from nimare import annotate, dataset, utils

###############################################################################
# Load dataset with abstracts
# -----------------------------------------------------------------------------
# We'll load a small dataset composed only of studies in Neurosynth with
# Angela Laird as a coauthor, for the sake of speed.
dset = dataset.Dataset(
    os.path.join(utils.get_resource_path(), "neurosynth_laird_studies.json")
)
dset.texts.head(2)

###############################################################################
# Generate term counts
# -----------------------------------------------------------------------------
# Let's start by extracting terms and their associated counts from article
# abstracts.
counts_df = annotate.text.generate_counts(
    dset.texts,
    text_column="abstract",
    tfidf=False,
    max_df=0.99,
    min_df=0.01,
)
counts_df.head(5)
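###############################################################################
# Generate tf-idf values
# -----------------------------------------------------------------------------
# The same function can also return tf-idf weights rather than raw counts.
# This is a minimal sketch: it re-runs generate_counts with tfidf=True, and
# the result name (tfidf_df) and frequency thresholds are illustrative choices.
tfidf_df = annotate.text.generate_counts(
    dset.texts,
    text_column="abstract",
    tfidf=True,
    max_df=0.99,
    min_df=0.01,
)
tfidf_df.head(5)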
""" import os import matplotlib.pyplot as plt import numpy as np import pandas as pd from nimare import annotate, extract from nimare.dataset import Dataset from nimare.utils import get_resource_path ############################################################################### # Load dataset with abstracts # ----------------------------------------------------------------------------- dset = Dataset( os.path.join(get_resource_path(), "neurosynth_laird_studies.json")) ############################################################################### # Download Cognitive Atlas # ----------------------------------------------------------------------------- cogatlas = extract.download_cognitive_atlas(data_dir=get_resource_path(), overwrite=False) id_df = pd.read_csv(cogatlas["ids"]) rel_df = pd.read_csv(cogatlas["relationships"]) ############################################################################### # ID DataFrame id_df.head() ############################################################################### # Relationships DataFrame
def make_json():
    """Build a NiMARE JSON dataset file (with subpeaks) from the NIDM results."""
    dset_file = 'nimare/resources/nidm_pain_dset_with_subpeaks_docker.json'
    ddict = {}
    folders = sorted(glob(op.join(
        get_resource_path(),
        'data/neurovault-data/collection-1425/pain_*.nidm')))
    for folder in folders:
        name = op.basename(folder)
        ddict[name] = {}
        ddict[name]['contrasts'] = {}
        ddict[name]['contrasts']['1'] = {}
        ddict[name]['contrasts']['1']['coords'] = {}
        ddict[name]['contrasts']['1']['coords']['space'] = 'MNI'
        ddict[name]['contrasts']['1']['images'] = {}
        ddict[name]['contrasts']['1']['images']['space'] = 'MNI_2mm'

        # con file
        files = glob(op.join(folder, 'Contrast*.nii.gz'))
        files = [f for f in files if 'StandardError' not in op.basename(f)]
        if files:
            f = sorted(files)[0]
        else:
            f = None
        ddict[name]['contrasts']['1']['images']['con'] = f

        # se file
        files = glob(op.join(folder, 'ContrastStandardError*.nii.gz'))
        if files:
            f = sorted(files)[0]
        else:
            f = None
        ddict[name]['contrasts']['1']['images']['se'] = f

        # z file
        files = glob(op.join(folder, 'ZStatistic*.nii.gz'))
        if files:
            f = sorted(files)[0]
        else:
            f = None
        ddict[name]['contrasts']['1']['images']['z'] = f

        # t file
        files = glob(op.join(folder, 'TStatistic*.nii.gz'))
        if files:
            f = sorted(files)[0]
        else:
            f = None
        ddict[name]['contrasts']['1']['images']['t'] = f

        # sample size
        f = op.join(folder, 'DesignMatrix.csv')
        if op.isfile(f):
            df = pd.read_csv(f, header=None)
            n = [df.shape[0]]
        else:
            n = None
        ddict[name]['contrasts']['1']['sample_sizes'] = n

        # foci
        files = glob(op.join(folder, 'ExcursionSet*.nii.gz'))
        f = sorted(files)[0]
        img = nib.load(f)
        data = np.nan_to_num(img.get_data())

        # positive clusters
        binarized = np.copy(data)
        binarized[binarized > 0] = 1
        binarized[binarized < 0] = 0
        binarized = binarized.astype(int)
        labeled = ndimage.measurements.label(binarized, np.ones((3, 3, 3)))[0]
        clust_ids = sorted(list(np.unique(labeled)[1:]))

        peak_vals = np.array([np.max(data * (labeled == c)) for c in clust_ids])
        clust_ids = [clust_ids[c] for c in (-peak_vals).argsort()]  # Sort by descending max value

        ijk = []
        for c_id, c_val in enumerate(clust_ids):
            cluster_mask = labeled == c_val
            masked_data = data * cluster_mask

            # Get peaks, subpeaks, and associated statistics
            subpeak_ijk, subpeak_vals = _local_max(masked_data, img.affine,
                                                   min_distance=8)

            # Only report peak and, at most, top 3 subpeaks.
            n_subpeaks = np.min((len(subpeak_vals), 4))
            subpeak_ijk = subpeak_ijk[:n_subpeaks, :]
            ijk.append(subpeak_ijk)

        ijk = np.vstack(ijk)
        xyz = nib.affines.apply_affine(img.affine, ijk)
        ddict[name]['contrasts']['1']['coords']['x'] = list(xyz[:, 0])
        ddict[name]['contrasts']['1']['coords']['y'] = list(xyz[:, 1])
        ddict[name]['contrasts']['1']['coords']['z'] = list(xyz[:, 2])

    with open(dset_file, 'w') as fo:
        json.dump(ddict, fo, sort_keys=True, indent=4)
def _fetch_database(search_pairs, database_url, out_dir, overwrite=False):
    """Fetch generic database."""
    res_dir = get_resource_path()
    with open(op.join(res_dir, "database_file_manifest.json"), "r") as fo:
        database_file_manifest = json.load(fo)

    out_dir = op.abspath(out_dir)
    os.makedirs(out_dir, exist_ok=True)

    found_databases = []
    found_files = []
    log = True
    for database in database_file_manifest:
        coordinates_file = database["coordinates"]
        metadata_file = database["metadata"]
        if not _find_entities(coordinates_file, search_pairs, log=log):
            log = False
            continue

        log = False

        feature_dicts = database["features"]
        for feature_dict in feature_dicts:
            features_file = feature_dict["features"]
            # Other files associated with features have a subset of the entities,
            # so it is unnecessary to search them if we assume that the
            # hard-coded manifest is valid.
            if not _find_entities(features_file, search_pairs):
                continue
            else:
                out_coordinates_file = op.join(out_dir, coordinates_file)
                out_metadata_file = op.join(out_dir, metadata_file)
                out_feature_dict = {k: op.join(out_dir, v) for k, v in feature_dict.items()}

                db_found = [
                    i_db
                    for i_db, db_dct in enumerate(found_databases)
                    if db_dct["coordinates"] == out_coordinates_file
                ]
                if len(db_found):
                    assert len(db_found) == 1
                    found_databases[db_found[0]]["features"].append(out_feature_dict)
                else:
                    found_databases.append(
                        {
                            "coordinates": out_coordinates_file,
                            "metadata": out_metadata_file,
                            "features": [out_feature_dict],
                        }
                    )
                found_files += [coordinates_file, metadata_file, *feature_dict.values()]

    found_files = sorted(list(set(found_files)))

    for found_file in found_files:
        print(f"Downloading {found_file}", flush=True)

        url = op.join(database_url, found_file + "?raw=true")
        out_file = op.join(out_dir, found_file)

        if op.isfile(out_file) and not overwrite:
            print("File exists and overwrite is False. Skipping.")
            continue

        with open(out_file, "wb") as fo:
            u = urlopen(url)

            block_size = 8192
            while True:
                buffer = u.read(block_size)
                if not buffer:
                    break

                fo.write(buffer)

    return found_databases
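# A hypothetical invocation sketch for _fetch_database: the entity keys/values
# in search_pairs, the database URL, and the output directory below are
# placeholders for illustration, not values taken from the NiMARE manifest.
found = _fetch_database(
    {"data": "neurosynth", "version": "7"},
    "https://example.com/neurosynth-data",
    "/tmp/nimare-data",
    overwrite=False,
)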
def mni_mask():
    """Load MNI mask for testing."""
    return nib.load(
        os.path.join(get_resource_path(), "templates", "MNI152_2x2x2_brainmask.nii.gz")
    )
def generate_counts(text_df, text_column="abstract", tfidf=True, min_df=50, max_df=0.5):
    """Generate tf-idf weights for unigrams/bigrams derived from textual data.

    Parameters
    ----------
    text_df : (D x 2) :obj:`pandas.DataFrame`
        A DataFrame with two columns ('id' and 'text'). D = document.

    Returns
    -------
    weights_df : (D x T) :obj:`pandas.DataFrame`
        A DataFrame where the index is 'id' and the columns are the
        unigrams/bigrams derived from the data. D = document. T = term.
    """
    if text_column not in text_df.columns:
        raise ValueError(f"Column '{text_column}' not found in DataFrame")

    # Remove rows with empty text cells
    orig_ids = text_df["id"].tolist()
    text_df = text_df.fillna("")
    keep_ids = text_df.loc[text_df[text_column] != "", "id"]
    text_df = text_df.loc[text_df["id"].isin(keep_ids)]

    if len(keep_ids) != len(orig_ids):
        LGR.info(f"Retaining {len(keep_ids)}/{len(orig_ids)} studies")

    ids = text_df["id"].tolist()
    text = text_df[text_column].tolist()
    stoplist = op.join(get_resource_path(), "neurosynth_stoplist.txt")
    with open(stoplist, "r") as fo:
        stop_words = fo.read().splitlines()

    if tfidf:
        vectorizer = TfidfVectorizer(
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2),
            vocabulary=None,
            stop_words=stop_words,
        )
    else:
        vectorizer = CountVectorizer(
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2),
            vocabulary=None,
            stop_words=stop_words,
        )
    weights = vectorizer.fit_transform(text).toarray()

    if hasattr(vectorizer, "get_feature_names_out"):
        # scikit-learn >= 1.0.0
        names = vectorizer.get_feature_names_out()
    else:
        # scikit-learn < 1.0.0
        # To remove when we drop support for 3.6 and increase the minimum
        # sklearn version to 1.0.0.
        names = vectorizer.get_feature_names()

    names = [str(name) for name in names]
    weights_df = pd.DataFrame(weights, columns=names, index=ids)
    weights_df.index.name = "id"
    return weights_df
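# A minimal usage sketch for generate_counts; the two-document DataFrame below
# is an illustrative, made-up input rather than real study metadata.
import pandas as pd

from nimare.annotate.text import generate_counts

example_df = pd.DataFrame(
    {
        "id": ["study-1", "study-2"],
        "abstract": [
            "working memory task with an n-back design",
            "semantic memory retrieval during picture naming",
        ],
    }
)
# Raw term counts; low document-frequency thresholds because there are only two texts.
example_counts = generate_counts(
    example_df, text_column="abstract", tfidf=False, min_df=1, max_df=1.0
)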