def save_dataset_instance(db_filename, kw_filename, instance_filename):
    # Create a new Dataset instance
    dataset = Dataset('./raw_data/' + db_filename + '.txt')

    # Add some features
    dataset.add_features('./raw_data/' + kw_filename + '.txt')

    # Save new file
    dataset.save('./raw_data/' + instance_filename + '.pkl')
    return dataset
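# A minimal usage sketch for save_dataset_instance(); the base filenames
# ('database', 'features', 'dataset') are illustrative assumptions and presume
# the raw Neurosynth .txt files have been unpacked into ./raw_data/.
dataset = save_dataset_instance('database', 'features', 'dataset')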
def __init__(self, db, dataset=None, studies=None, features=None,
             reset_db=False, reset_dataset=False, download_data=False):
    """ Initialize instance from a pickled Neurosynth Dataset instance or a
    pair of study and analysis .txt files.

    Args:
        db: the SQLAlchemy database connection to use.
        dataset: an optional filename of a pickled neurosynth Dataset
            instance.
        studies: name of file containing activation data. If passed, a new
            Dataset instance will be constructed.
        features: name of file containing feature data.
        reset_db: if True, will drop and re-create all database tables
            before adding new content. If False (default), will add content
            incrementally.
        reset_dataset: if True, will regenerate the pickled Neurosynth
            dataset.
        download_data: if True, ignores any existing files and downloads
            the latest Neurosynth data files from GitHub.
    """
    if (studies is not None and not os.path.exists(studies)) \
            or settings.RESET_ASSETS:
        print("WARNING: RESETTING ALL NEUROSYNTH ASSETS!")
        self.reset_assets(download_data)

    # Load or create Neurosynth Dataset instance
    if dataset is None or reset_dataset or (
            isinstance(dataset, str) and not os.path.exists(dataset)):
        print("\tInitializing a new Dataset...")
        if (studies is None) or (features is None):
            raise ValueError(
                "To generate a new Dataset instance, both studies and "
                "analyses must be provided.")
        dataset = Dataset(studies)
        dataset.add_features(features)
        dataset.save(settings.PICKLE_DATABASE)
    else:
        print("Loading existing Dataset...")
        dataset = Dataset.load(dataset)
        if features is not None:
            dataset.add_features(features)

    self.dataset = dataset
    self.db = db

    if reset_db:
        print("WARNING: RESETTING DATABASE!!!")
        self.reset_database()
def __init__(self, db, dataset=None, studies=None, features=None,
             reset_db=False, reset_dataset=False, download_data=True):
    """ Initialize instance from a pickled Neurosynth Dataset instance or a
    pair of study and analysis .txt files.

    Args:
        db: the SQLAlchemy database connection to use.
        dataset: an optional filename of a pickled neurosynth Dataset
            instance. Note that the Dataset must contain the list of
            Mappables (i.e., save() must have been called with
            keep_mappables set to True).
        studies: name of file containing activation data. If passed, a new
            Dataset instance will be constructed.
        features: name of file containing feature data.
        reset_db: if True, will drop and re-create all database tables
            before adding new content. If False (default), will add content
            incrementally.
        reset_dataset: if True, will regenerate the pickled Neurosynth
            dataset.
        download_data: if True, ignores any existing files and downloads
            the latest Neurosynth data files from GitHub.
    """
    if (studies is not None and not os.path.exists(studies)) \
            or settings.RESET_ASSETS:
        print "WARNING: RESETTING ALL NEUROSYNTH ASSETS!"
        self.reset_assets(download_data)

    # Load or create Neurosynth Dataset instance
    if dataset is None or reset_dataset or (
            isinstance(dataset, basestring) and not os.path.exists(dataset)):
        print "\tInitializing a new Dataset..."
        if (studies is None) or (features is None):
            raise ValueError(
                "To generate a new Dataset instance, both studies and "
                "analyses must be provided.")
        dataset = Dataset(studies)
        dataset.add_features(features)
        dataset.save(settings.PICKLE_DATABASE, keep_mappables=True)
    else:
        print "\tLoading existing Dataset..."
        dataset = Dataset.load(dataset)
        if features is not None:
            dataset.add_features(features)

    self.dataset = dataset
    self.db = db

    if reset_db:
        print "WARNING: RESETTING DATABASE!!!"
        self.reset_database()
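# A hedged usage sketch for the two constructors documented above. The
# enclosing class is not shown in either snippet, so the name
# NeurosynthManager, the db_session object, and the file paths below are
# purely hypothetical placeholders for illustration.
manager = NeurosynthManager(db=db_session,
                            studies='data/database.txt',
                            features='data/features.txt',
                            reset_db=False,
                            reset_dataset=False)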
def _getdata():
    """Downloads data from neurosynth and returns it as a Dataset.

    Also pickles the dataset for future use."""
    LOG.warning("Downloading and processing Neurosynth database")

    os.makedirs("data", exist_ok=True)
    from neurosynth.base.dataset import download
    download(path="data", unpack=True)

    data = Dataset("data/database.txt")
    data.add_features("data/features.txt")
    data.save("data/dataset.pkl")
    return data
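# A minimal sketch of how the cached pickle might be reused on later runs,
# assuming _getdata() has already been run once so that "data/dataset.pkl"
# exists on disk.
import os
from neurosynth.base.dataset import Dataset

if os.path.isfile("data/dataset.pkl"):
    data = Dataset.load("data/dataset.pkl")  # fast path: reuse the pickled Dataset
else:
    data = _getdata()  # slow path: download and rebuild from the raw .txt files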
def fetch_neurosynth_dataset(data_dir, return_pkl=True):
    """Download the Neurosynth dataset.

    Parameters
    ----------
    data_dir : str
        Directory in which to download the dataset.
    return_pkl : bool
        If True, creates and returns the .pkl file. Otherwise returns the
        dataset and features files.

    Returns
    -------
    str or tuple
        If return_pkl is True, returns the path to the .pkl file. Otherwise
        returns a tuple containing the paths to the database.txt and the
        features.txt files.
    """
    if not os.path.isdir(data_dir):
        os.mkdir(data_dir)

    dataset_file = os.path.join(data_dir, "database.txt")
    if not os.path.isfile(dataset_file):
        logging.info("Downloading the Neurosynth dataset.")
        download(data_dir, unpack=True)
    feature_file = os.path.join(data_dir, "features.txt")

    if return_pkl:
        pkl_file = os.path.join(data_dir, "dataset.pkl")
        if not os.path.isfile(pkl_file):
            logging.info(
                "Converting Neurosynth data to a .pkl file. "
                "This may take a while."
            )
            dataset = Dataset(dataset_file, feature_file)
            dataset.save(pkl_file)
        return pkl_file

    return (dataset_file, feature_file)
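# A short usage sketch for fetch_neurosynth_dataset(); the "neurosynth_data"
# directory name is an illustrative choice, and Dataset is assumed to be
# imported from neurosynth.base.dataset as in the other snippets.
pkl_path = fetch_neurosynth_dataset("neurosynth_data", return_pkl=True)
dataset = Dataset.load(pkl_path)  # load the pickled Dataset for analysis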
# <codecell>

# Create a new Dataset instance
dataset = Dataset('data/database.txt')

# Add some features
dataset.add_features('data/features.txt')

# <markdowncell>

# Because this takes a while, we'll save our Dataset object to disk. That way,
# the next time we want to use it, we won't have to sit through the whole
# creation operation again:

# <codecell>

dataset.save('dataset.pkl')

# <markdowncell>

# Now in future, instead of waiting, we could just load the dataset from file:

# <codecell>

dataset = Dataset.load('dataset.pkl')  # Note the capital D in the second Dataset--load() is a class method

# <markdowncell>

# ## Doing stuff with Neurosynth
# Now that our Dataset has both activation data and some features, we're ready
# to start doing some analyses! By design, Neurosynth focuses on facilitating
# simple, fast, and modestly useful analyses. This means you probably won't
# break any new ground using Neurosynth, but you should be able to supplement
# results you've generated using other approaches with a bunch of nifty
# analyses that take just 2 - 3 lines of code.
#
# ### Simple feature-based meta-analyses
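# <markdowncell>

# A minimal sketch of one such feature-based meta-analysis. It assumes a
# Neurosynth version that exposes `Dataset.get_ids_by_features()` alongside
# `meta.MetaAnalysis` (used elsewhere in this collection); the 'emotion'
# feature and the output prefix are just illustrative choices.

# <codecell>

from neurosynth.analysis import meta

# Grab the IDs of studies tagged with the target feature...
ids = dataset.get_ids_by_features('emotion', threshold=0.001)

# ...then run the meta-analysis and write the resulting images to disk.
ma = meta.MetaAnalysis(dataset, ids)
ma.save_results('.', 'emotion')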
def create_dataset(database_location, feature_location):
    dataset = Dataset(database_location)
    dataset.add_features(feature_location)
    dataset.save('neurosynth-dataset.pkl')
    return dataset
def create_dataset(database_location, feature_location):
    dataset = Dataset(database_location)
    dataset.add_features(feature_location)
    dataset.save('dataset-old.pkl')
    print 'created dataset'
    return dataset
class Neurosynth:
    def __init__(self,
                 datadir='../data/neurosynth',
                 verbose=True,
                 ma_count_thresh=16,
                 meta_image='consistency_z',
                 resolution=3):
        self.dataset = None
        self.concepts = None
        self.concepts_df = None
        self.concept_pmids = {}
        self.datadir = datadir
        self.datafile = os.path.join(datadir, 'database.txt')
        self.verbose = verbose
        self.ma_count_thresh = ma_count_thresh
        self.meta_image = meta_image
        self.resolution = resolution
        self.imagedir_resampled = None
        self.image_concepts = None
        self.desmtx = None

        if not os.path.exists(os.path.join(self.datadir, 'database.txt')):
            print('downloading neurosynth data')
            ns.dataset.download(path='/tmp', unpack=True)
            print('extracting data')
            tfile = tarfile.open("/tmp/current_data.tar.gz", 'r:gz')
            if not os.path.exists(self.datadir):
                os.mkdir(self.datadir)
            tfile.extractall(self.datadir)
            os.remove("/tmp/current_data.tar.gz")
            print('done creating dataset in', self.datadir)

        self.imagedir = os.path.join(self.datadir, 'ma_images')
        if not os.path.exists(self.imagedir):
            os.mkdir(self.imagedir)

    def get_dataset(self, force_load=False):
        if os.path.exists(os.path.join(self.datadir,
                                       'dataset.pkl')) and not force_load:
            print('loading database from',
                  os.path.join(self.datadir, 'dataset.pkl'))
            self.dataset = Dataset.load(
                os.path.join(self.datadir, 'dataset.pkl'))
        else:
            print('loading database - this takes a few minutes')
            self.dataset = Dataset(os.path.join(self.datadir, 'database.txt'))
            self.dataset.add_features(
                os.path.join(self.datadir, 'features.txt'))
            self.dataset.save(os.path.join(self.datadir, 'dataset.pkl'))

    def get_concepts(self, force_load=False):
        if os.path.exists(os.path.join(self.datadir,
                                       'concepts_df.csv')) and not force_load:
            print('using cached cognitive atlas concepts')
            self.concepts_df = pandas.read_csv(
                os.path.join(self.datadir, 'concepts_df.csv'))
        else:
            self.concepts_df = get_concept().pandas
            self.concepts_df.to_csv(
                os.path.join(self.datadir, 'concepts_df.csv'))
        self.concepts = self.concepts_df.name.tolist()

    def get_concept_pmids(self, retmax=2000000, force_load=False):
        # get the pmids for each concept that are in neurosynth
        # for single-word concepts we use the neurosynth search tool
        # for phrases we use pubmed
        if os.path.exists(os.path.join(
                self.datadir, 'concept_pmids.pkl')) and not force_load:
            print('using cached concept_pmids')
            self.concept_pmids = pickle.load(
                open(os.path.join(self.datadir, 'concept_pmids.pkl'), 'rb'))
            return
        print('loading all neurosynth pmids')
        all_neurosynth_ids = self.dataset.image_table.ids.tolist()
        for id in self.concepts:
            time.sleep(0.5)
            handle = Entrez.esearch(db="pubmed",
                                    retmax=retmax,
                                    term='"%s"' % id)
            record = Entrez.read(handle)
            handle.close()
            # make sure we got all the records - rerun if we didn't
            if int(record['Count']) > retmax:
                handle = Entrez.esearch(db="pubmed",
                                        retmax=int(record['Count']),
                                        term='"%s"' % id)
                record = Entrez.read(handle)
                handle.close()
            records_int = [int(i) for i in record['IdList']]
            ns_pmids = intersect(all_neurosynth_ids, records_int)
            print('pubmed found', len(ns_pmids), 'matching pmids for', id)
            self.concept_pmids[id] = ns_pmids
        pickle.dump(
            self.concept_pmids,
            open(os.path.join(self.datadir, 'concept_pmids.pkl'), 'wb'))

    def get_concept_images(self, force_load=False):
        for c in self.concept_pmids.keys():
            if not force_load and os.path.exists(
                    os.path.join(
                        self.imagedir,
                        '%s_specificity_z.nii.gz' % c.replace(' ', '-'))):
                continue
            if len(self.concept_pmids[c]) < self.ma_count_thresh:
                #print('skipping',c,len(self.concept_pmids[c]),'pmids')
                continue
            print('running meta-analysis for', c)
            ma = meta.MetaAnalysis(self.dataset, self.concept_pmids[c])
            ma.save_results(self.imagedir, c.replace(' ', '-'))

        if force_load or not os.path.exists(
                os.path.join(self.imagedir, 'mask_image.nii.gz')):
            # make mask of voxels with zero standard deviation
            concept_images = glob.glob(
                os.path.join(self.imagedir, '*_%s.nii.gz' % self.meta_image))
            imgdata = numpy.zeros((91, 109, 91, len(concept_images)))
            print('loading concept images to compute std')
            for i, c in enumerate(concept_images):
                tmp = nibabel.load(c).get_data()
                imgdata[:, :, :, i] = tmp
            imgstd = numpy.std(imgdata, axis=3)
            maskdata = (imgstd > 0).astype('int')
            maskimg = nibabel.Nifti1Image(maskdata,
                                          affine=nibabel.load(c).affine)
            maskimg.to_filename(
                os.path.join(self.imagedir, 'mask_image.nii.gz'))

    def get_resampled_images(self, shape=None, affine=None, force_load=False):
        # use 3 mm as default
        if not shape:
            shape = [60, 72, 60]
            affine = numpy.array([[-3, 0, 0, 90], [0, 3, 0, -126],
                                  [0, 0, 3, -72], [0, 0, 0, 1]])
        self.resolution = affine[1, 1].astype('int')
        print('resampling data to %d mm' % self.resolution)
        self.imagedir_resampled = os.path.join(
            self.datadir, 'ma_images_%dmm' % self.resolution)
        if not os.path.exists(self.imagedir_resampled):
            os.mkdir(self.imagedir_resampled)
        concept_images = glob.glob(
            os.path.join(self.imagedir, '*_%s.nii.gz' % self.meta_image))
        for c in concept_images:
            if force_load or not os.path.exists(
                    os.path.join(self.imagedir_resampled,
                                 os.path.basename(c))):
                img = nilearn.image.resample_img(c,
                                                 target_affine=affine,
                                                 target_shape=shape)
                img.to_filename(
                    os.path.join(self.imagedir_resampled,
                                 os.path.basename(c)))
        if not os.path.exists(
                os.path.join(self.datadir,
                             'mask_%dmm.nii.gz' % self.resolution)):
            # make MNI mask at chosen resolution
            mask = os.path.join(
                os.environ['FSLDIR'],
                'data/standard/MNI152_T1_2mm_brain_mask.nii.gz')
            maskimg = nilearn.image.resample_img(mask,
                                                 target_affine=affine,
                                                 target_shape=shape)
            maskimg.to_filename(
                os.path.join(self.datadir,
                             'mask_%dmm.nii.gz' % self.resolution))

    def load_concept_images(self, force_load=True):
        concept_images = glob.glob(
            os.path.join(self.imagedir_resampled,
                         '*_%s.nii.gz' % self.meta_image))
        concept_images.sort()
        self.image_concepts = [
            os.path.basename(i).split('_')[0] for i in concept_images
        ]
        if os.path.exists(
                os.path.join(self.datadir,
                             'imgdata_%dmm.npy' % self.resolution)):
            self.imgdata = numpy.load(
                os.path.join(self.datadir,
                             'imgdata_%dmm.npy' % self.resolution))
            # make sure it's the right size
            if self.imgdata.shape[1] == len(concept_images):
                print('using cached concept image data')
                return
        masker = nilearn.input_data.NiftiMasker(
            mask_img=os.path.join(self.datadir,
                                  'mask_%dmm.nii.gz' % self.resolution),
            target_shape=[60, 72, 60],
            target_affine=numpy.array([[-3, 0, 0, 90], [0, 3, 0, -126],
                                       [0, 0, 3, -72], [0, 0, 0, 1]]))
        print('loading concept image data')
        self.imgdata = masker.fit_transform(concept_images)
        numpy.save(
            os.path.join(self.datadir, 'imgdata_%dmm.npy' % self.resolution),
            self.imgdata)

    def save(self):
        with open('%s/neurovault_%dmm.pkl' % (self.datadir, self.resolution),
                  'wb') as f:
            pickle.dump(self, f)

    def build_design_matrix(self, force_load=False):
        if not force_load and os.path.exists(
                os.path.join(self.datadir, 'desmtx.csv')):
            self.desmtx = pandas.DataFrame.from_csv(
                os.path.join(self.datadir, 'desmtx.csv'))
            print('using cached design matrix')
            return
        print('building design matrix')
        all_concept_pmids = []
        for k in self.concept_pmids.keys():
            all_concept_pmids = all_concept_pmids + self.concept_pmids[k]
        all_concept_pmids = list(set(all_concept_pmids))
        all_concept_pmids.sort()
        all_concepts = list(self.concept_pmids.keys())
        self.desmtx = pandas.DataFrame(data=0,
                                       index=all_concept_pmids,
                                       columns=all_concepts)
        for k in self.concept_pmids.keys():
            pmids = self.concept_pmids[k]
            self.desmtx[k][pmids] = 1
        # drop columns with too few matches
        self.desmtx = self.desmtx.ix[:, self.desmtx.sum() > self.ma_count_thresh]
        self.desmtx.to_csv(os.path.join(self.datadir, 'desmtx.csv'))
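# A rough end-to-end usage sketch for the Neurosynth helper class above; the
# call order follows the data dependencies between its methods (dataset ->
# concepts -> PMIDs -> meta-analytic images -> design matrix), and the datadir
# value is just an illustrative choice.
ns_helper = Neurosynth(datadir='../data/neurosynth')
ns_helper.get_dataset()           # load or build the pickled Dataset
ns_helper.get_concepts()          # fetch (or reuse cached) Cognitive Atlas concepts
ns_helper.get_concept_pmids()     # map each concept to Neurosynth PMIDs via PubMed
ns_helper.get_concept_images()    # run one meta-analysis per concept
ns_helper.get_resampled_images()  # resample meta-analytic maps to 3 mm
ns_helper.load_concept_images()   # mask and vectorize the resampled maps
ns_helper.build_design_matrix()   # concept-by-study indicator matrix
ns_helper.save()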
""" Load a Dataset and generate a full set of meta-analysis images--i.e., run a meta-analysis on every single feature. """ neurosynth_data_dir = "/home/data/nbc/misc-projects/niconn-macm/code/neurosynth/" if not op.isfile(op.join(neurosynth_data_dir, "dataset.pkl")): # Create Dataset instance from a database file. dataset = Dataset(op.join(neurosynth_data_dir, "database.txt")) # Load features from file dataset.add_features(op.join(neurosynth_data_dir, "features.txt")) # Pickle the Dataset to file so we can use Dataset.load() next time # instead of having to sit through the generation process again. dataset.save(op.join(neurosynth_data_dir, "dataset.pkl")) # Load pickled Dataset--assumes you've previously saved it. If not, # follow the create_a_new_dataset_and_load_features example. dataset = Dataset.load(op.join(neurosynth_data_dir, "dataset.pkl")) # Get the full list of feature names feature_list = dataset.get_feature_names() # Run a meta-analysis on each feature, and save all the results to # a directory called results. Note that the directory will not be # created for you, so make sure it exists. # Here we use the default frequency threshold of 0.001 (i.e., a # study is said to have a feature if more than 1 in every 1,000 # words is the target word), and an FDR correction level of 0.05. out_dir = "/home/data/nbc/misc-projects/meta-gradients/code/feature_maps"
xfm2vol.inputs.identity = 'fsaverage4'
xfm2vol.inputs.hemi = 'lh'
xfm2vol.inputs.transformed_file = volume_file
xfm2vol.inputs.template_file = template
xfm2vol.run()

#make masks to input into neurosynth
def cluster2masks(clusterfile):
    clustermap = nb.load(clusterfile).get_data()
    for x in range(1, clustermap.max() + 1):
        clustermask = (clustermap == x).astype(int)
        nImg = nb.Nifti1Image(clustermask, None)
        nb.save(nImg,
                os.path.abspath(clusterfile + '_clustermask' + str(x) + '.nii'))

cluster2masks(volume_file)

dataset_file = '/home/raid3/watanabe/neurosynth/data/dataset.pkl'

if not os.path.exists(dataset_file):
    dataset = Dataset('/home/raid3/watanabe/neurosynth/data/database.txt')
    dataset.add_features('/home/raid3/watanabe/neurosynth/data/features.txt')
    dataset.save(dataset_file)
else:
    dataset = cPickle.load(open(dataset_file, 'rb'))

clustermask = volume_file + '_clustermask' + str(3) + '.nii'

ids = dataset.get_ids_by_mask(clustermask)

features = dataset.feature_table.get_features_by_ids(ids)

#mri_surf2vol --identity fsaverage4 --surfval /scr/ilz1/Data/attemptsurface.nii --hemi 'lh' --o /scr/ilz1/Data/results/surf2volume.nii --template /scr/ilz1/Data/freesurfer/fsaverage4/mri/orig.mgz