def load_data(self):
    """ Loads data to set up the classification problem. Most importantly,
    self.data is filled in: a Numpy object array (length = number of
    regions) holding the X and y data for each region. """
    from neurosynth.analysis.reduce import average_within_regions

    all_ids = self.dataset.image_table.ids
    high_thresh = average_within_regions(
        self.dataset, self.mask_img, threshold=self.thresh)
    low_thresh = average_within_regions(
        self.dataset, self.mask_img, threshold=self.thresh_low)

    self.data = np.empty(high_thresh.shape[0], dtype=np.object)
    for i, on_mask in enumerate(high_thresh):
        on_data = self.dataset.get_feature_data(
            ids=np.array(all_ids)[np.where(on_mask == True)[0]]).dropna()

        off_mask = low_thresh[i]
        off_ids = list(
            set(all_ids) -
            set(np.array(all_ids)[np.where(off_mask == True)[0]]))
        off_data = self.dataset.feature_table.get_feature_data(
            ids=off_ids).dropna()

        y = np.array([0] * off_data.shape[0] + [1] * on_data.shape[0])
        X = np.vstack((np.array(off_data), np.array(on_data)))

        from sklearn.preprocessing import scale
        X = scale(X, with_mean=False)

        self.data[i] = (X, y)

    self.feature_names = self.dataset.get_feature_data().columns.tolist()
    self.n_regions = self.data.shape[0]
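# Minimal downstream sketch (an assumption, not part of the loader above):
# once self.data is filled, each region's (X, y) pair can be cross-validated
# independently. Synthetic arrays stand in for one region's data here.
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import cross_val_score  # Python 2-era sklearn

X = np.random.rand(100, 20)        # studies x features, as built above
y = np.array([0] * 50 + [1] * 50)  # off-mask (0) vs. on-mask (1) labels
scores = cross_val_score(GaussianNB(), X, y, cv=4)
print "Mean CV accuracy: %.3f" % scores.mean()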
def test_roi_averaging(self):
    """ Test averaging within region labels in a mask. """
    filename = get_test_data_path() + 'sgacc_mask.nii.gz'
    avg_vox = reduce.average_within_regions(self.dataset, filename)
    n_studies = self.dataset.image_table.data.shape[1]
    self.assertEqual(n_studies, avg_vox.shape[1])
    self.assertGreater(avg_vox.sum(), 0.05)
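# Illustrative sketch (not from the test suite): the assertion above implies
# average_within_regions returns a regions-x-studies matrix. With a threshold
# argument -- as the loaders below use it -- the output marks which studies
# exceed the threshold in each region. `dataset` and the mask filename here
# are assumed to exist.
from neurosynth.analysis import reduce
avg = reduce.average_within_regions(dataset, 'sgacc_mask.nii.gz',
                                    threshold=0.05)
# avg.shape == (n_regions, n_studies); avg[i, j] is nonzero when study j's
# mean activation in region i exceeds the threshold.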
def load_data(self, features):
    """ Loads ids and data for each individual mask """
    print "Loading data from neurosynth..."

    from neurosynth.analysis.reduce import average_within_regions

    if self.mask_img is None:
        self.y = self.dataset.get_image_data()
    elif isinstance(self.mask_img, basestring):
        if self.mask_img.endswith(".pkl"):
            import cPickle
            self.y = cPickle.load(open(self.mask_img, 'rb'))
        else:
            # TODO: add feature to filter by features
            self.y = average_within_regions(self.dataset, self.mask_img)
    else:
        self.y = self.mask_img

    self.mask_num = self.y.shape[0]

    from neurosynth.analysis.classify import regularize
    X = self.dataset.get_feature_data(features=features)
    self.X = regularize(X, method='scale')
    self.set_dims()
def load_mask_data(self, features=None):
    """ Loads ids and data for each individual mask """
    from neurosynth.analysis.reduce import average_within_regions

    # TODO: add feature to filter by features
    masks_by_studies = average_within_regions(
        self.dataset, self.mask_img, threshold=self.thresh)

    study_ids = self.dataset.feature_table.data.index

    print "Loading data from neurosynth..."
    pb = tools.ProgressBar(len(list(masks_by_studies)), start=True)

    self.ids_by_masks = []
    self.data_by_masks = []
    for mask in masks_by_studies:
        m_ids = study_ids[np.where(mask == True)[0]]
        self.ids_by_masks.append(m_ids)
        self.data_by_masks.append(
            self.dataset.get_feature_data(ids=m_ids, features=features))
        pb.next()

    self.mask_num = masks_by_studies.shape[0]
    self.feature_names = self.dataset.get_feature_data(
        features=features).columns.tolist()
def load_data(self, features, X_threshold):
    """ Load data into c_data """
    from neurosynth.analysis.reduce import average_within_regions

    # Load masks-by-studies matrix
    # TODO: add feature to filter by features
    masks_by_studies = average_within_regions(
        self.dataset, self.mask_img, threshold=self.thresh)

    study_ids = self.dataset.feature_table.data.index

    print "Loading data from neurosynth..."
    pb = tools.ProgressBar(len(list(masks_by_studies)), start=True)

    self.ids_by_masks = []
    self.data_by_masks = []
    for mask in masks_by_studies:
        m_ids = study_ids[np.where(mask == True)[0]]
        self.ids_by_masks.append(m_ids)
        self.data_by_masks.append(self.dataset.get_feature_data(ids=m_ids))
        pb.next()

    self.mask_num = masks_by_studies.shape[0]
    self.mask_pairs = list(itertools.permutations(range(0, self.mask_num), 2))

    filename = path.join(mkdtemp(), 'c_data.dat')
    self.c_data = np.memmap(filename, dtype='object', mode='w+',
                            shape=(self.mask_num, self.mask_num))

    # Load data for each ordered pair of masks
    for pair in self.mask_pairs:
        reg1_ids = self.ids_by_masks[pair[0]]
        reg2_ids = self.ids_by_masks[pair[1]]

        # Keep only studies unique to each region
        reg1_set = list(set(reg1_ids) - set(reg2_ids))
        reg2_set = list(set(reg2_ids) - set(reg1_ids))

        x1 = self.data_by_masks[pair[0]]
        x1 = np.array(x1)[np.where(np.in1d(reg1_ids, reg1_set))[0]]

        x2 = self.data_by_masks[pair[1]]
        x2 = np.array(x2)[np.where(np.in1d(reg2_ids, reg2_set))[0]]

        y = np.array([0] * len(reg1_set) + [1] * len(reg2_set))
        X = np.vstack((x1, x2))

        if X_threshold is not None:
            X = binarize(X, X_threshold)

        from neurosynth.analysis.classify import regularize
        X = regularize(X, method='scale')

        self.c_data[pair] = (X, y)

    if self.memsave:
        self.data_by_masks = []
        self.ids_by_masks = []
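# Structure sketch: mask_pairs above holds every ordered pair of regions,
# so each region pair is classified in both directions.
import itertools
pairs = list(itertools.permutations(range(3), 2))
print pairs  # [(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]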
def __init__(self, neurosynth_dataset, image_file_name, word_counts_file):
    self.word_subset = neurosynth_dataset.get_feature_names()
    self.word_string_to_index = {}
    for i, word in enumerate(self.word_subset):
        self.word_string_to_index[word] = i
    self.dois = list(neurosynth_dataset.feature_table.ids)
    self.average_activation = nsar.average_within_regions(
        neurosynth_dataset, image_file_name)
    self.get_word_subset_counts(self.word_subset, word_counts_file)
def save_roi_data(self, atlas='all'):
    """ Save ROI data for all atlases in atlas_list as JSON files in the
    data subfolder. """
    if atlas == 'all':
        for atlas in self.atlas_list:
            res = nr.average_within_regions(self.dataset, atlas)
            j = json.dumps(res.tolist())
            atlas_name = 'data/' + path.basename(atlas)[:-4] + '.json'
            # self.json_files.append(atlas_name)
            with open(atlas_name, 'w') as jfile:
                jfile.write(j)
    elif atlas == 'craddock':
        atlas = self.atlas_list[3]
        res = nr.average_within_regions(self.dataset, atlas)
        j = json.dumps(res.tolist())
        atlas_name = 'data/' + path.basename(atlas)[:-4] + '.json'
        with open(atlas_name, 'w') as jfile:
            jfile.write(j)
def save_roi_data(self, atlas='all'):
    """ Save ROI data for all atlases in atlas_list as JSON files in the
    data subfolder. """
    if atlas == 'all':
        for atlas in self.atlas_list:
            res = nr.average_within_regions(self.dataset, atlas)
            j = json.dumps(res.tolist())
            atlas_name = 'data/' + path.basename(atlas)[:-4] + '.json'
            # self.json_files.append(atlas_name)
            with open(atlas_name, 'w') as jfile:
                jfile.write(j)
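# Round-trip sketch (illustrative; the filename is hypothetical): the saved
# JSON is a nested list, so it reloads directly into an array with the
# original regions-x-studies shape.
import json
import numpy as np
with open('data/some_atlas.json') as jfile:
    res = np.array(json.load(jfile))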
def load_data(self, features, X_threshold):
    """ Load data into c_data """
    # Load data for each mask
    self.load_mask_data(features)

    filename = path.join(mkdtemp(), 'c_data.dat')
    self.c_data = np.memmap(filename, dtype='object', mode='w+',
                            shape=(self.mask_num,))

    all_ids = self.dataset.image_table.ids

    # If a low threshold is set, get ids for studies active at that threshold
    if self.thresh_low is not None:
        ids_by_masks_low = []
        from neurosynth.analysis.reduce import average_within_regions
        masks_by_studies_low = average_within_regions(
            self.dataset, self.mask_img, threshold=self.thresh_low)
        for mask in masks_by_studies_low:
            m_ids = np.array(all_ids)[np.where(mask == True)[0]]
            ids_by_masks_low.append(m_ids)

    # Set up data into c_data
    for num, on_ids in enumerate(self.ids_by_masks):
        # If a low threshold is set, use it to filter off_ids;
        # otherwise use on_ids
        if self.thresh_low is not None:
            off_ids = list(set(all_ids) - set(ids_by_masks_low[num]))
        else:
            off_ids = list(set(all_ids) - set(on_ids))

        on_data = self.data_by_masks[num].dropna()
        off_data = self.dataset.get_feature_data(ids=off_ids).dropna()

        y = np.array([0] * off_data.shape[0] + [1] * on_data.shape[0])
        X = np.vstack((np.array(off_data), np.array(on_data)))

        from neurosynth.analysis.classify import regularize
        X = regularize(X, method='scale')

        if X_threshold is not None:
            X = binarize(X, X_threshold)

        self.c_data[num] = (X, y)

    if self.memsave:
        self.data_by_masks = []
        self.ids_by_masks = []

    self.comparisons = range(0, self.mask_num)
    self.comp_dims = (self.mask_num,)
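# Set-arithmetic sketch of the off_ids logic above: studies active even at
# the lenient low threshold are excluded from the "off" class, leaving an
# unambiguous contrast. The ids here are made up for illustration.
all_ids = ['s1', 's2', 's3', 's4']
low_ids = ['s1', 's2']   # active at the low threshold
off_ids = list(set(all_ids) - set(low_ids))
print sorted(off_ids)    # ['s3', 's4']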
def bootstrap_mv_full(dataset, clf, scorer, mask, features=None,
                      processes=None, boot_n=100, method='combinatorial',
                      outfile=None, thresh_high=0.05, thresh_low=0):
    from neurosynth.analysis.reduce import average_within_regions

    if processes != 1:
        from multiprocessing import Pool
        pool = Pool(processes=processes)
    else:
        pool = itertools

    pb = tools.ProgressBar(boot_n, start=True)

    if method == 'shannons':
        cols = ['shannons', 'region', 'boot_n']
    else:
        cols = ['score', 'num_features', 'feature', 'region', 'boot_n']

    X = dataset.get_feature_data(features=features)
    y_high = average_within_regions(dataset, mask, threshold=thresh_high)
    y_low = average_within_regions(dataset, mask, threshold=thresh_low)

    overall_results = []
    for result in pool.imap(
            bootstrap_mv_full_parallel,
            itertools.izip(itertools.repeat(
                (X, y_high, y_low, clf, scorer, method)), range(boot_n))):
        pb.next()
        if result is not None:
            if method != 'shannons':
                for row in result:
                    overall_results.append(row)
            else:
                overall_results.append(result)

    if outfile is not None:
        pd.DataFrame(overall_results, columns=cols).to_csv(outfile)
        print "Saved"

    overall_results = pd.DataFrame(overall_results, columns=cols)
    overall_results.region += 1

    return overall_results
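# The pool-or-itertools fallback above duck-types imap: in Python 2,
# itertools.imap exposes the same lazy-iteration interface as
# multiprocessing.Pool.imap, so one code path serves both serial and
# parallel runs.
import itertools
pool = itertools                                   # serial stand-in for a Pool
print list(pool.imap(lambda x: x * x, range(4)))   # [0, 1, 4, 9]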
def _roi_association(self, imgs_to_decode, value='z', binarize=None):
    """ Computes the strength of association between activation in a mask
    and the presence/absence of a semantic feature. This is essentially a
    generalization of the voxel-wise reverse inference z-score to the
    multivoxel case. """
    imgs_to_decode = imgs_to_decode.squeeze()
    x = average_within_regions(self.dataset, imgs_to_decode).astype(float)
    y = self.dataset.feature_table.data[self.feature_names].values

    if binarize is not None:
        y[y > binarize] = 1.
        y[y < 1.] = 0.

    r = self._xy_corr(x.T, y)

    if value == 'r':
        return r
    elif value == 'z':
        # Fisher r-to-z transform, scaled by sqrt(n - 3)
        f_r = np.arctanh(r)
        return f_r * np.sqrt(y.shape[0] - 3)
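# Worked sketch of the r-to-z step above: Fisher's transform gives
# z = arctanh(r) * sqrt(n - 3), where n is the number of studies.
import numpy as np
r = np.array([0.1, 0.3])
n = 100
print np.arctanh(r) * np.sqrt(n - 3)  # approx [0.99, 3.05]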
print "Getting ready to cluster"
ns_regions = Parallel(n_jobs=1)(
    delayed(cluster_ward)(dataset, d, r, regions) for d, r in distances)

print "Classifying"

### For each number of regions
for n_ix, n_regions in enumerate(regions):
    ns_matched_regions = zip(*ns_regions)[n_ix]
    all_predictions = []

    ### For each fold, predict activation with topics using the
    ### corresponding clustering
    for fold_i, (train_index, test_index) in enumerate(cver):
        ys = (dataset.feature_table.data.values > 0.001).astype('int').T
        X = average_within_regions(dataset, ns_matched_regions[fold_i]).T

        match_predictions = Parallel(n_jobs=-1)(
            delayed(fit_predict)(classifier, X[train_index, :],
                                 X[test_index, :], y[train_index],
                                 y[test_index]) for y in ys)

        match_predictions = pd.DataFrame(match_predictions,
                                         columns=['y_test', 'y_pred'])
        match_predictions['region'] = dataset.get_feature_names()
        match_predictions['atlas'] = 'ns_%s' % name
        match_predictions['fold'] = fold_i
        match_predictions['n_regions'] = n_regions
        all_predictions.append(match_predictions)

    all_predictions = pd.concat(all_predictions)
    all_predictions.to_pickle(
        '/home/delavega/projects/classification/results/cv_clust_predict/topics_%s_%d.pkl'
        % (name, n_regions))
predictions = []
n_regions = int(match_regions.get_data().max())

### Generate matched clustering for each fold of ns data
ns_regions = Parallel(n_jobs=-1)(
    delayed(cv_cluster)(dataset, reference.data[:, train_index],
                        match_regions, train_index)
    for train_index, _ in cver)

all_predictions = []

### For each fold, predict activation with topics using the
### corresponding clustering
for fold_i, (train_index, test_index) in enumerate(cver):
    X_train = dataset.feature_table.data.iloc[train_index, :]
    X_test = dataset.feature_table.data.iloc[test_index, :]

    ## Predict using matched neurosynth atlas
    ys = average_within_regions(dataset, ns_regions[fold_i],
                                threshold=0.05).astype('int64')
    ns_predictions = Parallel(n_jobs=-1)(
        delayed(fit_predict)(classifier, X_train, X_test,
                             y[train_index], y[test_index]) for y in ys)

    ns_predictions = pd.DataFrame(ns_predictions,
                                  columns=['y_test', 'y_pred'])
    ns_predictions['region'] = range(0, n_regions + 1)
    ns_predictions['atlas'] = 'ns_craddock'
    ns_predictions['fold'] = fold_i
    all_predictions.append(ns_predictions)

    ## Predict using original craddock atlas
    ys = average_within_regions(dataset, match_regions,
                                threshold=0.05).astype('int64')
    match_predictions = Parallel(n_jobs=-1)(
        delayed(fit_predict)(classifier, X_train, X_test,
                             y[train_index], y[test_index]) for y in ys)