def test_hpal_svd_combo(self):
    # get seed dataset
    ds4l = datasets['uni4large']
    ds_orig = ds4l[:, ds4l.a.nonbogus_features]
    # XXX Is this SVD mapping required?
    svm = SVDMapper()
    svm.train(ds_orig)
    ds_svs = svm.forward(ds_orig)
    ds_orig.samples = ds_svs.samples
    nf_true = ds_orig.nfeatures
    n = 4  # # of datasets to generate
    # Adding non-shared dimensions for each subject
    dss_rotated = [[]] * n
    for i in range(n):
        dss_rotated[i] = hstack(
            (ds_orig, ds4l[:, ds4l.a.bogus_features[i * 4:i * 4 + 4]]))
    # rotate data
    nf = dss_rotated[0].nfeatures
    dss_rotated = [random_affine_transformation(dss_rotated[i])
                   for i in xrange(n)]
    # Test if it is close to doing hpal+SVD in sequence outside hpal
    # First, as we do in sequence outside hpal
    ha = Hyperalignment()
    mappers_orig = ha(dss_rotated)
    dss_back = [m.forward(ds_)
                for m, ds_ in zip(mappers_orig, dss_rotated)]
    dss_mean = np.mean([sd.samples for sd in dss_back], axis=0)
    svm = SVDMapper()
    svm.train(dss_mean)
    dss_sv = [svm.forward(sd) for sd in dss_back]
    # Test for SVD dimensionality reduction even with 2 training subjects
    for output_dim in [1, 4]:
        ha = Hyperalignment(output_dim=output_dim)
        ha.train(dss_rotated[:2])
        mappers = ha(dss_rotated)
        dss_back = [m.forward(ds_)
                    for m, ds_ in zip(mappers, dss_rotated)]
        for sd in dss_back:
            assert (sd.nfeatures == output_dim)
        # Check if combined hpal+SVD works as expected
        sv_corrs = []
        for sd1, sd2 in zip(dss_sv, dss_back):
            ndcs = np.diag(np.corrcoef(sd1.samples.T,
                                       sd2.samples.T)[nf:, :nf], k=0)
            sv_corrs.append(ndcs)
        self.assertTrue(
            np.all(np.abs(np.array(sv_corrs)) >= 0.95),
            msg="Hyperalignment with dimensionality reduction should have "
                "reconstructed SVD dataset. Got correlations %s." % sv_corrs)
        # Check if it recovers original SVs
        sv_corrs_orig = []
        for sd in dss_back:
            ndcs = np.diag(np.corrcoef(sd.samples.T,
                                       ds_orig.samples.T)[nf_true:, :nf_true],
                           k=0)
            sv_corrs_orig.append(ndcs)
        self.assertTrue(
            np.all(np.abs(np.array(sv_corrs_orig)) >= 0.9),
            msg="Expected original dimensions after "
                "SVD. Got correlations %s." % sv_corrs_orig)
def get_testdata(self):
    # get a dataset with some prominent trends in it
    ds4l = datasets['uni4large']
    # lets select for now only meaningful features
    ds_orig = ds4l[:, ds4l.a.nonbogus_features]
    zscore(ds_orig, chunks_attr=None)
    n = 4  # # of datasets to generate
    Rs, dss_rotated, dss_rotated_clean = [], [], []
    # now lets compose derived datasets by using some random
    # rotation(s)
    while len(dss_rotated_clean) < n:
        ds_ = random_affine_transformation(ds_orig, scale_fac=1.0,
                                           shift_fac=0.)
        if ds_.a.random_scale <= 0:
            continue
        Rs.append(ds_.a.random_rotation)
        zscore(ds_, chunks_attr=None)
        dss_rotated_clean.append(ds_)
        i = len(dss_rotated_clean) - 1
        ds_2 = hstack(
            [ds_, ds4l[:, ds4l.a.bogus_features[i * 4:i * 4 + 4]]])
        zscore(ds_2, chunks_attr=None)
        dss_rotated.append(ds_2)
    return ds_orig, dss_rotated, dss_rotated_clean, Rs
def test_mergeds():
    data0 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data0.fa['one'] = np.ones(5)
    data1 = Dataset.from_wizard(np.ones((5, 5)), targets=1, chunks=1)
    data1.fa['one'] = np.zeros(5)
    data2 = Dataset.from_wizard(np.ones((3, 5)), targets=2, chunks=1)
    data3 = Dataset.from_wizard(np.ones((4, 5)), targets=2)
    data4 = Dataset.from_wizard(np.ones((2, 5)), targets=3, chunks=2)
    data4.fa['test'] = np.arange(5)

    # cannot merge if there are attributes missing in one of the datasets
    assert_raises(DatasetError, data1.append, data0)

    merged = data1.copy()
    merged.append(data2)

    ok_(merged.nfeatures == 5)
    l12 = [1] * 5 + [2] * 3
    l1 = [1] * 8
    ok_((merged.targets == l12).all())
    ok_((merged.chunks == l1).all())

    data_append = data1.copy()
    data_append.append(data2)

    ok_(data_append.nfeatures == 5)
    ok_((data_append.targets == l12).all())
    ok_((data_append.chunks == l1).all())

    #
    # appending
    #

    # we need the same samples attributes in both datasets
    assert_raises(DatasetError, data2.append, data3)

    #
    # vstacking
    #
    if __debug__:
        # tested only in __debug__
        assert_raises(ValueError, vstack, (data0, data1, data2, data3))
    datasets = (data1, data2, data4)
    merged = vstack(datasets)
    assert_equal(merged.shape,
                 (np.sum([len(ds) for ds in datasets]), data1.nfeatures))
    assert_true('test' in merged.fa)
    assert_array_equal(merged.sa.targets, [1] * 5 + [2] * 3 + [3] * 2)

    #
    # hstacking
    #
    assert_raises(ValueError, hstack, datasets)
    datasets = (data0, data1)
    merged = hstack(datasets)
    assert_equal(merged.shape,
                 (len(data1), np.sum([ds.nfeatures for ds in datasets])))
    assert_true('chunks' in merged.sa)
    assert_array_equal(merged.fa.one, [1] * 5 + [0] * 5)
def test_stack_add_dataset_attributes():
    data0 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data0.a['one'] = np.ones(2)
    data0.a['two'] = 2
    data0.a['three'] = 'three'
    data0.a['common'] = range(10)
    data0.a['array'] = np.arange(10)
    data1 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data1.a['one'] = np.ones(3)
    data1.a['two'] = 3
    data1.a['four'] = 'four'
    data1.a['common'] = range(10)
    data1.a['array'] = np.arange(10)

    vstacker = lambda x: vstack((data0, data1), a=x)
    hstacker = lambda x: hstack((data0, data1), a=x)

    add_params = (1, None, 'unique', 'uniques', 'all', 'drop_nonunique')

    for stacker in (vstacker, hstacker):
        for add_param in add_params:
            if add_param == 'unique':
                assert_raises(DatasetError, stacker, add_param)
                continue

            r = stacker(add_param)

            if add_param == 1:
                assert_array_equal(data1.a.one, r.a.one)
                assert_equal(r.a.two, 3)
                assert_equal(r.a.four, 'four')
                assert_true('three' not in r.a.keys())
                assert_true('array' in r.a.keys())
            elif add_param == 'uniques':
                assert_equal(set(r.a.keys()),
                             set(['one', 'two', 'three',
                                  'four', 'common', 'array']))
                assert_equal(r.a.two, (2, 3))
                assert_equal(r.a.four, ('four',))
            elif add_param == 'all':
                assert_equal(set(r.a.keys()),
                             set(['one', 'two', 'three',
                                  'four', 'common', 'array']))
                assert_equal(r.a.two, (2, 3))
                assert_equal(r.a.three, ('three', None))
            elif add_param == 'drop_nonunique':
                assert_equal(set(r.a.keys()),
                             set(['common', 'three', 'four', 'array']))
                assert_equal(r.a.three, 'three')
                assert_equal(r.a.four, 'four')
                assert_equal(r.a.common, range(10))
                assert_array_equal(r.a.array, np.arange(10))
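# Quick reference for the `a=` dataset-attribute policies exercised above.
# This is a sketch derived from the assertions in
# test_stack_add_dataset_attributes, not from the vstack/hstack docstrings:
#
#   vstack((data0, data1), a=1)          # take .a wholesale from the dataset
#                                        # at index 1 (data0's 'three' is gone)
#   vstack((data0, data1), a='uniques')  # keep every key; values become
#                                        # tuples of unique values per key,
#                                        # e.g. r.a.two == (2, 3),
#                                        # r.a.four == ('four',)
#   vstack((data0, data1), a='all')      # keep every key from every dataset;
#                                        # missing values are padded with
#                                        # None, e.g. r.a.three == ('three', None)
#   vstack((data0, data1), a='drop_nonunique')
#                                        # keep only keys whose values agree
#                                        # across the inputs
#   vstack((data0, data1), a='unique')   # raises DatasetError here because
#                                        # some attribute values differ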
def test_hstack():
    """Additional tests for hstacking of datasets
    """
    ds3d = datasets['3dsmall']
    nf1 = ds3d.nfeatures
    nf3 = 3 * nf1
    ds3dstacked = hstack((ds3d, ds3d, ds3d))
    ok_(ds3dstacked.nfeatures == nf3)
    for fav in ds3dstacked.fa.itervalues():
        v = fav.value
        ok_(len(v) == nf3)
        assert_array_equal(v[:nf1], v[nf1:2 * nf1])
        assert_array_equal(v[2 * nf1:], v[nf1:2 * nf1])
def transform(self, ds):
    ds_ = SampleSlicer(self._selection).transform(ds)

    iterable = [np.unique(ds_.sa[a].value) for a in self._attr]

    ds_stack = []
    for attr in product(*iterable):
        mask = np.ones_like(ds_.targets, dtype=np.bool)

        for i, a in enumerate(attr):
            mask = np.logical_and(mask, ds_.sa[self._attr[i]].value == a)

        ds_stacked = hstack([d for d in ds_[mask]])
        ds_stacked = self.update_attribute(ds_stacked)
        ds_stack.append(ds_stacked)

    return vstack(ds_stack)
def _call(self, ds):
    # local binding
    generator = self._generator
    node = self._node
    ca = self.ca
    space = self.get_space()
    concat_as = self._concat_as

    if self.ca.is_enabled("stats") and (not 'stats' in node.ca
                                        or not node.ca.is_enabled("stats")):
        warning("'stats' conditional attribute was enabled, but "
                "the assigned node '%s' either doesn't support it, "
                "or it is disabled" % node)

    # precharge conditional attributes
    ca.datasets = []

    # run the node on all generated datasets
    results = []
    for i, sds in enumerate(generator.generate(ds) if generator else [ds]):
        if __debug__:
            debug('REPM', "%d-th iteration of %s on %s", (i, self, sds))
        if ca.is_enabled("datasets"):
            # store dataset in ca
            ca.datasets.append(sds)
        # run the beast
        result = node(sds)
        # callback
        if self._callback is not None:
            self._callback(data=sds, node=node, result=result)
        # subclass postprocessing
        result = self._repetition_postcall(sds, node, result)
        if space:
            # XXX maybe try to get something more informative from the
            # processing node (e.g. in 0.5 it used to be 'chunks'->'chunks'
            # to indicate what was trained and what was tested. Now it is
            # more tricky, because `node` could be anything
            result.set_attr(space, (i,))
        # store
        results.append(result)

        if ca.is_enabled("stats") and 'stats' in node.ca \
           and node.ca.is_enabled("stats"):
            if not ca.is_set('stats'):
                # create empty stats container of matching type
                ca.stats = node.ca['stats'].value.__class__()
            # harvest summary stats
            ca['stats'].value.__iadd__(node.ca['stats'].value)

    # charge condition attribute
    self.ca.repetition_results = results

    # stack all results into a single Dataset
    if concat_as == 'samples':
        results = vstack(results, True)
    elif concat_as == 'features':
        results = hstack(results, True)
    else:
        raise ValueError("Unknown concatenation mode '%s'" % concat_as)
    # no need to store the raw results, since the Measure class will
    # automatically store them in a CA
    return results
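# Summary of the `concat_as` stacking modes handled at the end of _call
# (restating the branch above, not a new API):
#   concat_as == 'samples'  -> per-repetition results are vstack-ed, so the
#                              output grows along the sample axis; each block
#                              is tagged with its repetition index via `space`.
#   concat_as == 'features' -> results are hstack-ed along the feature axis.
#   anything else           -> raises ValueError("Unknown concatenation mode ...").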
def hstack(dsets, pad_to_feature_index=None, hstack_method='drop_nonunique',
           set_empty_value=0.):
    '''Stacks NIML datasets while considering node indices

    Parameters
    ----------
    dsets: list
        datasets to be stacked
    pad_to_feature_index: list or int or None
        If a list then it should be of the same length as dsets and
        indicates to which node index the input should be padded. A single
        int means that the same value is used for all dset in dsets. None
        means no padding, and is only allowed for non-sparse datasets.
    hstack_method: str
        How datasets are stacked; see dataset.hstack.
    set_empty_value: float
        Value to which empty (padded) dataset values are set.

    Returns
    -------
    dset: Dataset
        Data combined from all dset in dsets.
    '''
    n = len(dsets)

    # make sure pad_to_feature_index has n values
    if pad_to_feature_index is None or type(pad_to_feature_index) is int:
        pad_to_feature_index = [pad_to_feature_index] * n
    elif len(pad_to_feature_index) != n:
        raise ValueError("illegal pad_to_feature_index: expected list or int")

    # labels that can contain node indices
    node_indices_labels = ('node_indices', 'center_ids', 'ids', 'roi_ids')
    node_indices = []

    # allocate space for output
    padded_dsets = []
    hstack_indices = []
    first_node_index = 0

    for i, (dset, pad_to) in enumerate(zip(dsets, pad_to_feature_index)):
        # get node indices in this dataset
        node_index = _find_node_indices(dset, node_indices_labels)
        if node_index is None:
            node_index = np.arange(dset.nfeatures)
        max_node_index = np.max(node_index)

        # make a stripped version - without node index labels
        stripped_dset = dset.copy()
        for label in node_indices_labels:
            if label in stripped_dset.fa:
                stripped_dset.fa.pop(label)

        # see if padding is needed
        if pad_to is None or pad_to == max_node_index + 1:
            if not np.array_equal(np.arange(max_node_index + 1),
                                  np.sort(node_index)):
                raise ValueError("Sparse input %d: need pad_to input"
                                 % (i + 1))
            padded_dset = stripped_dset
            other_index = np.arange(0)
        else:
            # have to use empty values
            nfeatures_empty = pad_to - dset.nfeatures
            if nfeatures_empty < 0:
                raise ValueError("Dataset has %d features, cannot pad "
                                 "to %d" % (dset.nfeatures, pad_to))

            # make empty array
            empty_arr = np.zeros((dset.nsamples, nfeatures_empty),
                                 dtype=dset.samples.dtype) + set_empty_value
            empty_dset = Dataset(empty_arr,
                                 sa=stripped_dset.sa.copy(deep=True))

            # combine current dset and empty array
            padded_dset = dataset.hstack((stripped_dset, empty_dset),
                                         hstack_method)

            # set the proper node indices
            other_index = np.setdiff1d(np.arange(pad_to), node_index)

        # sanity check to make sure that indices are ok
        # XXX could be more informative
        if len(np.setdiff1d(node_index,
                            np.arange(pad_to or max_node_index + 1))):
            raise ValueError("Illegal indices")

        hstack_index = node_index + first_node_index
        hstack_other_index = other_index + first_node_index

        first_node_index += pad_to or (max_node_index + 1)

        # prepare for next iteration
        padded_dsets.append(padded_dset)
        hstack_indices.append(hstack_index)
        if len(other_index):
            hstack_indices.append(hstack_other_index)

    hstack_dset = dataset.hstack(padded_dsets, hstack_method)
    hstack_indices = np.hstack(hstack_indices)

    hstack_dset.fa[node_indices_labels[0]] = hstack_indices

    return hstack_dset
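# Minimal usage sketch for the node-index-aware hstack above. The dataset
# names and node count are hypothetical, chosen only for illustration:
# assume dset_a and dset_b each carry a sparse fa['node_indices'] over a
# 100-node surface.
#
#   combined = hstack([dset_a, dset_b],
#                     pad_to_feature_index=100,
#                     set_empty_value=0.)
#
# Each input is padded to 100 feature slots, so node i of dset_b lands at
# feature index 100 + i in the result, padded positions are filled with 0.,
# and combined.fa['node_indices'] covers 0..199.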
def _get_connectomes(self, datasets):
    params = self.params
    # If no precomputed connectomes are supplied, compute them.
    if params.connectomes is not None and os.path.exists(params.connectomes):
        _chpaldebug("Loading pre-computed connectomes from %s"
                    % params.connectomes)
        connectomes = h5load(params.connectomes)
        return connectomes
    connectivity_mapper = FxyMapper(params.conn_metric)
    # Initializing datasets with original anatomically aligned datasets
    mfm = MeanFeatureMeasure()
    # TODO Handle seed_radius if seed queryengines are not provided
    seed_radius = params.seed_radius
    _chpaldebug("Performing surface connectivity hyperalignment with seeds")
    _chpaldebug("Computing connectomes.")
    ndatasets = len(datasets)
    if params.seed_queryengines is None:
        raise NotImplementedError("For now, we need seed queryengines.")
    qe_all = super(ConnectivityHyperalignment,
                   self)._get_trained_queryengines(
        datasets, params.seed_queryengines, seed_radius, params.ref_ds)
    # If seed_indices are not supplied, use all as centers
    if not params.seed_indices:
        roi_ids = super(ConnectivityHyperalignment,
                        self)._get_verified_ids(qe_all)
    else:
        roi_ids = params.seed_indices
    if len(qe_all) == 1:
        qe_all *= ndatasets
    # Computing Seed means to be used for aligning seed features
    seed_means = [self._get_seed_means(MeanFeatureMeasure(), qe, ds,
                                       params.seed_indices)
                  for qe, ds in zip(qe_all, datasets)]
    if params.npcs is None:
        conn_targets = []
        for seed_mean in seed_means:
            zscore(seed_mean, chunks_attr=None)
            conn_targets.append(seed_mean)
    else:
        # compute all PC-seed connectivity in each subject
        # 1. make common model SVs in each seed SL based on connectivity
        #    to seed_means
        # 2. Use these SVs for computing connectomes
        _chpaldebug("Aligning SVs in each searchlight across subjects")
        # Looping over all seeds in which SVD is done
        pc_data = [[] for isub in range(ndatasets)]
        sl_common_models = dict()
        if params.common_model is not None \
                and os.path.exists(params.common_model):
            _chpaldebug("Loading common model from %s" % params.common_model)
            common_model = h5load(params.common_model)
            sl_common_models = common_model['local_models']
        for inode in roi_ids:
            # For each SL, computing connectivity of features to seed means
            # This line below doesn't need common model
            sl_connectomes = self._get_sl_connectomes(
                seed_means, qe_all, datasets, inode, connectivity_mapper)
            # Hyperalign connectomes in SL
            # XXX TODO Common model input to below function should be updated.
            local_common_model = sl_common_models[inode][:, :params.npcs] \
                if params.common_model else None
            sl_hmappers, svm, sl_common_model = self._get_hypesvs(
                sl_connectomes, local_common_model=local_common_model)
            if sl_common_model is not None:
                sl_common_models[inode] = sl_common_model
            # make common model SV timeseries data in each subject
            for sd, slhm, qe, pcd in zip(datasets, sl_hmappers, qe_all,
                                         pc_data):
                sd_svs = slhm.forward(sd[:, qe[inode]])
                zscore(sd_svs, chunks_attr=None)
                if svm is not None:
                    sd_svs = svm.forward(sd_svs)
                    sd_svs = sd_svs[:, :params.npcs]
                    zscore(sd_svs, chunks_attr=None)
                pcd.append(sd_svs)
        if params.save_model is not None:
            # TODO: should use debug
            print('Saving local models to %s' % params.save_model)
            h5save(params.save_model, sl_common_models)
        pc_data = [hstack(pcd) for pcd in pc_data]
        conn_targets = pc_data
        #print pc_data[-1]
    # compute connectomes using connectivity targets (PCs or seed means)
    connectomes = []
    if params.common_model is not None \
            and os.path.exists(params.common_model):
        # TODO: should use debug
        print('Loading from saved common model: %s' % params.common_model)
        connectome_model = common_model['connectome_model']
        connectomes.append(connectome_model)
    for t_, ds in zip(conn_targets, datasets):
        connectivity_mapper.train(t_)
        connectome = connectivity_mapper.forward(ds)
        t_ = None
        connectome.fa = ds.fa
        if connectome.samples.dtype == 'float64':
            connectome.samples = connectome.samples.astype('float32')
        zscore(connectome, chunks_attr=None)
        connectomes.append(connectome)
    if params.connectomes is not None \
            and not os.path.exists(params.connectomes):
        _chpaldebug("Saving connectomes to ", params.connectomes)
        h5save(params.connectomes, connectomes)
    return connectomes