def get_cluster_sizes(ds, cluster_counter=None):
    """Compute cluster sizes from all samples in a boolean dataset.

    Individually for each sample in the input dataset, clusters of non-zero
    values will be determined after reverse-applying any transformation of
    the dataset's mapper (if any).

    Parameters
    ----------
    ds : dataset or array
      A dataset with boolean samples.
    cluster_counter : Counter or None
      If not None, the given counter is updated with the cluster sizes
      computed from the present input dataset. Otherwise, a new counter is
      created.

    Returns
    -------
    Counter
      Histogram of cluster sizes from all samples in the input dataset
      (optionally merged with any counts passed via ``cluster_counter``).
    """
    # XXX input needs to be boolean for the cluster size calculation to work
    if cluster_counter is None:
        cluster_counter = Counter()
    mapper = IdentityMapper()
    data = np.asanyarray(ds)
    if hasattr(ds, 'a') and 'mapper' in ds.a:
        mapper = ds.a.mapper
    for i in xrange(len(ds)):
        osamp = mapper.reverse1(data[i])
        m_clusters = _get_map_cluster_sizes(osamp)
        cluster_counter.update(m_clusters)
    return cluster_counter
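# For reference, the per-sample cluster sizing can be reproduced on a plain
# boolean array with scipy.ndimage -- a minimal sketch of what a helper like
# _get_map_cluster_sizes presumably boils down to (the demo name is made up;
# measurements.label/measurements.sum are the actual scipy API used below):
def _demo_map_cluster_sizes(boolmap):
    import numpy as np
    from scipy.ndimage import measurements
    # label connected components of non-zero values
    labels, num = measurements.label(boolmap)
    # size of each labeled cluster (label ids start at 1)
    return measurements.sum(boolmap, labels,
                            index=np.arange(1, num + 1)).astype(int)

# _demo_map_cluster_sizes(np.array([[1, 1, 0, 0],
#                                   [0, 0, 0, 1],
#                                   [0, 0, 1, 1]], dtype=bool))
# -> array([2, 3]): one 2-element and one 3-element cluster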
def test_identity_mapper(s):
    idm = IdentityMapper()
    # doesn't matter what you throw at it
    assert_true(idm.forward(s) is s)
    assert_true(idm.forward1(s) is s)
    assert_true(idm.reverse(s) is s)
    assert_true(idm.reverse1(s) is s)
def test_identity_mapper(s):
    idm = IdentityMapper()
    # doesn't matter what you throw at it
    assert_true(idm.forward(s) is s)
    assert_true(idm.forward1(s) is s)
    assert_true(idm.reverse(s) is s)
    assert_true(idm.reverse1(s) is s)
    # even like this it should work, but type conversion
    # can happen
    assert_array_equal(_verified_reverse1(idm, s), s)
    assert_array_equal(idm.reverse1(s), s)
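# For context: the distinction the comment above draws is that reverse1() may
# hand back the very same object (hence the `is` asserts), while dataset-aware
# callers typically need an ndarray. A wrapper in the spirit of
# _verified_reverse1 -- an assumed sketch only, the module's actual helper
# lives elsewhere -- would look roughly like:
def _verified_reverse1_sketch(mapper, onesample):
    import numpy as np
    # reverse-map a single sample and make sure a usable array comes back;
    # "type conversion can happen", as the test above notes
    return np.asanyarray(mapper.reverse1(onesample))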
def _call(self, ds):
    if len(ds) > 1:
        # average all samples into one, assuming we got something like one
        # sample per subject as input
        avgr = mean_sample()
        ds = avgr(ds)
    # threshold input; at this point we only have one sample left
    thrd = ds.samples[0] > self._thrmap
    # mapper default
    mapper = IdentityMapper()
    # overwrite if possible
    if hasattr(ds, 'a') and 'mapper' in ds.a:
        mapper = ds.a.mapper
    # reverse-map input
    othrd = _verified_reverse1(mapper, thrd)
    # TODO: what is your purpose in life osamp? ;-)
    osamp = _verified_reverse1(mapper, ds.samples[0])
    # prep output dataset
    outds = ds.copy(deep=False)
    outds.fa['featurewise_thresh'] = self._thrmap
    # determine clusters
    labels, num = measurements.label(othrd, structure=np.ones([3, 3, 3]))
    area = measurements.sum(othrd, labels,
                            index=np.arange(1, num + 1)).astype(int)
    com = measurements.center_of_mass(
        osamp, labels=labels, index=np.arange(1, num + 1))
    maxpos = measurements.maximum_position(
        osamp, labels=labels, index=np.arange(1, num + 1))
    # for the rest we need the labels flattened
    labels = mapper.forward1(labels)
    # relabel clusters starting with the biggest and increase index with
    # decreasing size
    ordered_labels = np.zeros(labels.shape, dtype=int)
    ordered_area = np.zeros(area.shape, dtype=int)
    ordered_com = np.zeros((num, len(osamp.shape)), dtype=float)
    ordered_maxpos = np.zeros((num, len(osamp.shape)), dtype=float)
    for i, idx in enumerate(np.argsort(area)):
        ordered_labels[labels == idx + 1] = num - i
        # kinda ugly, but we are looping anyway
        ordered_area[i] = area[idx]
        ordered_com[i] = com[idx]
        ordered_maxpos[i] = maxpos[idx]
    labels = ordered_labels
    area = ordered_area[::-1]
    com = ordered_com[::-1]
    maxpos = ordered_maxpos[::-1]
    del ordered_labels  # this one can be big
    # store cluster labels after forward-mapping
    outds.fa['clusters_featurewise_thresh'] = labels.copy()
    # location info
    outds.a['clusterlocations'] = \
        np.rec.fromarrays(
            [com, maxpos], names=('center_of_mass', 'max'))
    # update cluster size histogram with the actual result to get a
    # proper lower bound for p-values
    # this will make a copy, because the original matrix is int
    cluster_probs_raw = _transform_to_pvals(
        area, self._null_cluster_sizes.astype('float'))
    clusterstats = (
        [area, cluster_probs_raw],
        ['size', 'prob_raw']
    )
    # evaluate a bunch of stats for all clusters
    morestats = {}
    for cid in xrange(len(area)):
        # keep clusters on outer loop, because selection is more expensive
        clvals = ds.samples[0, labels == cid + 1]
        for id_, fx in (('mean', np.mean),
                        ('median', np.median),
                        ('min', np.min),
                        ('max', np.max),
                        ('std', np.std)):
            stats = morestats.get(id_, [])
            stats.append(fx(clvals))
            morestats[id_] = stats
    for k, v in morestats.items():
        clusterstats[0].append(v)
        clusterstats[1].append(k)
    if self.params.multicomp_correction is not None:
        # do a local import as only this tiny portion needs statsmodels
        import statsmodels.stats.multitest as smm
        rej, probs_corr = smm.multipletests(
            cluster_probs_raw,
            alpha=self.params.fwe_rate,
            method=self.params.multicomp_correction)[:2]
        # store corrected per-cluster probabilities
        clusterstats[0].append(probs_corr)
        clusterstats[1].append('prob_corrected')
        # remove cluster labels that did not pass the FWE threshold
        for i, r in enumerate(rej):
            if not r:
                labels[labels == i + 1] = 0
        outds.fa['clusters_fwe_thresh'] = labels
    outds.a['clusterstats'] = \
        np.rec.fromarrays(clusterstats[0], names=clusterstats[1])
    return outds
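# The relabeling loop above is easiest to see on a toy example -- a standalone
# sketch of just that step (values and the demo name are made up):
def _demo_relabel_by_size():
    import numpy as np
    # flattened label map with three clusters: label 1 (size 1),
    # label 2 (size 3), label 3 (size 2); 0 is background
    labels = np.array([2, 2, 2, 0, 3, 3, 0, 1])
    area = np.array([1, 3, 2])  # sizes of labels 1..3
    num = len(area)
    ordered_labels = np.zeros(labels.shape, dtype=int)
    for i, idx in enumerate(np.argsort(area)):
        # smallest cluster gets the highest new label, biggest gets 1
        ordered_labels[labels == idx + 1] = num - i
    return ordered_labels, area[np.argsort(area)][::-1]
    # -> (array([1, 1, 1, 0, 2, 2, 0, 3]), array([3, 2, 1]))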
def nohyper(dss):
    return [IdentityMapper() for ds in dss]
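# nohyper matches the `hyper` callable contract of timesegments_classification
# below (called on a list of training datasets, returns one trained mapper per
# dataset), so it can serve as a no-alignment baseline, e.g.:
#
#     errors = timesegments_classification(dss, hyper=nohyper)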
def timesegments_classification(dss,
                                hyper=None,
                                part1=HalfPartitioner(),
                                part2=NFoldPartitioner(attr='subjects'),
                                window_size=6,
                                overlapping_windows=True,
                                distance='correlation',
                                do_zscore=True):
    """Time-segment classification across subjects using Hyperalignment

    Parameters
    ----------
    dss : list of datasets
      Datasets to benchmark on. Usually a single dataset per subject.
    hyper : Hyperalignment-like, optional
      Beast which, if called on a list of datasets, should spit out trained
      mappers. If not specified, `IdentityMapper`s will be used.
    part1 : Partitioner, optional
      Partitioner to split data for hyperalignment "cross-validation".
    part2 : Partitioner, optional
      Partitioner for CV within the hyperalignment test split.
    window_size : int, optional
      How many temporal points to consider for a classification sample.
    overlapping_windows : bool, optional
      Strategy for how to create and classify "samples" for classification.
      If True -- `window_size` samples from each time point (except trailing
      ones) constitute a sample, and upon "predict" the `window_size` samples
      around each test point are not considered. If False -- samples are just
      taken (with training and testing splits) at `window_size` steps from
      one another.
    do_zscore : bool, optional
      Perform zscoring (overall, not per-chunk) for each dataset upon
      partitioning with part1.
    ...
    """
    # Generate outer-most partitioning
    parts = [copy.deepcopy(part1).generate(ds) for ds in dss]

    iter = 1
    errors = []
    while True:
        try:
            dss_partitioned = [p.next() for p in parts]
        except StopIteration:
            # we are done -- no more partitions
            break
        if __debug__:
            debug("BM", "Iteration %d", iter)

        dss_train, dss_test = zip(*[
            list(Splitter("partitions").generate(ds))
            for ds in dss_partitioned
        ])

        # TODO: allow for doing feature selection
        if do_zscore:
            for ds in dss_train + dss_test:
                zscore(ds, chunks_attr=None)

        if hyper is not None:
            # since otherwise it would remember previous loop dataset as the
            # "commonspace"
            # Now let's do hyperalignment but on a copy in each loop iteration
            hyper_ = copy.deepcopy(hyper)
            mappers = hyper_(dss_train)
        else:
            mappers = [IdentityMapper() for ds in dss_train]

        dss_test_aligned = [
            mapper.forward(ds) for mapper, ds in zip(mappers, dss_test)
        ]

        # assign .sa.subjects to those datasets
        for i, ds in enumerate(dss_test_aligned):
            # part2.attr is by default "subjects"
            ds.sa[part2.attr] = [i]

        dss_test_bc = []
        for ds in dss_test_aligned:
            if overlapping_windows:
                startpoints = range(len(ds) - window_size + 1)
            else:
                startpoints = _get_nonoverlapping_startpoints(
                    len(ds), window_size)
            bm = BoxcarMapper(startpoints, window_size)
            bm.train(ds)
            ds_ = bm.forward(ds)
            ds_.sa['startpoints'] = startpoints

            # reassign subjects so they are not arrays
            def assign_unique(ds, sa):
                ds.sa[sa] = [
                    np.asscalar(np.unique(x)) for x in ds.sa[sa].value
                ]
            assign_unique(ds_, part2.attr)

            fm = FlattenMapper()
            fm.train(ds_)
            dss_test_bc.append(ds_.get_mapped(fm))

        ds_test = vstack(dss_test_bc)
        # Perform classification across subjects comparing against mean
        # spatio-temporal pattern of other subjects
        errors_across_subjects = []
        for ds_test_part in part2.generate(ds_test):
            ds_train_, ds_test_ = list(
                Splitter("partitions").generate(ds_test_part))
            # average across subjects to get a representative pattern per
            # timepoint
            ds_train_ = mean_group_sample(['startpoints'])(ds_train_)
            assert (ds_train_.shape == ds_test_.shape)

            if distance == 'correlation':
                # TODO: redo more efficiently since now we are creating full
                # corrcoef matrix. Also we might better just take a name for
                # the pdist measure but then implement them efficiently
                # (i.e. without hstacking both pieces together first)
                dist = 1 - np.corrcoef(
                    ds_train_, ds_test_)[len(ds_test_):, :len(ds_test_)]
            else:
                raise NotImplementedError

            if overlapping_windows:
                dist = wipe_out_offdiag(dist, window_size)

            winners = np.argmin(dist, axis=1)
            error = np.mean(winners != np.arange(len(winners)))
            errors_across_subjects.append(error)
        errors.append(errors_across_subjects)
        iter += 1

    errors = np.array(errors)
    if __debug__:
        debug(
            "BM",
            "Finished with %s array of errors. Mean error %.2f"
            % (errors.shape, np.mean(errors)))
    return errors
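# The correlation-distance block above reduces to the following on plain
# arrays -- a minimal, self-contained sketch of just the distance and
# winner-take-all step (the demo name and toy data are made up):
def _demo_timesegment_error(train, test):
    import numpy as np
    # train/test: (n_segments, n_features), one row per time segment
    n = len(test)
    # correlation distance between every test (rows) and train (cols) segment
    dist = 1 - np.corrcoef(train, test)[n:, :n]
    # each test segment should be closest to its own training segment
    winners = np.argmin(dist, axis=1)
    return np.mean(winners != np.arange(n))

# rng = np.random.RandomState(0)
# train = rng.randn(5, 20)
# test = train + 0.1 * rng.randn(5, 20)   # noisy copies of the same segments
# _demo_timesegment_error(train, test)    # -> 0.0 (or near it)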
def _call(self, ds):
    if len(ds) > 1:
        # average all samples into one, assuming we got something like one
        # sample per subject as input
        avgr = mean_sample()
        ds = avgr(ds)
    # threshold input; at this point we only have one sample left
    thrd = ds.samples[0] > self._thrmap
    # mapper default
    mapper = IdentityMapper()
    # overwrite if possible
    if hasattr(ds, 'a') and 'mapper' in ds.a:
        mapper = ds.a.mapper
    # reverse-map input
    osamp = mapper.reverse1(thrd)
    # prep output dataset
    outds = ds.copy(deep=False)
    outds.fa['featurewise_thresh'] = self._thrmap
    # determine clusters
    labels, num = measurements.label(osamp)
    area = measurements.sum(osamp, labels,
                            index=np.arange(1, num + 1)).astype(int)
    # for the rest we need the labels flattened
    labels = mapper.forward1(labels)
    # relabel clusters starting with the biggest and increase index with
    # decreasing size
    ordered_labels = np.zeros(labels.shape, dtype=int)
    ordered_area = np.zeros(area.shape, dtype=int)
    for i, idx in enumerate(np.argsort(area)):
        ordered_labels[labels == idx + 1] = num - i
        ordered_area[i] = area[idx]
    area = ordered_area[::-1]
    labels = ordered_labels
    del ordered_labels  # this one can be big
    # store cluster labels after forward-mapping
    outds.fa['clusters_featurewise_thresh'] = labels.copy()
    # update cluster size histogram with the actual result to get a
    # proper lower bound for p-values
    # this will make a copy, because the original matrix is int
    cluster_probs_raw = _transform_to_pvals(
        area, self._null_cluster_sizes.astype('float'))
    if self.params.multicomp_correction is None:
        probs_corr = np.array(cluster_probs_raw)
        rej = probs_corr <= self.params.fwe_rate
        outds.a['clusterstats'] = \
            np.rec.fromarrays(
                [area, cluster_probs_raw], names=('size', 'prob_raw'))
    else:
        # do a local import as only this tiny portion needs statsmodels
        import statsmodels.stats.multitest as smm
        rej, probs_corr = smm.multipletests(
            cluster_probs_raw,
            alpha=self.params.fwe_rate,
            method=self.params.multicomp_correction)[:2]
        # store corrected per-cluster probabilities
        outds.a['clusterstats'] = \
            np.rec.fromarrays(
                [area, cluster_probs_raw, probs_corr],
                names=('size', 'prob_raw', 'prob_corrected'))
    # remove cluster labels that did not pass the FWE threshold
    for i, r in enumerate(rej):
        if not r:
            labels[labels == i + 1] = 0
    outds.fa['clusters_fwe_thresh'] = labels
    return outds
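# The size -> p-value conversion above relies on the accumulated null
# distribution of cluster sizes. An assumed sketch of the right-tail logic
# follows (the module's _transform_to_pvals operates on its own histogram
# representation, not a flat list of null sizes as here):
def _demo_pvals_from_null(sizes, null_sizes):
    import numpy as np
    # right-tail p: fraction of null clusters at least as large as observed
    null_sizes = np.sort(np.asarray(null_sizes, dtype=float))
    n = float(len(null_sizes))
    return np.array([(n - np.searchsorted(null_sizes, s)) / n
                     for s in sizes])

# The subsequent FWE step is plain statsmodels, exactly as in _call above,
# with e.g. 'fdr_bh' as one of multipletests' supported method names:
#   rej, probs_corr = smm.multipletests(probs_raw, alpha=0.05,
#                                       method='fdr_bh')[:2]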