def get_encoded_dataset(self, encoder_fn, selectors):
    """
    Encode the selected trials of the full dataset with encoder_fn.

    This version is intended for use with a data dict / indices.
    :return: flattened features X (2d), class labels Y (1d), and trial metadata
    """
    from deepthought.datasets.selection import DatasetMetaDB
    from deepthought.util.function_util import process_dataset
    import theano

    # build lookup structure
    metadb = DatasetMetaDB(self.full_meta, selectors.keys())

    # get selected trial IDs
    selected_trial_ids = metadb.select(selectors)

    X, Y = process_dataset(self.full_hdf5, encoder_fn,
                           indices=selected_trial_ids,
                           input_sources=['indices'],
                           target_source=self.hyper_params['classification_target_source'])

    meta = [self.full_meta[i] for i in selected_trial_ids]

    # flatten X to 2d and convert one-hot Y to 1d class indices
    X = np.asarray(X, dtype=theano.config.floatX)
    X = X.reshape(X.shape[0], np.prod(X.shape[1:]))
    Y = Y.argmax(axis=1)

    return X, Y, meta
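# Illustrative sketch (not part of the original code): the flatten / argmax step
# above, shown on synthetic numpy arrays so it can be run without any deepthought
# data or encoder. The shapes used here are assumptions for illustration only.
def _example_flatten_and_argmax():
    import numpy as np
    X = np.random.rand(4, 2, 3)    # e.g. (trials, channels, samples)
    Y = np.eye(3)[[0, 2, 1, 0]]    # one-hot targets for 4 trials, 3 classes
    X_flat = X.reshape(X.shape[0], np.prod(X.shape[1:]))  # -> shape (4, 6)
    y = Y.argmax(axis=1)           # one-hot -> class indices: [0, 2, 1, 0]
    return X_flat, y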
def get_dataset(hdf5name, selectors=None, sources=('features', 'targets', 'subjects')):
    if selectors is None:
        selectors = {}

    # load metadata
    import deepthought.util.fs_util as fs_util
    base_meta = fs_util.load(hdf5name + '.meta.pklz')

    # build lookup structure
    from deepthought.datasets.selection import DatasetMetaDB
    metadb = DatasetMetaDB(base_meta, selectors.keys())

    # get selected trial IDs
    selected_trial_ids = metadb.select(selectors)

    log.debug('selectors: {}'.format(selectors))
    log.debug('selected trials: {}'.format(selected_trial_ids))
    log.debug('selected sources: {}'.format(sources))

    # load data and generate metadata
    from fuel.datasets.hdf5 import H5PYDataset
    hdf5 = H5PYDataset(hdf5name, which_sets=('all',),
                       subset=selected_trial_ids,
                       load_in_memory=True, sources=sources)
    meta = [base_meta[i] for i in selected_trial_ids]

    log.debug('number of examples: {}'.format(hdf5.num_examples))

    return hdf5, meta
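# Usage sketch for get_dataset() (illustrative, not from the original source):
# assumes an HDF5 file 'eeg_data.hdf5' with a matching 'eeg_data.hdf5.meta.pklz'
# metadata pickle, and that the metadata entries carry a 'subject' attribute.
# Giving selector values as lists of accepted values is an assumption here.
def _example_get_dataset():
    hdf5, meta = get_dataset('eeg_data.hdf5',
                             selectors={'subject': [1, 2]},
                             sources=('features', 'targets'))
    log.info('loaded {} examples'.format(hdf5.num_examples))
    return hdf5, meta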
def __init__(self, db, selectors):
    metadb = DatasetMetaDB(db.metadata, selectors.keys())
    selected_trial_ids = metadb.select(selectors)

    self.data = [db.data[i] for i in selected_trial_ids]
    self.metadata = [db.metadata[i] for i in selected_trial_ids]

    if hasattr(db, 'targets'):
        if db.targets is None:
            self.targets = None
        else:
            self.targets = [db.targets[i] for i in selected_trial_ids]
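# Illustrative sketch of the DatasetMetaDB selection pattern used throughout this
# module, with synthetic metadata. The matching semantics shown here (selector
# values given as lists of accepted values) are an assumption.
def _example_metadb_select():
    from deepthought.datasets.selection import DatasetMetaDB
    metadata = [{'subject': 1, 'condition': 'rest'},
                {'subject': 2, 'condition': 'rest'},
                {'subject': 1, 'condition': 'task'}]
    selectors = {'subject': [1]}
    metadb = DatasetMetaDB(metadata, selectors.keys())
    return metadb.select(selectors)  # expected: ids of the subject-1 trials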
def __init__(self, root_path, selectors=dict()):
    # read metadata file: dict filename -> metadata
    meta_map = load(os.path.join(root_path, 'metadata_db.pklz'))
    filenames = list(meta_map.keys())
    metadata = [meta_map[fn] for fn in filenames]

    # filter files by metadata selectors
    metadb = DatasetMetaDB(metadata, selectors.keys())
    selected_file_ids = metadb.select(selectors)
    # log.info('selected files: {}'.format(selected_file_ids))

    # load selected files
    self.data = []
    self.metadata = []
    for id in selected_file_ids:
        log.debug('loading data file #{} {}'.format(id, filenames[id]))
        f_data, f_metadata = load(os.path.join(root_path, filenames[id]))
        self.data.append(f_data)
        self.metadata.append(metadata[id])

    print len(self.data), len(self.metadata)
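# Hypothetical sketch of how a 'metadata_db.pklz' consumed above could be
# produced. The layout (dict: data filename -> metadata dict) is inferred from
# the loader; the file and attribute names used here are illustrative only.
def _example_write_metadata_db(root_path):
    import os
    import deepthought.util.fs_util as fs_util
    meta_map = {
        'trial_0001.pklz': {'subject': 1, 'trial_no': 1},
        'trial_0002.pklz': {'subject': 1, 'trial_no': 2},
    }
    fs_util.save(os.path.join(root_path, 'metadata_db.pklz'), meta_map)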
def __init__(self,
             db,                        # data source
             name='',                   # optional name
             selectors=dict(),
             partitioner=None,
             meta_sources=[],           # optional sources other than 'features' and 'targets' from metadata
             channel_filter=NoChannelFilter(),  # optional channel filter, default: keep all
             channel_names=None,        # optional channel names (for metadata)
             label_attribute='label',   # metadata attribute to be used as label
             label_map=None,            # optional conversion of labels
             use_targets=True,          # use targets if provided, otherwise labels are used
             remove_dc_offset=False,    # optional subtraction of channel mean, usually done already earlier
             resample=None,             # optional down-sampling
             normalize=True,            # normalize to max=1
             # optional selection of sub-sequences
             start_sample=0,
             stop_sample=None,
             zero_padding=True,         # if True (default), trials that are too short will be padded with zeros,
                                        # otherwise they will be rejected
             signal_filter=None,        # optional signal filter to be applied before splitting the signal
             trial_processors=[],       # optional processing of the trials
             target_processor=None,     # optional processing of the targets, e.g. zero-padding
             transformers=[],           # optional transformations of the dataset
             layout='tf',               # (0,1)-axes layout: tf=time x features or ft=features x time
             debug=False,
             ):
    '''
    Constructor
    '''

    # save params
    self.params = locals().copy()
    del self.params['self']
    # print self.params

    self.name = name
    self.debug = debug

    metadb = DatasetMetaDB(db.metadata, selectors.keys())

    if partitioner is not None:
        pass  # FIXME

    selected_trial_ids = metadb.select(selectors)
    log.info('selectors: {}'.format(selectors))
    log.info('selected trials: {}'.format(selected_trial_ids))

    if normalize:
        log.info('Data will be normalized to max amplitude 1 per channel (normalize=True).')

    trials = list()
    labels = list()
    targets = list()
    meta = list()

    if stop_sample == 'auto-min':
        stop_sample = np.min([db.data[trial_i].shape[-1] for trial_i in selected_trial_ids])
        log.info('Using minimum trial length. stop_sample={}'.format(stop_sample))
    elif stop_sample == 'auto-max':
        stop_sample = np.max([db.data[trial_i].shape[-1] for trial_i in selected_trial_ids])
        log.info('Using maximum trial length. stop_sample={}'.format(stop_sample))

    for trial_i in selected_trial_ids:

        trial_meta = db.metadata[trial_i]

        if use_targets:
            if db.targets is None:
                target = None
            else:
                target = db.targets[trial_i]
                assert not np.isnan(np.sum(target))

            if target_processor is not None:
                target = target_processor.process(target, trial_meta)
                assert not np.isnan(np.sum(target))
        else:
            # get and process label
            label = db.metadata[trial_i][label_attribute]
            if label_map is not None:
                label = label_map[label]

        processed_trial = []

        trial = db.data[trial_i]

        if np.isnan(np.sum(trial)):
            print trial_i, trial
        assert not np.isnan(np.sum(trial))

        rejected = False  # flag for trial rejection

        trial = np.atleast_2d(trial)

        # process 1 channel at a time
        for channel in xrange(trial.shape[0]):
            # filter channels
            if not channel_filter.keep_channel(channel):
                continue

            samples = trial[channel, :]

            # subtract channel mean
            if remove_dc_offset:
                samples -= samples.mean()

            # down-sample if requested
            if resample is not None and resample[0] != resample[1]:
                samples = librosa.resample(samples, resample[0], resample[1], res_type='sinc_best')

            # apply optional signal filter after down-sampling -> requires lower order
            if signal_filter is not None:
                samples = signal_filter.process(samples)

            # get sub-sequence in resampled space
            # log.info('using samples {}..{} of {}'.format(start_sample, stop_sample, samples.shape))

            if stop_sample is not None and stop_sample > len(samples):
                if zero_padding:
                    tmp = np.zeros(stop_sample)
                    tmp[:len(samples)] = samples
                    samples = tmp
                else:
                    rejected = True
                    break  # stop processing this trial

            s = samples[start_sample:stop_sample]

            # TODO optional channel processing

            # normalize to max amplitude 1
            if normalize:
                s = librosa.util.normalize(s)

            # add 2nd data dimension
            s = s.reshape(s.shape[0], 1)
            # print s.shape

            s = np.asfarray(s, dtype=theano.config.floatX)
            processed_trial.append(s)

        ### end of channel iteration ###

        if rejected:
            continue  # next trial

        processed_trial = np.asfarray([processed_trial], dtype=theano.config.floatX)
        # processed_trial = processed_trial.reshape((1, processed_trial.shape))
        processed_trial = np.rollaxis(processed_trial, 1, 4)

        # optional (external) trial processing, e.g. windowing
        # trials will be in b01c format with tf layout for 01-axes
        for trial_processor in trial_processors:
            processed_trial = trial_processor.process(processed_trial, trial_meta)

        trials.append(processed_trial)

        for k in range(len(processed_trial)):
            meta.append(trial_meta)

            if use_targets:
                targets.append(target)
            else:
                labels.append(label)

    ### end of datafile iteration ###

    # turn into numpy arrays
    self.trials = np.vstack(trials)
    assert not np.isnan(np.sum(self.trials))

    # prepare targets / labels
    if use_targets:
        self.targets = np.vstack(targets)
        assert not np.isnan(np.sum(self.targets))
    else:
        labels = np.hstack(labels)
        if label_map is None:
            one_hot_formatter = OneHotFormatter(max(labels) + 1)
        else:
            one_hot_formatter = OneHotFormatter(max(label_map.values()) + 1)
        one_hot_y = one_hot_formatter.format(labels)
        self.targets = one_hot_y

    self.metadata = meta

    if layout == 'ft':  # swap axes to (batch, feature, time, channels)
        self.trials = self.trials.swapaxes(1, 2)

    # transform after finalizing the data structure
    for transformer in transformers:
        self.trials, self.targets = transformer.process(self.trials, self.targets)

    self.trials = np.asarray(self.trials, dtype=theano.config.floatX)

    log.debug('final dataset shape: {} (b,0,1,c)'.format(self.trials.shape))
    # super(EEGEpochsDataset, self).__init__(topo_view=self.trials, y=self.targets, axes=['b', 0, 1, 'c'])
    self.X = self.trials.reshape(self.trials.shape[0], np.prod(self.trials.shape[1:]))
    self.y = self.targets
    log.info('generated dataset "{}" with shape X={}={} y={} targets={} '.
             format(self.name, self.X.shape, self.trials.shape, self.y.shape, self.targets.shape))

    # determine data specs
    features_space = Conv2DSpace(
        shape=[self.trials.shape[1], self.trials.shape[2]],
        num_channels=self.trials.shape[3]
    )
    features_source = 'features'

    targets_space = VectorSpace(dim=self.targets.shape[-1])
    targets_source = 'targets'

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]

    # additional support for meta information
    self.meta_maps = dict()
    for meta_source in meta_sources:
        self.meta_maps[meta_source] = sorted(list(set([m[meta_source] for m in self.metadata])))
        space_components.extend([VectorSpace(dim=1)])
        source_components.extend([meta_source])
        log.info('Generated meta-source "{}" with value map: {}'
                 .format(meta_source, self.meta_maps[meta_source]))

    space = CompositeSpace(space_components)
    source = tuple(source_components)

    self.data_specs = (space, source)
    log.debug('data specs: {}'.format(self.data_specs))
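# Usage sketch (illustrative, not from the original source): constructing the
# epochs dataset defined above. The class name EEGEpochsDataset is taken from
# the commented-out super() call; the 'subject' selector key and the db
# container (providing .data, .metadata and optionally .targets) are assumptions.
def _example_build_epochs_dataset(db):
    dataset = EEGEpochsDataset(db,
                               name='demo',
                               selectors={'subject': [1]},
                               use_targets=False,       # derive one-hot targets from labels
                               label_attribute='label',
                               stop_sample='auto-min',  # crop all trials to the shortest one
                               layout='tf')
    return dataset.X, dataset.y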
def run(self, classifiers=(), verbose=False, debug=False):
    print 'running job #{}'.format(self.job_id)

    import deepthought.util.fs_util as fs_util
    fs_util.ensure_dir_exists(self.output_path)
    print 'output path: ', self.output_path

    # prepare result objects
    results = {k: ClassificationResult(k) for (k, _) in classifiers}

    # load full dataset with all sources only once!
    from deepthought.datasets.hdf5 import get_dataset
    self.full_hdf5, self.full_meta = get_dataset(self.hdf5name, selectors=self.base_selectors, sources=None)

    self.initialize()

    ### main loop ###

    # outer cross-validation
    outer_folds = self.fold_generator.get_outer_cv_folds()
    for ofi, ofold in enumerate(outer_folds):
        print 'processing outer fold', ofold

        ### phase I : pre-train features ###
        encoder_fn = self.pretrain_encoder(ofi, ofold)  # FIXME: add params

        ### phase II : classify ###
        train_selectors = self.fold_generator.get_fold_selectors(outer_fold=ofold['train'])
        X_train, Y_train, meta_train = self.get_encoded_dataset(encoder_fn, train_selectors)

        test_selectors = self.fold_generator.get_fold_selectors(outer_fold=ofold['valid'])
        X_test, Y_test, _ = self.get_encoded_dataset(encoder_fn, test_selectors)

        for (classifier_name, classifier_factory) in classifiers:
            result = results[classifier_name]

            model_prefix = os.path.join(self.output_path, '{}_fold_{}'.format(classifier_name, ofi))

            # generate index folds
            idx_folds = []
            from deepthought.datasets.selection import DatasetMetaDB
            for ifold in self.fold_generator.get_inner_cv_folds(ofold):
                train_selectors = self.fold_generator.get_fold_selectors(
                    outer_fold=ofold['train'], inner_fold=ifold['train'])
                metadb = DatasetMetaDB(meta_train, train_selectors.keys())

                if 'valid' in ifold.keys():
                    valid_selectors = self.fold_generator.get_fold_selectors(
                        outer_fold=ofold['train'], inner_fold=ifold['valid'])
                else:
                    valid_selectors = None

                if debug:
                    print 'train_selectors:', train_selectors
                    print 'valid_selectors:', valid_selectors

                # get selected trial IDs
                train_idx = metadb.select(train_selectors)
                if valid_selectors is not None:
                    valid_idx = metadb.select(valid_selectors)
                else:
                    valid_idx = []

                idx_folds.append((train_idx, valid_idx))

            if debug:
                print idx_folds  # print the generated folds before running the classifier

            # train classifier
            classifier, predict_fn = classifier_factory.train(
                X_train, Y_train, idx_folds, self.hyper_params, model_prefix)

            # test classifier
            train_Y_pred = predict_fn(X_train)
            test_Y_pred = predict_fn(X_test)

            # append to result
            result.append_train(Y_train, train_Y_pred)
            result.append_test(Y_test, test_Y_pred)

            # result.fold_scores.append(classifier.score(X_test, Y_test))
            result.fold_scores.append(np.mean(Y_test == test_Y_pred))

            if verbose:
                print '{} results for fold {}'.format(classifier_name, ofold)
                print classification_report(Y_test, test_Y_pred)
                print confusion_matrix(Y_test, test_Y_pred)
                print 'overall test accuracy so far:', 1 - result.test_error()

    print 'all folds completed'

    for (classifier_name, _) in classifiers:
        result = results[classifier_name]

        fs_util.save(os.path.join(self.output_path, '{}_result.pklz'.format(classifier_name)), result)

        # print result summary
        print
        print 'SUMMARY for classifier', classifier_name
        print
        print 'fold scores: ', np.asarray(result.fold_scores)
        print
        print classification_report(result.test_Y_real, result.test_Y_pred)
        print confusion_matrix(result.test_Y_real, result.test_Y_pred)
        print
        print 'train accuracy:', 1 - result.train_error()
        print 'test accuracy :', 1 - result.test_error()

    return [results[classifier[0]].test_error() for classifier in classifiers]  # error for each classifier
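# Hypothetical classifier factory matching the interface expected by run():
# train() receives (X_train, Y_train, idx_folds, hyper_params, model_prefix) and
# must return (classifier, predict_fn). The scikit-learn model used below is an
# illustrative assumption; the inner-CV index folds and model_prefix are ignored.
class _ExampleLogRegFactory(object):
    def train(self, X_train, Y_train, idx_folds, hyper_params, model_prefix):
        from sklearn.linear_model import LogisticRegression
        clf = LogisticRegression()
        clf.fit(X_train, Y_train)
        return clf, clf.predict
# e.g. (hypothetical): job.run(classifiers=[('logreg', _ExampleLogRegFactory())], verbose=True)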
def __init__(self, dataset, dataset_metadata,
             base_selectors=None, ext_selectors=None,
             targets_source='targets',
             group_attribute=None,
             allow_self_comparison=False,
             additional_sources=None,
             **kwargs):
    if base_selectors is None:
        base_selectors = {}
    if additional_sources is None:
        additional_sources = []

    # get selected trial IDs
    from deepthought.datasets.selection import DatasetMetaDB
    metadb = DatasetMetaDB(dataset_metadata, base_selectors.keys())
    base_trial_ids = metadb.select(base_selectors)
    log.debug('base selectors: {}'.format(base_selectors))
    log.debug('selected base trials: {}'.format(base_trial_ids))

    if ext_selectors is not None:
        split_index = len(base_trial_ids)
        metadb = DatasetMetaDB(dataset_metadata, ext_selectors.keys())
        ext_trial_ids = metadb.select(ext_selectors)
    else:
        split_index = 0
        ext_trial_ids = []
    log.debug('ext selectors: {}'.format(ext_selectors))
    log.debug('selected ext trials: {}'.format(ext_trial_ids))

    # indices = np.concatenate((base_trial_ids, ext_trial_ids))
    indices = base_trial_ids + ext_trial_ids
    metadata = [dataset_metadata[i] for i in indices]

    # load targets from dataset
    state = dataset.open()
    targets = dataset.get_data(state=state, request=indices)[dataset.sources.index(targets_source)]
    dataset.close(state)
    # print targets

    # split data into partitions according to group_attribute
    groups = dict()
    if group_attribute is not None:
        for i, meta in enumerate(metadata):
            group = meta[group_attribute]
            if group not in groups:
                groups[group] = []
            groups[group].append(i)
    else:
        # default: all in one group
        groups['default'] = np.arange(len(metadata))
    # print groups

    from itertools import product
    pairs = []
    others = []

    # add group-wise
    for group_ids in groups.values():
        for i in range(targets.shape[-1]):
            # 1st trial candidates
            if split_index > 0:
                trial_ids = np.where(targets[:split_index, i] == 1)[0]
            else:
                trial_ids = np.where(targets[:, i] == 1)[0]

            # 2nd trial candidates (same class)
            trial_ids2 = np.where(targets[:, i] == 1)[0]

            # others candidates (different class)
            others_ids = np.where(targets[:, i] == 0)[0]

            # only retain ids within the group
            trial_ids = np.intersect1d(trial_ids, group_ids)
            trial_ids2 = np.intersect1d(trial_ids2, group_ids)
            others_ids = np.intersect1d(others_ids, group_ids)

            # combine with permutation
            new_pairs = [tuple(pair) for pair in product(trial_ids, trial_ids2)]

            if not allow_self_comparison:
                # remove self-comparisons
                to_remove = []
                for pair in new_pairs:
                    if pair[0] == pair[1]:
                        to_remove.append(pair)
                for pair in to_remove:
                    new_pairs.remove(pair)
                # print 'removed', to_remove

            new_pairs = sorted(new_pairs)
            # print pairs

            # combine all pairs with all other trials
            for pair, other in product(new_pairs, others_ids):
                # print pair, other
                pairs.append(pair)
                others.append([other])

    # NOTE: triplets use internal ids
    # (referencing into indices, which contains hdf5-specific ids)
    self.triplets = np.concatenate([pairs, others], axis=1)

    self.indices = np.asarray(indices, dtype=np.int16)
    # indices = indices.reshape((len(indices), 1))  # make 2D for VectorSpace

    log.debug('triplets.shape={} indices.shape={}'.format(self.triplets.shape, self.indices.shape))

    sources = ['targets', '0_indices', '1_indices', '2_indices']

    self.data_per_source = dict()
    for source in additional_sources:
        # load source data from hdf5 dataset
        # hdf5 = H5PYDataset(hdf5name, which_sets=('all',),
        #                    load_in_memory=True, sources=(source,))
        # state = hdf5.open()
        # self.data_per_source[source] = hdf5.get_data(request=indices)[0]
        # hdf5.close(state)

        # load source data from dataset
        state = dataset.open()
        self.data_per_source[source] = dataset.get_data(state=state, request=indices)[dataset.sources.index(source)]
        dataset.close(state)

        for i in range(3):
            sources.append('{}_{}'.format(i, source))

    self.sources = tuple(sources)
    self.provides_sources = self.sources
    log.debug('sources: {}'.format(self.sources))

    super(TripletsIndexDataset, self).__init__(**kwargs)
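# Illustrative sketch (numpy only, not part of the original code) of the triplet
# construction above with synthetic one-hot targets: for every class, each
# (anchor, same-class) pair is combined with every different-class trial to form
# an (anchor, positive, negative) triplet.
def _example_triplets():
    import numpy as np
    from itertools import product
    targets = np.eye(2)[[0, 0, 1]]    # trials 0,1 -> class 0; trial 2 -> class 1
    triplets = []
    for i in range(targets.shape[-1]):
        same = np.where(targets[:, i] == 1)[0]
        other = np.where(targets[:, i] == 0)[0]
        pairs = [p for p in product(same, same) if p[0] != p[1]]
        for (a, b), o in product(pairs, other):
            triplets.append((a, b, o))
    return np.asarray(triplets)       # here: [[0, 1, 2], [1, 0, 2]]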
def __init__(self, dataset, dataset_metadata,
             base_selectors=None, ext_selectors=None,
             targets_source='targets',
             group_attribute=None,
             allow_self_comparison=False,
             additional_sources=None,
             **kwargs):
    if base_selectors is None:
        base_selectors = {}
    if additional_sources is None:
        additional_sources = []

    # get selected trial IDs
    from deepthought.datasets.selection import DatasetMetaDB
    metadb = DatasetMetaDB(dataset_metadata, base_selectors.keys())
    base_trial_ids = metadb.select(base_selectors)
    log.debug('base selectors: {}'.format(base_selectors))
    log.debug('selected base trials: {}'.format(base_trial_ids))

    if ext_selectors is not None:
        split_index = len(base_trial_ids)
        metadb = DatasetMetaDB(dataset_metadata, ext_selectors.keys())
        ext_trial_ids = metadb.select(ext_selectors)
    else:
        split_index = 0
        ext_trial_ids = []
    log.debug('ext selectors: {}'.format(ext_selectors))
    log.debug('selected ext trials: {}'.format(ext_trial_ids))

    # indices = np.concatenate((base_trial_ids, ext_trial_ids))
    indices = base_trial_ids + ext_trial_ids
    metadata = [dataset_metadata[i] for i in indices]

    # load targets from dataset
    state = dataset.open()
    targets = dataset.get_data(state=state, request=indices)[dataset.sources.index(targets_source)]
    dataset.close(state)
    # print targets

    # split data into partitions according to group_attribute
    groups = dict()
    if group_attribute is not None:
        for i, meta in enumerate(metadata):
            group = meta[group_attribute]
            if group not in groups:
                groups[group] = []
            groups[group].append(i)
    else:
        # default: all in one group
        groups['default'] = np.arange(len(metadata))
    # print groups

    from itertools import product
    pairs = []
    pair_targets = []

    # add group-wise
    for group_ids in groups.values():
        for i in range(targets.shape[-1]):
            # 1st trial candidates
            if split_index > 0:
                trial_ids = np.where(targets[:split_index, i] == 1)[0]
            else:
                trial_ids = np.where(targets[:, i] == 1)[0]

            # similar candidates (same class)
            trial_ids2 = np.where(targets[:, i] == 1)[0]

            # dissimilar candidates (different class)
            others_ids = np.where(targets[:, i] == 0)[0]

            # only retain ids within the group
            trial_ids = np.intersect1d(trial_ids, group_ids)
            trial_ids2 = np.intersect1d(trial_ids2, group_ids)
            others_ids = np.intersect1d(others_ids, group_ids)

            for pair in product(trial_ids, trial_ids2):
                if allow_self_comparison or pair[0] != pair[1]:
                    pairs.append(tuple(pair))
                    pair_targets.append(0)

            for pair in product(trial_ids, others_ids):
                pairs.append(tuple(pair))
                pair_targets.append(1)

    # NOTE: pairs use internal ids
    # (referencing into indices, which contains hdf5-specific ids)
    self.pairs = np.asarray(pairs)
    self.pair_targets = np.asarray(pair_targets)
    self.indices = np.asarray(indices, dtype=np.int16)

    log.debug('pairs.shape={} indices.shape={}'.format(self.pairs.shape, self.indices.shape))

    sources = ['targets', '0_indices', '1_indices']

    self.data_per_source = dict()
    for source in additional_sources:
        # load source data from dataset
        state = dataset.open()
        self.data_per_source[source] = dataset.get_data(state=state, request=indices)[dataset.sources.index(source)]
        dataset.close(state)

        for i in range(2):
            sources.append('{}_{}'.format(i, source))

    self.sources = tuple(sources)
    self.provides_sources = self.sources
    log.debug('sources: {}'.format(self.sources))

    super(PairsIndexDataset, self).__init__(**kwargs)
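# Illustrative sketch (numpy only, not part of the original code) of the pair
# labelling used above: pairs of same-class trials get pair target 0, pairs
# mixing a trial with a different-class trial get pair target 1.
def _example_pairs():
    import numpy as np
    from itertools import product
    targets = np.eye(2)[[0, 0, 1]]    # trials 0,1 -> class 0; trial 2 -> class 1
    pairs, pair_targets = [], []
    for i in range(targets.shape[-1]):
        same = np.where(targets[:, i] == 1)[0]
        other = np.where(targets[:, i] == 0)[0]
        for a, b in product(same, same):
            if a != b:
                pairs.append((a, b))
                pair_targets.append(0)    # similar pair
        for a, b in product(same, other):
            pairs.append((a, b))
            pair_targets.append(1)        # dissimilar pair
    return np.asarray(pairs), np.asarray(pair_targets)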