def test_concatenate_dict(self):
    dict_a = {
        "variable_one": np.random.randn(10),
        "variable_two": np.random.randn(15),
    }
    dict_b = {
        "variable_one": np.random.randn(20),
        "variable_two": np.random.randn(20),
    }
    dict_c = {
        "variable_one": np.random.randn(10, 10),
        "variable_two": np.random.randn(15, 15),
    }
    dict_d = {
        "variable_one": np.random.randn(20, 10),
        "variable_two": np.random.randn(20, 15),
    }
    dict_e = {}

    dict_a_b = PhotonDataHelper.join_dictionaries(dict_a, dict_b)
    dict_c_d = PhotonDataHelper.join_dictionaries(dict_c, dict_d)
    dict_e_a = PhotonDataHelper.join_dictionaries(dict_e, dict_a)

    self.assertEqual(len(dict_a_b["variable_one"]), 30)
    self.assertEqual(len(dict_a_b["variable_two"]), 35)
    self.assertEqual(dict_c_d["variable_one"].shape, (30, 10))
    self.assertEqual(dict_c_d["variable_two"].shape, (35, 15))
    self.assertEqual(len(dict_e_a["variable_one"]), 10)
    self.assertEqual(len(dict_e_a["variable_two"]), 15)
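
# For orientation: a minimal sketch of the semantics the test above relies on.
# This is an assumption about PhotonDataHelper.join_dictionaries, not the actual
# implementation: per key, the values of both dicts are stacked along axis 0, and
# keys present in only one dict are taken over unchanged (covers the empty-dict case).
def _join_dictionaries_sketch(dict_a, dict_b):
    joined = {}
    for key in list(dict_a) + [k for k in dict_b if k not in dict_a]:
        parts = [d[key] for d in (dict_a, dict_b) if key in d]
        joined[key] = parts[0] if len(parts) == 1 else np.concatenate(parts, axis=0)
    return joined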
def apply_transform_parallelized(self, X):
    """
    :param X: the data to which the delegate should be applied in parallel
    """
    if self.nr_of_processes > 1:
        jobs_to_do = list()

        # distribute the data equally to all available cores
        number_of_items_to_process = PhotonDataHelper.find_n(X)
        number_of_items_for_each_core = int(
            np.ceil(number_of_items_to_process / self.nr_of_processes))
        logger.info("NeuroBranch " + self.name + ": Using "
                    + str(self.nr_of_processes) + " cores calculating "
                    + str(number_of_items_for_each_core) + " items each")

        for start, stop in PhotonDataHelper.chunker(
                number_of_items_to_process, number_of_items_for_each_core):
            X_batched, _, _ = PhotonDataHelper.split_data(X, None, {}, start, stop)

            # copy my pipeline
            new_pipe_mr = self.copy_me()
            new_pipe_copy = new_pipe_mr.base_element
            new_pipe_copy.cache_folder = self.base_element.cache_folder
            new_pipe_copy.skip_loading = True
            new_pipe_copy._parallel_use = True

            del_job = dask.delayed(NeuroBranch.parallel_application)(
                new_pipe_copy, X_batched)
            jobs_to_do.append(del_job)

        dask.compute(*jobs_to_do)
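
# Assumed semantics of PhotonDataHelper.chunker, inferred from its use above and
# from the single-item calls split_data(X, y, kwargs, idx, idx) in the tests below:
# it yields inclusive (start, stop) index pairs that cover all items. A sketch only,
# not the actual helper:
def _chunker_sketch(n_items, chunk_size):
    return [(start, min(start + chunk_size - 1, n_items - 1))
            for start in range(0, n_items, chunk_size)]

# e.g. _chunker_sketch(10, 4) -> [(0, 3), (4, 7), (8, 9)]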
def _prepare_data(self, X, y=None, **kwargs):
    logger.info(
        "Preparing data for outer fold "
        + str(self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr)
        + "...")

    # prepare train and validation set data
    train_indices = self.cross_validaton_info.outer_folds[self.outer_fold_id].train_indices
    test_indices = self.cross_validaton_info.outer_folds[self.outer_fold_id].test_indices
    self._validation_X, self._validation_y, self._validation_kwargs = PhotonDataHelper.split_data(
        X, y, kwargs, indices=train_indices)
    self._test_X, self._test_y, self._test_kwargs = PhotonDataHelper.split_data(
        X, y, kwargs, indices=test_indices)

    # write numbers to database info object
    self.result_object.number_samples_validation = self._validation_y.shape[0]
    self.result_object.number_samples_test = self._test_y.shape[0]
    if self._pipe._estimator_type == "classifier":
        self.result_object.class_distribution_validation = FoldInfo.data_overview(
            self._validation_y)
        self.result_object.class_distribution_test = FoldInfo.data_overview(
            self._test_y)
def test_index_dict(self):
    labels = np.asarray([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    dict_a = {
        "variable_one": np.random.randn(10),
        "variable_two": np.random.randn(10, 10),
    }
    dict_a_1 = PhotonDataHelper.index_dict(dict_a, labels == 0)
    dict_a_2 = PhotonDataHelper.index_dict(dict_a, labels == 1)
    self.assertEqual(len(dict_a_1["variable_one"]), 5)
    self.assertEqual(dict_a_2["variable_two"].shape, (5, 10))
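
# Assumed behaviour of PhotonDataHelper.index_dict, as exercised by the test above:
# the same boolean mask (or index array) is applied to every value in the dict.
# A sketch under that assumption:
def _index_dict_sketch(data, indices):
    return {key: np.asarray(value)[indices] for key, value in data.items()}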
def test_split_join_resorting(self):
    X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    y = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2])
    kwargs = {"test": np.array([-1, -2, -3, -4, -5, -6, -7, -8, -9, -10])}

    X_new, y_new, kwargs_new = list(), list(), dict()

    # first randomly split the data and append the parts to X_new, y_new, kwargs_new
    idx_list_one, idx_list_two = list(), list()
    for idx in range(len(X)):
        if bool(random.getrandbits(1)):
            idx_list_one.append(idx)
        else:
            idx_list_two.append(idx)

    for ilist in [idx_list_two, idx_list_one]:
        for idx in ilist:
            X_batched, y_batched, kwargs_batched = PhotonDataHelper.split_data(
                X, y, kwargs, idx, idx)

            # test if batching works
            self.assertEqual(X_batched, X[idx])
            self.assertEqual(y_batched, y[idx])
            self.assertDictEqual(kwargs_batched, {"test": [kwargs["test"][idx]]})

            # then join again
            X_new, y_new, kwargs_new = PhotonDataHelper.join_data(
                X_new, X_batched, y_new, y_batched, kwargs_new, kwargs_batched)

    # test if joining works
    joined_idx = PhotonDataHelper.stack_data_vertically(idx_list_two, idx_list_one)
    self.assertTrue(np.array_equal(X_new, X[joined_idx]))
    self.assertTrue(np.array_equal(y_new, y[joined_idx]))
    self.assertTrue(np.array_equal(kwargs_new["test"], kwargs["test"][joined_idx]))

    # now resort and see if that works, too
    X_resorted, y_resorted, kwargs_resorted = PhotonDataHelper.resort_splitted_data(
        X_new, y_new, kwargs_new, joined_idx)
    self.assertTrue(np.array_equal(X_resorted, X))
    self.assertTrue(np.array_equal(y_resorted, y))
    self.assertListEqual(list(kwargs_resorted.keys()), list(kwargs.keys()))
    self.assertTrue(np.array_equal(kwargs_resorted["test"], kwargs["test"]))
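
# The resorting step above undoes the random split order. Assuming
# resort_splitted_data inverts the permutation described by the index list, the
# core operation is equivalent to indexing with np.argsort:
def _resort_sketch(values, joined_idx):
    return np.asarray(values)[np.argsort(joined_idx)]

# e.g. joined_idx = [2, 0, 1] means the rows currently hold items 2, 0, 1;
# np.argsort([2, 0, 1]) == [1, 2, 0] picks them back into the order 0, 1, 2.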
def compute_learning_curves(self, new_pipe, train_X, train_y, train, kwargs_cv_train,
                            test_X, test_y, test, kwargs_cv_test):
    self.cross_validation_infos.learning_curves_cut.transform()
    cut_range = [
        round(cut * train_X.shape[0])
        for cut in self.cross_validation_infos.learning_curves_cut.values
    ]
    learning_curves = []
    for i, cut in enumerate(cut_range[1:]):
        cut_indices = np.arange(cut)
        train_cut_X, train_cut_y, train_cut_kwargs = PhotonDataHelper.split_data(
            train_X, train_y, kwargs_cv_train, indices=cut_indices)
        train_cut = train[:cut]
        job_data = self.InnerCVJob(
            pipe=new_pipe,
            config=dict(self.params),
            metrics=self.optimization_infos.metrics,
            callbacks=self.optimization_constraints,
            train_data=self.JobData(train_cut_X, train_cut_y, train_cut, train_cut_kwargs),
            test_data=self.JobData(test_X, test_y, test, kwargs_cv_test))

        curr_test_cut, curr_train_cut = InnerFoldManager.fit_and_score(job_data)
        # label each curve entry with the cut fraction that was actually used;
        # since the loop skips cut_range[0], that is values[i + 1], not values[i]
        learning_curves.append([
            self.cross_validation_infos.learning_curves_cut.values[i + 1],
            curr_test_cut.metrics,
            curr_train_cut.metrics,
        ])
    return learning_curves
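
# Worked example of the cut computation above (numbers are illustrative): with
# learning_curves_cut.values == [0.0, 0.25, 0.5, 1.0] and 100 training samples,
# cut_range == [0, 25, 50, 100]; skipping the first entry, the loop fits on the
# first 25, 50 and finally all 100 training samples while always scoring on the
# full test fold, so the curve shows how performance scales with training size.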
def predict(self, X, training=False, **kwargs):
    """
    Transforms the data for every step that offers a transform function and
    then calls the estimator with predict on the transformed data. It returns
    the predictions made.

    If the last step is not an estimator, it returns the transformed data.
    """

    # first transform
    if not training:
        X, _, kwargs = self.transform(X, y=None, **kwargs)

    # then call predict on the final estimator
    if self._final_estimator is not None:
        if self._final_estimator.is_estimator:
            logger.debug('PhotonPipeline: Predicting with '
                         + self._final_estimator.name + ' ...')
            predict_start_time = datetime.datetime.now()
            y_pred = self._final_estimator.predict(X, **kwargs)
            predict_duration = (datetime.datetime.now()
                                - predict_start_time).total_seconds()
            n = PhotonDataHelper.find_n(X)
            self.time_monitor['predict'].append(
                (self.elements[-1][0], predict_duration, n))
            return y_pred
        else:
            return X
    else:
        return None
def objective_function_simple(self, cfg):
    cfg = {k: cfg[k] for k in cfg if cfg[k]}
    values = []
    train_indices = list(self.pipe.cross_validation.outer_folds.values())[0].train_indices
    self._validation_X, self._validation_y, _ = PhotonDataHelper.split_data(
        self.X, self.y, kwargs=None, indices=train_indices)

    for inner_fold in list(
            list(self.pipe.cross_validation.inner_folds.values())[0].values()):
        sc = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {}, random_state=42)
        svc = PipelineElement("SVC", {}, random_state=42, gamma='auto')
        my_pipe = PhotonPipeline([('StandardScaler', sc), ('PCA', pca), ('SVC', svc)])
        my_pipe.set_params(**cfg)
        my_pipe.fit(self._validation_X[inner_fold.train_indices, :],
                    self._validation_y[inner_fold.train_indices])
        values.append(
            accuracy_score(
                self._validation_y[inner_fold.test_indices],
                my_pipe.predict(self._validation_X[inner_fold.test_indices, :])))
    return 1 - np.mean(values)
def transform(self, X, y=None, **kwargs):
    """
    Generates "new samples" by computing the mean between all (or draw_limit)
    pairs of existing samples and appends them to X. The target for each new
    sample is computed as the mean of the constituent targets.

    :param X: data
    :param y: targets (optional)
    :return: X_new: X and X_augmented; (y_new: the corresponding targets)

    Note: draw_limit (how many pairs to draw in case the full number of
    combinations is > 10k) and random_state (seed for the random sampling of
    combinations, for reproducibility only) are read from instance attributes
    rather than passed as parameters.
    """
    logger.debug("Pairing " + str(self.draw_limit) + " samples...")

    # ensure class balance in the training set if balance_classes is True
    unique_classes = np.unique(y)
    n_pairs = list()
    for label in unique_classes:
        if self.balance_classes:
            n_pairs.append(self.draw_limit - np.sum(y == label))
        else:
            n_pairs.append(self.draw_limit)

    # run get_samples for each class independently
    X_extended = list()
    y_extended = list()
    kwargs_extended = dict()
    for label, limit in zip(unique_classes, n_pairs):
        X_new_class, y_new_class, kwargs_new_class = self._return_samples(
            X[y == label],
            y[y == label],
            PhotonDataHelper.index_dict(kwargs, y == label),
            generator=self.generator,
            distance_metric=self.distance_metric,
            draw_limit=limit,
            rand_seed=self.random_state,
        )
        X_extended.extend(X_new_class)
        y_extended.extend(y_new_class)

        # get the corresponding kwargs
        if kwargs:
            kwargs_extended = PhotonDataHelper.join_dictionaries(
                kwargs_extended, kwargs_new_class)

    return X_extended, y_extended, kwargs_extended
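
# Worked example of the class-balancing logic above: with draw_limit == 100 and
# class counts {0: 40, 1: 60}, balance_classes == True gives n_pairs == [60, 40],
# so the smaller class receives more synthetic pairs and both classes end up with
# roughly 100 samples; with balance_classes == False, both classes get 100 pairs.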
def test_transform(self):
    for elements, stack in self.stacks:
        np.random.seed(42)
        Xt_stack, _, _ = stack.fit(self.X, self.y).transform(self.X)
        np.random.seed(42)
        Xt_elements = None
        for i, element in enumerate(elements):
            Xt_element, _, _ = element.fit(self.X, self.y).transform(self.X)
            Xt_elements = PhotonDataHelper.stack_data_horizontally(
                Xt_elements, Xt_element)
        np.testing.assert_array_equal(Xt_stack, Xt_elements)
def inverse_transform(self, X, y=None, **kwargs):
    new_X = None
    for i in range(X.shape[1]):
        feature = X[:, i]
        transformer = self.encoder_list[i]
        if transformer is not None:
            feature = np.reshape(feature, (-1, 1))
            trans_X = transformer.inverse_transform(feature)
        else:
            trans_X = feature
        new_X = PhotonDataHelper.stack_data_horizontally(new_X, trans_X)
    return new_X
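
# Assumed semantics of PhotonDataHelper.stack_data_horizontally, as used in the
# accumulation loops above: None acts as the empty starting value, otherwise the
# new block is appended column-wise. A sketch, not the actual helper; the 1-D
# handling in particular is an assumption:
def _stack_horizontally_sketch(existing, new_block):
    new_block = np.asarray(new_block)
    if new_block.ndim == 1:
        new_block = new_block.reshape(-1, 1)  # treat 1-D input as a single column
    if existing is None:
        return new_block
    return np.hstack([np.asarray(existing), new_block])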
def test_predict(self):
    for elements, stack in [
            ([self.svc, self.tree], self.estimator_stack),
            ([self.estimator_branch_1, self.estimator_branch_2],
             self.estimator_branch_stack)]:
        np.random.seed(42)
        stack = stack.fit(self.X, self.y)
        yt_stack = stack.predict(self.X)
        np.random.seed(42)
        Xt_elements = None
        for i, element in enumerate(elements):
            Xt_element = element.fit(self.X, self.y).predict(self.X)
            Xt_elements = PhotonDataHelper.stack_data_horizontally(
                Xt_elements, Xt_element)
        np.testing.assert_array_equal(yt_stack, Xt_elements)
def fit(self, X, y=None, **kwargs):
    self._validate_elements()
    X, y, kwargs = self._caching_fit_transform(X, y, kwargs, fit=True)

    if self._final_estimator is not None:
        logger.debug("PhotonPipeline: Fitting " + self._final_estimator.name)
        fit_start_time = datetime.datetime.now()
        if self.random_state:
            self._final_estimator.random_state = self.random_state
        self._final_estimator.fit(X, y, **kwargs)
        # TODO: handle post-fit actions of the final estimator here
        n = PhotonDataHelper.find_n(X)
        fit_duration = (datetime.datetime.now() - fit_start_time).total_seconds()
        self.time_monitor["fit"].append((self.elements[-1][0], fit_duration, n))
    return self
def test_data_split_indices(self):
    vals = np.array([-1, -2, -3, -4, -5, -6, -7, -8, -9, -10])
    vals_str = np.array([ascii(i) for i in vals])
    random_features = np.random.randn(10, 20)
    kwargs = {"test": vals, "subtest": vals_str, "random": random_features}
    pick_list = [1, 3, 5]

    splitted_X, splitted_y, splitted_example = PhotonDataHelper.split_data(
        random_features, vals, kwargs, indices=pick_list)

    self.assertTrue(np.array_equal(splitted_X, random_features[pick_list]))
    self.assertTrue(np.array_equal(splitted_y, vals[pick_list]))
    self.assertTrue(np.array_equal(splitted_example["test"], vals[pick_list]))
    self.assertTrue(np.array_equal(splitted_example["subtest"], vals_str[pick_list]))
    self.assertTrue(np.array_equal(splitted_example["random"], random_features[pick_list]))
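
# split_data is called in two ways throughout this section: with an explicit
# indices list (as here) and with inclusive positional start/stop bounds (as in
# the chunking code). A sketch of the assumed dispatch, not the real helper:
def _split_data_sketch(X, y=None, kwargs=None, start=0, stop=0, indices=None):
    if indices is None:
        indices = list(range(start, stop + 1))  # stop is treated as inclusive
    X_split = np.asarray(X)[indices]
    y_split = np.asarray(y)[indices] if y is not None else None
    kwargs_split = {k: np.asarray(v)[indices] for k, v in (kwargs or {}).items()}
    return X_split, y_split, kwargs_split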
def _do_timed_fit_transform(self, name, transformer, fit, X, y, **kwargs):
    n = PhotonDataHelper.find_n(X)
    if self.random_state:
        transformer.random_state = self.random_state

    if fit:
        logger.debug("PhotonPipeline: Fitting " + transformer.name)
        fit_start_time = datetime.datetime.now()
        transformer.fit(X, y, **kwargs)
        fit_duration = (datetime.datetime.now() - fit_start_time).total_seconds()
        self.time_monitor["fit"].append((name, fit_duration, n))

    logger.debug("PhotonPipeline: Transforming data with " + transformer.name)
    transform_start_time = datetime.datetime.now()
    X, y, kwargs = transformer.transform(X, y, **kwargs)
    transform_duration = (datetime.datetime.now() - transform_start_time).total_seconds()
    self.time_monitor["transform_computed"].append((name, transform_duration, n))
    return X, y, kwargs
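
# Each time_monitor entry appended above is a (name, duration_in_seconds, n_items)
# tuple, e.g. ("PCA", 0.031, 500) (values illustrative), so fit and transform costs
# can later be attributed per pipeline element and related to the sample count.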
def objective_function_switch(self, cfg):
    cfg = {k: cfg[k] for k in cfg if cfg[k]}
    values = []
    train_indices = list(self.pipe.cross_validation.outer_folds.values())[0].train_indices
    self._validation_X, self._validation_y, _ = PhotonDataHelper.split_data(
        self.X, self.y, kwargs=None, indices=train_indices)

    switch = cfg["Estimator_switch"]
    del cfg["Estimator_switch"]
    for inner_fold in list(
            list(self.pipe.cross_validation.inner_folds.values())[0].values()):
        sc = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {}, random_state=42)
        if switch == 'svc':
            est = PipelineElement("SVC", {}, random_state=42, gamma='auto')
            name = 'SVC'
        else:
            est = PipelineElement("RandomForestClassifier", {}, random_state=42)
            name = "RandomForestClassifier"
        my_pipe = PhotonPipeline([('StandardScaler', sc), ('PCA', pca), (name, est)])
        my_pipe.set_params(**cfg)
        my_pipe.fit(self._validation_X[inner_fold.train_indices, :],
                    self._validation_y[inner_fold.train_indices])
        values.append(
            accuracy_score(
                self._validation_y[inner_fold.test_indices],
                my_pipe.predict(self._validation_X[inner_fold.test_indices, :])))
    return 1 - np.mean(values)
def load_or_save_cached_data(self, name, X, y, kwargs, transformer, fit=False,
                             needed_for_further_computation=False, initial_X=None):
    if not self.single_subject_caching:
        # if we do it group-wise, it is easy
        if self.skip_loading and not needed_for_further_computation:
            # check if the data has already been computed
            if self.cache_man.check_cache(name):
                # if so, do nothing
                return X, y, kwargs
            else:
                # otherwise, do the computation and save it
                cached_result = None
        else:
            start_time_for_loading = datetime.datetime.now()
            cached_result = self.cache_man.load_cached_data(name)

        if cached_result is None:
            X, y, kwargs = self._do_timed_fit_transform(name, transformer, fit, X, y, **kwargs)

            start_time_saving = datetime.datetime.now()
            self.cache_man.save_data_to_cache(name, (X, y, kwargs))
            saving_duration = (datetime.datetime.now() - start_time_saving).total_seconds()
            self.time_monitor['transform_cached'].append((name, saving_duration, 1))
        else:
            X, y, kwargs = cached_result[0], cached_result[1], cached_result[2]
            loading_duration = (datetime.datetime.now() - start_time_for_loading).total_seconds()
            n = PhotonDataHelper.find_n(X)
            self.time_monitor['transform_cached'].append((name, loading_duration, n))
        return X, y, kwargs
    else:
        # if we do it subject-wise, we need to iterate and collect the results
        processed_X, processed_y, processed_kwargs = list(), list(), dict()
        X_uncached, y_uncached, kwargs_uncached, initial_X_uncached = list(), list(), dict(), list()
        list_of_idx_cached, list_of_idx_non_cached = list(), list()

        nr = PhotonDataHelper.find_n(X)
        for start, stop in PhotonDataHelper.chunker(nr, 1):
            # split the data into single entities; the key is derived from the
            # first element (= PATH to file)
            X_key, _, _ = PhotonDataHelper.split_data(initial_X, None, {}, start, stop)
            X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                X, y, kwargs, start, stop)
            self.cache_man.update_single_subject_state_info(X_key)

            # check if the item has been processed already
            if self.cache_man.check_cache(name):
                list_of_idx_cached.append(start)
            else:
                list_of_idx_non_cached.append(start)
                X_uncached = PhotonDataHelper.stack_data_vertically(X_uncached, X_batched)
                y_uncached = PhotonDataHelper.stack_data_vertically(y_uncached, y_batched)
                initial_X_uncached = PhotonDataHelper.stack_data_vertically(initial_X_uncached, X_key)
                kwargs_uncached = PhotonDataHelper.join_dictionaries(kwargs_uncached, kwargs_dict_batched)

        # now we know which part can be loaded and which part has to be transformed:
        # first apply the transformation to the uncached group, then save it single-subject-wise
        if len(list_of_idx_non_cached) > 0:
            # apply transformation group-wise
            new_group_X, new_group_y, new_group_kwargs = self._do_timed_fit_transform(
                name, transformer, fit, X_uncached, y_uncached, **kwargs_uncached)

            # then save the results one by one
            nr = PhotonDataHelper.find_n(new_group_X)
            for start in range(nr):
                # split the data into single entities
                X_batched, y_batched, kwargs_dict_batched = PhotonDataHelper.split_data(
                    new_group_X, new_group_y, new_group_kwargs, start, start)
                X_key, _, _ = PhotonDataHelper.split_data(initial_X_uncached, None, {}, start, start)

                # we save the data in relation to the input path (X_key = hash(input X))
                self.cache_man.update_single_subject_state_info(X_key)
                start_time_saving = datetime.datetime.now()
                self.cache_man.save_data_to_cache(name, (X_batched, y_batched, kwargs_dict_batched))
                saving_duration = (datetime.datetime.now() - start_time_saving).total_seconds()
                self.time_monitor['transform_cached'].append((name, saving_duration, 1))

            # we need to collect the data only when we want to load it;
            # we can skip that step if we only want it to get into the cache (case: parallelisation)
            if not self.skip_loading or needed_for_further_computation:
                # stack results
                processed_X, processed_y, processed_kwargs = new_group_X, new_group_y, new_group_kwargs

        # afterwards, load everything that has been cached
        if len(list_of_idx_cached) > 0:
            if not self.skip_loading or needed_for_further_computation:
                for cache_idx in list_of_idx_cached:
                    # we identify the data by the input path (X before any transformation)
                    self.cache_man.update_single_subject_state_info([initial_X[cache_idx]])

                    # time the loading of the cached item
                    start_time_for_loading = datetime.datetime.now()
                    transformed_X, transformed_y, transformed_kwargs = self.cache_man.load_cached_data(name)
                    loading_duration = (datetime.datetime.now() - start_time_for_loading).total_seconds()
                    self.time_monitor['transform_cached'].append(
                        (name, loading_duration, PhotonDataHelper.find_n(X)))

                    processed_X, processed_y, processed_kwargs = PhotonDataHelper.join_data(
                        processed_X, transformed_X, processed_y, transformed_y,
                        processed_kwargs, transformed_kwargs)

        logger.debug(name + " loaded " + str(len(list_of_idx_cached))
                     + " items from cache and computed "
                     + str(len(list_of_idx_non_cached)))
        if not self.skip_loading or needed_for_further_computation:
            # now sort the data into the correct order again; the index list must
            # match the join order above: computed items first, then cached ones
            processed_X, processed_y, processed_kwargs = PhotonDataHelper.resort_splitted_data(
                processed_X, processed_y, processed_kwargs,
                PhotonDataHelper.stack_data_vertically(list_of_idx_non_cached, list_of_idx_cached))

        return processed_X, processed_y, processed_kwargs
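
# Worked example of the reassembly above: if items 0 and 2 are cache hits and
# item 1 has to be computed, list_of_idx_non_cached == [1] and
# list_of_idx_cached == [0, 2]; the processed data then arrives in the order
# [1, 0, 2] (computed items first, then cached ones), and resort_splitted_data
# with the stacked index list [1, 0, 2] restores the original order [0, 1, 2].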
def fit(self, X, y, **kwargs):
    """
    Iterates over the cross-validation folds, trains the pipeline on each
    training split and uses it for predictions. Calculates the metrics per
    fold and averages them across folds.

    :param X: training and test data
    :param y: training and test targets
    :returns: configuration class for the result tree that monitors training
        and test performance
    """
    # needed for testing Timeboxed Random Grid Search
    # time.sleep(35)

    config_item = MDBConfig()
    config_item.config_dict = self.params
    config_item.inner_folds = []
    config_item.metrics_test = []
    config_item.metrics_train = []
    config_item.computation_start_time = datetime.datetime.now()

    try:
        # do inner cv
        for idx, (inner_fold_id, inner_fold) in enumerate(
                self.cross_validation_infos.inner_folds[self.outer_fold_id].items()):

            train, test = inner_fold.train_indices, inner_fold.test_indices

            # split kwargs according to cross validation
            train_X, train_y, kwargs_cv_train = PhotonDataHelper.split_data(
                X, y, kwargs, indices=train)
            test_X, test_y, kwargs_cv_test = PhotonDataHelper.split_data(
                X, y, kwargs, indices=test)

            new_pipe = self.pipe()
            if self.cache_folder is not None and self.cache_updater is not None:
                self.cache_updater(new_pipe, self.cache_folder, inner_fold_id)

            if not config_item.human_readable_config:
                config_item.human_readable_config = PhotonPrintHelper.config_to_human_readable_dict(
                    new_pipe, self.params)
                logger.clean_info(
                    json.dumps(config_item.human_readable_config, indent=4, sort_keys=True))

            job_data = InnerFoldManager.InnerCVJob(
                pipe=new_pipe,
                config=dict(self.params),
                metrics=self.optimization_infos.metrics,
                callbacks=self.optimization_constraints,
                train_data=InnerFoldManager.JobData(train_X, train_y, train, kwargs_cv_train),
                test_data=InnerFoldManager.JobData(test_X, test_y, test, kwargs_cv_test),
            )

            # only for non-parallel processing:
            # inform children in which inner fold we are
            # self.pipe.distribute_cv_info_to_hyperpipe_children(inner_fold_counter=fold_cnt)
            # self.mother_inner_fold_handle(fold_cnt) --> write that output in InnerFoldManager!
            # logger.debug(config_item.human_readable_config)
            fold_nr = idx + 1
            logger.debug("calculating inner fold " + str(fold_nr) + "...")

            curr_test_fold, curr_train_fold = InnerFoldManager.fit_and_score(job_data)
            logger.debug("Performance inner fold " + str(fold_nr))
            print_double_metrics(
                curr_train_fold.metrics,
                curr_test_fold.metrics,
                photon_system_log=False,
            )

            durations = job_data.pipe.time_monitor

            self.update_config_item_with_inner_fold(
                config_item=config_item,
                fold_cnt=fold_nr,
                curr_train_fold=curr_train_fold,
                curr_test_fold=curr_test_fold,
                time_monitor=durations,
                feature_importances=new_pipe.feature_importances_,
            )

            if isinstance(self.optimization_constraints, list):
                break_cv = 0
                for cf in self.optimization_constraints:
                    if not cf.shall_continue(config_item):
                        logger.info(
                            "Skipped further cross validation after fold "
                            + str(fold_nr)
                            + " due to performance constraints in " + cf.metric)
                        break_cv += 1
                        break
                if break_cv > 0:
                    break
            elif self.optimization_constraints is not None:
                if not self.optimization_constraints.shall_continue(config_item):
                    # in the single-constraint case the constraint object itself
                    # holds the metric (there is no loop variable cf here)
                    logger.info(
                        "Skipped further cross validation after fold "
                        + str(fold_nr)
                        + " due to performance constraints in "
                        + self.optimization_constraints.metric)
                    break

        InnerFoldManager.process_fit_results(
            config_item,
            self.cross_validation_infos.calculate_metrics_across_folds,
            self.cross_validation_infos.calculate_metrics_per_fold,
            self.optimization_infos.metrics,
        )

    except Exception as e:
        if self.raise_error:
            raise e
        logger.error(e)
        logger.error(traceback.format_exc())
        traceback.print_exc()
        if not isinstance(e, Warning):
            config_item.config_failed = True
        config_item.config_error = str(e)
        warnings.warn("One test iteration of pipeline failed with error")

    logger.debug("...done with configuration:")
    logger.debug(json.dumps(config_item.human_readable_config, indent=4, sort_keys=True))
    config_item.computation_end_time = datetime.datetime.now()
    return config_item