def test_split_classification_many_imbalanced_classes(self):
    for i in range(10):
        X = np.array([range(20), range(20)]).transpose()
        y = np.array((0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
                      2, 3, 4, 5))
        np.random.shuffle(y)
        X_train, X_valid, Y_train, Y_valid = split_data(
            X, y, classification=True)
        print X_train, Y_train
        # Classes 2-5 each occur only once, so they must never reach
        # the validation fold.
        self.assertLessEqual(max(Y_valid), 1)
def _split_regular(self):
    X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
    y = np.array([0, 0, 0, 1, 1, 2])
    X_train, X_valid, Y_train, Y_valid = split_data(X, y)

    # Check shapes
    self.assertEqual(X_train.shape, (4, 2))
    self.assertEqual(Y_train.shape, (4, ))
    self.assertEqual(X_valid.shape, (2, 2))
    self.assertEqual(Y_valid.shape, (2, ))

    self.assertListEqual(list(Y_valid), [0, 0])
    self.assertListEqual(list(Y_train), [2, 0, 1, 1])
def _stratify(self):
    X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
    y = np.array([0, 0, 0, 0, 1, 1])
    X_train, X_valid, Y_train, Y_valid = split_data(X, y)

    # Check shapes
    self.assertEqual(X_train.shape[0], 4)
    self.assertEqual(X_train.shape[1], 2)
    self.assertEqual(Y_train.shape[0], 4)
    self.assertEqual(X_valid.shape[0], 2)
    self.assertEqual(X_valid.shape[1], 2)
    self.assertEqual(Y_valid.shape[0], 2)

    self.assertListEqual(list(Y_valid), [1, 0])
    self.assertListEqual(list(Y_train), [0, 0, 0, 1])
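# The tests above pin down the split_data contract: a roughly 67/33
# holdout split, stratified for classification, where a class with a
# single sample can never be split across both folds and is therefore
# routed entirely into the training fold (hence
# assertLessEqual(max(Y_valid), 1) above). The following is a minimal
# sketch of such a splitter built on scikit-learn's train_test_split;
# split_data_sketch is a hypothetical name and this is NOT the
# project's actual implementation.
import numpy as np
from sklearn.model_selection import train_test_split


def split_data_sketch(X, y, classification=False, valid_fraction=0.33,
                      random_state=42):
    y = np.asarray(y)
    if not classification:
        # Regression: plain shuffled 67/33 split.
        return train_test_split(X, y, test_size=valid_fraction,
                                random_state=random_state)

    # Classification: classes with fewer than two samples cannot be
    # stratified, so send them wholly to the training fold.
    classes, counts = np.unique(y, return_counts=True)
    rare_mask = np.in1d(y, classes[counts < 2])

    X_tr, X_va, y_tr, y_va = train_test_split(
        X[~rare_mask], y[~rare_mask], test_size=valid_fraction,
        stratify=y[~rare_mask], random_state=random_state)
    X_tr = np.concatenate([X_tr, X[rare_mask]])
    y_tr = np.concatenate([y_tr, y[rare_mask]])
    return X_tr, X_va, y_tr, y_va


# Mirroring the imbalanced-classes test: classes 2-5 each occur once,
# so only classes 0 and 1 can appear in the validation fold.
X = np.arange(40).reshape(20, 2)
y = np.array([0] * 8 + [1] * 8 + [2, 3, 4, 5])
_, _, _, Y_valid = split_data_sketch(X, y, classification=True)
assert max(Y_valid) <= 1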
def __init__(self, Datamanager, configuration, with_predictions=False,
             all_scoring_functions=False, seed=1, output_dir=None,
             output_y_test=False, num_run=None):
    super(HoldoutEvaluator, self).__init__(
        Datamanager, configuration,
        with_predictions=with_predictions,
        all_scoring_functions=all_scoring_functions,
        seed=seed,
        output_dir=output_dir,
        output_y_test=output_y_test,
        num_run=num_run)

    classification = Datamanager.info['task'] in CLASSIFICATION_TASKS
    self.X_train, self.X_optimization, self.Y_train, self.Y_optimization = \
        split_data(Datamanager.data['X_train'],
                   Datamanager.data['Y_train'],
                   classification=classification)

    self.model = self.model_class(self.configuration, self.seed)
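# A hypothetical construction sketch for the evaluator above.
# DummyDatamanager is a stand-in exposing only the attributes the
# constructor reads; the `configuration` argument is assumed to come
# from the project's configuration space and is not built here. This
# illustrates the constructor's effect, not the project's test code.
import numpy as np


class DummyDatamanager(object):
    def __init__(self):
        self.info = {'task': MULTICLASS_CLASSIFICATION}
        self.data = {'X_train': np.random.rand(100, 5),
                     'Y_train': np.random.randint(0, 3, 100)}


def build_evaluator(configuration):
    evaluator = HoldoutEvaluator(DummyDatamanager(), configuration, seed=1)
    # The constructor has already carved the training data into
    # evaluator.X_train/Y_train (the ~2/3 training slice) and
    # evaluator.X_optimization/Y_optimization (the held-out slice on
    # which this configuration is scored).
    return evaluator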
def test_split_data_regression(self):
    n_points = 1000
    np.random.seed(42)
    n_dims = np.random.randint(1, 100)
    X = np.random.rand(n_points, n_dims)
    y = np.random.rand(n_points)
    X_train, X_valid, Y_train, Y_valid = split_data(X, y)

    self.assertEqual(X_train.shape[0], 670)
    self.assertEqual(X_valid.shape[0], 330)
    self.assertEqual(Y_train.shape[0], 670)
    self.assertEqual(Y_valid.shape[0], 330)
    self.assertEqual(X_train.shape[1], n_dims)
    self.assertEqual(X_valid.shape[1], n_dims)

    # Random checks
    self.assertAlmostEqual(X_train[4, 2], 0.5986584841970366)
    self.assertAlmostEqual(X_valid[4, 2], 0.63911512838980322)
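# The hard-coded expectations above (exactly 670/330 rows plus exact
# cell values) imply split_data seeds its shuffling internally, so
# repeated calls on the same arrays return identical splits. The same
# property holds for the split_data_sketch defined earlier, since its
# random_state defaults to a constant:
X = np.random.rand(10, 3)
y = np.random.rand(10)
a = split_data_sketch(X, y)
b = split_data_sketch(X, y)
assert all(np.array_equal(u, v) for u, v in zip(a, b))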
def _fit(self, D):
    # TODO: check that data and task definition fit together!
    self.metric_ = D.info['metric']
    self.task_ = D.info['task']
    self.target_num_ = D.info['target_num']

    # Set environment variable:
    seed = os.environ.get("AUTOSKLEARN_SEED")
    if seed is not None and int(seed) != self.seed:
        raise ValueError("It seems you have already started an instance "
                         "of AutoSklearn in this thread.")
    else:
        os.environ["AUTOSKLEARN_SEED"] = str(self.seed)

    # == Split dataset and store data for the ensemble script
    X_train, X_ensemble, Y_train, Y_ensemble = split_data.split_data(
        D.data['X_train'], D.data['Y_train'])

    true_labels_ensemble_filename = os.path.join(
        self.tmp_dir, "true_labels_ensemble.npy")
    true_labels_ensemble_lock = true_labels_ensemble_filename + ".lock"
    with lockfile.LockFile(true_labels_ensemble_lock):
        if not os.path.exists(true_labels_ensemble_filename):
            np.save(true_labels_ensemble_filename, Y_ensemble)

    del X_train, X_ensemble, Y_train, Y_ensemble

    time_needed_to_load_data = self.stopwatch_.wall_elapsed(self.basename_)
    time_left_after_reading = max(0, self.time_left_for_this_task -
                                  time_needed_to_load_data)
    self.logger.info("Remaining time after reading %s %5.2f sec" %
                     (self.basename_, time_left_after_reading))
    self.stopwatch_.stop_task("LoadData")

    # == Calculate metafeatures
    self.stopwatch_.start_task("CalculateMetafeatures")
    categorical = [feat_type.lower() == "categorical"
                   for feat_type in D.feat_type]

    if self.initial_configurations_via_metalearning <= 0:
        ml = None
    elif D.info["task"] in \
            [MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION]:
        ml = metalearning.MetaLearning()
        self.logger.debug("Start calculating metafeatures for %s" %
                          self.basename_)
        ml.calculate_metafeatures_with_labels(D.data["X_train"],
                                              D.data["Y_train"],
                                              categorical=categorical,
                                              dataset_name=self.basename_)
    else:
        ml = None
        self.logger.critical("Metafeatures not calculated")
    self.stopwatch_.stop_task("CalculateMetafeatures")
    self.logger.debug(
        "Calculating metafeatures (categorical attributes) took %5.2fsec" %
        self.stopwatch_.wall_elapsed("CalculateMetafeatures"))

    # == One-hot encode the data
    self.stopwatch_.start_task("OneHot")
    D.perform1HotEncoding()
    self.ohe_ = D.encoder_
    self.stopwatch_.stop_task("OneHot")

    # == Pickle the data manager
    self.stopwatch_.start_task("StoreDatamanager")
    data_manager_path = os.path.join(self.tmp_dir,
                                     self.basename_ + "_Manager.pkl")
    data_manager_lockfile = data_manager_path + ".lock"
    with lockfile.LockFile(data_manager_lockfile):
        if not os.path.exists(data_manager_path):
            # Open in binary mode: protocol -1 selects a binary pickle
            # protocol, so text mode ('w') would corrupt the file.
            with open(data_manager_path, 'wb') as fh:
                pickle.dump(D, fh, protocol=-1)
            self.logger.debug("Pickled data manager at %s" %
                              data_manager_path)
        else:
            self.logger.debug("Data manager already present at %s" %
                              data_manager_path)
    self.stopwatch_.stop_task("StoreDatamanager")

    # == Create the configuration space
    self.stopwatch_.start_task("CreateConfigSpace")
    configspace_path = os.path.join(self.tmp_dir, "space.pcs")
    self.configuration_space = paramsklearn.get_configuration_space(
        D.info)
    self.configuration_space_created_hook()
    sp_string = pcs_parser.write(self.configuration_space)
    configuration_space_lockfile = configspace_path + ".lock"
    with lockfile.LockFile(configuration_space_lockfile):
        if not os.path.exists(configspace_path):
            with open(configspace_path, "w") as fh:
                fh.write(sp_string)
            self.logger.debug("Configuration space written to %s" %
                              configspace_path)
        else:
            self.logger.debug("Configuration space already present at %s" %
                              configspace_path)
    self.stopwatch_.stop_task("CreateConfigSpace")

    if ml is None:
        initial_configurations = []
    elif D.info["task"] in \
            [MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION]:
        self.stopwatch_.start_task("CalculateMetafeaturesEncoded")
        # After one-hot encoding every attribute is numerical, so pass
        # one False per feature (shape[1]); the original passed
        # shape[0], the number of samples.
        ml.calculate_metafeatures_encoded_labels(
            X_train=D.data["X_train"],
            Y_train=D.data["Y_train"],
            categorical=[False] * D.data["X_train"].shape[1],
            dataset_name=self.basename_)
        self.stopwatch_.stop_task("CalculateMetafeaturesEncoded")
        self.logger.debug(
            "Calculating metafeatures (encoded attributes) took %5.2fsec" %
            self.stopwatch_.wall_elapsed("CalculateMetafeaturesEncoded"))

        self.logger.debug(ml._metafeatures_labels.__repr__(verbosity=2))
        self.logger.debug(
            ml._metafeatures_encoded_labels.__repr__(verbosity=2))

        self.stopwatch_.start_task("InitialConfigurations")
        try:
            initial_configurations = \
                ml.create_metalearning_string_for_smac_call(
                    self.configuration_space, self.basename_, self.metric_,
                    self.task_, D.info['is_sparse'] == 1,
                    self.initial_configurations_via_metalearning,
                    self.metadata_directory)
        except Exception as e:
            import traceback
            self.logger.error(str(e))
            self.logger.error(traceback.format_exc())
            initial_configurations = []
        self.stopwatch_.stop_task("InitialConfigurations")

        self.logger.debug("Initial configurations: (%d)",
                          len(initial_configurations))
        for initial_configuration in initial_configurations:
            self.logger.debug(initial_configuration)
        self.logger.debug(
            "Looking for initial configurations took %5.2fsec" %
            self.stopwatch_.wall_elapsed("InitialConfigurations"))
        self.logger.info(
            "Time left for %s after finding initial configurations: "
            "%5.2fsec" % (self.basename_,
                          self.time_left_for_this_task -
                          self.stopwatch_.wall_elapsed(self.basename_)))
    else:
        initial_configurations = []
        self.logger.critical("Encoded metafeatures not calculated")

    # == Set up a directory where all the trained models will be pickled to
    if self.keep_models:
        self.model_directory_ = os.path.join(self.tmp_dir,
                                             "models_%d" % self.seed)
        os.mkdir(self.model_directory_)
    self.ensemble_indices_directory_ = os.path.join(
        self.tmp_dir, "ensemble_indices_%d" % self.seed)
    os.mkdir(self.ensemble_indices_directory_)

    # == Run SMAC
    self.stopwatch_.start_task("runSmac")
    # = Create an empty instance file
    instance_file = os.path.join(self.tmp_dir, "instances.txt")
    instance_file_lock = instance_file + ".lock"
    with lockfile.LockFile(instance_file_lock):
        # Test for the instance file itself; the original tested the
        # lock path, so the file was rewritten on every call.
        if not os.path.exists(instance_file):
            with open(instance_file, "w") as fh:
                fh.write("holdout")
            self.logger.debug("Created instance file %s" % instance_file)
        else:
            self.logger.debug("Instance file already present at %s" %
                              instance_file)

    # = Start SMAC
    time_left_for_smac = max(
        0, self.time_left_for_this_task -
        self.stopwatch_.wall_elapsed(self.basename_))
    self.logger.debug("Start SMAC with %5.2fsec time left" %
                      time_left_for_smac)
    proc_smac, smac_call = submit_process.run_smac(
        dataset_name=self.basename_,
        dataset=data_manager_path,
        tmp_dir=self.tmp_dir,
        searchspace=configspace_path,
        instance_file=instance_file,
        limit=time_left_for_smac,
        cutoff_time=self.per_run_time_limit,
        initial_challengers=initial_configurations,
        memory_limit=self.ml_memory_limit,
        seed=self.seed)
    self.logger.debug(smac_call)
    self.stopwatch_.stop_task("runSmac")

    # == Run the ensemble builder
    self.stopwatch_.start_task("runEnsemble")
    time_left_for_ensembles = max(
        0, self.time_left_for_this_task -
        self.stopwatch_.wall_elapsed(self.basename_))
    self.logger.debug("Start ensemble with %5.2fsec time left" %
                      time_left_for_ensembles)
    proc_ensembles = submit_process.run_ensemble_builder(
        tmp_dir=self.tmp_dir,
        dataset_name=self.basename_,
        task_type=self.task_,
        metric=self.metric_,
        limit=time_left_for_ensembles,
        output_dir=self.output_dir,
        ensemble_size=self.ensemble_size,
        ensemble_nbest=self.ensemble_nbest,
        seed=self.seed,
        ensemble_indices_output_dir=self.ensemble_indices_directory_)
    self.stopwatch_.stop_task("runEnsemble")

    del D

    if self.queue is not None:
        self.queue.put([time_needed_to_load_data, data_manager_path,
                        proc_smac, proc_ensembles])
    else:
        proc_smac.wait()
        proc_ensembles.wait()

        # Delete the AutoSklearn environment variable
        del os.environ["AUTOSKLEARN_SEED"]

    return self
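# _fit repeats one concurrency idiom four times (ensemble labels,
# pickled data manager, configuration space, instance file): take a
# file lock, then write the shared artifact only if no other process
# has written it yet. Distilled into a helper below -- a sketch using
# the same `lockfile` package; write_once is a hypothetical name, not
# a helper that exists in the project.
import os
import lockfile


def write_once(path, write_fn):
    """Create `path` exactly once across processes: the first caller
    holding the lock writes it, later callers see the file and skip."""
    with lockfile.LockFile(path + ".lock"):
        if not os.path.exists(path):
            write_fn(path)
            return True
    return False

# Usage mirroring the instance-file step above:
#   write_once(instance_file, lambda p: open(p, "w").write("holdout"))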