def train(self, train_data, tuning_data, resource,
          time_limits=None,
          search_strategy='random',
          search_options=None,
          scheduler_options=None,
          num_trials=None,
          plot_results=False,
          console_log=True,
          ignore_warning=True,
          verbosity=2):
    # Assumes the module-level imports of the surrounding file: os, time,
    # functools, mxnet as mx, and the AutoGluon helpers referenced below
    # (args, space, schedulers, compile_scheduler_options, logging_config,
    # force_forkserver, train_function, get_backbone,
    # TabularBasicBERTPreprocessor, BERTForTabularBasicV1,
    # get_mxnet_available_ctx).
    force_forkserver()
    start_tick = time.time()
    logging_config(folder=self._output_directory, name='main',
                   console=console_log, logger=self._logger)
    assert len(self._label_columns) == 1
    # TODO(sxjscience) Try to support S3
    os.makedirs(self._output_directory, exist_ok=True)
    search_space_reg = args(search_space=space.Dict(**self.search_space))
    # Scheduler and searcher for HPO
    if scheduler_options is None:
        scheduler_options = dict()
    scheduler_options = compile_scheduler_options(
        scheduler_options=scheduler_options,
        search_strategy=search_strategy,
        search_options=search_options,
        nthreads_per_trial=resource['num_cpus'],
        ngpus_per_trial=resource['num_gpus'],
        checkpoint=os.path.join(self._output_directory, 'checkpoint.ag'),
        num_trials=num_trials,
        time_out=time_limits,
        resume=False,
        visualizer=scheduler_options.get('visualizer'),
        time_attr='report_idx',
        reward_attr='reward_attr',
        dist_ip_addrs=scheduler_options.get('dist_ip_addrs'))
    # Create temporary cache files and ask the inner function to load them.
    train_df_path = os.path.join(self._output_directory,
                                 'cache_train_dataframe.pq')
    tuning_df_path = os.path.join(self._output_directory,
                                  'cache_tuning_dataframe.pq')
    train_data.table.to_parquet(train_df_path)
    tuning_data.table.to_parquet(tuning_df_path)
    train_fn = search_space_reg(
        functools.partial(train_function,
                          train_df_path=train_df_path,
                          time_limits=time_limits,
                          time_start=start_tick,
                          tuning_df_path=tuning_df_path,
                          base_config=self.base_config,
                          problem_types=self.problem_types,
                          column_properties=self._column_properties,
                          label_columns=self._label_columns,
                          label_shapes=self._label_shapes,
                          log_metrics=self._log_metrics,
                          stopping_metric=self._stopping_metric,
                          console_log=console_log,
                          ignore_warning=ignore_warning))
    scheduler_cls = schedulers[search_strategy.lower()]
    # Create the scheduler and run the HPO experiment
    scheduler = scheduler_cls(train_fn, **scheduler_options)
    scheduler.run()
    scheduler.join_jobs()
    if len(scheduler.config_history) == 0:
        raise RuntimeError(
            'No training job has been completed! '
            'There are two possibilities: '
            '1) The time_limits is too small, '
            'or 2) There are some internal errors in AutoGluon. '
            'For the first case, you can increase the time_limits or set it '
            'to None, e.g., setting '
            '"TextPrediction.fit(..., time_limits=None)". '
            'To further investigate the root cause, you can also try to '
            'train with "verbosity=3", i.e., '
            'TextPrediction.fit(..., verbosity=3).')
    best_config = scheduler.get_best_config()
    if verbosity >= 2:
        # Use explicit formatting; passing the results as a second positional
        # argument would be treated as a %-style formatting arg by logging.
        self._logger.info('Results={}'.format(scheduler.searcher._results))
        self._logger.info('Best_config={}'.format(best_config))
    best_task_id = scheduler.get_best_task_id()
    best_model_saved_dir_path = os.path.join(self._output_directory,
                                             'task{}'.format(best_task_id))
    best_cfg_path = os.path.join(best_model_saved_dir_path, 'cfg.yml')
    cfg = self.base_config.clone_merge(best_cfg_path)
    self._results = dict()
    self._results.update(best_reward=scheduler.get_best_reward(),
                         best_config=scheduler.get_best_config(),
                         total_time=time.time() - start_tick,
                         metadata=scheduler.metadata,
                         training_history=scheduler.training_history,
                         config_history=scheduler.config_history,
                         reward_attr=scheduler._reward_attr,
                         config=cfg)
    if plot_results:
        plot_training_curves = os.path.join(self._output_directory,
                                            'plot_training_curves.png')
        scheduler.get_training_curves(filename=plot_training_curves,
                                      plot=plot_results,
                                      use_legend=True)
    # TODO: Consider moving this to a separate predictor
    self._config = cfg
    backbone_model_cls, backbone_cfg, tokenizer, backbone_params_path, _ \
        = get_backbone(cfg.model.backbone.name)
    text_backbone = backbone_model_cls.from_cfg(backbone_cfg)
    preprocessor = TabularBasicBERTPreprocessor(
        tokenizer=tokenizer,
        column_properties=self._column_properties,
        label_columns=self._label_columns,
        max_length=cfg.model.preprocess.max_length,
        merge_text=cfg.model.preprocess.merge_text)
    self._preprocessor = preprocessor
    net = BERTForTabularBasicV1(
        text_backbone=text_backbone,
        feature_field_info=preprocessor.feature_field_info(),
        label_shape=self._label_shapes[0],
        cfg=cfg.model.network)
    net.hybridize()
    # Reload the parameters of the best trial onto the available devices.
    ctx_l = get_mxnet_available_ctx()
    net.load_parameters(os.path.join(best_model_saved_dir_path,
                                     'best_model.params'),
                        ctx=ctx_l)
    self._net = net
    mx.npx.waitall()
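# --- Usage sketch (hypothetical, not part of the original module) ---
# A minimal illustration of how this HPO-driven `train` might be invoked.
# `predictor`, `train_dataset`, and `tuning_dataset` are illustrative names;
# the datasets are assumed to be the tabular wrappers whose `.table`
# attribute is a pandas DataFrame (implied by the `to_parquet` calls above).
#
#   resource = {'num_cpus': 4, 'num_gpus': 1}
#   predictor.train(train_data=train_dataset,
#                   tuning_data=tuning_dataset,
#                   resource=resource,
#                   time_limits=2 * 60 * 60,   # two-hour HPO budget
#                   search_strategy='random',  # key into the `schedulers` dict
#                   num_trials=8,
#                   plot_results=True)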
def train(self, train_data, tuning_data, resource,
          time_limits=None,
          scheduler='fifo',
          searcher=None,
          num_trials=10,
          grace_period=None,
          max_t=None,
          reduction_factor=4,
          brackets=1,
          plot_results=False,
          console_log=True,
          ignore_warning=True):
    start_tick = time.time()
    logging_config(folder=self._output_directory, name='main',
                   console=console_log, logger=self._logger)
    assert len(self._label_columns) == 1
    # TODO(sxjscience) Try to support S3
    os.makedirs(self._output_directory, exist_ok=True)
    search_space_reg = args(search_space=space.Dict(**self.search_space))
    if scheduler == 'hyperband' and time_limits is None:
        time_limits = 5 * 60 * 60  # 5 hours
    train_fn = search_space_reg(
        functools.partial(train_function,
                          train_data=train_data,
                          time_limits=time_limits,
                          tuning_data=tuning_data,
                          base_config=self.base_config,
                          problem_types=self.problem_types,
                          column_properties=self._column_properties,
                          label_columns=self._label_columns,
                          label_shapes=self._label_shapes,
                          log_metrics=self._log_metrics,
                          stopping_metric=self._stopping_metric,
                          console_log=console_log,
                          ignore_warning=ignore_warning))
    if scheduler == 'fifo':
        if searcher is None:
            searcher = 'random'
        scheduler = FIFOScheduler(train_fn,
                                  time_out=time_limits,
                                  num_trials=num_trials,
                                  resource=resource,
                                  searcher=searcher,
                                  checkpoint=None,
                                  reward_attr='reward',
                                  time_attr='time_spent')
    elif scheduler == 'hyperband':
        if searcher is None:
            searcher = 'random'
        if grace_period is None:
            grace_period = 1
        if max_t is None:
            max_t = 5
        scheduler = HyperbandScheduler(train_fn,
                                       time_out=time_limits,
                                       max_t=max_t,
                                       resource=resource,
                                       searcher=searcher,
                                       grace_period=grace_period,
                                       reduction_factor=reduction_factor,
                                       brackets=brackets,
                                       checkpoint=None,
                                       reward_attr='reward',
                                       time_attr='report_idx')
    else:
        raise NotImplementedError(
            'scheduler={} is not supported. '
            'Choose "fifo" or "hyperband".'.format(scheduler))
    scheduler.run()
    scheduler.join_jobs()
    if len(scheduler.config_history) == 0:
        raise RuntimeError(
            'No training job has been completed! '
            'There are two possibilities: '
            '1) The time_limits is too small, '
            'or 2) There are some internal errors in AutoGluon. '
            'For the first case, you can increase the time_limits or set it '
            'to None, e.g., setting '
            '"TextPrediction.fit(..., time_limits=None)". '
            'To further investigate the root cause, you can also try to '
            'train with "verbosity=3", i.e., '
            'TextPrediction.fit(..., verbosity=3).')
    best_config = scheduler.get_best_config()
    self._logger.info('Best_config={}'.format(best_config))
    best_task_id = scheduler.get_best_task_id()
    best_model_saved_dir_path = os.path.join(self._output_directory,
                                             'task{}'.format(best_task_id))
    best_cfg_path = os.path.join(best_model_saved_dir_path, 'cfg.yml')
    cfg = self.base_config.clone_merge(best_cfg_path)
    self._results = dict()
    self._results.update(best_reward=scheduler.get_best_reward(),
                         best_config=scheduler.get_best_config(),
                         total_time=time.time() - start_tick,
                         metadata=scheduler.metadata,
                         training_history=scheduler.training_history,
                         config_history=scheduler.config_history,
                         reward_attr=scheduler._reward_attr,
                         config=cfg)
    if plot_results:
        plot_training_curves = os.path.join(self._output_directory,
                                            'plot_training_curves.png')
        scheduler.get_training_curves(filename=plot_training_curves,
                                      plot=plot_results,
                                      use_legend=True)
    # TODO: Consider moving this to a separate predictor
    self._config = cfg
    backbone_model_cls, backbone_cfg, tokenizer, backbone_params_path, _ \
        = get_backbone(cfg.model.backbone.name)
    text_backbone = backbone_model_cls.from_cfg(backbone_cfg)
    preprocessor = TabularBasicBERTPreprocessor(
        tokenizer=tokenizer,
        column_properties=self._column_properties,
        label_columns=self._label_columns,
        max_length=cfg.model.preprocess.max_length,
        merge_text=cfg.model.preprocess.merge_text)
    self._preprocessor = preprocessor
    net = BERTForTabularBasicV1(
        text_backbone=text_backbone,
        feature_field_info=preprocessor.feature_field_info(),
        label_shape=self._label_shapes[0],
        cfg=cfg.model.network)
    # Here, we cannot use the GPU due to
    # https://github.com/awslabs/autogluon/issues/602
    net.load_parameters(os.path.join(best_model_saved_dir_path,
                                     'best_model.params'),
                        ctx=mx.cpu())
    self._net = net
    mx.npx.waitall()
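# --- Usage sketch (hypothetical, not part of the original module) ---
# The same method driven through the Hyperband path: each trial reports
# progress on the `report_idx` time axis, and under-performing trials are
# stopped early once they have produced `grace_period` reports. As above,
# `predictor` and the dataset names are illustrative.
#
#   predictor.train(train_data=train_dataset,
#                   tuning_data=tuning_dataset,
#                   resource={'num_cpus': 8, 'num_gpus': 1},
#                   scheduler='hyperband',
#                   grace_period=1,     # minimum reports before early stop
#                   max_t=10,           # maximum reports per trial
#                   reduction_factor=4,
#                   brackets=1)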