Example #1
 def train(self,
           train_data,
           tuning_data,
           resource,
           time_limits=None,
           search_strategy='random',
           search_options=None,
           scheduler_options=None,
           num_trials=None,
           plot_results=False,
           console_log=True,
           ignore_warning=True,
           verbosity=2):
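     # Force the 'forkserver' multiprocessing start method: plain fork is unsafe
     # once MXNet engine threads (and any CUDA contexts) exist in the parent process.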
     force_forkserver()
     start_tick = time.time()
     logging_config(folder=self._output_directory,
                    name='main',
                    console=console_log,
                    logger=self._logger)
     assert len(self._label_columns) == 1
     # TODO(sxjscience) Try to support S3
     os.makedirs(self._output_directory, exist_ok=True)
     search_space_reg = args(search_space=space.Dict(**self.search_space))
     # Scheduler and searcher for HPO
     if scheduler_options is None:
         scheduler_options = dict()
     scheduler_options = compile_scheduler_options(
         scheduler_options=scheduler_options,
         search_strategy=search_strategy,
         search_options=search_options,
         nthreads_per_trial=resource['num_cpus'],
         ngpus_per_trial=resource['num_gpus'],
         checkpoint=os.path.join(self._output_directory, 'checkpoint.ag'),
         num_trials=num_trials,
         time_out=time_limits,
         resume=False,
         visualizer=scheduler_options.get('visualizer'),
         time_attr='report_idx',
         reward_attr='reward_attr',
         dist_ip_addrs=scheduler_options.get('dist_ip_addrs'))
     # Cache the train/tuning dataframes to temporary parquet files and have the
     # inner training function load them from disk.
     train_df_path = os.path.join(self._output_directory,
                                  'cache_train_dataframe.pq')
     tuning_df_path = os.path.join(self._output_directory,
                                   'cache_tuning_dataframe.pq')
     train_data.table.to_parquet(train_df_path)
     tuning_data.table.to_parquet(tuning_df_path)
     train_fn = search_space_reg(
         functools.partial(train_function,
                           train_df_path=train_df_path,
                           time_limits=time_limits,
                           time_start=start_tick,
                           tuning_df_path=tuning_df_path,
                           base_config=self.base_config,
                           problem_types=self.problem_types,
                           column_properties=self._column_properties,
                           label_columns=self._label_columns,
                           label_shapes=self._label_shapes,
                           log_metrics=self._log_metrics,
                           stopping_metric=self._stopping_metric,
                           console_log=console_log,
                           ignore_warning=ignore_warning))
     scheduler_cls = schedulers[search_strategy.lower()]
     # Create scheduler, run HPO experiment
     scheduler = scheduler_cls(train_fn, **scheduler_options)
     scheduler.run()
     scheduler.join_jobs()
     if len(scheduler.config_history) == 0:
         raise RuntimeError(
             'No training job has been completed! '
             'There are two possibilities: '
             '1) time_limits is too small, '
             'or 2) there is an internal error in AutoGluon. '
             'For the first case, you can increase time_limits or set it to '
             'None, e.g., "TextPrediction.fit(..., time_limits=None)". To '
             'further investigate the root cause, you can also train with '
             '"verbosity=3", i.e., "TextPrediction.fit(..., verbosity=3)".')
     best_config = scheduler.get_best_config()
     if verbosity >= 2:
         self._logger.info('Results=%s', scheduler.searcher._results)
         self._logger.info('Best_config={}'.format(best_config))
     best_task_id = scheduler.get_best_task_id()
     best_model_saved_dir_path = os.path.join(self._output_directory,
                                              'task{}'.format(best_task_id))
     best_cfg_path = os.path.join(best_model_saved_dir_path, 'cfg.yml')
     cfg = self.base_config.clone_merge(best_cfg_path)
     self._results = dict()
     self._results.update(best_reward=scheduler.get_best_reward(),
                          best_config=scheduler.get_best_config(),
                          total_time=time.time() - start_tick,
                          metadata=scheduler.metadata,
                          training_history=scheduler.training_history,
                          config_history=scheduler.config_history,
                          reward_attr=scheduler._reward_attr,
                          config=cfg)
     if plot_results:
         plot_training_curves = os.path.join(self._output_directory,
                                             'plot_training_curves.png')
         scheduler.get_training_curves(filename=plot_training_curves,
                                       plot=plot_results,
                                       use_legend=True)
     # Consider moving this to a separate predictor.
     self._config = cfg
     backbone_model_cls, backbone_cfg, tokenizer, backbone_params_path, _ \
         = get_backbone(cfg.model.backbone.name)
     text_backbone = backbone_model_cls.from_cfg(backbone_cfg)
     preprocessor = TabularBasicBERTPreprocessor(
         tokenizer=tokenizer,
         column_properties=self._column_properties,
         label_columns=self._label_columns,
         max_length=cfg.model.preprocess.max_length,
         merge_text=cfg.model.preprocess.merge_text)
     self._preprocessor = preprocessor
     net = BERTForTabularBasicV1(
         text_backbone=text_backbone,
         feature_field_info=preprocessor.feature_field_info(),
         label_shape=self._label_shapes[0],
         cfg=cfg.model.network)
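     # hybridize() compiles the imperative Gluon network into a static
     # computation graph for faster execution.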
     net.hybridize()
     ctx_l = get_mxnet_available_ctx()
     net.load_parameters(os.path.join(best_model_saved_dir_path,
                                      'best_model.params'),
                         ctx=ctx_l)
     self._net = net
     mx.npx.waitall()
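
The method above belongs to a text-prediction trainer class that is not shown on this page. A minimal, hypothetical usage sketch follows: the trainer object and the dataset wrappers are assumptions, while the keyword names, the resource keys 'num_cpus'/'num_gpus', and the requirement that the data wrappers expose a .table dataframe all come from the snippet itself.

# Hypothetical usage sketch -- the trainer construction and datasets are assumed;
# only the train(...) keywords and the resource keys come from the snippet above.
trainer = ...  # an instance of the (unshown) trainer class this method belongs to
resource = {'num_cpus': 4, 'num_gpus': 1}   # keys read by compile_scheduler_options
trainer.train(train_data=train_dataset,     # wrappers must expose a .table DataFrame
              tuning_data=tuning_dataset,   # (see the .to_parquet calls above)
              resource=resource,
              search_strategy='random',     # must be a key of the `schedulers` registry
              num_trials=5,
              time_limits=3600,             # seconds; None means no time budget
              plot_results=True)
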
Example #2
 def train(self, train_data, tuning_data, resource,
           time_limits=None,
           scheduler='fifo',
           searcher=None,
           num_trials=10,
           grace_period=None,
           max_t=None,
           reduction_factor=4,
           brackets=1,
           plot_results=False,
           console_log=True,
           ignore_warning=True):
     start_tick = time.time()
     logging_config(folder=self._output_directory, name='main',
                    console=console_log,
                    logger=self._logger)
     assert len(self._label_columns) == 1
     # TODO(sxjscience) Try to support S3
     os.makedirs(self._output_directory, exist_ok=True)
     search_space_reg = args(search_space=space.Dict(**self.search_space))
     if scheduler == 'hyperband' and time_limits is None:
         time_limits = 5 * 60 * 60  # 5 hours
     train_fn = search_space_reg(functools.partial(train_function,
                                                   train_data=train_data,
                                                   time_limits=time_limits,
                                                   tuning_data=tuning_data,
                                                   base_config=self.base_config,
                                                   problem_types=self.problem_types,
                                                   column_properties=self._column_properties,
                                                   label_columns=self._label_columns,
                                                   label_shapes=self._label_shapes,
                                                   log_metrics=self._log_metrics,
                                                   stopping_metric=self._stopping_metric,
                                                   console_log=console_log,
                                                   ignore_warning=ignore_warning))
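     # FIFO runs every trial to completion in submission order; Hyperband
     # early-stops underperforming trials via successive halving.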
     if scheduler == 'fifo':
         if searcher is None:
             searcher = 'random'
         scheduler = FIFOScheduler(train_fn,
                                   time_out=time_limits,
                                   num_trials=num_trials,
                                   resource=resource,
                                   searcher=searcher,
                                   checkpoint=None,
                                   reward_attr='reward',
                                   time_attr='time_spent')
     elif scheduler == 'hyperband':
         if searcher is None:
             searcher = 'random'
         if grace_period is None:
             grace_period = 1
         if max_t is None:
             max_t = 5
         scheduler = HyperbandScheduler(train_fn,
                                        time_out=time_limits,
                                        max_t=max_t,
                                        resource=resource,
                                        searcher=searcher,
                                        grace_period=grace_period,
                                        reduction_factor=reduction_factor,
                                        brackets=brackets,
                                        checkpoint=None,
                                        reward_attr='reward',
                                        time_attr='report_idx')
     else:
         raise NotImplementedError('Unknown scheduler: {}'.format(scheduler))
     scheduler.run()
     scheduler.join_jobs()
     if len(scheduler.config_history) == 0:
         raise RuntimeError('No training job has been completed! '
                            'There are two possibilities: '
                            '1) time_limits is too small, '
                            'or 2) there is an internal error in AutoGluon. '
                            'For the first case, you can increase time_limits or set it to '
                            'None, e.g., "TextPrediction.fit(..., time_limits=None)". To '
                            'further investigate the root cause, you can also train with '
                            '"verbosity=3", i.e., "TextPrediction.fit(..., verbosity=3)".')
     best_config = scheduler.get_best_config()
     self._logger.info('Best_config={}'.format(best_config))
     best_task_id = scheduler.get_best_task_id()
     best_model_saved_dir_path = os.path.join(self._output_directory,
                                              'task{}'.format(best_task_id))
     best_cfg_path = os.path.join(best_model_saved_dir_path, 'cfg.yml')
     cfg = self.base_config.clone_merge(best_cfg_path)
     self._results = dict()
     self._results.update(best_reward=scheduler.get_best_reward(),
                          best_config=scheduler.get_best_config(),
                          total_time=time.time() - start_tick,
                          metadata=scheduler.metadata,
                          training_history=scheduler.training_history,
                          config_history=scheduler.config_history,
                          reward_attr=scheduler._reward_attr,
                          config=cfg)
     if plot_results:
         plot_training_curves = os.path.join(self._output_directory, 'plot_training_curves.png')
         scheduler.get_training_curves(filename=plot_training_curves, plot=plot_results,
                                       use_legend=True)
     # Consider moving this to a separate predictor.
     self._config = cfg
     backbone_model_cls, backbone_cfg, tokenizer, backbone_params_path, _ \
         = get_backbone(cfg.model.backbone.name)
     text_backbone = backbone_model_cls.from_cfg(backbone_cfg)
     preprocessor = TabularBasicBERTPreprocessor(tokenizer=tokenizer,
                                                 column_properties=self._column_properties,
                                                 label_columns=self._label_columns,
                                                 max_length=cfg.model.preprocess.max_length,
                                                 merge_text=cfg.model.preprocess.merge_text)
     self._preprocessor = preprocessor
     net = BERTForTabularBasicV1(text_backbone=text_backbone,
                                 feature_field_info=preprocessor.feature_field_info(),
                                 label_shape=self._label_shapes[0],
                                 cfg=cfg.model.network)
     # Here, we cannot use GPU due to https://github.com/awslabs/autogluon/issues/602
     net.load_parameters(os.path.join(best_model_saved_dir_path, 'best_model.params'),
                         ctx=mx.cpu())
     self._net = net
     mx.npx.waitall()
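
As with Example #1, the enclosing trainer class is not shown. A hypothetical call of the Hyperband branch might look as follows; only the keyword names and their defaults are taken from the signature above, everything else is illustrative.

# Hypothetical usage sketch for the Hyperband branch -- illustrative only.
# (trainer / train_dataset / tuning_dataset as in the sketch after Example #1.)
trainer.train(train_data=train_dataset,
              tuning_data=tuning_dataset,
              resource={'num_cpus': 8, 'num_gpus': 1},
              scheduler='hyperband',   # selects the HyperbandScheduler above
              searcher='random',       # the default when searcher is None
              num_trials=20,
              grace_period=1,          # minimum reports before a trial can be stopped
              max_t=5,                 # maximum time_attr value per trial
              reduction_factor=4,
              brackets=1)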