def _score_with_pred_proba(self, y, y_internal, y_pred_proba_internal, metric,
                           sample_weight=None, weight_evaluation=None):
    metric = get_metric(metric, self.problem_type, 'leaderboard_metric')
    if weight_evaluation is None:
        weight_evaluation = self.weight_evaluation
    if metric.needs_pred:
        if self.problem_type == BINARY:
            # Use 1 and 0, otherwise f1 can crash due to unknown pos_label.
            y_pred = get_pred_from_proba(y_pred_proba_internal, problem_type=self.problem_type)
            y_tmp = y_internal
        else:
            y_pred = self.label_cleaner.inverse_transform_proba(y_pred_proba_internal, as_pred=True)
            y_tmp = y
    elif metric.needs_quantile:
        y_pred = self.label_cleaner.inverse_transform_proba(y_pred_proba_internal, as_pred=True)
        y_tmp = y
    else:
        y_pred = self.label_cleaner.inverse_transform_proba(y_pred_proba_internal, as_pred=False)
        y_tmp = y_internal
    return compute_weighted_metric(y_tmp, y_pred, metric, weights=sample_weight,
                                   weight_evaluation=weight_evaluation,
                                   quantile_levels=self.quantile_levels)
def evaluate(self, valid_data, metrics):
    """ Report the predictive performance evaluated for a given dataset.

    Parameters
    ----------
    valid_data : str or :class:`TabularDataset` or `pandas.DataFrame`
        This dataset must contain the label column with the same column name as specified during `fit()`.
        If a str is passed, `valid_data` will be loaded using the str value as the file path.
    metrics : List[str]
        A list of names of metrics to report.

    Returns
    -------
    Dict mapping metric -> score calculated over the given dataset.
    """
    if isinstance(metrics, str):
        metrics = [metrics]
    assert self.net is not None
    if not isinstance(valid_data, TabularDataset):
        valid_data = TabularDataset(valid_data,
                                    columns=self._feature_columns + self._label_columns,
                                    column_properties=self._column_properties)
    ground_truth = np.array(valid_data.table[self._label_columns[0]].apply(
        self._column_properties[self._label_columns[0]].transform))
    if self._problem_types[0] == _C.CLASSIFICATION:
        predictions = self.predict_proba(valid_data)
    else:
        predictions = self.predict(valid_data)
    metric_scores = {metric: calculate_metric(get_metric(metric),
                                              ground_truth, predictions,
                                              self._problem_types[0])
                     for metric in metrics}
    return metric_scores
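# A minimal usage sketch for the `evaluate` method above. `predictor` is assumed to be an
# already-fit model exposing this method, and 'dev.csv' a held-out file containing the label
# column used during fit(); both names (and the metric choices) are placeholders, not taken
# from the source above.
scores = predictor.evaluate('dev.csv', metrics=['acc', 'f1'])
print(scores)  # e.g. {'acc': ..., 'f1': ...}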
def __init__(self, path_context: str, label: str, feature_generator: PipelineFeatureGenerator,
             ignored_columns: list = None, label_count_threshold=10, problem_type=None,
             eval_metric=None, positive_class=None, cache_data=True, is_trainer_present=False,
             random_state=0):
    self.path, self.model_context, self.save_path = self.create_contexts(path_context)
    self.label = label
    self.ignored_columns = ignored_columns
    if self.ignored_columns is None:
        self.ignored_columns = []
    self.threshold = label_count_threshold
    self.problem_type = problem_type
    self.eval_metric = get_metric(eval_metric, self.problem_type, 'eval_metric')
    self.cache_data = cache_data
    if not self.cache_data:
        logger.log(30, 'Warning: `cache_data=False` will disable or limit advanced functionality after training such as feature importance calculations. It is recommended to set `cache_data=True` unless you explicitly wish to not have the data saved to disk.')
    self.is_trainer_present = is_trainer_present
    if random_state is None:
        random_state = random.randint(0, 1000000)
    self.random_state = random_state
    self.cleaner = None
    self.label_cleaner: LabelCleaner = None
    self.feature_generator: PipelineFeatureGenerator = feature_generator
    self.trainer: AbstractTrainer = None
    self.trainer_type = None
    self.trainer_path = None
    self.reset_paths = False
    self._positive_class = positive_class
    try:
        from ..version import __version__
        self.version = __version__
    except:
        self.version = None
def __init__(self, path_context: str, label: str, feature_generator: PipelineFeatureGenerator,
             ignored_columns: list = None, label_count_threshold=10, problem_type=None,
             eval_metric=None, positive_class=None, cache_data=True, is_trainer_present=False,
             random_state=0, sample_weight=None, weight_evaluation=False):
    self.path, self.model_context, self.save_path = self.create_contexts(path_context)
    self.label = label
    self.ignored_columns = ignored_columns
    if self.ignored_columns is None:
        self.ignored_columns = []
    self.threshold = label_count_threshold
    self.problem_type = problem_type
    self.eval_metric = get_metric(eval_metric, self.problem_type, 'eval_metric')
    self.cache_data = cache_data
    if not self.cache_data:
        logger.log(30, 'Warning: `cache_data=False` will disable or limit advanced functionality after training such as feature importance calculations. It is recommended to set `cache_data=True` unless you explicitly wish to not have the data saved to disk.')
    self.is_trainer_present = is_trainer_present
    if random_state is None:
        random_state = random.randint(0, 1000000)
    self.random_state = random_state
    self.cleaner = None
    self.label_cleaner: LabelCleaner = None
    self.feature_generator: PipelineFeatureGenerator = feature_generator
    self.trainer: AbstractTrainer = None
    self.trainer_type = None
    self.trainer_path = None
    self.reset_paths = False
    self._positive_class = positive_class
    self.sample_weight = sample_weight
    self.weight_evaluation = weight_evaluation
    if sample_weight is not None and not isinstance(sample_weight, str):
        raise ValueError("sample_weight must be a string indicating the name of column that contains sample weights. If you have a vector of sample weights, first add these as an extra column to your data.")
    if weight_evaluation and sample_weight is None:
        raise ValueError("Must specify sample_weight column if you specify weight_evaluation=True")
    try:
        from ..version import __version__
        self.version = __version__
    except:
        self.version = None
def estimate_importance(dataset, model_name):
    if os.path.exists(os.path.join('feature_importance', dataset, model_name, 'importance.csv')):
        print(f'Found {dataset}, {model_name}')
        return
    model_remote_path = stat_df.loc[model_name, dataset]
    postfix = '/test_score.json'
    remote_dir_name = model_remote_path[:-len(postfix)]

    def downloadDirectoryFroms3(bucketName, remoteDirectoryName, local_dir_path):
        s3_resource = boto3.resource('s3')
        bucket = s3_resource.Bucket(bucketName)
        for obj in bucket.objects.filter(Prefix=remoteDirectoryName):
            print(obj.key)
            download_path = os.path.join(local_dir_path, obj.key)
            if not os.path.exists(os.path.dirname(download_path)):
                os.makedirs(os.path.dirname(download_path), exist_ok=True)
            bucket.download_file(obj.key, download_path)

    local_dir_name = os.path.join(download_path, remote_dir_name)
    if not os.path.exists(local_dir_name):
        downloadDirectoryFroms3('automl-mm-bench', remote_dir_name, download_path)
    test_dataset = dataset_registry.create(dataset, 'test')
    if model_name == MULTIMODAL_TEXT_MODEL_NAME:
        predictor = MultiModalTextModel.load(os.path.join(local_dir_name, 'saved_model'))
    elif model_name == TABULAR_MODEL_NAME:
        predictor = TabularPredictor.load(os.path.join(local_dir_name))
    elif model_name == STACK_ENSEMBLE_MODEL_NAME:
        predictor = TabularPredictor.load(os.path.join(local_dir_name))
    else:
        raise NotImplementedError
    sample_size = min(len(test_dataset.data), 1000)
    if model_name == TABULAR_MODEL_NAME:
        importance_df = predictor.feature_importance(
            test_dataset.data[test_dataset.feature_columns + test_dataset.label_columns],
            subsample_size=sample_size)
    else:
        importance_df = compute_permutation_feature_importance(
            test_dataset.data[test_dataset.feature_columns],
            test_dataset.data[test_dataset.label_columns[0]],
            predict_func=predictor.predict,
            eval_metric=get_metric(test_dataset.metric),
            subsample_size=sample_size,
            num_shuffle_sets=3)
    os.makedirs(os.path.join('feature_importance', dataset, model_name), exist_ok=True)
    importance_df.to_csv(os.path.join('feature_importance', dataset, model_name, 'importance.csv'))
    print(importance_df)
def _get_default_stopping_metric(self):
    if self.eval_metric.name == 'roc_auc':
        stopping_metric = 'log_loss'
    else:
        stopping_metric = self.eval_metric
    stopping_metric = metrics.get_metric(stopping_metric, self.problem_type, 'stopping_metric')
    return stopping_metric
def __init__(self, path_context: str, label: str, feature_generator: PipelineFeatureGenerator,
             ignored_columns: list = None, label_count_threshold=10, problem_type=None,
             quantile_levels=None, eval_metric=None, positive_class=None, cache_data=True,
             is_trainer_present=False, random_state=0, sample_weight=None, weight_evaluation=False):
    self.path, self.model_context, self.save_path = self.create_contexts(path_context)
    self.label = label
    self.ignored_columns = ignored_columns
    if self.ignored_columns is None:
        self.ignored_columns = []
    self.threshold = label_count_threshold
    self.problem_type = problem_type
    self.eval_metric = get_metric(eval_metric, self.problem_type, 'eval_metric')
    if self.problem_type == QUANTILE and quantile_levels is None:
        raise ValueError("if `problem_type='quantile'`, `quantile_levels` has to be specified")
    if isinstance(quantile_levels, float):
        quantile_levels = [quantile_levels]
    if isinstance(quantile_levels, Iterable):
        for quantile in quantile_levels:
            if quantile <= 0.0 or quantile >= 1.0:
                raise ValueError("quantile values have to be greater than 0.0 and less than 1.0 (0.0 < q < 1.0). "
                                 "For example, 0.95 quantile = 95th percentile")
        quantile_levels = np.sort(np.array(quantile_levels))
    self.quantile_levels = quantile_levels
    self.cache_data = cache_data
    if not self.cache_data:
        logger.log(30, 'Warning: `cache_data=False` will disable or limit advanced functionality after training such as feature importance calculations. It is recommended to set `cache_data=True` unless you explicitly wish to not have the data saved to disk.')
    self.is_trainer_present = is_trainer_present
    if random_state is None:
        random_state = random.randint(0, 1000000)
    self.random_state = random_state
    self.cleaner = None
    self.label_cleaner: LabelCleaner = None
    self.feature_generator: PipelineFeatureGenerator = feature_generator
    self.trainer: AbstractTrainer = None
    self.trainer_type = None
    self.trainer_path = None
    self.reset_paths = False
    self._pre_X_rows = None
    self._post_X_rows = None
    self._positive_class = positive_class
    self.sample_weight = sample_weight
    self.weight_evaluation = weight_evaluation
    if sample_weight is not None and not isinstance(sample_weight, str):
        raise ValueError("sample_weight must be a string indicating the name of column that contains sample weights. If you have a vector of sample weights, first add these as an extra column to your data.")
    if weight_evaluation and sample_weight is None:
        raise ValueError("Must specify sample_weight column if you specify weight_evaluation=True")
    try:
        from ..version import __version__
        self.version = __version__
    except:
        self.version = None
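# A minimal standalone sketch of the quantile_levels normalization performed above. This is
# an illustration that mirrors the constructor logic rather than calling it; the variable
# names are placeholders.
import numpy as np

quantile_levels = 0.95                       # a single float is accepted...
if isinstance(quantile_levels, float):
    quantile_levels = [quantile_levels]      # ...and wrapped in a list
for q in quantile_levels:
    assert 0.0 < q < 1.0                     # values must lie strictly between 0 and 1
quantile_levels = np.sort(np.array(quantile_levels))
print(quantile_levels)                       # [0.95]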
def _score_with_pred(self, y, y_internal, y_pred_internal, metric, sample_weight=None):
    metric = get_metric(metric, self.problem_type, 'leaderboard_metric')
    if self.problem_type == BINARY:
        # Use 1 and 0, otherwise f1 can crash due to unknown pos_label.
        y_pred = y_pred_internal
        y_tmp = y_internal
    else:
        y_pred = self.label_cleaner.inverse_transform(y_pred_internal)
        y_tmp = y
    return compute_weighted_metric(y_tmp, y_pred, metric, weights=sample_weight,
                                   weight_evaluation=self.weight_evaluation,
                                   quantile_levels=self.quantile_levels)
def evaluate_predictions(self, y_true, y_pred, sample_weight=None, silent=False, auxiliary_metrics=True, detailed_report=False):
    """ Evaluate the provided predictions against ground-truth labels.

    Args:
        silent (bool): Should we print which metric is being used as well as performance.
        auxiliary_metrics (bool): Should we compute other (problem_type specific) metrics in addition to the default metric?
        detailed_report (bool): Should we compute more-detailed versions of the auxiliary_metrics? (requires auxiliary_metrics=True).

    Returns dict where keys = metric names, values = performance along each metric.
    """
    is_proba = False
    assert isinstance(y_true, (np.ndarray, pd.Series))
    assert isinstance(y_pred, (np.ndarray, pd.Series, pd.DataFrame))
    self._validate_class_labels(y_true)
    if isinstance(y_pred, np.ndarray):
        if self.problem_type == QUANTILE:
            y_pred = pd.DataFrame(data=y_pred, columns=self.quantile_levels)
        elif len(y_pred.shape) > 1:
            y_pred = pd.DataFrame(data=y_pred, columns=self.class_labels)
    if isinstance(y_pred, pd.DataFrame):
        is_proba = True
    elif not self.eval_metric.needs_pred:
        raise AssertionError(f'`evaluate_predictions` requires y_pred_proba input '
                             f'when evaluating "{self.eval_metric.name}"... Please generate valid input via `predictor.predict_proba(data)`.\n'
                             f'This may have occurred if you passed in predict input instead of predict_proba input, '
                             f'or if you specified `as_multiclass=False` to `predictor.predict_proba(data, as_multiclass=False)`, '
                             f'which is not supported by `evaluate_predictions`.')
    if is_proba:
        y_pred_proba = y_pred
        y_pred = get_pred_from_proba_df(y_pred_proba, problem_type=self.problem_type)
        if self.problem_type == BINARY:
            # roc_auc crashes if this isn't done
            y_pred_proba = y_pred_proba[self.positive_class]
    else:
        y_pred_proba = None
        y_pred = pd.Series(y_pred)
    if y_pred_proba is not None:
        y_pred_proba_internal = self.label_cleaner.transform_proba(y_pred_proba, as_pandas=True)
    else:
        y_pred_proba_internal = None
    y_true_internal = self.label_cleaner.transform(y_true)  # Get labels in numeric order
    y_true_internal = y_true_internal.fillna(-1)
    y_pred_internal = self.label_cleaner.transform(y_pred)  # Get labels in numeric order

    # Compute auxiliary metrics:
    auxiliary_metrics_lst = [self.eval_metric]
    performance_dict = {}

    if auxiliary_metrics:
        if self.problem_type == REGRESSION:  # Adding regression metrics
            auxiliary_metrics_lst += [
                'root_mean_squared_error', 'mean_squared_error', 'mean_absolute_error',
                'r2', 'pearsonr', 'median_absolute_error',
            ]
        if self.problem_type in [BINARY, MULTICLASS]:  # Adding classification metrics
            auxiliary_metrics_lst += [
                'accuracy', 'balanced_accuracy',
                # 'log_loss',  # Don't include as it probably adds more confusion to novice users (can be infinite)
                'mcc',
            ]
        if self.problem_type == BINARY:  # binary-specific metrics
            auxiliary_metrics_lst += [
                'roc_auc', 'f1', 'precision', 'recall',
            ]

    scoring_args = dict(
        y=y_true,
        y_internal=y_true_internal,
        weight_evaluation=False,
    )

    if sample_weight is not None:
        scoring_args['sample_weight'] = sample_weight
        scoring_args['weight_evaluation'] = True

    for aux_metric in auxiliary_metrics_lst:
        if isinstance(aux_metric, str):
            aux_metric = get_metric(metric=aux_metric, problem_type=self.problem_type, metric_type='aux_metric')
        if not aux_metric.needs_pred and y_pred_proba_internal is None:
            logger.log(15, f'Skipping {aux_metric.name} because no prediction probabilities are available to score.')
            continue
        if aux_metric.name not in performance_dict:
            if y_pred_proba_internal is not None:
                score = self._score_with_pred_proba(
                    y_pred_proba_internal=y_pred_proba_internal,
                    metric=aux_metric,
                    **scoring_args
                )
            else:
                score = self._score_with_pred(
                    y_pred_internal=y_pred_internal,
                    metric=aux_metric,
                    **scoring_args
                )
            performance_dict[aux_metric.name] = score

    if self.eval_metric.name in performance_dict:
        score_eval = performance_dict[self.eval_metric.name]
        score_eval_flipped = self.eval_metric.convert_score_to_sklearn_val(score_eval)  # flip negative once again back to positive (so higher is no longer necessarily better)
        if score_eval_flipped != score_eval:
            flipped = True
        else:
            flipped = False
        if not silent:
            logger.log(20, f"Evaluation: {self.eval_metric.name} on test data: {score_eval}")
            if flipped:
                logger.log(20, f"\tNote: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.")

    if not silent:
        logger.log(20, "Evaluations on test data:")
        logger.log(20, json.dumps(performance_dict, indent=4))

    if detailed_report and (self.problem_type != REGRESSION):
        # Construct confusion matrix
        try:
            performance_dict['confusion_matrix'] = confusion_matrix(y_true, y_pred,
                                                                    labels=self.label_cleaner.ordered_class_labels,
                                                                    output_format='pandas_dataframe')
        except ValueError:
            pass
        # One final set of metrics to report
        cl_metric = lambda y_true, y_pred: classification_report(y_true, y_pred, output_dict=True)
        metric_name = 'classification_report'
        if metric_name not in performance_dict:
            try:  # only compute auxiliary metrics which do not error (y_pred = class-probabilities may cause some metrics to error)
                performance_dict[metric_name] = cl_metric(y_true, y_pred)
            except ValueError:
                pass
            if not silent and metric_name in performance_dict:
                logger.log(20, "Detailed (per-class) classification report:")
                logger.log(20, json.dumps(performance_dict[metric_name], indent=4))
    return performance_dict
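# A short, hedged sketch of how `evaluate_predictions` is typically driven from a
# TabularPredictor-style API. `predictor`, `test_data`, and the 'class' label column are
# placeholders; the exact predictor class is not shown in the code above.
y_true = test_data['class']                         # ground-truth labels
y_pred_proba = predictor.predict_proba(test_data)   # DataFrame of class probabilities
perf = predictor.evaluate_predictions(y_true=y_true, y_pred=y_pred_proba,
                                       auxiliary_metrics=True, detailed_report=False)
# perf maps metric name -> score; scores are reported so that higher is better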
def score_debug(self, X: DataFrame, y=None, extra_info=False, compute_oracle=False, extra_metrics=None, silent=False):
    leaderboard_df = self.leaderboard(extra_info=extra_info, silent=silent)
    if y is None:
        X, y = self.extract_label(X)
    if extra_metrics is None:
        extra_metrics = []
    self._validate_class_labels(y)
    w = None
    if self.weight_evaluation:
        X, w = extract_column(X, self.sample_weight)

    X = self.transform_features(X)
    y_internal = self.label_cleaner.transform(y)
    y_internal = y_internal.fillna(-1)

    trainer = self.load_trainer()
    scores = {}
    all_trained_models = trainer.get_model_names()
    all_trained_models_can_infer = trainer.get_model_names(can_infer=True)
    all_trained_models_original = all_trained_models.copy()
    model_pred_proba_dict, pred_time_test_marginal = trainer.get_model_pred_proba_dict(
        X=X, models=all_trained_models_can_infer, fit=False, record_pred_time=True)

    if compute_oracle:
        pred_probas = list(model_pred_proba_dict.values())
        ensemble_selection = EnsembleSelection(ensemble_size=100, problem_type=trainer.problem_type,
                                               metric=self.eval_metric, quantile_levels=self.quantile_levels)
        ensemble_selection.fit(predictions=pred_probas, labels=y_internal, identifiers=None, sample_weight=w)  # TODO: Only fit non-nan

        oracle_weights = ensemble_selection.weights_
        oracle_pred_time_start = time.time()
        oracle_pred_proba_norm = [pred * weight for pred, weight in zip(pred_probas, oracle_weights)]
        oracle_pred_proba_ensemble = np.sum(oracle_pred_proba_norm, axis=0)
        oracle_pred_time = time.time() - oracle_pred_time_start
        model_pred_proba_dict['OracleEnsemble'] = oracle_pred_proba_ensemble
        pred_time_test_marginal['OracleEnsemble'] = oracle_pred_time
        all_trained_models.append('OracleEnsemble')

    scoring_args = dict(
        y=y,
        y_internal=y_internal,
        sample_weight=w
    )

    extra_scores = {}
    for model_name, y_pred_proba_internal in model_pred_proba_dict.items():
        scores[model_name] = self._score_with_pred_proba(
            y_pred_proba_internal=y_pred_proba_internal,
            metric=self.eval_metric,
            **scoring_args
        )
        for metric in extra_metrics:
            metric = get_metric(metric, self.problem_type, 'leaderboard_metric')
            if metric.name not in extra_scores:
                extra_scores[metric.name] = {}
            extra_scores[metric.name][model_name] = self._score_with_pred_proba(
                y_pred_proba_internal=y_pred_proba_internal,
                metric=metric,
                **scoring_args
            )

    if extra_scores:
        series = []
        for metric in extra_scores:
            series.append(pd.Series(extra_scores[metric], name=metric))
        df_extra_scores = pd.concat(series, axis=1)
        extra_metrics_names = list(df_extra_scores.columns)
        df_extra_scores['model'] = df_extra_scores.index
        df_extra_scores = df_extra_scores.reset_index(drop=True)
    else:
        df_extra_scores = None
        extra_metrics_names = None

    pred_time_test = {}
    # TODO: Add support for calculating pred_time_test_full for oracle_ensemble, need to copy graph from trainer and add oracle_ensemble to it with proper edges.
    for model in model_pred_proba_dict.keys():
        if model in all_trained_models_original:
            base_model_set = trainer.get_minimum_model_set(model)
            if len(base_model_set) == 1:
                pred_time_test[model] = pred_time_test_marginal[base_model_set[0]]
            else:
                pred_time_test_full_num = 0
                for base_model in base_model_set:
                    pred_time_test_full_num += pred_time_test_marginal[base_model]
                pred_time_test[model] = pred_time_test_full_num
        else:
            pred_time_test[model] = None

    scored_models = list(scores.keys())
    for model in all_trained_models:
        if model not in scored_models:
            scores[model] = None
            pred_time_test[model] = None
            pred_time_test_marginal[model] = None

    logger.debug('Model scores:')
    logger.debug(str(scores))
    model_names_final = list(scores.keys())
    df = pd.DataFrame(
        data={
            'model': model_names_final,
            'score_test': list(scores.values()),
            'pred_time_test': [pred_time_test[model] for model in model_names_final],
            'pred_time_test_marginal': [pred_time_test_marginal[model] for model in model_names_final],
        }
    )
    if df_extra_scores is not None:
        df = pd.merge(df, df_extra_scores, on='model', how='left')

    df_merged = pd.merge(df, leaderboard_df, on='model', how='left')
    df_merged = df_merged.sort_values(by=['score_test', 'pred_time_test', 'score_val', 'pred_time_val', 'model'],
                                      ascending=[False, True, False, True, False]).reset_index(drop=True)
    df_columns_lst = df_merged.columns.tolist()
    explicit_order = [
        'model',
        'score_test',
    ]
    if extra_metrics_names is not None:
        explicit_order += extra_metrics_names
    explicit_order += [
        'score_val',
        'pred_time_test',
        'pred_time_val',
        'fit_time',
        'pred_time_test_marginal',
        'pred_time_val_marginal',
        'fit_time_marginal',
        'stack_level',
        'can_infer',
        'fit_order',
    ]
    df_columns_other = [column for column in df_columns_lst if column not in explicit_order]
    df_columns_new = explicit_order + df_columns_other
    df_merged = df_merged[df_columns_new]

    return df_merged
def __init__(self,
             path: str,
             name: str,
             problem_type: str,
             eval_metric: Union[str, metrics.Scorer] = None,
             num_classes=None,
             stopping_metric=None,
             model=None,
             hyperparameters=None,
             features=None,
             feature_metadata: FeatureMetadata = None,
             debug=0,
             **kwargs):
    """ Creates a new model.

    Args:
        path (str): directory where to store all outputs.
        name (str): name of subdirectory inside path where model will be saved.
        problem_type (str): type of problem this model will handle. Valid options: ['binary', 'multiclass', 'regression'].
        eval_metric (str or autogluon.core.metrics.Scorer): objective function the model intends to optimize. If None, will be inferred based on problem_type.
        hyperparameters (dict): various hyperparameters that will be used by model (can be search spaces instead of fixed values).
        feature_metadata (autogluon.tabular.features.feature_metadata.FeatureMetadata): contains feature type information that can be used to identify special features such as text ngrams and datetime as well as which features are numerical vs categorical.
    """
    self.name = name  # TODO: v0.1 Consider setting to self._name and having self.name be a property so self.name can't be set outside of self.rename()
    self.path_root = path
    self.path_suffix = self.name + os.path.sep  # TODO: Make into function to avoid having to reassign on load?
    self.path = self.create_contexts(self.path_root + self.path_suffix)  # TODO: Make this path a function for consistency.
    self.num_classes = num_classes
    self.model = model
    self.problem_type = problem_type
    if eval_metric is not None:
        self.eval_metric = metrics.get_metric(eval_metric, self.problem_type, 'eval_metric')  # Note: we require higher values = better performance
    else:
        self.eval_metric = infer_eval_metric(problem_type=self.problem_type)
        logger.log(20, f"Model {self.name}'s eval_metric inferred to be '{self.eval_metric.name}' because problem_type='{self.problem_type}' and eval_metric was not specified during init.")

    if self.eval_metric.name in OBJECTIVES_TO_NORMALIZE:
        self.normalize_pred_probas = True
        logger.debug(f"{self.name} predicted probabilities will be transformed to never =0 since eval_metric='{self.eval_metric.name}'")
    else:
        self.normalize_pred_probas = False

    if feature_metadata is not None:
        feature_metadata = copy.deepcopy(feature_metadata)
    self.feature_metadata = feature_metadata  # TODO: Should this be passed to a model on creation? Should it live in a Dataset object and passed during fit? Currently it is being updated prior to fit by trainer
    self.features = features
    self.debug = debug

    self.fit_time = None  # Time taken to fit in seconds (Training data)
    self.predict_time = None  # Time taken to predict in seconds (Validation data)
    self.val_score = None  # Score with eval_metric (Validation data)

    self.params = {}
    self.params_aux = {}

    self._set_default_auxiliary_params()
    if hyperparameters is not None:
        hyperparameters = hyperparameters.copy()
        if AG_ARGS_FIT in hyperparameters:
            ag_args_fit = hyperparameters.pop(AG_ARGS_FIT)
            self.params_aux.update(ag_args_fit)

    if stopping_metric is None:
        self.stopping_metric = self.params_aux.get('stopping_metric', self._get_default_stopping_metric())
    else:
        if 'stopping_metric' in self.params_aux:
            raise AssertionError('stopping_metric was specified in both hyperparameters AG_args_fit and model init. Please specify only once.')
        self.stopping_metric = stopping_metric
    self.stopping_metric = metrics.get_metric(self.stopping_metric, self.problem_type, 'stopping_metric')

    self._set_default_params()
    self.nondefault_params = []
    if hyperparameters is not None:
        self.params.update(hyperparameters)
        self.nondefault_params = list(hyperparameters.keys())[:]  # These are hyperparameters that user has specified.
    self.params_trained = dict()
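# A hedged illustration of how `eval_metric` and `stopping_metric` resolve in the constructor
# above, using a hypothetical concrete subclass `SomeModel` (the class name and path are
# placeholders, not part of the source).
model = SomeModel(path='models/', name='SomeModel', problem_type='binary', eval_metric='roc_auc')
print(model.eval_metric.name)      # 'roc_auc', resolved to a Scorer via metrics.get_metric
print(model.stopping_metric.name)  # 'log_loss', chosen by _get_default_stopping_metric when eval_metric is 'roc_auc'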
def train_function(args, reporter, train_df_path, tuning_df_path, time_limits,
                   time_start, base_config, problem_types, column_properties,
                   label_columns, label_shapes, log_metrics, stopping_metric,
                   console_log, ignore_warning=False):
    if time_limits is not None:
        start_train_tick = time.time()
        time_left = time_limits - (start_train_tick - time_start)
        if time_left <= 0:
            reporter.terminate()
            return
    import os
    # Get the log metric scorers
    if isinstance(log_metrics, str):
        log_metrics = [log_metrics]
    # Load the training and tuning data from the parquet file
    train_data = pd.read_parquet(train_df_path)
    tuning_data = pd.read_parquet(tuning_df_path)
    log_metric_scorers = [get_metric(ele) for ele in log_metrics]
    stopping_metric_scorer = get_metric(stopping_metric)
    greater_is_better = stopping_metric_scorer.greater_is_better
    os.environ['MKL_NUM_THREADS'] = '1'
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_DYNAMIC'] = 'FALSE'
    if ignore_warning:
        import warnings
        warnings.filterwarnings("ignore")
    search_space = args['search_space']
    cfg = base_config.clone()
    specified_values = []
    for key in search_space:
        specified_values.append(key)
        specified_values.append(search_space[key])
    cfg.merge_from_list(specified_values)
    exp_dir = cfg.misc.exp_dir
    if reporter is not None:
        # When the reporter is not None,
        # we create the saved directory based on the task_id + time
        task_id = args.task_id
        exp_dir = os.path.join(exp_dir, 'task{}'.format(task_id))
        os.makedirs(exp_dir, exist_ok=True)
        cfg.defrost()
        cfg.misc.exp_dir = exp_dir
        cfg.freeze()
    logger = logging.getLogger()
    logging_config(folder=exp_dir, name='training', logger=logger, console=console_log)
    logger.info(cfg)
    # Load backbone model
    backbone_model_cls, backbone_cfg, tokenizer, backbone_params_path, _ \
        = get_backbone(cfg.model.backbone.name)
    with open(os.path.join(exp_dir, 'cfg.yml'), 'w') as f:
        f.write(str(cfg))
    text_backbone = backbone_model_cls.from_cfg(backbone_cfg)
    # Build Preprocessor + Preprocess the training dataset + Inference problem type
    # TODO Move preprocessor + Dataloader to outer loop to better cache the dataloader
    preprocessor = TabularBasicBERTPreprocessor(tokenizer=tokenizer,
                                                column_properties=column_properties,
                                                label_columns=label_columns,
                                                max_length=cfg.model.preprocess.max_length,
                                                merge_text=cfg.model.preprocess.merge_text)
    logger.info('Process training set...')
    processed_train = preprocessor.process_train(train_data)
    logger.info('Done!')
    logger.info('Process dev set...')
    processed_dev = preprocessor.process_test(tuning_data)
    logger.info('Done!')
    label = label_columns[0]
    # Get the ground-truth dev labels
    gt_dev_labels = np.array(tuning_data[label].apply(column_properties[label].transform))
    ctx_l = get_mxnet_available_ctx()
    base_batch_size = cfg.optimization.per_device_batch_size
    num_accumulated = int(np.ceil(cfg.optimization.batch_size / base_batch_size))
    inference_base_batch_size = base_batch_size * cfg.optimization.val_batch_size_mult
    train_dataloader = DataLoader(processed_train,
                                  batch_size=base_batch_size,
                                  shuffle=True,
                                  batchify_fn=preprocessor.batchify(is_test=False))
    dev_dataloader = DataLoader(processed_dev,
                                batch_size=inference_base_batch_size,
                                shuffle=False,
                                batchify_fn=preprocessor.batchify(is_test=True))
    net = BERTForTabularBasicV1(text_backbone=text_backbone,
                                feature_field_info=preprocessor.feature_field_info(),
                                label_shape=label_shapes[0],
                                cfg=cfg.model.network)
    net.initialize_with_pretrained_backbone(backbone_params_path, ctx=ctx_l)
    net.hybridize()
    num_total_params, num_total_fixed_params = count_parameters(net.collect_params())
    logger.info('#Total Params/Fixed Params={}/{}'.format(num_total_params, num_total_fixed_params))
    # Initialize the optimizer
    updates_per_epoch = int(len(train_dataloader) / (num_accumulated * len(ctx_l)))
    optimizer, optimizer_params, max_update \
        = get_optimizer(cfg.optimization, updates_per_epoch=updates_per_epoch)
    valid_interval = math.ceil(cfg.optimization.valid_frequency * updates_per_epoch)
    train_log_interval = math.ceil(cfg.optimization.log_frequency * updates_per_epoch)
    trainer = mx.gluon.Trainer(net.collect_params(), optimizer, optimizer_params,
                               update_on_kvstore=False)
    if 0 < cfg.optimization.layerwise_lr_decay < 1:
        apply_layerwise_decay(net.text_backbone,
                              cfg.optimization.layerwise_lr_decay,
                              backbone_name=cfg.model.backbone.name)
    # Do not apply weight decay to all the LayerNorm and bias
    for _, v in net.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    params = [p for p in net.collect_params().values() if p.grad_req != 'null']
    # Set grad_req if gradient accumulation is required
    if num_accumulated > 1:
        logger.info('Using gradient accumulation.'
                    ' Global batch size = {}'.format(cfg.optimization.batch_size))
        for p in params:
            p.grad_req = 'add'
        net.collect_params().zero_grad()
    train_loop_dataloader = grouper(repeat(train_dataloader), len(ctx_l))
    log_loss_l = [mx.np.array(0.0, dtype=np.float32, ctx=ctx) for ctx in ctx_l]
    log_num_samples_l = [0 for _ in ctx_l]
    logging_start_tick = time.time()
    best_performance_score = None
    mx.npx.waitall()
    no_better_rounds = 0
    report_idx = 0
    start_tick = time.time()
    if time_limits is not None:
        time_limits -= start_tick - time_start
        if time_limits <= 0:
            reporter.terminate()
            return
    best_report_items = None
    for update_idx in tqdm.tqdm(range(max_update), disable=None):
        num_samples_per_update_l = [0 for _ in ctx_l]
        for accum_idx in range(num_accumulated):
            sample_l = next(train_loop_dataloader)
            loss_l = []
            num_samples_l = [0 for _ in ctx_l]
            for i, (sample, ctx) in enumerate(zip(sample_l, ctx_l)):
                feature_batch, label_batch = sample
                feature_batch = move_to_ctx(feature_batch, ctx)
                label_batch = move_to_ctx(label_batch, ctx)
                with mx.autograd.record():
                    pred = net(feature_batch)
                    if problem_types[0] == _C.CLASSIFICATION:
                        logits = mx.npx.log_softmax(pred, axis=-1)
                        loss = -mx.npx.pick(logits, label_batch[0])
                    elif problem_types[0] == _C.REGRESSION:
                        loss = mx.np.square(pred - label_batch[0])
                    loss_l.append(loss.mean() / len(ctx_l))
                    num_samples_l[i] = loss.shape[0]
                    num_samples_per_update_l[i] += loss.shape[0]
            for loss in loss_l:
                loss.backward()
            for i in range(len(ctx_l)):
                log_loss_l[i] += loss_l[i] * len(ctx_l) * num_samples_l[i]
                log_num_samples_l[i] += num_samples_per_update_l[i]
        # Begin to update
        trainer.allreduce_grads()
        num_samples_per_update = sum(num_samples_per_update_l)
        total_norm, ratio, is_finite = \
            clip_grad_global_norm(params, cfg.optimization.max_grad_norm * num_accumulated)
        total_norm = total_norm / num_accumulated
        trainer.update(num_samples_per_update)
        # Clear after update
        if num_accumulated > 1:
            net.collect_params().zero_grad()
        if (update_idx + 1) % train_log_interval == 0:
            log_loss = sum([ele.as_in_ctx(ctx_l[0]) for ele in log_loss_l]).asnumpy()
            log_num_samples = sum(log_num_samples_l)
            logger.info('[Iter {}/{}, Epoch {}] train loss={:0.4e}, gnorm={:0.4e}, lr={:0.4e}, #samples processed={},'
                        ' #sample per second={:.2f}'.format(
                            update_idx + 1, max_update, int(update_idx / updates_per_epoch),
                            log_loss / log_num_samples, total_norm, trainer.learning_rate,
                            log_num_samples, log_num_samples / (time.time() - logging_start_tick)))
            logging_start_tick = time.time()
            log_loss_l = [mx.np.array(0.0, dtype=np.float32, ctx=ctx) for ctx in ctx_l]
            log_num_samples_l = [0 for _ in ctx_l]
        if (update_idx + 1) % valid_interval == 0 or (update_idx + 1) == max_update:
            valid_start_tick = time.time()
            dev_predictions = \
                _classification_regression_predict(net, dataloader=dev_dataloader,
                                                   problem_type=problem_types[0],
                                                   has_label=False)
            log_scores = [calculate_metric(scorer, gt_dev_labels, dev_predictions, problem_types[0])
                          for scorer in log_metric_scorers]
            dev_score = calculate_metric(stopping_metric_scorer, gt_dev_labels,
                                         dev_predictions, problem_types[0])
            valid_time_spent = time.time() - valid_start_tick
            if best_performance_score is None or \
                    (greater_is_better and dev_score >= best_performance_score) or \
                    (not greater_is_better and dev_score <= best_performance_score):
                find_better = True
                no_better_rounds = 0
                best_performance_score = dev_score
                net.save_parameters(os.path.join(exp_dir, 'best_model.params'))
            else:
                find_better = False
                no_better_rounds += 1
            mx.npx.waitall()
            loss_string = ', '.join(['{}={:0.4e}'.format(metric.name, score)
                                     for score, metric in zip(log_scores, log_metric_scorers)])
            logger.info('[Iter {}/{}, Epoch {}] valid {}, time spent={:.3f}s,'
                        ' total_time={:.2f}min'.format(
                            update_idx + 1, max_update, int(update_idx / updates_per_epoch),
                            loss_string, valid_time_spent, (time.time() - start_tick) / 60))
            report_items = [('iteration', update_idx + 1),
                            ('report_idx', report_idx + 1),
                            ('epoch', int(update_idx / updates_per_epoch))] + \
                           [(metric.name, score) for score, metric in zip(log_scores, log_metric_scorers)] + \
                           [('find_better', find_better),
                            ('time_spent', int(time.time() - start_tick))]
            total_time_spent = time.time() - start_tick
            if stopping_metric_scorer._sign < 0:
                report_items.append(('reward_attr', -dev_score))
            else:
                report_items.append(('reward_attr', dev_score))
            report_items.append(('eval_metric', stopping_metric_scorer.name))
            report_items.append(('exp_dir', exp_dir))
            if find_better:
                best_report_items = report_items
            reporter(**dict(report_items))
            report_idx += 1
            if no_better_rounds >= cfg.learning.early_stopping_patience:
                logger.info('Early stopping patience reached!')
                break
            if time_limits is not None and total_time_spent > time_limits:
                break
    best_report_items_dict = dict(best_report_items)
    best_report_items_dict['report_idx'] = report_idx + 1
    reporter(**best_report_items_dict)
def train(self, train_data, tuning_data, resource,
          time_limits=None,
          search_strategy='random',
          search_options=None,
          scheduler_options=None,
          num_trials=None,
          plot_results=False,
          console_log=True,
          ignore_warning=True):
    force_forkserver()
    start_tick = time.time()
    logging_config(folder=self._output_directory, name='main',
                   console=console_log, logger=self._logger)
    assert len(self._label_columns) == 1
    # TODO(sxjscience) Try to support S3
    os.makedirs(self._output_directory, exist_ok=True)
    search_space_reg = args(search_space=space.Dict(**self.search_space))
    # Scheduler and searcher for HPO
    if search_strategy.endswith('hyperband') and time_limits is None:
        time_limits = 5 * 60 * 60  # 5 hours
    if scheduler_options is None:
        scheduler_options = dict()
    stopping_metric_scorer = get_metric(self._stopping_metric)
    scheduler_options = compile_scheduler_options(
        scheduler_options=scheduler_options,
        search_strategy=search_strategy,
        search_options=search_options,
        nthreads_per_trial=resource['num_cpus'],
        ngpus_per_trial=resource['num_gpus'],
        checkpoint=None,
        num_trials=num_trials,
        time_out=scheduler_options.get('time_out'),
        resume=False,
        visualizer=scheduler_options.get('visualizer'),
        time_attr='report_idx',
        reward_attr=stopping_metric_scorer.reward_attr,
        dist_ip_addrs=scheduler_options.get('dist_ip_addrs'))
    train_fn = search_space_reg(functools.partial(train_function,
                                                  train_data=train_data,
                                                  time_limits=time_limits,
                                                  tuning_data=tuning_data,
                                                  base_config=self.base_config,
                                                  problem_types=self.problem_types,
                                                  column_properties=self._column_properties,
                                                  label_columns=self._label_columns,
                                                  label_shapes=self._label_shapes,
                                                  log_metrics=self._log_metrics,
                                                  stopping_metric=self._stopping_metric,
                                                  console_log=console_log,
                                                  ignore_warning=ignore_warning))
    scheduler_cls = schedulers[search_strategy.lower()]
    # Create scheduler, run HPO experiment
    scheduler = scheduler_cls(train_fn, **scheduler_options)
    scheduler.run()
    scheduler.join_jobs()
    if len(scheduler.config_history) == 0:
        raise RuntimeError('No training job has been completed! '
                           'There are two possibilities: '
                           '1) The time_limits is too small, '
                           'or 2) There are some internal errors in AutoGluon. '
                           'For the first case, you can increase the time_limits or set it to '
                           'None, e.g., setting "TextPrediction.fit(..., time_limits=None)". To '
                           'further investigate the root cause, you can also try to train with '
                           '"verbosity=3", i.e., TextPrediction.fit(..., verbosity=3).')
    best_config = scheduler.get_best_config()
    self._logger.info('Results={}'.format(scheduler.searcher._results))
    self._logger.info('Best_config={}'.format(best_config))
    best_task_id = scheduler.get_best_task_id()
    best_model_saved_dir_path = os.path.join(self._output_directory,
                                             'task{}'.format(best_task_id))
    best_cfg_path = os.path.join(best_model_saved_dir_path, 'cfg.yml')
    cfg = self.base_config.clone_merge(best_cfg_path)
    self._results = dict()
    self._results.update(best_reward=scheduler.get_best_reward(),
                         best_config=scheduler.get_best_config(),
                         total_time=time.time() - start_tick,
                         metadata=scheduler.metadata,
                         training_history=scheduler.training_history,
                         config_history=scheduler.config_history,
                         reward_attr=scheduler._reward_attr,
                         config=cfg)
    if plot_results:
        plot_training_curves = os.path.join(self._output_directory, 'plot_training_curves.png')
        scheduler.get_training_curves(filename=plot_training_curves,
                                      plot=plot_results,
                                      use_legend=True)
    # Consider to move this to a separate predictor
    self._config = cfg
    backbone_model_cls, backbone_cfg, tokenizer, backbone_params_path, _ \
        = get_backbone(cfg.model.backbone.name)
    text_backbone = backbone_model_cls.from_cfg(backbone_cfg)
    preprocessor = TabularBasicBERTPreprocessor(tokenizer=tokenizer,
                                                column_properties=self._column_properties,
                                                label_columns=self._label_columns,
                                                max_length=cfg.model.preprocess.max_length,
                                                merge_text=cfg.model.preprocess.merge_text)
    self._preprocessor = preprocessor
    net = BERTForTabularBasicV1(text_backbone=text_backbone,
                                feature_field_info=preprocessor.feature_field_info(),
                                label_shape=self._label_shapes[0],
                                cfg=cfg.model.network)
    net.hybridize()
    ctx_l = get_mxnet_available_ctx()
    net.load_parameters(os.path.join(best_model_saved_dir_path, 'best_model.params'),
                        ctx=ctx_l[0])
    self._net = net
    mx.npx.waitall()