import os

import numpy as np
import pandas as pd
from autogluon.multimodal import MultiModalPredictor
from autogluon.tabular import TabularPredictor

# set_seed, preprocess, and get_automm_hyperparameters are project-local
# helpers defined elsewhere in this repo.


def train(args):
    set_seed(args.seed)
    train_df = pd.read_csv(os.path.join(args.data_path, 'train.csv'))
    test_df = pd.read_csv(os.path.join(args.data_path, 'test.csv'))
    # For the purpose of generating the submission file
    submission_df = pd.read_csv(os.path.join(args.data_path, 'sample_submission.csv'))

    train_df = preprocess(train_df, with_tax_values=args.with_tax_values, has_label=True)
    test_df = preprocess(test_df, with_tax_values=args.with_tax_values, has_label=False)

    label_column = 'Sold Price'
    eval_metric = 'r2'
    automm_hyperparameters = get_automm_hyperparameters(args.automm_mode,
                                                        args.text_backbone,
                                                        args.cat_as_text)
    tabular_hyperparameters = {
        'GBM': [
            {},
            {'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},
        ],
        'CAT': {},
        'AG_AUTOMM': automm_hyperparameters,
    }

    if args.mode == 'single':
        predictor = MultiModalPredictor(eval_metric=eval_metric, label=label_column, path=args.exp_path)
        predictor.fit(train_df, hyperparameters=automm_hyperparameters, seed=args.seed)
    elif args.mode in ('weighted', 'stack5', 'single_bag5', 'single_bag4'):
        predictor = TabularPredictor(eval_metric=eval_metric, label=label_column, path=args.exp_path)
        if args.mode == 'single_bag5':
            tabular_hyperparameters = {
                'AG_AUTOMM': automm_hyperparameters,
            }
            num_bag_folds, num_stack_levels = 5, 0
        elif args.mode == 'single_bag4':
            # The original accepted this mode but never handled it, so it fell
            # through to NotImplementedError; mirror 'single_bag5' with 4
            # bagging folds (inferred from the mode name).
            tabular_hyperparameters = {
                'AG_AUTOMM': automm_hyperparameters,
            }
            num_bag_folds, num_stack_levels = 4, 0
        elif args.mode == 'weighted':
            num_bag_folds, num_stack_levels = None, None
        elif args.mode == 'stack5':
            num_bag_folds, num_stack_levels = 5, 1
        else:
            raise NotImplementedError
        predictor.fit(train_df,
                      hyperparameters=tabular_hyperparameters,
                      num_bag_folds=num_bag_folds,
                      num_stack_levels=num_stack_levels)
        leaderboard = predictor.leaderboard()
        leaderboard.to_csv(os.path.join(args.exp_path, 'leaderboard.csv'))
    else:
        raise NotImplementedError

    # preprocess() trains on log('Sold Price'), so map predictions back with exp.
    predictions = np.exp(predictor.predict(test_df))
    submission_df['Sold Price'] = predictions
    submission_df.to_csv(os.path.join(args.exp_path, 'submission.csv'), index=False)
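# A minimal CLI entry point for train() might look like the sketch below.
# The flag names follow the args.* attributes read above; every default value
# here is an illustrative assumption, not the original script's setting.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, required=True)
    parser.add_argument('--exp_path', type=str, required=True)
    parser.add_argument('--mode', type=str, default='single',
                        choices=['single', 'weighted', 'stack5', 'single_bag5', 'single_bag4'])
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--with_tax_values', action='store_true')
    parser.add_argument('--automm_mode', type=str, default='base')
    parser.add_argument('--text_backbone', type=str, default='google/electra-base-discriminator')
    parser.add_argument('--cat_as_text', action='store_true')
    train(parser.parse_args())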
import json
import os

from autogluon.multimodal import MultiModalPredictor

# TABULAR_DATASETS (dataset name -> dataset wrapper class) and
# automm_hyperparameters (the base AutoMM config dict) are module-level
# definitions in this script.


def main(args):
    if args.gpu_id is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    assert args.dataset_name in TABULAR_DATASETS, "Unsupported dataset name."

    ### Dataset loading
    train_data = TABULAR_DATASETS[args.dataset_name]("train", args.dataset_dir)
    val_data = TABULAR_DATASETS[args.dataset_name]("val", args.dataset_dir)
    test_data = TABULAR_DATASETS[args.dataset_name]("test", args.dataset_dir)

    # Override the base AutoMM config from the CLI.
    automm_hyperparameters["optimization.learning_rate"] = args.lr
    automm_hyperparameters["optimization.end_lr"] = args.end_lr
    if args.embedding_arch is not None:
        automm_hyperparameters["model.numerical_transformer.embedding_arch"] = args.embedding_arch

    tabular_hyperparameters = {
        "GBM": [
            {},
            {"extra_trees": True, "ag_args": {"name_suffix": "XT"}},
        ],
        "CAT": {},
        "XGB": {},
        "AG_AUTOMM": automm_hyperparameters,
    }

    if args.mode == "single":
        ### model initialization
        predictor = MultiModalPredictor(
            label=train_data.label_column,
            problem_type=train_data.problem_type,
            eval_metric=train_data.metric,
            path=args.exp_dir,
            verbosity=4,
        )
        ### model training
        predictor.fit(
            train_data=train_data.data,
            tuning_data=val_data.data,
            seed=args.seed,
            hyperparameters=automm_hyperparameters,
        )
        ### model inference
        scores = predictor.evaluate(data=test_data.data, metrics=[test_data.metric])
        with open(os.path.join(args.exp_dir, "scores.json"), "w") as f:
            json.dump(scores, f)
        print(scores)
    elif args.mode in ("weighted", "single_bag5", "stack5"):
        if args.mode == "single_bag5":
            tabular_hyperparameters = {
                "AG_AUTOMM": automm_hyperparameters,
            }
            num_bag_folds, num_stack_levels = 5, 0
        elif args.mode == "weighted":
            num_bag_folds, num_stack_levels = None, None
        elif args.mode == "stack5":
            num_bag_folds, num_stack_levels = 5, 1
        else:
            raise NotImplementedError
        from autogluon.tabular import TabularPredictor

        predictor = TabularPredictor(
            eval_metric=train_data.metric,
            label=train_data.label_column,
            path=args.exp_dir,
        )
        predictor.fit(
            train_data=train_data.data,
            # Bagging does its own internal splits, so only pass tuning_data
            # when bagging is disabled.
            tuning_data=val_data.data if num_bag_folds is None else None,
            hyperparameters=tabular_hyperparameters,
            num_bag_folds=num_bag_folds,
            num_stack_levels=num_stack_levels,
        )
        leaderboard = predictor.leaderboard()
        leaderboard.to_csv(os.path.join(args.exp_dir, "leaderboard.csv"))
    else:
        raise NotImplementedError

    scores = predictor.evaluate(data=test_data.data)
    with open(os.path.join(args.exp_dir, "scores.json"), "w") as f:
        json.dump(scores, f)
    print(scores)

    predictions = predictor.predict(data=test_data.data)
    predictions.to_csv(os.path.join(args.exp_dir, "predictions.csv"))
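# main() assumes each TABULAR_DATASETS entry is a wrapper constructed as
# cls(split, dataset_dir) and exposing .data, .label_column, .problem_type,
# and .metric. A minimal sketch of that assumed interface follows; the file
# layout and column names are made up for illustration.
import os

import pandas as pd


class AdultDataset:
    """Hypothetical wrapper for a binary-classification CSV dataset."""

    def __init__(self, split: str, dataset_dir: str):
        self.data = pd.read_csv(os.path.join(dataset_dir, f"{split}.csv"))
        self.label_column = "income"
        self.problem_type = "binary"
        self.metric = "roc_auc"


TABULAR_DATASETS = {"adult": AdultDataset}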
"data.categorical.convert_to_text": args.categorical_convert_to_text, "env.per_gpu_batch_size": args.per_gpu_batch_size, "env.precision": args.precision, "optimization.learning_rate": args.learning_rate, "optimization.weight_decay": args.weight_decay, "optimization.lr_decay": args.learning_rate_decay, "optimization.max_epochs": args.max_epochs, "optimization.warmup_steps": args.warmup_steps, "optimization.loss_function": args.loss_function, }, seed=args.seed, ) # Manual Validating process. valid_pred = predictor.predict(data=valid_df) score = mean_squared_error(valid_df["Pawpularity"].values, valid_pred, squared=False) print(f"Fold {i} | Score: {score}") predictor.save( path=save_standalone_path + f"_fold{i}", standalone=True, ) all_score.append(score) del predictor torch.cuda.empty_cache() print(f"all-scores: {all_score}") print(f"mean_rmse: {np.mean(all_score)}")
import logging
import os
from typing import Dict, Optional

import pandas as pd

from autogluon.core.constants import REGRESSION
from autogluon.core.models import AbstractModel

# try_import_autogluon_text, get_cpu_count, get_gpu_count_torch, and the
# R_* / S_* feature-type constants are AutoGluon internals imported elsewhere
# in the original module.

logger = logging.getLogger(__name__)


class MultiModalPredictorModel(AbstractModel):
    _NN_MODEL_NAME = 'automm_model'

    def __init__(self, **kwargs):
        """Wrapper of autogluon.multimodal.MultiModalPredictor.

        The features can be a mix of
        - image column
        - text column
        - categorical column
        - numerical column

        The labels can be categorical or numerical.

        Parameters
        ----------
        path
            The directory to store the modeling outputs.
        name
            Name of the subdirectory inside path where the model will be saved.
        problem_type
            Type of problem that this model will handle.
            Valid options: ['binary', 'multiclass', 'regression'].
        eval_metric
            The evaluation metric.
        num_classes
            The number of classes.
        stopping_metric
            The stopping metric.
        model
            The internal model object.
        hyperparameters
            The hyperparameters of the model.
        features
            Names of the features.
        feature_metadata
            The feature metadata.
        """
        super().__init__(**kwargs)
        self._label_column_name = None
        self._load_model = None  # Whether to load the inner model when loading.

    def _get_default_auxiliary_params(self) -> dict:
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(
            valid_raw_types=[R_INT, R_FLOAT, R_CATEGORY, R_OBJECT],
            ignored_type_group_special=[S_TEXT_NGRAM, S_TEXT_AS_CATEGORY, S_TEXT_SPECIAL],
        )
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params

    @classmethod
    def _get_default_ag_args(cls) -> dict:
        default_ag_args = super()._get_default_ag_args()
        extra_ag_args = {'valid_stacker': False}
        default_ag_args.update(extra_ag_args)
        return default_ag_args

    def _set_default_params(self):
        super()._set_default_params()
        try_import_autogluon_text()

    def _fit(self,
             X: pd.DataFrame,
             y: pd.Series,
             X_val: Optional[pd.DataFrame] = None,
             y_val: Optional[pd.Series] = None,
             time_limit: Optional[int] = None,
             sample_weight=None,
             **kwargs):
        """The internal fit function.

        Parameters
        ----------
        X
            Features of the training dataset.
        y
            Labels of the training dataset.
        X_val
            Features of the validation dataset.
        y_val
            Labels of the validation dataset.
        time_limit
            The time limit for the fit function.
        sample_weight
            The weights of the samples.
        kwargs
            Other keyword arguments.
        """
        try_import_autogluon_text()
        from autogluon.multimodal import MultiModalPredictor

        # Decide the name of the label column, avoiding collisions with feature columns.
        if 'label' in X.columns:
            label_col_id = 0
            while True:
                self._label_column_name = 'label{}'.format(label_col_id)
                if self._label_column_name not in X.columns:
                    break
                label_col_id += 1
        else:
            self._label_column_name = 'label'

        X_train = self.preprocess(X, fit=True)
        if X_val is not None:
            X_val = self.preprocess(X_val)

        # Get arguments from kwargs
        verbosity = kwargs.get('verbosity', 2)
        num_gpus = kwargs.get('num_gpus', None)
        if sample_weight is not None:  # TODO: support sample weights
            logger.log(15, "sample_weight not yet supported for MultiModalPredictorModel, "
                           "this model will ignore them in training.")

        X_train.insert(len(X_train.columns), self._label_column_name, y)
        if X_val is not None:
            X_val.insert(len(X_val.columns), self._label_column_name, y_val)

        verbosity_text = max(0, verbosity - 1)
        root_logger = logging.getLogger('autogluon')
        root_log_level = root_logger.level
        self.model = MultiModalPredictor(label=self._label_column_name,
                                         problem_type=self.problem_type,
                                         path=self.path,
                                         eval_metric=self.eval_metric,
                                         verbosity=verbosity_text)
        params = self._get_model_params()
        if num_gpus is not None:
            params['env.num_gpus'] = num_gpus
        presets = params.pop('presets', None)
        seed = params.pop('seed', 0)

        self.model.fit(train_data=X_train,
                       tuning_data=X_val,
                       time_limit=time_limit,
                       presets=presets,
                       hyperparameters=params,
                       seed=seed)
        self.model.set_verbosity(verbosity)
        root_logger.setLevel(root_log_level)  # Reset log level

    def _predict_proba(self, X, **kwargs):
        X = self.preprocess(X, **kwargs)

        if self.problem_type == REGRESSION:
            return self.model.predict(X, as_pandas=False)

        y_pred_proba = self.model.predict_proba(X, as_pandas=False)
        return self._convert_proba_to_unified_form(y_pred_proba)

    def save(self, path: str = None, verbose=True) -> str:
        self._load_model = self.model is not None
        __model = self.model
        self.model = None
        # Save this AbstractModel object without NN weights
        path = super().save(path=path, verbose=verbose)
        self.model = __model

        if self._load_model:
            automm_nn_path = os.path.join(path, self._NN_MODEL_NAME)
            self.model.save(automm_nn_path)
            logger.log(15, f"\tSaved AutoMM model weights and model hyperparameters to '{automm_nn_path}'.")
        self._load_model = None
        return path

    @classmethod
    def load(cls, path: str, reset_paths=True, verbose=True):
        model = super().load(path=path, reset_paths=reset_paths, verbose=verbose)
        if model._load_model:
            try_import_autogluon_text()
            from autogluon.multimodal import MultiModalPredictor
            model.model = MultiModalPredictor.load(os.path.join(path, cls._NN_MODEL_NAME))
        model._load_model = None
        return model

    def get_memory_size(self) -> int:
        """Return an approximate memory size, computed as the total number of
        model parameters.

        Returns
        -------
        memory_size
            The total parameter count, used as a rough proxy for memory size
            (not an exact byte count).
        """
        total_size = sum(param.numel() for param in self.model._model.parameters())
        return total_size

    def _get_default_resources(self):
        num_cpus = get_cpu_count()
        num_gpus = min(get_gpu_count_torch(), 1)  # Use single-GPU training by default. Consider revising later.
        return num_cpus, num_gpus

    def get_minimum_resources(self) -> Dict[str, int]:
        return {
            'num_cpus': 1,
            'num_gpus': 1,
        }

    def _more_tags(self):
        # `can_refit_full=False` because MultiModalPredictor does not communicate
        # how long to train (i.e., up to which best epoch) for refit_full.
        return {'can_refit_full': False}
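# Rough usage sketch (illustrative, not from the source): the wrapper plugs
# into AutoGluon's custom-model workflow like any AbstractModel subclass.
# The toy DataFrame and the 300-second time budget are assumptions.
import pandas as pd

X = pd.DataFrame({"text": ["good product", "bad product"] * 50,
                  "price": [10.0, 3.5] * 50})
y = pd.Series([1, 0] * 50)

model = MultiModalPredictorModel(
    path="./automm_wrapper_run/",
    name="MultiModalPredictorModel",
    problem_type="binary",
    eval_metric="accuracy",
)
model.fit(X=X, y=y, time_limit=300)  # AbstractModel.fit dispatches to _fit above
pred_proba = model.predict_proba(X)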