def _set_default_parameters(self):
    self.final_config_dict['dataset'] = self.dataset
    self.final_config_dict['model'] = self.model

    # resolve data_path (the bundled ml-100k sample ships with the package)
    if self.dataset == 'ml-100k':
        current_path = os.path.dirname(os.path.realpath(__file__))
        self.final_config_dict['data_path'] = os.path.join(current_path, '../dataset_example/' + self.dataset)
    else:
        self.final_config_dict['data_path'] = os.path.join(self.final_config_dict['data_path'], self.dataset)

    # derive MODEL_INPUT_TYPE from the model class, or fall back to loss_type
    if hasattr(get_model(self.model), 'input_type'):
        self.final_config_dict['MODEL_INPUT_TYPE'] = get_model(self.model).input_type
    elif 'loss_type' in self.final_config_dict:
        if self.final_config_dict['loss_type'] in ['CE']:
            self.final_config_dict['MODEL_INPUT_TYPE'] = InputType.POINTWISE
        elif self.final_config_dict['loss_type'] in ['BPR']:
            self.final_config_dict['MODEL_INPUT_TYPE'] = InputType.PAIRWISE
    else:
        raise ValueError("Either the model has attr 'input_type', "
                         "or arg 'loss_type' should exist in config.")

    # ranking (top-k) metrics and individual (loss) metrics are mutually exclusive
    eval_type = None
    for metric in self.final_config_dict['metrics']:
        if metric.lower() in loss_metrics:
            if eval_type is not None and eval_type == EvaluatorType.RANKING:
                raise RuntimeError('Ranking metrics and other metrics can not be used at the same time.')
            else:
                eval_type = EvaluatorType.INDIVIDUAL
        if metric.lower() in topk_metrics:
            if eval_type is not None and eval_type == EvaluatorType.INDIVIDUAL:
                raise RuntimeError('Ranking metrics and other metrics can not be used at the same time.')
            else:
                eval_type = EvaluatorType.RANKING
    self.final_config_dict['eval_type'] = eval_type

    # smaller-is-better metrics flip the validation comparison
    smaller_metric = ['rmse', 'mae', 'logloss']
    valid_metric = self.final_config_dict['valid_metric'].split('@')[0]
    self.final_config_dict['valid_metric_bigger'] = valid_metric not in smaller_metric

    if 'additional_feat_suffix' in self.final_config_dict:
        ad_suf = self.final_config_dict['additional_feat_suffix']
        if isinstance(ad_suf, str):
            self.final_config_dict['additional_feat_suffix'] = [ad_suf]
def objective_function(config_dict=None, config_file_list=None, saved=True):
    r"""The default objective_function used in HyperTuning.

    Args:
        config_dict (dict): parameters dictionary used to modify experiment parameters
        config_file_list (list): config files used to modify experiment parameters
        saved (bool): whether to save the model
    """
    config = Config(config_dict=config_dict, config_file_list=config_file_list)
    init_seed(config['seed'], config['reproducibility'])
    logging.basicConfig(level=logging.ERROR)
    dataset = create_dataset(config)
    train_data, valid_data, test_data = data_preparation(config, dataset)
    model = get_model(config['model'])(config, train_data).to(config['device'])
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)
    best_valid_score, best_valid_result = trainer.fit(train_data, valid_data, verbose=False, saved=saved)
    test_result = trainer.evaluate(test_data, load_best_model=saved)

    return {
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_result': test_result
    }
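# A minimal sketch of driving objective_function from RecBole's HyperTuning,
# mirroring the pattern of RecBole's hyper-parameter examples. The search-space
# file and fixed config file named here are hypothetical placeholders.
from recbole.trainer import HyperTuning

hp = HyperTuning(
    objective_function,
    algo='exhaustive',                         # grid search over the parameter file
    params_file='hyper.test',                  # hypothetical search-space file
    fixed_config_file_list=['example.yaml'],   # hypothetical fixed-parameter config
)
hp.run()
print('best params: ', hp.best_params)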
def save_example():
    # configurations initialization
    config_dict = {'checkpoint_dir': '../saved'}
    config = Config(model='BPR', dataset='ml-100k', config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])
    init_logger(config)

    # dataset filtering
    dataset = create_dataset(config)
    dataset.save('../saved/')

    # dataset splitting
    train_data, valid_data, test_data = data_preparation(config, dataset)
    save_split_dataloaders(config, dataloaders=(train_data, valid_data, test_data))

    # model loading and initialization
    model = get_model(config['model'])(config, train_data).to(config['device'])

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training; the best model will be saved here
    best_valid_score, best_valid_result = trainer.fit(
        train_data, valid_data, saved=True, show_progress=config['show_progress']
    )
def _get_model_and_dataset(self, model, dataset):
    if model is None:
        try:
            model = self.external_config_dict['model']
        except KeyError:
            raise KeyError(
                'model needs to be specified in at least one of these ways: '
                '[model variable, config file, config dict, command line]'
            )
    if not isinstance(model, str):
        # a model class was passed in directly
        final_model_class = model
        final_model = model.__name__
    else:
        final_model = model
        final_model_class = get_model(final_model)

    if dataset is None:
        try:
            final_dataset = self.external_config_dict['dataset']
        except KeyError:
            raise KeyError(
                'dataset needs to be specified in at least one of these ways: '
                '[dataset variable, config file, config dict, command line]'
            )
    else:
        final_dataset = dataset

    return final_model, final_model_class, final_dataset
def load_data_and_model(model_file):
    r"""Load filtered dataset, split dataloaders and saved model.

    Args:
        model_file (str): The path of saved model file.

    Returns:
        tuple:
            - config (Config): An instance object of Config, which records parameter information in :attr:`model_file`.
            - model (AbstractRecommender): The model loaded from :attr:`model_file`.
            - dataset (Dataset): The filtered dataset.
            - train_data (AbstractDataLoader): The dataloader for training.
            - valid_data (AbstractDataLoader): The dataloader for validation.
            - test_data (AbstractDataLoader): The dataloader for testing.
    """
    checkpoint = torch.load(model_file)
    config = checkpoint['config']
    init_seed(config['seed'], config['reproducibility'])
    init_logger(config)
    logger = getLogger()
    logger.info(config)

    dataset = create_dataset(config)
    logger.info(dataset)
    train_data, valid_data, test_data = data_preparation(config, dataset)

    init_seed(config['seed'], config['reproducibility'])
    model = get_model(config['model'])(config, train_data.dataset).to(config['device'])
    model.load_state_dict(checkpoint['state_dict'])
    model.load_other_parameter(checkpoint.get('other_parameter'))

    return config, model, dataset, train_data, valid_data, test_data
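# A minimal usage sketch; the checkpoint path is a hypothetical placeholder for
# a model file produced by a previous training run.
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(
    model_file='../saved/BPR-Mar-20-2021_17-11-05.pth'
)
# The restored model can be re-evaluated directly, without retraining.
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)
print(trainer.evaluate(test_data, load_best_model=False))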
def run_recbole(model=None, dataset=None, config_file_list=None, config_dict=None, saved=True): r""" A fast running api, which includes the complete process of training and testing a model on a specified dataset Args: model (str): model name dataset (str): dataset name config_file_list (list): config files used to modify experiment parameters config_dict (dict): parameters dictionary used to modify experiment parameters saved (bool): whether to save the model """ # configurations initialization config = Config(model=model, dataset=dataset, config_file_list=config_file_list, config_dict=config_dict) init_seed(config['seed'], config['reproducibility']) # logger initialization init_logger(config) logger = getLogger() logger.info(config) # dataset filtering dataset = create_dataset(config) logger.info(dataset) # dataset splitting train_data, valid_data, test_data = data_preparation(config, dataset) print(train_data.dataset.item_feat) print(valid_data.dataset.item_feat) # model loading and initialization model = get_model(config['model'])(config, train_data).to(config['device']) logger.info(model) # trainer loading and initialization trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model) # model training with profiler.profile(enabled=config["monitor"], with_stack=True, profile_memory=True, use_cuda=True) as prof: best_valid_score, best_valid_result = trainer.fit( train_data, valid_data, saved=saved, show_progress=config['show_progress'] ) if prof is not None: print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total')) # model evaluation with profiler.profile(enabled=config["monitor_eval"], with_stack=True, profile_memory=True, use_cuda=True) as prof: test_result = trainer.evaluate(test_data, load_best_model=saved, show_progress=config['show_progress'], cold_warm_distinct_eval=True) if prof is not None: print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total')) logger.info(set_color('best valid ', 'yellow') + f': {best_valid_result}') logger.info(set_color('test result', 'yellow') + f': {test_result}') return { 'best_valid_score': best_valid_score, 'valid_score_bigger': config['valid_metric_bigger'], 'best_valid_result': best_valid_result, 'test_result': test_result }
def run_recbole(model=None, dataset=None, config_file_list=None, config_dict=None, saved=True): r""" A fast running api, which includes the complete process of training and testing a model on a specified dataset Args: model (str): model name dataset (str): dataset name config_file_list (list): config files used to modify experiment parameters config_dict (dict): parameters dictionary used to modify experiment parameters saved (bool): whether to save the model """ # configurations initialization config = Config(model=model, dataset=dataset, config_file_list=config_file_list, config_dict=config_dict) init_seed(config['seed'], config['reproducibility']) # logger initialization init_logger(config) logger = getLogger() logger.info(config) # dataset filtering dataset = create_dataset(config) logger.info(dataset) # dataset splitting train_data, valid_data, test_data = data_preparation(config, dataset) # model loading and initialization model = get_model(config['model'])(config, train_data).to(config['device']) logger.info(model) # trainer loading and initialization trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model) # model training best_valid_score, best_valid_result = trainer.fit(train_data, valid_data, saved=saved) # model evaluation test_result = trainer.evaluate(test_data, load_best_model=saved) logger.info('best valid result: {}'.format(best_valid_result)) logger.info('test result: {}'.format(test_result)) return { 'best_valid_score': best_valid_score, 'valid_score_bigger': config['valid_metric_bigger'], 'best_valid_result': best_valid_result, 'test_result': test_result }
def run_recbole(model=None, dataset=None, config_file_list=None, config_dict=None, saved=True): r""" A fast running api, which includes the complete process of training and testing a model on a specified dataset Args: model (str, optional): Model name. Defaults to ``None``. dataset (str, optional): Dataset name. Defaults to ``None``. config_file_list (list, optional): Config files used to modify experiment parameters. Defaults to ``None``. config_dict (dict, optional): Parameters dictionary used to modify experiment parameters. Defaults to ``None``. saved (bool, optional): Whether to save the model. Defaults to ``True``. """ # configurations initialization config = Config(model=model, dataset=dataset, config_file_list=config_file_list, config_dict=config_dict) init_seed(config['seed'], config['reproducibility']) # logger initialization init_logger(config) logger = getLogger() logger.info(config) # dataset filtering dataset = create_dataset(config) if config['save_dataset']: dataset.save() logger.info(dataset) # dataset splitting train_data, valid_data, test_data = data_preparation(config, dataset) if config['save_dataloaders']: save_split_dataloaders(config, dataloaders=(train_data, valid_data, test_data)) # model loading and initialization model = get_model(config['model'])(config, train_data.dataset).to(config['device']) logger.info(model) # trainer loading and initialization trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model) # model training best_valid_score, best_valid_result = trainer.fit( train_data, valid_data, saved=saved, show_progress=config['show_progress'] ) # model evaluation test_result = trainer.evaluate(test_data, load_best_model=saved, show_progress=config['show_progress']) logger.info(set_color('best valid ', 'yellow') + f': {best_valid_result}') logger.info(set_color('test result', 'yellow') + f': {test_result}') return { 'best_valid_score': best_valid_score, 'valid_score_bigger': config['valid_metric_bigger'], 'best_valid_result': best_valid_result, 'test_result': test_result }
def load_data_and_model(model_file, dataset_file=None, dataloader_file=None):
    r"""Load filtered dataset, split dataloaders and saved model.

    Args:
        model_file (str): The path of saved model file.
        dataset_file (str, optional): The path of filtered dataset. Defaults to ``None``.
        dataloader_file (str, optional): The path of split dataloaders. Defaults to ``None``.

    Note:
        The :attr:`dataset` will be loaded or created according to the following strategy:
        If :attr:`dataset_file` is not ``None``, the :attr:`dataset` will be loaded from :attr:`dataset_file`.
        If :attr:`dataset_file` is ``None`` and :attr:`dataloader_file` is ``None``,
        the :attr:`dataset` will be created according to :attr:`config`.
        If :attr:`dataset_file` is ``None`` and :attr:`dataloader_file` is not ``None``,
        the :attr:`dataset` will neither be loaded nor created.

        The :attr:`dataloader` will be loaded or created according to the following strategy:
        If :attr:`dataloader_file` is not ``None``, the :attr:`dataloader` will be loaded from :attr:`dataloader_file`.
        If :attr:`dataloader_file` is ``None``, the :attr:`dataloader` will be created according to :attr:`config`.

    Returns:
        tuple:
            - config (Config): An instance object of Config, which records parameter information in :attr:`model_file`.
            - model (AbstractRecommender): The model loaded from :attr:`model_file`.
            - dataset (Dataset): The filtered dataset.
            - train_data (AbstractDataLoader): The dataloader for training.
            - valid_data (AbstractDataLoader): The dataloader for validation.
            - test_data (AbstractDataLoader): The dataloader for testing.
    """
    checkpoint = torch.load(model_file)
    config = checkpoint['config']
    init_logger(config)

    dataset = None
    if dataset_file:
        with open(dataset_file, 'rb') as f:
            dataset = pickle.load(f)

    if dataloader_file:
        train_data, valid_data, test_data = load_split_dataloaders(dataloader_file)
    else:
        if dataset is None:
            dataset = create_dataset(config)
        train_data, valid_data, test_data = data_preparation(config, dataset)

    model = get_model(config['model'])(config, train_data.dataset).to(config['device'])
    model.load_state_dict(checkpoint['state_dict'])
    model.load_other_parameter(checkpoint.get('other_parameter'))

    return config, model, dataset, train_data, valid_data, test_data
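# A minimal usage sketch of the cached-loading path; all three paths are
# hypothetical placeholders for artifacts saved by a previous run.
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(
    model_file='../saved/BPR-Mar-20-2021_17-11-05.pth',
    dataset_file='../saved/ml-100k-dataset.pth',                # skips dataset filtering
    dataloader_file='../saved/ml-100k-for-BPR-dataloader.pth',  # skips dataset splitting
)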
def load_example():
    # configurations initialization
    config_dict = {'checkpoint_dir': '../saved'}
    config = Config(model='BPR', dataset='ml-100k', config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])
    init_logger(config)
    logger = getLogger()

    # You can use your filtered data path here.
    with open('../saved/ml-100k-dataset.pth', 'rb') as f:
        dataset = pickle.load(f)

    # You can use your split data path here.
    train_data, valid_data, test_data = load_split_dataloaders('../saved/ml-100k-for-BPR-dataloader.pth')

    model = get_model(config['model'])(config, train_data).to(config['device'])
    checkpoint = torch.load('../saved/BPR-Mar-20-2021_17-11-05.pth')  # Here you can replace it by your model path.
    model.load_state_dict(checkpoint['state_dict'])
    logger.info(model)
    logger.info(train_data.dataset)
    logger.info(valid_data.dataset)
    logger.info(test_data.dataset)
        test_data = get_dataloader(config, 'test')(config, new_test_dataset, None, shuffle=False)
    else:
        train_data = get_dataloader(config, 'train')(config, train_dataset, None, shuffle=True)
        test_data = get_dataloader(config, 'test')(config, test_dataset, None, shuffle=False)

    # model loading and initialization
    model = get_model(config['model'])(config, train_data.dataset).to(config['device'])
    logger.info(model)

    # trainer loading and initialization
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    # model training and evaluation
    test_score, test_result = trainer.fit(
        train_data, test_data, saved=True, show_progress=config['show_progress']
    )

    logger.info(set_color('test result', 'yellow') + f': {test_result}')
from recbole.utils.case_study import full_sort_topk, full_sort_scores

if __name__ == '__main__':
    # this part is to load the saved model.
    config_dict = {
        # here you can set some parameters such as `gpu_id` and so on.
    }
    config = Config(model='BPR', dataset='ml-100k', config_dict=config_dict)
    init_seed(config['seed'], config['reproducibility'])
    dataset = create_dataset(config)
    train_data, valid_data, test_data = data_preparation(config, dataset)
    # Here you can also use `load_split_dataloaders` to load data.
    # The example code for `load_split_dataloaders` can be found in `save_and_load_example.py`.
    model = get_model(config['model'])(config, train_data).to(config['device'])
    checkpoint = torch.load('RecBole/saved/BPR-Dec-08-2020_15-37-37.pth')  # Here you can replace it by your model path.
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    # uid_series = np.array([1, 2])  # internal user id series
    # or you can use dataset.token2id to transfer an external user token to an internal user id
    uid_series = dataset.token2id(dataset.uid_field, ['200'])

    topk_score, topk_iid_list = full_sort_topk(uid_series, model, test_data, k=10)
    print(topk_score)  # scores of top 10 items
    print(topk_iid_list)  # internal ids of top 10 items
    external_item_list = dataset.id2token(dataset.iid_field, topk_iid_list)
    print(external_item_list)  # external tokens of top 10 items
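    # full_sort_scores is imported above but not exercised; a minimal sketch of
    # it under the same loaded model. The item tokens are illustrative examples.
    score = full_sort_scores(uid_series, model, test_data)
    print(score)  # scores of all items for each user in uid_series
    print(score[0, dataset.token2id(dataset.iid_field, ['242', '302'])])  # scores of two specific items for user '200'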
def _load_internal_config_dict(self, model, dataset):
    current_path = os.path.dirname(os.path.realpath(__file__))
    overall_init_file = os.path.join(current_path, '../properties/overall.yaml')
    model_init_file = os.path.join(current_path, '../properties/model/' + model + '.yaml')
    sample_init_file = os.path.join(current_path, '../properties/dataset/sample.yaml')
    dataset_init_file = os.path.join(current_path, '../properties/dataset/' + dataset + '.yaml')

    self.internal_config_dict = dict()
    for file in [overall_init_file, model_init_file, sample_init_file, dataset_init_file]:
        if os.path.isfile(file):
            with open(file, 'r', encoding='utf-8') as f:
                config_dict = yaml.load(f.read(), Loader=self.yaml_loader)
                if file == dataset_init_file:
                    self.parameters['Dataset'] += [
                        key for key in config_dict.keys() if key not in self.parameters['Dataset']
                    ]
            if config_dict is not None:
                self.internal_config_dict.update(config_dict)

    self.internal_config_dict['MODEL_TYPE'] = get_model(model).type
    if self.internal_config_dict['MODEL_TYPE'] == ModelType.GENERAL:
        pass
    elif self.internal_config_dict['MODEL_TYPE'] == ModelType.CONTEXT:
        self.internal_config_dict.update({
            'eval_setting': 'RO_RS',
            'group_by_user': False,
            'training_neg_sample_num': 0,
            'metrics': ['AUC', 'LogLoss'],
            'valid_metric': 'AUC',
        })
        if dataset == 'ml-100k':
            self.internal_config_dict.update({
                'threshold': {'rating': 4},
                'load_col': {
                    'inter': ['user_id', 'item_id', 'rating', 'timestamp'],
                    'user': ['user_id', 'age', 'gender', 'occupation'],
                    'item': ['item_id', 'release_year', 'class']
                },
            })
    elif self.internal_config_dict['MODEL_TYPE'] == ModelType.SEQUENTIAL:
        if model == 'DIN':
            self.internal_config_dict.update({
                'eval_setting': 'TO_LS, uni100',
                'metrics': ['AUC', 'LogLoss'],
                'valid_metric': 'AUC',
            })
            if dataset == 'ml-100k':
                self.internal_config_dict.update({
                    'load_col': {
                        'inter': ['user_id', 'item_id', 'rating', 'timestamp'],
                        'user': ['user_id', 'age', 'gender', 'occupation'],
                        'item': ['item_id', 'release_year']
                    },
                })
        else:
            self.internal_config_dict.update({
                'eval_setting': 'TO_LS,full',
            })
            if dataset == 'ml-100k' and model in ['GRU4RecF', 'SASRecF', 'FDSA', 'S3Rec']:
                self.internal_config_dict.update({
                    'load_col': {
                        'inter': ['user_id', 'item_id', 'rating', 'timestamp'],
                        'item': ['item_id', 'release_year', 'class']
                    },
                })
    elif self.internal_config_dict['MODEL_TYPE'] == ModelType.KNOWLEDGE:
        self.internal_config_dict.update({
            'load_col': {
                'inter': ['user_id', 'item_id', 'rating', 'timestamp'],
                'kg': ['head_id', 'relation_id', 'tail_id'],
                'link': ['item_id', 'entity_id']
            }
        })
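# A toy illustration (plain dicts, not RecBole code) of the merge order the
# method above implements: overall.yaml, then the model yaml, then the dataset
# yamls, with later files overriding earlier ones via dict.update.
overall_cfg = {'epochs': 300, 'learning_rate': 0.001}
model_cfg = {'learning_rate': 0.01}  # stands in for ../properties/model/<model>.yaml

internal = {}
for cfg in [overall_cfg, model_cfg]:
    internal.update(cfg)
print(internal)  # {'epochs': 300, 'learning_rate': 0.01} -- the model yaml wins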