def train(self):
    """Train the model for ``max_epoch`` epochs with checkpointing and early stop.

    Saves a checkpoint whenever the previous epoch improved validation
    performance (``n_no_update == 0``) and stops once ``MAX_N_UPDATE``
    consecutive epochs pass without improvement.  Resource usage for the
    run is recorded into ``self.config["run_time"]``.
    """
    self.monitor = Monitor(
        log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
    )
    self.model_save_dir = os.path.join(
        self.config["model_save_dir"], self.config["save_name"]
    )
    # BUG FIX: was `range(config["max_epoch"])` with a bare `config` name,
    # which raises NameError at runtime; the dict lives on `self.config`.
    for epoch in range(self.config["max_epoch"]):
        print(f"Epoch {epoch} starts !")
        print("-" * 80)
        if epoch > 0 and self.eval_engine.n_no_update == 0:
            # Previous epoch already obtained a better result: checkpoint it.
            self.engine.save_checkpoint(model_dir=self.model_save_dir)
        if self.eval_engine.n_no_update >= MAX_N_UPDATE:
            print(
                "Early stop criterion triggered, no performance update for {:} times"
                .format(MAX_N_UPDATE)
            )
            break
        # The engine consumes the dataset object directly as its loader.
        train_loader = self.dataset
        self.engine.train_an_epoch(epoch_id=epoch, train_loader=train_loader)
        self.eval_engine.train_eval(
            self.dataset.valid[0], self.dataset.test[0], self.engine.model, epoch
        )
    self.config["run_time"] = self.monitor.stop()
def train(self):
    """Dispatch training according to the configured model type.

    Supported ``config["model"]`` values are ``'gcn'``, ``'mlp'``, ``'ncf'``
    and ``'ncf_gcn'`` (the last pre-trains GCN and MLP, then trains NeuMF).
    Raises:
        ValueError: if the model type is not one of the supported options.
    """
    self.monitor = Monitor(
        log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
    )

    def _drain_eval_workers():
        # Block until all pending async evaluation workers have finished.
        while self.eval_engine.n_worker:
            print(f"Wait 15s for the complete of eval_engine.n_worker")
            time.sleep(15)

    model_type = self.config["model"]
    if model_type == "ncf":
        self.train_ncf()
    elif model_type == "gcn":
        self.train_gcn()
    elif model_type == "mlp":
        self.train_mlp()
    elif model_type == "ncf_gcn":
        # Pre-train the GCN and MLP components, then train the fused model.
        self.train_gcn()
        _drain_eval_workers()
        self.train_mlp()
        _drain_eval_workers()
        self.train_ncf()
    else:
        raise ValueError(
            "Model type error: Options are: 'gcn', 'mlp', 'ncf', and 'ncf_gcn'."
        )
    self.config["run_time"] = self.monitor.stop()
    self.eval_engine.test_eval(self.dataset.test, self.engine.model)
def train(self):
    """Default training loop shared by all model engines.

    Runs up to ``max_epoch`` epochs, checkpointing / early-stopping via
    ``check_early_stop`` and halving the learning rate every 10 epochs.

    Returns:
        The best validation performance recorded by the eval engine.
    """
    assert hasattr(self, "engine"), "Please specify the exact model engine !"
    self.monitor = Monitor(
        log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
    )
    self.engine.data = self.dataset
    print("Start training... ")
    # Checkpoint path is loop-invariant; compute it once.
    checkpoint_path = os.path.join(self.config["model_save_dir"], "model.cpk")
    for epoch_idx in tqdm(range(self.config["max_epoch"]), file=sys.stdout):
        print(f"Epoch {epoch_idx} starts !")
        print("-" * 80)
        if self.check_early_stop(
            engine=self.engine,
            model_dir=checkpoint_path,
            epoch=epoch_idx,
        ):
            break
        loader = self.build_data_loader()
        self.engine.train_an_epoch(loader, epoch_id=epoch_idx)
        self.eval_engine.train_eval(
            self.dataset.valid[0],
            self.dataset.test[0],
            self.engine.model,
            epoch_idx,
        )
        # Step decay: halve the initial LR once every 10 epochs.
        decayed_lr = self.config["lr"] * (0.5 ** (epoch_idx // 10))
        for group in self.engine.optimizer.param_groups:
            group["lr"] = decayed_lr
    self.config["run_time"] = self.monitor.stop()
    return self.eval_engine.best_valid_performance
def train(self):
    """Run RESCAL-initialised training, then evaluate on the test set.

    Records the run's resource usage into ``self.config["run_time"]``.
    """
    self.monitor = Monitor(
        log_dir=self.config["run_dir"],
        delay=1,
        gpu_id=self.gpu_id,
    )
    self.train_rescal()
    self.config["run_time"] = self.monitor.stop()
    self.eval_engine.test_eval(self.dataset.test, self.engine.model)
def train(self):
    """Train the MF model and evaluate it on the test set.

    Returns:
        The best validation performance recorded by the eval engine.
    """
    self.monitor = Monitor(
        log_dir=self.config["run_dir"],
        delay=1,
        gpu_id=self.gpu_id,
    )
    self.train_mf()
    self.config["run_time"] = self.monitor.stop()
    self.eval_engine.test_eval(self.dataset.test, self.engine.model)
    return self.eval_engine.best_valid_performance
def train(self):
    """Optionally pre-train CompGCN, then train LightGCN and test-evaluate.

    When ``config["pre_train"]`` is 0 the CompGCN feature encoder is trained
    first; otherwise LightGCN is trained directly.
    """
    self.monitor = Monitor(
        log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
    )
    if self.config["pre_train"] == 0:
        self.train_compgcn()
    self.train_lightgcn()
    self.config["run_time"] = self.monitor.stop()
    self.eval_engine.test_eval(self.dataset.test, self.engine.model)
class LightGCN_train(TrainEngine):
    """LightGCN training engine built on the TrainEngine base class."""

    def __init__(self, config):
        """Constructor.

        Args:
            config (dict): All the parameters for the model.
        """
        self.config = config
        super(LightGCN_train, self).__init__(config)
        self.load_dataset()
        self.build_data_loader()
        self.engine = LightGCNEngine(self.config)

    def build_data_loader(self):
        """Prepare the normalized adjacency matrix and batching parameters."""
        # ToDo: Please define the directory to store the adjacent matrix
        plain_adj, norm_adj, mean_adj = self.dataset.get_adj_mat()
        norm_adj = sparse_mx_to_torch_sparse_tensor(norm_adj)
        self.config["norm_adj"] = norm_adj
        # BUG FIX: was bare `config["batch_size"]` (NameError); use self.config.
        self.config["num_batch"] = (
            self.dataset.n_train // self.config["batch_size"] + 1
        )
        self.config["n_users"] = self.dataset.n_users
        self.config["n_items"] = self.dataset.n_items

    def train(self):
        """Train with checkpointing and early stopping for ``max_epoch`` epochs."""
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        self.model_save_dir = os.path.join(
            self.config["model_save_dir"], self.config["save_name"]
        )
        # BUG FIX: was bare `config["max_epoch"]` (NameError); use self.config.
        for epoch in range(self.config["max_epoch"]):
            print(f"Epoch {epoch} starts !")
            print("-" * 80)
            if epoch > 0 and self.eval_engine.n_no_update == 0:
                # Previous epoch already obtained a better result: checkpoint it.
                self.engine.save_checkpoint(model_dir=self.model_save_dir)
            if self.eval_engine.n_no_update >= MAX_N_UPDATE:
                print(
                    "Early stop criterion triggered, no performance update for {:} times".format(
                        MAX_N_UPDATE
                    )
                )
                break
            train_loader = self.dataset
            self.engine.train_an_epoch(epoch_id=epoch, train_loader=train_loader)
            self.eval_engine.train_eval(
                self.dataset.valid[0], self.dataset.test[0], self.engine.model, epoch
            )
        self.config["run_time"] = self.monitor.stop()

    def test(self):
        """Reload the best checkpoint, then run the base-class test evaluation."""
        self.engine.resume_checkpoint(model_dir=self.model_save_dir)
        super(LightGCN_train, self).test()
def train(self):
    """Optionally pre-train GCN, then train NGCF and test-evaluate.

    When ``config["pre_train"]`` is 0 the GCN encoder is trained first;
    otherwise NGCF is trained directly.
    """
    self.monitor = Monitor(
        log_dir=self.config["run_dir"],
        delay=1,
        gpu_id=self.gpu_id,
    )
    print("train gnn ngcf")
    if self.config["pre_train"] == 0:
        self.train_gcn()
    self.train_ngcf()
    self.config["run_time"] = self.monitor.stop()
    self.eval_engine.test_eval(self.dataset.test, self.engine.model)
class LightGCN_train(TrainEngine):
    """An instance class from the TrainEngine base class.

    Trains LightGCN, optionally preceded by CompGCN pre-training on the
    multi-relation user/item feature graphs.
    """

    def __init__(self, config):
        """Constructor

        Args:
            config (dict): All the parameters for the model
        """
        self.config = config
        super(LightGCN_train, self).__init__(config)
        self.load_dataset()
        self.build_data_loader()
        self.gpu_id, self.config["device_str"] = self.get_device()

    def build_data_loader(self):
        # Build the multi-relation user/item graphs plus the normalized
        # user-item adjacency consumed by the GCN engines.
        (
            user_edge_list,
            user_edge_type,
            item_edge_list,
            item_edge_type,
            self.config["n_user_fea"],
            self.config["n_item_fea"],
        ) = self.dataset.make_multi_graph()
        self.sample_generator = SampleGenerator(ratings=self.dataset.train)
        self.config["user_edge_list"] = torch.LongTensor(user_edge_list)
        self.config["user_edge_type"] = torch.LongTensor(user_edge_type)
        self.config["item_edge_list"] = torch.LongTensor(item_edge_list)
        self.config["item_edge_type"] = torch.LongTensor(item_edge_type)
        # Number of batches per epoch (ceiling division via // + 1).
        self.config["num_batch"] = self.dataset.n_train // self.config[
            "batch_size"] + 1
        self.config["n_users"] = self.dataset.n_users
        self.config["n_items"] = self.dataset.n_items
        plain_adj, norm_adj, mean_adj = self.dataset.get_adj_mat()
        norm_adj = sparse_mx_to_torch_sparse_tensor(norm_adj)
        self.config["norm_adj"] = norm_adj

    def _train(self, engine, train_loader, save_dir):
        """Shared epoch loop: train, early-stop check, and evaluation.

        Args:
            engine: model engine exposing ``train_an_epoch`` and a ``model``.
            train_loader: loader (here the dataset itself) fed to the engine.
            save_dir: checkpoint path handed to ``check_early_stop``.
        """
        self.eval_engine.flush()
        epoch_bar = tqdm(range(self.config["max_epoch"]), file=sys.stdout)
        for epoch in epoch_bar:
            print("Epoch {} starts !".format(epoch))
            print("-" * 80)
            if self.check_early_stop(engine, save_dir, epoch):
                break
            engine.train_an_epoch(train_loader, epoch_id=epoch)
            """evaluate model on validation and test sets"""
            if self.config["validate"]:
                self.eval_engine.train_eval(self.dataset.valid[0],
                                            self.dataset.test[0],
                                            engine.model, epoch)
            else:
                # No validation split requested: evaluate on the test set only.
                self.eval_engine.train_eval(None, self.dataset.test[0],
                                            engine.model, epoch)

    def train(self):
        """Main training navigator.

        Optionally pre-trains CompGCN (when ``pre_train == 0``), then trains
        LightGCN and evaluates on the test set.
        """
        self.monitor = Monitor(log_dir=self.config["run_dir"],
                               delay=1,
                               gpu_id=self.gpu_id)
        if self.config["pre_train"] == 0:
            self.train_compgcn()
        else:
            pass
        self.train_lightgcn()
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.dataset.test, self.engine.model)

    def train_lightgcn(self):
        # Train the LightGCN engine, checkpointing under `save_name`.
        self.model_dir = os.path.join(self.config["model_save_dir"],
                                      self.config["save_name"])
        train_loader = self.dataset
        self.engine = LightGCNEngine(self.config)
        self._train(self.engine, train_loader, self.model_dir)

    def train_compgcn(self):
        """Train the CompGCN encoder (pre-training stage).

        Returns:
            None
        """
        train_loader = self.dataset
        # Train RGCN
        self.engine = CompGCNEngine(self.config)
        self.model_save_dir = os.path.join(
            self.config["model_save_dir"],
            self.config["compgcn_config"]["save_name"])
        self._train(self.engine, train_loader, self.model_save_dir)
        # NOTE(review): sibling implementations follow this print with
        # time.sleep(15); as written this loop busy-prints without sleeping —
        # the chunk may be truncated here. Confirm against the full source.
        while self.eval_engine.n_worker:
            print(f"Wait 15s for the complete of eval_engine.n_worker")
def train_rescal(self):
    """Initialise NGCF embeddings with RESCAL factors, then train NGCF.

    Builds dense relation tensors from the user/item multi-graphs, factorises
    them with RESCAL-ALS to obtain entity embeddings, copies those embeddings
    into a fresh ``NGCFEngine`` model, trains it, and finally evaluates on
    the test set.

    Returns:
        None
    """
    import sys
    self.monitor = Monitor(log_dir=self.config["run_dir"],
                           delay=1,
                           gpu_id=self.gpu_id)
    sys.path.append("../")
    from scipy.io.matlab import loadmat
    from scipy.sparse import lil_matrix
    from rescal.rescal import als as rescal_als

    def edge2Tensor(edge_list, edge_type, n_type):
        # Expand an edge list + per-edge type into a dense (N, N, n_type)
        # binary relation tensor, then split it into per-type sparse slices.
        r_tensor = np.zeros(
            (edge_list.max() + 1, edge_list.max() + 1, n_type))
        for idx, e in enumerate(edge_type):
            i = edge_list[0][idx]
            j = edge_list[1][idx]
            r_tensor[i][j][e] = 1
        print(f"n values: {np.count_nonzero(r_tensor)}")
        X = [
            lil_matrix(r_tensor[:, :, k]) for k in range(r_tensor.shape[2])
        ]
        return X

    def get_emb(r_tensor, dim=64, lambda_A=10, lambda_R=10):
        # Run RESCAL-ALS and keep only the entity embedding matrix A.
        A, R, fit, itr, exectimes = rescal_als(r_tensor,
                                               dim,
                                               init="nvecs",
                                               lambda_A=lambda_A,
                                               lambda_R=lambda_R)
        return A

    (
        user_edge_list,
        user_edge_type,
        item_edge_list,
        item_edge_type,
        n_user_fea,
        n_item_fea,
    ) = self.dataset.make_multi_graph()
    user_r = edge2Tensor(user_edge_list, user_edge_type, n_user_fea)
    item_r = edge2Tensor(item_edge_list, item_edge_type, n_item_fea)
    lambda_A = self.config["lambda_A"]
    lambda_R = self.config["lambda_R"]
    u_emb = get_emb(user_r,
                    dim=self.config["emb_dim"],
                    lambda_A=lambda_A,
                    lambda_R=lambda_R)
    i_emb = get_emb(item_r,
                    dim=self.config["emb_dim"],
                    lambda_A=lambda_A,
                    lambda_R=lambda_R)
    # (Removed two dead `u_emb.astype(...)` / `i_emb.astype(...)` statements:
    # ndarray.astype returns a new array, so their results were discarded.
    # The conversion is done below where the tensors are actually built.)
    train_loader = self.dataset
    self.engine = NGCFEngine(self.config)
    # NOTE(review): this installs float64 weights; torch models default to
    # float32 — confirm the engine expects double precision here.
    self.engine.model.user_embedding.weight.data = torch.tensor(
        u_emb.astype(np.float64)).to(self.engine.device)
    self.engine.model.item_embedding.weight.data = torch.tensor(
        i_emb.astype(np.float64)).to(self.engine.device)
    self.model_save_dir = os.path.join(
        self.config["model_save_dir"],
        self.config["rescal_config"]["save_name"])
    self._train(self.engine, train_loader, self.model_save_dir)
    self.config["run_time"] = self.monitor.stop()
    self.eval_engine.test_eval(self.dataset.test, self.engine.model)
class NGCF_train(TrainEngine):
    """An instance class from the TrainEngine base class.

    Trains NGCF, with optional RESCAL-based embedding initialisation or
    CompGCN pre-training on the multi-relation user/item graphs.
    """

    def __init__(self, config):
        """Constructor

        Args:
            config (dict): All the parameters for the model
        """
        self.config = config
        super(NGCF_train, self).__init__(self.config)
        self.load_dataset()
        self.build_data_loader()

    def build_data_loader(self):
        # ToDo: Please define the directory to store the adjacent matrix
        # Build the multi-relation user/item graphs plus the normalized
        # user-item adjacency consumed by the GCN engines.
        (
            user_edge_list,
            user_edge_type,
            item_edge_list,
            item_edge_type,
            self.config["n_user_fea"],
            self.config["n_item_fea"],
        ) = self.dataset.make_multi_graph()
        self.sample_generator = SampleGenerator(ratings=self.dataset.train)
        self.config["user_edge_list"] = torch.LongTensor(user_edge_list)
        self.config["user_edge_type"] = torch.LongTensor(user_edge_type)
        self.config["item_edge_list"] = torch.LongTensor(item_edge_list)
        self.config["item_edge_type"] = torch.LongTensor(item_edge_type)
        # Number of batches per epoch (ceiling division via // + 1).
        self.config["num_batch"] = self.dataset.n_train // self.config[
            "batch_size"] + 1
        self.config["n_users"] = self.dataset.n_users
        self.config["n_items"] = self.dataset.n_items
        plain_adj, norm_adj, mean_adj = self.dataset.get_adj_mat()
        norm_adj = sparse_mx_to_torch_sparse_tensor(norm_adj)
        self.config["norm_adj"] = norm_adj

    def _train(self, engine, train_loader, save_dir):
        """Shared epoch loop: train, early-stop check, and evaluation.

        Args:
            engine: model engine exposing ``train_an_epoch`` and a ``model``.
            train_loader: loader fed to the engine each epoch.
            save_dir: checkpoint path handed to ``check_early_stop``.
        """
        self.eval_engine.flush()
        epoch_bar = tqdm(range(self.config["max_epoch"]), file=sys.stdout)
        for epoch in epoch_bar:
            print("Epoch {} starts !".format(epoch))
            print("-" * 80)
            if self.check_early_stop(engine, save_dir, epoch):
                break
            engine.train_an_epoch(train_loader, epoch_id=epoch)
            """evaluate model on validation and test sets"""
            if self.config["validate"]:
                self.eval_engine.train_eval(self.dataset.valid[0],
                                            self.dataset.test[0],
                                            engine.model, epoch)
            else:
                # No validation split requested: evaluate on the test set only.
                self.eval_engine.train_eval(None, self.dataset.test[0],
                                            engine.model, epoch)

    def train_rescal(self):
        """Initialise NGCF embeddings with RESCAL factors, then train NGCF.

        Returns:
            None
        """
        import sys
        self.monitor = Monitor(log_dir=self.config["run_dir"],
                               delay=1,
                               gpu_id=self.gpu_id)
        sys.path.append("../")
        from scipy.io.matlab import loadmat
        from scipy.sparse import lil_matrix
        from rescal.rescal import als as rescal_als

        def edge2Tensor(edge_list, edge_type, n_type):
            # Expand an edge list + per-edge type into a dense (N, N, n_type)
            # binary relation tensor, then split into per-type sparse slices.
            r_tensor = np.zeros(
                (edge_list.max() + 1, edge_list.max() + 1, n_type))
            # print(np.count_nonzero(r_tensor))
            for idx, e in enumerate(edge_type):
                i = edge_list[0][idx]
                j = edge_list[1][idx]
                r_tensor[i][j][e] = 1
            print(f"n values: {np.count_nonzero(r_tensor)}")
            X = [
                lil_matrix(r_tensor[:, :, k]) for k in range(r_tensor.shape[2])
            ]
            return X

        def get_emb(r_tensor, dim=64, lambda_A=10, lambda_R=10):
            # Run RESCAL-ALS and keep only the entity embedding matrix A.
            A, R, fit, itr, exectimes = rescal_als(r_tensor,
                                                   dim,
                                                   init="nvecs",
                                                   lambda_A=lambda_A,
                                                   lambda_R=lambda_R)
            return A

        (
            user_edge_list,
            user_edge_type,
            item_edge_list,
            item_edge_type,
            n_user_fea,
            n_item_fea,
        ) = self.dataset.make_multi_graph()
        user_r = edge2Tensor(user_edge_list, user_edge_type, n_user_fea)
        item_r = edge2Tensor(item_edge_list, item_edge_type, n_item_fea)
        lambda_A = self.config["lambda_A"]
        lambda_R = self.config["lambda_R"]
        u_emb = get_emb(user_r,
                        dim=self.config["emb_dim"],
                        lambda_A=lambda_A,
                        lambda_R=lambda_R)
        i_emb = get_emb(item_r,
                        dim=self.config["emb_dim"],
                        lambda_A=lambda_A,
                        lambda_R=lambda_R)
        # NOTE(review): the two statements below discard their results
        # (ndarray.astype returns a new array) — effectively dead code.
        u_emb.astype(np.float64)
        i_emb.astype(np.float64)
        train_loader = self.dataset
        self.engine = NGCFEngine(self.config)
        self.engine.model.user_embedding.weight.data = torch.tensor(
            u_emb.astype(np.float64)).to(self.engine.device)
        self.engine.model.item_embedding.weight.data = torch.tensor(
            i_emb.astype(np.float64)).to(self.engine.device)
        self.model_save_dir = os.path.join(
            self.config["model_save_dir"],
            self.config["rescal_config"]["save_name"])
        self._train(self.engine, train_loader, self.model_save_dir)
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.dataset.test, self.engine.model)

    def train_compgcn(self):
        """Train the CompGCN encoder (pre-training stage).

        Returns:
            None
        """
        train_loader = self.dataset
        # Train RGCN
        self.engine = CompGCNEngine(self.config)
        self.model_save_dir = os.path.join(
            self.config["model_save_dir"],
            self.config["compgcn_config"]["save_name"])
        self._train(self.engine, train_loader, self.model_save_dir)
        # Wait for any asynchronous evaluation workers to finish.
        while self.eval_engine.n_worker:
            print(f"Wait 15s for the complete of eval_engine.n_worker")
            time.sleep(15)  # wait the
class LightGCN_train(TrainEngine):
    """LightGCN trainer with optional GCN pre-training on feature-similarity graphs."""

    def __init__(self, config):
        """Constructor

        Args:
            config (dict): All the parameters for the model
        """
        self.config = config
        super(LightGCN_train, self).__init__(self.config)
        self.load_dataset()
        self.build_data_loader()
        self.gpu_id, self.config["device_str"] = self.get_device()

    def build_data_loader(self):
        # ToDo: Please define the directory to store the adjacent matrix
        # Build feature-similarity adjacencies plus the normalized
        # user-item interaction adjacency used by the engines.
        user_fea_norm_adj, item_fea_norm_adj = self.dataset.make_fea_sim_mat()
        self.sample_generator = SampleGenerator(ratings=self.dataset.train)
        self.config["user_fea_norm_adj"] = sparse_mx_to_torch_sparse_tensor(
            user_fea_norm_adj
        )
        self.config["item_fea_norm_adj"] = sparse_mx_to_torch_sparse_tensor(
            item_fea_norm_adj
        )
        plain_adj, norm_adj, mean_adj = self.dataset.get_adj_mat()
        norm_adj = sparse_mx_to_torch_sparse_tensor(norm_adj)
        self.config["norm_adj"] = norm_adj
        # Number of batches per epoch (ceiling division via // + 1).
        self.config["num_batch"] = self.dataset.n_train // self.config["batch_size"] + 1
        self.config["n_users"] = self.dataset.n_users
        self.config["n_items"] = self.dataset.n_items

    def _train(self, engine, train_loader, save_dir):
        """Shared epoch loop: train, early-stop check, and evaluation.

        Args:
            engine: model engine exposing ``train_an_epoch`` and a ``model``.
            train_loader: loader fed to the engine each epoch.
            save_dir: checkpoint path handed to ``check_early_stop``.
        """
        self.eval_engine.flush()
        epoch_bar = tqdm(range(self.config["max_epoch"]), file=sys.stdout)
        for epoch in epoch_bar:
            print("Epoch {} starts !".format(epoch))
            print("-" * 80)
            if self.check_early_stop(engine, save_dir, epoch):
                break
            engine.train_an_epoch(train_loader, epoch_id=epoch)
            """evaluate model on validation and test sets"""
            self.eval_engine.train_eval(
                self.dataset.valid[0], self.dataset.test[0], engine.model, epoch
            )

    def train(self):
        """Main training navigator.

        Optionally pre-trains GCN (when ``pre_train == 0``), then trains
        LightGCN and evaluates on the test set.
        """
        # Options are: 'gcn', 'mlp', 'ncf', and 'ncf_gcn';
        # Train NeuMF without pre-train
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        print("train gnn lightgcn")
        if self.config["pre_train"] == 0:
            self.train_gcn()
        else:
            pass
        self.train_lightgcn()
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.dataset.test, self.engine.model)

    def train_lightgcn(self):
        # Train the LightGCN engine, checkpointing under `save_name`.
        self.model_dir = os.path.join(
            self.config["model_save_dir"], self.config["save_name"]
        )
        train_loader = self.dataset
        self.engine = LightGCNEngine(self.config)
        self._train(self.engine, train_loader, self.model_dir)

    def train_gcn(self):
        """Train the GCN encoder (pre-training stage).

        Returns:
            None
        """
        train_loader = self.dataset
        # Train GCN
        self.engine = GCN_SEngine(self.config)
        self.gcn_save_dir = os.path.join(
            self.config["model_save_dir"], self.config["gcn_config"]["save_name"]
        )
        print(self.gcn_save_dir)
        self._train(self.engine, train_loader, self.gcn_save_dir)
        # Wait for any asynchronous evaluation workers to finish.
        while self.eval_engine.n_worker:
            print(f"Wait 15s for the complete of eval_engine.n_worker")
            time.sleep(15)  # wait the

    def test(self):
        """Reload the best LightGCN checkpoint, then run the base test evaluation."""
        self.engine.resume_checkpoint(model_dir=self.model_dir)
        super(LightGCN_train, self).test()
class NCF_train(TrainEngine):
    """An instance class from the TrainEngine base class.

    Trains NCF-family models ('ncf', 'gcn', 'mlp') or the staged
    'ncf_gcn' pipeline (GCN -> MLP -> NeuMF).
    """

    def __init__(self, config):
        """Constructor

        Args:
            config (dict): All the parameters for the model
        """
        self.config = config
        super(NCF_train, self).__init__(self.config)
        self.load_dataset()
        self.build_data_loader()
        self.gpu_id, self.config["device_str"] = self.get_device()

    def build_data_loader(self):
        # ToDo: Please define the directory to store the adjacent matrix
        # Build feature-similarity adjacencies and batching parameters.
        user_fea_norm_adj, item_fea_norm_adj = self.dataset.make_fea_sim_mat()
        self.sample_generator = SampleGenerator(ratings=self.dataset.train)
        self.config["user_fea_norm_adj"] = sparse_mx_to_torch_sparse_tensor(
            user_fea_norm_adj
        )
        self.config["item_fea_norm_adj"] = sparse_mx_to_torch_sparse_tensor(
            item_fea_norm_adj
        )
        # Number of batches per epoch (ceiling division via // + 1).
        self.config["num_batch"] = self.dataset.n_train // self.config["batch_size"] + 1
        self.config["n_users"] = self.dataset.n_users
        self.config["n_items"] = self.dataset.n_items

    def _train(self, engine, train_loader, save_dir):
        """Shared epoch loop: train, early-stop check, and evaluation.

        Args:
            engine: model engine exposing ``train_an_epoch`` and a ``model``.
            train_loader: loader fed to the engine each epoch.
            save_dir: checkpoint path handed to ``check_early_stop``.
        """
        self.eval_engine.flush()
        epoch_bar = tqdm(range(self.config["max_epoch"]), file=sys.stdout)
        for epoch in epoch_bar:
            print("Epoch {} starts !".format(epoch))
            print("-" * 80)
            if self.check_early_stop(engine, save_dir, epoch):
                break
            engine.train_an_epoch(train_loader, epoch_id=epoch)
            """evaluate model on validation and test sets"""
            if self.config["validate"]:
                self.eval_engine.train_eval(
                    self.dataset.valid[0], self.dataset.test[0], engine.model, epoch
                )
            else:
                # No validation split requested: evaluate on the test set only.
                self.eval_engine.train_eval(
                    None, self.dataset.test[0], engine.model, epoch
                )

    def train(self):
        """Main training navigator.

        Dispatches on ``config["model"]``; 'ncf_gcn' pre-trains GCN and MLP
        (waiting out async eval workers between stages) before NeuMF.

        Raises:
            ValueError: if the model type is not a supported option.
        """
        # Options are: 'gcn', 'mlp', 'ncf', and 'ncf_gcn';
        # Train NeuMF without pre-train
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        if self.config["model"] == "ncf":
            self.train_ncf()
        elif self.config["model"] == "gcn":
            self.train_gcn()
        elif self.config["model"] == "mlp":
            self.train_mlp()
        elif self.config["model"] == "ncf_gcn":
            self.train_gcn()
            while self.eval_engine.n_worker:
                print(f"Wait 15s for the complete of eval_engine.n_worker")
                time.sleep(15)  # wait the
            self.train_mlp()
            while self.eval_engine.n_worker:
                print(f"Wait 15s for the complete of eval_engine.n_worker")
                time.sleep(15)  # wait the
            self.train_ncf()
        else:
            raise ValueError(
                "Model type error: Options are: 'gcn', 'mlp', 'ncf', and 'ncf_gcn'."
            )
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.dataset.test, self.engine.model)

    def train_ncf(self):
        """Train NeuMF.

        Returns:
            None
        """
        train_loader = self.sample_generator.instance_a_train_loader(
            self.config["num_negative"], self.config["batch_size"]
        )
        self.engine = NeuMFEngine(self.config)
        self.neumf_save_dir = os.path.join(
            self.config["model_save_dir"], self.config["neumf_config"]["save_name"]
        )
        self._train(self.engine, train_loader, self.neumf_save_dir)

    def train_gcn(self):
        """Train the GCN component.

        Returns:
            None
        """
        train_loader = self.dataset
        # Train GCN
        self.engine = GCN_SEngine(self.config)
        self.gcn_save_dir = os.path.join(
            self.config["model_save_dir"], self.config["gcn_config"]["save_name"]
        )
        self._train(self.engine, train_loader, self.gcn_save_dir)
        # Wait for any asynchronous evaluation workers to finish.
        while self.eval_engine.n_worker:
            print(f"Wait 15s for the complete of eval_engine.n_worker")
            time.sleep(15)  # wait the

    def train_mlp(self):
        """Train the MLP component.

        Returns:
            None
        """
        # Train MLP
        train_loader = self.sample_generator.instance_a_train_loader(
            self.config["num_negative"], self.config["batch_size"]
        )
        self.engine = MLPEngine(self.config)
        self.mlp_save_dir = os.path.join(
            self.config["model_save_dir"], self.config["mlp_config"]["save_name"]
        )
        self._train(self.engine, train_loader, self.mlp_save_dir)
        # Wait for any asynchronous evaluation workers to finish.
        while self.eval_engine.n_worker:
            print(f"Wait 15s for the complete of eval_engine.n_worker")
            time.sleep(15)  # wait the
class MF_train(TrainEngine):
    """An instance class from the TrainEngine base class.

    Trains matrix factorisation, optionally preceded by GCN pre-training
    on feature-similarity graphs.
    """

    def __init__(self, config):
        """Constructor

        Args:
            config (dict): All the parameters for the model
        """
        self.config = config
        super(MF_train, self).__init__(self.config)
        self.load_dataset()
        self.build_data_loader()
        self.gpu_id, self.config["device_str"] = self.get_device()

    def build_data_loader(self):
        # ToDo: Please define the directory to store the adjacent matrix
        # Build feature-similarity adjacencies, the normalized interaction
        # adjacency, and batching parameters.
        user_fea_norm_adj, item_fea_norm_adj = self.dataset.make_fea_sim_mat()
        self.sample_generator = SampleGenerator(ratings=self.dataset.train)
        self.config["user_fea_norm_adj"] = sparse_mx_to_torch_sparse_tensor(
            user_fea_norm_adj)
        self.config["item_fea_norm_adj"] = sparse_mx_to_torch_sparse_tensor(
            item_fea_norm_adj)
        plain_adj, norm_adj, mean_adj = self.dataset.get_adj_mat()
        norm_adj = sparse_mx_to_torch_sparse_tensor(norm_adj)
        self.config["norm_adj"] = norm_adj
        # Number of batches per epoch (ceiling division via // + 1).
        self.config["num_batch"] = self.dataset.n_train // self.config[
            "batch_size"] + 1
        self.config["n_users"] = self.dataset.n_users
        self.config["n_items"] = self.dataset.n_items

    def _train(self, engine, train_loader, save_dir):
        """Shared epoch loop: train, early-stop check, and evaluation.

        Args:
            engine: model engine exposing ``train_an_epoch`` and a ``model``.
            train_loader: loader fed to the engine each epoch.
            save_dir: checkpoint path handed to ``check_early_stop``.
        """
        self.eval_engine.flush()
        epoch_bar = tqdm(range(self.config["max_epoch"]), file=sys.stdout)
        for epoch in epoch_bar:
            print("Epoch {} starts !".format(epoch))
            print("-" * 80)
            if self.check_early_stop(engine, save_dir, epoch):
                break
            engine.train_an_epoch(train_loader, epoch_id=epoch)
            """evaluate model on validation and test sets"""
            if self.config["validate"]:
                self.eval_engine.train_eval(self.dataset.valid[0],
                                            self.dataset.test[0],
                                            engine.model, epoch)
            else:
                # No validation split requested: evaluate on the test set only.
                self.eval_engine.train_eval(None, self.dataset.test[0],
                                            engine.model, epoch)

    def train(self):
        """Main training navigator.

        Optionally pre-trains GCN (when ``pre_train == 0``), then trains MF
        and evaluates on the test set.
        """
        self.monitor = Monitor(log_dir=self.config["run_dir"],
                               delay=1,
                               gpu_id=self.gpu_id)
        if self.config["pre_train"] == 0:
            print(self.config["pre_train"])
            self.train_gcn()
        else:
            pass
        self.train_mf()
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.dataset.test, self.engine.model)

    def train_mf(self):
        """Train the MF model with the configured loss ('bpr' or 'bce').

        Raises:
            ValueError: if ``config["loss"]`` is neither 'bpr' nor 'bce'.

        Returns:
            None
        """
        if self.config["loss"] == "bpr":
            train_loader = self.sample_generator.pairwise_negative_train_loader(
                self.config["batch_size"], self.config["device_str"])
        elif self.config["loss"] == "bce":
            train_loader = self.sample_generator.uniform_negative_train_loader(
                self.config["num_negative"],
                self.config["batch_size"],
                self.config["device_str"],
            )
        else:
            raise ValueError(
                f"Unsupported loss type {self.config['loss']}, try other options: 'bpr' or 'bce'"
            )
        self.engine = MFEngine(self.config)
        self.model_save_dir = os.path.join(self.config["model_save_dir"],
                                           self.config["save_name"])
        self._train(self.engine, train_loader, self.model_save_dir)

    def train_gcn(self):
        """Train the GCN encoder (pre-training stage).

        Returns:
            None
        """
        train_loader = self.dataset
        # Train GCN
        self.engine = GCN_SEngine(self.config)
        self.gcn_save_dir = os.path.join(
            self.config["model_save_dir"],
            self.config["gcn_config"]["save_name"])
        self._train(self.engine, train_loader, self.gcn_save_dir)
        # Wait for any asynchronous evaluation workers to finish.
        while self.eval_engine.n_worker:
            print(f"Wait 15s for the complete of eval_engine.n_worker")
            time.sleep(15)  # wait the
class TrainEngine(object):
    """Training engine for all the models.

    Base class that owns the dataset, resource monitor, model engine and
    evaluation engine; subclasses override ``build_data_loader`` and the
    ``train`` navigator as needed.
    """

    def __init__(self, config):
        """Initialing

        Args:
            config (dict): Config dict received from command line.
                Should have the config["config_file"].

        Attributes:
            dataset (Dataset): A dataset containing DataFrame of train,
                validation and test.
            train_data (DataLoader): Extracted training data or train
                DataLoader, need to be implement.
            monitor (Monitor): An monitor object that monitor the
                computational resources.
            engine (Model Engine)
        """
        self.dataset = None
        self.train_data = None
        self.monitor = None
        self.engine = None
        self.config = prepare_env(config)
        self.gpu_id, self.config["device_str"] = self.get_device()
        self.eval_engine = EvalEngine(self.config)

    def get_device(self):
        """Get one gpu id that have the most available memory.

        Returns:
            (int, str): The gpu id (None if no available gpu) and the
            the device string (pytorch style).
        """
        if "device" in self.config:
            if self.config["device"] == "cpu":
                return (None, "cpu")
            elif "cuda" in self.config[
                    "device"]:  # receive an string with "cuda:#"
                return (
                    int(self.config["device"].replace("cuda:", "")),
                    self.config["device"],
                )
            # NOTE(review): the comment says "receive an int string", but the
            # condition is only true for an EMPTY string, and int("") raises
            # ValueError — looks like the intended test was `>= 1`; confirm.
            elif len(self.config["device"]) < 1:  # receive an int string
                return (
                    int(self.config["device"]),
                    "cuda:" + self.config["device"],
                )
        gpu_id_list = GPUtil.getAvailable(
            order="memory", limit=3)  # get the fist gpu with the lowest load
        if len(gpu_id_list) < 1:
            gpu_id = None
            device_str = "cpu"
        else:
            gpu_id = gpu_id_list[0]
            # need to set 0 if ray only specify 1 gpu
            if "CUDA_VISIBLE_DEVICES" in os.environ:
                if len(os.environ["CUDA_VISIBLE_DEVICES"].split()) == 1:
                    # gpu_id = int(os.environ["CUDA_VISIBLE_DEVICES"])
                    gpu_id = 0
                    print("Find only one gpu with id: ", gpu_id)
                    device_str = "cuda:" + str(gpu_id)
                    # print(os.system("nvidia-smi"))
            # NOTE(review): if CUDA_VISIBLE_DEVICES lists several GPUs,
            # device_str may be left unset here (UnboundLocalError on
            # return) — confirm the intended else-nesting in the full source.
            else:
                print("Get a gpu with the most available memory :", gpu_id)
                device_str = "cuda:" + str(gpu_id)
        return gpu_id, device_str

    def load_dataset(self):
        """Default implementation of building dataset

        Returns:
            None
        """
        self.dataset = data_util.Dataset(self.config)
        # Feature matrices are only attached when a feature type is configured.
        if "item_fea_type" in self.config or "user_fea_type" in self.config:
            self.config["item_fea"] = self.dataset.item_feature
            self.config["user_fea"] = self.dataset.user_feature
        self.config["n_users"] = self.dataset.n_users
        self.config["n_items"] = self.dataset.n_items

    # noinspection PyTypeChecker
    def build_data_loader(self):
        """Default data builder

        Returns:
            DataLoader
        """
        return DataLoader(
            torch.LongTensor(self.train_data.to_numpy()).to(
                self.engine.device),
            batch_size=self.config["batch_size"],
            shuffle=True,
            drop_last=True,
        )

    def check_early_stop(self, engine, model_dir, epoch):
        """Check if early stop criterion is triggered
        Save model if previous epoch have already obtained better result

        Args:
            epoch (int): epoch num

        Returns:
            True: if early stop criterion is triggered
            False: else
        """
        if epoch > 0 and self.eval_engine.n_no_update == 0:
            # save model if previous epoch have already obtained better result
            engine.save_checkpoint(model_dir=model_dir)
        if self.eval_engine.n_no_update >= MAX_N_UPDATE:
            # stop training if early stop criterion is triggered
            print(
                "Early stop criterion triggered, no performance update for {:} times"
                .format(MAX_N_UPDATE))
            return True
        return False

    def train(self):
        """Default train implementation """
        assert hasattr(self,
                       "engine"), "Please specify the exact model engine !"
        self.monitor = Monitor(log_dir=self.config["run_dir"],
                               delay=1,
                               gpu_id=self.gpu_id)
        self.engine.data = self.dataset
        print("Start training... ")
        epoch_bar = tqdm(range(self.config["max_epoch"]), file=sys.stdout)
        for epoch in epoch_bar:
            print(f"Epoch {epoch} starts !")
            print("-" * 80)
            if self.check_early_stop(
                    engine=self.engine,
                    model_dir=os.path.join(self.config["model_save_dir"],
                                           "model.cpk"),
                    epoch=epoch,
            ):
                break
            data_loader = self.build_data_loader()
            self.engine.train_an_epoch(data_loader, epoch_id=epoch)
            self.eval_engine.train_eval(self.dataset.valid[0],
                                        self.dataset.test[0],
                                        self.engine.model, epoch)
            """Sets the learning rate to the initial LR decayed by 10 every 10 epochs"""
            # NOTE(review): the string above says "decayed by 10" but the
            # factor is 0.5 — the LR is actually halved every 10 epochs.
            lr = self.config["lr"] * (0.5**(epoch // 10))
            for param_group in self.engine.optimizer.param_groups:
                param_group["lr"] = lr
        self.config["run_time"] = self.monitor.stop()
        return self.eval_engine.best_valid_performance

    def test(self):
        """Evaluate the performance for the testing sets based on the final
        model.
        """
        self.eval_engine.test_eval(self.dataset.test, self.engine.model)
class MF_train(TrainEngine):
    """An instance class from the TrainEngine base class.

    Plain matrix-factorisation trainer (no GCN pre-training).
    """

    def __init__(self, config):
        """Constructor

        Args:
            config (dict): All the parameters for the model
        """
        self.config = config
        super(MF_train, self).__init__(self.config)
        self.load_dataset()
        self.build_data_loader()
        self.gpu_id, self.config["device_str"] = self.get_device()

    def build_data_loader(self):
        # ToDo: Please define the directory to store the adjacent matrix
        # Only a negative-sampling generator is needed for plain MF.
        self.sample_generator = SampleGenerator(ratings=self.dataset.train)

    def _train(self, engine, train_loader, save_dir):
        """Shared epoch loop: train, early-stop check, and evaluation.

        Args:
            engine: model engine exposing ``train_an_epoch`` and a ``model``.
            train_loader: loader fed to the engine each epoch.
            save_dir: checkpoint path handed to ``check_early_stop``.
        """
        self.eval_engine.flush()
        epoch_bar = tqdm(range(self.config["max_epoch"]), file=sys.stdout)
        for epoch in epoch_bar:
            print("Epoch {} starts !".format(epoch))
            print("-" * 80)
            if self.check_early_stop(engine, save_dir, epoch):
                break
            engine.train_an_epoch(train_loader, epoch_id=epoch)
            """evaluate model on validation and test sets"""
            if self.config["validate"]:
                self.eval_engine.train_eval(
                    self.dataset.valid[0], self.dataset.test[0], engine.model, epoch
                )
            else:
                # No validation split requested: evaluate on the test set only.
                self.eval_engine.train_eval(
                    None, self.dataset.test[0], engine.model, epoch
                )

    def train(self):
        """Main training navigator.

        Trains MF, evaluates on the test set, and returns the best
        validation performance.
        """
        # Options are: 'gcn', 'mlp', 'ncf', and 'ncf_gcn';
        # Train NeuMF without pre-train
        self.monitor = Monitor(
            log_dir=self.config["run_dir"], delay=1, gpu_id=self.gpu_id
        )
        self.train_mf()
        self.config["run_time"] = self.monitor.stop()
        self.eval_engine.test_eval(self.dataset.test, self.engine.model)
        return self.eval_engine.best_valid_performance

    def train_mf(self):
        """Train the MF model with the configured loss ('bpr' or 'bce').

        Raises:
            ValueError: if ``config["loss"]`` is neither 'bpr' nor 'bce'.

        Returns:
            None
        """
        if self.config["loss"] == "bpr":
            train_loader = self.sample_generator.pairwise_negative_train_loader(
                self.config["batch_size"], self.config["device_str"]
            )
        elif self.config["loss"] == "bce":
            train_loader = self.sample_generator.uniform_negative_train_loader(
                self.config["num_negative"],
                self.config["batch_size"],
                self.config["device_str"],
            )
        else:
            raise ValueError(
                f"Unsupported loss type {self.config['loss']}, try other options: 'bpr' or 'bce'"
            )
        self.engine = MFEngine(self.config)
        self.model_save_dir = os.path.join(
            self.config["model_save_dir"], self.config["save_name"]
        )
        print(self.model_save_dir)
        self._train(self.engine, train_loader, self.model_save_dir)