def test_model(model, data_loader, mode, teaching_force=False, **kwargs):
    predictions = list()
    targets = list()
    tqdm_loader = tqdm(enumerate(data_loader))
    model = convert_to_gpu(model)
    # model = nn.DataParallel(convert_to_gpu(model), [0, 1])
    if kwargs['return_attn']:
        attn_record = list()
    with torch.no_grad():
        model.eval()
        # note: the loader is already wrapped in tqdm above, so iterate it directly
        for step, (features, truth, covariate) in tqdm_loader:
            features = convert_to_gpu(features)
            truth = convert_to_gpu(truth)
            covariate = convert_to_gpu(covariate)
            if kwargs['return_attn']:
                outputs, attn = model(features, covariate)
                # attn_record.append(attn.cpu().numpy())
            else:
                outputs = model(features, covariate)
            outputs, truth = normalized_transform(outputs, truth, **kwargs)
            targets.append(truth.cpu().numpy())
            predictions.append(outputs.cpu().detach().numpy())
    predictions = np.concatenate(predictions)
    targets = np.concatenate(targets)
    print(predictions.shape)
    print(calculate_metrics(predictions, targets, mode, **kwargs))
    if kwargs['return_attn']:
        # attn_plot(attn_record)
        # np.save('data/result/attn.npy', attn)
        return attn
    else:
        return predictions, targets
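# Hypothetical sketch of the kind of inverse-normalization that normalized_transform performs above
# (an assumption about its behavior, not the repo's implementation): before metrics are computed,
# predictions and ground truth are mapped back to the original scale, e.g. by undoing a z-score
# normalization with a scaler passed through kwargs. The dict-style scaler here is a placeholder.
def normalized_transform_sketch(outputs, truth, scaler=None, **kwargs):
    if scaler is not None:
        outputs = outputs * scaler['std'] + scaler['mean']  # assumed {'mean': ..., 'std': ...} scaler
        truth = truth * scaler['std'] + scaler['mean']
    return outputs, truth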
def __getitem__(self, index):
    """
    :param index:
    :return: g, graph, fully connected, containing N nodes, unweighted
             nodes_feature, tensor (N, item_embedding)
             edges_weight, tensor (T, N*N)
             nodes, tensor (N, )
             user_data, list, (baskets, items)
    """
    # list of tensors
    user_data = self.data_list[index]
    # nodes -> tensor, len(nodes) = N
    # may change the order of appearing items in dataset
    nodes = self.get_nodes(baskets=user_data[:-1])
    # N * item_embedding tensor
    nodes_feature = self.item_embedding_matrix(convert_to_gpu(nodes))
    # construct graph for the user
    project_nodes = torch.tensor(list(range(nodes.shape[0])))
    # construct fully connected graph, containing N nodes, unweighted
    # (0, 0), (0, 1), ..., (0, N-1), (1, 0), (1, 1), ..., (1, N-1), ...
    # src -> [0, 0, 0, ..., N-1, N-1, N-1], dst -> [0, 1, ..., N-1, ..., 0, 1, ..., N-1]
    src = torch.stack([project_nodes for _ in range(project_nodes.shape[0])], dim=1).flatten().tolist()
    dst = torch.stack([project_nodes for _ in range(project_nodes.shape[0])], dim=0).flatten().tolist()
    g = dgl.graph((src, dst), num_nodes=project_nodes.shape[0])
    edges_weight_dict = self.get_edges_weight(user_data[:-1])
    # add self-loop
    for node in nodes.tolist():
        if edges_weight_dict[(node, node)] == 0.0:
            edges_weight_dict[(node, node)] = 1.0
    # normalize weight
    max_weight = max(edges_weight_dict.values())
    for i, j in edges_weight_dict.items():
        edges_weight_dict[i] = j / max_weight
    # get edge weight for each timestamp, shape (T, N*N)
    edges_weight = []
    for basket in user_data[:-1]:
        basket = basket.tolist()
        # list containing N * N weights of elements
        edge_weight = []
        for node_1 in nodes.tolist():
            for node_2 in nodes.tolist():
                if (node_1 in basket and node_2 in basket) or (node_1 == node_2):
                    # each node has a self connection
                    edge_weight.append(edges_weight_dict[(node_1, node_2)])
                else:
                    edge_weight.append(0.0)
        edges_weight.append(torch.Tensor(edge_weight))
    # tensor -> shape (T, N*N)
    edges_weight = torch.stack(edges_weight)
    return g, nodes_feature, edges_weight, nodes, user_data
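# Minimal collate sketch (an assumption, not the repo's actual collate_fn): __getitem__ above returns
# a DGLGraph per user, which the default DataLoader collate cannot stack, so a custom collate_fn
# built on dgl.batch is typically required. The real pipeline evidently also pads edge weights and
# yields lengths / truth_data / users_frequency (see the training loop further down); this sketch
# only illustrates the graph-batching step.
def collate_fn_sketch(batch):
    graphs, nodes_features, edges_weights, nodes, user_data = zip(*batch)
    batched_graph = dgl.batch(list(graphs))              # merge the per-user graphs into one graph
    batched_features = torch.cat(nodes_features, dim=0)  # node features are concatenated along the node dim
    return batched_graph, batched_features, list(edges_weights), list(nodes), list(user_data)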
def train_model(model: nn.Module, data_loaders: Dict[str, DataLoader], loss_func: callable,
                optimizer, model_folder: str, tensorboard_folder: str, pid: int):
    phases = ['train', 'validate', 'test']
    writer = SummaryWriter(tensorboard_folder)
    num_epochs = get_attribute('epochs')

    since = time.perf_counter()
    model = convert_to_gpu(model)
    loss_func = convert_to_gpu(loss_func)

    save_dict, best_f1_score = {'model_state_dict': copy.deepcopy(model.state_dict()), 'epoch': 0}, 0
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=.5, patience=2, threshold=1e-3, min_lr=1e-6)
    test_metric = None
    try:
        for epoch in range(num_epochs):
            running_loss, running_metrics = {phase: 0.0 for phase in phases}, {phase: dict() for phase in phases}
            save_validate_this_epoch = False
            for phase in phases:
                if phase == 'train':
                    model.train()
                else:
                    model.eval()

                steps, predictions, targets = 0, list(), list()
                tqdm_loader = tqdm(enumerate(data_loaders[phase]))
                for step, (g, spatial_features, temporal_features, external_features, truth_data) in tqdm_loader:
                    if list(external_features.size())[0] != get_attribute("batch_size"):
                        continue
                    if not get_attribute("use_spatial_features"):
                        torch.zero_(spatial_features)
                    if not get_attribute("use_temporal_features"):
                        torch.zero_(temporal_features)
                    if not get_attribute("use_external_features"):
                        torch.zero_(external_features)

                    features, truth_data = convert_train_truth_to_gpu(
                        [spatial_features, temporal_features, external_features], truth_data)
                    with torch.set_grad_enabled(phase == 'train'):
                        _outputs = model(g, *features)
                        outputs = torch.squeeze(_outputs)  # squeeze [batch-size, 1] to [batch-size]
                        loss = loss_func(truth=truth_data, predict=outputs)

                        if phase == 'train':
                            optimizer.zero_grad()
                            loss.backward()
                            optimizer.step()

                    targets.append(truth_data.cpu().numpy())
                    with torch.no_grad():
                        predictions.append(outputs.cpu().detach().numpy())

                    # accumulate a plain float (loss.item()) so no computation graph is kept alive
                    running_loss[phase] += loss.item() * truth_data.size(0)
                    steps += truth_data.size(0)

                    tqdm_loader.set_description(
                        f'{pid:2} pid: {phase:8} epoch: {epoch:3}, {phase:8} loss: {running_loss[phase] / steps:3.6}')

                    # For the issue that the CPU memory increases while training. DO NOT know why, but it works.
                    torch.cuda.empty_cache()

                print(f'{phase} metric ...')
                _cp = np.concatenate(predictions)
                _ct = np.concatenate(targets)
                scores = evaluate(_cp, _ct)
                running_metrics[phase] = scores
                print(scores)

                if phase == 'validate' and scores['F1-SCORE'] > best_f1_score:
                    best_f1_score = scores['F1-SCORE']
                    save_validate_this_epoch = True
                    save_dict.update(model_state_dict=copy.deepcopy(model.state_dict()),
                                     epoch=epoch,
                                     optimizer_state_dict=copy.deepcopy(optimizer.state_dict()))
                    print(f"save model as {model_folder}/model_{epoch}.pkl")
                    save_model(f"{model_folder}/model_{epoch}.pkl", **save_dict)

            scheduler.step(running_loss['train'])
            if save_validate_this_epoch:
                test_metric = running_metrics["test"].copy()

            for metric in running_metrics['train'].keys():
                writer.add_scalars(metric, {
                    f'{phase} {metric}': running_metrics[phase][metric] for phase in phases},
                    global_step=epoch)
            writer.add_scalars('Loss', {
                f'{phase} loss': running_loss[phase] / len(data_loaders[phase].dataset) for phase in phases},
                global_step=epoch)
    finally:
        time_elapsed = time.perf_counter() - since
        print(f"cost {time_elapsed} seconds")
        save_model(f"{model_folder}/best_model.pkl", **save_dict)
    return test_metric
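# Sketch of restoring a checkpoint written via save_dict above, assuming save_model simply
# torch.save()s its keyword arguments as a plain dict (consistent with how the evaluation script
# below reads model_state_dict / epoch). The helper name is hypothetical, not part of the repo.
def load_checkpoint_sketch(path: str, model: nn.Module, optimizer=None):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    # optimizer_state_dict is only present once a validate epoch has improved the F1 score
    if optimizer is not None and 'optimizer_state_dict' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return checkpoint['epoch']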
def train_model(model: nn.Module, train_data_loader: DataLoader, valid_data_loader: DataLoader,
                loss_func, epochs, optimizer, model_folder, tensorboard_folder):
    """
    Args:
        model: nn.Module
        train_data_loader: DataLoader
        valid_data_loader: DataLoader
        loss_func: nn.Module
        epochs: int
        optimizer: Optimizer
        model_folder: str
        tensorboard_folder: str
    """
    warnings.filterwarnings('ignore')
    print(model)
    print(optimizer)

    writer = SummaryWriter(tensorboard_folder)
    writer.add_text('Welcome', 'Welcome to tensorboard!')

    model = convert_to_gpu(model)
    model.train()
    loss_func = convert_to_gpu(loss_func)

    start_time = datetime.datetime.now()
    validate_max_ndcg = 0
    name_list = ["train", "validate"]
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer)

    for epoch in range(epochs):
        loss_dict, metric_dict = {name: 0.0 for name in name_list}, {name: dict() for name in name_list}
        data_loader_dic = {"train": train_data_loader, "validate": valid_data_loader}

        for name in name_list:
            if name == "train":
                model.train()
            else:
                model.eval()

            y_true = []
            y_pred = []
            total_loss = 0.0
            tqdm_loader = tqdm(data_loader_dic[name])
            for step, (g, nodes_feature, edges_weight, lengths, nodes, truth_data, users_frequency) in enumerate(tqdm_loader):
                g, nodes_feature, edges_weight, lengths, nodes, truth_data, users_frequency = \
                    convert_all_data_to_gpu(g, nodes_feature, edges_weight, lengths, nodes, truth_data, users_frequency)

                with torch.set_grad_enabled(name == 'train'):
                    # (B, N)
                    output = model(g, nodes_feature, edges_weight, lengths, nodes, users_frequency)
                    loss = loss_func(output, truth_data.float())
                    total_loss += loss.item()

                    if name == "train":
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                    y_pred.append(output.detach().cpu())
                    y_true.append(truth_data.detach().cpu())

                tqdm_loader.set_description(f'{name} epoch: {epoch}, {name} loss: {total_loss / (step + 1)}')

            loss_dict[name] = total_loss / (step + 1)

            y_true = torch.cat(y_true, dim=0)
            y_pred = torch.cat(y_pred, dim=0)

            print(f'{name} metric ...')
            scores = get_metric(y_true=y_true, y_pred=y_pred)
            scores = sorted(scores.items(), key=lambda item: item[0], reverse=False)
            scores = {item[0]: item[1] for item in scores}
            print(json.dumps(scores, indent=4))
            metric_dict[name] = scores

            # save best model
            if name == "validate":
                validate_ndcg_list = []
                for key in metric_dict["validate"]:
                    if key.startswith("ndcg_"):
                        validate_ndcg_list.append(metric_dict["validate"][key])
                validate_ndcg = np.mean(validate_ndcg_list)
                if validate_ndcg > validate_max_ndcg:
                    validate_max_ndcg = validate_ndcg
                    model_path = f"{model_folder}/model_epoch_{epoch}.pkl"
                    save_model(model, model_path)
                    print(f"model saved as {model_path}")

        scheduler.step(loss_dict['train'])

        writer.add_scalars('Loss', {f'{name} loss': loss_dict[name] for name in name_list}, global_step=epoch)
        for metric in metric_dict['train'].keys():
            for name in name_list:
                writer.add_scalars(f'{name} {metric}', {f'{metric}': metric_dict[name][metric]}, global_step=epoch)

    end_time = datetime.datetime.now()
    print("cost %d seconds" % (end_time - start_time).seconds)
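# Hypothetical sketch of the save_model helper called as save_model(model, model_path) above
# (an assumption; the repo's own helper may store extra metadata). It only persists the state_dict,
# which is the minimum consistent with that call signature.
import os

def save_model_sketch(model: nn.Module, model_path: str):
    os.makedirs(os.path.dirname(model_path), exist_ok=True)  # make sure the model folder exists
    torch.save(model.state_dict(), model_path)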
import numpy as np
import torch
from tqdm import tqdm

from train.train_main import create_model
from utils.data_container import get_data_loaders
from utils.load_config import get_attribute
from utils.metric import evaluate
from utils.util import convert_to_gpu, convert_train_truth_to_gpu

if __name__ == '__main__':
    model_path = "../saves/spatial_temporal_external/DSTGCN/model_0.pkl"
    print(f'model path -> {model_path}')
    checkpoint = torch.load(model_path)
    model = create_model()
    model.load_state_dict(checkpoint["model_state_dict"])
    print(f'model epoch -> {checkpoint["epoch"]}')
    model = convert_to_gpu(model)
    print(model)

    data_loaders = get_data_loaders(get_attribute('K_hop'), get_attribute('batch_size'))
    phase = "test"
    tqdm_loader = tqdm(enumerate(data_loaders[phase]))

    predictions, targets = list(), list()
    model.eval()
    with torch.no_grad():
        for step, (g, spatial_features, temporal_features, external_features, truth_data) in tqdm_loader:
            torch.zero_(external_features)
            features, truth_data = convert_train_truth_to_gpu(
                [spatial_features, temporal_features, external_features], truth_data)
            outputs = model(g, *features)
            predictions.append(torch.squeeze(outputs).cpu().numpy())
            targets.append(truth_data.cpu().numpy())

    # evaluate on the collected test predictions, mirroring the training loop's metric call
    print(evaluate(np.concatenate(predictions), np.concatenate(targets)))
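# Hypothetical sketch of the convert_to_gpu helper imported from utils.util above (an assumption,
# not the actual implementation): the pattern used throughout these scripts is to move a tensor or
# module to CUDA when a GPU is available and leave it on the CPU otherwise.
def convert_to_gpu_sketch(data):
    if torch.cuda.is_available():
        return data.cuda()
    return data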
def train_my_model(model: nn.Module, data_loader, loss_func: callable, optimizer, num_epochs,
                   model_folder, tensorboard_folder: str, **kwargs):
    phases = ['train', 'val', 'test']
    writer = SummaryWriter(tensorboard_folder)
    model = convert_to_gpu(model)
    # model = nn.DataParallel(convert_to_gpu(model), [0, 1])
    loss_func = convert_to_gpu(loss_func)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=.1, patience=8, threshold=1e-4, min_lr=1e-6)
    save_dict = {'model_state_dict': copy.deepcopy(model.state_dict()), 'epoch': 0}
    loss_global = 100000

    for epoch in range(num_epochs):
        running_loss = {phase: 0.0 for phase in phases}
        for phase in phases:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            steps, predictions, targets = 0, list(), list()
            tqdm_loaders = tqdm(enumerate(data_loader[phase]))
            for step, (features, truth, covariate) in tqdm_loaders:
                features = convert_to_gpu(features)
                truth = convert_to_gpu(truth)
                covariate = convert_to_gpu(covariate)

                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(features, covariate)
                    if not get_Parameter('loss_normalized'):
                        outputs, truth = normalized_transform(outputs, truth, **kwargs)
                    taxi_pickup_loss = loss_func(truth[:, :, :get_Parameter('taxi_size'), 0],
                                                 outputs[:, :, :get_Parameter('taxi_size'), 0])
                    taxi_dropoff_loss = loss_func(truth[:, :, :get_Parameter('taxi_size'), 1],
                                                  outputs[:, :, :get_Parameter('taxi_size'), 1])
                    # taxi_loss = loss_func(truth[:, :, :get_Parameter('taxi_size')], outputs[:, :, :get_Parameter('taxi_size')])
                    taxi_loss = taxi_pickup_loss + taxi_dropoff_loss * 1.5
                    bike_loss = loss_func(truth[:, :, get_Parameter('taxi_size'):],
                                          outputs[:, :, get_Parameter('taxi_size'):])
                    # if epoch <= 100:
                    #     loss = (2 * taxi_loss + bike_loss) * 100
                    # else:
                    #     loss = taxi_loss
                    # loss = taxi_loss + 30 * bike_loss
                    loss = (1.5 * taxi_loss + bike_loss) * 100
                    # loss = loss_func(truth, outputs)
                    # loss = bike_loss

                    if phase == 'train':
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                if get_Parameter('loss_normalized'):
                    outputs, truth = normalized_transform(outputs, truth, **kwargs)
                targets.append(truth.cpu().numpy())
                with torch.no_grad():
                    predictions.append(outputs.cpu().numpy())
                running_loss[phase] += loss.item()
                steps += truth.size(0)
                tqdm_loaders.set_description(f'{phase} epoch: {epoch}, {phase} loss: {running_loss[phase] / steps}')

            predictions = np.concatenate(predictions)
            targets = np.concatenate(targets)
            scores = calculate_metrics(predictions.reshape(predictions.shape[0], -1),
                                       targets.reshape(targets.shape[0], -1), mode='train', **kwargs)
            print(scores)
            writer.add_scalars(f'score/{phase}', scores, global_step=epoch)

            if phase == 'val' and scores['RMSE'] < loss_global:
                loss_global = scores['RMSE']
                save_dict.update(model_state_dict=copy.deepcopy(model.state_dict()),
                                 epoch=epoch,
                                 optimizer_state_dict=copy.deepcopy(optimizer.state_dict()))

        scheduler.step(running_loss['train'])
        writer.add_scalars('Loss', {f'{phase} loss': running_loss[phase] for phase in phases}, global_step=epoch)

    save_model(f'{model_folder}/best_model.pkl', **save_dict)
    model.load_state_dict(save_dict['model_state_dict'])
    return model
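# Hypothetical end-to-end usage of train_my_model together with test_model defined earlier in this
# section. build_model, build_loaders and the scaler keyword are placeholders, not repo functions;
# only return_attn is read directly by test_model.
#
# model = build_model()
# loaders = build_loaders()  # dict with 'train' / 'val' / 'test' DataLoaders
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# best_model = train_my_model(model, loaders, nn.L1Loss(), optimizer, num_epochs=200,
#                             model_folder='saves/exp', tensorboard_folder='runs/exp', scaler=scaler)
# test_model(best_model, loaders['test'], mode='test', return_attn=False, scaler=scaler)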