def train(self):
    """Run the training loop with early stopping when the loss plateaus.

    Returns: None.
    """
    max_noprogress = 5
    _loss_train_min = 1e-5
    n_noprogress = 0
    process_bar = tqdm(range(self.iteration))
    liveloss = PlotLosses(fig_path=self.output_file_name + ".iter.pdf")
    loss_list = []
    _best_ndcg = 0
    for i in process_bar:
        logs = {}
        all_loss = 0
        kl_loss = 0
        batch_num = 0
        for batch_ndx, sample in enumerate(self.data_loader):
            pos_u = torch.tensor(
                [triple[0] for triple in sample],
                dtype=torch.int64,
                device=self.device,
            )
            pos_i_1 = torch.tensor(
                [triple[1] for triple in sample],
                dtype=torch.int64,
                device=self.device,
            )
            pos_i_2 = torch.tensor(
                [triple[2] for triple in sample],
                dtype=torch.int64,
                device=self.device,
            )
            neg_u = torch.tensor(
                self.data.user_sampler.sample(self.n_neg, len(sample)),
                dtype=torch.int64,
                device=self.device,
            )
            neg_i_1 = torch.tensor(
                self.data.item_sampler.sample(self.n_neg, len(sample)),
                dtype=torch.int64,
                device=self.device,
            )
            neg_i_2 = torch.tensor(
                self.data.item_sampler.sample(self.n_neg, len(sample)),
                dtype=torch.int64,
                device=self.device,
            )
            self.optimizer.zero_grad()
            # Pass both negative item batches (neg_i_2 was passed twice before,
            # leaving neg_i_1 unused).
            loss = self.model.forward(pos_u, pos_i_1, pos_i_2,
                                      neg_u, neg_i_1, neg_i_2)
            loss.backward()
            self.optimizer.step()
            all_loss = all_loss + loss
            kl_loss = kl_loss + self.model.kl_loss
            batch_num = batch_ndx + 1  # enumerate is 0-based
        if self.device.type == "cuda":
            all_loss = all_loss.cpu()
            if kl_loss != 0:
                kl_loss = kl_loss.cpu()
        logs["loss"] = all_loss.item() / batch_num
        if self.show_result:
            data_i = np.random.randint(10)
            result = self.data.evaluate_vali(self.data.test[data_i], self.model)
            logs["ndcg@10_test"], logs["recall@10_test"] = (
                result["ndcg@10"],
                result["recall@10"],
            )
            result = self.data.evaluate_vali(self.data.validate[data_i], self.model)
            logs["ndcg@10_val"], logs["recall@10_val"] = (
                result["ndcg@10"],
                result["recall@10"],
            )
            if _best_ndcg < result["ndcg@10"]:
                _best_ndcg = result["ndcg@10"]
                self.best_model = copy.deepcopy(self.model.state_dict())
                torch.save(self.best_model, self.output_file_name)
        if kl_loss != 0:
            logs["kl_loss"] = kl_loss.item() / batch_num
            logs["loss"] = logs["loss"] - logs["kl_loss"]
        loss_list.append(logs["loss"])
        if i > 1:
            if abs(loss_list[i] - loss_list[i - 1]) < _loss_train_min:
                n_noprogress += 1
            else:
                n_noprogress = 0
        liveloss.update(logs)
        liveloss.draw()
        process_bar.set_description(
            "Loss: %0.8f, lr: %0.6f"
            % (logs["loss"], self.optimizer.param_groups[0]["lr"]))
        print("=== #no progress: ", n_noprogress)
        if n_noprogress >= max_noprogress:
            liveloss.draw()
            break
        # Halve the learning rate every 10 epochs (the factor is 0.5, not 10).
        lr = self.initial_lr * (0.5 ** (i // 10))
        for param_group in self.optimizer.param_groups:
            param_group["lr"] = lr
        if i >= self.iteration - 1:
            liveloss.draw()
def main():
    global best_test_bpd

    last_checkpoints = []
    lipschitz_constants = []
    ords = []

    # if args.resume:
    #     validate(args.begin_epoch - 1, model, ema)

    liveloss = PlotLosses()
    for epoch in range(args.begin_epoch, args.nepochs):
        logs = {}
        logger.info('Current LR {}'.format(optimizer.param_groups[0]['lr']))

        running_loss = train(epoch, model)
        lipschitz_constants.append(get_lipschitz_constants(model))
        ords.append(get_ords(model))
        logger.info('Lipsh: {}'.format(pretty_repr(lipschitz_constants[-1])))
        logger.info('Order: {}'.format(pretty_repr(ords[-1])))

        # normalize the running loss by the size of the training set
        epoch_loss = running_loss / len(
            datasets.CIFAR10(args.dataroot, train=True, transform=transform_train))
        logs['log loss'] = epoch_loss.item()
        liveloss.update(logs)
        liveloss.draw()

        if args.ema_val:
            test_bpd = validate(epoch, model, ema)
        else:
            test_bpd = validate(epoch, model)

        if args.scheduler and scheduler is not None:
            scheduler.step()

        if test_bpd < best_test_bpd:
            best_test_bpd = test_bpd
            utils.save_checkpoint(
                {
                    'state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'args': args,
                    'ema': ema,
                    'test_bpd': test_bpd,
                },
                os.path.join(args.save, 'moMoModels'),
                epoch,
                last_checkpoints,
                num_checkpoints=5)

        torch.save(
            {
                'state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'args': args,
                'ema': ema,
                'test_bpd': test_bpd,
            }, os.path.join(args.save, 'models', '010mmoosttMoosttRecentt.pth'))
def train_cross_validation(model_cls, dataset, dropout=0.0, lr=1e-3,
                           weight_decay=1e-2, num_epochs=200, n_splits=10,
                           use_gpu=True, dp=False, ddp=False, comment='',
                           tb_service_loc='192.168.192.57:6007', batch_size=1,
                           num_workers=0, pin_memory=False, cuda_device=None,
                           tb_dir='runs', model_save_dir='saved_models',
                           res_save_dir='res', fold_no=None,
                           saved_model_path=None, device_ids=None, patience=20,
                           seed=None, fold_seed=None, save_model=False,
                           is_reg=True, live_loss=True, domain_cls=True,
                           final_cls=True):
    """
    :param fold_seed: int, seed for the CV splitter
    :param live_loss: bool, plot losses with livelossplot
    :param is_reg: bool, add the model's regularization term to the loss
    :param save_model: bool
    :param seed: random seed
    :param patience: for early stopping
    :param device_ids: for dp/ddp
    :param saved_model_path: path to a state dict to warm-start from
    :param fold_no: int, train only this fold
    :param ddp: bool, DistributedDataParallel
    :param cuda_device: list of int
    :param pin_memory: bool, DataLoader arg
    :param num_workers: int, DataLoader arg
    :param model_cls: pytorch Module class
    :param dataset: dataset instance
    :param dropout: float
    :param lr: float
    :param weight_decay: float
    :param num_epochs: int
    :param n_splits: number of kFolds
    :param use_gpu: bool
    :param dp: bool, DataParallel
    :param comment: comment in the logs, to filter runs in tensorboard
    :param tb_service_loc: tensorboard service location
    :param batch_size: Dataset arg, not DataLoader
    :return:
    """
    saved_args = locals()
    seed = int(time.time() % 1e4 * 1e5) if seed is None else seed
    saved_args['random_seed'] = seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    if use_gpu:
        torch.cuda.manual_seed_all(seed)
        # torch.backends.cudnn.deterministic = True
        # torch.backends.cudnn.benchmark = False

    model_name = model_cls.__name__

    if not cuda_device:
        if device_ids and dp:
            device = device_ids[0]
        else:
            device = torch.device(
                'cuda' if torch.cuda.is_available() and use_gpu else 'cpu')
    else:
        device = cuda_device
    device_count = torch.cuda.device_count() if dp else 1
    device_count = len(device_ids) if (device_ids is not None and dp) else device_count
    batch_size = batch_size * device_count

    # TensorBoard
    log_dir_base = get_model_log_dir(comment, model_name)
    if tb_service_loc is not None:
        print("TensorBoard available at http://{1}/#scalars&regexInput={0}".format(
            log_dir_base, tb_service_loc))
    else:
        print("Please set up TensorBoard")

    # model
    criterion = nn.NLLLoss()

    print("Training {0} {1} models for cross validation...".format(
        n_splits, model_name))
    # Alternative splitters kept for reference:
    # 1. folds, fold = KFold(n_splits=n_splits, shuffle=False, random_state=seed), 0
    # 2. folds = GroupKFold(n_splits=n_splits)
    #    iter = folds.split(np.zeros(len(dataset)), groups=dataset.data.site_id)
    # 4. folds = StratifiedKFold(n_splits=n_splits, random_state=fold_seed,
    #                            shuffle=True if fold_seed else False)
    #    iter = folds.split(np.zeros(len(dataset)), dataset.data.y.numpy(),
    #                       groups=dataset.data.subject_id)
    # 5.
    fold = 0
    iter = multi_site_cv_split(dataset.data.y, dataset.data.site_id,
                               dataset.data.subject_id, n_splits,
                               random_state=fold_seed,
                               shuffle=True if fold_seed else False)

    for train_idx, val_idx in tqdm_notebook(iter, desc='CV', leave=False):
        fold += 1
        liveloss = PlotLosses() if live_loss else None

        # for a specific fold
        if fold_no is not None:
            if fold != fold_no:
                continue

        writer = SummaryWriter(log_dir=osp.join('runs', log_dir_base + str(fold)))
        model_save_dir = osp.join('saved_models', log_dir_base + str(fold))

        print("creating dataloader for fold {}".format(fold))
        train_dataset, val_dataset = norm_train_val(dataset, train_idx, val_idx)
        model = model_cls(writer)

        train_dataloader = DataLoader(train_dataset, shuffle=True,
                                      batch_size=batch_size,
                                      collate_fn=lambda data_list: data_list,
                                      num_workers=num_workers,
                                      pin_memory=pin_memory)
        val_dataloader = DataLoader(val_dataset, shuffle=False,
                                    batch_size=batch_size,
                                    collate_fn=lambda data_list: data_list,
                                    num_workers=num_workers,
                                    pin_memory=pin_memory)

        if fold == 1 or fold_no is not None:
            print(model)
            writer.add_text('model_summary', model.__repr__())
            writer.add_text('training_args', str(saved_args))

        optimizer = torch.optim.AdamW(model.parameters(), lr=lr,
                                      betas=(0.9, 0.999), eps=1e-08,
                                      weight_decay=weight_decay, amsgrad=False)
        # scheduler_reduce = torch.optim.lr_scheduler.ReduceLROnPlateau(
        #     optimizer, patience=5, factor=0.5)
        scheduler = GradualWarmupScheduler(optimizer, multiplier=10, total_epoch=5)
        # optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)

        if dp and use_gpu:
            model = model.cuda() if device_ids is None else model.to(device_ids[0])
            model = DataParallel(model, device_ids=device_ids)
        elif use_gpu:
            model = model.to(device)

        if saved_model_path is not None:
            model.load_state_dict(torch.load(saved_model_path))

        best_map, patience_counter, best_score = 0.0, 0, np.inf
        for epoch in tqdm_notebook(range(1, num_epochs + 1), desc='Epoch', leave=False):
            logs = {}
            # scheduler.step(epoch=epoch, metrics=best_score)

            for phase in ['train', 'validation']:
                if phase == 'train':
                    model.train()
                    dataloader = train_dataloader
                else:
                    model.eval()
                    dataloader = val_dataloader

                # Logging
                running_total_loss = 0.0
                running_corrects = 0
                running_reg_loss = 0.0
                running_nll_loss = 0.0
                epoch_yhat_0, epoch_yhat_1 = torch.tensor([]), torch.tensor([])
                epoch_label, epoch_predicted = torch.tensor([]), torch.tensor([])

                for data_list in tqdm_notebook(dataloader, desc=phase, leave=False):
                    # TODO: check devices
                    if dp:
                        data_list = to_cuda(
                            data_list,
                            (device_ids[0] if device_ids is not None else 'cuda'))

                    y_hat, domain_yhat, reg = model(data_list)

                    y = torch.tensor([], dtype=dataset.data.y.dtype, device=device)
                    domain_y = torch.tensor([], dtype=dataset.data.site_id.dtype,
                                            device=device)
                    for data in data_list:
                        y = torch.cat([y, data.y.view(-1).to(device)])
                        domain_y = torch.cat(
                            [domain_y, data.site_id.view(-1).to(device)])

                    loss = criterion(y_hat, y)
                    domain_loss = criterion(domain_yhat, domain_y)
                    # domain_loss = -1e-7 * domain_loss

                    if domain_cls:
                        total_loss = domain_loss
                        _, predicted = torch.max(domain_yhat, 1)
                        label = domain_y
                    if final_cls:
                        total_loss = loss
                        _, predicted = torch.max(y_hat, 1)
                        label = y
                    if domain_cls and final_cls:
                        total_loss = (loss + domain_loss).sum()
                        _, predicted = torch.max(y_hat, 1)
                        label = y
                    if is_reg:
                        total_loss += reg.sum()

                    if phase == 'train':
                        optimizer.zero_grad()
                        total_loss.backward()
                        nn.utils.clip_grad_norm_(model.parameters(), 2.0)
                        optimizer.step()

                    running_nll_loss += loss.item()
                    running_total_loss += total_loss.item()
                    running_reg_loss += reg.sum().item()
                    running_corrects += (predicted == label).sum().item()

                    epoch_yhat_0 = torch.cat(
                        [epoch_yhat_0, y_hat[:, 0].detach().view(-1).cpu()])
                    epoch_yhat_1 = torch.cat(
                        [epoch_yhat_1, y_hat[:, 1].detach().view(-1).cpu()])
                    epoch_label = torch.cat(
                        [epoch_label, label.detach().float().view(-1).cpu()])
                    epoch_predicted = torch.cat(
                        [epoch_predicted, predicted.detach().float().view(-1).cpu()])

                # precision = sklearn.metrics.precision_score(epoch_label, epoch_predicted, average='micro')
                # recall = sklearn.metrics.recall_score(epoch_label, epoch_predicted, average='micro')
                # f1_score = sklearn.metrics.f1_score(epoch_label, epoch_predicted, average='micro')
                accuracy = sklearn.metrics.accuracy_score(epoch_label, epoch_predicted)
                epoch_total_loss = running_total_loss / len(dataloader)
                epoch_nll_loss = running_nll_loss / len(dataloader)
                epoch_reg_loss = running_reg_loss / len(dataloader)

                writer.add_scalars(
                    'nll_loss', {'{}_nll_loss'.format(phase): epoch_nll_loss}, epoch)
                writer.add_scalars(
                    'accuracy', {'{}_accuracy'.format(phase): accuracy}, epoch)
                if epoch_reg_loss != 0:
                    writer.add_scalars(
                        'reg_loss', {'{}_reg_loss'.format(phase): epoch_reg_loss},
                        epoch)

                # Save Model & Early Stopping
                if phase == 'validation':
                    model_save_path = model_save_dir + '-{}-{}-{:.3f}-{:.3f}'.format(
                        model_name, epoch, accuracy, epoch_nll_loss)
                    # best score
                    if accuracy > best_map:
                        best_map = accuracy
                        model_save_path = model_save_path + '-best'

                    score = epoch_nll_loss
                    if score < best_score:
                        patience_counter = 0
                        best_score = score
                    else:
                        patience_counter += 1

                    if save_model:
                        for th, pfix in zip(
                                [0.8, 0.75, 0.7, 0.5, 0.0],
                                ['-perfect', '-great', '-good', '-bad', '-miss']):
                            if accuracy >= th:
                                model_save_path += pfix
                                break
                        torch.save(model.state_dict(), model_save_path)

                    writer.add_scalars(
                        'best_val_accuracy',
                        {'{}_accuracy'.format(phase): best_map}, epoch)
                    writer.add_scalars(
                        'best_nll_loss',
                        {'{}_nll_loss'.format(phase): best_score}, epoch)
                    writer.add_scalars('learning_rate', {
                        'learning_rate': scheduler.optimizer.param_groups[0]['lr']
                    }, epoch)

                    if patience_counter >= patience:
                        print("Stopped at epoch {}".format(epoch))
                        return

                if live_loss:
                    prefix = ''
                    if phase == 'validation':
                        prefix = 'val_'
                    logs[prefix + 'log loss'] = epoch_nll_loss
                    logs[prefix + 'accuracy'] = accuracy

            if live_loss:
                liveloss.update(logs)
                liveloss.draw()

    print("Done!")
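# `multi_site_cv_split` above is project-specific and its code is not shown. A
# rough stand-in with the same apparent intent (stratify on labels while keeping
# all of a subject's samples in one fold) could use scikit-learn's
# StratifiedGroupKFold; this toy-array sketch is an assumption about the
# helper's behavior, not its actual implementation.
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

y = np.array([0, 1, 0, 1, 0, 1, 0, 1])
subject_id = np.array([1, 1, 2, 2, 3, 3, 4, 4])
splitter = StratifiedGroupKFold(n_splits=2, shuffle=True, random_state=0)
for train_idx, val_idx in splitter.split(np.zeros(len(y)), y, groups=subject_id):
    print(train_idx, val_idx)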
def trainer(classifier, optimizer, scheduler, epochs, early_stop,
            train_dataloader, validation_dataloader, save_file,
            seed_val=0, accumulation_steps=1):
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        classifier = nn.DataParallel(classifier)
    classifier.to(device)

    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    embedder = AlbertModel.from_pretrained('albert-base-v2')
    embedder.to(device)

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    best = (np.inf, -1, -np.inf, None, None)
    liveloss = PlotLosses()
    LossHistory = []
    val_step = 0
    for epoch_i in range(0, epochs):
        logs = {}
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print("Global Learning Rate", optimizer.param_groups[0]["lr"])
        print('Training...')

        classifier.train()
        epoch_loss = 0.
        start = time.time()
        classifier.zero_grad()
        for step, batch in enumerate(train_dataloader):
            b_inputs_c = batch[0].to(device)
            b_inputs_r = batch[1].to(device)
            b_mask_c = batch[2].to(device)
            b_mask_r = batch[3].to(device)
            b_labels = batch[4].to(device)

            x_c = embedder(input_ids=b_inputs_c, attention_mask=b_mask_c)[0]
            x_r = embedder(input_ids=b_inputs_r, attention_mask=b_mask_r)[0]
            loss, logits = classifier(x_c.permute(1, 0, 2),
                                      x_r.permute(1, 0, 2), b_labels)
            if torch.cuda.device_count() > 1:
                loss = loss.sum()
            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(classifier.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                classifier.zero_grad()

            batch_loss = loss.cpu().item()
            epoch_loss += loss.cpu().item()
            if (step % 100) == 0:
                print("Step %i with loss %.3f elapsed time %.3f" %
                      (step, epoch_loss / (step + 1), time.time() - start))
                # writer.add_scalar("Loss/train", epoch_loss/(step+1), global_step)
                # writer.flush()

        print('Evaluating...')
        classifier.eval()
        dev_loss = 0.
        total_eval_accuracy = 0.
        y_preds = None
        y_true = None
        for batch in validation_dataloader:
            b_inputs_c = batch[0].to(device)
            b_inputs_r = batch[1].to(device)
            b_mask_c = batch[2].to(device)
            b_mask_r = batch[3].to(device)
            b_labels = batch[4].to(device)
            with torch.no_grad():
                x_c = embedder(input_ids=b_inputs_c, attention_mask=b_mask_c)[0]
                x_r = embedder(input_ids=b_inputs_r, attention_mask=b_mask_r)[0]
                loss, logits = classifier(x_c.permute(1, 0, 2),
                                          x_r.permute(1, 0, 2), b_labels)
                if torch.cuda.device_count() > 1:
                    loss = loss.sum()
            dev_loss += loss.cpu().item()

            label_ids = b_labels.cpu().numpy()
            logits = logits.detach().cpu().numpy()
            total_eval_accuracy += flat_accuracy(logits, label_ids)
            if y_preds is None:
                y_preds = np.argmax(logits, axis=1)
                y_true = label_ids
            else:
                y_preds = np.concatenate((y_preds, np.argmax(logits, axis=1)))
                y_true = np.concatenate((y_true, label_ids))

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        f1_score_1 = precision_recall_fscore_support(y_true, y_preds,
                                                     average="binary")
        f1_score_0 = precision_recall_fscore_support(y_true, y_preds,
                                                     average="binary", pos_label=0)
        print("Epoch %i with dev loss %f and dev accuracy %f" %
              (epoch_i + 1, dev_loss, avg_val_accuracy))

        logs["val_loss"] = dev_loss / len(validation_dataloader)
        logs["loss"] = epoch_loss / len(train_dataloader)
        logs["val_accuracy"] = avg_val_accuracy
        liveloss.update(logs)
        LossHistory.append(logs["loss"])
        liveloss.send()

        if (val_step - best[1] >= early_stop and best[0] < dev_loss):
            print("early_stopping, epoch:", epoch_i + 1)
            print("Final dev loss %f Final Train Loss %f Final dev accuracy %f" %
                  (dev_loss, epoch_loss, avg_val_accuracy))
            print("Best dev loss %f Best dev accuracy %f" % (best[0], best[2]))
            print("F1_score Sarcasm ", f1_score_1)
            print("F1_score Non-Sarcasm ", f1_score_0)
            return classifier
        elif (best[0] > dev_loss):
            best = (dev_loss, val_step, avg_val_accuracy, f1_score_1, f1_score_0)
            torch.save(classifier.state_dict(), save_file)
        val_step += 1
        classifier.train()

    print("Final dev loss %f Final Train Loss %f Final dev accuracy %f" %
          (dev_loss, epoch_loss, avg_val_accuracy))
    print("Best dev loss %f Best dev accuracy %f" % (best[0], best[2]))
    print("F1_score Sarcasm ", f1_score_1)
    print("F1_score Non-Sarcasm ", f1_score_0)
    return classifier
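# `flat_accuracy` is called above but not defined in this snippet. The usual
# helper from BERT fine-tuning tutorials (an assumption, not necessarily the
# author's exact code) compares argmax predictions against flattened labels:
import numpy as np

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)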
class LiveLossPlot(Callback):
    """Callback to write metrics to `LiveLossPlot <https://github.com/stared/livelossplot>`_,
    a library for visualisation in notebooks.

    Example: ::

        >>> import torch.nn
        >>> from torchbearer import Trial
        >>> from torchbearer.callbacks import LiveLossPlot

        # Example Trial which plots its metrics with LiveLossPlot
        >>> model = torch.nn.Linear(1, 1)
        >>> live_loss_plot = LiveLossPlot()
        >>> trial = Trial(model, callbacks=[live_loss_plot], metrics=['acc'])

    Args:
        on_batch (bool): If True, batch metrics will be logged. Else batch metrics will not be logged
        batch_step_size (int): The number of batches between logging metrics
        on_epoch (bool): If True, epoch metrics will be logged every epoch. Else epoch metrics will not be logged
        draw_once (bool): If True, draw the plot only at the end of training. Else draw every time metrics are logged
        kwargs: Keyword arguments for livelossplot.PlotLosses

    State Requirements:
        - :attr:`torchbearer.state.METRICS`: Metrics should be a dict containing the metrics to be plotted
        - :attr:`torchbearer.state.BATCH`: Batch should be the current batch or iteration number in the epoch
    """

    def __init__(self, on_batch=False, batch_step_size=10, on_epoch=True,
                 draw_once=False, **kwargs):
        super(LiveLossPlot, self).__init__()
        self._kwargs = kwargs

        self.on_batch = on_batch
        self.on_epoch = on_epoch
        self.draw_once = draw_once
        self.batch_step_size = batch_step_size

        if on_batch:
            self.on_step_training = self._on_step_training
        if on_epoch:
            self.on_end_epoch = self._on_end_epoch

    def on_start(self, state):
        from livelossplot import PlotLosses
        self.plt = PlotLosses(**self._kwargs)
        self.batch_plt = PlotLosses(**self._kwargs)

    def _on_step_training(self, state):
        self.batch_plt.update({
            k: get_metric('LiveLossPlot', state, k)
            for k in state[torchbearer.METRICS]
        })
        if state[torchbearer.BATCH] % self.batch_step_size == 0 and not self.draw_once:
            with no_print():
                self.batch_plt.draw()

    def _on_end_epoch(self, state):
        self.plt.update({
            k: get_metric('LiveLossPlot', state, k)
            for k in state[torchbearer.METRICS]
        })
        if not self.draw_once:
            with no_print():
                self.plt.draw()

    def on_end(self, state):
        if self.draw_once:
            with no_print():
                self.batch_plt.draw()
                self.plt.draw()
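# A runnable usage sketch for the callback above (an addition, not part of
# torchbearer itself): random tensors stand in for a dataset, and batch-level
# plotting is enabled every 5 steps. Names and hyperparameters are arbitrary.
import torch
import torch.nn as nn
from torchbearer import Trial
from torchbearer.callbacks import LiveLossPlot

model = nn.Linear(10, 2)
x, y = torch.randn(64, 10), torch.randint(0, 2, (64,))
trial = Trial(model,
              optimizer=torch.optim.Adam(model.parameters()),
              criterion=nn.CrossEntropyLoss(),
              metrics=['loss', 'acc'],
              callbacks=[LiveLossPlot(on_batch=True, batch_step_size=5)])
trial.with_train_data(x, y, batch_size=16).run(epochs=3)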
# plot
liveloss = PlotLosses()

# train loop
for ep in range(epoch):
    s_time = time.time()
    p_loss_v = 0
    print(f'start ep: {ep}')
    for it, (batch_x, batch_y) in enumerate(train_loader):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        predict = model(batch_x)
        p_loss = loss(predict, batch_y)
        p_loss_v = p_loss.item()
        p_loss.backward()
        optimizer.step()

        # plot
        if it % 50 == 0:
            liveloss.update({'loss': p_loss_v})
            liveloss.send()
    print(f'end ep: {ep} @ {time.time()-s_time:.3f}s')
    if (ep + 1) % 2 == 0:
        torch.save(model.state_dict(), f'save/ep_{ep+1}.pth')
def fit(self, interactions_df, users_df, items_df):
    """
    Training of the recommender.

    :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users
        and items defined by user_id, item_id and features of the interaction.
    :param pd.DataFrame users_df: DataFrame with users and their features defined by
        user_id and the user feature columns.
    :param pd.DataFrame items_df: DataFrame with items and their features defined by
        item_id and the item feature columns.
    """
    del users_df, items_df

    # Shift item ids and user ids so that they are consecutive
    unique_item_ids = interactions_df['item_id'].unique()
    self.item_id_mapping = dict(
        zip(unique_item_ids, list(range(len(unique_item_ids)))))
    self.item_id_reverse_mapping = dict(
        zip(list(range(len(unique_item_ids))), unique_item_ids))
    unique_user_ids = interactions_df['user_id'].unique()
    self.user_id_mapping = dict(
        zip(unique_user_ids, list(range(len(unique_user_ids)))))
    self.user_id_reverse_mapping = dict(
        zip(list(range(len(unique_user_ids))), unique_user_ids))

    interactions_df = interactions_df.copy()
    interactions_df.replace({'item_id': self.item_id_mapping,
                             'user_id': self.user_id_mapping}, inplace=True)

    # Get the number of items and users
    self.interactions_df = interactions_df
    n_users = np.max(interactions_df['user_id']) + 1
    n_items = np.max(interactions_df['item_id']) + 1

    # Get the user-item interaction matrix (mapping to int is necessary because of how iterrows works)
    r = np.zeros(shape=(n_users, n_items))
    for idx, interaction in interactions_df.iterrows():
        r[int(interaction['user_id'])][int(interaction['item_id'])] = 1
    self.r = r

    # Generate negative interactions
    negative_interactions = []
    i = 0
    while i < self.n_neg_per_pos * len(interactions_df):
        sample_size = 1000
        user_ids = self.rng.choice(np.arange(n_users), size=sample_size)
        item_ids = self.rng.choice(np.arange(n_items), size=sample_size)
        j = 0
        while j < sample_size and i < self.n_neg_per_pos * len(interactions_df):
            if r[user_ids[j]][item_ids[j]] == 0:
                negative_interactions.append([user_ids[j], item_ids[j], 0])
                i += 1
            j += 1
    interactions_df = pd.concat([
        interactions_df,
        pd.DataFrame(negative_interactions,
                     columns=['user_id', 'item_id', 'interacted'])
    ])

    # Initialize user and item embeddings as random vectors (from Gaussian distribution)
    self.user_repr = self.rng.normal(0, 1, size=(r.shape[0], self.embedding_dim))
    self.item_repr = self.rng.normal(0, 1, size=(r.shape[1], self.embedding_dim))

    # Initialize losses and loss visualization
    if self.print_type is not None and self.print_type == 'live':
        liveloss = PlotLosses()
    training_losses = deque(maxlen=50)
    training_avg_losses = []
    training_epoch_losses = []
    validation_losses = deque(maxlen=50)
    validation_avg_losses = []
    validation_epoch_losses = []
    last_training_total_loss = 0.0
    last_validation_total_loss = 0.0

    # Split the data
    interaction_ids = self.rng.permutation(len(interactions_df))
    train_validation_slice_idx = int(
        len(interactions_df) * (1 - self.validation_set_size))
    training_ids = interaction_ids[:train_validation_slice_idx]
    validation_ids = interaction_ids[train_validation_slice_idx:]

    # Train the model
    for epoch in range(self.n_epochs):
        if self.print_type is not None and self.print_type == 'live':
            logs = {}

        # Train
        training_losses.clear()
        training_total_loss = 0.0
        batch_idx = 0
        for idx in training_ids:
            user_id = int(interactions_df.iloc[idx]['user_id'])
            item_id = int(interactions_df.iloc[idx]['item_id'])

            e_ui = r[user_id, item_id] - np.dot(self.user_repr[user_id],
                                                self.item_repr[item_id])
            self.user_repr[user_id] = self.user_repr[user_id] \
                + self.lr * (e_ui * self.item_repr[item_id]
                             - self.reg_l * self.user_repr[user_id])
            self.item_repr[item_id] = self.item_repr[item_id] \
                + self.lr * (e_ui * self.user_repr[user_id]
                             - self.reg_l * self.item_repr[item_id])

            loss = e_ui**2
            training_total_loss += loss

            if self.print_type is not None and self.print_type == 'text':
                print("\rEpoch: {}\tBatch: {}\tLast epoch - avg training loss: {:.2f} "
                      "avg validation loss: {:.2f} loss: {}".format(
                          epoch, batch_idx, last_training_total_loss,
                          last_validation_total_loss, loss), end="")

            batch_idx += 1
            training_losses.append(loss)
            training_avg_losses.append(np.mean(training_losses))

        # Validate
        validation_losses.clear()
        validation_total_loss = 0.0
        for idx in validation_ids:
            user_id = int(interactions_df.iloc[idx]['user_id'])
            item_id = int(interactions_df.iloc[idx]['item_id'])
            e_ui = r[user_id, item_id] - np.dot(self.user_repr[user_id],
                                                self.item_repr[item_id])
            loss = e_ui**2
            validation_total_loss += loss
            validation_losses.append(loss)
            validation_avg_losses.append(np.mean(validation_losses))

        # Save and print epoch losses
        training_last_avg_loss = training_total_loss / len(training_ids)
        training_epoch_losses.append(training_last_avg_loss)
        validation_last_avg_loss = validation_total_loss / len(validation_ids)
        validation_epoch_losses.append(validation_last_avg_loss)

        if self.print_type is not None and self.print_type == 'live' and epoch >= 3:
            # A bound on epoch prevents showing extremely high losses in the first epochs
            logs['loss'] = training_last_avg_loss
            logs['val_loss'] = validation_last_avg_loss
            liveloss.update(logs)
            liveloss.send()

    # Find the most popular items for the cold start problem
    offers_count = interactions_df.loc[:, ['item_id', 'user_id']].groupby(
        by='item_id').count()
    offers_count = offers_count.sort_values('user_id', ascending=False)
    self.most_popular_items = offers_count.index
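# A hedged companion sketch (not part of the original class): once `fit` has
# learned `user_repr` and `item_repr`, relevance is the same dot product used
# in `e_ui` above, so top-k recommendation reduces to scoring all items. The
# function name and usage line below are illustrative assumptions.
import numpy as np

def top_k_items(user_vector, item_repr, k=10):
    """Return indices of the k items with the highest dot-product score."""
    scores = item_repr @ user_vector  # one score per item
    return np.argsort(-scores)[:k]

# e.g. top_k_items(recommender.user_repr[0], recommender.item_repr, k=10)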
def train(self, train_ds, valid_ds, plot_loss=True):
    # Initialize plotting
    if plot_loss:
        liveloss = PlotLosses()

    # Initialize DataLoaders
    tdl = DataLoader(train_ds, batch_size=self.batch_size, pin_memory=True)
    vdl = DataLoader(valid_ds, batch_size=self.batch_size, shuffle=False,
                     pin_memory=True)

    # Lists for losses
    train_losses, valid_losses = [], []
    # Lists for accuracies
    train_accs, valid_accs = [], []

    # Iterate over epochs
    for epoch in range(self.max_epochs):
        # Logs for livelossplot
        logs = {}

        batch_losses = []
        batch_count_goods = []
        # Iterate over batches
        for idx_batch, batch in enumerate(tdl):
            x = batch[0].to(DEVICE)
            y = batch[1].to(device=DEVICE, dtype=torch.long)
            pred = self.model(x)
            loss = self.loss_fn(pred, y)
            batch_losses.append(loss.item())
            # Accuracy
            with torch.no_grad():
                batch_count_goods.append(self.count_goods(pred, y))
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()

        # Save train loss and accuracy for the epoch
        train_losses.append(sum(batch_losses) / len(train_ds))
        train_accs.append(sum(batch_count_goods) / len(train_ds))

        # Compute and save validation loss and accuracy for the epoch
        with torch.no_grad():
            v_batch_losses, v_batch_count_goods = [], []
            for idx_batch, batch in enumerate(vdl):
                x = batch[0].to(DEVICE)
                y = batch[1].to(device=DEVICE, dtype=torch.long)
                pred = self.model(x)
                loss = self.loss_fn(pred, y)
                v_batch_losses.append(loss.item())
                v_batch_count_goods.append(self.count_goods(pred, y))
            valid_losses.append(sum(v_batch_losses) / len(valid_ds))
            valid_accs.append(sum(v_batch_count_goods) / len(valid_ds))

        if plot_loss:
            logs['log loss'] = train_losses[epoch]
            logs['val_log loss'] = valid_losses[epoch]
            logs['accuracy'] = train_accs[epoch]
            logs['val_accuracy'] = valid_accs[epoch]
            liveloss.update(logs)
            liveloss.draw()
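# `count_goods` is called above but not defined in this snippet. A plausible
# implementation (an assumption; as a method it would also take `self`) counts
# correct argmax predictions in a batch:
def count_goods(pred, y):
    """Number of samples whose predicted class matches the target."""
    return (pred.argmax(dim=1) == y).sum().item()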
def train(self):
    """Train the model."""
    # initial setup
    epoch = 1
    loss_val_best = 100
    num_epochs_increased = 0
    epoch_best = 1
    liveloss = PlotLosses()
    logs = {}

    # Perform training
    while True:
        # Run one iteration of SGD
        t0 = time.time()
        loss_train = self.train_epoch()
        loss_train_eval = self.compute_loss(self.loader_train_eval)
        loss_val = self.compute_loss(self.loader_val)
        time_epoch = time.time() - t0

        self.logger.add_entry({
            'loss_train': loss_train,
            'loss_train_eval': loss_train_eval,
            'loss_val': loss_val
        })

        # run learning rate scheduler
        if self.scheduler:
            self.scheduler.step(loss_val)

        # save logger info
        if self.save_dir:
            self.logger.append(os.path.join(self.save_dir, 'log.txt'))

        # change in loss_val
        d_loss_val = (loss_val - loss_val_best) / loss_val_best * 100

        # display results (all keys are set before update() so they all get plotted)
        logs['loss'] = loss_train_eval
        logs['val_loss'] = loss_val
        logs['percent improvement'] = (loss_val - loss_train_eval) / loss_train_eval * 100
        logs['val_percent improvement'] = d_loss_val
        liveloss.update(logs)
        liveloss.send()
        print('E: {:} / Train: {:.3e} / Valid: {:.3e} / Diff Valid: {:.2f}% / '
              'Diff Valid-Train: {:.1f}% / Time: {:.2f}'.format(
                  epoch, loss_train_eval, loss_val, d_loss_val,
                  (loss_val - loss_train_eval) / loss_train_eval * 100,
                  time_epoch))

        # if validation loss improves
        if d_loss_val < 0:
            num_epochs_increased = 0
            # record epoch and loss
            epoch_best = epoch
            loss_val_best = loss_val
            # save model weights
            if self.save_dir:
                print('Validation loss improved. Saving model.')
                torch.save(self.model.state_dict(),
                           os.path.join(self.save_dir, 'model.dat'))
        else:
            num_epochs_increased = num_epochs_increased + 1

        # stop training if we lose patience:
        if num_epochs_increased > self.patience:
            break

        # advance epoch counter
        epoch = epoch + 1
class train_wrapper():
    """
    Class that keeps a model, its optimiser and dataloaders together.
    Stores the train, validate and evaluate functions for training, as well
    as some other useful methods to carry out the training with a live plot
    and save the model.
    """
    def __init__(self, model, optimizer, train_loader, validate_loader,
                 criterion=nn.CrossEntropyLoss(), device="cpu", keep_best=0):
        "Stores the parameters on the class instance for later methods"
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.validate_loader = validate_loader
        self.criterion = criterion
        self.device = device
        self.keep_best = keep_best
        try:
            self.transform = validate_loader.dataset.transform
        except AttributeError:
            self.transform = None
            print("No transform found, test data must be normalised manually")
        # store the liveloss as it holds all our logs, useful for later
        self.liveloss = PlotLosses()
        # store the best model params
        self.best_params_dict = {}
        # store the current epoch between training batches
        self.epoch = 0
        # for keeping the best model params
        self.max_acc = 0.
        return

    def train(self):
        "Train a single epoch"
        # set the model to expect a backward pass
        self.model.train()
        train_loss, train_accuracy = 0, 0
        # for every training batch
        for X, y in self.train_loader:
            # put the samples on the device
            X, y = X.to(self.device), y.to(self.device)
            # zero the gradient
            self.optimizer.zero_grad()
            # find the model output with current parameters
            output = self.model(X)
            # calculate the loss against the expected output
            loss = self.criterion(output, y)
            # propagate the gradients through the network
            loss.backward()
            # store the loss (scaled by batch size for averaging)
            train_loss += loss * X.size(0)
            # find the predictions from this output
            y_pred = F.log_softmax(output, dim=1).max(1)[1]
            # compare to expected output to find the accuracy
            train_accuracy += accuracy_score(
                y.cpu().numpy(), y_pred.detach().cpu().numpy()) * X.size(0)
            # improve the parameters
            self.optimizer.step()
        # return the mean loss and accuracy of this epoch
        N_samp = len(self.train_loader.dataset)
        return train_loss / N_samp, train_accuracy / N_samp

    def validate(self):
        """
        Find the loss and accuracy of the current model parameters
        on the validation data set
        """
        # if no validation set present return zeros
        if self.validate_loader is None:
            return torch.tensor(0.), torch.tensor(0.)
        # set the model to not expect a backward pass
        self.model.eval()
        validation_loss, validation_accuracy = 0., 0.
        # for every validation batch
        for X, y in self.validate_loader:
            # tell torch not to store gradients
            with torch.no_grad():
                # put the samples on the device
                X, y = X.to(self.device), y.to(self.device)
                # find the model output with current parameters
                output = self.model(X)
                # calculate the loss against the expected output
                loss = self.criterion(output, y)
                # store the loss (scaled by batch size for averaging)
                validation_loss += loss * X.size(0)
                # find the predictions from this output
                y_pred = F.log_softmax(output, dim=1).max(1)[1]
                # compare to expected output to find the accuracy
                validation_accuracy += accuracy_score(
                    y.cpu().numpy(), y_pred.cpu().numpy()) * X.size(0)
        # return the mean loss and accuracy of this epoch
        N_samp = len(self.validate_loader.dataset)
        return validation_loss / N_samp, validation_accuracy / N_samp

    def evaluate(self, test_data, prob_output=True):
        """
        Find the prediction of the current model parameters with the
        test data set and return the predicted labels
        """
        # set the model to not expect a backward pass
        self.model.eval()
        y_preds = []
        # for every test batch
        for X in test_data:
            # normalise the test data with the validation transformation
            if self.transform:
                X = self.transform(X)
            # tell torch not to store gradients
            with torch.no_grad():
                # put the samples on the device
                X = X.to(self.device)
                # find the model output with current parameters
                output = self.model(X.view(-1, 1, 28, 28))
                # find the predictions from this output
                y_pred = F.log_softmax(output, dim=1)
                if not prob_output:
                    y_pred = y_pred.max(1)[1]
                # store the predicted outcomes
                y_preds.append(y_pred.cpu().numpy())
        # return the list of predictions
        return np.concatenate(y_preds, 0)

    def train_model(self, epochs):
        """
        Do a live plot of the training accuracy and loss as the model is trained
        """
        for _ in range(epochs):
            logs = {}
            train_loss, train_accuracy = self.train()
            logs['log loss'] = train_loss.item()
            logs['accuracy'] = train_accuracy.item()
            validation_loss, validation_accuracy = self.validate()
            logs['val_log loss'] = validation_loss.item()
            logs['val_accuracy'] = validation_accuracy.item()
            # if we are past the keep_best warm-up epoch, keep the best params
            if self.keep_best:
                if train_accuracy.item() > self.max_acc and self.epoch > self.keep_best:
                    self.max_acc = train_accuracy.item()
                    self.best_params_dict = self.model.state_dict()
            self.liveloss.update(logs)
            self.liveloss.draw()
            self.epoch += 1
        print("Training Finished")
        return

    def save_model(self, name, path=F"/content/gdrive/My Drive/models/"):
        """
        Pickle either the whole model or its parameter dictionary
        via torch's save methods
        """
        save_dict = {"model": self.model, "transform": self.transform,
                     "Liveloss": self.liveloss}
        torch.save(save_dict, path + name)
        print("saved to " + path + name)

    def num_model_params(self):
        n_params = sum([t.cpu().detach().numpy().size
                        for t in self.model.parameters()])
        print("Number of model Parameters: ", n_params)
        return n_params

    def max_acc_epoch(self):
        max_acc = self.liveloss.metrics_extrema['val_accuracy']['max']
        for log in self.liveloss.logs:
            if log["val_accuracy"] == max_acc:
                return log["_i"]

    def confusion_matrix(self):
        y_preds, ys = [], []
        # same code as validate
        self.model.eval()
        for X, y in self.validate_loader:
            with torch.no_grad():
                X, y = X.to(self.device), y.to(self.device)
                output = self.model(X)
                y_pred = F.log_softmax(output, dim=1)
                y_pred = y_pred.max(1)[1]
                y_preds.append(y_pred.cpu().numpy())
                ys.append(y.cpu().numpy())
        y_preds = np.array(y_preds).flatten()
        ys = np.array(ys).flatten()
        return ConfusionMatrix(actual_vector=ys, predict_vector=y_preds)
def trainer(cfg, train_id=None, num_workers=15, device=None):
    device = device or 'cuda:0'
    train_id = train_id or cfg['train_id']
    use_pretrained_vgg = cfg["use_pretrained_vgg"]
    batch_size = cfg["batch_size"]
    lr = cfg["lr"]
    num_epochs = cfg["num_epochs"]

    model = ternausnet.models.UNet11(pretrained=use_pretrained_vgg)
    if cfg.get('first_freeze_layers', None) is not None:
        for i in range(cfg['first_freeze_layers']):
            for param in model.encoder[i].parameters():
                param.requires_grad = False
    if cfg['pretrained_model'] is not None:
        model.load_state_dict(torch.load(cfg['pretrained_model']))
    model = model.to(device)

    loss = nn.BCEWithLogitsLoss()
    # a lambda cannot contain `return`; filter trainable params directly
    optimizer = Adam(filter(lambda x: x.requires_grad, model.parameters()), lr)

    d_train = WaterDataset(cfg['train_img_list'], train_transform)
    d_val = WaterDataset(cfg['test_img_list'], test_transform)
    print(d_val[0][0].shape)
    dl_train = DataLoader(d_train, batch_size, shuffle=True, num_workers=num_workers)
    dl_val = DataLoader(d_val, batch_size, shuffle=False, num_workers=num_workers)

    metrics = {
        'val_acc': AccuracyMetric(0.5),
        'train_acc': AccuracyMetric(0.5),
        'val_loss': LossMetric(),
        'train_loss': LossMetric(),
        'train_lake_acc': LakeAccuracyMetric(0.5),
        'val_lake_acc': LakeAccuracyMetric(0.5),
        'train_nolake_acc': NoLakeAccuracyMetric(0.5),
        'val_nolake_acc': NoLakeAccuracyMetric(0.5),
        'val_miou': MIOUMetric(0.5),
        'train_miou': MIOUMetric(0.5),
        'val_f1': F1Metric(0.5),
        'train_f1': F1Metric(0.5)
    }
    groups = {
        'accuracy': ['train_acc', 'val_acc'],
        'bce-loss': ['train_loss', 'val_loss'],
        'lake-acc': ['train_lake_acc', 'val_lake_acc'],
        'nolake_acc': ['train_nolake_acc', 'val_nolake_acc'],
        'miou': ['train_miou', 'val_miou'],
        'f1': ['train_f1', 'val_f1']
    }
    plotlosses = PlotLosses(groups=groups)
    topk_val_losses = {}

    for epoch in range(num_epochs):
        print('train step')
        for name, metric in metrics.items():
            metric.reset()
        model.train()
        for idx, (im, gt) in enumerate(dl_train):
            im = im.to(device)
            gt = gt.to(device)
            optimizer.zero_grad()
            pred = model(im)
            L = loss(pred, gt)
            L.backward()
            assert pred.shape == gt.shape
            metrics['train_acc'].append(pred, gt)
            metrics['train_lake_acc'].append(pred, gt)
            metrics['train_nolake_acc'].append(pred, gt)
            metrics['train_miou'].append(pred, gt)
            metrics['train_f1'].append(pred, gt)
            metrics['train_loss'].append(L)
            optimizer.step()
        torch.cuda.empty_cache()

        model.eval()
        print('eval step')
        with torch.no_grad():
            for idx, (im, gt) in enumerate(dl_val):
                im = im.to(device)
                gt = gt.to(device)
                pred = model(im)
                L = loss(pred, gt)
                metrics['val_acc'].append(pred, gt)
                metrics['val_lake_acc'].append(pred, gt)
                metrics['val_nolake_acc'].append(pred, gt)
                metrics['val_miou'].append(pred, gt)
                metrics['val_f1'].append(pred, gt)
                metrics['val_loss'].append(L)
        torch.cuda.empty_cache()

        results = {key: metrics[key].result() for key in metrics}
        plotlosses.update(results)
        plotlosses.send()

        for name, metric in metrics.items():
            metric.history()
        history = {key: metrics[key].hist for key in metrics}
        save_models(model, topk_val_losses, metrics['val_loss'].result(),
                    epoch, train_id, save_num_models=3)
        torch.save(model.state_dict(), 'model-latest.pth')
        with open(f'history-{train_id}.json', "w") as write_file:
            json.dump(history, write_file, indent=4)
class Trainer(object):

    def __init__(self, model=None, data_loader=None, train_times=1000,
                 lr=1e-3, alpha=0.5, use_gpu=True, opt_method="sgd",
                 save_steps=None, checkpoint_dir=None):
        self.work_threads = 8
        self.train_times = train_times

        self.opt_method = opt_method
        self.optimizer = None
        self.lr_decay = 0
        self.weight_decay = 0
        self.alpha = alpha
        self.lr = lr

        self.model = model
        self.data_loader = data_loader
        self.use_gpu = use_gpu
        self.save_steps = save_steps
        self.checkpoint_dir = checkpoint_dir
        self.liveplot = PlotLosses()

    def train_one_step(self, data, stage=1):
        self.optimizer.zero_grad()
        self.model.zero_grad()
        loss = self.model({
            'batch_h': self.to_var(data['batch_h'], self.use_gpu),
            'batch_t': self.to_var(data['batch_t'], self.use_gpu),
            'batch_r': self.to_var(data['batch_r'], self.use_gpu),
            'batch_y': self.to_var(data['batch_y'], self.use_gpu),
            'mode': data['mode'],
            'stage': stage
        })
        loss.backward()
        nn.utils.clip_grad_norm_(self.model.parameters(), 2)
        self.optimizer.step()
        return loss.item()

    def run(self, lr=None, alpha=None, weight_decay=None, train_times=None,
            stage=1, multiplier=1):
        if lr:
            self.lr = lr
        if alpha:
            self.alpha = alpha
        if weight_decay:
            self.weight_decay = weight_decay
        if train_times:
            self.train_times = train_times

        if self.use_gpu:
            self.model.cuda()

        if self.optimizer is not None:
            pass
        elif self.opt_method == "Adagrad" or self.opt_method == "adagrad":
            self.optimizer = optim.Adagrad(
                self.model.parameters(),
                lr=self.lr,
                lr_decay=self.lr_decay,
                weight_decay=self.weight_decay,
            )
        elif self.opt_method == "Adadelta" or self.opt_method == "adadelta":
            self.optimizer = optim.Adadelta(
                self.model.parameters(),
                lr=self.lr,
                weight_decay=self.weight_decay,
            )
        elif self.opt_method == "Adam" or self.opt_method == "adam":
            self.optimizer = optim.Adam(
                self.model.parameters(),
                lr=self.lr,
                weight_decay=self.weight_decay,
            )
        elif self.opt_method == "ranger":
            if not lr:
                self.optimizer = Ranger(self.model.parameters(), lr=self.lr,
                                        alpha=self.alpha)
            else:
                self.optimizer = Ranger(self.model.parameters(), lr=lr,
                                        alpha=self.alpha)
        elif self.opt_method == "rangerva":
            self.optimizer = RangerVA(self.model.parameters(), lr=lr)
        else:
            self.optimizer = optim.SGD(
                self.model.parameters(),
                lr=self.alpha,
                weight_decay=self.weight_decay,
            )
        print("Finish initializing...")

        # training_range = tqdm.tqdm(range(self.train_times))
        training_range = tqdm.trange(self.train_times)
        for epoch in training_range:
            res = 0.0
            for data in self.data_loader:
                loss = multiplier * self.train_one_step(data, stage)
                res += loss
            self.liveplot.update({'loss': res})
            self.liveplot.send()
            if self.save_steps and self.checkpoint_dir and (epoch + 1) % self.save_steps == 0:
                print("Epoch %d has finished, saving..." % (epoch))
                self.model.save_checkpoint(
                    os.path.join(self.checkpoint_dir + "-" + str(epoch) + ".ckpt"))

    def set_model(self, model):
        self.model = model

    def to_var(self, x, use_gpu):
        if use_gpu:
            return Variable(torch.from_numpy(x).cuda())
        else:
            return Variable(torch.from_numpy(x))

    def set_use_gpu(self, use_gpu):
        self.use_gpu = use_gpu

    def set_alpha(self, alpha):
        self.alpha = alpha

    def set_lr_decay(self, lr_decay):
        self.lr_decay = lr_decay

    def set_weight_decay(self, weight_decay):
        self.weight_decay = weight_decay

    def set_opt_method(self, opt_method):
        self.opt_method = opt_method

    def set_train_times(self, train_times):
        self.train_times = train_times

    def set_save_steps(self, save_steps, checkpoint_dir=None):
        self.save_steps = save_steps
        if not self.checkpoint_dir:
            self.set_checkpoint_dir(checkpoint_dir)

    def set_checkpoint_dir(self, checkpoint_dir):
        self.checkpoint_dir = checkpoint_dir
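# Note: `torch.autograd.Variable` has been a no-op wrapper since PyTorch 0.4,
# so `to_var` above can be simplified. A modern equivalent (a suggestion, not
# part of the original Trainer):
import torch

def to_var(x, use_gpu):
    t = torch.from_numpy(x)
    return t.cuda() if use_gpu else t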
def train_eval_loop(
        model: Module,
        train_dataset: Dataset,
        val_dataset: Dataset,
        lr: float = 1e-4,
        epoch_n: int = 10,
        batch_size: int = 32,
        device=None,
        early_stopping_patience: int = 10,
        l2_reg_alpha: float = 0,
        max_batches_per_epoch_train: int = 10000,
        max_batches_per_epoch_val: int = 1000,
        optimizer_ctor: Optimizer = None,
        lr_scheduler_ctor=None,
        shuffle_train=True,
        dataloader_workers_n: int = 0,
        verbose_batch: bool = False,
        verbose_liveloss=True,
        prev_loss: Dict[str, List[float]] = None
) -> Tuple[float, Module, Dict[str, List[float]]]:
    """
    Training loop for the model. After each epoch, the model's quality is
    evaluated on the held-out set.

    :param prev_loss: losses from a previous training run
    :param verbose_batch: print per-batch timings
    :param model: torch.nn.Module - the model to train
    :param train_dataset: torch.utils.data.Dataset - training data
    :param val_dataset: torch.utils.data.Dataset - data for quality evaluation
    :param lr: learning rate
    :param epoch_n: maximum number of epochs
    :param batch_size: number of samples processed by the model per iteration
    :param device: cuda/cpu - the device on which to run the computation
    :param early_stopping_patience: the maximum number of epochs without model
        improvement allowed before training is stopped
    :param l2_reg_alpha: L2 regularization coefficient
    :param max_batches_per_epoch_train: maximum number of iterations per training epoch
    :param max_batches_per_epoch_val: maximum number of iterations per validation epoch
    :param optimizer_ctor
    :param lr_scheduler_ctor
    :param shuffle_train
    :param dataloader_workers_n
    :return: tuple of three elements:
        - mean loss on validation at the best epoch
        - the best model
        - the loss history
    """
    if prev_loss is None:  # avoid a mutable default argument
        prev_loss = {}
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    model.to(device)

    if optimizer_ctor is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                     weight_decay=l2_reg_alpha)
    else:
        optimizer = optimizer_ctor(model.parameters())

    if lr_scheduler_ctor is not None:
        lr_scheduler = lr_scheduler_ctor(optimizer)
    else:
        lr_scheduler = None

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                                  shuffle=shuffle_train,
                                  num_workers=dataloader_workers_n)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size,
                                shuffle=False,
                                num_workers=dataloader_workers_n)

    best_val_loss = float('inf')
    best_epoch_i = 0
    best_model = copy.deepcopy(model)
    losses = {
        'train_loss': prev_loss.get('train_loss', []),
        'valid_loss': prev_loss.get('valid_loss', [])
    }
    if verbose_liveloss:
        liveloss = PlotLosses()

    for epoch_i in range(epoch_n):
        try:
            epoch_start = datetime.datetime.now()
            print('Epoch {}'.format(epoch_i))

            model.train()
            mean_train_loss = 0
            train_batches_n = 0
            for batch_i, (batch_x, batch_y) in enumerate(train_dataloader):
                start_batch = time.time()
                if batch_i > max_batches_per_epoch_train:
                    break
                mask = (batch_x[:, :, 1] != 0)
                batch_x = copy_data_to_device(batch_x, device)
                batch_y = copy_data_to_device(batch_y, device)
                mask = copy_data_to_device(mask, device)

                pred = model(batch_x)
                loss = -model.crf(pred.permute(0, 2, 1), batch_y, mask) / batch_size
                # loss = criterion(pred, batch_y)

                model.zero_grad()
                loss.backward()
                optimizer.step()

                mean_train_loss += float(loss)
                train_batches_n += 1
                if verbose_batch:
                    print(f"Batch {batch_i} done in "
                          f"{time.time() - start_batch:.2f} seconds")

            mean_train_loss /= train_batches_n
            print('Epoch: {} iterations, {:0.2f} sec'.format(
                train_batches_n,
                (datetime.datetime.now() - epoch_start).total_seconds()))
            print('Mean training loss', mean_train_loss)
            losses['train_loss'].append(mean_train_loss)

            model.eval()
            mean_val_loss = 0
            val_batches_n = 0
            with torch.no_grad():
                for batch_i, (batch_x, batch_y) in enumerate(val_dataloader):
                    if batch_i > max_batches_per_epoch_val:
                        break
                    mask = (batch_x[:, :, 1] != 0)
                    batch_x = copy_data_to_device(batch_x, device)
                    batch_y = copy_data_to_device(batch_y, device)
                    mask = copy_data_to_device(mask, device)

                    pred = model(batch_x)
                    loss = -model.crf(pred.permute(0, 2, 1), batch_y, mask) / batch_size
                    mean_val_loss += float(loss)
                    val_batches_n += 1

            mean_val_loss /= val_batches_n
            print('Mean validation loss', mean_val_loss)
            losses['valid_loss'].append(mean_val_loss)
            logs = {'log loss': mean_train_loss, 'val_log loss': mean_val_loss}

            if mean_val_loss < best_val_loss:
                best_epoch_i = epoch_i
                best_val_loss = mean_val_loss
                best_model = copy.deepcopy(model)
                print('New best model!')
            elif epoch_i - best_epoch_i > early_stopping_patience:
                print('The model has not improved over the last {} epochs, '
                      'stopping training'.format(early_stopping_patience))
                break

            if lr_scheduler is not None:
                lr_scheduler.step(mean_val_loss)

            print()
        except KeyboardInterrupt:
            print('Stopped early by the user')
            break
        except Exception as ex:
            print('Error during training: {}\n{}'.format(
                ex, traceback.format_exc()))
            break

        if verbose_liveloss:
            liveloss.update(logs)
            liveloss.send()

    return best_val_loss, best_model, losses
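# A self-contained sketch of the CRF loss used in the loop above, assuming
# `model.crf` is a `torchcrf.CRF(num_tags, batch_first=True)` (the model
# definition is not shown here, so this is an assumption). The CRF returns a
# log likelihood, hence the negation to get a loss; all shapes are illustrative.
import torch
from torchcrf import CRF

num_tags, batch, seq_len = 5, 4, 7
crf = CRF(num_tags, batch_first=True)
emissions = torch.randn(batch, seq_len, num_tags)   # (batch, seq, tags)
tags = torch.randint(0, num_tags, (batch, seq_len))
mask = torch.ones(batch, seq_len, dtype=torch.uint8)
loss = -crf(emissions, tags, mask=mask) / batch     # mean NLL per sequence
loss.backward()
best_paths = crf.decode(emissions, mask=mask)       # best tag sequences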
class Logger():
    def __init__(self, n_epochs, batches_epoch, out_dir, start_epoch=1):
        # self.viz = Visdom()
        self.n_epochs = n_epochs
        self.batches_epoch = batches_epoch
        self.epoch = start_epoch
        self.batch = 1
        self.prev_time = time.time()
        self.mean_period = 0
        self.losses = {}
        self.loss_windows = {}
        self.image_windows = {}
        self.out_dir = out_dir
        self.to_image = transforms.ToPILImage()
        self.liveloss = PlotLosses()

    def log(self, losses=None, images=None):
        self.mean_period += (time.time() - self.prev_time)
        self.prev_time = time.time()

        sys.stdout.write(
            '\rEpoch %03d/%03d [%04d/%04d] -- ' %
            (self.epoch, self.n_epochs, self.batch, self.batches_epoch))

        plots = {}
        for i, loss_name in enumerate(losses.keys()):
            if loss_name not in self.losses:
                self.losses[loss_name] = losses[loss_name].data
            else:
                self.losses[loss_name] += losses[loss_name].data

            if (i + 1) == len(losses.keys()):
                sys.stdout.write('%s: %.4f -- ' %
                                 (loss_name, self.losses[loss_name] / self.batch))
            else:
                sys.stdout.write('%s: %.4f | ' %
                                 (loss_name, self.losses[loss_name] / self.batch))

        batches_done = self.batches_epoch * (self.epoch - 1) + self.batch
        batches_left = self.batches_epoch * (self.n_epochs - self.epoch) \
            + self.batches_epoch - self.batch
        sys.stdout.write('ETA: %s' % (datetime.timedelta(
            seconds=batches_left * self.mean_period / batches_done)))

        if self.batch % 10 == 0:
            # Save images
            plt.ioff()
            fig = plt.figure(figsize=(100, 50))
            for i, (image_name, tensor) in enumerate(images.items()):
                ax = plt.subplot(1, len(images), i + 1)
                ax.imshow(self.to_image(tensor.cpu().data[0]))
            fig.savefig(self.out_dir + '/%d_%d.png' % (self.epoch, self.batch))
            plt.close(fig)

        # End of epoch
        if (self.batch % self.batches_epoch) == 0:
            # Plot losses (visdom line plots replaced by livelossplot)
            for i, (loss_name, loss) in enumerate(self.losses.items()):
                plots[loss_name] = self.losses[loss_name] / self.batch
                # Reset losses for next epoch
                self.losses[loss_name] = 0.0
            self.liveloss.update(plots)
            self.liveloss.send()
            self.epoch += 1
            self.batch = 1
            sys.stdout.write('\n')
        else:
            self.batch += 1
def train(model, criterion, optimizer, train_dl, test_dl, num_epochs=40):
    liveloss = PlotLosses()
    for epoch in range(num_epochs):
        train_loss, valid_loss = [], []
        logs = {}
        prefix = ''

        # Training Part
        model.train()
        for i, data in enumerate(train_dl, 0):
            # Get the inputs
            inputs = labels = data
            inputs = inputs.cuda()
            labels = labels.cuda()
            inputs = inputs.float()
            labels = labels.float()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            outputs = outputs.cuda()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            ## -> Dense Output Re-feeding <- ##
            # Zero the gradients
            optimizer.zero_grad()
            # Important: detach() the output to avoid building a second
            # computation graph through the first forward pass
            outputs = model(outputs.detach())
            outputs = outputs.cuda()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())
            logs[prefix + 'MMSE loss'] = loss.item()

        for i, data in enumerate(test_dl, 0):
            model.eval()
            inputs = labels = data
            inputs = inputs.cuda()
            labels = labels.cuda()
            inputs = inputs.float()
            labels = labels.float()
            outputs = model(inputs)
            outputs = outputs.cuda()
            loss = criterion(outputs, labels)
            valid_loss.append(loss.item())
            prefix = 'val_'
            logs[prefix + 'MMSE loss'] = loss.item()

        print()
        liveloss.update(logs)
        liveloss.draw()
        print("Epoch:", epoch + 1,
              " Training Loss: ", np.mean(train_loss),
              " Valid Loss: ", np.mean(valid_loss))
def train_model(model, dataloaders, dataset_sizes, criterion, optimizer,
                scheduler, num_epochs=25):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    since = time.time()
    liveloss = PlotLosses()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for i, (inputs, labels) in enumerate(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

                print("\rIteration: {}/{}, Loss: {}.".format(
                    i + 1, len(dataloaders[phase]),
                    loss.item() * inputs.size(0)), end="")
                sys.stdout.flush()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            if phase == 'train':
                avg_loss = epoch_loss
                t_acc = epoch_acc
            else:
                val_loss = epoch_loss
                val_acc = epoch_acc

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        liveloss.update({
            'log loss': avg_loss,
            'val_log loss': val_loss,
            'accuracy': t_acc,
            'val_accuracy': val_acc
        })
        liveloss.draw()

        print('Train Loss: {:.4f} Acc: {:.4f}'.format(avg_loss, t_acc))
        print('Val Loss: {:.4f} Acc: {:.4f}'.format(val_loss, val_acc))
        print('Best Val Accuracy: {}'.format(best_acc))
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
def vs_net_train(args):
    train_path = args.train_h5
    val_path = args.val_h5
    NEPOCHS = args.epoch
    CASCADE = args.cascade
    LR = args.lr
    NBATCH = args.nb
    Res_name = args.Result_name
    device_num = args.device
    chpoint = args.checkpoint
    aug = args.aug
    zpad = args.zpad

    device = 'cuda:' + str(device_num)

    if zpad is False:
        print("input is from LORAKS")
        trainset = D.MAGIC_Dataset_LORAKS(train_path, augmentation=aug, verbosity=False)
        testset = D.MAGIC_Dataset_LORAKS(val_path, augmentation=False, verbosity=False)
    elif zpad is True:
        print("input is from Zero-Padding")
        trainset = D.MAGIC_Dataset_zpad(train_path, augmentation=aug, verbosity=False)
        testset = D.MAGIC_Dataset_zpad(val_path, augmentation=False, verbosity=False)

    trainloader = DataLoader(trainset, batch_size=NBATCH, shuffle=True,
                             pin_memory=True, num_workers=0)
    valloader = DataLoader(testset, batch_size=NBATCH, shuffle=False,
                           pin_memory=True, num_workers=0)
    dataloaders = {'train': trainloader, 'validation': valloader}

    net = network(alfa=None, beta=0.5, cascades=CASCADE)
    net = net.to(device)
    if chpoint is not None:
        print('Loading network from:', chpoint)
        net.load_state_dict(torch.load(chpoint))

    ########## Training ####################
    _im0, _true, _Sens, _X_kJVC, _mask = testset[13]
    _im0, _true, _Sens, _X_kJVC, _mask = \
        _im0.unsqueeze(0).to(device), _true.unsqueeze(0).to(device), \
        _Sens.unsqueeze(0).to(device), _X_kJVC.unsqueeze(0).to(device), \
        _mask.unsqueeze(0).to(device)

    criterion = torch.nn.L1Loss()
    liveloss = PlotLosses()
    optimizer = torch.optim.Adam(net.parameters(), lr=LR)

    for epoch in range(NEPOCHS):
        print('Epoch', epoch + 1)
        logs = {}
        # iterate phases in a fixed order (a set gives no ordering guarantee,
        # and logs['Loss'] must exist before it is read below)
        for phase in ['train', 'validation']:
            if phase == 'train':
                kbar = pkbar.Kbar(target=len(trainloader), width=2)
                net.train()
            else:
                net.eval()

            running_loss = 0.0
            running_mse = 0.0
            iii = 0
            for im0, true, tSens, tX_kJVC, tmask in dataloaders[phase]:
                im0, true, tX_kJVC, tSens, tmask = \
                    im0.to(device, non_blocking=True), \
                    true.to(device, non_blocking=True), \
                    tX_kJVC.to(device, non_blocking=True), \
                    tSens.to(device, non_blocking=True), \
                    tmask.to(device, non_blocking=True)

                if phase == 'train':
                    out = net(im0, tX_kJVC, tmask, tSens)
                    loss = criterion(out, true)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    running_loss = running_loss + loss.item() * im0.size(0)
                    prefix = ''
                    kbar.update(iii, values=[('L', 100 * running_loss / (iii + 1))])
                    iii = iii + 1
                else:
                    with torch.no_grad():
                        prefix = 'val_'
                        out = net(im0, tX_kJVC, tmask, tSens)
                        loss = criterion(out, true)
                        running_loss = running_loss + loss.item() * im0.size(0)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            logs[prefix + 'Loss'] = epoch_loss * 100

        if epoch % 10 == 0:
            save_name = 'Result_' + Res_name + '/Val_Epoch_' + str(epoch) + '.jpg'
            show_output(net, _im0, _true, _X_kJVC, _Sens, _mask, save_name)
            file_name = 'models/' + Res_name + '/Weights_Epoch_' + str(epoch)
            print(' SAVING WEIGHTS : ' + file_name)
            torch.save(net.state_dict(), file_name)
            # Saving the PlotLosses object to pickle
            f = open("models/" + Res_name + "/Losses_graph.obj", "wb")
            pickle.dump(liveloss, f)
            f.close()

        liveloss.update(logs)
        f = open("Loss_Logging.txt", "a")
        kbar.add(1, values=[('Train', logs['Loss']), ('Val', logs['val_Loss'])])
        f.write("Epoch{} : Training Loss : {:.5f} & Validation Loss: {:.5f}\n".format(
            epoch, logs['Loss'], logs['val_Loss']))
        f.close()
liveloss = PlotLosses()
for i in range(100000):
    done = False
    score = 0
    obs = env.reset()
    agent.noise.reset()
    while not done:
        # env.render()
        act = agent.choose_action(obs)
        new_state, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        obs = new_state
    score_history.append(score)
    # livelossplot expects scalar metric values per step,
    # not the whole history list
    metrics = {
        "score": score,
        "avg_score_100": np.mean(score_history[-100:]),
    }
    print(
        "episode", i,
        "score %.2f" % score,
        "100 game average %.2f" % np.mean(score_history[-100:]),
    )
    liveloss.update(metrics)
    liveloss.send()
env.close()
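The loop above assumes an actor-critic style agent with a replay buffer and exploration noise. A sketch of the interface it relies on; the method names are taken from the loop itself, but the class below is a placeholder, not the source's implementation.

class Agent:
    """Interface assumed by the training loop above (placeholder bodies)."""

    noise = None  # exploration-noise object with a reset() method

    def choose_action(self, obs):
        """Return an action for the current observation."""
        raise NotImplementedError

    def remember(self, obs, act, reward, new_state, done_flag):
        """Store one transition in the replay buffer."""
        raise NotImplementedError

    def learn(self):
        """Run one gradient update from replayed transitions."""
        raise NotImplementedError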
for fit, ind in zip(fits, offspring):
    avg_h += fit[0]
    ind.fitness.values = fit

# new population
population[:] = offspring

# take the best and worst individuals to build the plot
top = tools.selBest(population, k=1)
worst = tools.selWorst(population, k=1)
avg_h = avg_h / len(population)
top_h = nqueen_fitness(top[0])[0]
worst_h = nqueen_fitness(worst[0])[0]

plotlosses.update({'top': top_h, 'average': avg_h, 'worst': worst_h})
plotlosses.send()

# evaluate the stopping criterion: fitness 0 means no two queens attack each other
if nqueen_fitness(top[0])[0] == 0:
    print(top[0])
    resultado = binToDec(top[0], log_N)

    # dataframe for the board
    eixos = [i for i in range(N)]
    estado_inicial = pd.DataFrame(index=eixos, columns=eixos)
    estadoInicial = list(random.randrange(N) for i in range(N))
    for i in range(len(estadoInicial)):
        estado_inicial[eixos[i]][resultado[i]] = 'rainha'
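The stopping test above relies on nqueen_fitness returning 0 when no two queens attack each other. A minimal sketch of such a fitness, assuming the individual decodes to one queen row per column (which is what the binToDec board-filling above suggests); this is illustrative, not the source's exact encoding.

def nqueen_fitness_sketch(rows):
    """Count attacking queen pairs for a board given as one row index per column."""
    attacks = 0
    n = len(rows)
    for a in range(n):
        for b in range(a + 1, n):
            same_row = rows[a] == rows[b]
            same_diag = abs(rows[a] - rows[b]) == (b - a)
            attacks += same_row or same_diag
    return (attacks,)  # DEAP fitnesses are tuples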
# iterate over the entire test dataset
for x, t in test_loader:
    x, t = x.to(device), t.to(device)
    p = N(x).view(x.size(0), len(class_names))
    loss = torch.nn.functional.cross_entropy(p, t)
    pred = p.argmax(dim=1, keepdim=True)
    test_loss_arr = np.append(test_loss_arr, loss.item())
    test_acc_arr = np.append(test_acc_arr,
                             pred.eq(t.view_as(pred)).float().mean().item())

# NOTE: livelossplot's naming forces our 'test' split to be reported
# under the 'validation' keys
liveplot.update({
    'accuracy': train_acc_arr.mean(),
    'val_accuracy': test_acc_arr.mean(),
    'loss': train_loss_arr.mean(),
    'val_loss': test_loss_arr.mean()
})
liveplot.draw()

epoch = epoch + 1

# plot predictions
def plot_image(i, predictions_array, true_label, img):
    predictions_array, true_label, img = (
        predictions_array[i], true_label[i], img[i])
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(img, cmap=plt.cm.binary)
# TO START:
#   pip install livelossplot
#   pip install neptune-cli
#   neptune account login
#   neptune run minimal-neptune.py
# enjoy the results

from time import sleep
import numpy as np
from livelossplot import PlotLosses

liveplot = PlotLosses(target='neptune')

for i in range(20):
    liveplot.update({
        'accuracy': 1 - np.random.rand() / (i + 2.),
        'val_accuracy': 1 - np.random.rand() / (i + 0.5),
        'mse': 1. / (i + 2.),
        'val_mse': 1. / (i + 0.5)
    })
    liveplot.draw()
    sleep(.5)
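For reference, the same loop runs without a Neptune account by dropping the target argument. A sketch using the default matplotlib backend, assuming a notebook-style environment that can render the plot.

from time import sleep
import numpy as np
from livelossplot import PlotLosses

# default target: live matplotlib figure instead of Neptune
liveplot = PlotLosses()
for i in range(20):
    liveplot.update({
        'accuracy': 1 - np.random.rand() / (i + 2.),
        'val_accuracy': 1 - np.random.rand() / (i + 0.5),
    })
    liveplot.draw()
    sleep(.5)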
def fit_model(train_loader, val_loader, model, optimizer, scheduler,
              n_epochs, log_interval, plot=True, burnin=-1, patience=3,
              early_stop_score='MAP', eval_metric='cosine'):
    early_stop = {}
    early_stop['best'] = -float('inf')
    early_stop['best_params'] = to_cpu(model.state_dict())
    early_stop['fails'] = 0

    if plot:
        liveloss = PlotLosses()

    for epoch in range(n_epochs):
        logs = {}
        start_time = time.time()

        # Training
        train_loss = train_epoch(train_loader, model, optimizer)
        train_scores = {}
        # Turned off to speed up optimization:
        # if epoch > 0 and epoch % log_interval == 0:
        #     train_scores = evaluate_ranking(model, train_loader, metric=eval_metric)
        elapsed = time.time() - start_time

        message = '\n' + '=' * 80
        message += '\nTrain:     '
        message += f' epoch: {epoch:2d}, time: {int(elapsed):d}s., loss: {train_loss:5.3f}'
        if 'silhouette' in train_scores:
            message += f', silhouette: {train_scores["silhouette"]:.2f}'
        message += '\n'

        # Validation
        start_time = time.time()
        val_loss = test_epoch(val_loader, model)
        val_scores = {}
        if epoch > 0 and epoch % log_interval == 0:
            train_label_set = list(set(train_loader.dataset.labels))
            val_scores = evaluate_ranking(model, val_loader, train_label_set,
                                          metric=eval_metric)

            # early stopping
            if val_scores[early_stop_score] > early_stop['best']:
                early_stop['best'] = val_scores[early_stop_score]
                early_stop['best_params'] = to_cpu(model.state_dict())
                early_stop['fails'] = 0
                early_stop['val_scores'] = val_scores
            else:
                early_stop['fails'] += 1
            if early_stop['fails'] >= patience:
                raise EarlyStopException(early_stop['best'],
                                         early_stop['best_params'],
                                         early_stop['fails'],
                                         early_stop['val_scores'])
        elapsed = time.time() - start_time

        message += 'Validation:'
        message += f' epoch: {epoch:2d}, time: {int(elapsed):d}s., loss: {val_loss:5.3f}'
        if 'silhouette' in val_scores:
            message += f', silhouette: {val_scores["silhouette"]:.2f}'
            message += f'\n            MAP: {val_scores["MAP"]:.2f}'
            message += f', MAP (seen): {val_scores["MAP seen labels"]:.2f}'
            message += f', MAP (unseen): {val_scores["MAP unseen labels"]:.2f}'
        message += '\n'
        message += '=' * 80 + '\n'
        print(message)

        logs['loss'] = train_loss
        logs['val_loss'] = val_loss
        for score, value in train_scores.items():
            logs[score] = value
        for score, value in val_scores.items():
            logs[f'val_{score}'] = value

        if epoch > burnin:
            scheduler.step(val_loss)
        if plot:
            liveloss.update(logs)
            liveloss.draw()

    # return data in case it never early stopped
    return early_stop
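fit_model signals convergence by raising EarlyStopException, so a caller has to catch it to recover the best weights. A usage sketch, assuming the exception exposes attributes mirroring its constructor arguments (best, best_params, fails, val_scores); those attribute names are an assumption, not confirmed by the source.

try:
    early_stop = fit_model(train_loader, val_loader, model, optimizer,
                           scheduler, n_epochs=50, log_interval=5)
except EarlyStopException as stop:
    # attribute name assumed to mirror the constructor argument
    model.load_state_dict(stop.best_params)
else:
    # training ran to completion without early stopping
    model.load_state_dict(early_stop['best_params'])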
train_loader, val_loader, test_loader = dataloader_make(ICGT_tips_train, ICGT_tips_test)

liveloss = PlotLosses()
for epoch in range(n_epochs):
    logs = {}
    train_loss, train_acc = train(model, optimiser, criterion, train_loader)
    logs['log loss'] = train_loss.item()
    logs['accuracy'] = train_acc.item()
    val_loss, val_acc = validate(model, criterion, val_loader)
    logs['val_log loss'] = val_loss.item()
    logs['val_accuracy'] = val_acc.item()
    liveloss.update(logs)
    liveloss.draw()

model.eval()
output = model(ICGT_tips_test.data.float())
truth = ICGT_tips_test.labels

avg_error = [0, 0, 0]
max_error = [0, 0, 0]
bad_index = [0, 0, 0]
for i in range(len(truth)):
    for n in [0, 1, 2]:
        # relative error of the prediction against ground truth
        error = abs(1 - (truth[i, n] / output[i, n]))
        error = error.item()
        avg_error[n] += error
        if error > max_error[n]:
            max_error[n] = error
def train_model_it(model, dataloaders, dataset_sizes, criterion, optimizer,
                   batch_size, num_epochs=10, scheduler=None):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    since = time.time()
    liveloss = PlotLosses()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        # Iteration-level training: statistics are reset every batch, so
        # it_loss / it_acc below are per-batch values, not running averages.
        for i, (inputs, labels) in enumerate(dataloaders['train']):
            if scheduler is not None:
                scheduler.step()
            model.train()

            running_loss = 0.0
            running_corrects = 0

            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)

            print("\rTraining Iteration: {}/{}, Loss: {}.".format(
                i + 1, len(dataloaders['train']),
                loss.item() * inputs.size(0) / batch_size), end="")
            sys.stdout.flush()

            # validate every 100 iterations
            if (i + 1) % 100 == 0:
                it_loss = running_loss / batch_size
                it_acc = running_corrects.double() / batch_size

                model.eval()
                val_loss = 0
                val_corr = 0
                for j, (inputs, labels) in enumerate(dataloaders['val']):
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    with torch.set_grad_enabled(False):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)
                    val_loss += loss.item() * inputs.size(0)
                    val_corr += torch.sum(preds == labels.data)
                    print("\rValidation Iteration: {}/{}, Loss: {}.".format(
                        j + 1, len(dataloaders['val']),
                        loss.item() * inputs.size(0) / batch_size), end="")
                    sys.stdout.flush()

                valid_loss = val_loss / dataset_sizes['val']
                valid_acc = val_corr.double() / dataset_sizes['val']
                if valid_acc > best_acc:
                    best_acc = valid_acc
                    best_model_wts = copy.deepcopy(model.state_dict())

                # statistics
                liveloss.update({
                    'log loss': it_loss,
                    'val_log loss': valid_loss,
                    'accuracy': it_acc,
                    'val_accuracy': valid_acc
                })
                liveloss.draw()
                print('validation loss: {}, validation accuracy: {}'.format(
                    valid_loss, valid_acc))
                print('Best Accuracy: {}'.format(best_acc))
                torch.save(model.state_dict(),
                           "./models/acc_{}_loss_{}.pt".format(best_acc, valid_loss))

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
def train_model(output_path, model, dataloaders, dataset_sizes, criterion,
                optimizer, num_epochs=5, scheduler=None):
    if not os.path.exists('models/' + str(output_path)):
        os.makedirs('models/' + str(output_path))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    since = time.time()
    liveloss = PlotLosses()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best = 0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            pbar = tqdm(dataloaders[phase])
            for i, (inputs, labels) in enumerate(pbar):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only in the training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                pbar.set_description(desc=f'Loss={loss.item()} Batch_id={i}')

            # Step the LR scheduler once per epoch, after the optimizer updates
            if phase == 'train' and scheduler is not None:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            if phase == 'train':
                avg_loss = epoch_loss
                t_acc = epoch_acc
            else:
                val_loss = epoch_loss
                val_acc = epoch_acc

            # deep copy the best model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best = epoch + 1
                best_model_wts = copy.deepcopy(model.state_dict())

        liveloss.update({
            'log loss': avg_loss,
            'val_log loss': val_loss,
            'accuracy': t_acc,
            'val_accuracy': val_acc
        })
        # liveloss.draw()
        print('Train Loss: {:.4f} Acc: {:.4f}'.format(avg_loss, t_acc))
        print('Val Loss: {:.4f} Acc: {:.4f}'.format(val_loss, val_acc))
        print()
        torch.save(model.state_dict(),
                   './models/' + str(output_path) + '/model_{}_epoch.pt'.format(epoch + 1))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best Validation Accuracy: {}, Epoch: {}'.format(best_acc, best))
            'Deep Learning', f'G_{epoch}.pth'))
    torch.save(
        D.state_dict(),
        os.path.join('/', 'content', 'drive', 'My Drive', 'University Work',
                     'Year 3', 'Software, Systems, & Applications III',
                     'Deep Learning', f'D_{epoch}.pth'))

    # plot some examples
    plt.grid(False)
    # make_grid returns CHW; permute to HWC for imshow
    plt.imshow(torchvision.utils.make_grid(g).cpu().data.permute(1, 2, 0),
               cmap=plt.cm.binary)
    liveplot.update({
        'generator loss': gen_loss_arr.mean(),
        'discriminator loss': dis_loss_arr.mean()
    })
    liveplot.draw()
    sleep(1.)

    epoch = epoch + 1

"""**Sample a batch from the generative model to show the output diversity**"""

G.eval()
horse_seed = random.randint(0, 1000000000)
print(horse_seed)
torch.manual_seed(460150825)  # change to horse_seed for random batches
horses = G.generate(torch.randn(100, 100, 1, 1).to(device))
best_acc = test_correct / test_total

# checkpoint = torch.load('./checkpoint/Sqnet_1x_v1.0/Sqnet_1x_v1.0_Cifar10.ckpt')
# net.load_state_dict(checkpoint['net_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

liveloss = PlotLosses()
# Track the fastest epoch; initializing best_cost inside the loop, as the
# original did, made the comparison below always false.
best_cost = float('inf')
for _epoch in range(start_epoch, start_epoch + num_epochs):
    start_time = time.time()
    train(_epoch)
    print()
    test(_epoch)
    print()
    print()
    end_time = time.time()
    print('Epoch #%d Cost %ds' % (_epoch, end_time - start_time))
    if end_time - start_time < best_cost:
        best_cost = end_time - start_time
    liveloss.update({
        'log loss': train_loss,
        'val_log loss': test_loss,
        'accuracy': train_correct,
        'val_accuracy': test_correct
    })
    liveloss.draw()

print('Best Cost: %ds' % (best_cost))
print('Best Acc: %.4f percent' % (best_acc * 100))
def train_classifier(
    self,
    train_loader,
    test_loader,
    params: dict = None,
    livelossplot=False,
    save_checkpoint_each=None,
):
    """
    Train the model.

    Arguments:
    ----------
    - train_loader : DataLoader for the training set
    - test_loader : DataLoader for the test set
    - params (dict) : updated parameters (e.g. epochs) to apply without
      rebuilding the entire class
    - livelossplot (bool=False) : use livelossplot to plot the running
      loss and error rate
    - save_checkpoint_each (list) : epochs at which to save the model
    """
    # Update parameters if given
    if save_checkpoint_each is None:
        save_checkpoint_each = [self.params_classifier["epochs"]]
    if params:
        for param, value in params.items():
            self.params_classifier[param] = value

    # Define liveloss and the training start time
    if livelossplot:
        liveloss = PlotLosses()
    since = time.time()

    # Show which device is used
    print("Using device {}".format(self.device))
    self.model.to(self.device)

    loader_dict = {"train": train_loader, "validation": test_loader}

    for e in range(self.params_classifier["epochs"]):
        self.logs = {}
        if not livelossplot:
            print("Epoch {}/{} :".format(e, self.params_classifier["epochs"]))
            print("--------------")

        # Alternate between the train and validation phases
        for phase in ["train", "validation"]:
            if phase == "train":
                self.model.train()
            else:
                self.model.eval()

            # Accumulate loss and incorrect predictions
            running_loss = 0.0
            running_uncorrects = 0

            # Loop over the loader
            for images, labels in iter(loader_dict[phase]):
                images = images.to(self.device)
                # .to() avoids the copy-construction warning that
                # torch.tensor(labels, ...) triggers on tensors
                labels = labels.to(self.device, dtype=torch.long)

                # Forward pass
                output = self.model.forward(images)
                loss = self.loss(output, labels)

                # Backpropagate only in the train phase
                if phase == "train":
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                # Compute predictions
                _, predicted = torch.max(output, 1)
                running_loss += loss.detach() * images.size(0)
                running_uncorrects += torch.sum(
                    predicted != labels.data.detach())

            # Compute loss and error rate
            size_loader = len(loader_dict[phase].dataset)
            epoch_loss = running_loss / size_loader
            epoch_error_rate = running_uncorrects.float() / size_loader

            # Set the prefix for logs
            prefix = ""
            if phase == "validation":
                prefix = "val_"

            # Update logs
            self.logs[prefix + "log loss"] = epoch_loss.item()
            self.logs[prefix + "error_rate"] = epoch_error_rate.item()

        # Use liveloss to plot loss and error rate
        if livelossplot:
            liveloss.update(self.logs)
            liveloss.draw()
        else:
            string_print = """
            Training:          | Validation:
            log loss = {}      | val_log loss = {}
            error_rate = {}    | val_error_rate = {}
            """.format(
                self.logs["log loss"],
                self.logs["val_log loss"],
                self.logs["error_rate"],
                self.logs["val_error_rate"],
            )
            print(string_print)

        # Save checkpoint
        if (e + 1) in save_checkpoint_each:
            save_checkpoint(
                self.model,
                model_name="AlexNet_checkpoint_e{}.pth".format(e))

    # Print training time
    time_elapsed = time.time() - since
    print("Training complete in {:.0f}m {:.0f}s".format(
        time_elapsed // 60, time_elapsed % 60))
def fit(self, X, eval_X, y=None, model_saved_path='bprh_model.pkl',
        iter_to_save=5000,
        coselection_saved_path='data/item-set-coselection.pkl',
        iter_to_log=100, correlation=True, coselection=False,
        plot_metric=False, log_metric=False):
    # Here we do not load a model -> train a new model
    if self.existed_model_path is None:
        # To make train and test work with inconsistent user and item lists,
        # we map user and item string IDs to int IDs so that each ID is its
        # index in U and V
        print("Registering Model Parameters")
        # rename users and items
        self.user_original_id_list = sorted(
            set(X.UserID).union(set(eval_X.UserID)))
        self.item_original_id_list = sorted(
            set(X.ItemID).union(set(eval_X.ItemID)))
        self.train_data = X.copy()
        self.test_data = eval_X.copy()
        self.train_data.UserID = self.train_data.UserID.apply(
            lambda x: self.user_original_id_list.index(x))
        self.train_data.ItemID = self.train_data.ItemID.apply(
            lambda x: self.item_original_id_list.index(x))
        self.test_data.UserID = self.test_data.UserID.apply(
            lambda x: self.user_original_id_list.index(x))
        self.test_data.ItemID = self.test_data.ItemID.apply(
            lambda x: self.item_original_id_list.index(x))
        self.item_list = [
            idx[0] for idx in enumerate(self.item_original_id_list)
        ]
        self.user_list = [
            idx[0] for idx in enumerate(self.user_original_id_list)
        ]
        self.num_u = len(self.user_list)
        self.num_i = len(self.item_list)
        # build I_u_t, I_u_a (pre-computed for acceleration)
        self.build_itemset_for_user()

        # Calculate the auxiliary-target correlation C for every user and
        # each type of auxiliary action
        if correlation:
            self.alpha_u = self.auxiliary_target_correlation(
                X=self.train_data)
        else:
            print("No auxiliary-target correlation - all alpha_u equal to one")
            alpha_u_all_ones = dict()
            user_set_bar = tqdm(self.user_list)
            for u in user_set_bar:
                alpha_u_all_ones[u] = dict()
                alpha_u_all_ones[u]['alpha'] = 1.0
            self.alpha_u = alpha_u_all_ones.copy()

        # Generate item sets based on co-selection
        if coselection:
            self.S, self.U_item = self.itemset_coselection(
                X=self.train_data)

        # Initialization of the user and item matrices
        if self.random_state is not None:
            np.random.seed(self.random_state)
        else:
            np.random.seed(0)
        print("Initializing User and Item Matrices")
        # NOTE: initialization is influenced by mean and std
        self.U = np.random.normal(size=(self.num_u, self.dim + 1),
                                  loc=0.0, scale=0.1)
        self.V = np.random.normal(size=(self.dim + 1, self.num_i),
                                  loc=0.0, scale=0.1)
        # self.U = np.zeros(shape=(self.num_u, self.dim + 1))
        # self.V = np.zeros(shape=(self.dim + 1, self.num_i))
        self.U[:, -1] = 1.0
        # the estimation is U dot V
        self.estimation = np.dot(self.U, self.V)

    # Configure the loss plots layout
    if plot_metric:
        groups = {
            'Precision@K': ['Precision@5', 'Precision@10'],
            'Recall@K': ['Recall@5', 'Recall@10'],
            'AUC': ['AUC']
        }
        plot_losses = PlotLosses(groups=groups)

    # Start iterating
    all_item = set(self.item_list)
    user_in_train = sorted(set(self.train_data.UserID))
    print("Start Training")
    with trange(self.num_iter) as t:
        for index in t:
            # Description will be displayed on the left
            # t.set_description('ITER %i' % index)

            # Build u, I, J, K:
            # uniformly sample a user from U
            u = choice(user_in_train)

            # build I: uniformly sample an item i from I_u_t
            I_u_t = self.I_u_t_train[u]
            if len(I_u_t) != 0:
                i = choice(sorted(I_u_t))
                if coselection:
                    # I = I_u_t cap S_i
                    I = I_u_t.intersection(self.S[i])
                else:
                    # without coselection, set I to the full set of items
                    # purchased by user u (no uniform sampling, as in COFISET)
                    I = I_u_t
            else:
                # if there is no item in I_u_t, set I to the empty set
                i = None
                I = set()
            # build J: since we only have one auxiliary action, we follow
            # the uniform sampling
            I_u_oa = self.I_u_a_train[u] - I_u_t
            if len(I_u_oa) != 0:
                j = choice(sorted(I_u_oa))
                if coselection:
                    # NOTE: typo in paper?
                    J = I_u_oa.intersection(self.S[j])
                else:
                    # without coselection, set J to the set of only-auxiliary
                    # items of user u (no uniform sampling, as in COFISET)
                    J = I_u_oa
            else:
                # if there is no item in I_u_oa, set J to the empty set
                j = None
                J = set()

            # build K
            I_u_n = all_item - I_u_t - I_u_oa
            if len(I_u_n) != 0:
                k = choice(sorted(I_u_n))
                if coselection:
                    # NOTE: typo in paper?
                    K = I_u_n.intersection(self.S[k])
                else:
                    # without coselection, set K to the set of no-action
                    # items of user u (no uniform sampling, as in COFISET)
                    K = I_u_n
            else:
                # if there is no item in I_u_n, set K to the empty set
                k = None
                K = set()

            # calculate intermediate variables
            # get the user-specific alpha_u
            spec_alpha_u = self.alpha_u[u]['alpha']
            U_u = self.U[u, :-1].copy()
            sorted_I = sorted(I)
            sorted_J = sorted(J)
            sorted_K = sorted(K)

            # get r_hat_uIJ, r_hat_uJK, r_hat_uIK
            r_hat_uI = np.average(
                self.estimation[u, sorted_I]) if len(I) != 0 else np.array([0])
            r_hat_uJ = np.average(
                self.estimation[u, sorted_J]) if len(J) != 0 else np.array([0])
            r_hat_uK = np.average(
                self.estimation[u, sorted_K]) if len(K) != 0 else np.array([0])
            r_hat_uIJ = r_hat_uI - r_hat_uJ
            r_hat_uJK = r_hat_uJ - r_hat_uK
            r_hat_uIK = r_hat_uI - r_hat_uK

            # get V_bar_I, V_bar_J, V_bar_K
            V_bar_I = np.average(self.V[:-1, sorted_I], axis=1) \
                if len(I) != 0 else np.zeros(shape=(self.dim,))
            V_bar_J = np.average(self.V[:-1, sorted_J], axis=1) \
                if len(J) != 0 else np.zeros(shape=(self.dim,))
            V_bar_K = np.average(self.V[:-1, sorted_K], axis=1) \
                if len(K) != 0 else np.zeros(shape=(self.dim,))

            # get b_I, b_J, b_K
            b_I = np.average(self.V[-1, sorted_I]) if len(I) != 0 else np.array([0])
            b_J = np.average(self.V[-1, sorted_J]) if len(J) != 0 else np.array([0])
            b_K = np.average(self.V[-1, sorted_K]) if len(K) != 0 else np.array([0])

            # examine the empty-set conditions
            indicator_I = indicator(len(I) == 0)
            indicator_J = indicator(len(J) == 0)
            indicator_K = indicator(len(K) == 0)
            indicator_sum = indicator_I + indicator_J + indicator_K

            if 0 <= indicator_sum <= 1:
                # cases where at most one of the three sets is empty.
                # When all three are non-empty, or I or K is empty, the
                # objective can be rewritten by multiplying the indicator;
                # when J is empty, the objective has to be rewritten.
                if indicator_J == 1:
                    # when J is empty
                    # NABLA U_u
                    df_dUu = sigmoid(-r_hat_uIK) * (V_bar_I - V_bar_K)
                    dR_dUu = 2 * self.lambda_u * U_u
                    # update U_u = U_u + gamma * (df_dUu - dR_dUu)
                    self.U[u, :-1] += self.gamma * (df_dUu - dR_dUu)

                    # NABLA V_i
                    df_dbi = (1 - indicator_I) * sigmoid(-r_hat_uIK) / indicator_len(I)
                    dR_dbi = (1 - indicator_I) * 2 * self.lambda_b * b_I / indicator_len(I)
                    df_dVi = df_dbi * U_u
                    dR_dVi = 2 * self.lambda_v * V_bar_I / indicator_len(I)
                    # update V_i = V_i + gamma * (df_dVi - dR_dVi)
                    self.V[:-1, sorted_I] += self.gamma * (
                        df_dVi - dR_dVi)[:, None]  # trick: transpose here
                    # update b_i = b_i + gamma * (df_dbi - dR_dbi)
                    self.V[-1, sorted_I] += self.gamma * (df_dbi - dR_dbi)

                    # no change on J

                    # NABLA V_k
                    df_dbk = (1 - indicator_K) * -sigmoid(-r_hat_uIK) / indicator_len(K)
                    dR_dbk = (1 - indicator_K) * 2 * self.lambda_b * b_K / indicator_len(K)
                    df_dVk = df_dbk * U_u
                    dR_dVk = 2 * self.lambda_v * V_bar_K / indicator_len(K)
                    # update V_k = V_k + gamma * (df_dVk - dR_dVk)
                    self.V[:-1, sorted_K] += self.gamma * (
                        df_dVk - dR_dVk)[:, None]  # trick: transpose here
                    # update b_k = b_k + gamma * (df_dbk - dR_dbk)
                    self.V[-1, sorted_K] += self.gamma * (df_dbk - dR_dbk)
                else:
                    # when J is not empty
                    # NABLA U_u
                    df_dUu = (1 - indicator_I) * sigmoid(-r_hat_uIJ / spec_alpha_u) \
                        / spec_alpha_u * (V_bar_I - V_bar_J) \
                        + (1 - indicator_K) * sigmoid(-r_hat_uJK) * (V_bar_J - V_bar_K)
                    dR_dUu = 2 * self.lambda_u * U_u
                    # update U_u = U_u + gamma * (df_dUu - dR_dUu)
                    self.U[u, :-1] += self.gamma * (df_dUu - dR_dUu)

                    # NABLA V_i
                    df_dbi = (1 - indicator_I) * sigmoid(-r_hat_uIJ / spec_alpha_u) \
                        / (indicator_len(I) * spec_alpha_u)
                    dR_dbi = (1 - indicator_I) * 2 * self.lambda_b * b_I / indicator_len(I)
                    df_dVi = df_dbi * U_u
                    dR_dVi = 2 * self.lambda_v * V_bar_I / indicator_len(I)
                    # update V_i = V_i + gamma * (df_dVi - dR_dVi)
                    self.V[:-1, sorted_I] += self.gamma * (
                        df_dVi - dR_dVi)[:, None]  # trick: transpose here
                    # update b_i = b_i + gamma * (df_dbi - dR_dbi)
                    self.V[-1, sorted_I] += self.gamma * (df_dbi - dR_dbi)

                    # NABLA V_j
                    df_dbj = (1 - indicator_I) * (
                        -sigmoid(-r_hat_uIJ / spec_alpha_u) / spec_alpha_u
                        + (1 - indicator_K) * sigmoid(-r_hat_uJK)) / indicator_len(J)
                    dR_dbj = 2 * self.lambda_b * b_J / indicator_len(J)
                    df_dVj = df_dbj * U_u
                    dR_dVj = 2 * self.lambda_v * V_bar_J / indicator_len(J)
                    # update V_j = V_j + gamma * (df_dVj - dR_dVj)
                    self.V[:-1, sorted_J] += self.gamma * (
                        df_dVj - dR_dVj)[:, None]  # trick: transpose here
                    # update b_j = b_j + gamma * (df_dbj - dR_dbj)
                    self.V[-1, sorted_J] += self.gamma * (df_dbj - dR_dbj)

                    # NABLA V_k
                    df_dbk = (1 - indicator_K) * -sigmoid(-r_hat_uJK) / indicator_len(K)
                    dR_dbk = (1 - indicator_K) * 2 * self.lambda_b * b_K / indicator_len(K)
                    df_dVk = df_dbk * U_u
                    dR_dVk = 2 * self.lambda_v * V_bar_K / indicator_len(K)
                    # update V_k = V_k + gamma * (df_dVk - dR_dVk)
                    self.V[:-1, sorted_K] += self.gamma * (
                        df_dVk - dR_dVk)[:, None]  # trick: transpose here
                    # update b_k = b_k + gamma * (df_dbk - dR_dbk)
                    self.V[-1, sorted_K] += self.gamma * (df_dbk - dR_dbk)
            else:
                # cases where at least two sets are empty:
                # ignore this user and continue the loop
                continue

            # calculate loss (kept for reference, currently disabled)
            # f_Theta = np.log(sigmoid(r_hat_uIJ / spec_alpha_u)) + np.log(sigmoid(r_hat_uJK))
            # regula = self.lambda_u * np.linalg.norm(U_u, ord=2) + self.lambda_v * (
            #     (np.linalg.norm(V_bar_I, ord=2) if len(I) != 0 else 0) +
            #     (np.linalg.norm(V_bar_J, ord=2) if len(J) != 0 else 0) +
            #     (np.linalg.norm(V_bar_K, ord=2) if len(K) != 0 else 0)) + self.lambda_b * (
            #     (b_I if len(I) != 0 else 0) ** 2 + (b_J if len(J) != 0 else 0) ** 2 +
            #     (b_K if len(K) != 0 else 0) ** 2)
            # bprh_loss = f_Theta - regula

            # update the estimation
            old_estimation = self.estimation.copy()
            # naive full update: self.estimation = np.dot(self.U, self.V)
            # instead, refresh only the columns of the sampled items
            all_sampled_item = sorted(set.union(I, J, K))
            self.estimation[:, all_sampled_item] = np.dot(
                self.U, self.V[:, all_sampled_item])
            # change in the estimation (shown on the progress bar)
            est_changed = np.linalg.norm(self.estimation - old_estimation)

            # save the model to file only every iter_to_save iterations
            if (index + 1) % iter_to_save == 0:
                self.save(model_path=model_saved_path + "_" + str(index))

            # calculate metrics only every iter_to_log iterations
            if (index + 1) % iter_to_log == 0:
                if log_metric or plot_metric:
                    # calculate metrics on the test data
                    user_to_eval = sorted(set(self.test_data.UserID))
                    scoring_list_5, precision_5, recall_5, avg_auc = self.scoring(
                        user_to_eval=user_to_eval,
                        ground_truth=self.test_data,
                        K=5,
                        train_data_as_reference_flag=True)
                    scoring_list_10, precision_10, recall_10, _ = self.scoring(
                        user_to_eval=user_to_eval,
                        ground_truth=self.test_data,
                        K=10,
                        train_data_as_reference_flag=True)
                    if log_metric:
                        self.eval_hist.append([
                            index, precision_5, precision_10, recall_5,
                            recall_10, avg_auc
                        ])
                    if plot_metric:
                        plot_losses.update({
                            'Precision@5': precision_5,
                            'Precision@10': precision_10,
                            'Recall@5': recall_5,
                            'Recall@10': recall_10,
                            'AUC': avg_auc
                        })
                        plot_losses.send()

            # The postfix is displayed on the right of the progress bar,
            # formatted automatically based on each argument's datatype
            t.set_postfix(est_changed=est_changed, len_I=len(I),
                          len_J=len(J), len_K=len(K))
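Why sigmoid(-x) appears in every gradient inside fit(): with the logistic function sigma(x) = 1 / (1 + e^{-x}), the derivative of ln sigma(x) is sigma(-x). Applying the chain rule to the objective the code maximizes gives the df_dUu expression of the non-empty-J branch (up to the empty-set indicator factors); in LaTeX:

\[
f_\Theta = \ln\sigma\!\left(\tfrac{\hat{r}_{uIJ}}{\alpha_u}\right)
         + \ln\sigma\!\left(\hat{r}_{uJK}\right),
\qquad
\frac{\partial f_\Theta}{\partial U_u}
  = \sigma\!\left(-\tfrac{\hat{r}_{uIJ}}{\alpha_u}\right)\frac{1}{\alpha_u}
    \left(\bar{V}_I - \bar{V}_J\right)
  + \sigma\!\left(-\hat{r}_{uJK}\right)\left(\bar{V}_J - \bar{V}_K\right),
\]

since \(\hat{r}_{uI} = U_u \cdot \bar{V}_I + b_I\) implies \(\partial \hat{r}_{uIJ} / \partial U_u = \bar{V}_I - \bar{V}_J\).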
def train_advanced(self, data_loaders, show_plot=True):
    liveloss = PlotLosses()

    for epoch in range(self.num_epochs):
        logs = {}
        for phase in ['train', 'validation']:
            if phase == 'train':
                self.train()
            else:
                self.eval()

            running_loss = 0.0
            for inputs, labels in data_loaders[phase]:
                inputs = T.DoubleTensor(inputs).to(self.device)
                # autoencoder: the reconstruction target is the input itself
                targets = inputs.clone()

                outputs = self.encoder(inputs)
                outputs = self.decoder(outputs)
                loss = self.criterion(outputs, inputs)

                if phase == 'train':
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                preds = outputs.view(len(inputs), self.original_dim)
                targets = targets.view(len(inputs), self.original_dim)
                running_loss += loss.detach() * inputs.size(0)

            epoch_loss = running_loss / len(data_loaders[phase].dataset)
            # accuracy is computed on the last batch of the phase
            epoch_acc = self.accuracy(targets, preds)

            prefix = ''
            if phase == 'validation':
                prefix = 'val_'
            print('[Model] epoch=%s, batch_loss=%s, epoch_loss=%s, acc=%s'
                  % (epoch, loss.item(), epoch_loss.item(), epoch_acc))
            logs[prefix + 'log loss'] = epoch_loss.item()
            logs[prefix + 'accuracy'] = epoch_acc

        if show_plot:
            liveloss.update(logs)
            liveloss.send()