def main():
    """ PyTorch AlexNet implementation. """
    global args, best_prec1
    args = parser.parse_args()

    # create model
    if args.arch == 'alexnet':
        model = alexnet(pretrained=args.pretrained)
    else:
        raise NotImplementedError

    # use CUDA
    model.cuda()

    # define loss and optimizer
    loss = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum, weight_decay=args.weightdecay)

    train_dl, val_dl = data_loader(args.data, args.batch_size, args.cuda_workers)

    if args.evaluate:
        validate(val_dl, model, loss)
        return

    for epoch in range(args.start_epoch, args.epochs):
        custom_weight_decay(optimizer, epoch, args.lr)

        # train for one epoch
        train(train_dl, model, loss, optimizer, epoch)

        # evaluate on validation set
        prec1, prec5 = validate(val_dl, model, loss)

        # remember the best prec@1 and save a checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        create_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict()
        }, is_best, args.arch + '.pth')
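# Note: `create_checkpoint(state, is_best, filename)` is called above but not
# defined in this snippet. A minimal sketch of what such a helper usually looks
# like in PyTorch training scripts (the 'model_best_' copy name is an assumption):
import shutil

import torch


def create_checkpoint(state, is_best, filename='checkpoint.pth'):
    """Save the training state; keep a separate copy of the best model so far."""
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best_' + filename)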
def train(config):
    # determine the filename (to be used for saving results, checkpoints, models, etc.)
    filename = Path(config.txt_file).stem

    # Initialize the device on which to run the model
    if config.device == 'cuda':
        if torch.cuda.is_available():
            device = torch.device(config.device)
        else:
            device = torch.device('cpu')
    else:
        device = torch.device(config.device)

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(
        filename=config.txt_file,
        seq_length=config.seq_length
    )
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # get the vocabulary size and int2char and char2int dictionaries for use later
    VOCAB_SIZE = dataset.vocab_size

    # Initialize the model that we are going to use
    model = TextGenerationModel(
        batch_size=config.batch_size,
        seq_length=config.seq_length,
        vocabulary_size=VOCAB_SIZE,
        lstm_num_hidden=config.lstm_num_hidden,
        lstm_num_layers=config.lstm_num_layers,
        device=device,
        batch_first=config.batch_first,
        dropout=1.0 - config.dropout_keep_prob
    )

    # Set up the loss, optimizer and learning rate scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), config.learning_rate)

    # Load the latest checkpoint, if any exists
    checkpoints = list(CHECKPOINTS_DIR.glob(
        f'{model.__class__.__name__}_{filename}_checkpoint_*.pt'))
    if len(checkpoints) > 0:
        # load the latest checkpoint
        checkpoints.sort(key=os.path.getctime)
        latest_checkpoint_path = checkpoints[-1]
        start_step, results, sequences = load_checkpoint(
            latest_checkpoint_path, model, optimizer)
    else:
        # initialize the start step, results and sequences
        start_step = 0
        results = {
            'step': [],
            'accuracy': [],
            'loss': [],
        }
        sequences = {
            'step': [],
            't': [],
            'temperature': [],
            'sequence': []
        }

    for step in range(start_step, int(config.train_steps)):
        # reinitialize the data_loader iterator once we have iterated over all
        # available mini-batches
        if step % len(data_loader) == 0 or step == start_step:
            data_iter = iter(data_loader)

        # get the mini-batch
        batch_inputs, batch_targets = next(data_iter)

        # Only for time measurement of step through network
        t1 = time.time()

        #######################################################
        # Add more code here ...
        #######################################################

        # put the model in training mode
        model.train()

        # convert the data and send it to the device
        X = torch.stack(batch_inputs, dim=1)
        X = X.to(device)
        Y = torch.stack(batch_targets, dim=1)
        Y = Y.to(device)

        # forward pass the mini-batch
        Y_out, _ = model.forward(X)
        Y_pred = Y_out.argmax(dim=-1)

        # (re)set the optimizer gradients to 0
        optimizer.zero_grad()

        # compute the accuracy and the loss
        accuracy = get_accuracy(Y_pred, Y)
        loss = criterion.forward(Y_out.transpose(2, 1), Y)

        # backpropagate the loss
        loss.backward()

        # clip the gradients (to prevent them from exploding)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)

        # tune the model parameters
        optimizer.step()

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % config.print_every == 0:
            print(f'[{datetime.now().strftime("%Y-%m-%d %H:%M")}], '
                  f'Train Step {step:04d}/{int(config.train_steps):04d}, '
                  f'Batch Size = {config.batch_size}, '
                  f'Examples/Sec = {examples_per_second:.2f}, '
                  f'Accuracy = {accuracy:.2f}, Loss = {loss:.3f}')

            # append the accuracy and loss to the results
            results['step'].append(step)
            results['accuracy'].append(accuracy.item())
            results['loss'].append(loss.item())

        if step % config.sample_every == 0:
            for T in [20, 30, 60, 120]:
                for temperature in [0.0, 0.5, 1.0, 2.0]:
                    # Generate some sentences by sampling from the model
                    sequence = sample_sequence(
                        model=model,
                        vocab_size=VOCAB_SIZE,
                        T=T,
                        char=None,
                        temperature=temperature,
                        device=device
                    )
                    sequence_str = dataset.convert_to_string(sequence)
                    print(f'Generated sample sequence (T={T}, temp={temperature}): {sequence_str}')

                    # append the generated sequence to the sequences
                    sequences['step'].append(step)
                    sequences['t'].append(T)
                    sequences['temperature'].append(temperature)
                    sequences['sequence'].append(sequence_str)

        if step % config.checkpoint_every == 0:
            # create a checkpoint
            create_checkpoint(CHECKPOINTS_DIR, filename, step, model, optimizer,
                              results, sequences)

            # save the results
            save_results(RESULTS_DIR, filename, results, sequences, model)

            # save the model
            save_model(MODELS_DIR, filename, model)

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
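# The checkpoint helpers used above are defined elsewhere. A plausible, minimal
# sketch matching the call sites `create_checkpoint(CHECKPOINTS_DIR, filename,
# step, model, optimizer, results, sequences)` and `load_checkpoint(path, model,
# optimizer)`; the file layout and key names here are assumptions:
import torch


def create_checkpoint(checkpoints_dir, filename, step, model, optimizer,
                      results, sequences):
    """Persist everything needed to resume training at `step`."""
    path = checkpoints_dir / f'{model.__class__.__name__}_{filename}_checkpoint_{step}.pt'
    torch.save({
        'step': step,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'results': results,
        'sequences': sequences,
    }, path)


def load_checkpoint(path, model, optimizer):
    """Restore model/optimizer state and return (start_step, results, sequences)."""
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return checkpoint['step'], checkpoint['results'], checkpoint['sequences']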
        # (fragment: body of the per-batch loop over train_data_loader, inside the epoch loop)
        target_caption = nn.utils.rnn.pack_padded_sequence(
            caption, caption_len, batch_first=True)[0]

        optimizer.zero_grad()

        cnn_feature = cnn(image)
        rnn_tokenized_sentence, alphas = rnn(cnn_feature, caption, caption_len)
        loss = loss_fn(rnn_tokenized_sentence, target_caption)
        # Doubly Stochastic Attention regularization
        loss += params['alpha_c'] * ((1. - alphas.sum(dim=1)) ** 2).mean()
        train_loss.append(loss.data.item())

        loss.backward()
        optimizer.step()
        #*print('One batch completed');

        if (idx + 1) % 5000 == 0:
            create_checkpoint(cnn, rnn, optimizer, epoch + 1, idx + 1,
                              train_loss, params)
        if (idx + 1) % 500 == 0 or (idx + 1) == len(train_data_loader):
            print("Epoch %d (Step %d) - %0.4f train loss, %0.2f time." %
                  (epoch + 1, idx + 1, loss, time.time() - start_time))
        #*if idx == 2: break
    #*break
    print("Epoch %d - %0.4f loss, %.2f time. " %
          (epoch + 1, np.mean(train_loss), time.time() - start_time))
    create_checkpoint(cnn, rnn, optimizer, epoch + 1, idx + 1, train_loss, params)

print('Training completed.')

import pickle
def generate_clean_db(language, batch_size, sleep_time, parameters):
    lines = []
    csv_data = []
    clips_output_path = ''
    index = 0

    if parameters.checkpoint_path is not None:
        if not os.path.isdir(parameters.checkpoint_path):
            print('Checkpoint Path Does Not Exist.')
            sys.exit()
        print('Loading Checkpoint.')
        clips_output_path = os.path.join(parameters.checkpoint_path, 'clips')
        index, lines, csv_data = utils.load_checkpoint(parameters.checkpoint_path)
    else:
        if (not os.path.isdir(parameters.input_path)) and (not os.path.isfile(
                parameters.input_path)):
            print('Input Text File/Folder Does Not Exist.')
            sys.exit()
        if not os.path.isdir(parameters.output_path):
            print('Output Path Does Not Exist.')
            sys.exit()

        now = datetime.now().strftime('%Y-%m-%d*%H-%M-%S')
        clips_output_path = os.path.join(parameters.output_path, 'corpus-' + now,
                                         'clips')
        os.makedirs(clips_output_path)

        all_files = []
        if os.path.isdir(parameters.input_path):
            print('Input Is A Directory.')
            for file in os.listdir(parameters.input_path):
                if file.endswith(".txt"):
                    all_files.append(os.path.join(parameters.input_path, file))
        elif os.path.isfile(parameters.input_path):
            print('Input Is A Text File.')
            all_files.append(parameters.input_path)

        for file in all_files:
            current_file = open(file, 'r')
            for line in current_file:
                lines.append(line.strip())

        utils.create_checkpoint(os.path.split(clips_output_path)[0], lines,
                                all_files)

    print('Converting Text To Speech And Generating Dataset.')
    print('Dataset Path: %s.' % os.path.split(clips_output_path)[0])

    for line in lines[index:]:
        pair = []
        current_file, current_file_slow = g_tts(line, language, clips_output_path,
                                                index, parameters.generate_male)
        current_file_size = os.path.getsize(current_file)
        current_file_size_slow = os.path.getsize(current_file_slow)

        print('Processing Item: %d/%d, Type: Normal.' % (index + 1, len(lines)))
        is_valid, file, size = utils.check_audio(line, current_file)
        if is_valid == True:
            csv_data.append(
                [os.path.basename(file), str(size), line.strip().lower()])
            pair.append(csv_data[-1])
        else:
            print('Duration Of %s File Is Not Valid (Must Be Between 5-20 Seconds.)'
                  % os.path.basename(current_file))
            os.remove(file)

        print('Processing Item: %d/%d, Type: Slow.' % (index + 1, len(lines)))
        is_valid, file, size = utils.check_audio(line, current_file_slow)
        if is_valid == True:
            csv_data.append(
                [os.path.basename(file), str(size), line.strip().lower()])
            pair.append(csv_data[-1])
        else:
            print('Duration Of %s File Is Not Valid (Must Be Between 0.5-20 Seconds.)'
                  % os.path.basename(current_file_slow))
            os.remove(file)

        utils.save_checkpoint(os.path.split(clips_output_path)[0], pair, index)
        index += 1
        if index % batch_size == 0:
            print('Sleep (%d Seconds).' % sleep_time)
            time.sleep(sleep_time)

    return csv_data, clips_output_path
def train():
    model_type = FLAGS.model_type
    run_desc = FLAGS.run_desc
    run_desc_tl = FLAGS.run_desc_tl
    data_dir = Path(FLAGS.data_dir)
    checkpoints_dir = Path(FLAGS.checkpoints_dir) / model_type / run_desc
    models_dir = Path(FLAGS.models_dir) / model_type / run_desc
    results_dir = Path(FLAGS.results_dir) / model_type / run_desc
    checkpoints_dir_tl = Path(FLAGS.checkpoints_dir) / model_type / run_desc_tl
    models_dir_tl = Path(FLAGS.models_dir) / model_type / run_desc_tl
    results_dir_tl = Path(FLAGS.results_dir) / model_type / run_desc_tl
    learning_rate = FLAGS.learning_rate
    batch_size_fn = FLAGS.batch_size
    epoch_no = FLAGS.epoch
    sent_hidden_dim = FLAGS.sent_hidden_dim
    doc_hidden_dim = FLAGS.doc_hidden_dim

    if not data_dir.exists():
        raise ValueError('Data directory does not exist')

    # create other directories if they do not exist
    create_directories(checkpoints_dir_tl, models_dir_tl, results_dir_tl)

    # load the data
    print('Loading the data...')

    # get the GloVe and ELMo embeddings
    glove_dim = 0
    elmo_dim = 0
    GloVe_vectors = None
    ELMo = None
    if 'glove' in model_type:
        GloVe_vectors = GloVe()
        glove_dim = WORD_EMBED_DIM
        print('Uploaded GloVe embeddings.')
    if 'elmo' in model_type:
        ELMo = Elmo(options_file=ELMO_OPTIONS_FILE,
                    weight_file=ELMO_WEIGHT_FILE,
                    num_output_representations=1,
                    requires_grad=False,
                    dropout=0).to(DEVICE)
        elmo_dim = ELMO_EMBED_DIM
        print('Uploaded ELMo embeddings.')
    input_dim = glove_dim + elmo_dim

    # get the FNN data
    keys = ['train', 'test', 'val']
    FNN_DL_small = {}
    for i in keys:
        FNN_temp = FNNDataset(data_dir / ('FNN_small_' + i + '.pkl'),
                              GloVe_vectors, ELMo)
        FNN_DL_temp = data.DataLoader(dataset=FNN_temp,
                                      batch_size=batch_size_fn,
                                      num_workers=0,
                                      shuffle=True,
                                      drop_last=True,
                                      collate_fn=PadSortBatchFNN())
        FNN_DL_small[i] = FNN_DL_temp
    print('Uploaded FNN data.')

    # initialize the model, according to the model type
    print('Initializing the model for transfer learning...', end=' ')
    model = HierarchicalAttentionNet(input_dim=input_dim,
                                     sent_hidden_dim=sent_hidden_dim,
                                     doc_hidden_dim=doc_hidden_dim,
                                     num_classes=NUM_CLASSES_FN,
                                     dropout=0).to(DEVICE)
    print('Done!')
    print_model_parameters(model)
    print()
    print('Working on: ', end='')
    print(DEVICE)

    # set the criterion and optimizer
    # we weigh the loss: class [0] is real, class [1] is fake
    #
    loss_func_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

    # load the last checkpoint (if it exists)
    results = {
        'epoch': [],
        'train_loss': [],
        'train_accuracy': [],
        'val_loss': [],
        'val_accuracy': []
    }
    if epoch_no == '0':
        model_path = models_dir / Path('HierarchicalAttentionNet_model.pt')
        _, _, _ = load_latest_checkpoint(model_path, model, optimizer)
    else:
        checkpoint_path = checkpoints_dir / Path(
            'HierarchicalAttentionNet_Adam_checkpoint_' + str(epoch_no) + '_.pt')
        _, _, _ = load_checkpoint(checkpoint_path, model, optimizer)
    print(f'Starting transfer learning on the model extracted from {epoch_no}')

    epoch = 0
    for i in range(epoch, MAX_EPOCHS):
        print(f'Epoch {i+1:0{len(str(MAX_EPOCHS))}}/{MAX_EPOCHS}:')
        model.train()

        # one epoch of training
        train_loss_fn, train_acc_fn = train_epoch_fn(FNN_DL_small['train'], model,
                                                     optimizer, loss_func_fn)

        # one epoch of eval
        model.eval()
        val_loss_fn, val_acc_fn = eval_epoch_fn(FNN_DL_small['val'], model,
                                                loss_func_fn)

        results['epoch'].append(i)
        results['train_loss'].append(train_loss_fn)
        results['train_accuracy'].append(train_acc_fn)
        results['val_loss'].append(val_loss_fn)
        results['val_accuracy'].append(val_acc_fn)
        #print(results)

        best_accuracy = torch.tensor(val_acc_fn).max().item()
        create_checkpoint(checkpoints_dir_tl, i, model, optimizer, results,
                          best_accuracy)

    # save and plot the results
    save_results(results_dir_tl, results, model)
    save_model(models_dir_tl, model)
def map_fn(index=None, flags=None):
    torch.set_default_tensor_type('torch.FloatTensor')
    torch.manual_seed(1234)

    train_data = dataset.DATA(config.TRAIN_DIR)
    if config.MULTI_CORE:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_data,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=True)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_data)
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=flags['batch_size'] if config.MULTI_CORE else config.BATCH_SIZE,
        sampler=train_sampler,
        num_workers=flags['num_workers'] if config.MULTI_CORE else 4,
        drop_last=True,
        pin_memory=True)

    if config.MULTI_CORE:
        DEVICE = xm.xla_device()
    else:
        DEVICE = config.DEVICE

    netG = model.colorization_model().double()
    netD = model.discriminator_model().double()
    VGG_modelF = torchvision.models.vgg16(pretrained=True).double()
    VGG_modelF.requires_grad_(False)

    netG = netG.to(DEVICE)
    netD = netD.to(DEVICE)
    VGG_modelF = VGG_modelF.to(DEVICE)

    optD = torch.optim.Adam(netD.parameters(), lr=2e-4, betas=(0.5, 0.999))
    optG = torch.optim.Adam(netG.parameters(), lr=2e-4, betas=(0.5, 0.999))

    ## Train
    train_start = time.time()
    losses = {
        'G_losses': [],
        'D_losses': [],
        'EPOCH_G_losses': [],
        'EPOCH_D_losses': [],
        'G_losses_eval': []
    }

    netG, optG, netD, optD, epoch_checkpoint = utils.load_checkpoint(
        config.CHECKPOINT_DIR, netG, optG, netD, optD, DEVICE)
    netGAN = model.GAN(netG, netD)

    for epoch in range(
            epoch_checkpoint,
            flags['num_epochs'] + 1 if config.MULTI_CORE else config.NUM_EPOCHS + 1):
        print('\n')
        print('#' * 8, f'EPOCH-{epoch}', '#' * 8)
        losses['EPOCH_G_losses'] = []
        losses['EPOCH_D_losses'] = []

        if config.MULTI_CORE:
            para_train_loader = pl.ParallelLoader(
                train_loader, [DEVICE]).per_device_loader(DEVICE)
            engine.train(para_train_loader, netGAN, netD, VGG_modelF, optG, optD,
                         device=DEVICE, losses=losses)
            elapsed_train_time = time.time() - train_start
            print("Process", index, "finished training. Train time was:",
                  elapsed_train_time)
        else:
            engine.train(train_loader, netGAN, netD, VGG_modelF, optG, optD,
                         device=DEVICE, losses=losses)

        ######################### CHECKPOINTING ################################
        utils.create_checkpoint(epoch, netG, optG, netD, optD,
                                max_checkpoint=config.KEEP_CKPT,
                                save_path=config.CHECKPOINT_DIR)
        #########################################################################
        utils.plot_some(train_data, netG, DEVICE, epoch)
        gc.collect()
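# The GAN script above relies on `utils.load_checkpoint` / `utils.create_checkpoint`
# to resume training and rotate old checkpoints. A rough sketch consistent with the
# call sites; the file naming scheme and rotation policy are assumptions, not the
# project's actual code:
import glob
import os

import torch


def create_checkpoint(epoch, netG, optG, netD, optD, max_checkpoint, save_path):
    """Save generator/discriminator state and keep at most `max_checkpoint` files."""
    torch.save({
        'epoch': epoch,
        'netG': netG.state_dict(),
        'optG': optG.state_dict(),
        'netD': netD.state_dict(),
        'optD': optD.state_dict(),
    }, os.path.join(save_path, f'checkpoint_epoch_{epoch}.pt'))
    checkpoints = sorted(glob.glob(os.path.join(save_path, 'checkpoint_epoch_*.pt')),
                         key=os.path.getctime)
    for old in checkpoints[:-max_checkpoint]:
        os.remove(old)


def load_checkpoint(checkpoint_dir, netG, optG, netD, optD, device):
    """Restore the latest checkpoint if present; return nets, optimizers, next epoch."""
    checkpoints = sorted(glob.glob(os.path.join(checkpoint_dir, 'checkpoint_epoch_*.pt')),
                         key=os.path.getctime)
    if not checkpoints:
        return netG, optG, netD, optD, 1
    state = torch.load(checkpoints[-1], map_location=device)
    netG.load_state_dict(state['netG'])
    optG.load_state_dict(state['optG'])
    netD.load_state_dict(state['netD'])
    optD.load_state_dict(state['optD'])
    return netG, optG, netD, optD, state['epoch'] + 1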
def train():
    model_type = FLAGS.model_type
    run_desc = FLAGS.run_desc
    data_dir = Path(FLAGS.data_dir)
    checkpoints_dir = Path(FLAGS.checkpoints_dir) / model_type / run_desc
    models_dir = Path(FLAGS.models_dir) / model_type / run_desc
    results_dir = Path(FLAGS.results_dir) / model_type / run_desc
    learning_rate = LEARNING_RATE
    sent_hidden_dim = FLAGS.sent_hidden_dim
    doc_hidden_dim = FLAGS.doc_hidden_dim

    if not data_dir.exists():
        raise ValueError('Data directory does not exist')

    # create other directories if they do not exist
    create_directories(checkpoints_dir, models_dir, results_dir)

    # load the data
    print('Loading the data...')

    # get the GloVe and ELMo embeddings
    glove_dim = 0
    elmo_dim = 0
    GloVe_vectors = None
    ELMo = None
    if 'glove' in model_type:
        GloVe_vectors = GloVe()
        glove_dim = WORD_EMBED_DIM
        print('Uploaded GloVe embeddings.')
    if 'elmo' in model_type:
        ELMo = Elmo(options_file=ELMO_OPTIONS_FILE,
                    weight_file=ELMO_WEIGHT_FILE,
                    num_output_representations=1,
                    requires_grad=False,
                    dropout=0).to(DEVICE)
        elmo_dim = ELMO_EMBED_DIM
        print('Uploaded ELMo embeddings.')
    input_dim = glove_dim + elmo_dim

    # get the FNN data
    FNN = {}
    FNN_DL = {}
    for path in ['train', 'val', 'test']:
        FNN[path] = FNNDataset(data_dir / ('FNN_' + path + '.pkl'),
                               GloVe_vectors, ELMo)
        FNN_DL[path] = data.DataLoader(dataset=FNN[path],
                                       batch_size=BATCH_SIZE_FN,
                                       num_workers=0,
                                       shuffle=True,
                                       drop_last=True,
                                       collate_fn=PadSortBatchFNN())
    print('Uploaded FNN data.')
    fnn_train_sent_no = get_number_sentences(data_dir / 'FNN_train.pkl')
    fnn_train_len = len(FNN['train'])

    # initialize the model, according to the model type
    print('Initializing the model...', end=' ')
    model = HierarchicalAttentionNet(input_dim=input_dim,
                                     sent_hidden_dim=sent_hidden_dim,
                                     doc_hidden_dim=doc_hidden_dim,
                                     num_classes=NUM_CLASSES_FN,
                                     dropout=0).to(DEVICE)
    print('Working on: ', end='')
    print(DEVICE)
    print('Done!')
    print_model_parameters(model)
    print()

    # set the criterion and optimizer
    # we weigh the loss: class [0] is real, class [1] is fake
    #
    real_ratio, fake_ratio = get_class_balance(data_dir / 'FNN_train.pkl')
    weights = [(1.0 - real_ratio), (1.0 - fake_ratio)]
    print(weights)
    class_weights = torch.FloatTensor(weights).to(DEVICE)
    loss_func_fn = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

    # load the last checkpoint (if it exists)
    results = {
        'epoch': [],
        'train_loss': [],
        'train_accuracy': [],
        'val_loss': [],
        'val_accuracy': []
    }
    epoch, results, best_accuracy = load_latest_checkpoint(checkpoints_dir, model,
                                                           optimizer)
    if epoch == 0:
        print(f'Starting training at epoch {epoch + 1}...')
    else:
        print(f'Resuming training from epoch {epoch + 1}...')

    for i in range(epoch, MAX_EPOCHS):
        print(f'Epoch {i+1:0{len(str(MAX_EPOCHS))}}/{MAX_EPOCHS}:')
        model.train()

        # one epoch of training
        train_loss_fn, train_acc_fn = train_epoch_fn(FNN_DL['train'], model,
                                                     optimizer, loss_func_fn)

        # one epoch of eval
        model.eval()
        val_loss_fn, val_acc_fn = eval_epoch_fn(FNN_DL['val'], model, loss_func_fn)

        results['epoch'].append(i)
        results['train_loss'].append(train_loss_fn)
        results['train_accuracy'].append(train_acc_fn)
        results['val_loss'].append(val_loss_fn)
        results['val_accuracy'].append(val_acc_fn)
        #print(results)

        best_accuracy = torch.tensor(val_acc_fn).max().item()
        create_checkpoint(checkpoints_dir, i, model, optimizer, results,
                          best_accuracy)

        if (i + 1) % 4 == 0 and i != 0:
            learning_rate = learning_rate / 2
            optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

    # save and plot the results
    save_results(results_dir, results, model)
    save_model(models_dir, model)
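# `load_latest_checkpoint` and `create_checkpoint` are used by these HAN training
# scripts but defined elsewhere. A minimal sketch matching the call sites; the file
# naming scheme and the defaults returned when no checkpoint exists are assumptions:
import torch


def create_checkpoint(checkpoints_dir, epoch, model, optimizer, results, best_accuracy):
    """Save a per-epoch checkpoint together with the running results dict."""
    path = checkpoints_dir / f'{model.__class__.__name__}_checkpoint_{epoch}.pt'
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'results': results,
        'best_accuracy': best_accuracy,
    }, path)


def load_latest_checkpoint(checkpoints_dir, model, optimizer):
    """Restore the newest checkpoint; return (next_epoch, results, best_accuracy)."""
    checkpoints = sorted(checkpoints_dir.glob('*_checkpoint_*.pt'),
                         key=lambda p: p.stat().st_ctime)
    if not checkpoints:
        # no checkpoint yet: start from scratch with an empty results dict (assumed)
        return 0, {'epoch': [], 'train_loss': [], 'train_accuracy': [],
                   'val_loss': [], 'val_accuracy': []}, 0.0
    state = torch.load(checkpoints[-1])
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    return state['epoch'] + 1, state['results'], state['best_accuracy']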
def train():
    model_type = FLAGS.model_type
    run_desc = FLAGS.run_desc
    data_dir = Path(FLAGS.data_dir)
    checkpoints_dir = Path(FLAGS.checkpoints_dir) / model_type / run_desc
    models_dir = Path(FLAGS.models_dir) / model_type / run_desc
    results_dir = Path(FLAGS.results_dir) / model_type / run_desc
    #data_percentage = FLAGS.data_percentage

    if model_type == 'STL':
        only_fn = True
    else:
        only_fn = False

    # check if data directory exists
    if not data_dir.exists():
        raise ValueError('Data directory does not exist')

    # create other directories if they do not exist
    create_directories(checkpoints_dir, models_dir, results_dir)

    # load the data
    print('Loading the data...')

    # get the GloVe and ELMo embeddings
    GloVe_vectors = GloVe()
    print('Uploaded GloVe embeddings.')
    # ELMo = Elmo(
    #     options_file=ELMO_OPTIONS_FILE,
    #     weight_file=ELMO_WEIGHT_FILE,
    #     num_output_representations=1,
    #     requires_grad=False,
    #     dropout=0).to(DEVICE)
    # print('Uploaded Elmo embeddings.')

    # get the FNN and SNLI data
    FNN = {}
    FNN_DL = {}
    for path in ['train', 'val', 'test']:
        FNN[path] = FNNDataset(data_dir / ('FNN_' + path + '.pkl'), GloVe_vectors)
        FNN_DL[path] = data.DataLoader(dataset=FNN[path],
                                       batch_size=BATCH_SIZE_FN,
                                       num_workers=0,
                                       shuffle=True,
                                       drop_last=True,
                                       collate_fn=PadSortBatch())
    print('Uploaded FNN data.')

    if not only_fn:
        SNLI = {}
        SNLI_DL = {}
        for path in ['train', 'val', 'test']:
            SNLI[path] = SNLIDataset(data_dir / ('SNLI_' + path + '.pkl'),
                                     GloVe_vectors)
            SNLI_DL[path] = data.DataLoader(dataset=SNLI[path],
                                            batch_size=BATCH_SIZE_NLI,
                                            num_workers=0,
                                            shuffle=True,
                                            drop_last=True,
                                            collate_fn=PadSortBatchSNLI())
        print('Uploaded SNLI data.')
        snli_train_sent_no = len(SNLI['train']) * 2
        snli_train_len = len(SNLI['train'])

    fnn_train_sent_no = get_number_sentences(data_dir / 'FNN_train.pkl')
    fnn_train_len = len(FNN['train'])

    # initialize the model, according to the model type
    print('Initializing the model...', end=' ')
    if model_type == 'MTL':
        NUM_CLASSES_NLI = 3
        print("Loading an MTL HAN model.")
    elif model_type == 'STL':
        NUM_CLASSES_NLI = None
        print("Loading an STL HAN model.")
    elif model_type == 'Transfer':
        print("Nothing for now.")

    if ELMO_EMBED_DIM is not None:
        # input_dim = WORD_EMBED_DIM + ELMO_EMBED_DIM
        input_dim = WORD_EMBED_DIM
    else:
        input_dim = WORD_EMBED_DIM

    model = HierarchicalAttentionNet(input_dim=input_dim,
                                     hidden_dim=WORD_HIDDEN_DIM,
                                     num_classes_task_fn=NUM_CLASSES_FN,
                                     embedding=None,
                                     num_classes_task_nli=NUM_CLASSES_NLI,
                                     dropout=0).to(DEVICE)
    print('Working on: ', end='')
    print(DEVICE)
    print('Done!')
    print_model_parameters(model)
    print()

    # set the criterion and optimizer
    # we weigh the loss: class [0] is real, class [1] is fake
    #
    real_ratio, fake_ratio = get_class_balance(data_dir / 'FNN_train.pkl')
    weights = [(1.0 - real_ratio), (1.0 - fake_ratio)]
    print(weights)
    class_weights = torch.FloatTensor(weights).to(DEVICE)
    loss_func_fn = nn.CrossEntropyLoss(weight=class_weights)
    if not only_fn:
        loss_func_nli = nn.CrossEntropyLoss()
        temperature = 2
    optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

    # load the last checkpoint (if it exists)
    epoch, results, best_accuracy = load_latest_checkpoint(checkpoints_dir, model,
                                                           optimizer)
    results_fn = {
        'epoch': [],
        'train_loss': [],
        'train_accuracy': [],
        'val_loss': [],
        'val_accuracy': []
    }
    results_nli = {
        'epoch': [],
        'train_loss': [],
        'train_accuracy': [],
        'val_loss': [],
        'val_accuracy': []
    }
    results = {'fn': results_fn, 'nli': results_nli}

    if epoch == 0:
        print(f'Starting training at epoch {epoch + 1}...')
    else:
        print(f'Resuming training from epoch {epoch + 1}...')
    for i in range(epoch, MAX_EPOCHS):
        print(f'Epoch {i+1:0{len(str(MAX_EPOCHS))}}/{MAX_EPOCHS}:')
        model.train()

        # one epoch of training
        if only_fn:
            train_loss_fn, train_acc_fn = train_epoch_fn(FNN_DL['train'], model,
                                                         optimizer, loss_func_fn)
        elif model_type == 'MTL':
            model.train()
            train_loss_fn = []
            train_acc_fn = []
            loss_fn_weight_gradnorm = 1
            train_loss_nli = []
            train_acc_nli = []
            loss_nli_weight_gradnorm = 1

            # define by sentence number
            #loss_fn_weight_dataset = 1 - fnn_train_sent_no / (fnn_train_sent_no + snli_train_sent_no)
            #loss_nli_weight_dataset = 1 - snli_train_sent_no / (fnn_train_sent_no + snli_train_sent_no)
            loss_fn_weight_dataset = 1 - fnn_train_len / (fnn_train_len + snli_train_len)
            loss_nli_weight_dataset = 1 - snli_train_len / (fnn_train_len + snli_train_len)

            chance_fn = 1000 * (fnn_train_len / BATCH_SIZE_FN) / (
                (fnn_train_len / BATCH_SIZE_FN) + (snli_train_len / BATCH_SIZE_NLI))

            iterator_fnn = enumerate(FNN_DL['train'])
            iterator_snli = enumerate(SNLI_DL['train'])
            done_fnn, done_snli = False, False
            step_fnn = 0
            step_snli = 0

            print(f'Train set length, FNN: {fnn_train_len}. '
                  f'Train set length, SNLI: {snli_train_len}.')
            print(f'Training set to batch size ratio for Fake News Detection '
                  f'is {fnn_train_len / BATCH_SIZE_FN}.')
            print(f'Training set to batch size ratio for Language Inference '
                  f'is {snli_train_len / BATCH_SIZE_NLI}.')

            while not (done_fnn and done_snli):
                if len(train_loss_fn) > 1 and len(train_loss_nli) > 1:
                    # compute loss weights based on the losses from the previous iterations
                    loss_fn_ratio = train_loss_fn[len(train_loss_fn) - 1] / \
                        train_loss_fn[len(train_loss_fn) - 2]
                    loss_nli_ratio = train_loss_nli[len(train_loss_nli) - 1] / \
                        train_loss_nli[len(train_loss_nli) - 2]
                    loss_fn_exp = math.exp(loss_fn_ratio / temperature)
                    loss_nli_exp = math.exp(loss_nli_ratio / temperature)
                    loss_fn_weight_gradnorm = loss_fn_exp / (loss_fn_exp + loss_nli_exp)
                    loss_nli_weight_gradnorm = loss_nli_exp / (loss_fn_exp + loss_nli_exp)
                    loss_fn_weight = math.exp(
                        loss_fn_weight_dataset * loss_fn_weight_gradnorm) / (
                            math.exp(loss_fn_weight_dataset * loss_fn_weight_gradnorm) +
                            math.exp(loss_nli_weight_dataset * loss_nli_weight_gradnorm))
                    loss_nli_weight = math.exp(
                        loss_nli_weight_dataset * loss_nli_weight_gradnorm) / (
                            math.exp(loss_fn_weight_dataset * loss_fn_weight_gradnorm) +
                            math.exp(loss_nli_weight_dataset * loss_nli_weight_gradnorm))
                else:
                    loss_fn_weight = loss_fn_weight_dataset
                    loss_nli_weight = loss_nli_weight_dataset

                # define the total loss function
                #loss_func = loss_func_fn + loss_func_nli # is this needed?

                if np.random.randint(0, 1000) < chance_fn:
                    try:
                        step_fnn, batch_fnn = next(iterator_fnn)
                    except StopIteration:
                        done_fnn = True
                    else:
                        try:
                            batch_loss_fn, batch_acc_fn = train_batch_fn(
                                batch_fnn, model, optimizer, loss_func_fn,
                                loss_fn_weight)
                            train_loss_fn.append(batch_loss_fn)
                            train_acc_fn.append(batch_acc_fn)
                        except:
                            print('Error in batch')
                else:
                    try:
                        step_snli, batch_snli = next(iterator_snli)
                    except StopIteration:
                        done_snli = True
                    else:
                        try:
                            batch_loss_nli, batch_acc_nli = train_batch_nli(
                                batch_snli, model, optimizer, loss_func_nli,
                                loss_nli_weight)
                            train_loss_nli.append(batch_loss_nli)
                            train_acc_nli.append(batch_acc_nli)
                        except:
                            print('Error in batch')

                print(f'FNN batch {step_fnn}')
                print(f'SNLI batch {step_snli}')
                if step_fnn % 50 == 0 and step_fnn != 0:
                    print(f'Processed {step_fnn} FNN batches.')
                    print(f'Accuracy: {train_acc_fn[len(train_acc_fn)-1]}.')
                    print(f'Weight for loss for NLI is {loss_nli_weight}, '
                          f'for loss for FN is {loss_fn_weight}.')
                if step_snli % 50 == 0 and step_snli != 0:
                    print(f'Processed {step_snli} SNLI batches.')
                    print(f'Accuracy: {train_acc_nli[len(train_acc_nli)-1]}.')
                    print(f'Weight for loss for NLI is {loss_nli_weight}, '
                          f'for loss for FN is {loss_fn_weight}.')

        # one epoch of eval
        model.eval()
        val_loss_fn, val_acc_fn = eval_epoch_fn(FNN_DL['val'], model, loss_func_fn)
        tasks = ['fn']
        if model_type == 'MTL':
            val_loss_nli, val_acc_nli = eval_epoch_nli(SNLI_DL['val'], model,
                                                       loss_func_nli)
            tasks.append('nli')

        for task in tasks:
            results[task]['epoch'].append(i)
            if task == 'fn':
                temp_train_loss = train_loss_fn
                temp_val_loss = val_loss_fn
                temp_train_acc = train_acc_fn
                temp_val_acc = val_acc_fn
            elif task == 'nli':
                temp_train_loss = train_loss_nli
                temp_val_loss = val_loss_nli
                temp_train_acc = train_acc_nli
                temp_val_acc = val_acc_nli
            results[task]['train_loss'].append(temp_train_loss)
            results[task]['train_accuracy'].append(temp_train_acc)
            results[task]['val_loss'].append(temp_val_loss)
            results[task]['val_accuracy'].append(temp_val_acc)
            print(results)

            best_accuracy = torch.tensor(temp_val_acc).max().item()
            create_checkpoint(checkpoints_dir, epoch, model, optimizer, results,
                              best_accuracy)

    # save and plot the results
    save_results(results_dir, results, model)
    save_model(models_dir, model)
    plot_results(results_dir, results, model)
def train():
    data_dir = Path(FLAGS.data_dir)
    checkpoints_dir = Path(FLAGS.checkpoints_dir)
    models_dir = Path(FLAGS.models_dir)
    results_dir = Path(FLAGS.results_dir)

    if not data_dir.exists():
        raise ValueError('Data directory does not exist')

    # create other directories if they do not exist
    create_directories(checkpoints_dir, models_dir, results_dir)

    # load the data
    print('Loading the data...')
    adj_file = data_dir / 'adj_matrix.npz'
    features_file = data_dir / 'features_matrix.pkl'
    labels_file = data_dir / 'labels_matrix.pkl'
    splits_file = data_dir / 'splits_dict.pkl'
    adj, features, labels, splits_dict = load_data(adj_file, features_file,
                                                   labels_file, splits_file)
    train_idxs = splits_dict['train']
    val_idxs = splits_dict['val']
    test_idxs = splits_dict['test']

    # initialize the model, according to the model type
    print('Initializing the model...')
    model = GraphConvolutionalNetwork(
        input_dim=features.shape[1],
        hidden_dim=HIDDEN_DIM,
        num_classes=labels.max().item() + 1,
        dropout=DROPOUT
    ).to(DEVICE)
    # print_model_parameters(model)

    # set the criterion and optimizer
    print('Initializing the criterion and optimizer')
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(
        params=model.parameters(),
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY
    )

    # initialize the results dict
    results = {
        'epoch': [],
        'train_loss': [],
        'train_acc': [],
        'val_loss': [],
        'val_acc': []
    }

    print('Starting training at epoch 1...')
    for i in range(0, MAX_EPOCHS):
        st = time()

        # train
        model.train()
        optimizer.zero_grad()

        # forward pass
        output = model(features, adj)

        # compute the training loss and accuracy
        train_targets = labels[train_idxs].max(dim=1).indices
        train_loss = criterion(output[train_idxs], train_targets)
        train_acc = accuracy(output[train_idxs], train_targets)

        # backpropagate the loss
        train_loss.backward()
        optimizer.step()

        # evaluate
        model.eval()
        output = model(features, adj)
        val_targets = labels[val_idxs].max(dim=1).indices
        val_loss = criterion(output[val_idxs], val_targets)
        val_acc = accuracy(output[val_idxs], val_targets)

        # record results
        results['epoch'].append(i)
        results['train_loss'].append(train_loss.item())
        results['train_acc'].append(train_acc.item())
        results['val_loss'].append(val_loss.item())
        results['val_acc'].append(val_acc.item())

        # print update
        print(f'Epoch: {i+1:02d} Train loss: {train_loss.item():0.4f} '
              f'Train acc: {train_acc:0.4f} Val loss: {val_loss.item():0.4f} '
              f'Val acc: {val_acc:0.4f} done in {time() - st} s')

        # create a checkpoint
        create_checkpoint(checkpoints_dir, i, model, optimizer, results)

    # test
    model.eval()
    output = model(features, adj)
    test_targets = labels[test_idxs].max(dim=1).indices
    test_loss = criterion(output[test_idxs], test_targets)
    test_acc = accuracy(output[test_idxs], test_targets)

    # record results
    results['test_loss'] = test_loss.item()
    results['test_acc'] = test_acc.item()

    # save the model and results
    save_model(models_dir, model)
    save_results(results_dir, results, model)
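# The `accuracy` helper used above is not defined in this snippet. A straightforward
# sketch of what it typically computes for class-index targets (the name and
# signature match the call sites; the implementation is an assumption):
import torch


def accuracy(output, targets):
    """Fraction of predictions (argmax over class scores) that match the targets."""
    preds = output.argmax(dim=1)
    return (preds == targets).float().mean()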
def main(cfg):
    os.makedirs(str(cfg.output_dir + f"/fold{cfg.fold}/"), exist_ok=True)

    # set random seed; works when using all data to train
    if cfg.seed < 0:
        cfg.seed = np.random.randint(1_000_000)
    set_seed(cfg.seed)

    # set dataset, dataloader
    train = pd.read_csv(cfg.train_df)

    if cfg.fold == -1:
        val_df = train[train["fold"] == 0]
    else:
        val_df = train[train["fold"] == cfg.fold]
    train_df = train[train["fold"] != cfg.fold]

    train_dataset = get_train_dataset(train_df, cfg)
    val_dataset = get_val_dataset(val_df, cfg)
    train_dataloader = get_train_dataloader(train_dataset, cfg)
    val_dataloader = get_val_dataloader(val_dataset, cfg)
    if cfg.train_val is True:
        train_val_dataset = get_val_dataset(train_df, cfg)
        train_val_dataloader = get_val_dataloader(train_val_dataset, cfg)

    to_device_transform = ToDeviced(
        keys=("input", "target", "mask", "is_annotated"), device=cfg.device)
    cfg.to_device_transform = to_device_transform

    # set model
    model = RanzcrNet(cfg)
    model.to(cfg.device)

    # set optimizer, lr scheduler
    total_steps = len(train_dataset)
    optimizer = get_optimizer(model, cfg)
    scheduler = get_scheduler(cfg, optimizer, total_steps)

    # set other tools
    if cfg.mixed_precision:
        scaler = GradScaler()
    else:
        scaler = None
    writer = SummaryWriter(str(cfg.output_dir + f"/fold{cfg.fold}/"))

    # train and val loop
    step = 0
    i = 0
    best_val_loss = np.inf
    optimizer.zero_grad()

    for epoch in range(cfg.epochs):
        print("EPOCH:", epoch)
        gc.collect()

        if cfg.train is True:
            run_train(
                model=model,
                train_dataloader=train_dataloader,
                optimizer=optimizer,
                scheduler=scheduler,
                cfg=cfg,
                scaler=scaler,
                writer=writer,
                epoch=epoch,
                iteration=i,
                step=step,
            )

        if (epoch + 1) % cfg.eval_epochs == 0 or (epoch + 1) == cfg.epochs:
            val_loss = run_eval(
                model=model,
                val_dataloader=val_dataloader,
                cfg=cfg,
                writer=writer,
                epoch=epoch,
            )

            if cfg.train_val is True:
                if (epoch + 1) % cfg.eval_train_epochs == 0 or (epoch + 1) == cfg.epochs:
                    train_val_loss = run_eval(model, train_val_dataloader, cfg,
                                              writer, epoch)
                    print(f"train_val_loss {train_val_loss:.5}")

            if val_loss < best_val_loss:
                print(f"SAVING CHECKPOINT: val_loss {best_val_loss:.5} -> {val_loss:.5}")
                best_val_loss = val_loss

                checkpoint = create_checkpoint(
                    model,
                    optimizer,
                    epoch,
                    scheduler=scheduler,
                    scaler=scaler,
                )
                torch.save(
                    checkpoint,
                    f"{cfg.output_dir}/fold{cfg.fold}/checkpoint_best_seed{cfg.seed}.pth",
                )
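# In this script `create_checkpoint` only builds the state dict; the caller decides
# where to save it with torch.save. A minimal sketch consistent with the call site
# (the dictionary key names are assumptions):
def create_checkpoint(model, optimizer, epoch, scheduler=None, scaler=None):
    """Bundle everything needed to resume: model, optimizer, epoch, and optional
    scheduler / AMP scaler state."""
    checkpoint = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "epoch": epoch,
    }
    if scheduler is not None:
        checkpoint["scheduler"] = scheduler.state_dict()
    if scaler is not None:
        checkpoint["scaler"] = scaler.state_dict()
    return checkpoint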