def train(train_file, test_file, batch_size, epoch_limit, gpu_mode, num_workers, retrain_model,
          retrain_model_path, gru_layers, hidden_size, lr, decay, model_dir, stats_dir,
          train_mode, world_size, rank, device_id):
    """Train the transducer model on one process of a distributed job.

    Each process builds its own data loader (sharded by a DistributedSampler),
    model, Adam optimizer and weighted cross-entropy loss, then trains for
    epochs ``start_epoch .. epoch_limit - 1``. After every epoch rank 0 runs
    ``test`` and, when ``train_mode`` is True, saves a checkpoint and appends
    to the stats files.

    :param train_file: Passed to ``SequenceDataset`` to build the training set.
    :param test_file: Passed to ``test`` after each epoch (rank 0 only).
    :param batch_size: Per-process training batch size; ``test`` is called with
        ``batch_size * world_size``.
    :param epoch_limit: Number of epochs to run; when retraining in train mode
        it is added to the epoch count stored in the checkpoint.
    :param gpu_mode: If true, tensors and the model are moved to ``device_id``
        and the model is wrapped in DistributedDataParallel.
    :param num_workers: Forwarded to ``test`` only; the training loader here
        uses a fixed ``num_workers=0``.
    :param retrain_model: If True, the model and optimizer are loaded from
        ``retrain_model_path`` instead of being created fresh.
    :param retrain_model_path: Checkpoint file used when ``retrain_model`` is True.
    :param gru_layers: GRU layer count for a new model (replaced by the loaded
        value when retraining).
    :param hidden_size: GRU hidden size for a new model (replaced by the loaded
        value when retraining).
    :param lr: Adam learning rate.
    :param decay: Adam weight decay.
    :param model_dir: Prefix of the checkpoint file names.
    :param stats_dir: Prefix of the stats file names.
    :param train_mode: True for regular training (stats + checkpoints on rank 0);
        False for the hyperband setup with early stopping.
    :param world_size: Number of processes in the job.
    :param rank: Rank of this process; rank 0 does logging, testing and saving.
    :param device_id: CUDA device used by this process.
    :return: ``(transducer_model, model_optimizer, stats)``. ``stats`` holds
        ``'loss_epoch'`` / ``'accuracy_epoch'`` lists of ``(epoch, value)`` and the
        latest ``'loss'`` / ``'accuracy'``; these are only filled on rank 0.
    """

    def _log(message):
        # Timestamped stderr line in the same format used throughout this function.
        sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') + "] " + message + "\n")

    is_master = rank == 0

    train_loss_logger = None
    test_loss_logger = None
    confusion_matrix_logger = None

    try:
        # Only rank 0 writes stats files, and only in regular training mode.
        if train_mode is True and is_master:
            train_loss_logger = open(stats_dir + "train_loss.csv", 'w')
            test_loss_logger = open(stats_dir + "test_loss.csv", 'w')
            confusion_matrix_logger = open(stats_dir + "confusion_matrix.txt", 'w')

        torch.cuda.set_device(device_id)

        if is_master:
            _log("INFO: LOADING DATA")

        train_data_set = SequenceDataset(train_file)
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_data_set, num_replicas=world_size, rank=rank)

        # shuffle must stay False here: ordering is delegated to the sampler.
        # NOTE(review): num_workers is fixed at 0 although a num_workers argument
        # exists; left unchanged because it may be deliberate - confirm.
        train_loader = torch.utils.data.DataLoader(dataset=train_data_set,
                                                   batch_size=batch_size,
                                                   shuffle=False,
                                                   num_workers=0,
                                                   pin_memory=True,
                                                   sampler=train_sampler)

        num_classes = ImageSizeOptions.TOTAL_LABELS

        if retrain_model is True:
            if os.path.isfile(retrain_model_path) is False:
                _log("ERROR: INVALID PATH TO RETRAIN PATH MODEL --retrain_model_path")
                sys.exit(1)
            _log("INFO: RETRAIN MODEL LOADING")
            # The checkpoint's architecture overrides the gru_layers / hidden_size arguments.
            transducer_model, hidden_size, gru_layers, prev_ite = \
                ModelHandler.load_simple_model_for_training(retrain_model_path,
                                                            input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                                            image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                                            seq_len=ImageSizeOptions.SEQ_LENGTH,
                                                            num_classes=num_classes)
            if train_mode is True:
                # Continue counting epochs from where the checkpoint stopped.
                epoch_limit = prev_ite + epoch_limit
            _log("INFO: RETRAIN MODEL LOADED")
        else:
            transducer_model = ModelHandler.get_new_gru_model(
                input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                image_features=ImageSizeOptions.IMAGE_HEIGHT,
                gru_layers=gru_layers,
                hidden_size=hidden_size,
                num_classes=num_classes)
            prev_ite = 0

        param_count = sum(p.numel() for p in transducer_model.parameters() if p.requires_grad)
        if is_master:
            _log("INFO: TOTAL TRAINABLE PARAMETERS:\t" + str(param_count))
            sys.stderr.flush()

        model_optimizer = torch.optim.Adam(transducer_model.parameters(), lr=lr, weight_decay=decay)

        if retrain_model is True:
            _log("INFO: OPTIMIZER LOADING")
            model_optimizer = ModelHandler.load_simple_optimizer(
                model_optimizer, retrain_model_path, gpu_mode)
            _log("INFO: OPTIMIZER LOADED")
        sys.stderr.flush()

        if gpu_mode:
            transducer_model = transducer_model.to(device_id)
            transducer_model = nn.parallel.DistributedDataParallel(
                transducer_model, device_ids=[device_id])

        # Loss
        class_weights = torch.Tensor(ImageSizeOptions.class_weights)
        criterion = nn.CrossEntropyLoss(class_weights)
        if gpu_mode is True:
            criterion = criterion.to(device_id)

        start_epoch = prev_ite

        # Train the Model
        if is_master:
            _log("INFO: TRAINING STARTING")
            _log("INFO: START: " + str(start_epoch + 1) + " END: " + str(epoch_limit))

        stats = dict()
        stats['loss_epoch'] = []
        stats['accuracy_epoch'] = []

        for epoch in range(start_epoch, epoch_limit, 1):
            start_time = time.time()
            total_loss = 0
            total_images = 0

            # Without this the DistributedSampler yields the same order every epoch.
            train_sampler.set_epoch(epoch)

            if is_master:
                _log("INFO: TRAIN EPOCH: " + str(epoch + 1))

            # make sure the model is in train mode. BN is different in train and eval.
            batch_no = 1
            transducer_model.train()
            for images, labels in train_loader:
                labels = labels.type(torch.LongTensor)
                images = images.type(torch.FloatTensor)
                if gpu_mode:
                    images = images.to(device_id)
                    labels = labels.to(device_id)

                # Size the initial hidden state from the architecture actually in use
                # (arguments, or the values loaded from the checkpoint), not from the
                # global TrainOptions defaults, so the two can never disagree.
                hidden = torch.zeros(images.size(0), 2 * gru_layers, hidden_size)
                if gpu_mode:
                    hidden = hidden.to(device_id)

                # Slide a TRAIN_WINDOW-wide window over the sequence axis; one optimizer
                # step per window.
                for i in range(0, ImageSizeOptions.SEQ_LENGTH, TrainOptions.WINDOW_JUMP):
                    model_optimizer.zero_grad()

                    # Stop once a full window no longer fits.
                    if i + TrainOptions.TRAIN_WINDOW > ImageSizeOptions.SEQ_LENGTH:
                        break

                    image_chunk = images[:, i:i + TrainOptions.TRAIN_WINDOW]
                    label_chunk = labels[:, i:i + TrainOptions.TRAIN_WINDOW]

                    output_, hidden = transducer_model(image_chunk, hidden)

                    loss = criterion(output_.contiguous().view(-1, num_classes),
                                     label_chunk.contiguous().view(-1))
                    loss.backward()
                    model_optimizer.step()

                    total_loss += loss.item()
                    total_images += image_chunk.size(0)

                    # Carry the state to the next window but cut the autograd graph.
                    hidden = hidden.detach()

                # Running average over every window processed so far in this epoch.
                avg_loss = (total_loss / total_images) if total_images else 0

                if train_mode is True and is_master:
                    train_loss_logger.write(
                        str(epoch + 1) + "," + str(batch_no) + "," + str(avg_loss) + "\n")

                if is_master and batch_no % 10 == 0:
                    percent_complete = int((100 * batch_no) / len(train_loader))
                    time_now = time.time()
                    mins = int((time_now - start_time) / 60)
                    secs = int((time_now - start_time)) % 60

                    _log("INFO: " + "EPOCH: " + str(epoch + 1)
                         + " BATCH: " + str(batch_no) + "/" + str(len(train_loader))
                         + " LOSS: " + str(avg_loss)
                         + " COMPLETE (" + str(percent_complete) + "%)"
                         + " [ELAPSED TIME: " + str(mins) + " Min " + str(secs) + " Sec]")
                    sys.stderr.flush()
                batch_no += 1

            # All ranks finish the epoch before rank 0 evaluates.
            dist.barrier()

            if is_master:
                stats_dictioanry = test(test_file, batch_size * world_size, gpu_mode, transducer_model,
                                        num_workers, gru_layers, hidden_size,
                                        num_classes=ImageSizeOptions.TOTAL_LABELS)
                stats['loss'] = stats_dictioanry['loss']
                stats['accuracy'] = stats_dictioanry['accuracy']
                stats['loss_epoch'].append((epoch, stats_dictioanry['loss']))
                stats['accuracy_epoch'].append((epoch, stats_dictioanry['accuracy']))

            # Other ranks wait here until rank 0 is done testing.
            dist.barrier()

            if is_master:
                _log("INFO: TEST COMPLETED.")

            # update the loggers
            if train_mode is True and is_master:
                # save the model after each epoch
                save_best_model(
                    transducer_model, model_optimizer, hidden_size, gru_layers, epoch,
                    model_dir + "_epoch_" + str(epoch + 1) + '_checkpoint.pkl')

                test_loss_logger.write(
                    str(epoch + 1) + "," + str(stats['loss']) + "," + str(stats['accuracy']) + "\n")
                confusion_matrix_logger.write(
                    str(epoch + 1) + "\n" + str(stats_dictioanry['confusion_matrix']) + "\n")
                train_loss_logger.flush()
                test_loss_logger.flush()
                confusion_matrix_logger.flush()
            elif train_mode is False:
                # this setup is for hyperband
                # NOTE(review): stats['accuracy'] is only assigned on rank 0, so other
                # ranks would raise KeyError here, and a return on one rank leaves the
                # rest waiting at the next barrier - confirm this mode is only used
                # with a single process.
                # NOTE(review): the 98 threshold presumably treats accuracy as a
                # percentage - confirm against test().
                if epoch + 1 >= 10 and stats['accuracy'] < 98:
                    _log("INFO: EARLY STOPPING AS THE MODEL NOT DOING WELL")
                    return transducer_model, model_optimizer, stats

            if is_master:
                time_now = time.time()
                mins = int((time_now - start_time) / 60)
                secs = int((time_now - start_time)) % 60
                _log("INFO: ELAPSED TIME FOR ONE EPOCH: " + str(mins) + " Min " + str(secs) + " Sec")

        if is_master:
            _log("INFO: FINISHED TRAINING")

        return transducer_model, model_optimizer, stats
    finally:
        # Close the stats files on every exit path (normal return, early stop, error).
        for stats_file in (train_loss_logger, test_loss_logger, confusion_matrix_logger):
            if stats_file is not None:
                stats_file.close()
def train(train_file, test_file, batch_size, epoch_limit, gpu_mode, num_workers, retrain_model,
          retrain_model_path, gru_layers, hidden_size, lr, decay, model_dir, stats_dir, train_mode):
    """Train the transducer model in a single process.

    Builds the data loader, model, Adam optimizer, ReduceLROnPlateau scheduler
    and weighted cross-entropy loss, then trains for epochs
    ``start_epoch .. epoch_limit - 1``. After every epoch ``test`` is run, the
    scheduler is stepped on the test loss and, when ``train_mode`` is True, a
    checkpoint is saved and the stats files are appended to.

    :param train_file: Passed to ``SequenceDataset`` to build the training set.
    :param test_file: Passed to ``test`` after each epoch.
    :param batch_size: Batch size for both training and testing.
    :param epoch_limit: Number of epochs to run; when retraining in train mode
        it is added to the epoch count stored in the checkpoint.
    :param gpu_mode: If true, tensors go to CUDA and the model is wrapped in
        ``torch.nn.DataParallel``.
    :param num_workers: Worker count for the training loader and for ``test``.
    :param retrain_model: If True, the model and optimizer are loaded from
        ``retrain_model_path`` instead of being created fresh.
    :param retrain_model_path: Checkpoint file used when ``retrain_model`` is True.
    :param gru_layers: GRU layer count for a new model (replaced by the loaded
        value when retraining).
    :param hidden_size: GRU hidden size for a new model (replaced by the loaded
        value when retraining).
    :param lr: Adam learning rate.
    :param decay: Adam weight decay.
    :param model_dir: Prefix of the checkpoint file names.
    :param stats_dir: Prefix of the stats file names.
    :param train_mode: True for regular training (stats + checkpoints);
        False for the hyperband setup with early stopping.
    :return: ``(transducer_model, model_optimizer, stats)`` where ``stats`` holds
        ``'loss_epoch'`` / ``'accuracy_epoch'`` lists of ``(epoch, value)`` and the
        latest ``'loss'`` / ``'accuracy'``.
    """

    def _log(message):
        # Timestamped stderr line in the same format used throughout this function.
        sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') + "] " + message + "\n")

    train_loss_logger = None
    test_loss_logger = None
    confusion_matrix_logger = None

    try:
        # Stats files are only written in regular training mode.
        if train_mode is True:
            train_loss_logger = open(stats_dir + "train_loss.csv", 'w')
            test_loss_logger = open(stats_dir + "test_loss.csv", 'w')
            confusion_matrix_logger = open(stats_dir + "confusion_matrix.txt", 'w')

        _log("INFO: LOADING DATA")
        train_data_set = SequenceDataset(train_file)
        train_loader = DataLoader(train_data_set,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=num_workers,
                                  pin_memory=gpu_mode)
        num_classes = ImageSizeOptions.TOTAL_LABELS

        if retrain_model is True:
            if os.path.isfile(retrain_model_path) is False:
                _log("ERROR: INVALID PATH TO RETRAIN PATH MODEL --retrain_model_path")
                sys.exit(1)
            _log("INFO: RETRAIN MODEL LOADING")
            # The checkpoint's architecture overrides the gru_layers / hidden_size arguments.
            transducer_model, hidden_size, gru_layers, prev_ite = \
                ModelHandler.load_simple_model_for_training(retrain_model_path,
                                                            input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                                            image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                                            seq_len=ImageSizeOptions.SEQ_LENGTH,
                                                            num_classes=num_classes)
            if train_mode is True:
                # Continue counting epochs from where the checkpoint stopped.
                epoch_limit = prev_ite + epoch_limit
            _log("INFO: RETRAIN MODEL LOADED")
        else:
            transducer_model = ModelHandler.get_new_gru_model(
                input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                image_features=ImageSizeOptions.IMAGE_HEIGHT,
                gru_layers=gru_layers,
                hidden_size=hidden_size,
                num_classes=num_classes)
            prev_ite = 0

        param_count = sum(p.numel() for p in transducer_model.parameters() if p.requires_grad)
        _log("INFO: TOTAL TRAINABLE PARAMETERS:\t" + str(param_count))

        model_optimizer = torch.optim.Adam(transducer_model.parameters(), lr=lr, weight_decay=decay)

        if retrain_model is True:
            _log("INFO: OPTIMIZER LOADING")
            model_optimizer = ModelHandler.load_simple_optimizer(
                model_optimizer, retrain_model_path, gpu_mode)
            _log("INFO: OPTIMIZER LOADED")

        # Create the scheduler only after the optimizer may have been replaced by the
        # loaded one, so it always drives the optimizer that is actually stepped.
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model_optimizer, 'min')

        if gpu_mode:
            transducer_model = torch.nn.DataParallel(transducer_model).cuda()

        # Loss
        class_weights = torch.Tensor(ImageSizeOptions.class_weights)
        criterion = nn.CrossEntropyLoss(class_weights)
        if gpu_mode is True:
            criterion = criterion.cuda()

        start_epoch = prev_ite

        # Train the Model
        _log("INFO: TRAINING STARTING")
        stats = dict()
        stats['loss_epoch'] = []
        stats['accuracy_epoch'] = []
        _log("INFO: START: " + str(start_epoch + 1) + " END: " + str(epoch_limit))

        for epoch in range(start_epoch, epoch_limit, 1):
            total_loss = 0
            total_images = 0
            _log("INFO: TRAIN EPOCH: " + str(epoch + 1))

            # make sure the model is in train mode. BN is different in train and eval.
            # nn.Module.train() returns the module, which is not a context manager,
            # so it must be called as a plain statement rather than used in `with`.
            batch_no = 1
            transducer_model.train()
            for images, labels in train_loader:
                labels = labels.type(torch.LongTensor)
                images = images.type(torch.FloatTensor)
                if gpu_mode:
                    images = images.cuda()
                    labels = labels.cuda()

                # Size the initial hidden state from the architecture actually in use
                # (arguments, or the values loaded from the checkpoint), not from the
                # global TrainOptions defaults, so the two can never disagree.
                hidden = torch.zeros(images.size(0), 2 * gru_layers, hidden_size)
                if gpu_mode:
                    hidden = hidden.cuda()

                # Slide a TRAIN_WINDOW-wide window over the sequence axis; one optimizer
                # step per window.
                for i in range(0, ImageSizeOptions.SEQ_LENGTH, TrainOptions.WINDOW_JUMP):
                    model_optimizer.zero_grad()

                    # Stop once a full window no longer fits.
                    if i + TrainOptions.TRAIN_WINDOW > ImageSizeOptions.SEQ_LENGTH:
                        break

                    image_chunk = images[:, i:i + TrainOptions.TRAIN_WINDOW]
                    label_chunk = labels[:, i:i + TrainOptions.TRAIN_WINDOW]

                    output_, hidden = transducer_model(image_chunk, hidden)

                    loss = criterion(output_.contiguous().view(-1, num_classes),
                                     label_chunk.contiguous().view(-1))
                    loss.backward()
                    model_optimizer.step()

                    total_loss += loss.item()
                    total_images += image_chunk.size(0)

                    # Carry the state to the next window but cut the autograd graph.
                    hidden = hidden.detach()

                # Running average over every window processed so far in this epoch.
                avg_loss = (total_loss / total_images) if total_images else 0
                _log("INFO: " + "EPOCH: " + str(epoch + 1)
                     + " BATCH: " + str(batch_no) + "/" + str(len(train_loader))
                     + " LOSS: " + str(avg_loss))

                if train_mode is True:
                    train_loss_logger.write(
                        str(epoch + 1) + "," + str(batch_no) + "," + str(avg_loss) + "\n")
                batch_no += 1

            stats_dictioanry = test(test_file, batch_size, gpu_mode, transducer_model, num_workers,
                                    gru_layers, hidden_size,
                                    num_classes=ImageSizeOptions.TOTAL_LABELS)
            stats['loss'] = stats_dictioanry['loss']
            stats['accuracy'] = stats_dictioanry['accuracy']
            stats['loss_epoch'].append((epoch, stats_dictioanry['loss']))
            stats['accuracy_epoch'].append((epoch, stats_dictioanry['accuracy']))

            # Let the scheduler react to the test loss of this epoch.
            lr_scheduler.step(stats['loss'])

            # update the loggers
            if train_mode is True:
                # save the model after each epoch
                save_best_model(
                    transducer_model, model_optimizer, hidden_size, gru_layers, epoch,
                    model_dir + "PEPPER_epoch_" + str(epoch + 1) + '_checkpoint.pkl')

                test_loss_logger.write(
                    str(epoch + 1) + "," + str(stats['loss']) + "," + str(stats['accuracy']) + "\n")
                confusion_matrix_logger.write(
                    str(epoch + 1) + "\n" + str(stats_dictioanry['confusion_matrix']) + "\n")
                train_loss_logger.flush()
                test_loss_logger.flush()
                confusion_matrix_logger.flush()
            else:
                # this setup is for hyperband
                # NOTE(review): the 98 threshold presumably treats accuracy as a
                # percentage - confirm against test().
                if epoch + 1 >= 10 and stats['accuracy'] < 98:
                    _log("INFO: EARLY STOPPING AS THE MODEL NOT DOING WELL")
                    return transducer_model, model_optimizer, stats

        _log("INFO: FINISHED TRAINING.")

        return transducer_model, model_optimizer, stats
    finally:
        # Close the stats files on every exit path (normal return, early stop, error).
        for stats_file in (train_loss_logger, test_loss_logger, confusion_matrix_logger):
            if stats_file is not None:
                stats_file.close()