def test(self, args, nrows=2, stats_file='model_stats'):
    """Test GAN"""
    print('Init')
    init_op = tf.global_variables_initializer()
    self.sess.run(init_op)

    # read_data returns a DataSet object with next_batch method
    normal = read_data('normal/')
    defect = read_masked('masked/')

    if self.load(self.checkpoint_dir):
        print(" [*] Load SUCCESS")
    else:
        print(" [!] Load failed. Continuing")

    batch_idxs = 781 // self.batch_size
    start = time.time()
    for i in range(0, batch_idxs):  # was xrange (Python 2); range works in both
        print('Generating samples for batch %2d, time: %4.4f'
              % (i, time.time() - start))
        normal_test_batch = normal.next_batch(self.batch_size, which='test/', labels=True)
        defect_test_batch = defect.next_batch(self.batch_size, which='test/', labels=True)
        file_combinations = zip(normal_test_batch[1], defect_test_batch[1])
        self._samples = self.sess.run(
            self.generated_sample,
            feed_dict={self.normal_xrays: normal_test_batch[0],
                       self.masked_bbox: defect_test_batch[0]})
        image_filename = './{}/test_{:04d}.png'.format(args.test_dir, i)
        save_images(images=self._samples,
                    size=[nrows, self.batch_size // nrows],
                    image_path=image_filename)
        save_stats(filename=stats_file,
                   image_name=image_filename,
                   labels=file_combinations)
def train_model(self):
    """Run validation and training for a fixed number of epochs, then a final validation pass."""
    self.train_loss, self.train_acc = [], []
    self.valid_loss, self.valid_acc = [], []

    for epoch in range(10):
        self.valid_epoch()
        print(f"Validation Accuracy: {self.valid_acc[-1]}")
        print(f"Validation Loss: {self.valid_loss[-1]}")

        self.train_epoch()
        print(f"Training Accuracy: {self.train_acc[-1]}")
        print(f"Training Loss: {self.train_loss[-1]}")

        save_stats(train_loss=self.train_loss,
                   valid_loss=self.valid_loss,
                   train_acc=self.train_acc,
                   valid_acc=self.valid_acc)

    self.valid_epoch()
    print(f"Final Accuracy: {self.valid_acc[-1]}")
    print(f"Final Loss: {self.valid_loss[-1]}")
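# A minimal sketch of the keyword-style save_stats() assumed by train_model() above: it
# dumps the accumulated per-epoch lists to a JSON file. The default file name and the JSON
# format are assumptions for illustration only; the other snippets in this section call
# save_stats() with different signatures, so this matches only the keyword-list usage above.
import json

def save_stats(path='training_stats.json', **curves):
    """Persist metric curves passed as keyword lists, e.g. train_loss=[...], valid_acc=[...]."""
    with open(path, 'w') as f:
        json.dump({name: list(values) for name, values in curves.items()}, f, indent=2)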
def test(self, args, stats_file='model_stats'):
    """Test GAN"""
    init_op = tf.global_variables_initializer()
    self.sess.run(init_op)

    # read_data returns a DataSet object with next_batch method
    normal = read_data('normal')
    defect = read_data('defect')

    if self.load(self.checkpoint_dir):
        print(" [*] Load SUCCESS")
    else:
        print(" [!] Load failed. Continuing")

    batch_idxs = defect.num_examples // self.batch_size
    batch_end_time = time.time()
    for i in range(batch_idxs):  # was `for i in batch_idxs`, which raises TypeError on an int
        # elapsed time since the previous batch finished (the original subtraction was reversed)
        print('Generating samples for batch %2d, time: %4.4f'
              % (i, time.time() - batch_end_time))
        normal_test_batch = normal.next_batch(self.batch_size)
        defect_test_batch = defect.next_batch(self.batch_size)
        file_combinations = zip(normal_test_batch[1], defect_test_batch[1])
        samples = self.sess.run(
            self.generated_sample,
            feed_dict={self.normal_xrays: normal_test_batch[0],
                       self.input_bbox: defect_test_batch[0]})
        image_filename = './{}/test_{:04d}.png'.format(args.test_dir, i)
        save_images(images=samples,
                    size=[self.batch_size, 2],
                    image_path=image_filename)
        save_stats(filename=stats_file,
                   image_name=image_filename,
                   labels=file_combinations)
        batch_end_time = time.time()
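# A minimal sketch of the DataSet contract that read_data()/read_masked() are assumed to
# return in the two test() functions above: an object exposing num_examples and a
# next_batch() method yielding (images, filenames), so batch[0] feeds the graph and
# batch[1] provides the labels passed to save_stats(). The in-memory implementation and
# the tolerated extra keyword arguments (which=, labels=) are assumptions, not the
# project's actual loader.
import numpy as np

class DataSet(object):
    def __init__(self, images, filenames):
        self.images = np.asarray(images)
        self.filenames = list(filenames)
        self.num_examples = len(self.filenames)
        self._cursor = 0

    def next_batch(self, batch_size, **kwargs):
        # sequential batching with wrap-around; kwargs absorbs which=/labels= style options
        start = self._cursor
        end = min(start + batch_size, self.num_examples)
        self._cursor = 0 if end == self.num_examples else end
        return self.images[start:end], self.filenames[start:end]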
def main(config):
    # create unique output directory for this model
    config['name'] = config['name'] + '-' + str(config['hidden_state_size'])
    if config['train_stride']:
        config['name'] = config['name'] + '-stride'
    if config['concat_labels']:
        config['name'] = config['name'] + '-concat_labels'
    if config['attention']:
        config['name'] = config['name'] + '-attention'
    if config['share_weights']:
        config['name'] = config['name'] + '-share_weights'
    config['name'] = (config['name'] + '-' + config['learning_rate_type']
                      + '-' + str(config['learning_rate']))
    timestamp = str(int(time.time()))
    config['model_dir'] = os.path.abspath(
        os.path.join(config['output_dir'], config['name'] + '-' + timestamp))
    os.makedirs(config['model_dir'])
    print('Writing checkpoints into {}'.format(config['model_dir']))

    # load the data; this requires that the *.npz files you downloaded from Kaggle
    # be named `train.npz` and `valid.npz`
    data_train = load_data(config, 'train', config['train_stride'])
    data_valid = load_data(config, 'valid', config['eval_stride'])

    # TODO if you would like to do any preprocessing of the data, here would be a good opportunity
    stats = calculate_stats(data_train.input_)
    save_stats(stats)
    if config['normalize']:
        data_train.input_, _, _ = preprocess(data_train.input_)
        data_train.target, _, _ = preprocess(data_train.target)
        data_valid.input_, _, _ = preprocess(data_valid.input_)
        data_valid.target, _, _ = preprocess(data_valid.target)
        print('Post normalize samples shape: ', data_train.input_[0].shape)

    config['input_dim'] = data_train.input_[0].shape[-1]
    config['output_dim'] = data_train.target[0].shape[-1]

    # get input placeholders and get the model that we want to train
    seq2seq_model_class, placeholders = get_model_and_placeholders(config)

    # Create a variable that stores how many training iterations we performed.
    # This is useful for saving/restoring the network.
    global_step = tf.Variable(1, name='global_step', trainable=False)

    # create a training graph; this is the graph we will use to optimize the parameters
    with tf.name_scope('Training'):
        seq2seq_model = seq2seq_model_class(config, placeholders, mode='training')
        seq2seq_model.build_graph()
        print('created RNN model with {} parameters'.format(seq2seq_model.n_parameters))

        # configure learning rate
        if config['learning_rate_type'] == 'exponential':
            lr = tf.train.exponential_decay(
                config['learning_rate'],
                global_step=global_step,
                decay_steps=config['learning_rate_decay_steps'],
                decay_rate=config['learning_rate_decay_rate'],
                staircase=False)
            lr_decay_op = tf.identity(lr)
        elif config['learning_rate_type'] == 'linear':
            lr = tf.Variable(config['learning_rate'], trainable=False)
            lr_decay_op = lr.assign(tf.multiply(lr, config['learning_rate_decay_rate']))
        elif config['learning_rate_type'] == 'fixed':
            lr = config['learning_rate']
            lr_decay_op = tf.identity(lr)
        else:
            raise ValueError('learning rate type "{}" unknown.'.format(
                config['learning_rate_type']))

        with tf.name_scope('Optimizer'):
            # TODO choose the optimizer you desire here and define `train_op`.
            # The loss should be accessible through seq2seq_model.loss
            params = tf.trainable_variables()
            optimizer = tf.train.AdamOptimizer(config['learning_rate'])
            gradients = tf.gradients(seq2seq_model.loss, params)
            # clip the gradients to counter explosion
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, config['gradient_clip'])
            # backprop
            train_op = optimizer.apply_gradients(zip(clipped_gradients, params),
                                                 global_step=global_step)

    # create a graph for validation
    with tf.name_scope('Validation'):
        seq2seq_model_valid = seq2seq_model_class(config, placeholders, mode='validation')
        seq2seq_model_valid.build_graph()

    # Create summary ops for monitoring the training.
    # Each summary op annotates a node in the computational graph and collects data from it.
    tf.summary.scalar('learning_rate', lr, collections=['training_summaries'])

    # Merge summaries used during training and reported after every step
    summaries_training = tf.summary.merge(tf.get_collection('training_summaries'))

    # create summary ops for monitoring the validation
    # caveat: we want to store the performance on the entire validation set, not just one
    # validation batch. TensorFlow does not directly support this, so we must process every
    # batch independently and then aggregate the results outside of the model.
    # So, we create a placeholder where we can feed the aggregated result back into the model.
    loss_valid_pl = tf.placeholder(tf.float32, name='loss_valid_pl')
    loss_valid_s = tf.summary.scalar('loss_valid', loss_valid_pl,
                                     collections=['validation_summaries'])

    # merge validation summaries
    summaries_valid = tf.summary.merge([loss_valid_s])

    # dump the config to the model directory in case we later want to see it
    export_config(config, os.path.join(config['model_dir'], 'config.txt'))

    with tf.Session() as sess:
        # Add the ops to initialize variables, then actually initialize them
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        # create file writers to dump summaries onto disk so that we can look at them
        # with tensorboard
        train_summary_dir = os.path.join(config['model_dir'], "summary", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
        valid_summary_dir = os.path.join(config['model_dir'], "summary", "validation")
        valid_summary_writer = tf.summary.FileWriter(valid_summary_dir, sess.graph)

        # create a saver for writing training checkpoints
        saver = tf.train.Saver(var_list=tf.trainable_variables(),
                               max_to_keep=config['n_keep_checkpoints'])

        # start training
        start_time = time.time()
        current_step = 0
        for e in range(config['n_epochs']):
            # reshuffle the batches
            data_train.reshuffle()

            # loop through all training batches
            for i, batch in enumerate(data_train.all_batches()):
                step = tf.train.global_step(sess, global_step)
                current_step += 1

                if (config['learning_rate_type'] == 'linear'
                        and current_step % config['learning_rate_decay_steps'] == 0):
                    sess.run(lr_decay_op)

                # we want to train, so must request at least the train_op
                fetches = {'summaries': summaries_training,
                           'loss': seq2seq_model.loss,
                           'train_op': train_op}

                # get the feed dict for the current batch
                feed_dict = seq2seq_model.get_feed_dict(batch)

                # feed data into the model and run optimization
                training_out = sess.run(fetches, feed_dict)

                # write logs
                train_summary_writer.add_summary(training_out['summaries'], global_step=step)

                # print training performance of this batch onto console
                time_delta = str(datetime.timedelta(seconds=int(time.time() - start_time)))
                print('\rEpoch: {:3d} [{:4d}/{:4d}] time: {:>8} loss: {:.4f}'.format(
                    e + 1, i + 1, data_train.n_batches, time_delta, training_out['loss']),
                    end='')

            # after every epoch evaluate the performance on the validation set
            total_valid_loss = 0.0
            n_valid_samples = 0
            for batch in data_valid.all_batches():
                fetches = {'loss': seq2seq_model_valid.loss}
                feed_dict = seq2seq_model_valid.get_feed_dict(batch)
                valid_out = sess.run(fetches, feed_dict)
                total_valid_loss += valid_out['loss'] * batch.batch_size
                n_valid_samples += batch.batch_size

            # write validation logs
            avg_valid_loss = total_valid_loss / n_valid_samples
            valid_summaries = sess.run(summaries_valid, {loss_valid_pl: avg_valid_loss})
            valid_summary_writer.add_summary(
                valid_summaries, global_step=tf.train.global_step(sess, global_step))

            # print validation performance onto console
            print(' | validation loss: {:.6f}'.format(avg_valid_loss))

            # save this checkpoint if necessary
            if (e + 1) % config['save_checkpoints_every_epoch'] == 0:
                saver.save(sess, os.path.join(config['model_dir'], 'model'), global_step)

            if avg_valid_loss > 10 or math.isnan(avg_valid_loss) or np.isinf(avg_valid_loss):
                break

        # Training finished, always save model before exiting
        print('Training finished')
        ckpt_path = saver.save(sess, os.path.join(config['model_dir'], 'model'), global_step)
        print('Model saved to file {}'.format(ckpt_path))
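# Usage sketch (an assumption, not part of the original script): restoring the latest
# checkpoint written by the training loop above, e.g. for a later evaluation run. It reuses
# the script's imports (tensorflow as tf, os), get_model_and_placeholders and the same
# `config` dict; only the standard tf.train.Saver / tf.train.latest_checkpoint API is used.
def restore_latest_checkpoint(config):
    tf.reset_default_graph()
    seq2seq_model_class, placeholders = get_model_and_placeholders(config)
    with tf.name_scope('Validation'):
        model = seq2seq_model_class(config, placeholders, mode='validation')
        model.build_graph()
    saver = tf.train.Saver(var_list=tf.trainable_variables())
    sess = tf.Session()
    # checkpoints were saved as <model_dir>/model-<global_step> by the loop above
    saver.restore(sess, tf.train.latest_checkpoint(config['model_dir']))
    return sess, model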
except FileExistsError:
    pass

with torch.no_grad():
    for i in range(4):
        h_views, v_views, i_views, d_views, center, gt, mask, index = test_set.get_scene(i)
        data_h = torch.tensor(h_views, device=device).float()
        data_v = torch.tensor(v_views, device=device).float()
        data_d = torch.tensor(d_views, device=device).float()

        predicted_h, _, _ = model(data_h)
        predicted_h = predicted_h.view(9, 3, 128, 128)
        predicted_v, _, _ = model(data_v)
        predicted_v = predicted_v.view(9, 3, 128, 128)

        mu_h, _ = model.encode(data_v)
        mu_v, _ = model.encode(data_h)
        predicted_d = model.decode(mu_h + mu_v).view(9, 3, 128, 128)
        test_loss += F.l1_loss(predicted_d, data_d)

        if i == 3:
            print("Save prediction:")
            show_view_sequence(predicted_h.cpu(), "horizontal", savepath=results_path)
            show_view_sequence(predicted_v.cpu(), "vertical", savepath=results_path)
            show_view_sequence(predicted_d.cpu(), "diagonal", savepath=results_path)
            show_view_sequence(h_views, "h_truth", savepath=results_path)
            show_view_sequence(v_views, "v_truth", savepath=results_path)
            show_view_sequence(d_views, "d_truth", savepath=results_path)
            show_view_sequence(d_views - predicted_d.cpu().numpy(), "difference",
                               savepath=results_path, cmap="coolwarm")

test_loss /= 4
print('====> Test set loss: {:.4f}'.format(test_loss))
save_stats("training_stats.csv", MODEL_NAME, "MSE", EPOCHS * 4,
           "{:.4f}".format(test_loss))
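# A minimal sketch of the positional save_stats() call used above, assuming it appends one
# row per evaluation to a CSV file with columns (model, metric, epochs, value). The column
# names and header handling are assumptions for illustration; the project's real helper may
# differ.
import csv
import os

def save_stats(csv_path, model_name, metric, epochs, value):
    write_header = not os.path.exists(csv_path)
    with open(csv_path, 'a', newline='') as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(['model', 'metric', 'epochs', 'value'])
        writer.writerow([model_name, metric, epochs, value])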
def main(args):
    if args.model_name is not None:
        print('Preparing to train model: {}'.format(args.model_name))

    global device
    device = torch.device('cuda' if torch.cuda.is_available() and not args.cpu else 'cpu')

    sc_will_happen = args.self_critical_from_epoch != -1

    if args.validate is None and args.lr_scheduler == 'ReduceLROnPlateau':
        print('ERROR: you need to enable validation in order to use the default '
              'lr_scheduler (ReduceLROnPlateau)')
        print('Hint: use something like --validate=coco:val2017')
        sys.exit(1)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        # transforms.Resize((256, 256)),
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    scorers = {}
    if args.validation_scoring is not None or sc_will_happen:
        assert not (args.validation_scoring is None and sc_will_happen), \
            "Please provide a metric when using self-critical training"
        for s in args.validation_scoring.split(','):
            s = s.lower().strip()
            if s == 'cider':
                from eval.cider import Cider
                scorers['CIDEr'] = Cider()
            if s == 'ciderd':
                from eval.ciderD.ciderD import CiderD
                scorers['CIDEr-D'] = CiderD(df=args.cached_words)

    ########################
    # Set Model parameters #
    ########################

    # Store parameters gotten from arguments separately:
    arg_params = ModelParams.fromargs(args)

    print("Model parameters inferred from command arguments: ")
    print(arg_params)

    start_epoch = 0

    ###############################
    # Load existing model state   #
    # and update Model parameters #
    ###############################

    state = None
    if args.load_model:
        try:
            state = torch.load(args.load_model, map_location=device)
        except AttributeError:
            print('WARNING: Old model found. Please use model_update.py on the model '
                  'before executing this script.')
            exit(1)
        new_external_features = arg_params.features.external
        params = ModelParams(state, arg_params=arg_params)
        if len(new_external_features) and params.features.external != new_external_features:
            print('WARNING: external features changed: ',
                  params.features.external, new_external_features)
            print('Updating feature paths...')
            params.update_ext_features(new_external_features)
        start_epoch = state['epoch']
        print('Loaded model {} at epoch {}'.format(args.load_model, start_epoch))
    else:
        params = arg_params
        params.command_history = []

    if params.rnn_hidden_init == 'from_features' and params.skip_start_token:
        print("ERROR: Please remove --skip_start_token if you want to use image features "
              "to initialize hidden and cell states. <start> token is needed to trigger "
              "the process of sequence generation, since we don't have image features "
              "embedding as the first input token.")
        sys.exit(1)

    # Force set the following hierarchical model parameters every time:
    if arg_params.hierarchical_model:
        params.hierarchical_model = True
        params.max_sentences = arg_params.max_sentences
        params.weight_sentence_loss = arg_params.weight_sentence_loss
        params.weight_word_loss = arg_params.weight_word_loss
        params.dropout_stopping = arg_params.dropout_stopping
        params.dropout_fc = arg_params.dropout_fc
        params.coherent_sentences = arg_params.coherent_sentences
        params.coupling_alpha = arg_params.coupling_alpha
        params.coupling_beta = arg_params.coupling_beta

    assert args.replace or \
        not os.path.isdir(os.path.join(args.output_root, args.model_path,
                                       get_model_name(args, params))) or \
        not (args.load_model and not args.validate_only), \
        '{} already exists. If you want to replace it or resume training please use --replace flag. ' \
        'If you want to validate a loaded model without training it, use --validate_only flag. ' \
        'Otherwise specify a different model name using --model_name flag.' \
        .format(os.path.join(args.output_root, args.model_path, get_model_name(args, params)))

    if args.load_model:
        print("Final model parameters (loaded model + command arguments): ")
        print(params)

    ##############################
    # Load dataset configuration #
    ##############################

    dataset_configs = DatasetParams(args.dataset_config_file)

    if args.dataset is None and not args.validate_only:
        print('ERROR: No dataset selected!')
        print('Please supply a training dataset with the argument --dataset DATASET')
        print('The following datasets are configured in {}:'.format(args.dataset_config_file))
        for ds, _ in dataset_configs.config.items():
            if ds not in ('DEFAULT', 'generic'):
                print(' ', ds)
        sys.exit(1)

    if args.validate_only:
        if args.load_model is None:
            print('ERROR: for --validate_only you need to specify a model to evaluate '
                  'using --load_model MODEL')
            sys.exit(1)
    else:
        dataset_params = dataset_configs.get_params(args.dataset)
        for i in dataset_params:
            i.config_dict['no_tokenize'] = args.no_tokenize
            i.config_dict['show_tokens'] = args.show_tokens
            i.config_dict['skip_start_token'] = params.skip_start_token
            if params.hierarchical_model:
                i.config_dict['hierarchical_model'] = True
                i.config_dict['max_sentences'] = params.max_sentences
                i.config_dict['crop_regions'] = False

    if args.validate is not None:
        validation_dataset_params = dataset_configs.get_params(args.validate)
        for i in validation_dataset_params:
            i.config_dict['no_tokenize'] = args.no_tokenize
            i.config_dict['show_tokens'] = args.show_tokens
            i.config_dict['skip_start_token'] = params.skip_start_token
            if params.hierarchical_model:
                i.config_dict['hierarchical_model'] = True
                i.config_dict['max_sentences'] = params.max_sentences
                i.config_dict['crop_regions'] = False

    #######################
    # Load the vocabulary #
    #######################

    # For pre-trained models attempt to obtain
    # saved vocabulary from the model itself:
    if args.load_model and params.vocab is not None:
        print("Loading vocabulary from the model file:")
        vocab = params.vocab
    else:
        if args.vocab is None:
            print("ERROR: You must specify the vocabulary to be used for training using "
                  "--vocab flag.\nTry --vocab AUTO if you want the vocabulary to be "
                  "either generated from the training dataset or loaded from cache.")
            sys.exit(1)
        print("Loading / generating vocabulary:")
        vocab = get_vocab(args, dataset_params)

    print('Size of the vocabulary is {}'.format(len(vocab)))

    ##########################
    # Initialize data loader #
    ##########################

    ext_feature_sets = [params.features.external, params.persist_features.external]
    if not args.validate_only:
        print('Loading dataset: {} with {} workers'.format(args.dataset, args.num_workers))
        if params.skip_start_token:
            print("Skipping the use of <start> token...")
        data_loader, ef_dims = get_loader(
            dataset_params, vocab, transform, args.batch_size,
            shuffle=True, num_workers=args.num_workers,
            ext_feature_sets=ext_feature_sets,
            skip_images=not params.has_internal_features(),
            verbose=args.verbose, unique_ids=sc_will_happen)
        if sc_will_happen:
            gts_sc = get_ground_truth_captions(data_loader.dataset)

    gts_sc_valid = None
    if args.validate is not None:
        valid_loader, ef_dims = get_loader(
            validation_dataset_params, vocab, transform, args.batch_size,
            shuffle=True, num_workers=args.num_workers,
            ext_feature_sets=ext_feature_sets,
            skip_images=not params.has_internal_features(),
            verbose=args.verbose)
        gts_sc_valid = get_ground_truth_captions(valid_loader.dataset) if sc_will_happen else None

    #########################################
    # Setup (optional) TensorBoardX logging #
    #########################################

    writer = None
    if args.tensorboard:
        if SummaryWriter is not None:
            model_name = get_model_name(args, params)
            timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
            log_dir = os.path.join(args.output_root,
                                   'log_tb/{}_{}'.format(model_name, timestamp))
            writer = SummaryWriter(log_dir=log_dir)
            print("INFO: Logging TensorBoardX events to {}".format(log_dir))
        else:
            print("WARNING: SummaryWriter object not available. "
                  "Hint: Please install TensorBoardX using pip install tensorboardx")

    ######################
    # Build the model(s) #
    ######################

    # Set per parameter learning rate here, if supplied by the user:
    if args.lr_word_decoder is not None:
        if not params.hierarchical_model:
            print("ERROR: Setting word decoder learning rate is currently supported "
                  "in the Hierarchical Model only.")
            sys.exit(1)
        lr_dict = {'word_decoder': args.lr_word_decoder}
    else:
        lr_dict = {}

    model = EncoderDecoder(params, device, len(vocab), state, ef_dims, lr_dict=lr_dict)

    ######################
    # Optimizer and loss #
    ######################

    sc_activated = False
    opt_params = model.get_opt_params()

    # Loss and optimizer
    if params.hierarchical_model:
        criterion = HierarchicalXEntropyLoss(
            weight_sentence_loss=params.weight_sentence_loss,
            weight_word_loss=params.weight_word_loss)
    elif args.share_embedding_weights:
        criterion = SharedEmbeddingXentropyLoss(param_lambda=0.15)
    else:
        criterion = nn.CrossEntropyLoss()

    if sc_will_happen:  # save it for later
        if args.self_critical_loss == 'sc':
            from model.loss import SelfCriticalLoss
            rl_criterion = SelfCriticalLoss()
        elif args.self_critical_loss == 'sc_with_diversity':
            from model.loss import SelfCriticalWithDiversityLoss
            rl_criterion = SelfCriticalWithDiversityLoss()
        elif args.self_critical_loss == 'sc_with_relative_diversity':
            from model.loss import SelfCriticalWithRelativeDiversityLoss
            rl_criterion = SelfCriticalWithRelativeDiversityLoss()
        elif args.self_critical_loss == 'sc_with_bleu_diversity':
            from model.loss import SelfCriticalWithBLEUDiversityLoss
            rl_criterion = SelfCriticalWithBLEUDiversityLoss()
        elif args.self_critical_loss == 'sc_with_repetition':
            from model.loss import SelfCriticalWithRepetitionLoss
            rl_criterion = SelfCriticalWithRepetitionLoss()
        elif args.self_critical_loss == 'mixed':
            from model.loss import MixedLoss
            rl_criterion = MixedLoss()
        elif args.self_critical_loss == 'mixed_with_face':
            from model.loss import MixedWithFACELoss
            rl_criterion = MixedWithFACELoss(vocab_size=len(vocab))
        elif args.self_critical_loss in ['sc_with_penalty', 'sc_with_penalty_throughout',
                                         'sc_masked_tokens']:
            raise ValueError('Deprecated loss, use \'sc\' loss')
        else:
            raise ValueError('Invalid self-critical loss')

        print('Selected self-critical loss is', rl_criterion)

        if start_epoch >= args.self_critical_from_epoch:
            criterion = rl_criterion
            sc_activated = True
            print('Self-critical loss training begins')

    # When using CyclicalLR, the default learning rate should always be 1.0
    if args.lr_scheduler == 'CyclicalLR':
        default_lr = 1.
    else:
        default_lr = 0.001

    if sc_activated:
        optimizer = torch.optim.Adam(
            opt_params,
            lr=args.learning_rate if args.learning_rate else 5e-5,
            weight_decay=args.weight_decay)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(opt_params, lr=default_lr,
                                     weight_decay=args.weight_decay)
    elif args.optimizer == 'rmsprop':
        optimizer = torch.optim.RMSprop(opt_params, lr=default_lr,
                                        weight_decay=args.weight_decay)
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(opt_params, lr=default_lr,
                                    weight_decay=args.weight_decay)
    else:
        print('ERROR: unknown optimizer:', args.optimizer)
        sys.exit(1)

    # We don't want to initialize the optimizer if we are transferring
    # the language model from the regular model to the hierarchical model
    transfer_language_model = False

    if arg_params.hierarchical_model and state and not state.get('hierarchical_model'):
        transfer_language_model = True

    # Set optimizer state to the one found in a loaded model, unless
    # we are doing a transfer learning step from flat to hierarchical model,
    # or we are using self-critical loss,
    # or the number of unique parameter groups has changed, or the user
    # has explicitly told us *not to* reuse optimizer parameters from before
    if state and not transfer_language_model and not sc_activated and not args.optimizer_reset:
        # Check that the number of parameter groups is the same
        if len(optimizer.param_groups) == len(state['optimizer']['param_groups']):
            optimizer.load_state_dict(state['optimizer'])

    # override lr if set explicitly in arguments -
    # 1) Global learning rate:
    if args.learning_rate:
        for param_group in optimizer.param_groups:
            param_group['lr'] = args.learning_rate
        params.learning_rate = args.learning_rate
    else:
        params.learning_rate = default_lr

    # 2) Parameter-group specific learning rate:
    if args.lr_word_decoder is not None:
        # We want to give the user an option to set the learning rate for word_decoder
        # separately. Other exceptions can be added as needed:
        for param_group in optimizer.param_groups:
            if param_group.get('name') == 'word_decoder':
                param_group['lr'] = args.lr_word_decoder
                break

    if args.validate is not None and args.lr_scheduler == 'ReduceLROnPlateau':
        print('Using ReduceLROnPlateau learning rate scheduler')
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                                               verbose=True, patience=2)
    elif args.lr_scheduler == 'StepLR':
        print('Using StepLR learning rate scheduler with step_size {}'.format(args.lr_step_size))
        # Decrease the learning rate by the factor of gamma at every
        # step_size epochs (for example every 5 or 10 epochs):
        step_size = args.lr_step_size
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size, gamma=0.5,
                                                    last_epoch=-1)
    elif args.lr_scheduler == 'CyclicalLR':
        print("Using Cyclical learning rate scheduler, lr range: [{},{}]".format(
            args.lr_cyclical_min, args.lr_cyclical_max))
        step_size = len(data_loader)
        clr = cyclical_lr(step_size, min_lr=args.lr_cyclical_min, max_lr=args.lr_cyclical_max)
        n_groups = len(optimizer.param_groups)
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [clr] * n_groups)
    elif args.lr_scheduler is not None:
        print('ERROR: Invalid learning rate scheduler specified: {}'.format(args.lr_scheduler))
        sys.exit(1)

    ###################
    # Train the model #
    ###################

    stats_postfix = None
    if args.validate_only:
        stats_postfix = args.validate
    if args.load_model:
        all_stats = init_stats(args, params, postfix=stats_postfix)
    else:
        all_stats = {}

    if args.force_epoch:
        start_epoch = args.force_epoch - 1

    if not args.validate_only:
        total_step = len(data_loader)
        print('Start training with start_epoch={:d} num_epochs={:d} num_batches={:d} ...'.format(
            start_epoch, args.num_epochs, args.num_batches))
        if args.teacher_forcing != 'always':
            print('\t k: {}'.format(args.teacher_forcing_k))
            print('\t beta: {}'.format(args.teacher_forcing_beta))
        print('Optimizer:', optimizer)

    if args.validate_only:
        stats = {}
        teacher_p = 1.0
        if args.teacher_forcing != 'always':
            print('WARNING: teacher_forcing!=always, not yet implemented for '
                  '--validate_only mode')
        epoch = start_epoch - 1

        if str(epoch + 1) in all_stats.keys() and args.skip_existing_validations:
            print('WARNING: epoch {} already validated, skipping...'.format(epoch + 1))
            return

        val_loss = do_validate(model, valid_loader, criterion, scorers, vocab, teacher_p,
                               args, params, stats, epoch, sc_activated, gts_sc_valid)
        all_stats[str(epoch + 1)] = stats
        save_stats(args, params, all_stats, postfix=stats_postfix)
    else:
        for epoch in range(start_epoch, args.num_epochs):
            stats = {}
            begin = datetime.now()

            total_loss = 0
            if params.hierarchical_model:
                total_loss_sent = 0
                total_loss_word = 0

            num_batches = 0
            vocab_counts = {'cnt': 0, 'max': 0, 'min': 9999,
                            'sum': 0, 'unk_cnt': 0, 'unk_sum': 0}

            # If start self critical training
            if (not sc_activated and sc_will_happen
                    and epoch >= args.self_critical_from_epoch):
                if all_stats:
                    best_ep, best_cider = max(
                        [(ep, all_stats[ep]['validation_cider']) for ep in all_stats],
                        key=lambda x: x[1])
                    print('Loading model from epoch', best_ep,
                          'which has the best score with', best_cider)
                    state = torch.load(get_model_path(args, params, int(best_ep)))
                    model = EncoderDecoder(params, device, len(vocab), state, ef_dims,
                                           lr_dict=lr_dict)
                    opt_params = model.get_opt_params()

                optimizer = torch.optim.Adam(opt_params, lr=5e-5,
                                             weight_decay=args.weight_decay)
                criterion = rl_criterion
                print('Self-critical loss training begins')
                sc_activated = True

            for i, data in enumerate(data_loader):
                if params.hierarchical_model:
                    (images, captions, lengths, image_ids, features,
                     sorting_order, last_sentence_indicator) = data
                    sorting_order = sorting_order.to(device)
                else:
                    (images, captions, lengths, image_ids, features) = data

                if epoch == 0:
                    unk = vocab('<unk>')
                    for j in range(captions.shape[0]):
                        # Flatten the caption in case it's a paragraph;
                        # this is harmless for regular captions too:
                        xl = captions[j, :].view(-1)
                        xw = xl > unk
                        xu = xl == unk
                        xwi = sum(xw).item()
                        xui = sum(xu).item()
                        vocab_counts['cnt'] += 1
                        vocab_counts['sum'] += xwi
                        vocab_counts['max'] = max(vocab_counts['max'], xwi)
                        vocab_counts['min'] = min(vocab_counts['min'], xwi)
                        vocab_counts['unk_cnt'] += xui > 0
                        vocab_counts['unk_sum'] += xui

                # Set mini-batch dataset
                images = images.to(device)
                captions = captions.to(device)

                # Remove <start> token from targets if we are initializing the RNN
                # hidden state from image features:
                if params.rnn_hidden_init == 'from_features' and not params.hierarchical_model:
                    # Subtract one from all lengths to match new target lengths:
                    lengths = [x - 1 if x > 0 else x for x in lengths]
                    targets = pack_padded_sequence(captions[:, 1:], lengths,
                                                   batch_first=True)[0]
                else:
                    if params.hierarchical_model:
                        targets = prepare_hierarchical_targets(
                            last_sentence_indicator, args.max_sentences,
                            lengths, captions, device)
                    else:
                        targets = pack_padded_sequence(captions, lengths,
                                                       batch_first=True)[0]
                        sorting_order = None

                init_features = (features[0].to(device)
                                 if len(features) > 0 and features[0] is not None else None)
                persist_features = (features[1].to(device)
                                    if len(features) > 1 and features[1] is not None else None)

                # Forward, backward and optimize.
                # Calculate the probability whether to use teacher forcing or not,
                # iterating over batches:
                iteration = (epoch - start_epoch) * len(data_loader) + i
                teacher_p = get_teacher_prob(args.teacher_forcing_k, iteration,
                                             args.teacher_forcing_beta)

                # Allow the model to log values at the last batch of the epoch
                writer_data = None
                if writer and (i == len(data_loader) - 1 or i == args.num_batches - 1):
                    writer_data = {'writer': writer, 'epoch': epoch + 1}

                sample_len = (captions.size(1)
                              if args.self_critical_loss in ['mixed', 'mixed_with_face']
                              else 20)
                if sc_activated:
                    sampled_seq, sampled_log_probs, outputs = model.sample(
                        images, init_features, persist_features,
                        max_seq_length=sample_len,
                        start_token_id=vocab('<start>'),
                        trigram_penalty_alpha=args.trigram_penalty_alpha,
                        stochastic_sampling=True,
                        output_logprobs=True,
                        output_outputs=True)
                    sampled_seq = model.decoder.alt_prob_to_tensor(sampled_seq, device=device)
                else:
                    outputs = model(images, init_features, captions, lengths,
                                    persist_features, teacher_p, args.teacher_forcing,
                                    sorting_order, writer_data=writer_data)

                if args.share_embedding_weights:
                    # Weights of the (HxH) projection matrix used for regularizing
                    # models that share embedding weights
                    projection = model.decoder.projection.weight
                    loss = criterion(projection, outputs, targets)
                elif sc_activated:
                    # get greedy decoding baseline
                    model.eval()
                    with torch.no_grad():
                        greedy_sampled_seq = model.sample(
                            images, init_features, persist_features,
                            max_seq_length=sample_len,
                            start_token_id=vocab('<start>'),
                            trigram_penalty_alpha=args.trigram_penalty_alpha,
                            stochastic_sampling=False)
                        greedy_sampled_seq = model.decoder.alt_prob_to_tensor(
                            greedy_sampled_seq, device=device)
                    model.train()

                    if args.self_critical_loss in [
                            'sc', 'sc_with_diversity', 'sc_with_relative_diversity',
                            'sc_with_bleu_diversity', 'sc_with_repetition']:
                        loss, advantage = criterion(
                            sampled_seq, sampled_log_probs, greedy_sampled_seq,
                            [gts_sc[i] for i in image_ids], scorers, vocab,
                            return_advantage=True)
                    elif args.self_critical_loss in ['mixed']:
                        loss, advantage = criterion(
                            sampled_seq, sampled_log_probs, outputs, greedy_sampled_seq,
                            [gts_sc[i] for i in image_ids], scorers, vocab,
                            targets, lengths, gamma_ml_rl=args.gamma_ml_rl,
                            return_advantage=True)
                    elif args.self_critical_loss in ['mixed_with_face']:
                        loss, advantage = criterion(
                            sampled_seq, sampled_log_probs, outputs, greedy_sampled_seq,
                            [gts_sc[i] for i in image_ids], scorers, vocab,
                            captions, targets, lengths, gamma_ml_rl=args.gamma_ml_rl,
                            return_advantage=True)
                    else:
                        raise ValueError('Invalid self-critical loss')

                    if writer is not None and i % 100 == 0:
                        writer.add_scalar('training_loss', loss.item(),
                                          epoch * len(data_loader) + i)
                        writer.add_scalar('advantage', advantage,
                                          epoch * len(data_loader) + i)
                        writer.add_scalar('lr', optimizer.param_groups[0]['lr'],
                                          epoch * len(data_loader) + i)
                else:
                    loss = criterion(outputs, targets)

                model.zero_grad()
                loss.backward()

                # Clip gradients if desired:
                if args.grad_clip is not None:
                    # grad_norms = [x.grad.data.norm(2) for x in opt_params]
                    # batch_max_grad = np.max(grad_norms)
                    # if batch_max_grad > 10.0:
                    #     print('WARNING: gradient norms larger than 10.0')
                    # torch.nn.utils.clip_grad_norm_(decoder.parameters(), 0.1)
                    # torch.nn.utils.clip_grad_norm_(encoder.parameters(), 0.1)
                    clip_gradients(optimizer, args.grad_clip)

                # Update weights:
                optimizer.step()

                # CyclicalLR requires us to update the LR at every minibatch:
                if args.lr_scheduler == 'CyclicalLR':
                    scheduler.step()

                total_loss += loss.item()
                num_batches += 1

                if params.hierarchical_model:
                    _, loss_sent, _, loss_word = criterion.item_terms()
                    total_loss_sent += float(loss_sent)
                    total_loss_word += float(loss_word)

                # Print log info
                if (i + 1) % args.log_step == 0:
                    print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, '
                          'Perplexity: {:5.4f}'.format(epoch + 1, args.num_epochs, i + 1,
                                                       total_step, loss.item(),
                                                       np.exp(loss.item())))
                    sys.stdout.flush()

                    if params.hierarchical_model:
                        weight_sent, loss_sent, weight_word, loss_word = criterion.item_terms()
                        print('Sentence Loss: {:.4f}, '
                              'Word Loss: {:.4f}'.format(float(loss_sent), float(loss_word)))
                        sys.stdout.flush()

                if i + 1 == args.num_batches:
                    break

            end = datetime.now()

            stats['training_loss'] = total_loss / num_batches
            if params.hierarchical_model:
                stats['loss_sentence'] = total_loss_sent / num_batches
                stats['loss_word'] = total_loss_word / num_batches

            print('Epoch {} duration: {}, average loss: {:.4f}'.format(
                epoch + 1, end - begin, stats['training_loss']))

            save_model(args, params, model.encoder, model.decoder, optimizer, epoch, vocab)

            if epoch == 0:
                vocab_counts['avg'] = vocab_counts['sum'] / vocab_counts['cnt']
                vocab_counts['unk_cnt_per'] = 100 * vocab_counts['unk_cnt'] / vocab_counts['cnt']
                vocab_counts['unk_sum_per'] = 100 * vocab_counts['unk_sum'] / vocab_counts['sum']
                # print(vocab_counts)
                print(('Training data contains {sum} words in {cnt} captions (avg. {avg:.1f} w/c)'
                       + ' with {unk_sum} <unk>s ({unk_sum_per:.1f}%)'
                       + ' in {unk_cnt} ({unk_cnt_per:.1f}%) captions').format(**vocab_counts))

            ############################################
            # Validation loss and learning rate update #
            ############################################

            if args.validate is not None and (epoch + 1) % args.validation_step == 0:
                val_loss = do_validate(model, valid_loader, criterion, scorers, vocab,
                                       teacher_p, args, params, stats, epoch,
                                       sc_activated, gts_sc_valid)

                if args.lr_scheduler == 'ReduceLROnPlateau':
                    scheduler.step(val_loss)
                elif args.lr_scheduler == 'StepLR':
                    scheduler.step()

            all_stats[str(epoch + 1)] = stats
            save_stats(args, params, all_stats, writer=writer)

            if writer is not None:
                # Log model data to tensorboard
                log_model_data(params, model, epoch + 1, writer)

    if writer is not None:
        writer.close()
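# A minimal sketch of the cyclical_lr() helper assumed by the CyclicalLR branch above:
# it returns a function of the iteration count suitable for torch.optim.lr_scheduler.LambdaLR,
# which is why the base learning rate is forced to 1.0 (the multiplier then *is* the effective
# learning rate). The triangular shape is an assumption; the project's helper may differ.
import math

def cyclical_lr(step_size, min_lr=3e-4, max_lr=3e-3):
    def relative(it, step_size):
        # position within the current triangular cycle, in [0, 1]
        cycle = math.floor(1 + it / (2 * step_size))
        x = abs(it / step_size - 2 * cycle + 1)
        return max(0.0, 1 - x)

    # LambdaLR multiplies the base lr (1.0 here) by the value returned below
    return lambda it: min_lr + (max_lr - min_lr) * relative(it, step_size)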
def train(seed=0, dataset='grid', samplers=(UniformDatasetSampler, UniformLatentSampler),
          latent_dim=2, model_dim=256, device='cuda', conditional=False,
          learning_rate=2e-4, betas=(0.5, 0.9), batch_size=256, iterations=400,
          n_critic=5, objective='gan', gp_lambda=10, output_dir='results',
          plot=False, spec_norm=True):
    experiment_name = [seed, dataset, samplers[0].__name__, samplers[1].__name__,
                       latent_dim, model_dim, device, conditional, learning_rate,
                       betas[0], betas[1], batch_size, iterations, n_critic,
                       objective, gp_lambda, plot, spec_norm]
    experiment_name = '_'.join([str(p) for p in experiment_name])

    results_dir = os.path.join(output_dir, experiment_name)
    network_dir = os.path.join(results_dir, 'networks')
    eval_log = os.path.join(results_dir, 'eval.log')

    os.makedirs(results_dir, exist_ok=True)
    os.makedirs(network_dir, exist_ok=True)
    eval_file = open(eval_log, 'w')

    if plot:
        samples_dir = os.path.join(results_dir, 'samples')
        os.makedirs(samples_dir, exist_ok=True)

    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    data, labels = load_data(dataset)
    data_dim, num_classes = data.shape[1], len(set(labels))

    data_sampler = (samplers[0](torch.tensor(data).float(), torch.tensor(labels).long())
                    if conditional else samplers[0](torch.tensor(data).float()))
    noise_sampler = samplers[1](latent_dim, labels) if conditional else samplers[1](latent_dim)

    if conditional:
        test_data, test_labels = load_data(dataset, split='test')
        test_dataset = TensorDataset(torch.tensor(test_data).to(device).float(),
                                     torch.tensor(test_labels).to(device).long())
        test_dataloader = DataLoader(test_dataset, batch_size=4096)

        G = Generator(latent_dim + num_classes, model_dim, data_dim).to(device).train()
        D = Discriminator(model_dim, data_dim + num_classes,
                          spec_norm=spec_norm).to(device).train()

        C_real = Classifier(model_dim, data_dim, num_classes).to(device).train()
        C_fake = Classifier(model_dim, data_dim, num_classes).to(device).train()
        C_fake.load_state_dict(deepcopy(C_real.state_dict()))
        C_real_optimizer = optim.Adam(C_real.parameters(), lr=2 * learning_rate)
        C_fake_optimizer = optim.Adam(C_fake.parameters(), lr=2 * learning_rate)
        C_crit = nn.CrossEntropyLoss()
    else:
        G = Generator(latent_dim, model_dim, data_dim).to(device).train()
        D = Discriminator(model_dim, data_dim, spec_norm=spec_norm).to(device).train()

    D_optimizer = optim.Adam(D.parameters(), lr=learning_rate, betas=betas)
    G_optimizer = optim.Adam(G.parameters(), lr=learning_rate, betas=betas)

    if objective == 'gan':
        fake_target = torch.zeros(batch_size, 1).to(device)
        real_target = torch.ones(batch_size, 1).to(device)
    elif objective == 'wgan':
        grad_target = torch.ones(batch_size, 1).to(device)
    elif objective == 'hinge':
        bound = torch.zeros(batch_size, 1).to(device)
        sub = torch.ones(batch_size, 1).to(device)

    stats = {'D': [], 'G': [], 'C_it': [], 'C_real': [], 'C_fake': []}

    if plot:
        fixed_latent_batch = noise_sampler.get_batch(20000)
        sample_figure = plt.figure(num=0, figsize=(5, 5))
        loss_figure = plt.figure(num=1, figsize=(10, 5))
        if conditional:
            accuracy_figure = plt.figure(num=2, figsize=(10, 5))

    for it in range(iterations + 1):
        # Train Discriminator
        data_batch = data_sampler.get_batch(batch_size)
        latent_batch = noise_sampler.get_batch(batch_size)

        if conditional:
            x_real, y_real = data_batch[0].to(device), data_batch[1].to(device)
            real_sample = torch.cat([x_real, y_real], dim=1)

            z_fake, y_fake = latent_batch[0].to(device), latent_batch[1].to(device)
            x_fake = G(torch.cat([z_fake, y_fake], dim=1)).detach()
            fake_sample = torch.cat([x_fake, y_fake], dim=1)
        else:
            x_real = data_batch.to(device)
            real_sample = x_real

            z_fake = latent_batch.to(device)
            x_fake = G(z_fake).detach()
            fake_sample = x_fake

        D.zero_grad()
        real_pred = D(real_sample)
        fake_pred = D(fake_sample)

        if is_recorded(data_sampler):
            data_sampler.record(real_pred.detach().cpu().numpy())

        if is_weighted(data_sampler):
            weights = torch.tensor(
                data_sampler.get_weights()).to(device).float().view(real_pred.shape)
        else:
            weights = torch.ones_like(real_pred).to(device)

        if objective == 'gan':
            D_loss = (F.binary_cross_entropy(fake_pred, fake_target).mean()
                      + (weights * F.binary_cross_entropy(real_pred, real_target)).mean())
            stats['D'].append(D_loss.item())
        elif objective == 'wgan':
            alpha = torch.rand(batch_size, 1).expand(real_sample.size()).to(device)
            interpolate = (alpha * real_sample + (1 - alpha) * fake_sample).requires_grad_(True)
            gradients = torch.autograd.grad(outputs=D(interpolate), inputs=interpolate,
                                            grad_outputs=grad_target, create_graph=True,
                                            retain_graph=True, only_inputs=True)[0]
            gradient_penalty = (gradients.norm(2, dim=1) - 1).pow(2).mean() * gp_lambda

            D_loss = fake_pred.mean() - (real_pred * weights).mean()
            stats['D'].append(-D_loss.item())
            D_loss += gradient_penalty
        elif objective == 'hinge':
            D_loss = (-(torch.min(real_pred - sub, bound) * weights).mean()
                      - torch.min(-fake_pred - sub, bound).mean())
            stats['D'].append(D_loss.item())

        D_loss.backward()
        D_optimizer.step()

        # Train Generator
        if it % n_critic == 0:
            G.zero_grad()
            latent_batch = noise_sampler.get_batch(batch_size)

            if conditional:
                z_fake, y_fake = latent_batch[0].to(device), latent_batch[1].to(device)
                x_fake = G(torch.cat([z_fake, y_fake], dim=1))
                fake_pred = D(torch.cat([x_fake, y_fake], dim=1))
            else:
                z_fake = latent_batch.to(device)
                x_fake = G(z_fake)
                fake_pred = D(x_fake)

            if objective == 'gan':
                G_loss = F.binary_cross_entropy(fake_pred, real_target).mean()
                stats['G'].extend([G_loss.item()] * n_critic)
            elif objective == 'wgan':
                G_loss = -fake_pred.mean()
                stats['G'].extend([-G_loss.item()] * n_critic)
            elif objective == 'hinge':
                G_loss = -fake_pred.mean()
                stats['G'].extend([-G_loss.item()] * n_critic)

            G_loss.backward()
            G_optimizer.step()

        if conditional:
            # Train fake classifier
            C_fake.train()
            C_fake.zero_grad()
            C_fake_loss = C_crit(C_fake(x_fake.detach()), y_fake.argmax(1))
            C_fake_loss.backward()
            C_fake_optimizer.step()

            # Train real classifier
            C_real.train()
            C_real.zero_grad()
            C_real_loss = C_crit(C_real(x_real), y_real.argmax(1))
            C_real_loss.backward()
            C_real_optimizer.step()

        if it % 5 == 0:
            if conditional:
                C_real.eval()
                C_fake.eval()
                real_correct, fake_correct, total = 0.0, 0.0, 0.0
                for idx, (sample, label) in enumerate(test_dataloader):
                    real_correct += (C_real(sample).argmax(1).view(-1) == label).sum()
                    fake_correct += (C_fake(sample).argmax(1).view(-1) == label).sum()
                    total += sample.shape[0]
                stats['C_it'].append(it)
                stats['C_real'].append(real_correct.item() / total)
                stats['C_fake'].append(fake_correct.item() / total)

            line = f"{it}\t{stats['D'][-1]:.3f}\t{stats['G'][-1]:.3f}"
            if conditional:
                line += f"\t{stats['C_real'][-1]*100:.3f}\t{stats['C_fake'][-1]*100:.3f}"
            # write the evaluation line to the log file; the original passed eval_file as a
            # positional argument, which printed it to stdout instead of writing to the file
            print(line, file=eval_file)

            if plot:
                if conditional:
                    z_fake, y_fake = (fixed_latent_batch[0].to(device),
                                      fixed_latent_batch[1].to(device))
                    x_fake = G(torch.cat([z_fake, y_fake], dim=1))
                else:
                    z_fake = fixed_latent_batch.to(device)
                    x_fake = G(z_fake)
                generated = x_fake.detach().cpu().numpy()

                plt.figure(0)
                plt.clf()
                plt.scatter(generated[:, 0], generated[:, 1], marker='.',
                            color=(0, 1, 0, 0.01))
                plt.axis('equal')
                plt.xlim(-1, 1)
                plt.ylim(-1, 1)
                plt.savefig(os.path.join(samples_dir, f'{it}.png'))

                plt.figure(1)
                plt.clf()
                plt.plot(stats['G'], label='Generator')
                plt.plot(stats['D'], label='Discriminator')
                plt.legend()
                plt.savefig(os.path.join(results_dir, 'loss.png'))

                if conditional:
                    plt.figure(2)
                    plt.clf()
                    plt.plot(stats['C_it'], stats['C_real'], label='Real')
                    plt.plot(stats['C_it'], stats['C_fake'], label='Fake')
                    plt.legend()
                    plt.savefig(os.path.join(results_dir, 'accuracy.png'))

    save_model(G, os.path.join(network_dir, 'G_trained.pth'))
    save_model(D, os.path.join(network_dir, 'D_trained.pth'))
    save_stats(stats, os.path.join(results_dir, 'stats.pth'))
    if conditional:
        save_model(C_real, os.path.join(network_dir, 'C_real_trained.pth'))
        save_model(C_fake, os.path.join(network_dir, 'C_fake_trained.pth'))

    eval_file.close()
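# A minimal sketch of a Discriminator matching the constructor used above,
# Discriminator(model_dim, data_dim, spec_norm=...), with spectral normalization applied to
# every linear layer when requested. Only the constructor signature and the use of
# nn.utils.spectral_norm are taken from the call site; the depth, layer sizes, and the
# sigmoid output (needed by the 'gan' objective's binary_cross_entropy, and which the
# 'wgan'/'hinge' objectives would omit) are assumptions for illustration.
import torch.nn as nn

class Discriminator(nn.Module):
    def __init__(self, model_dim, data_dim, spec_norm=True):
        super().__init__()
        maybe_sn = nn.utils.spectral_norm if spec_norm else (lambda m: m)
        self.net = nn.Sequential(
            maybe_sn(nn.Linear(data_dim, model_dim)), nn.LeakyReLU(0.2),
            maybe_sn(nn.Linear(model_dim, model_dim)), nn.LeakyReLU(0.2),
            maybe_sn(nn.Linear(model_dim, 1)),
            nn.Sigmoid())

    def forward(self, x):
        return self.net(x)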