def main(): start = time.time() parser = args.parse_args() # run some checks on arguments check_args(parser) # format logging log_name = os.path.join( parser.run_log, '{}_run_log_{}.log'.format(parser.experiment, dt.now().strftime("%Y%m%d_%H%M"))) log.basicConfig(filename=log_name, format='%(asctime)s | %(name)s -- %(message)s', level=log.INFO) os.chmod(log_name, parser.access_mode) # set device to CPU if available device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("Starting experiment {} VN -> EN NMT on {}.".format( parser.experiment, device)) log.info("Starting experiment {} VN -> EN NMT on {}.".format( parser.experiment, device)) # set seed for replication random.seed(parser.seed) np.random.seed(parser.seed) torch.manual_seed(parser.seed) if torch.cuda.is_available(): torch.cuda.manual_seed_all(parser.seed) log.info("For reproducibility, the seed is set to {}.".format(parser.seed)) # set file paths source_name = parser.source_name target_name = parser.target_name # get saved models dir base_saved_models_dir = parser.save_dir saved_models_dir = os.path.join(base_saved_models_dir, source_name + '2' + target_name) plots_dir = parser.plots_dir log.info("We will save the models in this directory: {}".format( saved_models_dir)) log.info("We will save the plots in this directory: {}".format(plots_dir)) # get data dir main_data_path = parser.data_dir path_to_train_data = { 'source': main_data_path + 'train.tok.' + source_name, 'target': main_data_path + 'train.tok.' + target_name } path_to_dev_data = { 'source': main_data_path + 'dev.tok.' + source_name, 'target': main_data_path + 'dev.tok.' + target_name } path_to_test_data = { 'source': main_data_path + 'test.tok.' + source_name, 'target': main_data_path + 'test.tok.' + target_name } # Configuration bs = parser.batch_size log.info("Batch size = {}.".format(bs)) enc_emb = parser.enc_emb enc_hidden = parser.enc_hidden enc_layers = parser.enc_layers rnn_type = parser.rnn_type dec_emb = parser.dec_emb dec_hidden = parser.dec_hidden dec_layers = parser.dec_layers learning_rate = parser.learning_rate num_epochs = parser.epochs attn_flag = parser.wo_attn log.info("The attention flag is set to {}.".format(attn_flag)) beam_size = parser.beam_size log.info("We evaluate using beam size of {}.".format(beam_size)) train, val, test, en_lang, vi_lang = dataset_helper.train_val_load( "", main_data_path) # get vocab sizes log.info('English has vocab size of: {} words.'.format(en_lang.n_words)) log.info('Vietnamese has vocab size of: {} words.'.format(vi_lang.n_words)) # get max sentence length by 95% percentile MAX_LEN = int(train['en_len'].quantile(0.95)) log.info( 'We will have a max sentence length of {} (95 percentile).'.format( MAX_LEN)) # set data loaders bs_dict = {'train': bs, 'validate': 1, 'test': 1} shuffle_dict = {'train': True, 'validate': False, 'test': False} train_used = train val_used = val collate_fn_dict = { 'train': partial(dataset_helper.vocab_collate_func, MAX_LEN=MAX_LEN), 'validate': dataset_helper.vocab_collate_func_val, 'test': dataset_helper.vocab_collate_func_val } transformed_dataset = { 'train': dataset_helper.Vietnamese(train_used), 'validate': dataset_helper.Vietnamese(val_used, val=True), 'test': dataset_helper.Vietnamese(test, val=True) } dataloader = { x: DataLoader(transformed_dataset[x], batch_size=bs_dict[x], collate_fn=collate_fn_dict[x], shuffle=shuffle_dict[x], num_workers=0) for x in ['train', 'validate', 'test'] } # instantiate encoder/decoder encoder_wo_att = nnet_models.EncoderRNN(input_size=vi_lang.n_words, embed_dim=enc_emb, hidden_size=enc_hidden, n_layers=enc_layers, rnn_type=rnn_type).to(device) decoder_wo_att = nnet_models.AttentionDecoderRNN( output_size=en_lang.n_words, embed_dim=dec_emb, hidden_size=dec_hidden, n_layers=dec_layers, attention=attn_flag).to(device) # instantiate optimizer if parser.optimizer == 'sgd': encoder_optimizer = optim.SGD(encoder_wo_att.parameters(), lr=learning_rate, nesterov=True, momentum=0.99) decoder_optimizer = optim.SGD(decoder_wo_att.parameters(), lr=learning_rate, nesterov=True, momentum=0.99) elif parser.optimizer == 'adam': # lee kho learning rate encoder_optimizer = optim.Adam(encoder_wo_att.parameters(), lr=1e-4) decoder_optimizer = optim.Adam(decoder_wo_att.parameters(), lr=1e-4) else: raise ValueError('Invalid optimizer!') # instantiate scheduler enc_scheduler = ReduceLROnPlateau(encoder_optimizer, min_lr=1e-4, factor=0.5, patience=0) dec_scheduler = ReduceLROnPlateau(decoder_optimizer, min_lr=1e-4, factor=0.5, patience=0) criterion = nn.NLLLoss(ignore_index=global_variables.PAD_IDX) log.info( "Seq2Seq Model with the following parameters: batch_size = {}, learning_rate = {}, rnn_type = {}, enc_emb = {}, enc_hidden = {}, enc_layers = {}, dec_emb = {}, dec_hidden = {}, dec_layers = {}, num_epochs = {}, source_name = {}, target_name = {}" .format(bs, learning_rate, rnn_type, enc_emb, enc_hidden, enc_layers, dec_emb, dec_hidden, dec_layers, num_epochs, source_name, target_name)) # do we want to train again? train_again = False encoder_save = '{}_wo_att_{}bs_{}hs_{}_{}beam_enc_{}_layer'.format( rnn_type, bs, enc_hidden, parser.optimizer, beam_size, enc_layers) decoder_save = '{}_wo_att_{}bs_{}hs_{}_{}beam_dec_{}_layer'.format( rnn_type, bs, enc_hidden, parser.optimizer, beam_size, dec_layers) if os.path.exists(utils.get_full_filepath( saved_models_dir, encoder_save)) and os.path.exists( utils.get_full_filepath(saved_models_dir, decoder_save)) and (not train_again): log.info("Retrieving saved encoder from {}".format( utils.get_full_filepath(saved_models_dir, encoder_save))) log.info("Retrieving saved decoder from {}".format( utils.get_full_filepath(saved_models_dir, decoder_save))) encoder_wo_att.load_state_dict( torch.load(utils.get_full_filepath(saved_models_dir, encoder_save))) decoder_wo_att.load_state_dict( torch.load(utils.get_full_filepath(saved_models_dir, decoder_save))) else: log.info("Check if encoder path exists: {}".format( utils.get_full_filepath(saved_models_dir, encoder_save))) log.info("Check if decoder path exists: {}".format( utils.get_full_filepath(saved_models_dir, decoder_save))) log.info("Encoder and Decoder do not exist! Starting to train...") encoder_wo_att, decoder_wo_att, loss_hist, acc_hist = train_utilities.train_model( encoder_optimizer, decoder_optimizer, encoder_wo_att, decoder_wo_att, criterion, "no_attention", dataloader, en_lang, vi_lang, saved_models_dir, encoder_save, decoder_save, num_epochs=num_epochs, rm=0.95, enc_scheduler=enc_scheduler, dec_scheduler=dec_scheduler) log.info("Total time is: {} min : {} s".format( (time.time() - start) // 60, (time.time() - start) % 60)) log.info( "We will save the encoder/decoder in this directory: {}".format( saved_models_dir)) # BLEU with beam size bleu_no_unk, att_score_wo, pred_wo, src_wo = train_utilities.validation_beam_search( encoder_wo_att, decoder_wo_att, dataloader['validate'], en_lang, vi_lang, 'no_attention', beam_size, verbose=False) log.info("Bleu-{} Score (No UNK): {}".format(beam_size, bleu_no_unk)) print("Bleu-{} Score (No UNK): {}".format(beam_size, bleu_no_unk)) bleu_unk, att_score_wo, pred_wo, src_wo = train_utilities.validation_beam_search( encoder_wo_att, decoder_wo_att, dataloader['validate'], en_lang, vi_lang, 'no_attention', beam_size, verbose=False, replace_unk=True) log.info("Bleu-{} Score (UNK): {}".format(beam_size, bleu_unk)) print("Bleu-{} Score (UNK): {}".format(beam_size, bleu_unk)) # generate 5 random predictions indexes = range(len(pred_wo)) for i in np.random.choice(indexes, 5): print('Source: {} \nPrediction: {}\n---'.format(src_wo[i], pred_wo[i])) log.info('Source: {} \nPrediction: {}\n---'.format( src_wo[i], pred_wo[i])) log.info("Exported Binned Bleu Score Plot to {}!".format(plots_dir)) _, _, fig = utils.get_binned_bl_score( encoder=encoder_wo_att, decoder=decoder_wo_att, val_dataset=transformed_dataset['validate'], attn_flag=attn_flag, beam_size=beam_size, location=plots_dir, collate=collate_fn_dict['validate'], lang_en=en_lang, lang_vi=vi_lang)
DataLoader(dataset_dict['val'], batch_size=1, collate_fn=dataset_helper.vocab_collate_func_val, shuffle=True, num_workers=0), 'test': DataLoader(dataset_dict['test'], batch_size=1, collate_fn=dataset_helper.vocab_collate_func_val, shuffle=True, num_workers=0) } encoder = nnet_models.EncoderRNN( dataset_dict['train'].source_lang_obj.n_words, embed_dim=source_embed_dim, hidden_size=source_hidden_size, rnn_layers=source_rnn_layers, rnn_type=source_rnn_type).to(device) decoder = nnet_models.DecoderRNN( dataset_dict['train'].target_lang_obj.n_words, embed_dim=target_embed_dim, hidden_size=target_hidden_size, n_layers=target_rnn_layers, attention=attention).to(device) encoder_optimizer = optim.SGD(encoder.parameters(), lr=0.25, nesterov=True, momentum=0.99) enc_scheduler = ReduceLROnPlateau(encoder_optimizer,