def init():
    """AzureML scoring entry point: rebuild TraMapModel from the checkpoint and set eval mode.

    Stores the loaded model in the module-level ``model`` global, as required
    by the AzureML inference contract.
    """
    global model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'checkpoint.pth')
    checkpoint = torch.load(model_path, map_location='cpu')
    # The checkpoint carries the argparse namespace used at training time, so
    # the architecture is recreated exactly as it was trained.
    args = checkpoint['args']
    backbone_model = BackboneModel(hidden_dim=args.hidden_dim, arch=args.backbone)
    transformer_model = TransformerModel(
        d_model=args.hidden_dim,
        n_head=args.nheads,
        num_encoder_layers=args.enc_layers,
        num_decoder_layers=args.dec_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
        activation="relu",
        normalize_before=False,
    )
    model = TraMapModel(backbone_model, transformer_model)
    backbone_model.to(device)
    transformer_model.to(device)
    model.to(device)
    model.load_state_dict(checkpoint['model'])
    model.eval()
def __init__(self, args):
    """Set up the Transformer experiment: model, checkpoint dir and output file paths."""
    # Underlying Transformer architecture (fixed dropout settings, separate embeddings).
    self.transformer_model = TransformerModel(
        args=args,
        transformer_dropout=0.05,
        embedding_dropout=0.05,
        use_same_embedding=False,
    )
    self.args = args
    exp_name = args.data_set + '_' + args.exp_name
    # Per-experiment directory for checkpoints, history and the final model.
    self.exp_dir = os.path.join(args.checkpoints_dir, exp_name)
    helper_fn.makedirs(self.exp_dir)
    self.history_path = os.path.join(self.exp_dir, exp_name + '.hist')
    self.model_path = os.path.join(self.exp_dir, exp_name + '_final_model.h5')
    # Decoding outputs: source / prediction / target text files.
    outputs_dir = args.outputs_dir
    helper_fn.makedirs(outputs_dir)
    self.src_out_name = exp_name + '.src'
    self.src_out_path = os.path.join(outputs_dir, self.src_out_name)
    self.pred_out_name = exp_name + '.pred'
    self.pred_out_path = os.path.join(outputs_dir, self.pred_out_name)
    self.tar_out_name = exp_name + '.tgt'
    self.tar_out_path = os.path.join(outputs_dir, self.tar_out_name)
def train(**kwargs):
    """Train an NMT model (Transformer or Seq2Seq) on parallel corpora.

    Expected kwargs: src_train/tgt_train/src_valid/tgt_valid (corpus paths),
    tokenizer ("space_tokenizer" selects SpaceTokenizer, anything else the
    wordpiece BertTokenizer), model ("transformer" or Seq2Seq otherwise),
    checkpoint_path, best_model.
    """
    print("loading dataset")
    train_dataset = NMTDataset(kwargs["src_train"], kwargs["tgt_train"])
    valid_dataset = NMTDataset(kwargs["src_valid"], kwargs["tgt_valid"])
    print("Dataset loaded successfully.")
    train_dl = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=collate_fn)
    valid_dl = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=collate_fn)
    if kwargs["tokenizer"] == "space_tokenizer":
        tokenizer = SpaceTokenizer(
            base_path + "NMTtokenizers/spacetoken_vocab_files/vocab_nepali.json",
            base_path + "NMTtokenizers/spacetoken_vocab_files/vocab_english.json")
    else:
        tokenizer = BertTokenizer(
            base_path + "NMTtokenizers/wordpiece_vocab_files/vocab_newa.json",
            base_path + "NMTtokenizers/wordpiece_vocab_files/vocab_eng.json")
    if kwargs['model'] == 'transformer':
        model = TransformerModel(len(tokenizer.src_vocab), len(tokenizer.tgt_vocab),
                                 embed_size, n_heads, dropout=dropout_rate)
    else:
        model = Seq2Seq(embed_size, hidden_size, tokenizer,
                        dropout_rate=dropout_rate, n_layers=n_layers)
    # Exclude bias and normalization parameters from weight decay.
    # BUG FIX: the per-group key must be 'weight_decay' — torch optimizers
    # ignore unknown keys such as 'weight_decay_rate', so AdamW's default
    # decay (0.01) was silently applied to BOTH groups, biases included.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=0.001)
    model.to(device)
    model = trainer(model, optimizer, train_dl, valid_dl, BATCH_SIZE, epoch,
                    device, LOG_EVERY, kwargs["checkpoint_path"],
                    kwargs["best_model"], beam_size, max_decoding_time_step)
def decode(**kwargs):
    """Beam-search translate each sentence in kwargs['src_file'] and print the best hypothesis."""
    src_sent = open_file(kwargs['src_file'])
    if kwargs["tokenizer"] == "space_tokenizer":
        tokenizer = SpaceTokenizer(src_vocab_path, tgt_vocab_path)
    else:
        tokenizer = BertTokenizer(src_vocab_path, tgt_vocab_path)
    model = TransformerModel(len(tokenizer.src_vocab), len(tokenizer.tgt_vocab),
                             tokenizer, embed_size, n_heads, dropout=dropout_rate)
    model.to(device)
    model, _, _, _ = load_checkpt(model, kwargs['best_model'], device)
    src_tensor, _ = tokenizer.encode(src_sent, device, return_tensor=True)
    # Decode one source sentence at a time; encode() yields sequence-first
    # tensors, so transpose to iterate over sentences.
    for source in src_tensor.transpose(0, 1):
        hyps = beam_search_transformer(model, source.view(1, -1), beam_size,
                                       max_decoding_time_step,
                                       model.tokenizer.src_vocab['[PAD]'],
                                       model.tokenizer.tgt_vocab['[EOS]'], device)
        best = hyps[0]
        print(' '.join(best.value))
def transformer_model(c: Configs):
    """Build a TransformerModel from the config object and move it to the configured device."""
    from models.transformer import TransformerModel
    return TransformerModel(
        n_tokens=c.n_tokens,
        d_model=c.d_model,
        encoder=c.transformer.encoder,
        src_embed=c.transformer.src_embed,
    ).to(c.device)
def add_args(parser):
    """Add model-specific arguments to the parser."""
    TransformerModel.add_args(parser)
    # Multilingual sharing switches; all are boolean flags defaulting to off.
    for flag, what in [
        ('--share-encoder-embeddings', 'encoder embeddings'),
        ('--share-decoder-embeddings', 'decoder embeddings'),
        ('--share-encoders', 'encoders'),
        ('--share-decoders', 'decoders'),
    ]:
        parser.add_argument(flag, action='store_true',
                            help='share %s across languages' % what)
def build_model(cls, args, task):
    """Construct a TransformerAlignModel wrapping a standard Transformer."""
    # Fill in any missing arguments with the transformer_align defaults.
    transformer_align(args)
    base = TransformerModel.build_model(args, task)
    return TransformerAlignModel(base.encoder, base.decoder, args)
def __init__(self, ntoken, d_model=512, nhead=8, nhid=512, te_nlayers=6,
             te_dropout=0.5, pretrained_vec=None, n_layers=2, bidirectional=True,
             output_dim=1, hidden_dim=256, rnn_dropout=0.3, pad_token_id=None):
    """Transformer encoder followed by a (bi)GRU and a linear classification head."""
    super(TEGRU, self).__init__()
    self.transformer_encoder = TransformerModel(ntoken, d_model, nhead, nhid,
                                                te_nlayers, pretrained_vec,
                                                te_dropout)
    self.bidirectional = bidirectional
    self.n_layers = n_layers
    self.pad_token_id = pad_token_id
    # GRU inter-layer dropout is only meaningful with stacked layers.
    gru_dropout = rnn_dropout if n_layers >= 2 else 0
    self.rnn = nn.GRU(d_model, hidden_size=hidden_dim, num_layers=n_layers,
                      bidirectional=bidirectional, dropout=gru_dropout)
    # Width of the features entering the head; doubled when both GRU
    # directions are concatenated.
    head_in = hidden_dim if hidden_dim is not None else d_model
    if bidirectional:
        head_in *= 2
    self.fc = nn.Linear(head_in, output_dim)
    nn.init.xavier_uniform_(self.fc.weight)
    self.dropout = nn.Dropout(rnn_dropout)
def train(**kwargs):
    """Train the Transformer NMT model, checkpointing via train_model.

    Expected kwargs: src_train/tgt_train/src_valid/tgt_valid (corpus paths),
    tokenizer ("space_tokenizer" or wordpiece), checkpoint_path, best_model.
    """
    print("loading dataset")
    train_dataset = NMTDataset(kwargs["src_train"], kwargs["tgt_train"])
    valid_dataset = NMTDataset(kwargs["src_valid"], kwargs["tgt_valid"])
    print("Dataset loaded successfully.")
    train_dl = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=collate_fn)
    valid_dl = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=collate_fn)
    if kwargs["tokenizer"] == "space_tokenizer":
        tokenizer = SpaceTokenizer(src_vocab_path, tgt_vocab_path)
    else:
        tokenizer = BertTokenizer(src_vocab_path, tgt_vocab_path)
    model = TransformerModel(len(tokenizer.src_vocab), len(tokenizer.tgt_vocab),
                             tokenizer, embed_size, n_heads, dropout=dropout_rate)
    model.to(device)
    # Sum-reduced cross entropy; index 0 is the padding token and is ignored.
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
    optimizer = torch.optim.Adam(model.parameters(), lr=0.6,
                                 betas=(0.9, 0.98), eps=1e-9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
    # Surface the origin of NaN/inf gradients during backprop while debugging.
    torch.autograd.set_detect_anomaly(True)
    train_model(model, optimizer, criterion, scheduler, train_dl, valid_dl,
                BATCH_SIZE, epoch, device, kwargs["checkpoint_path"],
                kwargs["best_model"], beam_size, max_decoding_time_step)
def test(**kwargs):
    """Evaluate the trained Transformer on the test split.

    Reports the average per-batch test loss and corpus BLEU score, decoding
    every source sentence with beam search.
    Expected kwargs: src_test/tgt_test (corpus paths), tokenizer selection.
    """
    test_dataset = NMTDataset(kwargs["src_test"], kwargs["tgt_test"])
    print("Dataset loaded successfully.")
    test_dl = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True,
                         collate_fn=collate_fn)
    tokenizer = SpaceTokenizer(
        src_vocab_path, tgt_vocab_path
    ) if kwargs["tokenizer"] == "space_tokenizer" else BertTokenizer(
        src_vocab_path, tgt_vocab_path)
    model = TransformerModel(len(tokenizer.src_vocab), len(tokenizer.tgt_vocab),
                             tokenizer, embed_size, n_heads, dropout=dropout_rate)
    # Padding index 0 is excluded from the loss; summed over tokens.
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
    model.to(device)
    model.eval()
    bleu_score = 0
    test_loss = 0
    test_start_time = time.time()
    with torch.no_grad():
        for batch in test_dl:
            src_tensor, tgt_tensor, _, _ = model.tokenizer.encode(
                batch, device, return_tensor=True)
            # encode() returns sequence-first tensors; make them batch-first.
            src_tensor = src_tensor.transpose(0, 1)
            tgt_tensor = tgt_tensor.transpose(0, 1)
            # Teacher forcing: feed the target without its last token and
            # score against the target shifted left by one.
            trg_input = tgt_tensor[:, :-1]
            targets = tgt_tensor[:, 1:].contiguous().view(-1)
            preds = model(src_tensor, trg_input.to(device), device)
            loss = criterion(preds, targets)
            test_loss += loss.item() / BATCH_SIZE
            # Beam-search decode each source sentence for BLEU computation.
            output = []
            for src in src_tensor:
                hyps = beam_search_transformer(
                    model, src.view(1, -1), beam_size, max_decoding_time_step,
                    model.tokenizer.src_vocab['[PAD]'],
                    model.tokenizer.tgt_vocab['[EOS]'], device)
                top_hyp = hyps[0]
                hyp_sent = ' '.join(top_hyp.value)
                output.append(hyp_sent)
            # batch[1] holds the reference target sentences for this batch.
            score = compute_bleu_score(output, batch[1])
            bleu_score += score
    print(
        f'Avg. test loss: {test_loss/len(test_dl):.5f} | BLEU Score: {bleu_score/len(test_dl)} | time elapsed: {time.time() - test_start_time}'
    )
def __init__(self, cfg, weights_matrix=None):
    """Sentence encoder with a configurable backbone ('lstm' or 'transformer')."""
    super(SentenceEncoder, self).__init__()
    vocab_size = cfg.get('vocab_size', {})
    sentence_vocab = vocab_size.get('sentences', None)
    # Optionally bootstrap the embedding layer from pretrained vectors.
    if cfg.get('pretrained_embeddings', False):
        vocab = pickle.load(open('vocab/words.pkl', 'rb'))
        weights_matrix = pretrained_weights_matrix(vocab, cfg, sentence_vocab)
    self.embedding = create_emb_layer(cfg, sentence_vocab, weights_matrix)
    self.encoder_type = cfg.get('encoder_type', 'lstm')
    if self.encoder_type not in ['lstm', 'transformer']:
        raise ValueError('Encoder needs to be valid type.')
    self.encoder = (LSTMModel(cfg) if self.encoder_type == 'lstm'
                    else TransformerModel(cfg))
    # Prefer CUDA when available, unless the config disables it.
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if not cfg.get('use_cuda', True):
        self.device = torch.device('cpu')
def train(
    run_name: str,
    # Data
    train_filepath: str = CSNJS_TRAIN_FILEPATH,
    eval_filepath: str = CSNJS_VALID_FILEPATH,
    spm_filepath: str = SPM_UNIGRAM_FILEPATH,
    program_mode="identity",
    eval_program_mode="identity",
    label_mode="identifier",
    num_workers=1,
    limit_dataset_size=-1,
    # Model
    model_type="transformer",
    n_decoder_layers=4,
    d_model: int = 512,
    resume_path: str = "",
    resume_encoder_name: str = "encoder_q",  # encoder_q, encoder_k, encoder
    resume_project: bool = False,
    # Optimization
    train_decoder_only: bool = False,
    num_epochs: int = 50,
    save_every: int = 2,
    batch_size: int = 256,
    lr: float = 8e-4,
    adam_beta1: float = 0.9,
    adam_beta2: float = 0.98,
    use_lr_warmup: bool = True,
    loss_type="nll_token",  # nll_token or nll_sequence
    # Loss
    subword_regularization_alpha: float = 0,
    # Computational
    use_cuda: bool = True,
    auto_test: bool = True,
    seed: int = 0,
):
    """Train a seq2seq model (Transformer or LSTM) for identifier prediction.

    Optionally resumes the encoder from a pretrained checkpoint, logs to wandb,
    checkpoints every ``save_every`` epochs and on every new best eval loss, and
    finally runs ``test`` on the best checkpoint when ``auto_test`` is set.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    run_dir = RUN_DIR / run_name
    run_dir.mkdir(exist_ok=True, parents=True)
    logger.add(str((run_dir / "train.log").resolve()))
    logger.info(f"Saving logs, model checkpoints to {run_dir}")
    # NOTE: locals() here snapshots every argument (plus run_dir) as the run config.
    config = locals()
    logger.info(f"Config: {config}")
    wandb.init(name=run_name, config=config, job_type="training",
               project="identifier-prediction", entity="ml4code")
    if use_cuda:
        assert torch.cuda.is_available(), "CUDA not available. Check env configuration, or pass --use_cuda False"
    # Program-level augmentations applied only to the training split.
    train_augmentations = [
        {"fn": "sample_lines", "line_length_pct": 0.5},
        {"fn": "insert_var_declaration", "prob": 0.5},
        {"fn": "rename_variable", "prob": 0.5},
    ]
    sp = spm.SentencePieceProcessor()
    sp.Load(spm_filepath)
    pad_id = sp.PieceToId("[PAD]")
    # Create training dataset and dataloader
    logger.info(f"Training data path {train_filepath}")
    train_dataset = get_csnjs_dataset(train_filepath, label_mode=label_mode, limit_size=limit_dataset_size)
    logger.info(f"Training dataset size: {len(train_dataset)}")
    train_loader = javascript_dataloader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        augmentations=train_augmentations,
        sp=sp,
        program_mode=program_mode,
        subword_regularization_alpha=subword_regularization_alpha,
    )
    # Create eval dataset and dataloader (no augmentations, fixed order)
    logger.info(f"Eval data path {eval_filepath}")
    eval_dataset = get_csnjs_dataset(eval_filepath, label_mode=label_mode, limit_size=limit_dataset_size)
    logger.info(f"Eval dataset size: {len(eval_dataset)}")
    eval_loader = javascript_dataloader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        augmentations=[],
        sp=sp,
        program_mode=eval_program_mode,
        subword_regularization_alpha=subword_regularization_alpha,
    )
    # Create model
    pad_id = sp.PieceToId("[PAD]")
    if model_type == "transformer":
        model = TransformerModel(n_tokens=sp.GetPieceSize(), pad_id=pad_id, n_decoder_layers=n_decoder_layers, d_model=d_model)
        logger.info(f"Created TransformerModel with {count_parameters(model)} params")
    elif model_type == "lstm":
        model = Seq2SeqLSTM(n_tokens=sp.GetPieceSize(), pad_id=pad_id, d_model=d_model)
        logger.info(f"Created Seq2SeqLSTM with {count_parameters(model)} params")
    # Load checkpoint: copy pretrained encoder weights (minus the projection
    # head unless resume_project is set) into the freshly built model.
    if resume_path:
        logger.info(f"Resuming training from checkpoint {resume_path}, resume_encoder_name={resume_encoder_name}")
        checkpoint = torch.load(resume_path)
        pretrained_state_dict = checkpoint["model_state_dict"]
        encoder_state_dict = {}
        assert resume_encoder_name in ["encoder_k", "encoder_q", "encoder"]
        # Strip the "<encoder_name>." prefix so keys match model.encoder's state dict.
        for key, value in pretrained_state_dict.items():
            if key.startswith(resume_encoder_name + ".") and "project_layer" not in key:
                remapped_key = key[len(resume_encoder_name + ".") :]
                logger.debug(f"Remapping checkpoint key {key} to {remapped_key}. Value mean: {value.mean().item()}")
                encoder_state_dict[remapped_key] = value
            if key.startswith(resume_encoder_name + ".") and "project_layer.0." in key and resume_project:
                remapped_key = key[len(resume_encoder_name + ".") :]
                logger.debug(f"Remapping checkpoint project key {key} to {remapped_key}. Value mean: {value.mean().item()}")
                encoder_state_dict[remapped_key] = value
        model.encoder.load_state_dict(encoder_state_dict, strict=False)
        logger.info(f"Loaded state dict from {resume_path}")
        logger.info(f"Loaded keys: {encoder_state_dict.keys()}")
    # Set up optimizer
    model = nn.DataParallel(model)
    model = model.cuda() if use_cuda else model
    wandb.watch(model, log="all")
    # Optionally freeze the encoder by optimizing the decoder's params only.
    params = model.module.decoder.parameters() if train_decoder_only else model.parameters()
    optimizer = torch.optim.Adam(params, lr=lr, betas=(adam_beta1, adam_beta2), eps=1e-9)
    if use_lr_warmup:
        scheduler = get_linear_schedule_with_warmup(optimizer, 5000, len(train_loader) * num_epochs)
    else:
        scheduler = LambdaLR(optimizer, lr_lambda=lambda x: 1.0)
    global_step = 0
    min_eval_loss = float("inf")
    for epoch in tqdm.trange(1, num_epochs + 1, desc="training", unit="epoch", leave=False):
        logger.info(f"Starting epoch {epoch}\n")
        if train_decoder_only:
            # Keep frozen encoder in eval mode (disables dropout/batchnorm updates).
            model.module.encoder.eval()
            model.module.decoder.train()
        else:
            model.train()
        pbar = tqdm.tqdm(train_loader, desc=f"epoch {epoch}")
        for X, Y, X_lengths, Y_lengths in pbar:
            if use_cuda:
                X = X.cuda()
                Y = Y.cuda()
                X_lengths, Y_lengths = X_lengths.cuda(), Y_lengths.cuda()
            optimizer.zero_grad()
            # NOTE: X and Y are [B, max_seq_len] tensors (batch first)
            # Teacher forcing: feed Y minus its last token, score against Y shifted by one.
            logits = model(X, Y[:, :-1], X_lengths, Y_lengths)
            if loss_type == "nll_sequence":
                loss = F.cross_entropy(logits.transpose(1, 2), Y[:, 1:], ignore_index=pad_id, reduction='sum')
                loss = loss / X.size(0)
                # Average over num sequences, not target sequence lengths
                # Thus, minimize bits per sequence.
            elif loss_type == "nll_token":
                loss = F.cross_entropy(logits.transpose(1, 2), Y[:, 1:], ignore_index=pad_id,)
            loss.backward()
            optimizer.step()
            scheduler.step()
            # Log loss
            global_step += 1
            wandb.log(
                {"epoch": epoch, f"label-{label_mode}/train_loss": loss.item(), "lr": scheduler.get_last_lr()[0]}, step=global_step
            )
            pbar.set_description(f"epoch {epoch} loss {loss.item():.4f}")
        # Evaluate
        logger.info(f"Evaluating model after epoch {epoch} ({global_step} steps)...")
        max_decode_len = 20 if label_mode == "identifier" else 200
        eval_loss = _evaluate(model, eval_loader, sp, use_cuda=use_cuda, max_decode_len=max_decode_len, loss_type=loss_type)
        logger.info(f"Evaluation loss after epoch {epoch} ({global_step} steps): {eval_loss:.4f}")
        wandb.log({"epoch": epoch, f"label-{label_mode}/eval_loss": eval_loss}, step=global_step)
        # Save checkpoint
        # NOTE(review): this parses as (save_every and epoch % save_every == 0)
        # or (eval_loss < min_eval_loss) — periodic OR new-best saves.
        if save_every and epoch % save_every == 0 or eval_loss < min_eval_loss:
            checkpoint = {
                "model_state_dict": model.module.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "epoch": epoch,
                "global_step": global_step,
                "config": config,
                "eval_loss": eval_loss,
            }
            if eval_loss < min_eval_loss:
                logger.info(f"New best evaluation loss: prev {min_eval_loss:.4f} > new {eval_loss:.4f}")
                min_eval_loss = eval_loss
                model_file = run_dir / "ckpt_best.pth"
            else:
                model_file = run_dir / f"ckpt_ep{epoch:04d}.pth"
            logger.info(f"Saving checkpoint to {model_file}...")
            torch.save(checkpoint, str(model_file.resolve()))
            wandb.save(str(model_file.resolve()))
    logger.info("Done.")
    if auto_test:
        # Evaluate the best checkpoint on the held-out test split.
        best_ckpt = run_dir / "ckpt_best.pth"
        test(
            str(best_ckpt.resolve()),
            CSNJS_TEST_FILEPATH,
            spm_filepath,
            program_mode,
            label_mode,
            num_workers,
            -1,
            n_decoder_layers=n_decoder_layers,
        )
def test(
    checkpoint_file: str,
    test_filepath: str = CSNJS_TEST_FILEPATH,
    spm_filepath: str = SPM_UNIGRAM_FILEPATH,
    program_mode="identity",
    label_mode="identifier",
    num_workers=1,
    limit_dataset_size=-1,
    batch_size=8,
    model_type="transformer",
    n_decoder_layers=4,
    d_model=512,
    use_cuda: bool = True,
):
    """Evaluate a trained checkpoint on the test split.

    Computes the test-set NLL and method-name precision/recall/F1, logs
    everything to wandb, and pickles the sampled generations to the run dir.
    """
    wandb.init(name=checkpoint_file, config=locals(), project="f1_eval", entity="ml4code")
    if use_cuda:
        assert torch.cuda.is_available(), "CUDA not available. Check env configuration, or pass --use_cuda False"
    sp = spm.SentencePieceProcessor()
    sp.Load(spm_filepath)
    # Create test dataset and dataloader
    logger.info(f"Test data path {test_filepath}")
    test_dataset = get_csnjs_dataset(test_filepath, label_mode=label_mode, limit_size=limit_dataset_size)
    # BUG FIX: was len(test_filepath) — that logged the length of the path
    # string, not the number of test examples.
    logger.info(f"Test dataset size: {len(test_dataset)}")
    test_loader = javascript_dataloader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        sp=sp,
        program_mode=program_mode,
        subword_regularization_alpha=0,
        augmentations=[],
    )
    pad_id = sp.PieceToId("[PAD]")
    if model_type == "transformer":
        model = TransformerModel(n_tokens=sp.GetPieceSize(), pad_id=pad_id,
                                 n_decoder_layers=n_decoder_layers, d_model=d_model)
        logger.info(f"Created TransformerModel with {count_parameters(model)} params")
    elif model_type == "lstm":
        model = Seq2SeqLSTM(n_tokens=sp.GetPieceSize(), pad_id=pad_id, d_model=d_model)
        logger.info(f"Created Seq2SeqLSTM with {count_parameters(model)} params")
    if use_cuda:
        model = model.cuda()
    # Load checkpoint
    checkpoint = torch.load(checkpoint_file)
    pretrained_state_dict = checkpoint["model_state_dict"]
    print("CHECKPOINT", checkpoint_file)
    print("KEYS", checkpoint["model_state_dict"].keys())
    try:
        model.load_state_dict(pretrained_state_dict)
    except RuntimeError as e:
        # Surface the available keys to make prefix/shape mismatches debuggable.
        logger.error(e)
        logger.error("Keys in checkpoint: " + str(list(pretrained_state_dict.keys())))
        raise e
    logger.info(f"Loaded state dict from {checkpoint_file}")
    # Evaluate NLL
    model.eval()
    with torch.no_grad():
        test_nll = calculate_nll(model, test_loader, sp, use_cuda=use_cuda, logger_fn=wandb.log)
    logger.info(f"NLL: {test_nll:.5f}")
    # Make metric
    metric = F1MetricMethodName()
    model.eval()
    with torch.no_grad():
        precision, recall, f1, sample_generations = calculate_f1_metric(
            metric, model, test_loader, sp, use_cuda=use_cuda, logger_fn=wandb.log)
    logger.info(f"NLL: {test_nll:.5f}")
    logger.info(f"Precision: {precision:.5f}%")
    logger.info(f"Recall: {recall:.5f}%")
    logger.info(f"F1: {f1:.5f}%")
    # Persist the sampled generations alongside the wandb run.
    df_generations = pd.DataFrame(sample_generations)
    df_generations.to_pickle(os.path.join(wandb.run.dir, "sample_generations.pickle.gz"))
    wandb.save(os.path.join(wandb.run.dir, "sample_generations.pickle.gz"))
class Transformer:
    """Keras Transformer experiment wrapper.

    Builds the model and all experiment/output paths from ``args``, and exposes
    ``train`` (fit with callbacks) and ``test`` (greedy autoregressive decoding
    that writes src/pred/tgt text files).
    """

    def __init__(self, args):
        # real Transformer model architecture
        self.transformer_model = TransformerModel(
            args=args,
            transformer_dropout=0.05,
            embedding_dropout=0.05,
            use_same_embedding=False,
        )
        self.args = args
        exp_name = args.data_set + '_' + args.exp_name
        # create experiment dir
        self.exp_dir = os.path.join(args.checkpoints_dir, exp_name)
        helper_fn.makedirs(self.exp_dir)
        hist_name = exp_name + '.hist'
        model_name = exp_name + '_final_model.h5'
        self.history_path = os.path.join(self.exp_dir, hist_name)
        self.model_path = os.path.join(self.exp_dir, model_name)
        outputs_dir = args.outputs_dir
        helper_fn.makedirs(outputs_dir)
        # Decoding outputs: source / prediction / target text files.
        self.src_out_name = exp_name + '.src'
        self.src_out_path = os.path.join(outputs_dir, self.src_out_name)
        self.pred_out_name = exp_name + '.pred'
        self.pred_out_path = os.path.join(outputs_dir, self.pred_out_name)
        self.tar_out_name = exp_name + '.tgt'
        self.tar_out_path = os.path.join(outputs_dir, self.tar_out_name)

    def train(self):
        """Train (or resume) the model with early stopping, checkpointing and LR decay."""
        ds = DataSet(self.args)
        print('*' * 100)
        print('train sample number: ', ds.train_sample_num)
        print('valid sample number: ', ds.valid_sample_num)
        print('test sample number: ', ds.test_sample_num)
        print('*' * 100)
        train_generator = ds.data_generator(
            'train',
            'transformer',
            max_src_len=self.args.src_seq_length,
            max_tar_len=self.args.tar_seq_length,
        )
        valid_generator = ds.data_generator(
            'valid',
            'transformer',
            max_src_len=self.args.src_seq_length,
            max_tar_len=self.args.tar_seq_length,
        )

        def compile_new_model():
            # Fresh model from the architecture definition; masking uses ds.pad_id.
            _model = self.transformer_model.get_model(ds.pad_id)
            _model.compile(
                optimizer=keras.optimizers.Adam(lr=self.args.lr),
                loss=keras.losses.sparse_categorical_crossentropy,
            )
            return _model

        # Resume from the final saved model if it exists, else start fresh.
        if os.path.exists(self.model_path):
            print('Loading model from: %s' % self.model_path)
            custom_dict = get_custom_objects()
            model = load_model(self.model_path, custom_objects=custom_dict)
        else:
            print('Compile new model...')
            model = compile_new_model()
        verbose = 1
        earlystopper = EarlyStopping(monitor='val_loss',
                                     patience=self.args.early_stop_patience,
                                     verbose=verbose)
        ckpt_name = 'model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
        ckpt_path = os.path.join(self.exp_dir, ckpt_name)
        checkpoint = ModelCheckpoint(ckpt_path, monitor='val_loss',
                                     verbose=verbose, save_best_only=True,
                                     mode='min')
        lrate = keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=self.args.lr_decay_patience,
            verbose=verbose,
            mode='auto',
            min_delta=0.0001,
            cooldown=0,
            min_lr=self.args.lr_min,
        )
        callback_list = [earlystopper, checkpoint, lrate]
        hist = model.fit_generator(
            generator=train_generator,
            steps_per_epoch=(ds.train_sample_num // self.args.batch_size),
            epochs=self.args.epochs,
            callbacks=callback_list,
            validation_data=valid_generator,
            validation_steps=(ds.valid_sample_num // self.args.batch_size),
        )
        with open(self.history_path, 'w') as f:
            f.write(str(hist.history))
        model.save(self.model_path)

    def test(self):
        """Greedy autoregressive decoding over the test split; writes src/pred/tgt files."""
        # load_model
        print('Loading model from: %s' % self.model_path)
        custom_dict = get_custom_objects()
        model = load_model(self.model_path, custom_objects=custom_dict)
        # BUG FIX: was DataSet(args) — `args` is undefined in this method's
        # scope (NameError unless a module-level `args` happened to exist).
        ds = DataSet(self.args)
        test_generator = ds.data_generator(
            'test',
            'transformer',
            max_src_len=self.args.src_seq_length,
            max_tar_len=self.args.tar_seq_length,
        )
        src_outobj = open(self.src_out_path, 'w')
        pred_outobj = open(self.pred_out_path, 'w')
        tar_outobj = open(self.tar_out_path, 'w')
        for batch, ([src_input, tar_input], tar_loss_input) in enumerate(test_generator):
            if batch > (ds.test_sample_num // self.args.batch_size):
                # finish all of the prediction
                break
            print('Current batch: {}/{}. '.format(
                batch, ds.test_sample_num // self.args.batch_size))
            tar_length = tar_input.shape[1]
            # Seed the decoder input with the start token, pad the rest.
            results = np.zeros_like(tar_input)
            results[:, 0] = ds.start_id
            for i in range(1, tar_length):
                results[:, i] = ds.pad_id
            # Greedy decode: position t is filled from the model's prediction
            # at position t-1, re-running the model each step.
            for t in range(1, tar_length):
                preds = model.predict([
                    src_input, np.asarray(results)
                ])  # shape: (batch_size, tar_length, vocab_size)
                pred_id = np.argmax(preds, axis=-1)
                results[:, t] = pred_id[:, t - 1]

            def output_results(outputs, outobj):
                # Map id sequences back to tokens: stop at end_id, skip
                # pad/start ids, fall back to UNK for unknown ids.
                for result in outputs:
                    seq = []
                    for _id in result:
                        _id = int(_id)
                        if _id == ds.end_id:
                            break
                        if _id != ds.pad_id and _id != ds.start_id:
                            seq.append(
                                ds.tar_id_tokens.get(_id, config.UNK_TOKEN))
                    write_line = ' '.join(seq)
                    write_line = write_line + '\n'
                    outobj.write(write_line)

            output_results(results, pred_outobj)
            output_results(src_input, src_outobj)
            output_results(tar_input, tar_outobj)
        src_outobj.close()
        pred_outobj.close()
        tar_outobj.close()
def main(args):
    """Training entry point for TraMapModel (CNN backbone + Transformer).

    Builds the model from ``args``, trains it on MapQueryDataset with a
    SmoothL1 loss against the target duration, and checkpoints every epoch
    (plus extra snapshots at LR-drop boundaries).
    """
    device = torch.device(args.device)
    # Seed
    seed = args.seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # Build the models
    backbone_model = BackboneModel(hidden_dim=args.hidden_dim, arch=args.backbone)
    transformer_model = TransformerModel(d_model=args.hidden_dim,
                                         n_head=args.nheads,
                                         num_encoder_layers=args.enc_layers,
                                         num_decoder_layers=args.dec_layers,
                                         dim_feedforward=args.dim_feedforward,
                                         dropout=args.dropout,
                                         activation="relu",
                                         normalize_before=False)
    model = TraMapModel(backbone_model, transformer_model)
    print("DEVICE:", device)
    backbone_model.to(device)
    transformer_model.to(device)
    model.to(device)
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)
    # Backbone parameters get their own (typically smaller) learning rate.
    param_dicts = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)
    # Data loader
    # Normalization constants — presumably the training set's channel
    # statistics; confirm against the dataset preprocessing.
    transforms = T.Compose([
        T.ToTensor(),
        T.Normalize(mean=[0.1888, 0.2168, 0.2469],
                    std=[0.3322, 0.2871, 0.2899])
    ])
    dataset_train = MapQueryDataset(transforms=transforms, split='train')
    sampler_train = torch.utils.data.RandomSampler(dataset_train)
    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=False)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   num_workers=args.num_workers)
    output_dir = Path(args.output_dir)
    # Optionally resume model (and, when training, optimizer/scheduler state).
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
    if args.eval:
        # NOTE(review): eval mode only stubs test_stats here; no evaluation runs.
        test_stats = None
    # Criterion / Loss function
    # criterion = MSLELoss()
    # criterion = nn.MSELoss()
    # criterion = nn.L1Loss()
    criterion = nn.SmoothL1Loss()
    criterion.to(device)
    # Logger thing
    MB = 1024.0 * 1024.0
    print_every = 10
    target = data_loader_train  # NOTE(review): appears unused below
    print("Start Training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        model.train()
        criterion.train()
        print("EPOCH:", epoch)
        i = 0
        ## Training process ##
        # Move to GPU or CPU
        for sample, query, duration in data_returner(data_loader_train):
            query = query.to(device)
            sample = sample.to(device)
            ## Target duration
            duration = duration.to(device)
            duration = duration.float()
            outputs = model(sample, query)
            outputs = outputs.flatten()
            # RMSE if criterion set to MSE
            # loss = torch.sqrt(criterion(outputs, duration) + 1e-8)
            # Else
            loss = criterion(outputs, duration)
            loss_value = loss.item()
            # Abort on NaN/inf loss rather than training on garbage gradients.
            if not math.isfinite(loss_value):
                print(
                    "Loss is {}, stop the training process".format(loss_value))
                sys.exit(1)
            optimizer.zero_grad()
            loss.backward()
            if args.clip_max_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.clip_max_norm)
            optimizer.step()
            if i % print_every == 0:
                # print("Output: {} Target: {}".format(outputs.tolist()[0], duration.tolist()[0]))
                if torch.cuda.is_available():
                    print("Iter: {} Memory: {:d}MB Loss: {}".format(
                        i, math.trunc(torch.cuda.max_memory_allocated() / MB),
                        loss_value))
                    # print(outputs[0].item(), duration[0].item())
                else:
                    print("Iter: {} Loss:{}".format(i, loss_value))
            i += 1
        lr_scheduler.step()
        ## Saving or Not saving, there is no in between
        if args.output_dir:
            # Always refresh the rolling checkpoint; keep extra snapshots at
            # LR-drop boundaries and every 100 epochs.
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                torch.save(
                    {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
# Config dict and model is for reference config_dict = LoadConfig('conf').load_config() # Load Data dataset_name = config_dict['dataset_name'] data_creator = CreateData(config_path='conf') train_datasets, valid_datasets, test_datasets = data_creator.create_all() # Define Model model = TransformerModel( encoder_vocab_size=data_creator.tokenizer.lang_one_vocab_size, decoder_vocab_size=data_creator.tokenizer.lang_two_vocab_size, encoder_max_pos=config_dict['max_pos_length'], decoder_max_pos=config_dict['max_pos_length'], num_heads=config_dict['num_heads'], model_dim=config_dict['model_dim'], feed_forward_dim=config_dict['feed_forward_dim'], dropout_rate=config_dict['dropout_rate'], mha_concat_query=config_dict['mha_concat_query'], n_layers=config_dict['n_layers'], debug=config_dict['debug']) # Learning Rate Schedule model_learning_rate = CustomSchedule(config_dict['model_dim']) model_optimizer = tf.keras.optimizers.Adam(learning_rate=model_learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9) # Loss Object # If reduction is NONE, this has shape [batch_size, d0, .. dN-1];