def predict_xl(text, model: Transformer, device, is_beam_search=False):
    input_id = get_input_char_index(text)
    # input_length = torch.LongTensor([len(input_id)]).to(device)
    input_tensor = torch.LongTensor([input_id]).to(device)
    batch_size, src_len = input_tensor.shape
    trg = input_tensor.new_full((batch_size, 1), model.sos_idx)
    src_mask, trg_mask = model.make_masks(input_tensor, trg)
    if not is_beam_search:
        # Greedy decoding: feed the growing target sequence back into the decoder
        # until EOS is produced or the length limit is hit.
        encoder_output = model.encoder(input_tensor, src_mask)
        step = 0
        result = []
        while step < 200:
            output = model.decoder(trg, encoder_output, trg_mask, src_mask)
            output = torch.argmax(output[:, -1], dim=1)
            result.append(output.item())
            if output.item() == EOS_IDX:
                break
            output = output.unsqueeze(1)
            trg = torch.cat((trg, output), dim=1)
            src_mask, trg_mask = model.make_masks(input_tensor, trg)
            step += 1
        output_str = get_output_char(result)
        return output_str
    else:
        target = beam_search.beam_decode(input_tensor, model, beam_with=5)
        print(target)
        print(len(target[0][0]))
        output_str = get_output_char(target[0][0][1:])
        return output_str
def __init__(self, model): self.device = torch.device('cuda') checkpoint = torch.load(model) checkpoint_copy = checkpoint['model'].copy() for k in list(checkpoint_copy.keys()): new_key = k.replace('module.model.', '') checkpoint_copy.update({str(new_key): checkpoint_copy.pop(k)}) model_opt = checkpoint['settings'] model = Transformer( model_opt.src_vocab_size, model_opt.tgt_vocab_size, model_opt.max_token_seq_len, tgt_emb_prj_weight_sharing=model_opt.proj_share_weight, emb_src_tgt_weight_sharing=model_opt.embs_share_weight, d_k=model_opt.d_k, d_v=model_opt.d_v, d_model=model_opt.d_model, d_word_vec=model_opt.d_word_vec, d_inner=model_opt.d_inner_hid, n_layers=model_opt.n_layers, n_head=model_opt.n_head, dropout=model_opt.dropout) model.load_state_dict(checkpoint_copy) model = model.to(self.device) self.model = model for p in self.model.parameters(): p.requires_grad = False self.model.eval()
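# The loop above renames checkpoint keys by stripping the 'module.model.' prefix that
# nn.DataParallel adds (here wrapped around an outer module holding the Transformer in
# self.model) when the state_dict is saved. A minimal, hedged sketch of that renaming as
# a reusable helper; the prefix and the checkpoint layout are assumptions taken from the
# snippet above, not a fixed API of the repo.
def strip_state_dict_prefix(state_dict, prefix='module.model.'):
    """Return a copy of state_dict with `prefix` removed from every matching key."""
    return {
        (k[len(prefix):] if k.startswith(prefix) else k): v
        for k, v in state_dict.items()
    }

# Example (hypothetical checkpoint path):
#   checkpoint = torch.load('model.chkpt')
#   model.load_state_dict(strip_state_dict_prefix(checkpoint['model']))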
def __init__(self, opt): self.opt = opt self.device = torch.device('cuda' if opt.cuda else 'cpu') checkpoint = torch.load(opt.model) model_opt = checkpoint['settings'] self.model_opt = model_opt model = Transformer( model_opt.src_vocab_size, model_opt.tgt_vocab_size, model_opt.max_token_seq_len, tgt_emb_prj_weight_sharing=model_opt.proj_share_weight, emb_src_tgt_weight_sharing=model_opt.embs_share_weight, d_k=model_opt.d_k, d_v=model_opt.d_v, d_model=model_opt.d_model, d_word_vec=model_opt.d_word_vec, d_inner=model_opt.d_inner_hid, n_layers=model_opt.n_layers, n_head=model_opt.n_head, dropout=model_opt.dropout) model.load_state_dict(checkpoint['model']) print('[Info] Trained model state loaded.') # model.word_prob_prj = nn.LogSoftmax(dim=1) model = model.to(self.device) self.model = model self.model.eval()
def load_model(opt, device):
    checkpoint = torch.load(opt.model, map_location=device)
    model_opt = checkpoint['settings']
    model = Transformer(
        model_opt.src_vocab_size,
        model_opt.trg_vocab_size,
        model_opt.src_pad_idx,
        model_opt.trg_pad_idx,
        trg_emb_prj_weight_sharing=model_opt.proj_share_weight,
        emb_src_trg_weight_sharing=model_opt.embs_share_weight,
        d_k=model_opt.d_k,
        d_v=model_opt.d_v,
        d_model=model_opt.d_model,
        d_word_vec=model_opt.d_word_vec,
        d_inner=model_opt.d_inner_hid,
        n_layers=model_opt.n_layers,
        n_head=model_opt.n_head,
        dropout=model_opt.dropout).to(device)
    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')
    return model
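# A minimal usage sketch for load_model above; the checkpoint filename and the Namespace
# field are illustrative assumptions (in practice these come from the translate-time
# argparse options, not from this sketch).
from argparse import Namespace

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
opt = Namespace(model='trained.chkpt')  # hypothetical checkpoint path
model = load_model(opt, device)
model.eval()  # disable dropout for decoding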
def train(model: Transformer, optimizer, criterion, clip, device): model.train() epoches_loss = 0 for index, batch in tqdm(enumerate(dataset_pro.train_iter)): shang_lian, shang_lian_length = batch.shang_lian shang_lian = shang_lian.permute(1, 0).to(device) # shang_lian_length = shang_lian_length.permute(1, 0).to(device) # shang_lian_length = shang_lian_length.numpy() # shang_lian_pos = torch.LongTensor(get_pos_ids(shang_lian_length, shang_lian.shape[1])).to(device) xia_lian, xia_lian_length = batch.xia_lian xia_lian = xia_lian.permute(1, 0).to(device) # xia_lian_length = xia_lian_length.numpy() # xia_lian_pos = torch.LongTensor(get_pos_ids(xia_lian_length, xia_lian.shape[1])).to(device) optimizer.zero_grad() outputs = model(shang_lian, xia_lian[:, :-1]) outputs = outputs.contiguous().view(-1, outputs.shape[-1]) xia_lian = xia_lian[:, 1:].contiguous().view(-1) loss = criterion(outputs, xia_lian) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # print(loss.item()) optimizer.step() epoches_loss += loss.item() result_loss = epoches_loss / len(dataset_pro.train_iter) return result_loss
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-read_feats_scp_file', required=True)
    parser.add_argument('-read_vocab_file', required=True)
    parser.add_argument('-max_token_seq_len', type=int, required=True)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=1024)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-save_model_file', required=True)
    opt = parser.parse_args()

    print('--------------------[PROCEDURE]--------------------')
    print('[PROCEDURE] reading dimensions from the data files and initializing the model')
    for key, matrix in kaldi_io.read_mat_scp(opt.read_feats_scp_file):
        opt.src_dim = matrix.shape[1]
        break
    print('[INFO] got features of dimension {} from {}.'.format(
        opt.src_dim, opt.read_feats_scp_file))
    word2idx = torch.load(opt.read_vocab_file)
    opt.tgt_vocab_dim = len(word2idx)
    print('[INFO] got labels of dimension {} from {}.'.format(
        opt.tgt_vocab_dim, opt.read_vocab_file))
    print('[INFO] model will be initialized with arguments:\n{}.'.format(opt))

    model = Transformer(opt.src_dim,
                        opt.tgt_vocab_dim,
                        opt.max_token_seq_len,
                        n_layers=opt.n_layers,
                        n_head=opt.n_head,
                        d_model=opt.d_model,
                        d_inner_hid=opt.d_inner_hid,
                        d_k=opt.d_k,
                        d_v=opt.d_v,
                        dropout=opt.dropout,
                        proj_share_weight=opt.proj_share_weight,
                        embs_share_weight=opt.embs_share_weight)

    checkpoint = {'model': model, 'model_options': opt, 'epoch': 0}
    torch.save(checkpoint, opt.save_model_file)
    # The saved file can be read back with:
    #   checkpoint = torch.load(opt.save_model_file)
    #   model = checkpoint['model']
    print('[INFO] initialized model is saved to {}.'.format(opt.save_model_file))
def __init__(self, opt, device): self.opt = opt self.device = device checkpoint = torch.load(opt.model) model_opt = checkpoint['settings'] self.model_opt = model_opt model = Transformer(model_opt.input_dim, model_opt.output_dim, model_opt.n_inputs_max_seq, model_opt.n_outputs_max_seq, d_k=model_opt.d_k, d_v=model_opt.d_v, d_model=model_opt.d_model, d_inner_hid=model_opt.d_inner_hid, n_layers=model_opt.n_layers, n_head=model_opt.n_head, dropout=model_opt.dropout, device=device, is_train=False) model.load_state_dict(checkpoint['model']) print('[Info] Trained model state loaded.') model.to(device) prob_projection.to(device) model.prob_projection = prob_projection self.model = model self.model.eval()
def main(): ''' Main function ''' opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model # ========= Loading Dataset =========# data = torch.load(opt.data) opt.max_token_seq_len = data['settings'].max_token_seq_len trn_data, val_data = prepare_dataloaders(data, opt) opt.src_vocab_size = trn_data.dataset.src_vocab_size opt.tgt_vocab_size = trn_data.dataset.tgt_vocab_size # ========= Preparing Model =========# if opt.embs_share_weight: assert trn_data.dataset.src_word2idx == trn_data.dataset.tgt_word2idx,\ ('The src/tgt word2idx table are different but asked to share ' 'word embedding.') print(opt) device = torch.device('cuda' if opt.cuda else 'cpu') transformer = Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) optimizer = ScheduledOptim( optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) train(transformer, trn_data, val_data, optimizer, device, opt)
def build_transformer(n_src_vocab=VOCAB_SZ + 1, n_tgt_vocab=VOCAB_SZ + 1, len_max_seq_encoder=MAX_QUESTION_SZ, len_max_seq_decoder=MAX_ANSWER_SZ): return Transformer( n_src_vocab=n_src_vocab, # add PAD in vocabulary n_tgt_vocab=n_tgt_vocab, # add PAD in vocabulary len_max_seq_encoder=len_max_seq_encoder, len_max_seq_decoder=len_max_seq_decoder, )
def __init__(self, opt): self.opt = opt self.tt = torch.cuda if opt.cuda else torch checkpoint = torch.load(opt.model, map_location=lambda storage, loc: storage) model_opt = checkpoint['settings'] if 'use_ctx' not in model_opt.__dict__: model_opt.use_ctx = False self.model_opt = model_opt model = Transformer(model_opt.src_vocab_size, model_opt.tgt_vocab_size, model_opt.max_token_seq_len, proj_share_weight=model_opt.proj_share_weight, embs_share_weight=model_opt.embs_share_weight, d_k=model_opt.d_k, d_v=model_opt.d_v, d_model=model_opt.d_model, d_word_vec=model_opt.d_word_vec, d_inner_hid=model_opt.d_inner_hid, n_layers=model_opt.n_layers, n_head=model_opt.n_head, dropout=model_opt.dropout, use_ctx=model_opt.use_ctx) prob_projection = nn.LogSoftmax() model.load_state_dict(checkpoint['model']) # New max_token_seq_len for position encoding model = self.change_position_embedings(model, opt.max_token_seq_len, model_opt.d_word_vec, model_opt.use_ctx) model_opt.max_token_seq_len = opt.max_token_seq_len print('[Info] Trained model state loaded.') if opt.cuda: model.cuda() prob_projection.cuda() else: model.cpu() prob_projection.cpu() model.prob_projection = prob_projection self.model = model self.model.eval()
def __init__(self, device=None, jit=False): self.device = device self.jit = jit self.opt = Namespace(**{ 'batch_size': 128, 'd_inner_hid': 2048, 'd_k': 64, 'd_model': 512, 'd_word_vec': 512, 'd_v': 64, 'data_pkl': 'm30k_deen_shr.pkl', 'debug': '', 'dropout': 0.1, 'embs_share_weight': False, 'epoch': 1, 'label_smoothing': False, 'log': None, 'n_head': 8, 'n_layers': 6, 'n_warmup_steps': 128, 'cuda': True, 'proj_share_weight': False, 'save_mode': 'best', 'save_model': None, 'script': False, 'train_path': None, 'val_path': None, }) _, validation_data = prepare_dataloaders(self.opt, self.device) transformer = Transformer( self.opt.src_vocab_size, self.opt.trg_vocab_size, src_pad_idx=self.opt.src_pad_idx, trg_pad_idx=self.opt.trg_pad_idx, trg_emb_prj_weight_sharing=self.opt.proj_share_weight, emb_src_trg_weight_sharing=self.opt.embs_share_weight, d_k=self.opt.d_k, d_v=self.opt.d_v, d_model=self.opt.d_model, d_word_vec=self.opt.d_word_vec, d_inner=self.opt.d_inner_hid, n_layers=self.opt.n_layers, n_head=self.opt.n_head, dropout=self.opt.dropout).to(self.device) if self.jit: transformer = torch.jit.script(transformer) self.module = transformer batch = list(validation_data)[0] src_seq = patch_src(batch.src, self.opt.src_pad_idx).to(self.device) trg_seq, self.gold = map(lambda x: x.to(self.device), patch_trg(batch.trg, self.opt.trg_pad_idx)) # We use validation_data for training as well so that it can finish fast enough. self.example_inputs = (src_seq, trg_seq)
def evaluate(model: Transformer, criterion, device): model.eval() epoches_loss = 0 print('evaluate') with torch.no_grad(): for index, batch in enumerate(dataset_pro.valid_iter): shang_lian, shang_lian_length = batch.shang_lian shang_lian = shang_lian.permute(1, 0).to(device) # shang_lian_length = shang_lian_length.permute(1, 0).to(device) # shang_lian_length = shang_lian_length.numpy() # shang_lian_pos = torch.LongTensor(get_pos_ids(shang_lian_length, shang_lian.shape[1])).to(device) xia_lian, xia_lian_length = batch.xia_lian xia_lian = xia_lian.permute(1, 0).to(device) # xia_lian_length = xia_lian_length.numpy() # xia_lian_pos = torch.LongTensor(get_pos_ids(xia_lian_length, xia_lian.shape[1])).to(device) outputs = model(shang_lian, xia_lian[:, :-1]) outputs = outputs.contiguous().view(-1, outputs.shape[-1]) xia_lian = xia_lian[:, 1:].contiguous().view(-1) loss = criterion(outputs, xia_lian) epoches_loss += loss.item() return epoches_loss / len(dataset_pro.valid_iter)
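# A hedged sketch of how the train()/evaluate() pair above can be driven: loop over epochs
# and keep the checkpoint with the lowest validation loss. The epoch count, clip value and
# checkpoint filename are illustrative assumptions, not values taken from the repo.
def run_training(model, optimizer, criterion, device, n_epochs=10, clip=1.0):
    best_valid_loss = float('inf')
    for epoch in range(n_epochs):
        train_loss = train(model, optimizer, criterion, clip, device)
        valid_loss = evaluate(model, criterion, device)
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'transformer-model-best.pt')  # hypothetical name
        print(f'epoch {epoch + 1:02d} | train loss {train_loss:.4f} | valid loss {valid_loss:.4f}')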
def load_model(opt): # TODO not working with save mode 'all' checkpoint = torch.load(opt.model + '.chkpt', map_location=opt.device) model_opt = checkpoint['settings'] model = Transformer( model_opt.src_vocab_size, model_opt.tgt_vocab_size, model_opt.max_token_seq_len, tgt_emb_prj_weight_sharing=model_opt.proj_share_weight, emb_src_tgt_weight_sharing=False, d_k=model_opt.d_k, d_v=model_opt.d_v, d_model=model_opt.d_model, d_word_vec=model_opt.d_word_vec, d_inner=model_opt.d_inner_hid, n_layers=model_opt.n_layers, n_head=model_opt.n_head, dropout=model_opt.dropout) model.load_state_dict(checkpoint['model']) print('[Info] Trained model state loaded.') return model, model_opt
def skyline_model_provider(): opt = model_config() return TransformerWithLoss( Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout)).cuda()
def __init__(self, opt): self.opt = opt self.tt = torch.cuda if opt.cuda else torch checkpoint = torch.load(opt.model) model_opt = checkpoint['settings'] self.model_opt = model_opt model = Transformer( model_opt.src_vocab_size, model_opt.tgt_vocab_size, model_opt.max_token_seq_len, proj_share_weight=model_opt.proj_share_weight, embs_share_weight=model_opt.embs_share_weight, d_k=model_opt.d_k, d_v=model_opt.d_v, d_model=model_opt.d_model, d_word_vec=model_opt.d_word_vec, d_inner_hid=model_opt.d_inner_hid, n_layers=model_opt.n_layers, n_head=model_opt.n_head, dropout=model_opt.dropout) prob_projection = nn.LogSoftmax() model.load_state_dict(checkpoint['model']) print('[Info] Trained model state loaded.') if opt.cuda: model.cuda() prob_projection.cuda() else: model.cpu() prob_projection.cpu() model.prob_projection = prob_projection self.model = model self.model.eval()
def train(): transformer = Transformer( n_src_vocab=1315, #opt.src_vocab_size, n_trg_vocab=10, #opt.trg_vocab_size, src_pad_idx=opt.src_pad_idx, trg_pad_idx=opt.trg_pad_idx, trg_emb_prj_weight_sharing=False, emb_src_trg_weight_sharing=False, d_k=64, d_v=64, d_model=512, d_word_vec=512, d_inner=2048, n_layers=6, n_head=8, dropout=0.1)
def __init__(self,_config): super(Multimodal_Context, self).__init__() #print("Config in multimodal context:",_config["multimodal_context_configs"]) self.config = _config (in_text,in_audio,in_video) = [ _config["num_context_sequence"]*e for e in _config["unimodal_context"]["hidden_sizes"]] #mfn config contains a list of configs and the first one of them is the config, which #contains a dictionary called h_dims which has the [ht,ha,hv]. (out_text,out_audio,out_video) = _config["mfn_configs"][0]["h_dims"] #The first one is hl self.fc_uni_text_to_mfn_text_input = nn.Linear(in_text,out_text) self.text_in_drop = nn.Dropout(_config["multimodal_context_configs"]["text_in_drop"]) #The second one is ha self.fc_uni_audio_to_mfn_audio_input = nn.Linear(in_audio,out_audio) self.audio_in_drop = nn.Dropout(_config["multimodal_context_configs"]["audio_in_drop"]) #The third one is hv self.fc_uni_video_to_mfn_video_input = nn.Linear(in_video,out_video) self.video_in_drop = nn.Dropout(_config["multimodal_context_configs"]["video_in_drop"]) #This one will output the initialization of the mfn meory encoder_config =self.config["multimodal_context_configs"] self.self_attention_module = Transformer( n_src_features = encoder_config["n_source_features"], len_max_seq = encoder_config["max_token_seq_len"], _config = self.config, tgt_emb_prj_weight_sharing=encoder_config["proj_share_weight"], emb_src_tgt_weight_sharing=encoder_config["embs_share_weight"], d_k=encoder_config["d_k"], d_v=encoder_config["d_v"], d_model=encoder_config["d_model"], d_word_vec=encoder_config["d_word_vec"], d_inner=encoder_config["d_inner_hid"], n_layers=encoder_config["n_layers"], n_head=encoder_config["n_head"], dropout=encoder_config["dropout"] ).to(self.config["device"]) self.mem_in_drop = nn.Dropout(_config["multimodal_context_configs"]["mem_in_drop"])
def build_transformer(
        n_src_vocab=VOCAB_SZ + 1,
        n_tgt_vocab=VOCAB_SZ + 1,
        len_max_seq_encoder=MAX_QUESTION_SZ,
        len_max_seq_decoder=MAX_ANSWER_SZ,
        built_in=False,
        weight_sharing=True,
):
    if built_in:
        raise NotImplementedError("Fix input shape error")
        return torch.nn.Transformer()
    return Transformer(
        n_src_vocab=n_src_vocab,  # add PAD in vocabulary
        n_tgt_vocab=n_tgt_vocab,  # add PAD in vocabulary
        len_max_seq_encoder=len_max_seq_encoder,
        len_max_seq_decoder=len_max_seq_decoder,
        tgt_emb_prj_weight_sharing=weight_sharing,
        emb_src_tgt_weight_sharing=weight_sharing,
    )
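# A short usage sketch for build_transformer above; the device selection and the parameter
# count printout are illustrative assumptions. The built_in=True branch currently raises
# NotImplementedError, so only the custom Transformer path is exercised here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = build_transformer(weight_sharing=True).to(device)
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'{n_params} trainable parameters')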
def __init__(self, num_layers = 6, num_heads = 8, key_dimension = 64, value_dimension = 64, dropout = 0.1, n_position = 160, d_char_vec = 512, inner_dimension = 2048, n_trg_position = MAX_ANSWER_SIZE, n_src_position = MAX_QUESTION_SIZE, padding = 1, critic_num_layers=4, critic_kernel_size=4, critic_padding=1, model=None): super(Policy_Network, self).__init__() self.action_transformer = Transformer(n_src_vocab=VOCAB_SIZE + 1, n_trg_vocab=VOCAB_SIZE+1, src_pad_idx=0, trg_pad_idx=0, d_char_vec=d_char_vec, d_model=d_char_vec, d_inner=inner_dimension, n_layers=num_layers, n_head=num_heads, d_k=key_dimension, d_v=value_dimension, dropout=dropout, n_trg_position=n_trg_position, n_src_position=n_src_position, trg_emb_prj_weight_sharing=True, emb_src_trg_weight_sharing=True) if model == None else model self.value_head = Critic(conv_layers=critic_num_layers, d_char_vec=d_char_vec, kernel_size=critic_kernel_size, n_vocab=VOCAB_SIZE+1, dropout=dropout, padding=critic_padding, src_embedding=self.action_transformer.encoder.src_word_emb, trg_embedding=self.action_transformer.decoder.trg_word_emb, src_position_enc=self.action_transformer.encoder.position_enc, trg_position_enc=self.action_transformer.decoder.position_enc)
def main(): if not os.path.exists(args.ckpt_file): raise FileNotFoundError("model file not found") data_dir = '/home/tiankeke/workspace/datas/sumdata/' TRAIN_X = os.path.join(data_dir, 'train/train.article.txt') TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt') TEST_X = args.input_file small_vocab_file = 'sumdata/small_vocab.json' if os.path.exists(small_vocab_file): small_vocab = json.load(open(small_vocab_file)) else: small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=80000) max_src_len = 101 max_tgt_len = 47 test_x = BatchManager(load_data(TEST_X, max_src_len, args.n_test), args.batch_size, small_vocab) model = Transformer(len(small_vocab), len(small_vocab), max_src_len, d_word_vec=300, d_model=300, d_inner=1200, n_layers=1, n_head=6, d_k=50, d_v=50, dropout=0.1, tgt_emb_prj_weight_sharing=True, emb_src_tgt_weight_sharing=True).cuda() # print(model) model.eval() saved_state = torch.load(args.ckpt_file) model.load_state_dict(saved_state['state_dict']) print('Load model parameters from %s' % args.ckpt_file) my_test(test_x, model, small_vocab)
def __init__(self, opt): #opt is from argprass self.opt = opt self.device = torch.device('cuda' if opt.cuda else 'cpu') self.m = opt.m #opt.model is the model path checkpoint = torch.load(opt.model) #model_opt is the model hyper params model_opt = checkpoint['settings'] self.model_opt = model_opt model = Transformer( model_opt.src_vocab_size, model_opt.tgt_vocab_size, model_opt.max_token_seq_len, tgt_emb_prj_weight_sharing=model_opt.proj_share_weight, emb_src_tgt_weight_sharing=model_opt.embs_share_weight, d_k=model_opt.d_k, d_v=model_opt.d_v, d_model=model_opt.d_model, d_word_vec=model_opt.d_word_vec, d_inner=model_opt.d_inner_hid, n_layers=model_opt.n_layers, n_head=model_opt.n_head, dropout=model_opt.dropout, return_attns=opt.return_attns) #Load the actual model weights model.load_state_dict(checkpoint['model']) print('[Info] Trained model state loaded.') model.word_prob_prj = nn.LogSoftmax(dim=1) model = model.to(self.device) self.model = model self.model.eval()
def __init__(self, opt): self.opt = opt self.device = torch.device('cuda' if opt.cuda else 'cpu') checkpoint = torch.load(opt.model) model_opt = checkpoint['settings'] self.model_opt = model_opt '''added by self''' checkpoint_copy = checkpoint['model'].copy() for k in list(checkpoint_copy.keys()): new_key = k.replace('module.model.', '') checkpoint_copy.update({str(new_key): checkpoint_copy.pop(k)}) ''' end ''' model = Transformer( model_opt.src_vocab_size, model_opt.tgt_vocab_size, model_opt.max_token_seq_len, tgt_emb_prj_weight_sharing=model_opt.proj_share_weight, emb_src_tgt_weight_sharing=model_opt.embs_share_weight, d_k=model_opt.d_k, d_v=model_opt.d_v, d_model=model_opt.d_model, d_word_vec=model_opt.d_word_vec, d_inner=model_opt.d_inner_hid, n_layers=model_opt.n_layers, n_head=model_opt.n_head, dropout=model_opt.dropout) model.load_state_dict(checkpoint_copy) print('[Info] Trained model state loaded.') model.word_prob_prj = nn.LogSoftmax(dim=1) model = model.to(self.device) self.model = model self.model.eval()
def main(): ''' Main function ''' parser = argparse.ArgumentParser() parser.add_argument('-data_all', default='data/csv/data_train_2_sort.torch') parser.add_argument('-save_model', default='module/2018-7-30.pt') parser.add_argument('-start_time', default='2018-07-01') parser.add_argument('-end_time', default='2018-08-30') parser.add_argument('-epoch', type=int, default=16) parser.add_argument('-batch_size', type=int, default=128) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=32) parser.add_argument('-d_v', type=int, default=32) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=2) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.3) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default='log/logs.log') parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') parser.add_argument('-batch_x', default=32) parser.add_argument('-batch_y', default=32) parser.add_argument('-train_type', default='name') opt = parser.parse_args() opt.cuda = torch.cuda.is_available() opt.d_word_vec = opt.d_model # ========= Loading Dataset =========# # opt.max_token_seq_len = data['settings'].max_token_seq_len training_data, validation_data, voc_name, data_val_ofpa = ld.get_data_loader( opt, device) opt.src_vocab_size = voc_name opt.tgt_vocab_size = opt.src_vocab_size if opt.train_type == 'time': voc = ld.get_time_vac(opt) opt.tgt_vocab_size = voc if voc > 500 else 728 # ========= Preparing Model =========# if opt.embs_share_weight: assert opt.src_vocab_size == opt.tgt_vocab_size, \ 'The src/tgt word2idx table are different but asked to share word embedding.' print(opt) transformer = Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.batch_x, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) optimizer = ScheduledOptim( optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) if opt.train_type == 'time': print("train time dim ") # train(transformer, train_time, val_time, optimizer, device, opt) else: train(transformer, training_data, validation_data, optimizer, device, opt, data_val_ofpa)
def main(): ''' Main function ''' parser = argparse.ArgumentParser() parser.add_argument('-data', required=True) parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-batch_size', type=int, default=64) # parser.add_argument('-d_word_vec', type=int, default=512) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) parser.add_argument('-save_model', default=None) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model # ========= Loading Dataset =========# data = torch.load(opt.data) opt.max_token_seq_len = data['settings'].max_token_seq_len training_data, validation_data = prepare_dataloaders(data, opt) opt.src_vocab_size = training_data.dataset.src_vocab_size opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size # ========= Preparing Model =========# if opt.embs_share_weight: assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \ 'The src/tgt word2idx table are different but asked to share word embedding.' print(opt) device = torch.device('cuda' if opt.cuda else 'cpu') transformer = Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) optimizer = ScheduledOptim( optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) train(transformer, training_data, validation_data, optimizer, device, opt)
def main(): ''' Usage: python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 128 -epoch 100 -optim nero -lr 0.003 ''' parser = argparse.ArgumentParser() parser.add_argument('-data_pkl', default=None) # all-in-1 data pickle or bpe field parser.add_argument('-train_path', default=None) # bpe encoded data parser.add_argument('-val_path', default=None) # bpe encoded data parser.add_argument('-seed', type=int, default=0) parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-b', '--batch_size', type=int, default=2048) parser.add_argument('-optim', type=str, choices=['adam', 'sgd', 'nero', 'lamb']) parser.add_argument('-lr', type=float) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) parser.add_argument('-save_model', default=None) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model # set random seed torch.manual_seed(opt.seed) np.random.seed(opt.seed) # tensorboard writer log_dir = 'runs/' + opt.optim + '_' + str(opt.lr) + '_seed' + str(opt.seed) writer = SummaryWriter(log_dir=log_dir) print("Saving tensorboard to "+log_dir) if not opt.log and not opt.save_model: print('No experiment result will be saved.') raise device = torch.device('cuda' if opt.cuda else 'cpu') #========= Loading Dataset =========# if all((opt.train_path, opt.val_path)): training_data, validation_data = prepare_dataloaders_from_bpe_files(opt, device) elif opt.data_pkl: training_data, validation_data = prepare_dataloaders(opt, device) else: raise print(opt) transformer = Transformer( opt.src_vocab_size, opt.trg_vocab_size, src_pad_idx=opt.src_pad_idx, trg_pad_idx=opt.trg_pad_idx, trg_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_trg_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) if opt.optim == 'adam': optimizer = optim.Adam(transformer.parameters(), lr=opt.lr, betas=(0.0, 0.999)) elif opt.optim == 'nero': optimizer = Nero(transformer.parameters(), lr=opt.lr) elif opt.optim == 'lamb': optimizer = Lamb(transformer.parameters(), lr=opt.lr, betas=(0.0, 0.999)) elif opt.optim == 'sgd': optimizer = optim.SGD(transformer.parameters(), lr=opt.lr, momentum=0) print("Using optim", type(optimizer).__name__) lr_lambda = lambda epoch : 2 * min(epoch / opt.epoch, (opt.epoch-epoch) / opt.epoch) scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) train(transformer, training_data, validation_data, optimizer, scheduler, device, opt, writer) writer.close()
def main(): ''' Main function ''' parser = argparse.ArgumentParser() parser.add_argument('-data', required=True) parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-batch_size', type=int, default=64) #parser.add_argument('-d_word_vec', type=int, default=512) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=1024) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) parser.add_argument('-save_model', default=None) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model #========= Loading Dataset =========# data = torch.load(opt.data) opt.max_token_seq_len = data['settings'].max_token_seq_len #========= Preparing DataLoader =========# training_data = DataLoader(data['dict']['src'], data['dict']['tgt'], src_insts=data['train']['src'], tgt_insts=data['train']['tgt'], batch_size=opt.batch_size, cuda=opt.cuda) validation_data = DataLoader(data['dict']['src'], data['dict']['tgt'], src_insts=data['valid']['src'], tgt_insts=data['valid']['tgt'], batch_size=opt.batch_size, shuffle=False, test=True, cuda=opt.cuda) opt.src_vocab_size = training_data.src_vocab_size opt.tgt_vocab_size = training_data.tgt_vocab_size #========= Preparing Model =========# if opt.embs_share_weight and training_data.src_word2idx != training_data.tgt_word2idx: print( '[Warning]', 'The src/tgt word2idx table are different but asked to share word embedding.' ) print(opt) transformer = Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, proj_share_weight=opt.proj_share_weight, embs_share_weight=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner_hid=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout) #print(transformer) optimizer = ScheduledOptim( optim.Adam(transformer.get_trainable_parameters(), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) def get_criterion(vocab_size): ''' With PAD token zero weight ''' weight = torch.ones(vocab_size) weight[Constants.PAD] = 0 return nn.CrossEntropyLoss(weight, size_average=False) crit = get_criterion(training_data.tgt_vocab_size) if opt.cuda: transformer = transformer.cuda() crit = crit.cuda() print("===>TRAIN\n") train(transformer, training_data, validation_data, crit, optimizer, opt)
def main():
    device = torch.device("cuda:0" if USE_CUDA else "cpu")
    env = Environment()
    END_TAG_IDX = env.lang.word2idx[END_TAG]
    SAY_HI = "hello"
    targ_lang = env.lang
    vocab_inp_size = len(env.lang.word2idx)
    vocab_tar_size = len(targ_lang.word2idx)
    print("vocab_inp_size", vocab_inp_size)
    print("vocab_tar_size", vocab_tar_size)
    model = Transformer(
        vocab_inp_size,
        vocab_tar_size,
        MAX_TARGET_LEN,
        d_word_vec=32,
        d_model=32,
        d_inner=32,
        n_layers=3,
        n_head=4,
        d_k=32,
        d_v=32,
        dropout=0.1,
    ).to(device)
    # baseline = Baseline(UNITS)
    history = []
    l_optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    batch = None

    def maybe_pad_sentence(s):
        return tf.keras.preprocessing.sequence.pad_sequences(
            s, maxlen=MAX_TARGET_LEN, padding='post')

    def get_returns(r: float, seq_len: int):
        return list(reversed([r * (GAMMA**t) for t in range(seq_len)]))

    def sentence_to_idxs(sentence: str):
        return [env.lang.word2idx[token] for token in tokenize_sentence(sentence)]

    for episode in range(EPISODES):
        # Start of episode
        env.reset()
        model.eval()
        # get first state from the env
        state, _, done = env.step(SAY_HI)
        while not done:
            src_seq = [env.lang.word2idx[token] for token in tokenize_sentence(state)]
            src_seq, src_pos = collate_fn([src_seq])
            src_seq, src_pos = src_seq.to(device), src_pos.to(device)
            enc_output, *_ = model.encoder(src_seq, src_pos)
            actions_t = []
            actions = []
            actions_idx = []
            while len(actions) == 0 or actions_idx[-1] != END_TAG_IDX and len(actions) < MAX_TARGET_LEN:
                # construct a new tgt_seq based on what has been output so far
                if len(actions_t) == 0:
                    tgt_seq = [env.lang.word2idx[Constants.UNK_WORD]]
                else:
                    tgt_seq = actions_idx
                tgt_seq, tgt_pos = collate_fn([tgt_seq])
                tgt_seq, tgt_pos = tgt_seq.to(device), tgt_pos.to(device)
                # dec_output dims: [1, pos, hidden]
                dec_output, *_ = model.decoder(tgt_seq, tgt_pos, src_seq, enc_output)
                # pick last step
                dec_output = dec_output[:, -1, :]
                # w_logits dims: [1, vocab_size]
                w_logits = model.tgt_word_prj(dec_output)
                # w_probs dims: [1, vocab_size]
                w_probs = torch.nn.functional.softmax(w_logits, dim=1)
                w_dist = torch.distributions.categorical.Categorical(probs=w_probs)
                w_idx_t = w_dist.sample()
                w_idx = w_idx_t.cpu().numpy()[0]
                actions_t.append(w_idx_t)
                actions_idx.append(w_idx)
                actions.append(env.lang.idx2word[w_idx])
            # the action is a sentence (string)
            action_str = ' '.join(actions)
            next_state, reward, done = env.step(action_str)
            # record history (used for the gradient update after the episode is done)
            history.append((state, actions_t, action_str, reward))
            state = next_state
        # End of episode: update the policy
        model.train()
        while len(history) >= BATCH_SIZE:
            batch = history[:BATCH_SIZE]
            state_inp_b, action_inp_b, reward_b, ret_seq_b = zip(*[[
                sentence_to_idxs(state), actions_b, reward,
                get_returns(reward, MAX_TARGET_LEN)
            ] for state, actions_b, _, reward in batch])
            action_inp_b = [torch.stack(sent) for sent in action_inp_b]
            action_inp_b = torch.stack(action_inp_b)
            ret_seq_b = np.asarray(ret_seq_b)
            # ret_mean = np.mean(ret_seq_b)
            # ret_std = np.std(ret_seq_b)
            # ret_seq_b = (ret_seq_b - ret_mean) / ret_std
            ret_seq_b = np.exp((ret_seq_b - 0.5) * 5)
            ret_seq_b = torch.tensor(ret_seq_b, dtype=torch.float32).to(device)
            loss = 0
            # loss_bl = 0
            l_optimizer.zero_grad()
            src_seq, src_pos = collate_fn(list(state_inp_b))
            src_seq, src_pos = src_seq.to(device), src_pos.to(device)
            enc_output_b, *_ = model.encoder(src_seq, src_pos)
            max_sentence_len = action_inp_b.shape[1]
            tgt_seq = [[Constants.BOS] for i in range(BATCH_SIZE)]
            for t in range(max_sentence_len):
                # _b stands for batch
                prev_w_idx_b, tgt_pos = collate_fn(tgt_seq)
                prev_w_idx_b, tgt_pos = prev_w_idx_b.to(device), tgt_pos.to(device)
                # dec_output_b dims: [batch, pos, hidden]
                dec_output_b, *_ = model.decoder(prev_w_idx_b, tgt_pos, src_seq, enc_output_b)
                # pick last step
                dec_output_b = dec_output_b[:, -1, :]
                # w_logits_b dims: [batch, vocab_size]
                w_logits_b = model.tgt_word_prj(dec_output_b)
                # w_probs_b dims: [batch, vocab_size]
                w_probs_b = torch.nn.functional.softmax(w_logits_b, dim=1)
                dist_b = torch.distributions.categorical.Categorical(probs=w_probs_b)
                curr_w_idx_b = action_inp_b[:, t, :]
                log_probs_b = torch.transpose(
                    dist_b.log_prob(torch.transpose(curr_w_idx_b, 0, 1)), 0, 1)
                # bl_val_b = baseline(tf.cast(dec_hidden_b, 'float32'))
                # delta_b = ret_b - bl_val_b
                # cost_b = -tf.math.multiply(log_probs_b, delta_b)
                ret_b = torch.reshape(ret_seq_b[:, t], (BATCH_SIZE, 1)).to(device)
                # REINFORCE cost for this step: negative log-probability weighted by the return
                cost_b = -torch.mul(log_probs_b, ret_b)
                loss += cost_b
                # loss_bl += -tf.math.multiply(delta_b, bl_val_b)
                prev_w_idx_b = curr_w_idx_b
                tgt_seq = np.append(tgt_seq, prev_w_idx_b.data.cpu().numpy(), axis=1).tolist()
            # accumulate gradients over the batch, then apply them
            loss = loss.mean()
            loss.backward()
            # loss_bl.backward()
            l_optimizer.step()
            # bl_optimizer.step()
            # reset the replay buffer for the next update
            history = history[BATCH_SIZE:]
        if episode % max(BATCH_SIZE, 32) == 0 and batch is not None:
            print(">>>>>>>>>>>>>>>>>>>>>>>>>>")
            print("Episode # ", episode)
            print("Samples from episode with rewards > 0: ")
            good_rewards = [(s, a_str, r) for s, _, a_str, r in batch]
            for s, a, r in random.sample(good_rewards, min(len(good_rewards), 3)):
                print("prev_state: ", s)
                print("actions: ", a)
                print("reward: ", r)
                # print("return: ", get_returns(r, MAX_TARGET_LEN))
            ret_seq_b_np = ret_seq_b.cpu().numpy()
            print("all returns: min=%f, max=%f, median=%f" %
                  (np.min(ret_seq_b_np), np.max(ret_seq_b_np), np.median(ret_seq_b_np)))
            print("avg reward: ", sum(reward_b) / len(reward_b))
            print("avg loss: ", np.mean(loss.cpu().detach().numpy()))
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    # All the options live here, with their default values.
    parser.add_argument('-data', required=False)
    parser.add_argument('-epoch', type=int, default=1)  # set to 1 for now, just to get the pipeline running
    parser.add_argument('-batch_size', type=int, default=32)
    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='/transformer_my')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')  # with action='store_true', passing the flag sets the option to True

    opt = parser.parse_args()
    opt.d_word_vec = opt.d_model

    # Hard-code the options below for convenience.
    opt.saved_weight = '/trained.chkpt'  # location of the previously trained weights
    opt.data = 'yunixng_bash/data/multi30k.atok.low.pt'  # location of the dataset
    opt.save_model = 'trained'  # name used when saving the model
    opt.save_mode = 'best'  # keep only the best checkpoint
    opt.proj_share_weight = True  # share target embedding and projection weights
    opt.label_smoothing = True  # use label smoothing
    opt.cuda = False
    opt.batch_size = 200
    opt.epoch = 30
    print(opt)

    # ========= Loading Dataset =========#
    # The data is already encoded; the encoding tables are stored inside `data`, which is a dict.
    # The src and tgt vocabularies differ, so the embs_share_weight option above must stay False.
    # The whole dataset (multi30k.atok.low.pt in the root directory) is only about 3 MB: roughly
    # 30k sentence pairs and a ~3k-word vocabulary of common English words. No word-piece
    # segmentation is used, only word-level encoding, so arbitrary input sentences often contain
    # out-of-vocabulary tokens, but the set is very convenient and fast for testing.
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len
    # Length preprocessing: just padding.
    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print('All configured options are printed below.')
    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(  # build the network
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
if __name__ == '__main__':
    args = get_args()
    # pad index
    device = torch.device('cuda' if not args.no_cuda else 'cpu')
    transformer_model = Transformer(args.sl_vocab_size,
                                    args.xl_vocab_size,
                                    hid_dim=args.embedding_dim,
                                    pf_dim=args.fp_inner_dim,
                                    n_layers=args.n_layers,
                                    n_heads=args.n_head,
                                    dropout=args.dropout,
                                    device=device,
                                    SOS_IDX=SOS_IDX,
                                    PAD_IDX=PAD_IDX,
                                    EOS_IDX=EOS_IDX).to(device)
    # transformer_model.load_state_dict(torch.load('./models-bak/transformer/1121/transformer-model_11.pt', map_location='cpu'))
    transformer_model.load_state_dict(
        torch.load('./models-bak/transformer/1122/transformer-model_500.pt', map_location='cpu'))
    transformer_model.eval()
    text = '欲出烦恼须无我'
    print(predict_xl(text, transformer_model, device, is_beam_search=True))
    # df = pd.read_excel('./couplet/result-test.xlsx')
    # df['transformer'] = df['上联'].apply(lambda x: predict_xl(x, transformer_model, device, is_beam_search=False))
    # df['transformer_beam'] = df['上联'].apply(lambda x: predict_xl(x, transformer_model, device, is_beam_search=True))
    # df.to_excel('./couplet/result-test.xlsx', index=False)
def main(): ''' Usage: python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 256 -warmup 128000 ''' parser = argparse.ArgumentParser() parser.add_argument('-data_pkl', default=None) # all-in-1 data pickle or bpe field parser.add_argument('-train_path', default=None) # bpe encoded data parser.add_argument('-val_path', default=None) # bpe encoded data parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-b', '--batch_size', type=int, default=2048) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) parser.add_argument('-save_model', default=None) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', default=True, action='store_true') parser.add_argument('-label_smoothing', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model if not opt.log and not opt.save_model: print('No experiment result will be saved.') raise if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000: print('[Warning] The warmup steps may be not enough.\n'\ '(sz_b, warmup) = (2048, 4000) is the official setting.\n'\ 'Using smaller batch w/o longer warmup may cause '\ 'the warmup stage ends with only little data trained.') device = torch.device('cuda' if opt.cuda else 'cpu') #========= Loading Dataset =========# if all((opt.train_path, opt.val_path)): training_data, validation_data = prepare_dataloaders_from_bpe_files( opt, device) elif opt.data_pkl: training_data, validation_data = prepare_dataloaders(opt, device) else: raise print(opt) transformer = Transformer(opt.src_vocab_size, opt.trg_vocab_size, src_pad_idx=opt.src_pad_idx, trg_pad_idx=opt.trg_pad_idx, trg_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_trg_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) optimizer = ScheduledOptim( optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09), 2.0, opt.d_model, opt.n_warmup_steps) train(transformer, training_data, validation_data, optimizer, device, opt)
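# For reference, a minimal sketch of the inverse-square-root warmup rule from
# "Attention Is All You Need" that ScheduledOptim-style wrappers are commonly built
# around; reading the extra 2.0 argument above as a learning-rate multiplier is an
# assumption, not a documented API of the wrapper.
def warmup_lr(step, d_model=512, n_warmup_steps=4000, lr_mul=2.0):
    # lr = lr_mul * d_model^-0.5 * min(step^-0.5, step * n_warmup_steps^-1.5), step >= 1
    return lr_mul * (d_model ** -0.5) * min(step ** -0.5, step * n_warmup_steps ** -1.5)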