def load_model(opt, device):
    checkpoint = torch.load(opt.model, map_location=device)
    model_opt = checkpoint['settings']

    model = Transformer(
        model_opt.src_vocab_size,
        model_opt.trg_vocab_size,
        model_opt.src_pad_idx,
        model_opt.trg_pad_idx,
        trg_emb_prj_weight_sharing=model_opt.proj_share_weight,
        emb_src_trg_weight_sharing=model_opt.embs_share_weight,
        d_k=model_opt.d_k,
        d_v=model_opt.d_v,
        d_model=model_opt.d_model,
        d_word_vec=model_opt.d_word_vec,
        d_inner=model_opt.d_inner_hid,
        n_layers=model_opt.n_layers,
        n_head=model_opt.n_head,
        dropout=model_opt.dropout).to(device)

    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')
    return model
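# Minimal usage sketch for load_model above. The flag names ('-model',
# '-no_cuda') are assumptions mirroring the training scripts later in this
# file, not a confirmed CLI.
import argparse

import torch

parser = argparse.ArgumentParser()
parser.add_argument('-model', default='trained.chkpt')  # hypothetical checkpoint path
parser.add_argument('-no_cuda', action='store_true')
opt = parser.parse_args()

device = torch.device('cpu' if opt.no_cuda else 'cuda')
model = load_model(opt, device)
model.eval()  # inference only: disables dropout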
def __init__(self, opt):
    self.opt = opt
    print(opt, "\n")
    self.device = torch.device('cuda' if opt.cuda else 'cpu')

    checkpoint = torch.load(opt.model)
    model_opt = checkpoint['settings']
    self.model_opt = model_opt
    print(model_opt)

    model = Transformer(
        model_opt.src_vocab_size,
        model_opt.tgt_vocab_size,
        model_opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
        emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
        d_k=model_opt.d_k,
        d_v=model_opt.d_v,
        d_model=model_opt.d_model,
        d_word_vec=model_opt.d_word_vec,
        d_inner=model_opt.d_inner_hid,
        n_layers=model_opt.n_layers,
        n_head=model_opt.n_head,
        dropout=model_opt.dropout)

    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')

    model.word_prob_prj = nn.LogSoftmax(dim=1)
    model = model.to(self.device)
    self.model = model
    self.model.eval()
def __init__(self, model):
    self.device = torch.device('cuda')

    checkpoint = torch.load(model)
    # Strip the 'module.model.' prefix (from a wrapper such as nn.DataParallel)
    # off every checkpoint key so the names match the bare Transformer.
    checkpoint_copy = checkpoint['model'].copy()
    for k in list(checkpoint_copy.keys()):
        new_key = k.replace('module.model.', '')
        checkpoint_copy.update({str(new_key): checkpoint_copy.pop(k)})

    model_opt = checkpoint['settings']
    model = Transformer(
        model_opt.src_vocab_size,
        model_opt.tgt_vocab_size,
        model_opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
        emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
        d_k=model_opt.d_k,
        d_v=model_opt.d_v,
        d_model=model_opt.d_model,
        d_word_vec=model_opt.d_word_vec,
        d_inner=model_opt.d_inner_hid,
        n_layers=model_opt.n_layers,
        n_head=model_opt.n_head,
        dropout=model_opt.dropout)

    model.load_state_dict(checkpoint_copy)
    model = model.to(self.device)
    self.model = model
    for p in self.model.parameters():
        p.requires_grad = False
    self.model.eval()
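# The key-renaming loop above is a special case of stripping a wrapper prefix
# from a saved state dict. A generic sketch of the same idea, assuming an
# arbitrary checkpoint dict and prefix:
def strip_prefix(state_dict, prefix='module.'):
    """Return a copy of state_dict with `prefix` removed from matching keys."""
    return {
        (k[len(prefix):] if k.startswith(prefix) else k): v
        for k, v in state_dict.items()
    }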
def __init__(self, opt, device):
    self.opt = opt
    self.device = device

    checkpoint = torch.load(opt.model)
    model_opt = checkpoint['settings']
    self.model_opt = model_opt

    model = Transformer(
        model_opt.input_dim,
        model_opt.output_dim,
        model_opt.n_inputs_max_seq,
        model_opt.n_outputs_max_seq,
        d_k=model_opt.d_k,
        d_v=model_opt.d_v,
        d_model=model_opt.d_model,
        d_inner_hid=model_opt.d_inner_hid,
        n_layers=model_opt.n_layers,
        n_head=model_opt.n_head,
        dropout=model_opt.dropout,
        device=device,
        is_train=False)

    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')

    model.to(device)
    # Assumed: the same log-probability projection used by the other loaders here.
    prob_projection = nn.LogSoftmax(dim=1)
    prob_projection.to(device)
    model.prob_projection = prob_projection

    self.model = model
    self.model.eval()
def __init__(self, opt):
    self.opt = opt
    self.device = torch.device('cuda' if opt.cuda else 'cpu')

    checkpoint = torch.load(opt.model)
    model_opt = checkpoint['settings']
    self.model_opt = model_opt

    if opt.prune:
        # NetworkWrapper
        prune_params = {'alpha': opt.prune_alpha}
        pruner = Pruner(device=self.device, load_mask=opt.load_mask,
                        prune_params=prune_params)
        model = NetworkWrapper(
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout,
            transformer=pruner)
    else:
        model = Transformer(
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout)

    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')

    model.word_prob_prj = nn.LogSoftmax(dim=1)
    model = model.to(self.device)
    self.model = model
    self.model.eval()
def __init__(self, opt):
    self.opt = opt
    self.tt = torch.cuda if opt.cuda else torch

    checkpoint = torch.load(opt.model,
                            map_location=lambda storage, loc: storage)
    model_opt = checkpoint['settings']
    if 'use_ctx' not in model_opt.__dict__:
        model_opt.use_ctx = False
    self.model_opt = model_opt

    model = Transformer(
        model_opt.src_vocab_size,
        model_opt.tgt_vocab_size,
        model_opt.max_token_seq_len,
        proj_share_weight=model_opt.proj_share_weight,
        embs_share_weight=model_opt.embs_share_weight,
        d_k=model_opt.d_k,
        d_v=model_opt.d_v,
        d_model=model_opt.d_model,
        d_word_vec=model_opt.d_word_vec,
        d_inner_hid=model_opt.d_inner_hid,
        n_layers=model_opt.n_layers,
        n_head=model_opt.n_head,
        dropout=model_opt.dropout,
        use_ctx=model_opt.use_ctx)
    prob_projection = nn.LogSoftmax()

    model.load_state_dict(checkpoint['model'])

    # New max_token_seq_len for position encoding
    model = self.change_position_embedings(
        model, opt.max_token_seq_len, model_opt.d_word_vec, model_opt.use_ctx)
    model_opt.max_token_seq_len = opt.max_token_seq_len
    print('[Info] Trained model state loaded.')

    if opt.cuda:
        model.cuda()
        prob_projection.cuda()
    else:
        model.cpu()
        prob_projection.cpu()

    model.prob_projection = prob_projection
    self.model = model
    self.model.eval()
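# change_position_embedings (defined elsewhere in the class) rebuilds the
# positional table for the new max length. A sketch of the usual fixed
# sinusoidal table from "Attention Is All You Need"; this is an assumption
# about what that method computes, since its body is not shown here:
import numpy as np
import torch

def sinusoid_table(n_position, d_hid):
    pos = np.arange(n_position)[:, None]   # (n_position, 1)
    i = np.arange(d_hid)[None, :]          # (1, d_hid)
    angle = pos / np.power(10000, 2 * (i // 2) / d_hid)
    table = np.zeros((n_position, d_hid))
    table[:, 0::2] = np.sin(angle[:, 0::2])  # even dimensions
    table[:, 1::2] = np.cos(angle[:, 1::2])  # odd dimensions
    return torch.FloatTensor(table)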
def main():
    if not os.path.exists(args.ckpt_file):
        raise FileNotFoundError("model file not found")

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    TEST_X = args.input_file

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file,
                                  vocab_size=80000)

    max_src_len = 101
    max_tgt_len = 47

    test_x = BatchManager(load_data(TEST_X, max_src_len, args.n_test),
                          args.batch_size, small_vocab)

    model = Transformer(len(small_vocab), len(small_vocab), max_src_len,
                        d_word_vec=300, d_model=300, d_inner=1200,
                        n_layers=1, n_head=6, d_k=50, d_v=50, dropout=0.1,
                        tgt_emb_prj_weight_sharing=True,
                        emb_src_tgt_weight_sharing=True).cuda()
    # print(model)
    model.eval()

    saved_state = torch.load(args.ckpt_file)
    model.load_state_dict(saved_state['state_dict'])
    print('Load model parameters from %s' % args.ckpt_file)

    my_test(test_x, model, small_vocab)
def __init__(self, opt):
    self.opt = opt
    self.tt = torch.cuda if opt.cuda else torch

    checkpoint = torch.load(opt.model)
    model_opt = checkpoint['settings']
    self.model_opt = model_opt

    model = Transformer(
        model_opt.src_vocab_size,
        model_opt.tgt_vocab_size,
        model_opt.max_token_seq_len,
        proj_share_weight=model_opt.proj_share_weight,
        embs_share_weight=model_opt.embs_share_weight,
        d_k=model_opt.d_k,
        d_v=model_opt.d_v,
        d_model=model_opt.d_model,
        d_word_vec=model_opt.d_word_vec,
        d_inner_hid=model_opt.d_inner_hid,
        n_layers=model_opt.n_layers,
        n_head=model_opt.n_head,
        dropout=model_opt.dropout)
    prob_projection = nn.LogSoftmax()

    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')

    if opt.cuda:
        model.cuda()
        prob_projection.cuda()
    else:
        model.cpu()
        prob_projection.cpu()

    model.prob_projection = prob_projection
    self.model = model
    self.model.eval()
def __init__(self, opt):
    self.opt = opt
    self.device = torch.device('cuda' if opt.cuda else 'cpu')

    checkpoint = torch.load(opt.model)
    model_opt = checkpoint['settings']
    self.model_opt = model_opt

    # Added by self: rename checkpoint keys by stripping the 'module.model.' prefix.
    checkpoint_copy = checkpoint['model'].copy()
    for k in list(checkpoint_copy.keys()):
        new_key = k.replace('module.model.', '')
        checkpoint_copy.update({str(new_key): checkpoint_copy.pop(k)})
    # End of addition.

    model = Transformer(
        model_opt.src_vocab_size,
        model_opt.tgt_vocab_size,
        model_opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
        emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
        d_k=model_opt.d_k,
        d_v=model_opt.d_v,
        d_model=model_opt.d_model,
        d_word_vec=model_opt.d_word_vec,
        d_inner=model_opt.d_inner_hid,
        n_layers=model_opt.n_layers,
        n_head=model_opt.n_head,
        dropout=model_opt.dropout)

    model.load_state_dict(checkpoint_copy)
    print('[Info] Trained model state loaded.')

    model.word_prob_prj = nn.LogSoftmax(dim=1)
    model = model.to(self.device)
    self.model = model
    self.model.eval()
def __init__(self, opt):  # opt comes from argparse
    self.opt = opt
    self.device = torch.device('cuda' if opt.cuda else 'cpu')
    self.m = opt.m

    # opt.model is the model path
    checkpoint = torch.load(opt.model)
    # model_opt holds the model hyperparameters
    model_opt = checkpoint['settings']
    self.model_opt = model_opt

    model = Transformer(
        model_opt.src_vocab_size,
        model_opt.tgt_vocab_size,
        model_opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
        emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
        d_k=model_opt.d_k,
        d_v=model_opt.d_v,
        d_model=model_opt.d_model,
        d_word_vec=model_opt.d_word_vec,
        d_inner=model_opt.d_inner_hid,
        n_layers=model_opt.n_layers,
        n_head=model_opt.n_head,
        dropout=model_opt.dropout,
        return_attns=opt.return_attns)

    # Load the actual model weights
    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')

    model.word_prob_prj = nn.LogSoftmax(dim=1)
    model = model.to(self.device)
    self.model = model
    self.model.eval()
def load_model(opt):
    # TODO: not working with save mode 'all'
    checkpoint = torch.load(opt.model + '.chkpt', map_location=opt.device)
    model_opt = checkpoint['settings']

    model = Transformer(
        model_opt.src_vocab_size,
        model_opt.tgt_vocab_size,
        model_opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
        emb_src_tgt_weight_sharing=False,
        d_k=model_opt.d_k,
        d_v=model_opt.d_v,
        d_model=model_opt.d_model,
        d_word_vec=model_opt.d_word_vec,
        d_inner=model_opt.d_inner_hid,
        n_layers=model_opt.n_layers,
        n_head=model_opt.n_head,
        dropout=model_opt.dropout)

    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')
    return model, model_opt
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    #parser.add_argument('-data', required=True)
    parser.add_argument('-epoch', type=int, default=200)
    parser.add_argument('-batch_size', type=int, default=32)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=4)
    parser.add_argument('-n_layers', type=int, default=4)
    parser.add_argument('-n_warmup_steps', type=int, default=2000)
    #parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-device', type=str, default='0')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    #data = torch.load(opt.data)
    #opt.max_token_seq_len = data['settings'].max_token_seq_len
    opt.max_token_seq_len = 126

    Dataloader = Loaders()
    Dataloader.get_loaders(opt)
    training_data, validation_data, coco_data = (
        Dataloader.loader['train'],
        Dataloader.loader['val'],
        Dataloader.loader['coco'])

    opt.src_vocab_size = len(Dataloader.frame_vocab)
    opt.tgt_vocab_size = len(Dataloader.story_vocab)

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device(f'cuda:{opt.device}' if opt.cuda else 'cpu')
    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    if opt.model:
        checkpoint = torch.load(opt.model)
        transformer.load_state_dict(checkpoint['model'])

    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, coco_data,
          optimizer, device, opt, Dataloader)
def main(): """ Main function """ parser = argparse.ArgumentParser() parser.add_argument('-data', required=True) parser.add_argument('-emb_path', default=None) parser.add_argument('-trained_model', default=None) parser.add_argument('-current_step', type=int, default=0) parser.add_argument('-epoch', type=int, default=5) parser.add_argument('-batch_size', type=int, default=32) #parser.add_argument('-d_word_vec', type=int, default=300) parser.add_argument('-d_model', type=int, default=300) parser.add_argument('-d_inner_hid', type=int, default=500) parser.add_argument('-d_k', type=int, default=50) parser.add_argument('-d_v', type=int, default=50) parser.add_argument('-n_head', type=int, default=6) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) # 日志保存地址(without suffix) parser.add_argument('-save_model', default=None) # 模型保存地址(without suffix) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model #========= Loading Dataset =========# data = torch.load(opt.data) opt.max_token_seq_len = data['settings'].max_token_seq_len #========= Preparing DataLoader =========# training_data = DataLoader( data['dict']['src'], data['dict']['tgt'], src_insts=data['train']['src'], tgt_insts=data['train']['tgt'], batch_size=opt.batch_size, cuda=opt.cuda) validation_data = DataLoader( data['dict']['src'], data['dict']['tgt'], src_insts=data['valid']['src'], tgt_insts=data['valid']['tgt'], batch_size=opt.batch_size, shuffle=False, test=True, cuda=opt.cuda) opt.src_vocab_size = training_data.src_vocab_size opt.tgt_vocab_size = training_data.tgt_vocab_size #========= Preparing Model =========# if opt.embs_share_weight and training_data.src_word2idx != training_data.tgt_word2idx: print('[Warning]', 'The src/tgt word2idx table are different but asked to share word embedding.') print(opt) transformer = Transformer( opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, emb_path=opt.emb_path, proj_share_weight=opt.proj_share_weight, embs_share_weight=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner_hid=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout) if opt.trained_model: checkpoint = torch.load(opt.trained_model) transformer.load_state_dict(checkpoint['model']) print('[Info] Trained model state loaded.') #print(transformer) optimizer = ScheduledOptim( optim.Adam( transformer.get_trainable_parameters(), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps, opt.current_step) def get_criterion(vocab_size): """ With PAD token zero weight """ weight = torch.ones(vocab_size) weight[Constants.PAD] = 0 #被pad的部分在计算loss的时候权重设为0 return nn.CrossEntropyLoss(weight, size_average=False) crit = get_criterion(training_data.tgt_vocab_size) if opt.cuda: transformer = transformer.cuda() crit = crit.cuda() train(transformer, training_data, validation_data, crit, optimizer, opt)
        # output_tensor = torch.argmax(output.squeeze(1), 1)
        output_str = get_output_char(result)
        return output_str
    else:
        target = beam_search.beam_decode(input_tensor, model, beam_with=5)
        print(target)
        print(len(target[0][0]))
        output_str = get_output_char(target[0][0][1:])
        return output_str


if __name__ == '__main__':
    args = get_args()

    # pad index
    device = torch.device('cuda' if not args.no_cuda else 'cpu')
    transformer_model = Transformer(args.sl_vocab_size, args.xl_vocab_size,
                                    hid_dim=args.embedding_dim,
                                    pf_dim=args.fp_inner_dim,
                                    n_layers=args.n_layers,
                                    n_heads=args.n_head,
                                    dropout=args.dropout,
                                    device=device,
                                    SOS_IDX=SOS_IDX,
                                    PAD_IDX=PAD_IDX,
                                    EOS_IDX=EOS_IDX).to(device)
    # transformer_model.load_state_dict(torch.load('./models-bak/transformer/1121/transformer-model_11.pt', map_location='cpu'))
    transformer_model.load_state_dict(
        torch.load('./models-bak/transformer/1122/transformer-model_500.pt',
                   map_location='cpu'))
    transformer_model.eval()

    text = '欲出烦恼须无我'
    print(predict_xl(text, transformer_model, device, is_beam_search=True))

    # df = pd.read_excel('./couplet/result-test.xlsx')
    # df['transformer'] = df['上联'].apply(lambda x: predict_xl(x, transformer_model, device, is_beam_search=False))
    # df['transformer_beam'] = df['上联'].apply(lambda x: predict_xl(x, transformer_model, device, is_beam_search=True))
    # df.to_excel('./couplet/result-test.xlsx', index=False)
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=100)
    parser.add_argument('-batch_size', type=int, default=64)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=6)
    parser.add_argument('-n_layers', type=int, default=2)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-seed', type=int, default=42)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best', 'record'], default='best')
    parser.add_argument('-save_thres', type=float, default=None)

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')
    parser.add_argument('-model', default=None, help='Path to model .pt file')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    set_seed(opt.seed)

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')

    if opt.model is None:
        transformer = Transformer(
            opt.src_vocab_size,
            opt.tgt_vocab_size,
            opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=opt.proj_share_weight,
            emb_src_tgt_weight_sharing=opt.embs_share_weight,
            d_k=opt.d_k,
            d_v=opt.d_v,
            d_model=opt.d_model,
            d_word_vec=opt.d_word_vec,
            d_inner=opt.d_inner_hid,
            n_layers=opt.n_layers,
            n_head=opt.n_head,
            dropout=opt.dropout).to(device)
    else:
        checkpoint = torch.load(opt.model)
        model_opt = checkpoint['settings']
        transformer = Transformer(
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout).to(device)
        transformer.load_state_dict(checkpoint['model'])
        print('[Info] Trained model state loaded.')

    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-config', type=str, default='config/rnnt.yaml')
    parser.add_argument('-load_model', type=str, default=None)
    parser.add_argument('-fp16_allreduce', action='store_true', default=False,
                        help='use fp16 compression during allreduce')
    parser.add_argument('-batches_per_allreduce', type=int, default=1,
                        help='number of batches processed locally before '
                             'executing allreduce across workers; it multiplies '
                             'total batch size.')
    parser.add_argument('-num_workers', type=int, default=0,
                        help='how many subprocesses to use for data loading. '
                             '0 means that the data will be loaded in the main process')
    parser.add_argument('-log', type=str, default='train.log')
    opt = parser.parse_args()

    configfile = open(opt.config)
    config = AttrDict(yaml.load(configfile, Loader=yaml.FullLoader))

    global global_step
    global_step = 0

    if hvd.rank() == 0:
        exp_name = config.data.name
        if not os.path.isdir(exp_name):
            os.mkdir(exp_name)
        logger = init_logger(exp_name + '/' + opt.log)
    else:
        logger = None

    if torch.cuda.is_available():
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(config.training.seed)
        torch.backends.cudnn.deterministic = True
    else:
        raise NotImplementedError

    #========= Build DataLoader =========#
    train_dataset = AudioDateset(config.data, 'train')
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.data.train.batch_size,
        sampler=train_sampler)

    assert train_dataset.vocab_size == config.model.vocab_size

    #========= Build A Model Or Load Pre-trained Model =========#
    model = Transformer(config.model)

    if hvd.rank() == 0:
        n_params, enc_params, dec_params = count_parameters(model)
        logger.info('# the number of parameters in the whole model: %d' % n_params)
        logger.info('# the number of parameters in encoder: %d' % enc_params)
        logger.info('# the number of parameters in decoder: %d' % dec_params)

    model.cuda()

    # define an optimizer (the learning rate is assumed to come from the YAML config)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.optimizer.lr,
                                 betas=(0.9, 0.98), eps=1e-9)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if opt.fp16_allreduce else hvd.Compression.none
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters(),
        compression=compression)

    # load pre-trained model
    if opt.load_model is not None and hvd.rank() == 0:
        checkpoint = torch.load(opt.load_model)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info('Loaded pre-trained model and previous optimizer!')
    elif hvd.rank() == 0:
        init_parameters(model)
        logger.info('Initialized all parameters!')

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # define loss function
    crit = nn.CrossEntropyLoss(ignore_index=0)

    # create a visualizer
    if config.training.visualization and hvd.rank() == 0:
        visualizer = SummaryWriter(exp_name + '/log')
        logger.info('Created a visualizer.')
    else:
        visualizer = None

    for epoch in range(config.training.epoches):
        train(epoch, model, crit, optimizer, train_loader, train_sampler,
              logger, visualizer, config)
        if hvd.rank() == 0:
            save_model(epoch, model, optimizer, config, logger)

    if hvd.rank() == 0:
        logger.info('Training process finished')
def get_embedding():
    import transformer.Constants as Constants
    from transformer.Models import Transformer
    from transformer.Optim import ScheduledOptim
    from transformer.Modules import LabelSmoothing
    from transformer.Beam import Beam
    from transformer.Translator import translate
    from preprocess import read_instances_from_file, convert_instance_to_idx_seq
    import evals
    from evals import Logger
    from DataLoader import DataLoader
    import numpy

    data = torch.load(opt.data)
    opt.max_token_seq_len_e = data['settings'].max_seq_len
    opt.max_token_seq_len_d = 30
    opt.proj_share_weight = True
    opt.d_word_vec = opt.d_model

    # The vocabulary sizes below come from this loader.
    training_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['train']['src'],
        tgt_insts=data['train']['tgt'],
        batch_size=opt.batch_size,
        cuda=opt.cuda)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size
    opt.tgt_vocab_size = opt.tgt_vocab_size - 4

    opt.d_v = int(opt.d_model / opt.n_head)
    opt.d_k = int(opt.d_model / opt.n_head)

    model = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len_e,
        opt.max_token_seq_len_d,
        proj_share_weight=opt.proj_share_weight,
        embs_share_weight=False,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner_hid=opt.d_inner_hid,
        n_layers_enc=opt.n_layers_enc,
        n_layers_dec=opt.n_layers_dec,
        n_head=opt.n_head,
        dropout=opt.dropout,
        dec_dropout=opt.dec_dropout,
        encoder=opt.encoder,
        decoder=opt.decoder,
        enc_transform=opt.enc_transform,
        onehot=opt.onehot,
        no_enc_pos_embedding=opt.no_enc_pos_embedding,
        dec_reverse=opt.dec_reverse,
        no_residual=opt.no_residual)

    state_dict = torch.load(opt.results_dir + '/' + opt.mname + '/model.chkpt')
    model.load_state_dict(state_dict['model'])
    model = model.cuda()
    model.eval()

    W = model.decoder.tgt_word_emb.weight.data.cpu().numpy()
    numpy.save('Embedding', W)
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default='default')
    parser.add_argument('-tensorboard', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    global global_counter
    global_counter = 0

    writer = None
    if opt.tensorboard:
        writer = SummaryWriter(os.path.join('./logs', opt.tensorboard))

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)

    global idx2char
    idx2char = {v: k for k, v in data['dict']['src'].items()}

    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data, unique_char_len = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    try:
        transformer.load_state_dict(torch.load('./checkpoints/model.pt'))
        print("Model loaded successfully.......")
    except:  # no (or incompatible) checkpoint: start training from scratch
        pass

    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt,
          unique_char_len, writer)
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    # ---------------------- all the default hyperparameters live here
    parser.add_argument('-data', required=False)
    parser.add_argument('-epoch', type=int, default=1)  # set to 1 for now, just to get a full run through
    parser.add_argument('-batch_size', type=int, default=32)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='/transformer_my')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')
    # store_true flags default to False and become True when passed on the command line
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # Hard-code the run parameters below for convenience.
    opt.saved_weight = 'c:/trained.chkpt'  # path to the previously trained model
    opt.data = 'yunixng_bash/data/multi30k.atok.low.pt'  # dataset path
    opt.save_model = 'trained_finetune'  # name under which the fine-tuned model is saved
    opt.save_mode = 'best'  # keep only the best checkpoint
    opt.proj_share_weight = True  # share decoder embedding and projection weights
    opt.label_smoothing = True  # enable label smoothing
    opt.cuda = False
    opt.batch_size = 200
    opt.epoch = 30

    #========= Loading Dataset =========#
    # The data here is already numericalized; the encoding tables are stored in
    # the data dict, and src/tgt use different vocabularies, so embs_share_weight
    # must stay False. The whole set (multi30k.atok.low.pt) is only ~3 MB: about
    # 30k sentence pairs with a ~3k vocabulary of common English words, encoded
    # at the word level (no word pieces), so arbitrary input sentences often fall
    # outside the vocabulary. Its small size makes it fast and handy for testing.
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len  # length preprocessing is just padding

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print('All configured parameters:')
    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(  # build the network
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    # The checkpoint stores the weights under an extra 'model' key.
    tmp = torch.load(opt.saved_weight, map_location=torch.device('cpu'))['model']
    transformer.load_state_dict(tmp)

    train(transformer, training_data, validation_data, optimizer, device, opt)
def main():
    print(args)

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')

    src_vocab, tgt_vocab = get_vocab(TRAIN_X, TRAIN_Y)

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file,
                                  vocab_size=80000)

    max_src_len = 101
    max_tgt_len = 47
    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid

    vocab = small_vocab
    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)

    model = Transformer(len(vocab), len(vocab), max_src_len,
                        d_word_vec=300, d_model=300, d_inner=1200,
                        n_layers=1, n_head=6, d_k=50, d_v=50, dropout=0.1,
                        tgt_emb_prj_weight_sharing=True,
                        emb_src_tgt_weight_sharing=True).cuda()
    # print(model)

    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.3)
    scheduler.step()  # last_epoch=-1, which will not update lr at the first time

    train(train_x, train_y, valid_x, valid_y, model, optimizer, scheduler,
          args.n_epochs, saved_state['epoch'])
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('--train_src', required=True)
    parser.add_argument('--valid_src', required=True)
    parser.add_argument('--max_word_seq_len', type=int, default=100)
    parser.add_argument('--min_word_count', type=int, default=5)
    parser.add_argument('--keep_case', action='store_true')

    parser.add_argument('--epoch', type=int, default=500)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--num_worker', type=int, default=8)

    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('--d_model', type=int, default=512)
    parser.add_argument('--d_inner_hid', type=int, default=2048)
    parser.add_argument('--d_k', type=int, default=64)
    parser.add_argument('--d_v', type=int, default=64)

    parser.add_argument('--n_head', type=int, default=8)
    parser.add_argument('--n_layers', type=int, default=6)
    parser.add_argument('--n_warmup_steps', type=int, default=4000)

    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--embs_share_weight', action='store_true')
    parser.add_argument('--proj_share_weight', action='store_true')

    parser.add_argument('--model', default=None, help='Path to model file')
    parser.add_argument('--log', default=None)
    parser.add_argument('--save_model', default=None)
    parser.add_argument('--save_data', default='./data/word2idx.pth')
    parser.add_argument('--save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('--no_cuda', action='store_true')
    parser.add_argument('--label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    opt.max_token_seq_len = opt.max_word_seq_len + 2

    #========= Loading Dataset =========#
    training_data = torch.utils.data.DataLoader(
        dataset.TranslationDataset(
            dir_name=opt.train_src,
            max_word_seq_len=opt.max_word_seq_len,
            min_word_count=opt.min_word_count,
            keep_case=opt.keep_case,
            src_word2idx=None,
            tgt_word2idx=None),
        num_workers=opt.num_worker,
        batch_size=opt.batch_size,
        collate_fn=paired_collate_fn,
        shuffle=True)

    validation_data = torch.utils.data.DataLoader(
        dataset.TranslationDataset(
            dir_name=opt.valid_src,
            max_word_seq_len=opt.max_word_seq_len,
            min_word_count=opt.min_word_count,
            keep_case=opt.keep_case,
            src_word2idx=training_data.dataset.src_word2idx,
            tgt_word2idx=training_data.dataset.tgt_word2idx),
        num_workers=opt.num_worker,
        batch_size=opt.batch_size,
        collate_fn=paired_collate_fn,
        shuffle=True)

    data = {
        'dict': {
            'src': training_data.dataset.src_word2idx,
            'tgt': training_data.dataset.tgt_word2idx}}
    print('[Info] Dumping the processed data to pickle file', opt.save_data)
    torch.save(data, opt.save_data)
    print('[Info] Finish.')
    del data

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')

    if opt.model is not None:
        print('pretrain model!')
        checkpoint = torch.load(opt.model)
        model_opt = checkpoint['settings']
        transformer = Transformer(
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout)
        transformer.load_state_dict(checkpoint['model'])
        transformer = transformer.to(device)
    else:
        transformer = Transformer(
            opt.src_vocab_size,
            opt.tgt_vocab_size,
            opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=opt.proj_share_weight,
            emb_src_tgt_weight_sharing=opt.embs_share_weight,
            d_k=opt.d_k,
            d_v=opt.d_v,
            d_model=opt.d_model,
            d_word_vec=opt.d_word_vec,
            d_inner=opt.d_inner_hid,
            n_layers=opt.n_layers,
            n_head=opt.n_head,
            dropout=opt.dropout).to(device)

    # Build the optimizer only after the final model object exists, so it
    # tracks the parameters that will actually be trained.
    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
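# Why the optimizer above is constructed only after the final model object
# exists: torch optimizers keep references to the exact parameter tensors they
# were given. A minimal sketch of the pitfall, using a stand-in nn.Linear in
# place of the Transformer:
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(4, 4)                     # object A
optimizer = optim.Adam(model.parameters())  # holds references to A's weights
model = nn.Linear(4, 4)                     # object B rebinds the name
# optimizer.step() would still update A's parameters; B would never train.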
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    # parser.add_argument('-data', required=True)
    parser.add_argument('-train_atok', required=True)
    parser.add_argument('-valid_atok', required=True)

    parser.add_argument('-epoch', type=int, default=200)
    parser.add_argument('-batch_size', type=int, default=8)

    parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    train_atok = torch.load(opt.train_atok)
    valid_atok = torch.load(opt.valid_atok)
    train_vocab = vocab.Vocab(train_atok['settings'].vocab)

    training_data = dataset.translation_dataloader(train_atok, opt.batch_size, shuffle=True)
    validation_data = dataset.translation_dataloader(valid_atok, opt.batch_size, shuffle=False)

    # data = torch.load(opt.data)
    opt.max_token_seq_len = train_atok['settings'].max_seq_len

    # training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = train_vocab.size()
    opt.tgt_vocab_size = train_vocab.size()

    #========= Preparing Model =========#
    # if opt.embs_share_weight:
    #     assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
    #         'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    if os.path.exists("trained.chkpt"):
        x = torch.load("trained.chkpt")
        # print(type(x["model"]))
        transformer.load_state_dict(x["model"])

    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-config', type=str, default='config/rnnt.yaml')
    parser.add_argument('-load_model', type=str, default=None)
    parser.add_argument('-num_workers', type=int, default=0,
                        help='how many subprocesses to use for data loading. '
                             '0 means that the data will be loaded in the main process')
    parser.add_argument('-log', type=str, default='train.log')
    opt = parser.parse_args()

    configfile = open(opt.config)
    config = AttrDict(yaml.load(configfile, Loader=yaml.FullLoader))

    exp_name = config.data.name
    if not os.path.isdir(exp_name):
        os.mkdir(exp_name)
    logger = init_logger(exp_name + '/' + opt.log)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(config.training.seed)
        torch.backends.cudnn.deterministic = True
    else:
        raise NotImplementedError

    #========= Build DataLoader =========#
    train_dataset = AudioDateset(config.data, 'train')
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.data.train.batch_size,
        shuffle=True, num_workers=opt.num_workers)

    assert train_dataset.vocab_size == config.model.vocab_size

    #========= Build A Model Or Load Pre-trained Model =========#
    model = Transformer(config.model)

    n_params, enc_params, dec_params = count_parameters(model)
    logger.info('# the number of parameters in the whole model: %d' % n_params)
    logger.info('# the number of parameters in encoder: %d' % enc_params)
    logger.info('# the number of parameters in decoder: %d' % dec_params)

    if torch.cuda.is_available():
        model.cuda()

    global global_step
    global_step = 0

    # define an optimizer
    optimizer = ScheduledOptim(model, config.model.d_model, config.optimizer)

    # load pre-trained model
    if opt.load_model is not None:
        checkpoint = torch.load(opt.load_model)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info('Loaded pre-trained model and previous optimizer!')
    else:
        init_parameters(model)
        logger.info('Initialized all parameters!')

    # define loss function
    crit = nn.CrossEntropyLoss(ignore_index=0)

    # create a visualizer
    if config.training.visualization:
        visualizer = SummaryWriter(exp_name + '/log')
        logger.info('Created a visualizer.')
    else:
        visualizer = None

    for epoch in range(config.training.epoches):
        train(epoch, model, crit, optimizer, train_loader, logger, visualizer, config)
        save_model(epoch, model, optimizer, config, logger)

    logger.info('Training process finished')
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset.")
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--gpt2_model_name", type=str, default="gpt2",
                        help="name of the model, e.g. openai-gpt")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=30,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=4,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--keyword_module", type=str, default="new",
                        help="add, attention, ")
    parser.add_argument("--temperature", type=float, default=0.8,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=30,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    args = parser.parse_args()
    args.d_word_vec = args.d_model

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    # can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.gpt2_model_name else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.gpt2_model_name)
    num_tokens = len(tokenizer.encoder)
    # doesn't add the special tokens if they are already there
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)

    model = Transformer(
        num_tokens + num_added_tokens,
        num_tokens + num_added_tokens,
        src_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]),
        trg_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]),
        trg_emb_prj_weight_sharing=args.proj_share_weight,
        emb_src_trg_weight_sharing=args.embs_share_weight,
        d_k=args.d_k,
        d_v=args.d_v,
        d_model=args.d_model,
        d_word_vec=args.d_word_vec,
        d_inner=args.d_inner_hid,
        n_layers=args.n_layers,
        n_head=args.n_head,
        dropout=args.dropout,
        n_position=512,
        keyword_module=args.keyword_module).to(args.device)
    model.load_state_dict(torch.load(args.model_checkpoint), strict=False)
    model.eval()

    sourceList, targetList, scoreList = get_test_datasetEN(
        tokenizer, tokenizer, args.dataset_path)

    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    f1 = open((args.model_checkpoint + current_time + "_output.txt"), 'w')
    for line in tqdm(zip(sourceList, targetList, scoreList), total=len(sourceList)):
        out_ids = sample_sequence(line[0], line[2], tokenizer, model, tokenizer, args)
        out_texts = tokenizer.decode(out_ids)
        for text in out_texts:
            f1.write(text.replace('▁', ' ').replace('</s>', ' '))
        """
        for id in out_ids:
            f1.write(str(id))
            f1.write(' ')
        """
        f1.write("\n")
    f1.close()
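# The top_k / top_p flags above feed a logits filter inside sample_sequence.
# A self-contained sketch of the standard filtering step; the in-repo
# implementation is not shown here and may differ in detail:
import torch
import torch.nn.functional as F

def top_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('inf')):
    """Keep the top_k largest logits and/or the smallest nucleus whose
    cumulative probability reaches top_p; mask everything else out."""
    if top_k > 0:
        kth_best = torch.topk(logits, top_k).values[..., -1, None]
        logits = logits.masked_fill(logits < kth_best, filter_value)
    if top_p > 0.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        remove = cum_probs > top_p
        remove[..., 1:] = remove[..., :-1].clone()  # keep the first token crossing p
        remove[..., 0] = False
        logits = logits.masked_fill(
            remove.scatter(-1, sorted_idx, remove), filter_value)
    return logits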
if __name__ == "__main__": torch.cuda.set_device(hp.gpu) testLoader = dataset.getDataLoader(is_train=False, batch_size=5, shuffle=False) net1 = resnet.resnet34() net2 = Transformer(len_encoder=hp.enc_input_len, n_tgt_vocab=hp.num_classes, len_max_seq=hp.max_seq_len, n_layers=hp.n_layers) net2.word_prob_prj = nn.LogSoftmax(dim=1) net1.cuda().eval() #net2.cuda().eval() path_to_restore = os.path.join(hp.checkpoint_path, hp.model_path_pre+"_"+str(hp.model_path_idx) + ".pth") if os.path.exists(path_to_restore): print("restore from:", path_to_restore) checkpoint = torch.load(path_to_restore) net1.load_state_dict(checkpoint["state_dict_net1"]) net2.load_state_dict(checkpoint["state_dict_net2"]) print("restore successfully!") else: print("fail to restore, path don't exist") translator = Translator(net2, beam_size=hp.beam_size, max_seq_len=hp.max_seq_len, n_best=hp.n_best) print("************************begin infer*********************") for imgs_name, imgs, length_imgs, labels, legnth_labels in testLoader: imgs = Variable(imgs).cuda() length_imgs = Variable(length_imgs).cuda() enc_img = net1(imgs.float()) batch_size, channel, height, width = enc_img.shape enc_img = enc_img.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, channel) batch_pred, batch_prob = translator.translate_batch(enc_img, length_imgs)
def main():
    '''
    Usage:
    python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 256 -warmup 128000
    '''

    parser = argparse.ArgumentParser()

    parser.add_argument('-data_pkl', default=None)     # all-in-1 data pickle or bpe field
    parser.add_argument('-train_path', default=None)   # bpe encoded data
    parser.add_argument('-val_path', default=None)     # bpe encoded data

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)
    parser.add_argument('-lr', '--learning_rate', type=float, default=2.0)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-restore', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    if not opt.log and not opt.save_model:
        raise ValueError('No experiment result will be saved: set -log and/or -save_model.')

    opt.plot_intval = 20

    if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000:
        print('[Warning] The warmup steps may be not enough.\n'
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n'
              'Using smaller batch w/o longer warmup may cause '
              'the warmup stage ends with only little data trained.')

    device = torch.device('cuda' if opt.cuda else 'cpu')

    # ========= Loading Dataset =========#
    if all((opt.train_path, opt.val_path)):
        training_data, validation_data = prepare_dataloaders_from_bpe_files(opt, device)
    elif opt.data_pkl:
        # training_data, validation_data = prepare_dataloaders(opt, device)
        training_data, validation_data = prepare_mydataloaders(opt, device)
    else:
        raise ValueError('Either -train_path/-val_path or -data_pkl must be given.')

    print(opt)

    transformer = Transformer(
        opt.src_vocab_size,
        opt.trg_vocab_size,
        src_pad_idx=opt.src_pad_idx,
        trg_pad_idx=opt.trg_pad_idx,
        trg_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_trg_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    if opt.restore:
        print("loading checkpoint from {}...".format(opt.restore.split("/")[-1]))
        checkpoint = torch.load(opt.restore)
        transformer.load_state_dict(checkpoint['model'])

    optimizer = ScheduledOptim(
        optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
        opt.learning_rate, opt.d_model, opt.n_warmup_steps)
    # optimizer = optim.Adagrad(transformer.parameters(), lr=0.15, initial_accumulator_value=0.1)

    train(transformer, training_data, validation_data, optimizer, device, opt)
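# The -lr / -warmup / -d_model triple above drives the warmup schedule that
# ScheduledOptim applies throughout these scripts. A sketch of the "Noam"
# learning-rate formula from "Attention Is All You Need", with lr_mul standing
# in for this script's -lr flag (assumed to match the upstream ScheduledOptim;
# step must be >= 1):
def noam_lr(step, d_model=512, n_warmup_steps=4000, lr_mul=2.0):
    """lr = lr_mul * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)"""
    return lr_mul * d_model ** -0.5 * min(step ** -0.5, step * n_warmup_steps ** -1.5)

# The rate rises linearly for n_warmup_steps steps, peaks, then decays as the
# inverse square root of the step count.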