def train(model: Transformer, optimizer, criterion, clip, device):
    model.train()
    epoches_loss = 0
    for index, batch in tqdm(enumerate(dataset_pro.train_iter)):
        shang_lian, shang_lian_length = batch.shang_lian
        shang_lian = shang_lian.permute(1, 0).to(device)
        # shang_lian_length = shang_lian_length.permute(1, 0).to(device)
        # shang_lian_length = shang_lian_length.numpy()
        # shang_lian_pos = torch.LongTensor(get_pos_ids(shang_lian_length, shang_lian.shape[1])).to(device)
        xia_lian, xia_lian_length = batch.xia_lian
        xia_lian = xia_lian.permute(1, 0).to(device)
        # xia_lian_length = xia_lian_length.numpy()
        # xia_lian_pos = torch.LongTensor(get_pos_ids(xia_lian_length, xia_lian.shape[1])).to(device)
        optimizer.zero_grad()
        outputs = model(shang_lian, xia_lian[:, :-1])
        outputs = outputs.contiguous().view(-1, outputs.shape[-1])
        xia_lian = xia_lian[:, 1:].contiguous().view(-1)
        loss = criterion(outputs, xia_lian)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # print(loss.item())
        optimizer.step()
        epoches_loss += loss.item()
    result_loss = epoches_loss / len(dataset_pro.train_iter)
    return result_loss
def main():
    ''' Main function '''
    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    trn_data, val_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = trn_data.dataset.src_vocab_size
    opt.tgt_vocab_size = trn_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert trn_data.dataset.src_word2idx == trn_data.dataset.tgt_word2idx, \
            ('The src/tgt word2idx table are different but asked to share '
             'word embedding.')

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k, d_v=opt.d_v,
                              d_model=opt.d_model, d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid, n_layers=opt.n_layers,
                              n_head=opt.n_head, dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, trn_data, val_data, optimizer, device, opt)
        d_model=args.d_model, d_word_vec=args.d_word_vec,
        d_inner=args.d_inner_hid, n_layers=args.n_layers,
        n_ensemble=args.n_ensemble, n_head=args.n_head,
        dropout=args.dropout,
        scale_emb_or_prj=args.scale_emb_or_prj).to(args.device)
    print('model initiated.')

    # optimizer = ScheduledOptim(
    #     optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
    #     args.lr_mul, args.d_model, args.n_warmup_steps)
    optimizer = MyScheduledOptim(
        optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        optim.Adam(list(model.parameters())[model.encoder.num_shared_parameters:],
                   betas=(0.9, 0.98), eps=1e-09),
        args.milestones, args.lr_list, args.sep_optimizer_start_step)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)


def cal_performance(pred, dec_output, gold, trg_pad_idx, args, model,
                    termination_bit_weight=None, smoothing=False):
    ''' Apply label smoothing if needed '''
    loss = cal_loss(pred, dec_output, gold, trg_pad_idx, args, model,
                    termination_bit_weight, smoothing)

    if args.input_type == 'node_based':
        pred = pred.max(1)[1]
        gold = gold.contiguous().view(-1)
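# The cal_loss called above is not included in this snippet. For reference only, a
# label-smoothed cross entropy over padded targets is commonly computed as in the
# sketch below; the helper name, eps value, and masking convention are assumptions,
# not this file's actual cal_loss.
import torch
import torch.nn.functional as F

def smoothed_nll_loss_sketch(pred, gold, trg_pad_idx, eps=0.1):
    ''' Illustrative label smoothing: soften the one-hot target, then sum the
        negative log-likelihood over non-pad positions only. '''
    n_class = pred.size(1)
    one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1)
    one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
    log_prb = F.log_softmax(pred, dim=1)
    non_pad_mask = gold.ne(trg_pad_idx)
    loss = -(one_hot * log_prb).sum(dim=1)   # per-token smoothed NLL
    return loss.masked_select(non_pad_mask).sum()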
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)
    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k, d_v=opt.d_v,
                              d_model=opt.d_model, d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid, n_layers=opt.n_layers,
                              n_head=opt.n_head, dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    # ---------------------- All default hyperparameters are defined here ----------------------
    parser.add_argument('-data', required=False)
    parser.add_argument('-epoch', type=int, default=1)  # set to 1 for now, just to get the pipeline running
    parser.add_argument('-batch_size', type=int, default=32)
    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='/transformer_my')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')  # store_true: the flag becomes True when passed

    opt = parser.parse_args()
    opt.d_word_vec = opt.d_model

    # Hard-code the run settings below for convenience.
    opt.saved_weight = '/trained.chkpt'  # path of the previously trained model
    opt.data = 'yunixng_bash/data/multi30k.atok.low.pt'  # path of the dataset
    opt.save_model = 'trained'  # name used when saving the model
    opt.save_mode = 'best'
    opt.proj_share_weight = True
    opt.label_smoothing = True
    opt.cuda = False
    opt.batch_size = 200
    opt.epoch = 30
    print(opt, 44444444444444444444444444444444444444444444444444444444444444)

    # ========= Loading Dataset =========#
    # The data is already numericalized; the encoding tables live inside `data`, which is a dict.
    # The src and tgt vocabularies differ, so embs_share_weight above must stay False.
    # The whole dataset (multi30k.atok.low.pt in the root directory) is only ~3 MB:
    # about 30k sentence pairs with a ~3k vocabulary of common English words,
    # word-level encoding only (no word pieces), so arbitrary input sentences often
    # contain out-of-vocabulary tokens. Still, it is small and fast, which is handy for testing.
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len  # length preprocessing: just padding

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print('All configured parameters:')
    print(opt)
    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(  # build the model
        opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
def main():
    '''
    Usage:
    python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 128 -epoch 100 -optim nero -lr 0.003
    '''

    parser = argparse.ArgumentParser()

    parser.add_argument('-data_pkl', default=None)     # all-in-1 data pickle or bpe field
    parser.add_argument('-train_path', default=None)   # bpe encoded data
    parser.add_argument('-val_path', default=None)     # bpe encoded data
    parser.add_argument('-seed', type=int, default=0)
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)
    parser.add_argument('-optim', type=str, choices=['adam', 'sgd', 'nero', 'lamb'])
    parser.add_argument('-lr', type=float)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # set random seed
    torch.manual_seed(opt.seed)
    np.random.seed(opt.seed)

    # tensorboard writer
    log_dir = 'runs/' + opt.optim + '_' + str(opt.lr) + '_seed' + str(opt.seed)
    writer = SummaryWriter(log_dir=log_dir)
    print("Saving tensorboard to " + log_dir)

    if not opt.log and not opt.save_model:
        print('No experiment result will be saved.')
        raise

    device = torch.device('cuda' if opt.cuda else 'cpu')

    #========= Loading Dataset =========#
    if all((opt.train_path, opt.val_path)):
        training_data, validation_data = prepare_dataloaders_from_bpe_files(opt, device)
    elif opt.data_pkl:
        training_data, validation_data = prepare_dataloaders(opt, device)
    else:
        raise

    print(opt)

    transformer = Transformer(
        opt.src_vocab_size, opt.trg_vocab_size,
        src_pad_idx=opt.src_pad_idx, trg_pad_idx=opt.trg_pad_idx,
        trg_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_trg_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    if opt.optim == 'adam':
        optimizer = optim.Adam(transformer.parameters(), lr=opt.lr, betas=(0.0, 0.999))
    elif opt.optim == 'nero':
        optimizer = Nero(transformer.parameters(), lr=opt.lr)
    elif opt.optim == 'lamb':
        optimizer = Lamb(transformer.parameters(), lr=opt.lr, betas=(0.0, 0.999))
    elif opt.optim == 'sgd':
        optimizer = optim.SGD(transformer.parameters(), lr=opt.lr, momentum=0)
    print("Using optim", type(optimizer).__name__)

    lr_lambda = lambda epoch: 2 * min(epoch / opt.epoch, (opt.epoch - epoch) / opt.epoch)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    train(transformer, training_data, validation_data, optimizer, scheduler, device, opt, writer)
    writer.close()
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps
        ])

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''
        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()

        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr


# create optimizer
optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                             betas=(0.9, 0.98), eps=1e-09, lr=1e-4, amsgrad=False)

# create a scheduled optimizer object
optimizer = ScheduledOptim(optimizer, config["model_dim"], config["warmup_steps"])


def save_checkpoint(filename, model, optimizer):
    '''
    saves model into a state dict, along with its training statistics,
    and parameters
    :param model:
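# The fragment above is the tail of ScheduledOptim._get_lr_scale. For context, the
# complete "Noam" schedule this family of wrappers implements is sketched below as a
# standalone function; the names and the init_lr = d_model ** -0.5 convention follow
# the reference implementation, so treat this as an illustration rather than this
# file's exact code.
import numpy as np

def noam_lr(step, d_model, n_warmup_steps):
    ''' lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5):
        linear warmup for n_warmup_steps, then inverse-square-root decay. '''
    init_lr = np.power(d_model, -0.5)
    scale = np.min([np.power(step, -0.5),
                    np.power(n_warmup_steps, -1.5) * step])
    return init_lr * scale

# e.g. noam_lr(1, 512, 4000) is tiny, the value peaks near step == n_warmup_steps,
# and afterwards decays proportionally to step ** -0.5.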
def main():
    """ Main function """
    parser = argparse.ArgumentParser()

    parser.add_argument("-data", required=True)
    parser.add_argument("-epoch", type=int, default=10)
    parser.add_argument("-batch_size", type=int, default=64)
    # parser.add_argument("-d_word_vec", type=int, default=512)
    parser.add_argument("-d_model", type=int, default=512)
    parser.add_argument("-d_inner_hid", type=int, default=2048)
    parser.add_argument("-d_k", type=int, default=64)
    parser.add_argument("-d_v", type=int, default=64)
    parser.add_argument("-n_head", type=int, default=8)
    parser.add_argument("-n_layers", type=int, default=6)
    parser.add_argument("-n_warmup_steps", type=int, default=4000)
    parser.add_argument("-dropout", type=float, default=0.1)
    parser.add_argument("-embs_share_weight", action="store_true")
    parser.add_argument("-proj_share_weight", action="store_true")
    parser.add_argument("-log", default=None)
    parser.add_argument("-save_model", default=None)
    parser.add_argument("-save_mode", type=str, choices=["all", "best"], default="best")
    parser.add_argument("-no_cuda", action="store_true")
    parser.add_argument("-label_smoothing", action="store_true")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data["settings"].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            "The src/tgt word2idx table are different but asked to share word embedding."

    print(opt)

    device = torch.device("cuda" if opt.cuda else "cpu")
    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
device = torch.device('cuda' if not args.no_cuda else 'cpu')

transformer_model = Transformer(args.sl_vocab_size,
                                args.xl_vocab_size,
                                hid_dim=args.embedding_dim,
                                pf_dim=args.fp_inner_dim,
                                n_layers=args.n_layers,
                                n_heads=args.n_head,
                                dropout=args.dropout,
                                device=device,
                                SOS_IDX=TGT_SOS_IDX,
                                PAD_IDX=SRC_PAD_IDX,
                                EOS_IDX=TGT_EOS_IDX).to(device)

# Optimizer
optimizer = optim.Adam(transformer_model.parameters(), lr=args.lr, weight_decay=args.l2_reg)
# Loss function (padding positions are ignored)
criterion = nn.CrossEntropyLoss(ignore_index=SRC_PAD_IDX)

# N_EPOCHS = 10
# CLIP = 1

best_valid_loss = float('inf')

for epoch in range(args.epoches):
    start_time = time.time()
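# The epoch loop above is cut off right after start_time. A minimal sketch of how
# such a loop is commonly completed is given below; `train` and `evaluate` are
# hypothetical helpers and 'best_model.pt' is an illustrative path, none of which
# are defined in this snippet.
best_valid_loss = float('inf')
for epoch in range(args.epoches):
    start_time = time.time()
    train_loss = train(transformer_model, optimizer, criterion, clip=1.0, device=device)
    valid_loss = evaluate(transformer_model, criterion, device=device)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(transformer_model.state_dict(), 'best_model.pt')
    print('Epoch {:02d} | train loss {:.3f} | valid loss {:.3f} | {:.1f}s'.format(
        epoch, train_loss, valid_loss, time.time() - start_time))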
def main():
    h = logging.StreamHandler()
    formatter = logging.Formatter("[%(asctime)s][%(levelname)s]%(message)s",
                                  datefmt="%Y-%m-%d %H:%M:%S")
    h.setFormatter(formatter)
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.addHandler(h)

    parser = argparse.ArgumentParser()
    parser.add_argument('-data_path', default="../Data/dataset")
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=15)
    parser.add_argument('-d_inner_hid', type=int, default=256)
    parser.add_argument('-d_k', type=int, default=15)
    parser.add_argument('-d_v', type=int, default=15)
    parser.add_argument('-n_head', type=int, default=1)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=100000)
    parser.add_argument('-lr_mul', type=float, default=2.0)
    parser.add_argument('-seed', type=int, default=None)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-scale_emb_or_prj', type=str, default='prj')
    parser.add_argument('-output_dir', type=str, default='./checkpoint/')
    parser.add_argument('-summary_dir', type=str, default='./summary')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    logging.info(opt)

    writer = SummaryWriter(log_dir=str(opt.summary_dir))

    if opt.seed is not None:
        torch.manual_seed(opt.seed)
        torch.backends.cudnn.benchmark = False
        np.random.seed(opt.seed)
        random.seed(opt.seed)

    if not os.path.exists(opt.output_dir):
        os.makedirs(opt.output_dir)

    device = torch.device('cuda' if opt.cuda else 'cpu')

    #========= Loading Dataset =========#
    pkl_files = os.listdir(opt.data_path)
    pwd = os.getcwd()
    pkl_files = [os.path.join(pwd, opt.data_path, file)
                 for file in pkl_files if 'train' in file]
    data_list = [data for data in pkl_files if '.pkl' in data]
    random.shuffle(data_list)
    logging.info(data_list)

    transformer = Transformer(
        trg_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_trg_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head,
        dropout=opt.dropout, scale_emb_or_prj=opt.scale_emb_or_prj).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
        opt.lr_mul, opt.d_model, opt.n_warmup_steps)

    test(transformer, data_list, optimizer, device, opt, writer)
    n_layers=hp.n_layers)
net1 = net1.cuda()
net2 = net2.cuda()

trainLoader = dataset.getDataLoader(is_train=True, batch_size=hp.BATCH_SIZE, shuffle=True)
iter_one_epoch = len(trainLoader)
print("iteration_every_epoch: ", iter_one_epoch)
#testloader = dataset.getDataLoader(is_train=False, batch_size=BATCH_SIZE, shuffle=False)

lossFunction = nn.CrossEntropyLoss(ignore_index=Constants.PAD)
optimizer_ = optim.Adam(
    [{'params': net1.parameters()},
     {'params': filter(lambda x: x.requires_grad, net2.parameters())}],
    betas=[0.9, 0.98], lr=hp.LEARNING_RATE)
optimizer = optimizer_
optimizer_scheduler = ExponentialLR(optimizer_, 0.98)
#optimizer = ScheduledOptim(optimizer_, learning_rate=hp.LEARNING_RATE, n_warmup_steps=hp.n_warmup_steps)

if not os.path.exists(hp.checkpoint_path):
    os.makedirs(hp.checkpoint_path)

num_step = 1
model_restore_path = os.path.join(
    hp.checkpoint_path, hp.model_path_pre + "_" + str(hp.model_path_idx) + ".pth")
if hp.model_restore and os.path.exists(model_restore_path):
    print("restore model from {}".format(model_restore_path))
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)
    parser.add_argument('-emb', type=str, default=None)
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)
    # used when we have multiple inputs (token, pos, pred in openie)
    parser.add_argument('-d_word_vec', type=str, default=None)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')
    parser.add_argument('-task', type=str, choices=['mt', 'openie'], default='mt')
    parser.add_argument('-emb_op', type=str, choices=['sum', 'concat'], default='no')
    parser.add_argument('-rel_pos_emb_op', type=str, choices=['no', 'lookup', 'lstm'], default='no')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # seed
    SEED = 2019
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    if opt.task == 'mt':
        training_data, validation_data = prepare_dataloaders(data, opt)
        opt.src_vocab_size = training_data.dataset.src_vocab_size
        opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size
        if opt.embs_share_weight:
            assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
                'The src/tgt word2idx table are different but asked to share word embedding.'
    elif opt.task == 'openie':
        training_data, validation_data = prepare_dataloaders_openie(data, opt)
        opt.vocab_size = training_data.dataset.vocab_size
        opt.n_class = data['settings'].n_class
        opt.n_pos = data['settings'].n_pos
        opt.n_rel_pos = data['settings'].n_path
        opt.n_pred_ind = data['settings'].n_pred_ind
    else:
        raise ValueError

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')

    if opt.task == 'mt':
        opt.d_word_vec = opt.d_model
        transformer = Transformer(
            opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=opt.proj_share_weight,
            emb_src_tgt_weight_sharing=opt.embs_share_weight,
            d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec,
            d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head,
            dropout=opt.dropout).to(device)
    elif opt.task == 'openie':
        word_emb = None
        if opt.emb:
            word_emb = WordVector(opt.emb, is_binary=False, first_line=True,
                                  initializer='uniform').get_vectors()
            print('[Info] Use pretrained embedding with dim {}'.format(word_emb.shape[1]))

        # get dimensions
        # word, pos, pred_ind, pred_word, pred_pos
        if opt.d_word_vec:
            opt.d_vec_list = list(map(int, opt.d_word_vec.split(':')))
        else:
            opt.d_word_vec = opt.d_model
            emb_dim = word_emb.shape[1] if word_emb is not None else opt.d_word_vec // 5
            pred_emb_dim = word_emb.shape[1] if word_emb is not None else opt.d_word_vec // 5
            rest_dim = opt.d_word_vec - emb_dim - pred_emb_dim
            pos_dim = rest_dim // 3
            pred_pos_dim = rest_dim // 3
            pred_idx_dim = rest_dim // 3
            opt.d_vec_list = [emb_dim, pos_dim, pred_idx_dim, pred_emb_dim, pred_pos_dim]

        print('[Info] input: {}'.format(['word', 'pos', 'pred_ind', 'pred_word', 'pred_pos']))
        print('[Info] input embedding dims: {}'.format(opt.d_vec_list))
        print('[Info] Transformer input dims: {}'.format(opt.d_model))

        opt.n_cate_list = [opt.vocab_size, opt.n_pos, opt.n_pred_ind, opt.vocab_size, opt.n_pos]
        opt.emb_learnable_list = [False, True, True, False, True]
        opt.pre_emb_list = [word_emb, None, None, word_emb, None]

        transformer = TransformerTagger(
            opt.n_cate_list, opt.n_class, opt.max_token_seq_len,
            d_vec_list=opt.d_vec_list, pre_emb_list=opt.pre_emb_list,
            emb_op=opt.emb_op, emb_learnable_list=opt.emb_learnable_list,
            rel_pos_emb_op=opt.rel_pos_emb_op, n_rel_pos=opt.n_rel_pos,
            d_model=opt.d_model, d_inner=opt.d_inner_hid,
            n_layers=opt.n_layers, n_head=opt.n_head,
            d_k=opt.d_k, d_v=opt.d_v, dropout=opt.dropout).to(device)
    else:
        raise ValueError

    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    print('[Info] #parameters: {}'.format(count_parameters(transformer)))

    train(transformer, training_data, validation_data, optimizer, device, opt)
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)
    parser.add_argument('-epoch', type=int, default=10000)
    parser.add_argument('-batch_size', type=int, default=64)
    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=8)
    parser.add_argument('-d_inner_hid', type=int, default=16)
    parser.add_argument('-d_k', type=int, default=8)
    parser.add_argument('-d_v', type=int, default=8)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-max_size', type=int, default=0)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-lr', type=float, default=1E-3, help="Learning rate.")
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-sparsity', type=float, default=5.0)
    parser.add_argument('-padding_value_threshold', type=float, default=0.0)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-funnel', action='store_true', help="Use FunnelTransformer architecture.")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    # ========= Preparing DataLoader =========#
    max_size_src = opt.max_size if opt.max_size != 0 else len(data['train']['src'])
    max_size_tgt = opt.max_size if opt.max_size != 0 else len(data['train']['tgt'])
    print("training: max_size_src={} max_size_tgt={}".format(max_size_src, max_size_tgt))
    training_data = DataLoader(data['dict']['src'],
                               data['dict']['tgt'],
                               src_insts=data['train']['src'][0:max_size_src],
                               tgt_insts=data['train']['tgt'][0:max_size_tgt],
                               batch_size=opt.batch_size,
                               cuda=opt.cuda)

    max_size_src = opt.max_size if opt.max_size != 0 else len(data['valid']['src'])
    max_size_tgt = opt.max_size if opt.max_size != 0 else len(data['valid']['tgt'])
    print("validation: max_size_src={} max_size_tgt={}".format(max_size_src, max_size_tgt))
    validation_data = DataLoader(data['dict']['src'],
                                 data['dict']['tgt'],
                                 src_insts=data['valid']['src'][0:max_size_src],
                                 tgt_insts=data['valid']['tgt'][0:max_size_tgt],
                                 batch_size=opt.batch_size,
                                 shuffle=opt.cuda,
                                 test=True,
                                 cuda=opt.cuda)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight and training_data.src_word2idx != training_data.tgt_word2idx:
        print(
            '[Warning]',
            'The src/tgt word2idx table are different but asked to share word embedding.'
        )

    print(opt)

    transformer = Transformer(
        opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len,
        proj_share_weight=opt.proj_share_weight,
        embs_share_weight=opt.embs_share_weight,
        d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec,
        d_inner_hid=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head,
        dropout=opt.dropout) if not opt.funnel else FunnelTransformer(
            opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len,
            proj_share_weight=opt.proj_share_weight,
            embs_share_weight=opt.embs_share_weight,
            d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec,
            d_inner_hid=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head,
            dropout=opt.dropout)
    #print(transformer)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09, lr=opt.lr),
        opt.d_model, opt.n_warmup_steps)

    def get_criterion(vocab_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(vocab_size)
        weight[Constants.PAD] = 0
        # size_average=False is deprecated in newer PyTorch; reduction='sum' is the equivalent.
        return nn.CrossEntropyLoss(weight, size_average=False)

    crit = get_criterion(training_data.tgt_vocab_size)

    if opt.cuda:
        transformer = transformer.cuda()
        crit = crit.cuda()

    train(transformer, training_data, validation_data, crit, optimizer, opt)
if config.test_data_path:
    train_data = d.get_trainset(config)
    validate_data = d.get_testset(config)
else:
    train_data, validate_data = d.get_splite_data(config)

model = Transformer(config.src_vocab_size,
                    config.fix_len,
                    d_k=config.d_k,
                    d_v=config.d_v,
                    d_model=config.d_model,
                    d_word_vec=config.d_word_vec,
                    d_inner=config.d_inner,
                    n_layers=config.n_layers,
                    n_head=config.n_head,
                    dropout=config.dropout)
criterion = nn.CrossEntropyLoss()

if config.freeze:
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
else:
    model_parameters = model.parameters()
optimizer = torch.optim.Adam(model_parameters, lr=config.learning_rate,
                             weight_decay=config.weight_decay)

train(model, train_data, validate_data, config, optimizer, criterion)
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('--train_src', required=True)
    parser.add_argument('--valid_src', required=True)
    parser.add_argument('--max_word_seq_len', type=int, default=100)
    parser.add_argument('--min_word_count', type=int, default=5)
    parser.add_argument('--keep_case', action='store_true')
    parser.add_argument('--epoch', type=int, default=500)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--num_worker', type=int, default=8)
    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('--d_model', type=int, default=512)
    parser.add_argument('--d_inner_hid', type=int, default=2048)
    parser.add_argument('--d_k', type=int, default=64)
    parser.add_argument('--d_v', type=int, default=64)
    parser.add_argument('--n_head', type=int, default=8)
    parser.add_argument('--n_layers', type=int, default=6)
    parser.add_argument('--n_warmup_steps', type=int, default=4000)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--embs_share_weight', action='store_true')
    parser.add_argument('--proj_share_weight', action='store_true')
    parser.add_argument('--model', default=None, help='Path to model file')
    parser.add_argument('--log', default=None)
    parser.add_argument('--save_model', default=None)
    parser.add_argument('--save_data', default='./data/word2idx.pth')
    parser.add_argument('--save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('--no_cuda', action='store_true')
    parser.add_argument('--label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    opt.max_token_seq_len = opt.max_word_seq_len + 2

    #========= Loading Dataset =========#
    training_data = torch.utils.data.DataLoader(
        dataset.TranslationDataset(
            dir_name=opt.train_src,
            max_word_seq_len=opt.max_word_seq_len,
            min_word_count=opt.min_word_count,
            keep_case=opt.keep_case,
            src_word2idx=None,
            tgt_word2idx=None),
        num_workers=opt.num_worker,
        batch_size=opt.batch_size,
        collate_fn=paired_collate_fn,
        shuffle=True)

    validation_data = torch.utils.data.DataLoader(
        dataset.TranslationDataset(
            dir_name=opt.valid_src,
            max_word_seq_len=opt.max_word_seq_len,
            min_word_count=opt.min_word_count,
            keep_case=opt.keep_case,
            src_word2idx=training_data.dataset.src_word2idx,
            tgt_word2idx=training_data.dataset.tgt_word2idx),
        num_workers=opt.num_worker,
        batch_size=opt.batch_size,
        collate_fn=paired_collate_fn,
        shuffle=True)

    data = {
        'dict': {
            'src': training_data.dataset.src_word2idx,
            'tgt': training_data.dataset.tgt_word2idx}}
    print('[Info] Dumping the processed data to pickle file', opt.save_data)
    torch.save(data, opt.save_data)
    print('[Info] Finish.')
    del data

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'
    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k, d_v=opt.d_v,
                              d_model=opt.d_model, d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid, n_layers=opt.n_layers,
                              n_head=opt.n_head, dropout=opt.dropout).to(device)

    if opt.model is not None:
        print('pretrain model!')
        checkpoint = torch.load(opt.model)
        model_opt = checkpoint['settings']
        transformer = Transformer(
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k, d_v=model_opt.d_v,
            d_model=model_opt.d_model, d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid, n_layers=model_opt.n_layers,
            n_head=model_opt.n_head, dropout=model_opt.dropout)
        transformer.load_state_dict(checkpoint['model'])
        transformer = transformer.to(device)

    # Build the optimizer after the (optional) pretrained reload so that it tracks
    # the parameters of the model instance that is actually trained.
    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)
    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default='default')
    parser.add_argument('-tensorboard', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    global global_counter
    global_counter = 0

    writer = None
    if opt.tensorboard:
        writer = SummaryWriter(os.path.join('./logs', opt.tensorboard))

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    global idx2char
    idx2char = {v: k for k, v in data['dict']['src'].items()}
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data, unique_char_len = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k, d_v=opt.d_v,
                              d_model=opt.d_model, d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid, n_layers=opt.n_layers,
                              n_head=opt.n_head, dropout=opt.dropout).to(device)

    try:
        transformer.load_state_dict(torch.load('./checkpoints/model.pt'))
        print("Model loaded successfully.......")
    except (FileNotFoundError, RuntimeError):
        # No usable checkpoint found; start training from scratch.
        pass

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt,
          unique_char_len, writer)
        print(' - (Training) accuracy: {accu:3.3f} %, '
              'elapse: {elapse:3.3f} min'.format(
                  accu=100 * train_accu, elapse=(time.time() - start) / 60))

        start = time.time()
        valid_loss, valid_accu = eval_epoch(model, validation_data, predicates)
        print(' - (Validation) accuracy: {accu:3.3f} %, '
              'elapse: {elapse:3.3f} min'.format(
                  accu=100 * valid_accu, elapse=(time.time() - start) / 60))

        valid_accus += [valid_accu]


device = torch.device('cpu')
word2idx, ints, en1_pos, en2_pos, predicates, relation2idx = data.build_sentences()
training_data, validation_data = prepare_dataloaders(word2idx, ints, en1_pos, en2_pos, predicates)

model = Transformer(
    n_src_vocab=len(word2idx),
    len_max_seq=config.max_seq_len).to(device)

optimizer = ScheduledOptim(
    optim.Adam(
        filter(lambda x: x.requires_grad, model.parameters()),
        betas=(0.9, 0.98), eps=1e-09),
    512, 1000)

train(model, training_data, validation_data, optimizer, predicates)
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', default="./pssp-data/data.pt")
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=17)
    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=256)
    parser.add_argument('-d_inner_hid', type=int, default=512)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default="model")
    parser.add_argument('-save_plot', default="loss.png")
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data, test_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size
    opt.vocab_src = training_data.dataset.src_word2idx
    opt.vocab_tgt = training_data.dataset.tgt_word2idx

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(
        opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head,
        dropout=opt.dropout)
    transformer = DataParallel(transformer, range(0, torch.cuda.device_count())).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    weight_mask = None
    crossEntropy = nn.CrossEntropyLoss(weight_mask, reduction='sum', ignore_index=Constants.PAD)

    train_loss, val_loss = train(
        transformer, training_data, validation_data, optimizer, device, opt, crossEntropy)

    print("Starting Test...")
    test(transformer, test_data, device, opt, crossEntropy)

    print("Making loss graph...")
    plt = plot(train_loss, val_loss)
    plt.savefig(opt.save_plot + ".png")
    print("Finished!")
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)
    parser.add_argument('-epoch', type=int, default=None)
    parser.add_argument('-step', type=int, default=None)
    parser.add_argument('-batch_size', type=int, default=64)
    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    # NOTE(keshav2): This just refers to the learning rate schedule,
    #                nothing performance related.
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('--checkpoint_dir', type=str,
                        default='/lfs/1/keshav2/checkpoints/transformer')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='all')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')
    parser.add_argument('--dist-url', default='env://', type=str,
                        help='url used to set up distributed training')
    parser.add_argument('--dist-backend', default='nccl', type=str,
                        help='Distributed backend')
    parser.add_argument('--local_rank', default=0, type=int, help='Local rank')
    parser.add_argument('--rank', default=None, type=int, help='Rank')
    parser.add_argument('--world_size', default=None, type=int, help='World size')
    parser.add_argument('--master_addr', default=None, type=str,
                        help='Master address to use for distributed run')
    parser.add_argument('--master_port', default=None, type=int,
                        help='Master port to use for distributed run')
    parser.add_argument('--throughput_estimation_interval', type=int, default=None,
                        help='Steps between logging steps completed')
    parser.add_argument('--max_duration', type=int, default=None,
                        help='Maximum duration in seconds')
    parser.add_argument('--enable_gavel_iterator', action='store_true',
                        default=False, help='If set, use Gavel iterator')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    torch.cuda.set_device(opt.local_rank)

    if opt.epoch is not None and opt.step is not None:
        raise ValueError('Only one of epoch and step may be set')
    elif opt.epoch is None and opt.step is None:
        raise ValueError('One of epoch and step must be set')

    opt.distributed = False
    if opt.master_addr is not None:
        opt.distributed = True
        os.environ['MASTER_ADDR'] = opt.master_addr
        os.environ['MASTER_PORT'] = str(opt.master_port)
        dist.init_process_group(backend=opt.dist_backend,
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.rank)

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(
        data, opt, opt.master_addr is not None)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'
    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k, d_v=opt.d_v,
                              d_model=opt.d_model, d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid, n_layers=opt.n_layers,
                              n_head=opt.n_head, dropout=opt.dropout).to(device)

    if opt.distributed:
        transformer = DDP(transformer,
                          device_ids=[opt.local_rank],
                          output_device=opt.local_rank)

    if opt.enable_gavel_iterator:
        training_data = GavelIterator(training_data, opt.checkpoint_dir,
                                      load_checkpoint, save_checkpoint)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
def main():
    print(args)
    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')

    src_vocab, tgt_vocab = get_vocab(TRAIN_X, TRAIN_Y)

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=80000)

    max_src_len = 101
    max_tgt_len = 47
    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid

    vocab = small_vocab

    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)

    model = Transformer(len(vocab), len(vocab), max_src_len,
                        d_word_vec=300, d_model=300, d_inner=1200,
                        n_layers=1, n_head=6, d_k=50, d_v=50, dropout=0.1,
                        tgt_emb_prj_weight_sharing=True,
                        emb_src_tgt_weight_sharing=True).cuda()
    # print(model)

    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.3)
    scheduler.step()  # last_epoch=-1, which will not update lr at the first time

    train(train_x, train_y, valid_x, valid_y, model, optimizer, scheduler,
          args.n_epochs, saved_state['epoch'])
def main():
    """ Main function. """

    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)
    parser.add_argument('-epoch', type=int, default=30)
    parser.add_argument('-batch_size', type=int, default=16)
    parser.add_argument('-d_model', type=int, default=64)
    parser.add_argument('-d_rnn', type=int, default=256)
    parser.add_argument('-d_inner_hid', type=int, default=128)
    parser.add_argument('-d_k', type=int, default=16)
    parser.add_argument('-d_v', type=int, default=16)
    parser.add_argument('-n_head', type=int, default=4)
    parser.add_argument('-n_layers', type=int, default=4)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-lr', type=float, default=1e-4)
    parser.add_argument('-smooth', type=float, default=0.1)
    parser.add_argument('-log', type=str, default='log.txt')

    opt = parser.parse_args()

    # default device is CUDA
    opt.device = torch.device('cuda')

    # setup the log file
    with open(opt.log, 'w') as f:
        f.write('Epoch, Log-likelihood, Accuracy, RMSE\n')

    print('[Info] parameters: {}'.format(opt))

    """ prepare dataloader """
    trainloader, testloader, num_types = prepare_dataloader(opt)

    """ prepare model """
    model = Transformer(
        num_types=num_types,
        d_model=opt.d_model,
        d_rnn=opt.d_rnn,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        d_k=opt.d_k,
        d_v=opt.d_v,
        dropout=opt.dropout,
    )
    model.to(opt.device)

    """ optimizer and scheduler """
    optimizer = optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                           opt.lr, betas=(0.9, 0.999), eps=1e-05)
    scheduler = optim.lr_scheduler.StepLR(optimizer, 10, gamma=0.5)

    """ prediction loss function, either cross entropy or label smoothing """
    if opt.smooth > 0:
        pred_loss_func = Utils.LabelSmoothingLoss(opt.smooth, num_types, ignore_index=-1)
    else:
        pred_loss_func = nn.CrossEntropyLoss(ignore_index=-1, reduction='none')

    """ number of parameters """
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('[Info] Number of parameters: {}'.format(num_params))

    """ train the model """
    train(model, trainloader, testloader, optimizer, scheduler, pred_loss_func, opt)
def main():
    '''
    Usage:
    python train.py -data_pkl m30k_deen_shr.pkl -embs_share_weight -proj_share_weight -label_smoothing -output_dir output -b 256 -warmup 128000
    '''

    parser = argparse.ArgumentParser()

    parser.add_argument('-data_pkl', default=None)     # all-in-1 data pickle or bpe field
    parser.add_argument('-train_path', default=None)   # bpe encoded data
    parser.add_argument('-val_path', default=None)     # bpe encoded data
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)
    parser.add_argument('-lr_mul', type=float, default=2.0)
    parser.add_argument('-seed', type=int, default=None)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-scale_emb_or_prj', type=str, default='prj')
    parser.add_argument('-output_dir', type=str, default=None)
    parser.add_argument('-use_tb', action='store_true')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # https://pytorch.org/docs/stable/notes/randomness.html
    # For reproducibility
    if opt.seed is not None:
        torch.manual_seed(opt.seed)
        torch.backends.cudnn.benchmark = False
        # torch.set_deterministic(True)
        np.random.seed(opt.seed)
        random.seed(opt.seed)

    if not opt.output_dir:
        print('No experiment result will be saved.')
        raise

    if not os.path.exists(opt.output_dir):
        os.makedirs(opt.output_dir)

    if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000:
        print('[Warning] The warmup steps may be not enough.\n'
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n'
              'Using smaller batch w/o longer warmup may cause '
              'the warmup stage ends with only little data trained.')

    device = torch.device('cuda' if opt.cuda else 'cpu')

    #========= Loading Dataset =========#
    if all((opt.train_path, opt.val_path)):
        training_data, validation_data = prepare_dataloaders_from_bpe_files(opt, device)
    elif opt.data_pkl:
        training_data, validation_data = prepare_dataloaders(opt, device)
    else:
        raise

    print(opt)

    transformer = Transformer(
        opt.src_vocab_size,
        opt.trg_vocab_size,
        src_pad_idx=opt.src_pad_idx,
        trg_pad_idx=opt.trg_pad_idx,
        trg_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_trg_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head,
        dropout=opt.dropout,
        scale_emb_or_prj=opt.scale_emb_or_prj).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
        opt.lr_mul, opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
def main():
    '''
    Usage:
    python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 256 -warmup 128000
    '''
    global C
    global shapes
    global Beta

    parser = argparse.ArgumentParser()

    parser.add_argument('-data_pkl', default=None)     # all-in-1 data pickle or bpe field
    parser.add_argument('-srn', type=bool, default=False)
    parser.add_argument('-optimize_c', type=bool, default=False)
    parser.add_argument('-Beta', type=float, default=1.0)
    parser.add_argument("-lr", type=float, default=1e-1)
    parser.add_argument("-scheduler_mode", type=str, default=None)
    parser.add_argument("-scheduler_factor", type=float, default=0.5)
    parser.add_argument('-train_path', default=None)   # bpe encoded data
    parser.add_argument('-val_path', default=None)     # bpe encoded data
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    Beta = opt.Beta

    if not opt.log and not opt.save_model:
        print('No experiment result will be saved.')
        raise

    if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000:
        print('[Warning] The warmup steps may be not enough.\n'
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n'
              'Using smaller batch w/o longer warmup may cause '
              'the warmup stage ends with only little data trained.')

    device = torch.device('cuda' if opt.cuda else 'cpu')

    #========= Loading Dataset =========#
    if all((opt.train_path, opt.val_path)):
        training_data, validation_data = prepare_dataloaders_from_bpe_files(opt, device)
    elif opt.data_pkl:
        training_data, validation_data = prepare_dataloaders(opt, device)
    else:
        raise

    print(opt)

    transformer = Transformer(opt.src_vocab_size,
                              opt.trg_vocab_size,
                              src_pad_idx=opt.src_pad_idx,
                              trg_pad_idx=opt.trg_pad_idx,
                              trg_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_trg_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k, d_v=opt.d_v,
                              d_model=opt.d_model, d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid, n_layers=opt.n_layers,
                              n_head=opt.n_head, dropout=opt.dropout).to(device)

    if opt.srn:
        transformer = migrate_to_srn(transformer)
        transformer = transformer.to(device)

    if opt.optimize_c:
        srn_modules = [module for module in transformer.modules()
                       if isinstance(module, (SRNLinear, SRNConv2d))]
        sranks = []
        shapes = []
        for module in srn_modules:
            W = module.weight.detach()
            shape_w = W.shape
            W = W.view(shape_w[0], -1)
            sranks.append(stable_rank(W).item())
            shapes.append(W.shape)

        # a rule of thumb to initialize the target srank with the current srank of the model
        C = [
            Parameter((torch.ones(1) * sranks[i] /
                       min(shapes[i])).view(()))
            for i in range(len(srn_modules))
        ]
        for i, module in enumerate(srn_modules):
            # NOTE: Tensor.to() is not in-place; without reassignment this line does
            # not actually move C[i] onto `device`.
            C[i].to(device)
            module.c = C[i]
        criteria = criteria_
    else:
        criteria = cal_performance

    optimizer = ScheduledOptim(optim.Adam(transformer.parameters(),
                                          lr=1e-2, betas=(0.9, 0.98), eps=1e-09),
                               opt.lr, opt.d_model, opt.n_warmup_steps,
                               mode=opt.scheduler_mode,
                               factor=opt.scheduler_factor,
                               patience=3)

    train(transformer, training_data, validation_data, optimizer, device, opt, loss=criteria)

    print("~~~~~~~~~~~~~C~~~~~~~~~~~~~")
    print(C)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("-----------Model-----------")
    print(transformer)
    print("---------------------------")

    with torch.no_grad():
        for pname, p in transformer.named_parameters():
            if len(p.shape) > 1:
                print("...Parameter ", pname, ", srank=",
                      stable_rank(p.view(p.shape[0], -1)).item())
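# stable_rank is used above but not defined in these snippets. The usual definition
# (squared Frobenius norm over squared spectral norm, always at most the true rank)
# is sketched below as a reference implementation; the actual helper in this
# codebase may differ in details.
import torch

def stable_rank_sketch(W: torch.Tensor) -> torch.Tensor:
    ''' srank(W) = ||W||_F^2 / ||W||_2^2 for a 2-D weight matrix. '''
    fro_sq = (W ** 2).sum()
    sigma_max = torch.linalg.svdvals(W)[0]   # largest singular value
    return fro_sq / (sigma_max ** 2)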
def main(): ''' Main function ''' parser = argparse.ArgumentParser() parser.add_argument('-data', required=True) parser.add_argument('-mined_data', required=True) parser.add_argument('-snippet_model', required=True) parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-batch_size', type=int, default=64) #parser.add_argument('-d_word_vec', type=int, default=512) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', type=bool, default=True) parser.add_argument('-save_model_dir', default=None, required=True) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='all') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') # For bleu eval parser.add_argument('-beam_size', type=int, default=5, help='Beam size') parser.add_argument('-n_best', type=int, default=1, help="""If verbose is set, will output the n_best decoded sentences""") parser.add_argument('-test_epoch', type=int, default=5, help='Test every x epochs') parser.add_argument('-resume_from_epoch', type=int, default=0, help='Warm restart') # Not really needed parser.add_argument('-alpha', type=float, default=1.0, help='Weighting loss') parser.add_argument('-loss_weight', type=float, default=0.1, help='Mined loss weight') parser.add_argument('-lr', type=float, default=1e-3, help='Learning rate') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model # Snippet model sentencepiece sp.Load(opt.snippet_model) #========= Loading Dataset =========# data = torch.load(opt.data) mined_data = torch.load(opt.mined_data) opt.inp_seq_max_len = 4 * data['settings'].train_max_input_len opt.out_seq_max_len = 4 * data['settings'].train_max_output_len opt.max_token_seq_len = int(opt.out_seq_max_len / 4) training_data, validation_data, test_data, mined_data = prepare_dataloaders( data, mined_data, opt) opt.src_vocab_size = training_data.dataset.src_vocab_size opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size print(opt.inp_seq_max_len, opt.out_seq_max_len, opt.src_vocab_size, opt.tgt_vocab_size) #========= Preparing Model =========# if opt.embs_share_weight: assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \ 'The src/tgt word2idx table are different but asked to share word embedding.' 
print(opt) device = torch.device('cuda' if opt.cuda else 'cpu') transformer = Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.inp_seq_max_len, opt.out_seq_max_len, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) optimizer = ScheduledOptim( optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09, lr=opt.lr), opt.d_model, opt.n_warmup_steps) save_params(opt) opt = check_restart_conditions(opt) if opt.resume_from_epoch >= 1: print('Loading Old model') print('Loading model files from folder: %s' % opt.save_model_dir) transformer = load_models(transformer, opt, opt.resume_from_epoch) train(transformer, training_data, validation_data, test_data, mined_data, optimizer, device, opt)
def main(): ''' Usage: python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 256 -warmup 128000 ''' parser = argparse.ArgumentParser() parser.add_argument('-data_pkl', default=None) # all-in-1 data pickle or bpe field parser.add_argument('-train_path', default=None) # bpe encoded data parser.add_argument('-val_path', default=None) # bpe encoded data parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-b', '--batch_size', type=int, default=2048) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) parser.add_argument('-save_model', default=None) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', default=True, action='store_true') parser.add_argument('-label_smoothing', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model if not opt.log and not opt.save_model: print('No experiment result will be saved.') raise if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000: print('[Warning] The warmup steps may be not enough.\n'\ '(sz_b, warmup) = (2048, 4000) is the official setting.\n'\ 'Using smaller batch w/o longer warmup may cause '\ 'the warmup stage ends with only little data trained.') device = torch.device('cuda' if opt.cuda else 'cpu') #========= Loading Dataset =========# if all((opt.train_path, opt.val_path)): training_data, validation_data = prepare_dataloaders_from_bpe_files( opt, device) elif opt.data_pkl: training_data, validation_data = prepare_dataloaders(opt, device) else: raise print(opt) transformer = Transformer(opt.src_vocab_size, opt.trg_vocab_size, src_pad_idx=opt.src_pad_idx, trg_pad_idx=opt.trg_pad_idx, trg_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_trg_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) optimizer = ScheduledOptim( optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09), 2.0, opt.d_model, opt.n_warmup_steps) train(transformer, training_data, validation_data, optimizer, device, opt)
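The ScheduledOptim call above (with the 2.0 multiplier) typically wraps the inverse-square-root warmup schedule from the original Transformer paper; the sketch below shows that schedule under that assumption. The class and method names here are illustrative, not the project's own API.

class WarmupInvSqrtSchedule:
    # lr(step) = lr_mul * d_model**-0.5 * min(step**-0.5, step * n_warmup_steps**-1.5)
    def __init__(self, optimizer, lr_mul, d_model, n_warmup_steps):
        self.optimizer = optimizer
        self.lr_mul = lr_mul
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def step_and_update_lr(self):
        self.n_steps += 1
        scale = min(self.n_steps ** -0.5, self.n_steps * self.n_warmup_steps ** -1.5)
        for group in self.optimizer.param_groups:
            group['lr'] = self.lr_mul * self.d_model ** -0.5 * scale
        self.optimizer.step()

Under this schedule the learning rate peaks exactly at step n_warmup_steps and decays afterwards, which is why the script warns when a small batch size is combined with the default 4000-step warmup.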
def train(): parser = ArgumentParser() parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset.") parser.add_argument("--train_batch_size", type=int, default=64, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation") parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps") parser.add_argument("--lr", type=float, default=6.25e-4, help="Learning rate") parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm") parser.add_argument("--n_epochs", type=int, default=15, help="Number of training epochs") parser.add_argument( "--eval_before_start", action='store_true', help="If true start with a first evaluation before training") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument( "--fp16", type=str, default="", help= "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)") parser.add_argument( "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") parser.add_argument("--gpt2_model_name", type=str, default="gpt2", help="Path, url or short name of the model") parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-label_smoothing', action='store_true') args = parser.parse_args() args.d_word_vec = args.d_model # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Running process %d", args.local_rank ) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat(args)) # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info("Prepare tokenizer, pretrained model and optimizer.") tokenizer_class = GPT2Tokenizer if "gpt2" in args.gpt2_model_name else OpenAIGPTTokenizer # cant use Autotokenizer because checkpoint could be a Path tokenizer = tokenizer_class.from_pretrained(args.gpt2_model_name) num_tokens = len(tokenizer.encoder) num_added_tokens = tokenizer.add_special_tokens( ATTR_TO_SPECIAL_TOKEN) # doesn't add if they are already there model = Transformer( num_tokens + num_added_tokens, num_tokens + num_added_tokens, src_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]), trg_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]), trg_emb_prj_weight_sharing=args.proj_share_weight, emb_src_trg_weight_sharing=args.embs_share_weight, d_k=args.d_k, d_v=args.d_v, d_model=args.d_model, d_word_vec=args.d_word_vec, d_inner=args.d_inner_hid, n_layers=args.n_layers, n_head=args.n_head, dropout=args.dropout).to(args.device) optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders( args, tokenizer, tokenizer) # Training function and trainer def update(engine, batch): model.train() batch = tuple(input_tensor.to(args.device) for input_tensor in batch) source_ids, target_ids, lm_labels = batch (lm_loss), *_ = model(source_ids, target_ids, labels=lm_labels) loss = lm_loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch = tuple( input_tensor.to(args.device) for input_tensor in batch) source_ids, target_ids, lm_labels = batch #logger.info(tokenizer.decode(target_ids[0].tolist())) lm_logits, *_ = model(source_ids, target_ids) lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view( -1, lm_logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return (lm_logits_flat_shifted, ), (lm_labels_flat_shifted, ) evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch 
trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Linearly decrease the learning rate from lr to zero scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") metrics = { "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])) } metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss"]) evaluator.add_event_handler( Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(evaluator.state.metrics))) log_dir = make_logdir(args.gpt2_model_name, args.dataset_path) tb_logger = TensorboardLogger(log_dir) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list( metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=4) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model) }) # "getattr" takes care of distributed encapsulation torch.save(args, log_dir + '/model_training_args.bin') #getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME)) tokenizer.save_pretrained(log_dir) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and args.n_epochs > 0: os.rename( os.path.join(log_dir, checkpoint_handler._saved[-1][1]), os.path.join(log_dir, WEIGHTS_NAME) ) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
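For intuition on the two training knobs above: with the default train_batch_size of 64 and gradient_accumulation_steps of 8, the optimizer steps once every 8 mini-batches, so the effective batch size is 512. The PiecewiseLinear handler then decays the learning rate linearly from args.lr at iteration 0 down to zero at iteration n_epochs * len(train_loader).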
def main(): device = torch.device("cuda:0" if USE_CUDA else "cpu") env = Environment() END_TAG_IDX = env.lang.word2idx[END_TAG] SAY_HI = "hello" targ_lang = env.lang vocab_inp_size = len(env.lang.word2idx) vocab_tar_size = len(targ_lang.word2idx) print("vocab_inp_size", vocab_inp_size) print("vocab_tar_size", vocab_tar_size) model = Transformer( vocab_inp_size, vocab_tar_size, MAX_TARGET_LEN, d_word_vec=32, d_model=32, d_inner=32, n_layers=3, n_head=4, d_k=32, d_v=32, dropout=0.1, ).to(device) # baseline = Baseline(UNITS) history = [] l_optimizer = torch.optim.Adam(model.parameters(), lr=1e-5) batch = None def maybe_pad_sentence(s): return tf.keras.preprocessing.sequence.pad_sequences( s, maxlen=MAX_TARGET_LEN, padding='post') def get_returns(r: float, seq_len: int): return list(reversed([r * (GAMMA**t) for t in range(seq_len)])) def sentence_to_idxs(sentence: str): return [ env.lang.word2idx[token] for token in tokenize_sentence(sentence) ] for episode in range(EPISODES): # Start of Episode env.reset() model.eval() # get first state from the env state, _, done = env.step(SAY_HI) while not done: src_seq = [ env.lang.word2idx[token] for token in tokenize_sentence(state) ] src_seq, src_pos = collate_fn([src_seq]) src_seq, src_pos = src_seq.to(device), src_pos.to(device) enc_output, *_ = model.encoder(src_seq, src_pos) actions_t = [] actions = [] actions_idx = [] while len(actions) == 0 or actions[len(actions) - 1] != END_TAG_IDX and len( actions) < MAX_TARGET_LEN: # construct new tgt_seq based on what's outputed so far if len(actions_t) == 0: tgt_seq = [env.lang.word2idx[Constants.UNK_WORD]] else: tgt_seq = actions_idx tgt_seq, tgt_pos = collate_fn([tgt_seq]) tgt_seq, tgt_pos = tgt_seq.to(device), tgt_pos.to(device) # dec_output dims: [1, pos, hidden] dec_output, * \ _ = model.decoder(tgt_seq, tgt_pos, src_seq, enc_output) # pick last step dec_output = dec_output[:, -1, :] # w_logits dims: [1, vocab_size] w_logits = model.tgt_word_prj(dec_output) # w_probs dims: [1, vocab_size] w_probs = torch.nn.functional.softmax(w_logits, dim=1) w_dist = torch.distributions.categorical.Categorical( probs=w_probs) w_idx_t = w_dist.sample() w_idx = w_idx_t.cpu().numpy()[0] actions_t.append(w_idx_t) actions_idx.append(w_idx) actions.append(env.lang.idx2word[w_idx]) # action is a sentence (string) action_str = ' '.join(actions) next_state, reward, done = env.step(action_str) # print(reward) history.append((state, actions_t, action_str, reward)) state = next_state # record history (to be used for gradient updating after the episode is done) # End of Episode # Update policy model.train() while len(history) >= BATCH_SIZE: batch = history[:BATCH_SIZE] state_inp_b, action_inp_b, reward_b, ret_seq_b = zip(*[[ sentence_to_idxs(state), actions_b, reward, get_returns(reward, MAX_TARGET_LEN) ] for state, actions_b, _, reward in batch]) action_inp_b = [torch.stack(sent) for sent in action_inp_b] action_inp_b = torch.stack(action_inp_b) ret_seq_b = np.asarray(ret_seq_b) # ret_mean = np.mean(ret_seq_b) # ret_std = np.std(ret_seq_b) # ret_seq_b = (ret_seq_b - ret_mean) / ret_std ret_seq_b = np.exp((ret_seq_b - 0.5) * 5) ret_seq_b = torch.tensor(ret_seq_b, dtype=torch.float32).to(device) loss = 0 # loss_bl=0 l_optimizer.zero_grad() # accumulate gradient with GradientTape src_seq, src_pos = collate_fn(list(state_inp_b)) src_seq, src_pos = src_seq.to(device), src_pos.to(device) enc_output_b, *_ = model.encoder(src_seq, src_pos) max_sentence_len = action_inp_b.shape[1] tgt_seq = [[Constants.BOS] for i in range(BATCH_SIZE)] for 
t in range(max_sentence_len): # _b stands for batch prev_w_idx_b, tgt_pos = collate_fn(tgt_seq) prev_w_idx_b, tgt_pos = prev_w_idx_b.to(device), tgt_pos.to( device) # dec_output_b dims: [batch, pos, hidden] dec_output_b, *_ = \ model.decoder(prev_w_idx_b, tgt_pos, src_seq, enc_output_b) # pick last step dec_output_b = dec_output_b[:, -1, :] # w_logits_b dims: [batch, vocab_size] w_logits_b = model.tgt_word_prj(dec_output_b) # w_probs dims: [batch, vocab_size] w_probs_b = torch.nn.functional.softmax(w_logits_b, dim=1) dist_b = torch.distributions.categorical.Categorical( probs=w_probs_b) curr_w_idx_b = action_inp_b[:, t, :] log_probs_b = torch.transpose( dist_b.log_prob(torch.transpose(curr_w_idx_b, 0, 1)), 0, 1) # bl_val_b = baseline(tf.cast(dec_hidden_b, 'float32')) # delta_b = ret_b - bl_val_b # cost_b = -tf.math.multiply(log_probs_b, delta_b) # cost_b = -tf.math.multiply(log_probs_b, ret_b) ret_b = torch.reshape(ret_seq_b[:, t], (BATCH_SIZE, 1)).to(device) # alternatively, use torch.mul() but it is overloaded. Might need to try log_probs_b*vec.expand_as(A) cost_b = -torch.mul(log_probs_b, ret_b) # log_probs_b*vec.expand_as(A) # cost_b = -torch.bmm() #if we are doing batch multiplication loss += cost_b # loss_bl += -tf.math.multiply(delta_b, bl_val_b) prev_w_idx_b = curr_w_idx_b tgt_seq = np.append(tgt_seq, prev_w_idx_b.data.cpu().numpy(), axis=1).tolist() # calculate cumulative gradients # model_vars = encoder.variables + decoder.variables loss = loss.mean() loss.backward() # loss_bl.backward() # finally, apply gradient l_optimizer.step() # bl_optimizer.step() # Reset everything for the next episode history = history[BATCH_SIZE:] if episode % max(BATCH_SIZE, 32) == 0 and batch != None: print(">>>>>>>>>>>>>>>>>>>>>>>>>>") print("Episode # ", episode) print("Samples from episode with rewards > 0: ") good_rewards = [(s, a_str, r) for s, _, a_str, r in batch] for s, a, r in random.sample(good_rewards, min(len(good_rewards), 3)): print("prev_state: ", s) print("actions: ", a) print("reward: ", r) # print("return: ", get_returns(r, MAX_TARGET_LEN)) ret_seq_b_np = ret_seq_b.cpu().numpy() print("all returns: min=%f, max=%f, median=%f" % (np.min(ret_seq_b_np), np.max(ret_seq_b_np), np.median(ret_seq_b_np))) print("avg reward: ", sum(reward_b) / len(reward_b)) print("avg loss: ", np.mean(loss.cpu().detach().numpy()))
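The inner generation loop above is a per-token REINFORCE update: each sampled token contributes -log pi(a_t | state, a_<t) * R_t to the loss, where R_t is the (exponentially rescaled) discounted return, and summing over time steps and averaging over the batch gives the policy-gradient loss that loss.backward() differentiates. As a concrete example of get_returns, if GAMMA were 0.9 then get_returns(1.0, 3) would give [0.81, 0.9, 1.0]: the terminal reward is credited in full to the last generated token and discounted backwards toward the start of the reply.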
def main(): parser = argparse.ArgumentParser() parser.add_argument('-data', default='./data/preprocessedData') parser.add_argument('-epoch', type=int, default=50) parser.add_argument('-batch_size', type=int, default=64) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default='log') # None parser.add_argument('-save_model', default='trained') # None parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-beam_size', type=int, default=5, help='Beam size') parser.add_argument('-n_best', type=int, default=1, help="""If verbose is set, will output the n_best decoded sentences""") parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true', default=True) opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model # Loading Dataset data = torch.load(opt.data) opt.max_token_seq_len = data['settings'].max_token_seq_len training_data, validation_data = prepare_dataloaders(data, opt) opt.src_vocab_size = training_data.dataset.src_vocab_size opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size # Preparing Model if opt.embs_share_weight: assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \ 'The src/tgt word2idx table are different but asked to share word embedding.' print(opt) device = torch.device('cuda' if opt.cuda else 'cpu') # device = torch.device('cpu') transformer = Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout) discriminator = Discriminator(opt.d_model, 1024, opt.max_token_seq_len, device) #''' if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") transformer = nn.DataParallel(transformer) # ''' transformer.to(device) discriminator.to(device) optimizer = ScheduledOptim( optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) optimizer_d = optim.RMSprop(discriminator.parameters(), lr=5e-4) train(transformer, discriminator, training_data, validation_data, optimizer, optimizer_d, device, opt)
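Since the transformer may be wrapped in nn.DataParallel above, a practical note on checkpointing: save the underlying module so the state_dict keys are not prefixed with "module." and the weights can be re-loaded without the wrapper. A minimal sketch, with an illustrative file name:

# Unwrap nn.DataParallel (if present) before saving.
model_to_save = transformer.module if hasattr(transformer, 'module') else transformer
torch.save(model_to_save.state_dict(), 'transformer_best.chkpt')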
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data_all', default='data/csv/data_train_2_sort.torch')
    parser.add_argument('-save_model', default='module/2018-7-30.pt')
    parser.add_argument('-start_time', default='2018-07-01')
    parser.add_argument('-end_time', default='2018-08-30')
    parser.add_argument('-epoch', type=int, default=16)
    parser.add_argument('-batch_size', type=int, default=128)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=32)
    parser.add_argument('-d_v', type=int, default=32)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=2)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.3)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default='log/logs.log')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    parser.add_argument('-batch_x', type=int, default=32)  # type=int so CLI values match the int defaults
    parser.add_argument('-batch_y', type=int, default=32)
    parser.add_argument('-train_type', default='name')

    opt = parser.parse_args()
    opt.cuda = torch.cuda.is_available()
    opt.d_word_vec = opt.d_model
    device = torch.device('cuda' if opt.cuda else 'cpu')  # was used below but never defined

    # ========= Loading Dataset =========#
    # opt.max_token_seq_len = data['settings'].max_token_seq_len
    training_data, validation_data, voc_name, data_val_ofpa = ld.get_data_loader(opt, device)
    opt.src_vocab_size = voc_name
    opt.tgt_vocab_size = opt.src_vocab_size
    if opt.train_type == 'time':
        voc = ld.get_time_vac(opt)
        opt.tgt_vocab_size = voc if voc > 500 else 728

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert opt.src_vocab_size == opt.tgt_vocab_size, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.batch_x,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    if opt.train_type == 'time':
        print("train time dim ")
        # train(transformer, train_time, val_time, optimizer, device, opt)
    else:
        train(transformer, training_data, validation_data, optimizer, device, opt, data_val_ofpa)
def main(): ''' Main function ''' parser = argparse.ArgumentParser() parser.add_argument('-data', required=True) parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-batch_size', type=int, default=64) #parser.add_argument('-d_word_vec', type=int, default=512) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) parser.add_argument('-save_model', default=None) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') parser.add_argument('-seed', type=int, default=None) parser.add_argument( '-use_TT', nargs='+', choices=[Constants.embedding_, Constants.pff_, Constants.attention_]) parser.add_argument('-n_tt_cores', nargs='+', type=int, default=3) parser.add_argument('-tt_rank', nargs='+', type=int, default=8) opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model # Parse TT Arguments opt.tt_params = {} if opt.use_TT: assert len(opt.use_TT) == len( opt.n_tt_cores ), f"Specify the number of TT-cores for each of the {opt.use_TT}" assert len(opt.use_TT) == len( opt.tt_rank ), f"Specify the number of TT-rank for each of the {opt.use_TT}" for i in range(len(opt.use_TT)): opt.tt_params[opt.use_TT[i]] = { "n_tt_cores": opt.n_tt_cores[i], "tt_rank": opt.tt_rank[i] } if opt.seed is not None: torch.random.manual_seed(opt.seed) #========= Loading Dataset =========# data = torch.load(opt.data) opt.max_token_seq_len = data['settings'].max_token_seq_len training_data, validation_data = prepare_dataloaders(data, opt) opt.src_vocab_size = training_data.dataset.src_vocab_size opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size #========= Preparing Model =========# if opt.embs_share_weight: assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \ 'The src/tgt word2idx table are different but asked to share word embedding.' 
device = torch.device('cuda' if opt.cuda else 'cpu') # Print the model architecture and hyperparameters f = io.StringIO() with redirect_stdout(f): print(opt) transformer = Transformer( opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout, tt_params=opt.tt_params).to(device) optimizer = ScheduledOptim( optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) print( f"Number of trainable parameters: {sum(p.numel() for p in transformer.parameters() if p.requires_grad)}" ) summary(transformer, [[opt.max_token_seq_len] for i in range(4)], dtype="long") architecture_summary = f.getvalue() print(architecture_summary) if opt.log: log_architecture_file = opt.log + '.architecture.log' with open(log_architecture_file, 'w') as log_a: log_a.write(architecture_summary) train(transformer, training_data, validation_data, optimizer, device, opt)
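For clarity on the TT flags parsed above in this main(): -use_TT takes one or more layer types, and the i-th entries of -n_tt_cores and -tt_rank are paired with the i-th entry of -use_TT. Assuming, purely for illustration, that Constants.embedding_ == 'embedding' and Constants.pff_ == 'pff', a call like -use_TT embedding pff -n_tt_cores 3 4 -tt_rank 8 16 would produce an opt.tt_params of:

{'embedding': {'n_tt_cores': 3, 'tt_rank': 8},
 'pff':       {'n_tt_cores': 4, 'tt_rank': 16}}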