Example #1
def train(model: Transformer, optimizer, criterion, clip, device):
    model.train()
    epoch_loss = 0
    for index, batch in tqdm(enumerate(dataset_pro.train_iter)):
        shang_lian, shang_lian_length = batch.shang_lian
        shang_lian = shang_lian.permute(1, 0).to(device)
        # shang_lian_length = shang_lian_length.permute(1, 0).to(device)
        # shang_lian_length = shang_lian_length.numpy()
        # shang_lian_pos = torch.LongTensor(get_pos_ids(shang_lian_length, shang_lian.shape[1])).to(device)
        xia_lian, xia_lian_length = batch.xia_lian
        xia_lian = xia_lian.permute(1, 0).to(device)
        # xia_lian_length = xia_lian_length.numpy()
        # xia_lian_pos = torch.LongTensor(get_pos_ids(xia_lian_length, xia_lian.shape[1])).to(device)

        optimizer.zero_grad()

        outputs = model(shang_lian, xia_lian[:, :-1])
        outputs = outputs.contiguous().view(-1, outputs.shape[-1])
        xia_lian = xia_lian[:, 1:].contiguous().view(-1)
        loss = criterion(outputs, xia_lian)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # print(loss.item())
        optimizer.step()

        epoch_loss += loss.item()
    result_loss = epoch_loss / len(dataset_pro.train_iter)

    return result_loss
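
Example #1 covers only the training half of an epoch. A minimal evaluation counterpart is sketched below; it assumes a dataset_pro.valid_iter with the same batch layout as train_iter (the valid_iter name is an assumption, not part of the original snippet):

def evaluate(model: Transformer, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():  # no gradients needed for validation
        for index, batch in enumerate(dataset_pro.valid_iter):  # valid_iter is assumed
            shang_lian, _ = batch.shang_lian
            shang_lian = shang_lian.permute(1, 0).to(device)
            xia_lian, _ = batch.xia_lian
            xia_lian = xia_lian.permute(1, 0).to(device)

            outputs = model(shang_lian, xia_lian[:, :-1])
            outputs = outputs.contiguous().view(-1, outputs.shape[-1])
            xia_lian = xia_lian[:, 1:].contiguous().view(-1)
            epoch_loss += criterion(outputs, xia_lian).item()
    return epoch_loss / len(dataset_pro.valid_iter)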
Example #2
def main():
    ''' Main function '''

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    trn_data, val_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = trn_data.dataset.src_vocab_size
    opt.tgt_vocab_size = trn_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert trn_data.dataset.src_word2idx == trn_data.dataset.tgt_word2idx,\
            ('The src/tgt word2idx table are different but asked to share '
             'word embedding.')

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, trn_data, val_data, optimizer, device, opt)
Example #3
    d_model=args.d_model,
    d_word_vec=args.d_word_vec,
    d_inner=args.d_inner_hid,
    n_layers=args.n_layers,
    n_ensemble=args.n_ensemble,
    n_head=args.n_head,
    dropout=args.dropout,
    scale_emb_or_prj=args.scale_emb_or_prj).to(args.device)

print('model initiated.')

# optimizer = ScheduledOptim(
#     optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
#     args.lr_mul, args.d_model, args.n_warmup_steps)
optimizer = MyScheduledOptim(
    optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
    optim.Adam(list(model.parameters())[model.encoder.num_shared_parameters:], betas=(0.9, 0.98), eps=1e-09),
    args.milestones, args.lr_list, args.sep_optimizer_start_step)

if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)



def cal_performance(pred, dec_output, gold, trg_pad_idx, args, model, termination_bit_weight=None, smoothing=False):
    ''' Apply label smoothing if needed '''

    loss = cal_loss(pred, dec_output, gold, trg_pad_idx, args, model, termination_bit_weight, smoothing)
    if args.input_type == 'node_based':
        pred = pred.max(1)[1]
        gold = gold.contiguous().view(-1)
Example #4
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
Example #5
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    # ---------------------- all arguments and their defaults are defined here
    parser.add_argument('-data', required=False)

    parser.add_argument('-epoch', type=int, default=1)  # set to 1 for a quick smoke test
    parser.add_argument('-batch_size', type=int, default=32)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='/transformer_my')
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing',
                        action='store_true')  # store_true flags default to False and become True when the flag is passed

    opt = parser.parse_args()

    opt.d_word_vec = opt.d_model
    '''
    Override the parsed arguments here for convenience.
    '''
    opt.saved_weight = '/trained.chkpt'  # path of the previously trained checkpoint
    opt.data = 'yunixng_bash/data/multi30k.atok.low.pt'  # path of the dataset
    opt.save_model = 'trained'  # name under which the model is saved
    opt.save_mode = 'best'  # keep only the best checkpoint
    opt.proj_share_weight = True  # share target embedding and projection weights
    opt.label_smoothing = True  # use label smoothing
    opt.cuda = False
    opt.batch_size = 200
    opt.epoch = 30

    print(opt)
    #========= Loading Dataset =========#
    # The data here is already encoded; the encoding rules live inside this dict. The src and
    # tgt vocabularies are different, so the embs_share_weight flag above must stay False.
    # The dataset (multi30k.atok.low.pt, ~3 MB, ~30k sentence pairs, ~3k-word vocabulary) is
    # word-level only (no word pieces), so out-of-vocabulary words are common, but it is
    # small and fast, which makes it very convenient for testing.
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len
    # Length preprocessing: this just adds padding.
    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'
    print('All configured arguments are printed below:')
    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(  # build the model
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
Example #6
def main():
    ''' 
    Usage:
    python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 128 -epoch 100 -optim nero -lr 0.003
    '''

    parser = argparse.ArgumentParser()

    parser.add_argument('-data_pkl', default=None)     # all-in-1 data pickle or bpe field

    parser.add_argument('-train_path', default=None)   # bpe encoded data
    parser.add_argument('-val_path', default=None)     # bpe encoded data

    parser.add_argument('-seed', type=int, default=0)
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)
    parser.add_argument('-optim', type=str, choices=['adam', 'sgd', 'nero', 'lamb'])
    parser.add_argument('-lr', type=float)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # set random seed
    torch.manual_seed(opt.seed)
    np.random.seed(opt.seed)

    # tensorboard writer
    log_dir = 'runs/' + opt.optim + '_' + str(opt.lr) + '_seed' + str(opt.seed) 
    writer = SummaryWriter(log_dir=log_dir)
    print("Saving tensorboard to "+log_dir)

    if not opt.log and not opt.save_model:
        raise ValueError('No experiment result will be saved.')

    device = torch.device('cuda' if opt.cuda else 'cpu')

    #========= Loading Dataset =========#

    if all((opt.train_path, opt.val_path)):
        training_data, validation_data = prepare_dataloaders_from_bpe_files(opt, device)
    elif opt.data_pkl:
        training_data, validation_data = prepare_dataloaders(opt, device)
    else:
        raise ValueError('Either -data_pkl or both -train_path and -val_path must be provided.')

    print(opt)

    transformer = Transformer(
        opt.src_vocab_size,
        opt.trg_vocab_size,
        src_pad_idx=opt.src_pad_idx,
        trg_pad_idx=opt.trg_pad_idx,
        trg_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_trg_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    if opt.optim == 'adam':
        optimizer = optim.Adam(transformer.parameters(), lr=opt.lr, betas=(0.0, 0.999))
    elif opt.optim == 'nero':
        optimizer = Nero(transformer.parameters(), lr=opt.lr)
    elif opt.optim == 'lamb':
        optimizer = Lamb(transformer.parameters(), lr=opt.lr, betas=(0.0, 0.999))
    elif opt.optim == 'sgd':
        optimizer = optim.SGD(transformer.parameters(), lr=opt.lr, momentum=0)
    print("Using optim", type(optimizer).__name__)
    
    lr_lambda = lambda epoch : 2 * min(epoch / opt.epoch, (opt.epoch-epoch) / opt.epoch)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    train(transformer, training_data, validation_data, optimizer, scheduler, device, opt, writer)

    writer.close()
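
The lr_lambda in Example #6 defines a triangular schedule: the multiplier rises linearly from 0 to 1.0 at the halfway point of training and falls back toward 0 by the last epoch. A small standalone sketch (with a hypothetical 10-epoch run and a dummy parameter) shows the multipliers LambdaLR applies per epoch:

import torch

n_epochs = 10  # hypothetical stand-in for opt.epoch
dummy_opt = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=0.003)
lr_lambda = lambda epoch: 2 * min(epoch / n_epochs, (n_epochs - epoch) / n_epochs)
scheduler = torch.optim.lr_scheduler.LambdaLR(dummy_opt, lr_lambda)

for epoch in range(n_epochs):
    # prints 0.0, 0.0006, 0.0012, ..., 0.003 (at the midpoint), ..., 0.0006
    print(epoch, dummy_opt.param_groups[0]['lr'])
    dummy_opt.step()
    scheduler.step()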
Example #7
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps
        ])

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''

        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()

        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr


# create optimizer
optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad,
                                    model.parameters()),
                             betas=(0.9, 0.98),
                             eps=1e-09,
                             lr=1e-4,
                             amsgrad=False)

# create a scheduled optimizer object
optimizer = ScheduledOptim(optimizer, config["model_dim"],
                           config["warmup_steps"])


def save_checkpoint(filename, model, optimizer):
    '''
    saves model into a state dict, along with its training statistics,
    and parameters
    :param model:
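
Example #7 shows only the tail of the scheduler class; its constructor and _get_lr_scale method are cut off above. For context, here is a minimal sketch of such a wrapper, assuming the usual inverse-square-root warmup from "Attention Is All You Need" (the init_lr formula and method names are inferred, not taken verbatim from the excerpt):

import numpy as np

class ScheduledOptim:
    ''' Wraps an optimizer and scales its learning rate per step (sketch). '''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self.optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)  # assumed base scale

    def step_and_update_lr(self):
        ''' Update the learning rate, then step the inner optimizer '''
        self._update_learning_rate()
        self.optimizer.step()

    def zero_grad(self):
        ''' Zero the gradients of the inner optimizer '''
        self.optimizer.zero_grad()

    def _get_lr_scale(self):
        # min(step^-0.5, step * warmup^-1.5): linear warmup, then inverse-sqrt decay
        return np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps
        ])

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step (same as in the excerpt above) '''
        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr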
Example #8
def main():
    """ Main function """
    parser = argparse.ArgumentParser()

    parser.add_argument("-data", required=True)

    parser.add_argument("-epoch", type=int, default=10)
    parser.add_argument("-batch_size", type=int, default=64)

    # parser.add_argument("-d_word_vec", type=int, default=512)
    parser.add_argument("-d_model", type=int, default=512)
    parser.add_argument("-d_inner_hid", type=int, default=2048)
    parser.add_argument("-d_k", type=int, default=64)
    parser.add_argument("-d_v", type=int, default=64)

    parser.add_argument("-n_head", type=int, default=8)
    parser.add_argument("-n_layers", type=int, default=6)
    parser.add_argument("-n_warmup_steps", type=int, default=4000)

    parser.add_argument("-dropout", type=float, default=0.1)
    parser.add_argument("-embs_share_weight", action="store_true")
    parser.add_argument("-proj_share_weight", action="store_true")

    parser.add_argument("-log", default=None)
    parser.add_argument("-save_model", default=None)
    parser.add_argument("-save_mode", type=str, choices=["all", "best"], default="best")

    parser.add_argument("-no_cuda", action="store_true")
    parser.add_argument("-label_smoothing", action="store_true")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data["settings"].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            "The src/tgt word2idx table are different but asked to share word embedding."

    print(opt)

    device = torch.device("cuda" if opt.cuda else "cpu")
    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
Example #9
    device = torch.device('cuda' if not args.no_cuda else 'cpu')
    transformer_model = Transformer(args.sl_vocab_size,
                                    args.xl_vocab_size,
                                    hid_dim=args.embedding_dim,
                                    pf_dim=args.fp_inner_dim,
                                    n_layers=args.n_layers,
                                    n_heads=args.n_head,
                                    dropout=args.dropout,
                                    device=device,
                                    SOS_IDX=TGT_SOS_IDX,
                                    PAD_IDX=SRC_PAD_IDX,
                                    EOS_IDX=TGT_EOS_IDX).to(device)

    # optimizer
    optimizer = optim.Adam(transformer_model.parameters(),
                           lr=args.lr,
                           weight_decay=args.l2_reg)

    # loss function
    criterion = nn.CrossEntropyLoss(ignore_index=SRC_PAD_IDX)

    # N_EPOCHS = 10
    # CLIP = 1

    best_valid_loss = float('inf')

    for epoch in range(args.epoches):

        start_time = time.time()
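        # The rest of this epoch loop is cut off here. A typical continuation (a sketch;
        # it assumes train()/evaluate() helpers like those in Example #1 and a hypothetical
        # args.clip gradient-clipping value, none of which appear in the excerpt):
        train_loss = train(transformer_model, optimizer, criterion, args.clip, device)
        valid_loss = evaluate(transformer_model, criterion, device)
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(transformer_model.state_dict(), 'best_model.pt')  # hypothetical path
        print(f'Epoch {epoch + 1:02} | {time.time() - start_time:.1f}s '
              f'| train loss {train_loss:.3f} | valid loss {valid_loss:.3f}')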
Example #10
def main():
    h = logging.StreamHandler()
    formatter = logging.Formatter("[%(asctime)s][%(levelname)s]%(message)s",
                                  datefmt="%Y-%m-%d %H:%M:%S")
    h.setFormatter(formatter)
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.addHandler(h)

    parser = argparse.ArgumentParser()

    parser.add_argument('-data_path', default="../Data/dataset")
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=512)

    parser.add_argument('-d_model', type=int, default=15)
    parser.add_argument('-d_inner_hid', type=int, default=256)
    parser.add_argument('-d_k', type=int, default=15)
    parser.add_argument('-d_v', type=int, default=15)

    parser.add_argument('-n_head', type=int, default=1)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup',
                        '--n_warmup_steps',
                        type=int,
                        default=100000)
    parser.add_argument('-lr_mul', type=float, default=2.0)
    parser.add_argument('-seed', type=int, default=None)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-scale_emb_or_prj', type=str, default='prj')

    parser.add_argument('-output_dir', type=str, default='./checkpoint/')
    parser.add_argument('-summary_dir', type=str, default='./summary')
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    logging.info(opt)

    writer = SummaryWriter(log_dir=str(opt.summary_dir))

    if opt.seed is not None:
        torch.manual_seed(opt.seed)
        torch.backends.cudnn.benchmark = False
        np.random.seed(opt.seed)
        random.seed(opt.seed)

    if not os.path.exists(opt.output_dir):
        os.makedirs(opt.output_dir)

    device = torch.device('cuda' if opt.cuda else 'cpu')

    #========= Loading Dataset =========#
    pkl_files = os.listdir(opt.data_path)
    pwd = os.getcwd()
    pkl_files = [
        os.path.join(pwd, opt.data_path, file) for file in pkl_files
        if 'train' in file
    ]
    data_list = [data for data in pkl_files if '.pkl' in data]
    random.shuffle(data_list)
    logging.info(data_list)

    transformer = Transformer(trg_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_trg_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout,
                              scale_emb_or_prj=opt.scale_emb_or_prj).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
        opt.lr_mul, opt.d_model, opt.n_warmup_steps)

    test(transformer, data_list, optimizer, device, opt, writer)
Example #11
                       n_layers=hp.n_layers)
    net1 = net1.cuda()
    net2 = net2.cuda()

    trainLoader = dataset.getDataLoader(is_train=True,
                                        batch_size=hp.BATCH_SIZE,
                                        shuffle=True)
    iter_one_epoch = len(trainLoader)
    print("iteration_every_epoch: ", iter_one_epoch)
    #testloader = dataset.getDataLoader(is_train=False, batch_size=BATCH_SIZE, shuffle=False)
    lossFunction = nn.CrossEntropyLoss(ignore_index=Constants.PAD)
    optimizer_ = optim.Adam(
        [{
            'params': net1.parameters()
        }, {
            'params': filter(lambda x: x.requires_grad, net2.parameters())
        }],
        betas=[0.9, 0.98],
        lr=hp.LEARNING_RATE)
    optimizer = optimizer_
    optimizer_scheduler = ExponentialLR(optimizer_, 0.98)
    #optimizer = ScheduledOptim(optimizer_, learning_rate=hp.LEARNING_RATE, n_warmup_steps=hp.n_warmup_steps)

    if not os.path.exists(hp.checkpoint_path):
        os.makedirs(hp.checkpoint_path)
    num_step = 1
    model_restore_path = os.path.join(
        hp.checkpoint_path,
        hp.model_path_pre + "_" + str(hp.model_path_idx) + ".pth")
    if hp.model_restore and os.path.exists(model_restore_path):
        print("restore model from {}".format(model_restore_path))
Example #12
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)
    parser.add_argument('-emb', type=str, default=None)

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    # used when we have multiple inputs (token, pos, pred in openie)
    parser.add_argument('-d_word_vec', type=str, default=None)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    parser.add_argument('-task', type=str, choices=['mt', 'openie'], default='mt')
    parser.add_argument('-emb_op', type=str,
                        choices=['no', 'sum', 'concat'], default='no')
    parser.add_argument('-rel_pos_emb_op', type=str,
                        choices=['no', 'lookup', 'lstm'], default='no')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # seed
    SEED = 2019
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    if opt.task == 'mt':
        training_data, validation_data = prepare_dataloaders(data, opt)
        opt.src_vocab_size = training_data.dataset.src_vocab_size
        opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size
        if opt.embs_share_weight:
            assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
                'The src/tgt word2idx table are different but asked to share word embedding.'
    elif opt.task == 'openie':
        training_data, validation_data = prepare_dataloaders_openie(data, opt)
        opt.vocab_size = training_data.dataset.vocab_size
        opt.n_class = data['settings'].n_class
        opt.n_pos = data['settings'].n_pos
        opt.n_rel_pos = data['settings'].n_path
        opt.n_pred_ind = data['settings'].n_pred_ind
    else:
        raise ValueError

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    if opt.task == 'mt':
        opt.d_word_vec = opt.d_model
        transformer = Transformer(
            opt.src_vocab_size,
            opt.tgt_vocab_size,
            opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=opt.proj_share_weight,
            emb_src_tgt_weight_sharing=opt.embs_share_weight,
            d_k=opt.d_k,
            d_v=opt.d_v,
            d_model=opt.d_model,
            d_word_vec=opt.d_word_vec,
            d_inner=opt.d_inner_hid,
            n_layers=opt.n_layers,
            n_head=opt.n_head,
            dropout=opt.dropout).to(device)
    elif opt.task == 'openie':
        word_emb = None
        if opt.emb:
            word_emb = WordVector(opt.emb, is_binary=False, first_line=True, initializer='uniform').get_vectors()
            print('[Info] Use pretrained embedding with dim {}'.format(word_emb.shape[1]))
        # get dimensions
        # word, pos, pred_ind, pred_word, pred_pos
        if opt.d_word_vec:
            opt.d_vec_list = list(map(int, opt.d_word_vec.split(':')))
        else:
            opt.d_word_vec = opt.d_model
            emb_dim = word_emb.shape[1] if word_emb is not None else opt.d_word_vec // 5
            pred_emb_dim = word_emb.shape[1] if word_emb is not None else opt.d_word_vec // 5
            rest_dim = opt.d_word_vec - emb_dim - pred_emb_dim
            pos_dim = rest_dim // 3
            pred_pos_dim = rest_dim // 3
            pred_idx_dim = rest_dim // 3
            opt.d_vec_list = [emb_dim, pos_dim, pred_idx_dim, pred_emb_dim, pred_pos_dim]
        print('[Info] input: {}'.format(['word', 'pos', 'pred_ind', 'pred_word', 'pred_pos']))
        print('[Info] input embedding dims: {}'.format(opt.d_vec_list))
        print('[Info] Transformer input dims: {}'.format(opt.d_model))
        opt.n_cate_list = [opt.vocab_size, opt.n_pos, opt.n_pred_ind, opt.vocab_size, opt.n_pos]
        opt.emb_learnable_list = [False, True, True, False, True]
        opt.pre_emb_list = [word_emb, None, None, word_emb, None]
        transformer = TransformerTagger(
            opt.n_cate_list,
            opt.n_class,
            opt.max_token_seq_len,
            d_vec_list=opt.d_vec_list,
            pre_emb_list=opt.pre_emb_list,
            emb_op=opt.emb_op,
            emb_learnable_list=opt.emb_learnable_list,
            rel_pos_emb_op=opt.rel_pos_emb_op,
            n_rel_pos=opt.n_rel_pos,
            d_model=opt.d_model,
            d_inner=opt.d_inner_hid,
            n_layers=opt.n_layers,
            n_head=opt.n_head,
            d_k=opt.d_k,
            d_v=opt.d_v,
            dropout=opt.dropout).to(device)
    else:
        raise ValueError

    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    print('[Info] #parameters: {}'.format(count_parameters(transformer)))
    train(transformer, training_data, validation_data, optimizer, device, opt)
Example #13
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=10000)
    parser.add_argument('-batch_size', type=int, default=64)

    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=8)
    parser.add_argument('-d_inner_hid', type=int, default=16)
    parser.add_argument('-d_k', type=int, default=8)
    parser.add_argument('-d_v', type=int, default=8)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-max_size', type=int, default=0)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-lr', type=float, default=1E-3, help="Learning rate.")
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-sparsity', type=float, default=5.0)
    parser.add_argument('-padding_value_threshold', type=float, default=0.0)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-funnel',
                        action='store_true',
                        help="Use FunnelTransformer architecture.")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    # ========= Preparing DataLoader =========#
    max_size_src = opt.max_size if opt.max_size != 0 else len(
        data['train']['src'])
    max_size_tgt = opt.max_size if opt.max_size != 0 else len(
        data['train']['tgt'])
    print("training: max_size_src={} max_size_tgt={}".format(
        max_size_src, max_size_tgt))
    training_data = DataLoader(data['dict']['src'],
                               data['dict']['tgt'],
                               src_insts=data['train']['src'][0:max_size_src],
                               tgt_insts=data['train']['tgt'][0:max_size_tgt],
                               batch_size=opt.batch_size,
                               cuda=opt.cuda)
    max_size_src = opt.max_size if opt.max_size != 0 else len(
        data['valid']['src'])
    max_size_tgt = opt.max_size if opt.max_size != 0 else len(
        data['valid']['tgt'])
    print("validation: max_size_src={} max_size_tgt={}".format(
        max_size_src, max_size_tgt))
    validation_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['valid']['src'][0:max_size_src],
        tgt_insts=data['valid']['tgt'][0:max_size_tgt],
        batch_size=opt.batch_size,
        shuffle=opt.cuda,
        test=True,
        cuda=opt.cuda)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight and training_data.src_word2idx != training_data.tgt_word2idx:
        print(
            '[Warning]',
            'The src/tgt word2idx table are different but asked to share word embedding.'
        )

    print(opt)

    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        proj_share_weight=opt.proj_share_weight,
        embs_share_weight=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner_hid=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout) if not opt.funnel else FunnelTransformer(
            opt.src_vocab_size,
            opt.tgt_vocab_size,
            opt.max_token_seq_len,
            proj_share_weight=opt.proj_share_weight,
            embs_share_weight=opt.embs_share_weight,
            d_k=opt.d_k,
            d_v=opt.d_v,
            d_model=opt.d_model,
            d_word_vec=opt.d_word_vec,
            d_inner_hid=opt.d_inner_hid,
            n_layers=opt.n_layers,
            n_head=opt.n_head,
            dropout=opt.dropout)

    #print(transformer)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.parameters(),
                   betas=(0.9, 0.98),
                   eps=1e-09,
                   lr=opt.lr), opt.d_model, opt.n_warmup_steps)

    def get_criterion(vocab_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(vocab_size)
        weight[Constants.PAD] = 0
        return nn.CrossEntropyLoss(weight, reduction='sum')

    crit = get_criterion(training_data.tgt_vocab_size)

    if opt.cuda:
        transformer = transformer.cuda()
        crit = crit.cuda()

    train(transformer, training_data, validation_data, crit, optimizer, opt)
Example #14
    if config.test_data_path:
        train_data = d.get_trainset(config)
        validate_data = d.get_testset(config)
    else:
        train_data, validate_data = d.get_splite_data(config)

    model = Transformer(config.src_vocab_size,
                        config.fix_len,
                        d_k=config.d_k,
                        d_v=config.d_v,
                        d_model=config.d_model,
                        d_word_vec=config.d_word_vec,
                        d_inner=config.d_inner,
                        n_layers=config.n_layers,
                        n_head=config.n_head,
                        dropout=config.dropout)

    criterion = nn.CrossEntropyLoss()

    if config.freeze:
        model_parameters = filter(lambda p: p.requires_grad,
                                  model.parameters())
    else:
        model_parameters = model.parameters()

    optimizer = torch.optim.Adam(model_parameters,
                                 lr=config.learning_rate,
                                 weight_decay=config.weight_decay)

    train(model, train_data, validate_data, config, optimizer, criterion)
Example #15
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('--train_src', required=True)
    parser.add_argument('--valid_src', required=True)
    parser.add_argument('--max_word_seq_len', type=int, default=100)
    parser.add_argument('--min_word_count', type=int, default=5)
    parser.add_argument('--keep_case', action='store_true')

    parser.add_argument('--epoch', type=int, default=500)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--num_worker', type=int, default=8)

    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('--d_model', type=int, default=512)
    parser.add_argument('--d_inner_hid', type=int, default=2048)
    parser.add_argument('--d_k', type=int, default=64)
    parser.add_argument('--d_v', type=int, default=64)

    parser.add_argument('--n_head', type=int, default=8)
    parser.add_argument('--n_layers', type=int, default=6)
    parser.add_argument('--n_warmup_steps', type=int, default=4000)

    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--embs_share_weight', action='store_true')
    parser.add_argument('--proj_share_weight', action='store_true')

    parser.add_argument('--model', default=None, help='Path to model file')
    parser.add_argument('--log', default=None)
    parser.add_argument('--save_model', default=None)
    parser.add_argument('--save_data', default='./data/word2idx.pth')
    parser.add_argument('--save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('--no_cuda', action='store_true')
    parser.add_argument('--label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    opt.max_token_seq_len = opt.max_word_seq_len + 2
    #========= Loading Dataset =========#
    training_data = torch.utils.data.DataLoader(dataset.TranslationDataset(
        dir_name=opt.train_src,
        max_word_seq_len=opt.max_word_seq_len,
        min_word_count=opt.min_word_count,
        keep_case=opt.keep_case,
        src_word2idx=None,
        tgt_word2idx=None),
                                                num_workers=opt.num_worker,
                                                batch_size=opt.batch_size,
                                                collate_fn=paired_collate_fn,
                                                shuffle=True)
    validation_data = torch.utils.data.DataLoader(dataset.TranslationDataset(
        dir_name=opt.valid_src,
        max_word_seq_len=opt.max_word_seq_len,
        min_word_count=opt.min_word_count,
        keep_case=opt.keep_case,
        src_word2idx=training_data.dataset.src_word2idx,
        tgt_word2idx=training_data.dataset.tgt_word2idx),
                                                  num_workers=opt.num_worker,
                                                  batch_size=opt.batch_size,
                                                  collate_fn=paired_collate_fn,
                                                  shuffle=True)
    data = {
        'dict': {
            'src': training_data.dataset.src_word2idx,
            'tgt': training_data.dataset.tgt_word2idx
        }
    }
    print('[Info] Dumping the processed data to pickle file', opt.save_data)
    torch.save(data, opt.save_data)
    print('[Info] Finish.')
    del data
    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)
    if opt.model is not None:
        print('Loading pretrained model!')
        checkpoint = torch.load(opt.model)
        model_opt = checkpoint['settings']
        transformer = Transformer(
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout)
        transformer.load_state_dict(checkpoint['model'])
        transformer = transformer.to(device)

    train(transformer, training_data, validation_data, optimizer, device, opt)
Example #16
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default='default')
    parser.add_argument('-tensorboard', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    global global_counter
    global_counter = 0

    writer = None
    if opt.tensorboard:
        writer = SummaryWriter(os.path.join('./logs', opt.tensorboard))

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    global idx2char
    idx2char = {v: k for k, v in data['dict']['src'].items()}

    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data, unique_char_len = prepare_dataloaders(
        data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    try:
        transformer.load_state_dict(torch.load('./checkpoints/model.pt'))
        print("Model loaded successfully.......")
    except (FileNotFoundError, RuntimeError):
        # no usable checkpoint found; start training from scratch
        pass

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt,
          unique_char_len, writer)
Example #17
        print('  - (Training)   accuracy: {accu:3.3f} %, '\
              'elapse: {elapse:3.3f} min'.format(
                  accu=100*train_accu,
                  elapse=(time.time()-start)/60))

        start = time.time()
        valid_loss, valid_accu = eval_epoch(model, validation_data, predicates)
        print('  - (Validation)  accuracy: {accu:3.3f} %, '\
                'elapse: {elapse:3.3f} min'.format(
                    accu=100*valid_accu,
                    elapse=(time.time()-start)/60))

        valid_accus += [valid_accu]

device = torch.device('cpu')


word2idx, ints, en1_pos, en2_pos, predicates, relation2idx = data.build_sentences()

training_data, validation_data = prepare_dataloaders(word2idx, ints, en1_pos, en2_pos, predicates)
model = Transformer(
    n_src_vocab=len(word2idx),
    len_max_seq=config.max_seq_len).to(device)

optimizer = ScheduledOptim(
    optim.Adam(
        filter(lambda x: x.requires_grad, model.parameters()),
        betas=(0.9, 0.98), eps=1e-09),
    512, 1000)

train(model, training_data, validation_data, optimizer, predicates)
Example #18
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', default="./pssp-data/data.pt")

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=17)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=256)
    parser.add_argument('-d_inner_hid', type=int, default=512)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default="model")
    parser.add_argument('-save_plot', default="loss.png")
    parser.add_argument('-save_mode', type=str,
                        choices=['all', 'best'], default='best')


    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data, test_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    opt.vocab_src = training_data.dataset.src_word2idx
    opt.vocab_tgt = training_data.dataset.tgt_word2idx

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')

    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout)

    transformer = DataParallel(transformer, range(0, torch.cuda.device_count())).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    weight_mask = None
    
    crossEntropy = nn.CrossEntropyLoss(weight_mask, reduction='sum', ignore_index=Constants.PAD)

    train_loss, val_loss = train(
        transformer, training_data, validation_data, optimizer, device, opt, crossEntropy)
    print("Starting Test...")
    test(transformer, test_data, device, opt, crossEntropy)
    print("Making loss graph...")
    plt = plot(train_loss, val_loss)
    plt.savefig(opt.save_plot)  # save_plot already includes the .png extension
    print("Finished!")
Example #19
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=None)
    parser.add_argument('-step', type=int, default=None)
    parser.add_argument('-batch_size', type=int, default=64)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    # NOTE(keshav2): This just refers to the learning rate schedule,
    #                nothing performance related.
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('--checkpoint_dir',
                        type=str,
                        default='/lfs/1/keshav2/checkpoints/transformer')
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='all')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    parser.add_argument('--dist-url',
                        default='env://',
                        type=str,
                        help='url used to set up distributed training')
    parser.add_argument('--dist-backend',
                        default='nccl',
                        type=str,
                        help='Distributed backend')
    parser.add_argument('--local_rank', default=0, type=int, help='Local rank')
    parser.add_argument('--rank', default=None, type=int, help='Rank')
    parser.add_argument('--world_size',
                        default=None,
                        type=int,
                        help='World size')
    parser.add_argument('--master_addr',
                        default=None,
                        type=str,
                        help='Master address to use for distributed run')
    parser.add_argument('--master_port',
                        default=None,
                        type=int,
                        help='Master port to use for distributed run')

    parser.add_argument('--throughput_estimation_interval',
                        type=int,
                        default=None,
                        help='Steps between logging steps completed')
    parser.add_argument('--max_duration',
                        type=int,
                        default=None,
                        help='Maximum duration in seconds')
    parser.add_argument('--enable_gavel_iterator',
                        action='store_true',
                        default=False,
                        help='If set, use Gavel iterator')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    torch.cuda.set_device(opt.local_rank)

    if opt.epoch is not None and opt.step is not None:
        raise ValueError('Only one of epoch and step may be set')
    elif opt.epoch is None and opt.step is None:
        raise ValueError('One of epoch and step must be set')

    opt.distributed = False
    if opt.master_addr is not None:
        opt.distributed = True
        os.environ['MASTER_ADDR'] = opt.master_addr
        os.environ['MASTER_PORT'] = str(opt.master_port)
        dist.init_process_group(backend=opt.dist_backend,
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.rank)

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(
        data, opt, opt.master_addr is not None)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    if opt.distributed:
        transformer = DDP(transformer,
                          device_ids=[opt.local_rank],
                          output_device=opt.local_rank)

    if opt.enable_gavel_iterator:
        training_data = GavelIterator(training_data, opt.checkpoint_dir,
                                      load_checkpoint, save_checkpoint)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
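
# The examples in this file all feed Adam through a ScheduledOptim wrapper whose source is
# not included here. The class below is only a minimal sketch of the usual inverse-square-
# root warmup schedule from "Attention Is All You Need"
# (lr = d_model**-0.5 * min(step**-0.5, step * n_warmup_steps**-1.5));
# the class and method names are assumptions, not the repository's actual API.
class WarmupOptimSketch:

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def step_and_update_lr(self):
        # recompute the learning rate, write it into every param group, then step Adam
        self.n_steps += 1
        lr = (self.d_model ** -0.5) * min(self.n_steps ** -0.5,
                                          self.n_steps * self.n_warmup_steps ** -1.5)
        for group in self._optimizer.param_groups:
            group['lr'] = lr
        self._optimizer.step()

    def zero_grad(self):
        self._optimizer.zero_grad()
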
Example #20
def main():
    print(args)

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')

    src_vocab, tgt_vocab = get_vocab(TRAIN_X, TRAIN_Y)

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y],
                                  small_vocab_file,
                                  vocab_size=80000)

    max_src_len = 101
    max_tgt_len = 47
    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid

    vocab = small_vocab

    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)

    model = Transformer(len(vocab),
                        len(vocab),
                        max_src_len,
                        d_word_vec=300,
                        d_model=300,
                        d_inner=1200,
                        n_layers=1,
                        n_head=6,
                        d_k=50,
                        d_v=50,
                        dropout=0.1,
                        tgt_emb_prj_weight_sharing=True,
                        emb_src_tgt_weight_sharing=True).cuda()
    # print(model)

    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Loaded model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=1,
                                                gamma=0.3)
    scheduler.step()  # with last_epoch=-1, the first call does not change the learning rate

    train(train_x, train_y, valid_x, valid_y, model, optimizer, scheduler,
          args.n_epochs, saved_state['epoch'])
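
# Example #20 resumes training from args.ckpt_file and reads the keys 'epoch', 'lr' and
# 'state_dict' from it. The helper below is a hedged sketch of the matching save side;
# the function name and the default path are assumptions made for illustration only.
import torch


def save_checkpoint_sketch(model, optimizer, epoch, path='checkpoint.pt'):
    torch.save({
        'epoch': epoch,
        'lr': optimizer.param_groups[0]['lr'],
        'state_dict': model.state_dict(),
    }, path)
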
Example #21
def main():
    """ Main function. """

    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=30)
    parser.add_argument('-batch_size', type=int, default=16)

    parser.add_argument('-d_model', type=int, default=64)
    parser.add_argument('-d_rnn', type=int, default=256)
    parser.add_argument('-d_inner_hid', type=int, default=128)
    parser.add_argument('-d_k', type=int, default=16)
    parser.add_argument('-d_v', type=int, default=16)

    parser.add_argument('-n_head', type=int, default=4)
    parser.add_argument('-n_layers', type=int, default=4)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-lr', type=float, default=1e-4)
    parser.add_argument('-smooth', type=float, default=0.1)

    parser.add_argument('-log', type=str, default='log.txt')

    opt = parser.parse_args()

    # default device is CUDA
    opt.device = torch.device('cuda')

    # setup the log file
    with open(opt.log, 'w') as f:
        f.write('Epoch, Log-likelihood, Accuracy, RMSE\n')

    print('[Info] parameters: {}'.format(opt))
    """ prepare dataloader """
    trainloader, testloader, num_types = prepare_dataloader(opt)
    """ prepare model """
    model = Transformer(
        num_types=num_types,
        d_model=opt.d_model,
        d_rnn=opt.d_rnn,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        d_k=opt.d_k,
        d_v=opt.d_v,
        dropout=opt.dropout,
    )
    model.to(opt.device)
    """ optimizer and scheduler """
    optimizer = optim.Adam(filter(lambda x: x.requires_grad,
                                  model.parameters()),
                           opt.lr,
                           betas=(0.9, 0.999),
                           eps=1e-05)
    scheduler = optim.lr_scheduler.StepLR(optimizer, 10, gamma=0.5)
    """ prediction loss function, either cross entropy or label smoothing """
    if opt.smooth > 0:
        pred_loss_func = Utils.LabelSmoothingLoss(opt.smooth,
                                                  num_types,
                                                  ignore_index=-1)
    else:
        pred_loss_func = nn.CrossEntropyLoss(ignore_index=-1, reduction='none')
    """ number of parameters """
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('[Info] Number of parameters: {}'.format(num_params))
    """ train the model """
    train(model, trainloader, testloader, optimizer, scheduler, pred_loss_func,
          opt)
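
# Example #21 picks Utils.LabelSmoothingLoss(opt.smooth, num_types, ignore_index=-1) when
# label smoothing is enabled; that class is defined elsewhere. The sketch below is a
# generic label-smoothing cross entropy with the same constructor arguments, shown only
# as an illustration of the technique, not as the repository's implementation.
import torch
import torch.nn as nn
import torch.nn.functional as F


class LabelSmoothingLossSketch(nn.Module):

    def __init__(self, smoothing, num_classes, ignore_index=-1):
        super().__init__()
        self.smoothing = smoothing
        self.num_classes = num_classes
        self.ignore_index = ignore_index

    def forward(self, logits, target):
        # logits: (N, C); target: (N,) with ignore_index marking padded positions
        log_probs = F.log_softmax(logits, dim=-1)
        with torch.no_grad():
            true_dist = torch.full_like(log_probs,
                                        self.smoothing / (self.num_classes - 1))
            valid = target != self.ignore_index
            true_dist[valid] = true_dist[valid].scatter(
                1, target[valid].unsqueeze(1), 1.0 - self.smoothing)
        loss = -(true_dist * log_probs).sum(dim=-1)
        return loss[valid].mean()
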
Example #22
def main():
    ''' 
    Usage:
    python train.py -data_pkl m30k_deen_shr.pkl -embs_share_weight -proj_share_weight -label_smoothing -output_dir output -b 256 -warmup 128000
    '''

    parser = argparse.ArgumentParser()

    parser.add_argument('-data_pkl', default=None)     # all-in-1 data pickle or bpe field

    parser.add_argument('-train_path', default=None)   # bpe encoded data
    parser.add_argument('-val_path', default=None)     # bpe encoded data

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup','--n_warmup_steps', type=int, default=4000)
    parser.add_argument('-lr_mul', type=float, default=2.0)
    parser.add_argument('-seed', type=int, default=None)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-scale_emb_or_prj', type=str, default='prj')

    parser.add_argument('-output_dir', type=str, default=None)
    parser.add_argument('-use_tb', action='store_true')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # https://pytorch.org/docs/stable/notes/randomness.html
    # For reproducibility
    if opt.seed is not None:
        torch.manual_seed(opt.seed)
        torch.backends.cudnn.benchmark = False
        # torch.set_deterministic(True)
        np.random.seed(opt.seed)
        random.seed(opt.seed)

    if not opt.output_dir:
        print('No experiment result will be saved.')
        raise ValueError('Please set -output_dir.')

    if not os.path.exists(opt.output_dir):
        os.makedirs(opt.output_dir)

    if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000:
        print('[Warning] The warmup steps may not be enough.\n'
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n'
              'Using a smaller batch without a longer warmup may cause '
              'the warmup stage to end with only a little data trained.')

    device = torch.device('cuda' if opt.cuda else 'cpu')

    #========= Loading Dataset =========#

    if all((opt.train_path, opt.val_path)):
        training_data, validation_data = prepare_dataloaders_from_bpe_files(opt, device)
    elif opt.data_pkl:
        training_data, validation_data = prepare_dataloaders(opt, device)
    else:
        raise ValueError('Either -train_path/-val_path or -data_pkl must be provided.')

    print(opt)

    transformer = Transformer(
        opt.src_vocab_size,
        opt.trg_vocab_size,
        src_pad_idx=opt.src_pad_idx,
        trg_pad_idx=opt.trg_pad_idx,
        trg_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_trg_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout,
        scale_emb_or_prj=opt.scale_emb_or_prj).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
        opt.lr_mul, opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
def main():
    '''
    Usage:
    python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 256 -warmup 128000
    '''
    global C
    global shapes
    global Beta
    parser = argparse.ArgumentParser()

    parser.add_argument('-data_pkl',
                        default=None)  # all-in-1 data pickle or bpe field
    parser.add_argument('-srn', action='store_true', default=False)
    parser.add_argument('-optimize_c', action='store_true', default=False)
    parser.add_argument('-Beta', type=float, default=1.0)
    parser.add_argument("-lr", type=float, default=1e-1)
    parser.add_argument("-scheduler_mode", type=str, default=None)
    parser.add_argument("-scheduler_factor", type=float, default=0.5)
    parser.add_argument('-train_path', default=None)  # bpe encoded data
    parser.add_argument('-val_path', default=None)  # bpe encoded data

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    Beta = opt.Beta

    if not opt.log and not opt.save_model:
        print('No experiment result will be saved.')
        raise ValueError('Please set -log and/or -save_model.')

    if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000:
        print('[Warning] The warmup steps may not be enough.\n'
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n'
              'Using a smaller batch without a longer warmup may cause '
              'the warmup stage to end with only a little data trained.')

    device = torch.device('cuda' if opt.cuda else 'cpu')

    #========= Loading Dataset =========#

    if all((opt.train_path, opt.val_path)):
        training_data, validation_data = prepare_dataloaders_from_bpe_files(
            opt, device)
    elif opt.data_pkl:
        training_data, validation_data = prepare_dataloaders(opt, device)
    else:
        raise ValueError('Either -train_path/-val_path or -data_pkl must be provided.')

    print(opt)

    transformer = Transformer(opt.src_vocab_size,
                              opt.trg_vocab_size,
                              src_pad_idx=opt.src_pad_idx,
                              trg_pad_idx=opt.trg_pad_idx,
                              trg_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_trg_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)
    if opt.srn:
        transformer = migrate_to_srn(transformer)
        transformer = transformer.to(device)
    if opt.optimize_c:
        srn_modules = [
            module for module in transformer.modules()
            if isinstance(module, (SRNLinear, SRNConv2d))
        ]
        sranks = []
        shapes = []

        for module in srn_modules:
            W = module.weight.detach()
            shape_w = W.shape
            W = W.view(shape_w[0], -1)
            sranks.append(stable_rank(W).item())
            shapes.append(W.shape)

        # a rule of thumb: initialize the target srank with the model's current srank
        C = [
            Parameter((torch.ones(1, device=device) * sranks[i] / min(shapes[i])).view(()))
            for i in range(len(srn_modules))
        ]
        for i, module in enumerate(srn_modules):
            module.c = C[i]  # C[i] is created on `device`; Parameter.to() is not in-place
        criteria = criteria_
    else:
        criteria = cal_performance
    optimizer = ScheduledOptim(optim.Adam(transformer.parameters(),
                                          lr=1e-2,
                                          betas=(0.9, 0.98),
                                          eps=1e-09),
                               opt.lr,
                               opt.d_model,
                               opt.n_warmup_steps,
                               mode=opt.scheduler_mode,
                               factor=opt.scheduler_factor,
                               patience=3)

    train(transformer,
          training_data,
          validation_data,
          optimizer,
          device,
          opt,
          loss=criteria)
    print("~~~~~~~~~~~~~C~~~~~~~~~~~~~")
    print(C)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("-----------Model-----------")
    print(transformer)
    print("---------------------------")
    with torch.no_grad():
        for pname, p in transformer.named_parameters():
            if len(p.shape) > 1:
                print("...Parameter ", pname, ", srank=",
                      stable_rank(p.view(p.shape[0], -1)).item())
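
# The SRN variant above repeatedly calls stable_rank() on 2-D weight matrices, both to
# initialize the targets C and to log per-parameter sranks at the end. That helper lives
# elsewhere; the function below is a sketch of the standard definition of stable rank,
# squared Frobenius norm over squared spectral norm, and may differ from the repo's code.
import torch


def stable_rank_sketch(W: torch.Tensor) -> torch.Tensor:
    fro_sq = (W ** 2).sum()
    top_singular = torch.linalg.svdvals(W)[0]  # largest singular value = spectral norm
    return fro_sq / (top_singular ** 2)
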
Example #24
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)
    parser.add_argument('-mined_data', required=True)
    parser.add_argument('-snippet_model', required=True)

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', type=bool, default=True)
    parser.add_argument('-save_model_dir', default=None, required=True)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='all')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    # For bleu eval
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")

    parser.add_argument('-test_epoch',
                        type=int,
                        default=5,
                        help='Test every x epochs')
    parser.add_argument('-resume_from_epoch',
                        type=int,
                        default=0,
                        help='Warm restart')

    # Not really needed
    parser.add_argument('-alpha',
                        type=float,
                        default=1.0,
                        help='Weighting loss')
    parser.add_argument('-loss_weight',
                        type=float,
                        default=0.1,
                        help='Mined loss weight')
    parser.add_argument('-lr', type=float, default=1e-3, help='Learning rate')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # Snippet model sentencepiece
    sp.Load(opt.snippet_model)

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    mined_data = torch.load(opt.mined_data)

    opt.inp_seq_max_len = 4 * data['settings'].train_max_input_len
    opt.out_seq_max_len = 4 * data['settings'].train_max_output_len

    opt.max_token_seq_len = int(opt.out_seq_max_len / 4)

    training_data, validation_data, test_data, mined_data = prepare_dataloaders(
        data, mined_data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    print(opt.inp_seq_max_len, opt.out_seq_max_len, opt.src_vocab_size,
          opt.tgt_vocab_size)

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.inp_seq_max_len,
                              opt.out_seq_max_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09,
                   lr=opt.lr), opt.d_model, opt.n_warmup_steps)

    save_params(opt)

    opt = check_restart_conditions(opt)
    if opt.resume_from_epoch >= 1:
        print('Loading Old model')
        print('Loading model files from folder: %s' % opt.save_model_dir)
        transformer = load_models(transformer, opt, opt.resume_from_epoch)

    train(transformer, training_data, validation_data, test_data, mined_data,
          optimizer, device, opt)
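
# Example #24 relies on save_params(opt) and check_restart_conditions(opt), which are not
# shown. Purely as a hypothetical illustration of what save_params might do, the sketch
# below dumps the parsed options to a JSON file inside opt.save_model_dir so a later warm
# restart can read them back; the file name is an assumption.
import json
import os


def save_params_sketch(opt):
    os.makedirs(opt.save_model_dir, exist_ok=True)
    with open(os.path.join(opt.save_model_dir, 'params.json'), 'w') as f:
        json.dump(vars(opt), f, indent=2, default=str)
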
Example #25
def main():
    ''' 
    Usage:
    python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 256 -warmup 128000
    '''

    parser = argparse.ArgumentParser()

    parser.add_argument('-data_pkl',
                        default=None)  # all-in-1 data pickle or bpe field

    parser.add_argument('-train_path', default=None)  # bpe encoded data
    parser.add_argument('-val_path', default=None)  # bpe encoded data

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', default=True, action='store_true')  # default=True keeps CUDA disabled; the flag cannot re-enable it
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    if not opt.log and not opt.save_model:
        print('No experiment result will be saved.')
        raise ValueError('Please set -log and/or -save_model.')

    if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000:
        print('[Warning] The warmup steps may not be enough.\n'
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n'
              'Using a smaller batch without a longer warmup may cause '
              'the warmup stage to end with only a little data trained.')

    device = torch.device('cuda' if opt.cuda else 'cpu')

    #========= Loading Dataset =========#

    if all((opt.train_path, opt.val_path)):
        training_data, validation_data = prepare_dataloaders_from_bpe_files(
            opt, device)
    elif opt.data_pkl:
        training_data, validation_data = prepare_dataloaders(opt, device)
    else:
        raise ValueError('Either -train_path/-val_path or -data_pkl must be provided.')

    print(opt)

    transformer = Transformer(opt.src_vocab_size,
                              opt.trg_vocab_size,
                              src_pad_idx=opt.src_pad_idx,
                              trg_pad_idx=opt.trg_pad_idx,
                              trg_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_trg_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
        2.0, opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path",
                        type=str,
                        default="",
                        help="Path or url of the dataset.")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=64,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=64,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-4,
                        help="Learning rate")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=15,
                        help="Number of training epochs")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help=
        "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    parser.add_argument("--gpt2_model_name",
                        type=str,
                        default="gpt2",
                        help="Path, url or short name of the model")

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')
    args = parser.parse_args()
    args.d_word_vec = args.d_model

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")

    tokenizer_class = GPT2Tokenizer if "gpt2" in args.gpt2_model_name else OpenAIGPTTokenizer  # can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.gpt2_model_name)

    num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(
        ATTR_TO_SPECIAL_TOKEN)  # doesn't add if they are already there

    model = Transformer(
        num_tokens + num_added_tokens,
        num_tokens + num_added_tokens,
        src_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]),
        trg_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]),
        trg_emb_prj_weight_sharing=args.proj_share_weight,
        emb_src_trg_weight_sharing=args.embs_share_weight,
        d_k=args.d_k,
        d_v=args.d_v,
        d_model=args.d_model,
        d_word_vec=args.d_word_vec,
        d_inner=args.d_inner_hid,
        n_layers=args.n_layers,
        n_head=args.n_head,
        dropout=args.dropout).to(args.device)

    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, tokenizer, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        source_ids, target_ids, lm_labels = batch

        (lm_loss), *_ = model(source_ids, target_ids, labels=lm_labels)

        loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            source_ids, target_ids, lm_labels = batch
            #logger.info(tokenizer.decode(target_ids[0].tolist()))

            lm_logits, *_ = model(source_ids, target_ids)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, ), (lm_labels_flat_shifted, )

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
             output_transform=lambda x: (x[0][0], x[1][0]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.gpt2_model_name, args.dataset_path)
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=4)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        #getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
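
# The metrics above wrap "nll" in MetricsLambda(average_distributed_scalar, ...). That
# helper is defined elsewhere; the function below is a minimal sketch of the usual
# pattern: average a Python scalar across distributed workers, and pass it through
# unchanged in the single-process case. Treat it as an illustration, not the exact code.
import torch
import torch.distributed as dist


def average_distributed_scalar_sketch(scalar, args):
    if args.local_rank == -1:
        return scalar
    scalar_t = torch.tensor(scalar, dtype=torch.float,
                            device=args.device) / dist.get_world_size()
    dist.all_reduce(scalar_t, op=dist.ReduceOp.SUM)
    return scalar_t.item()
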
Example #27
def main():

    device = torch.device("cuda:0" if USE_CUDA else "cpu")

    env = Environment()

    END_TAG_IDX = env.lang.word2idx[END_TAG]

    SAY_HI = "hello"

    targ_lang = env.lang

    vocab_inp_size = len(env.lang.word2idx)
    vocab_tar_size = len(targ_lang.word2idx)

    print("vocab_inp_size", vocab_inp_size)
    print("vocab_tar_size", vocab_tar_size)

    model = Transformer(
        vocab_inp_size,
        vocab_tar_size,
        MAX_TARGET_LEN,
        d_word_vec=32,
        d_model=32,
        d_inner=32,
        n_layers=3,
        n_head=4,
        d_k=32,
        d_v=32,
        dropout=0.1,
    ).to(device)

    # baseline = Baseline(UNITS)

    history = []

    l_optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    batch = None

    def maybe_pad_sentence(s):
        return tf.keras.preprocessing.sequence.pad_sequences(
            s, maxlen=MAX_TARGET_LEN, padding='post')

    def get_returns(r: float, seq_len: int):
        return list(reversed([r * (GAMMA**t) for t in range(seq_len)]))

    def sentence_to_idxs(sentence: str):
        return [
            env.lang.word2idx[token] for token in tokenize_sentence(sentence)
        ]

    for episode in range(EPISODES):

        # Start of Episode
        env.reset()
        model.eval()

        # get first state from the env
        state, _, done = env.step(SAY_HI)

        while not done:

            src_seq = [
                env.lang.word2idx[token] for token in tokenize_sentence(state)
            ]
            src_seq, src_pos = collate_fn([src_seq])
            src_seq, src_pos = src_seq.to(device), src_pos.to(device)
            enc_output, *_ = model.encoder(src_seq, src_pos)
            actions_t = []
            actions = []
            actions_idx = []

            # keep sampling until the END tag index is produced or MAX_TARGET_LEN is hit
            while (len(actions_idx) == 0
                   or (actions_idx[-1] != END_TAG_IDX
                       and len(actions_idx) < MAX_TARGET_LEN)):
                # construct new tgt_seq based on what's outputed so far
                if len(actions_t) == 0:
                    tgt_seq = [env.lang.word2idx[Constants.UNK_WORD]]
                else:
                    tgt_seq = actions_idx
                tgt_seq, tgt_pos = collate_fn([tgt_seq])
                tgt_seq, tgt_pos = tgt_seq.to(device), tgt_pos.to(device)
                # dec_output dims: [1, pos, hidden]
                dec_output, *_ = model.decoder(tgt_seq, tgt_pos, src_seq, enc_output)
                # pick last step
                dec_output = dec_output[:, -1, :]
                # w_logits dims: [1, vocab_size]
                w_logits = model.tgt_word_prj(dec_output)
                # w_probs dims: [1, vocab_size]
                w_probs = torch.nn.functional.softmax(w_logits, dim=1)
                w_dist = torch.distributions.categorical.Categorical(
                    probs=w_probs)
                w_idx_t = w_dist.sample()
                w_idx = w_idx_t.cpu().numpy()[0]
                actions_t.append(w_idx_t)
                actions_idx.append(w_idx)
                actions.append(env.lang.idx2word[w_idx])

            # action is a sentence (string)
            action_str = ' '.join(actions)
            next_state, reward, done = env.step(action_str)
            # print(reward)
            history.append((state, actions_t, action_str, reward))
            state = next_state

            # record history (to be used for gradient updating after the episode is done)
        # End of Episode
        # Update policy
        model.train()
        while len(history) >= BATCH_SIZE:
            batch = history[:BATCH_SIZE]
            state_inp_b, action_inp_b, reward_b, ret_seq_b = zip(*[[
                sentence_to_idxs(state), actions_b, reward,
                get_returns(reward, MAX_TARGET_LEN)
            ] for state, actions_b, _, reward in batch])
            action_inp_b = [torch.stack(sent) for sent in action_inp_b]
            action_inp_b = torch.stack(action_inp_b)

            ret_seq_b = np.asarray(ret_seq_b)

            # ret_mean = np.mean(ret_seq_b)
            # ret_std = np.std(ret_seq_b)
            # ret_seq_b = (ret_seq_b - ret_mean) / ret_std
            ret_seq_b = np.exp((ret_seq_b - 0.5) * 5)

            ret_seq_b = torch.tensor(ret_seq_b, dtype=torch.float32).to(device)

            loss = 0
            # loss_bl=0
            l_optimizer.zero_grad()
            # accumulate gradient with GradientTape
            src_seq, src_pos = collate_fn(list(state_inp_b))
            src_seq, src_pos = src_seq.to(device), src_pos.to(device)
            enc_output_b, *_ = model.encoder(src_seq, src_pos)
            max_sentence_len = action_inp_b.shape[1]
            tgt_seq = [[Constants.BOS] for i in range(BATCH_SIZE)]
            for t in range(max_sentence_len):
                # _b stands for batch
                prev_w_idx_b, tgt_pos = collate_fn(tgt_seq)
                prev_w_idx_b, tgt_pos = prev_w_idx_b.to(device), tgt_pos.to(
                    device)
                # dec_output_b dims: [batch, pos, hidden]
                dec_output_b, *_ = \
                    model.decoder(prev_w_idx_b, tgt_pos, src_seq, enc_output_b)
                # pick last step
                dec_output_b = dec_output_b[:, -1, :]
                # w_logits_b dims: [batch, vocab_size]
                w_logits_b = model.tgt_word_prj(dec_output_b)
                # w_probs dims: [batch, vocab_size]
                w_probs_b = torch.nn.functional.softmax(w_logits_b, dim=1)

                dist_b = torch.distributions.categorical.Categorical(
                    probs=w_probs_b)
                curr_w_idx_b = action_inp_b[:, t, :]
                log_probs_b = torch.transpose(
                    dist_b.log_prob(torch.transpose(curr_w_idx_b, 0, 1)), 0, 1)

                # bl_val_b = baseline(tf.cast(dec_hidden_b, 'float32'))
                # delta_b = ret_b - bl_val_b

                # cost_b = -tf.math.multiply(log_probs_b, delta_b)
                # cost_b = -tf.math.multiply(log_probs_b, ret_b)
                ret_b = torch.reshape(ret_seq_b[:, t],
                                      (BATCH_SIZE, 1)).to(device)
                # REINFORCE-style cost: negative log-probability weighted by the return
                cost_b = -torch.mul(log_probs_b, ret_b)

                loss += cost_b
                # loss_bl += -tf.math.multiply(delta_b, bl_val_b)

                prev_w_idx_b = curr_w_idx_b
                tgt_seq = np.append(tgt_seq,
                                    prev_w_idx_b.data.cpu().numpy(),
                                    axis=1).tolist()

            # calculate cumulative gradients

            # model_vars = encoder.variables + decoder.variables
            loss = loss.mean()
            loss.backward()
            # loss_bl.backward()

            # finally, apply gradient

            l_optimizer.step()
            # bl_optimizer.step()

            # Reset everything for the next episode
            history = history[BATCH_SIZE:]

        if episode % max(BATCH_SIZE, 32) == 0 and batch is not None:
            print(">>>>>>>>>>>>>>>>>>>>>>>>>>")
            print("Episode # ", episode)
            print("Samples from episode with rewards > 0: ")
            good_rewards = [(s, a_str, r) for s, _, a_str, r in batch if r > 0]
            for s, a, r in random.sample(good_rewards,
                                         min(len(good_rewards), 3)):
                print("prev_state: ", s)
                print("actions: ", a)
                print("reward: ", r)
                # print("return: ", get_returns(r, MAX_TARGET_LEN))
            ret_seq_b_np = ret_seq_b.cpu().numpy()
            print("all returns: min=%f, max=%f, median=%f" %
                  (np.min(ret_seq_b_np), np.max(ret_seq_b_np),
                   np.median(ret_seq_b_np)))
            print("avg reward: ", sum(reward_b) / len(reward_b))
            print("avg loss: ", np.mean(loss.cpu().detach().numpy()))
Example #28
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', default='./data/preprocessedData')

    parser.add_argument('-epoch', type=int, default=50)
    parser.add_argument('-batch_size', type=int, default=64)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default='log')  # None
    parser.add_argument('-save_model', default='trained')  # None
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-n_best',
                        type=int,
                        default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true', default=True)

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # Loading Dataset
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # Preparing Model
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    # device = torch.device('cpu')

    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout)

    discriminator = Discriminator(opt.d_model, 1024, opt.max_token_seq_len,
                                  device)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        transformer = nn.DataParallel(transformer)
    transformer.to(device)
    discriminator.to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)
    optimizer_d = optim.RMSprop(discriminator.parameters(), lr=5e-4)

    train(transformer, discriminator, training_data, validation_data,
          optimizer, optimizer_d, device, opt)
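
# Example #28 builds Discriminator(opt.d_model, 1024, opt.max_token_seq_len, device) and
# trains it with RMSprop alongside the Transformer, GAN-style. The class is not included
# here; the module below is a hypothetical stand-in with the same constructor: a small MLP
# that pools decoder states over the sequence dimension and emits one realness score.
import torch.nn as nn


class DiscriminatorSketch(nn.Module):

    def __init__(self, d_model, d_hidden, max_seq_len, device):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.net = nn.Sequential(
            nn.Linear(d_model, d_hidden),
            nn.LeakyReLU(0.2),
            nn.Linear(d_hidden, 1),
        )
        self.to(device)

    def forward(self, hidden_states):
        # hidden_states: (batch, seq_len, d_model) -> (batch,) score per sequence
        return self.net(hidden_states.mean(dim=1)).squeeze(-1)
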
Example #29
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-data_all',
                        default='data/csv/data_train_2_sort.torch')
    parser.add_argument('-save_model', default='module/2018-7-30.pt')
    parser.add_argument('-start_time', default='2018-07-01')
    parser.add_argument('-end_time', default='2018-08-30')

    parser.add_argument('-epoch', type=int, default=16)
    parser.add_argument('-batch_size', type=int, default=128)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=32)
    parser.add_argument('-d_v', type=int, default=32)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=2)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.3)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default='log/logs.log')

    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')
    parser.add_argument('-batch_x', default=32)
    parser.add_argument('-batch_y', default=32)
    parser.add_argument('-train_type', default='name')

    opt = parser.parse_args()
    opt.cuda = torch.cuda.is_available()
    opt.d_word_vec = opt.d_model
    device = torch.device('cuda' if opt.cuda else 'cpu')

    # ========= Loading Dataset =========#
    # opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data, voc_name, data_val_ofpa = ld.get_data_loader(
        opt, device)
    opt.src_vocab_size = voc_name
    opt.tgt_vocab_size = opt.src_vocab_size
    if opt.train_type == 'time':
        voc = ld.get_time_vac(opt)
        opt.tgt_vocab_size = voc if voc > 500 else 728

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert opt.src_vocab_size == opt.tgt_vocab_size, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.batch_x,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)
    if opt.train_type == 'time':
        print("train time dim ")
        # train(transformer, train_time, val_time, optimizer, device, opt)
    else:
        train(transformer, training_data, validation_data, optimizer, device,
              opt, data_val_ofpa)
Example #30
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')
    parser.add_argument('-seed', type=int, default=None)
    parser.add_argument(
        '-use_TT',
        nargs='+',
        choices=[Constants.embedding_, Constants.pff_, Constants.attention_])
    parser.add_argument('-n_tt_cores', nargs='+', type=int, default=[3])
    parser.add_argument('-tt_rank', nargs='+', type=int, default=[8])

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # Parse TT Arguments
    opt.tt_params = {}
    if opt.use_TT:
        assert len(opt.use_TT) == len(opt.n_tt_cores), \
            f"Specify the number of TT-cores for each of {opt.use_TT}"
        assert len(opt.use_TT) == len(opt.tt_rank), \
            f"Specify the TT-rank for each of {opt.use_TT}"
        for i in range(len(opt.use_TT)):
            opt.tt_params[opt.use_TT[i]] = {
                "n_tt_cores": opt.n_tt_cores[i],
                "tt_rank": opt.tt_rank[i]
            }

    if opt.seed is not None:
        torch.random.manual_seed(opt.seed)

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    device = torch.device('cuda' if opt.cuda else 'cpu')

    # Print the model architecture and hyperparameters
    f = io.StringIO()
    with redirect_stdout(f):
        print(opt)
        transformer = Transformer(
            opt.src_vocab_size,
            opt.tgt_vocab_size,
            opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=opt.proj_share_weight,
            emb_src_tgt_weight_sharing=opt.embs_share_weight,
            d_k=opt.d_k,
            d_v=opt.d_v,
            d_model=opt.d_model,
            d_word_vec=opt.d_word_vec,
            d_inner=opt.d_inner_hid,
            n_layers=opt.n_layers,
            n_head=opt.n_head,
            dropout=opt.dropout,
            tt_params=opt.tt_params).to(device)

        optimizer = ScheduledOptim(
            optim.Adam(filter(lambda x: x.requires_grad,
                              transformer.parameters()),
                       betas=(0.9, 0.98),
                       eps=1e-09), opt.d_model, opt.n_warmup_steps)
        print(
            f"Number of trainable parameters: {sum(p.numel() for p in transformer.parameters() if p.requires_grad)}"
        )
        summary(transformer, [[opt.max_token_seq_len] for i in range(4)],
                dtype="long")
    architecture_summary = f.getvalue()
    print(architecture_summary)
    if opt.log:
        log_architecture_file = opt.log + '.architecture.log'
        with open(log_architecture_file, 'w') as log_a:
            log_a.write(architecture_summary)

    train(transformer, training_data, validation_data, optimizer, device, opt)
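
# For Example #30, -use_TT, -n_tt_cores and -tt_rank must be given as parallel lists.
# Assuming Constants.embedding_ == 'embedding' and Constants.pff_ == 'pff' (these string
# values are assumptions), an invocation such as
#
#   python train.py -data data.pt -use_TT embedding pff -n_tt_cores 3 4 -tt_rank 8 16
#
# yields
#
#   opt.tt_params == {'embedding': {'n_tt_cores': 3, 'tt_rank': 8},
#                     'pff':       {'n_tt_cores': 4, 'tt_rank': 16}}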