Example No. 1
def load_model(opt, device):

    checkpoint = torch.load(opt.model, map_location=device)
    model_opt = checkpoint['settings']

    model = Transformer(
        model_opt.src_vocab_size,
        model_opt.trg_vocab_size,
        model_opt.src_pad_idx,
        model_opt.trg_pad_idx,
        trg_emb_prj_weight_sharing=model_opt.proj_share_weight,
        emb_src_trg_weight_sharing=model_opt.embs_share_weight,
        d_k=model_opt.d_k,
        d_v=model_opt.d_v,
        d_model=model_opt.d_model,
        d_word_vec=model_opt.d_word_vec,
        d_inner=model_opt.d_inner_hid,
        n_layers=model_opt.n_layers,
        n_head=model_opt.n_head,
        dropout=model_opt.dropout).to(device)

    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')
    return model 
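A minimal usage sketch for load_model above (the flag name, checkpoint path handling, and device choice are illustrative assumptions, not part of the original example):

import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument('-model', required=True,
                    help='path to a checkpoint holding the trained weights and settings')
opt = parser.parse_args()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = load_model(opt, device)
model.eval()  # switch to inference mode before decoding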
Example No. 2
    def __init__(self, opt):
        self.opt = opt
        print(opt, "\n")
        self.device = torch.device('cuda' if opt.cuda else 'cpu')

        checkpoint = torch.load(opt.model)
        model_opt = checkpoint['settings']
        self.model_opt = model_opt
        print(model_opt)
        model = Transformer(
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout)

        model.load_state_dict(checkpoint['model'])
        print('[Info] Trained model state loaded.')

        model.word_prob_prj = nn.LogSoftmax(dim=1)

        model = model.to(self.device)

        self.model = model
        self.model.eval()
    def __init__(self, model):

        self.device = torch.device('cuda')
        checkpoint = torch.load(model)
        checkpoint_copy = checkpoint['model'].copy()

        # Strip the 'module.model.' prefix (typically present when the model was
        # saved wrapped, e.g. in nn.DataParallel plus an outer wrapper) so the keys
        # match this Transformer's state_dict.
        for k in list(checkpoint_copy.keys()):
            new_key = k.replace('module.model.', '')
            checkpoint_copy[new_key] = checkpoint_copy.pop(k)

        model_opt = checkpoint['settings']
        model = Transformer(
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout)
        model.load_state_dict(checkpoint_copy)
        model = model.to(self.device)
        self.model = model
        for p in self.model.parameters():
            p.requires_grad = False
        self.model.eval()
    def __init__(self, opt, device):
        self.opt = opt
        self.device = device

        checkpoint = torch.load(opt.model)
        model_opt = checkpoint['settings']
        self.model_opt = model_opt

        model = Transformer(model_opt.input_dim,
                            model_opt.output_dim,
                            model_opt.n_inputs_max_seq,
                            model_opt.n_outputs_max_seq,
                            d_k=model_opt.d_k,
                            d_v=model_opt.d_v,
                            d_model=model_opt.d_model,
                            d_inner_hid=model_opt.d_inner_hid,
                            n_layers=model_opt.n_layers,
                            n_head=model_opt.n_head,
                            dropout=model_opt.dropout,
                            device=device,
                            is_train=False)

        model.load_state_dict(checkpoint['model'])
        print('[Info] Trained model state loaded.')

        prob_projection = nn.LogSoftmax(dim=1)

        model.to(device)
        prob_projection.to(device)

        model.prob_projection = prob_projection

        self.model = model
        self.model.eval()
    def __init__(self, opt):
        self.opt = opt
        self.device = torch.device('cuda' if opt.cuda else 'cpu')

        checkpoint = torch.load(opt.model)
        model_opt = checkpoint['settings']
        self.model_opt = model_opt

        if opt.prune:
            # NetworkWrapper
            prune_params = {'alpha': opt.prune_alpha}
            pruner = Pruner(device=self.device,
                            load_mask=opt.load_mask,
                            prune_params=prune_params)

            model = NetworkWrapper(
                model_opt.src_vocab_size,
                model_opt.tgt_vocab_size,
                model_opt.max_token_seq_len,
                tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
                emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
                d_k=model_opt.d_k,
                d_v=model_opt.d_v,
                d_model=model_opt.d_model,
                d_word_vec=model_opt.d_word_vec,
                d_inner=model_opt.d_inner_hid,
                n_layers=model_opt.n_layers,
                n_head=model_opt.n_head,
                dropout=model_opt.dropout,
                transformer=pruner)
        else:
            model = Transformer(
                model_opt.src_vocab_size,
                model_opt.tgt_vocab_size,
                model_opt.max_token_seq_len,
                tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
                emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
                d_k=model_opt.d_k,
                d_v=model_opt.d_v,
                d_model=model_opt.d_model,
                d_word_vec=model_opt.d_word_vec,
                d_inner=model_opt.d_inner_hid,
                n_layers=model_opt.n_layers,
                n_head=model_opt.n_head,
                dropout=model_opt.dropout)

        model.load_state_dict(checkpoint['model'])
        print('[Info] Trained model state loaded.')

        model.word_prob_prj = nn.LogSoftmax(dim=1)

        model = model.to(self.device)

        self.model = model
        self.model.eval()
Example No. 6
    def __init__(self, opt):
        self.opt = opt
        self.tt = torch.cuda if opt.cuda else torch

        checkpoint = torch.load(opt.model,
                                map_location=lambda storage, loc: storage)
        model_opt = checkpoint['settings']
        if 'use_ctx' not in model_opt.__dict__:
            model_opt.use_ctx = False
        self.model_opt = model_opt

        model = Transformer(model_opt.src_vocab_size,
                            model_opt.tgt_vocab_size,
                            model_opt.max_token_seq_len,
                            proj_share_weight=model_opt.proj_share_weight,
                            embs_share_weight=model_opt.embs_share_weight,
                            d_k=model_opt.d_k,
                            d_v=model_opt.d_v,
                            d_model=model_opt.d_model,
                            d_word_vec=model_opt.d_word_vec,
                            d_inner_hid=model_opt.d_inner_hid,
                            n_layers=model_opt.n_layers,
                            n_head=model_opt.n_head,
                            dropout=model_opt.dropout,
                            use_ctx=model_opt.use_ctx)

        prob_projection = nn.LogSoftmax()

        model.load_state_dict(checkpoint['model'])

        # New max_token_seq_len for position encoding
        model = self.change_position_embedings(model, opt.max_token_seq_len,
                                               model_opt.d_word_vec,
                                               model_opt.use_ctx)
        model_opt.max_token_seq_len = opt.max_token_seq_len
        print('[Info] Trained model state loaded.')

        if opt.cuda:
            model.cuda()
            prob_projection.cuda()
        else:
            model.cpu()
            prob_projection.cpu()

        model.prob_projection = prob_projection

        self.model = model
        self.model.eval()
Example No. 7
def main():
    if not os.path.exists(args.ckpt_file):
        raise FileNotFoundError("model file not found")

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    TEST_X = args.input_file

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y],
                                  small_vocab_file,
                                  vocab_size=80000)

    max_src_len = 101
    max_tgt_len = 47

    test_x = BatchManager(load_data(TEST_X, max_src_len, args.n_test),
                          args.batch_size, small_vocab)

    model = Transformer(len(small_vocab),
                        len(small_vocab),
                        max_src_len,
                        d_word_vec=300,
                        d_model=300,
                        d_inner=1200,
                        n_layers=1,
                        n_head=6,
                        d_k=50,
                        d_v=50,
                        dropout=0.1,
                        tgt_emb_prj_weight_sharing=True,
                        emb_src_tgt_weight_sharing=True).cuda()
    # print(model)
    model.eval()

    saved_state = torch.load(args.ckpt_file)
    model.load_state_dict(saved_state['state_dict'])
    print('Load model parameters from %s' % args.ckpt_file)

    my_test(test_x, model, small_vocab)
Example No. 8
    def __init__(self, opt):
        self.opt = opt
        self.tt = torch.cuda if opt.cuda else torch

        checkpoint = torch.load(opt.model)
        model_opt = checkpoint['settings']
        self.model_opt = model_opt

        model = Transformer(
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            proj_share_weight=model_opt.proj_share_weight,
            embs_share_weight=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner_hid=model_opt.d_inner_hid,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout)

        prob_projection = nn.LogSoftmax()

        model.load_state_dict(checkpoint['model'])
        print('[Info] Trained model state loaded.')

        if opt.cuda:
            model.cuda()
            prob_projection.cuda()
        else:
            model.cpu()
            prob_projection.cpu()

        model.prob_projection = prob_projection

        self.model = model
        self.model.eval()
    def __init__(self, opt):
        self.opt = opt
        self.device = torch.device('cuda' if opt.cuda else 'cpu')

        checkpoint = torch.load(opt.model)
        model_opt = checkpoint['settings']
        self.model_opt = model_opt
        # Strip the 'module.model.' prefix (typically present when the model was
        # saved wrapped, e.g. in nn.DataParallel plus an outer wrapper) so the keys
        # match this Transformer's state_dict.
        checkpoint_copy = checkpoint['model'].copy()
        for k in list(checkpoint_copy.keys()):
            new_key = k.replace('module.model.', '')
            checkpoint_copy[new_key] = checkpoint_copy.pop(k)
        model = Transformer(
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout)

        model.load_state_dict(checkpoint_copy)
        print('[Info] Trained model state loaded.')

        model.word_prob_prj = nn.LogSoftmax(dim=1)

        model = model.to(self.device)

        self.model = model
        self.model.eval()
Example No. 10
    def __init__(self, opt):
        # opt comes from argparse
        self.opt = opt
        self.device = torch.device('cuda' if opt.cuda else 'cpu')
        self.m = opt.m
        #opt.model is the model path
        checkpoint = torch.load(opt.model)
        #model_opt is the model hyper params
        model_opt = checkpoint['settings']
        self.model_opt = model_opt

        model = Transformer(
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout,
            return_attns=opt.return_attns)

        #Load the actual model weights
        model.load_state_dict(checkpoint['model'])
        print('[Info] Trained model state loaded.')

        model.word_prob_prj = nn.LogSoftmax(dim=1)

        model = model.to(self.device)

        self.model = model
        self.model.eval()
Example No. 11
def load_model(opt):
    # TODO not working with save mode 'all'
    checkpoint = torch.load(opt.model + '.chkpt', map_location=opt.device)
    model_opt = checkpoint['settings']

    model = Transformer(
        model_opt.src_vocab_size,
        model_opt.tgt_vocab_size,
        model_opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
        emb_src_tgt_weight_sharing=False,
        d_k=model_opt.d_k,
        d_v=model_opt.d_v,
        d_model=model_opt.d_model,
        d_word_vec=model_opt.d_word_vec,
        d_inner=model_opt.d_inner_hid,
        n_layers=model_opt.n_layers,
        n_head=model_opt.n_head,
        dropout=model_opt.dropout)

    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')

    return model, model_opt
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    #parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=200)
    parser.add_argument('-batch_size', type=int, default=32)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=4)
    parser.add_argument('-n_layers', type=int, default=4)
    parser.add_argument('-n_warmup_steps', type=int, default=2000)
    #parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-device', type=str, default='0')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    #data = torch.load(opt.data)
    #opt.max_token_seq_len = data['settings'].max_token_seq_len
    opt.max_token_seq_len = 126

    Dataloader = Loaders()
    Dataloader.get_loaders(opt)
    training_data, validation_data, coco_data = (
        Dataloader.loader['train'], Dataloader.loader['val'], Dataloader.loader['coco'])

    opt.src_vocab_size = len(Dataloader.frame_vocab)
    opt.tgt_vocab_size = len(Dataloader.story_vocab)

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device(f'cuda:{opt.device}' if opt.cuda else 'cpu')
    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    if opt.model:
        checkpoint = torch.load(opt.model)
        transformer.load_state_dict(checkpoint['model'])

    optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, coco_data, optimizer, device, opt, Dataloader)
Example No. 13
def main():
    """ Main function """
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)
    parser.add_argument('-emb_path', default=None)
    parser.add_argument('-trained_model', default=None)
    parser.add_argument('-current_step', type=int, default=0)

    parser.add_argument('-epoch', type=int, default=5)
    parser.add_argument('-batch_size', type=int, default=32)

    #parser.add_argument('-d_word_vec', type=int, default=300)
    parser.add_argument('-d_model', type=int, default=300)
    parser.add_argument('-d_inner_hid', type=int, default=500)
    parser.add_argument('-d_k', type=int, default=50)
    parser.add_argument('-d_v', type=int, default=50)

    parser.add_argument('-n_head', type=int, default=6)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)  # path for saving logs (without suffix)
    parser.add_argument('-save_model', default=None)  # path for saving the model (without suffix)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    #========= Preparing DataLoader =========#
    training_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['train']['src'],
        tgt_insts=data['train']['tgt'],
        batch_size=opt.batch_size,
        cuda=opt.cuda)

    validation_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['valid']['src'],
        tgt_insts=data['valid']['tgt'],
        batch_size=opt.batch_size,
        shuffle=False,
        test=True,
        cuda=opt.cuda)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight and training_data.src_word2idx != training_data.tgt_word2idx:
        print('[Warning]',
              'The src/tgt word2idx table are different but asked to share word embedding.')

    print(opt)

    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        emb_path=opt.emb_path,
        proj_share_weight=opt.proj_share_weight,
        embs_share_weight=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner_hid=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout)

    if opt.trained_model:
        checkpoint = torch.load(opt.trained_model)
        transformer.load_state_dict(checkpoint['model'])
        print('[Info] Trained model state loaded.')

    #print(transformer)

    optimizer = ScheduledOptim(
        optim.Adam(
            transformer.get_trainable_parameters(),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps, opt.current_step)


    def get_criterion(vocab_size):
        """ With PAD token zero weight """
        weight = torch.ones(vocab_size)
        weight[Constants.PAD] = 0  # give PAD positions zero weight when computing the loss
        return nn.CrossEntropyLoss(weight, reduction='sum')

    crit = get_criterion(training_data.tgt_vocab_size)

    if opt.cuda:
        transformer = transformer.cuda()
        crit = crit.cuda()

    train(transformer, training_data, validation_data, crit, optimizer, opt)
Example No. 14
        # output_tensor = torch.argmax(output.squeeze(1), 1)
        output_str = get_output_char(result)
        return output_str
    else:
        
        target = beam_search.beam_decode(input_tensor, model, beam_with=5)
        print(target)
        print(len(target[0][0]))
        output_str = get_output_char(target[0][0][1:])
        return output_str


if __name__ == '__main__':
    args = get_args()
    # pad index

    device = torch.device('cpu' if args.no_cuda else 'cuda')
    transformer_model = Transformer(args.sl_vocab_size, args.xl_vocab_size, hid_dim=args.embedding_dim,
                                    pf_dim=args.fp_inner_dim, n_layers=args.n_layers, n_heads=args.n_head,
                                    dropout=args.dropout, device=device, SOS_IDX=SOS_IDX, PAD_IDX=PAD_IDX,
                                    EOS_IDX=EOS_IDX).to(device)
    # transformer_model.load_state_dict(torch.load('./models-bak/transformer/1121/transformer-model_11.pt', map_location='cpu'))
    transformer_model.load_state_dict(torch.load('./models-bak/transformer/1122/transformer-model_500.pt', map_location='cpu'))
    transformer_model.eval()
    text = '欲出烦恼须无我'
    print(predict_xl(text, transformer_model, device, is_beam_search=True))
    # df = pd.read_excel('./couplet/result-test.xlsx')
    # df['transformer'] = df['上联'].apply(lambda x: predict_xl(x, transformer_model, device, is_beam_search=False))
    # df['transformer_beam'] = df['上联'].apply(lambda x: predict_xl(x, transformer_model, device, is_beam_search=True))
    # df.to_excel('./couplet/result-test.xlsx',index=False)
Example No. 15
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=100)
    parser.add_argument('-batch_size', type=int, default=64)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=6)
    parser.add_argument('-n_layers', type=int, default=2)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-seed', type=int, default=42)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best', 'record'],
                        default='best')
    parser.add_argument('-save_thres', type=float, default=None)

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    parser.add_argument('-model', default=None, help='Path to model .pt file')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    set_seed(opt.seed)

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    if opt.model is None:
        transformer = Transformer(
            opt.src_vocab_size,
            opt.tgt_vocab_size,
            opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=opt.proj_share_weight,
            emb_src_tgt_weight_sharing=opt.embs_share_weight,
            d_k=opt.d_k,
            d_v=opt.d_v,
            d_model=opt.d_model,
            d_word_vec=opt.d_word_vec,
            d_inner=opt.d_inner_hid,
            n_layers=opt.n_layers,
            n_head=opt.n_head,
            dropout=opt.dropout).to(device)

    else:
        checkpoint = torch.load(opt.model)
        model_opt = checkpoint['settings']

        transformer = Transformer(
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout).to(device)

        transformer.load_state_dict(checkpoint['model'])
        print('[Info] Trained model state loaded.')

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-config', type=str, default='config/rnnt.yaml')
    parser.add_argument('-load_model', type=str, default=None)
    parser.add_argument('-fp16_allreduce',
                        action='store_true',
                        default=False,
                        help='use fp16 compression during allreduce')
    parser.add_argument('-batches_per_allreduce',
                        type=int,
                        default=1,
                        help='number of batches processed locally before '
                        'executing allreduce across workers; it multiplies '
                        'total batch size.')
    parser.add_argument(
        '-num_workers',
        type=int,
        default=0,
        help='how many subprocesses to use for data loading. '
        '0 means that the data will be loaded in the main process')
    parser.add_argument('-log', type=str, default='train.log')
    opt = parser.parse_args()

    configfile = open(opt.config)
    config = AttrDict(yaml.load(configfile, Loader=yaml.FullLoader))

    global global_step
    global_step = 0

    if hvd.rank() == 0:
        exp_name = config.data.name
        if not os.path.isdir(exp_name):
            os.mkdir(exp_name)
        logger = init_logger(exp_name + '/' + opt.log)
    else:
        logger = None

    if torch.cuda.is_available():
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(config.training.seed)
        torch.backends.cudnn.deterministic = True
    else:
        raise NotImplementedError

    #========= Build DataLoader =========#
    train_dataset = AudioDateset(config.data, 'train')
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.data.train.batch_size,
        sampler=train_sampler)

    assert train_dataset.vocab_size == config.model.vocab_size

    #========= Build A Model Or Load Pre-trained Model=========#
    model = Transformer(config.model)

    if hvd.rank() == 0:
        n_params, enc_params, dec_params = count_parameters(model)
        logger.info('# the number of parameters in the whole model: %d' %
                    n_params)
        logger.info('# the number of parameters in encoder: %d' % enc_params)
        logger.info('# the number of parameters in decoder: %d' % dec_params)

    model.cuda()

    # define an optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=config.optimizer.lr,  # assumed: learning rate taken from the config
                                 betas=(0.9, 0.98),
                                 eps=1e-9)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if opt.fp16_allreduce else hvd.Compression.none

    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression)

    # load pretrain model
    if opt.load_model is not None and hvd.rank() == 0:
        checkpoint = torch.load(opt.load_model)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info('Loaded pretrained model and previous optimizer!')
    elif hvd.rank() == 0:
        init_parameters(model)
        logger.info('Initialized all parameters!')

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # define loss function
    crit = nn.CrossEntropyLoss(ignore_index=0)

    # create a visualizer
    if config.training.visualization and hvd.rank() == 0:
        visualizer = SummaryWriter(exp_name + '/log')
        logger.info('Created a visualizer.')
    else:
        visualizer = None

    for epoch in range(config.training.epoches):
        train(epoch, model, crit, optimizer, train_loader, train_sampler,
              logger, visualizer, config)

        if hvd.rank() == 0:
            save_model(epoch, model, optimizer, config, logger)

    if hvd.rank() == 0:
        logger.info('Training process finished')
Example No. 17
def get_embedding():

    import numpy
    import transformer.Constants as Constants
    from transformer.Models import Transformer
    from transformer.Optim import ScheduledOptim
    from transformer.Modules import LabelSmoothing
    from transformer.Beam import Beam
    from transformer.Translator import translate
    from preprocess import read_instances_from_file, convert_instance_to_idx_seq
    import evals
    from evals import Logger
    from DataLoader import DataLoader

    data = torch.load(opt.data)

    opt.max_token_seq_len_e = data['settings'].max_seq_len
    opt.max_token_seq_len_d = 30
    opt.proj_share_weight = True
    opt.d_word_vec = opt.d_model

    # training_data = DataLoader(
    #    data['dict']['src'],
    #    data['dict']['tgt'],
    #    src_insts=data['train']['src'],
    #    tgt_insts=data['train']['tgt'],
    #    batch_size=opt.batch_size,
    #    shuffle=True,
    #    cuda=opt.cuda)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size
    opt.tgt_vocab_size = opt.tgt_vocab_size - 4

    opt.d_v = int(opt.d_model / opt.n_head)
    opt.d_k = int(opt.d_model / opt.n_head)

    model = Transformer(opt.src_vocab_size,
                        opt.tgt_vocab_size,
                        opt.max_token_seq_len_e,
                        opt.max_token_seq_len_d,
                        proj_share_weight=opt.proj_share_weight,
                        embs_share_weight=False,
                        d_k=opt.d_k,
                        d_v=opt.d_v,
                        d_model=opt.d_model,
                        d_word_vec=opt.d_word_vec,
                        d_inner_hid=opt.d_inner_hid,
                        n_layers_enc=opt.n_layers_enc,
                        n_layers_dec=opt.n_layers_dec,
                        n_head=opt.n_head,
                        dropout=opt.dropout,
                        dec_dropout=opt.dec_dropout,
                        encoder=opt.encoder,
                        decoder=opt.decoder,
                        enc_transform=opt.enc_transform,
                        onehot=opt.onehot,
                        no_enc_pos_embedding=opt.no_enc_pos_embedding,
                        dec_reverse=opt.dec_reverse,
                        no_residual=opt.no_residual)

    state_dict = torch.load(opt.results_dir + '/' + opt.mname + '/model.chkpt')

    model.load_state_dict(state_dict['model'])

    model = model.cuda()
    model.eval()

    model.decoder.tgt_word_emb.weight

    W = model.decoder.tgt_word_emb.weight.data.cpu().numpy()

    numpy.save('Embedding', W)
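Since numpy.save appends the '.npy' suffix, the call above writes 'Embedding.npy'; a brief, assumed follow-up for reading the exported matrix back:

import numpy

E = numpy.load('Embedding.npy')
print(E.shape)  # typically (tgt_vocab_size, d_word_vec)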
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default='default')
    parser.add_argument('-tensorboard', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    global global_counter
    global_counter = 0

    writer = None
    if opt.tensorboard:
        writer = SummaryWriter(os.path.join('./logs', opt.tensorboard))

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    global idx2char
    idx2char = {v: k for k, v in data['dict']['src'].items()}

    opt.max_token_seq_len = data['settings'].max_token_seq_len

    training_data, validation_data, unique_char_len = prepare_dataloaders(
        data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    try:
        transformer.load_state_dict(torch.load('./checkpoints/model.pt'))
        print("Model loaded successfully.......")
    except (FileNotFoundError, RuntimeError):
        # no usable checkpoint found; start training from scratch
        pass

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt,
          unique_char_len, writer)
Example No. 19
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    #---------------------- all the default arguments are defined here
    parser.add_argument('-data', required=False)

    parser.add_argument('-epoch', type=int, default=1)  # set to 1 for now, just to get a quick end-to-end run
    parser.add_argument('-batch_size', type=int, default=32)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='/transformer_my')
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing',
                        action='store_true')  # a store_true flag defaults to False and becomes True when passed

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    # Hard-code the remaining options here for convenience.
    opt.saved_weight = 'c:/trained.chkpt'  # path of the previously trained checkpoint
    opt.data = 'yunixng_bash/data/multi30k.atok.low.pt'  # path of the dataset
    opt.save_model = 'trained_finetune'  # name used when saving the fine-tuned model
    opt.save_mode = 'best'  # keep only the best checkpoint
    opt.proj_share_weight = True  # share target embedding and projection weights
    opt.label_smoothing = True  # enable label smoothing
    opt.cuda = False
    opt.batch_size = 200
    opt.epoch = 30

    #========= Loading Dataset =========#
    # The data here is already index-encoded; the encoding dictionaries live inside
    # `data`, and the src/tgt dictionaries differ, so embs_share_weight must stay False.
    # Multi30k is tiny (~3 MB, ~30k sentence pairs, ~3k word-level vocabulary, no
    # word-piece encoding), so out-of-vocabulary words are common for arbitrary input,
    # but it is very convenient and fast for testing.
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len
    # Length preprocessing: this just pads the sequences.
    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'
    print('All configured options:')
    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(  # build the network
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)
    tmp = torch.load(
        opt.saved_weight,
        map_location=torch.device('cpu'))['model']  # the checkpoint stores the weights under the 'model' key
    transformer.load_state_dict(tmp)

    train(transformer, training_data, validation_data, optimizer, device, opt)
Example No. 20
def main():
    print(args)

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')

    src_vocab, tgt_vocab = get_vocab(TRAIN_X, TRAIN_Y)

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y],
                                  small_vocab_file,
                                  vocab_size=80000)

    max_src_len = 101
    max_tgt_len = 47
    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid

    vocab = small_vocab

    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)

    model = Transformer(len(vocab),
                        len(vocab),
                        max_src_len,
                        d_word_vec=300,
                        d_model=300,
                        d_inner=1200,
                        n_layers=1,
                        n_head=6,
                        d_k=50,
                        d_v=50,
                        dropout=0.1,
                        tgt_emb_prj_weight_sharing=True,
                        emb_src_tgt_weight_sharing=True).cuda()
    # print(model)

    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=1,
                                                gamma=0.3)
    scheduler.step()  # last_epoch=-1, so this first call does not update the lr

    train(train_x, train_y, valid_x, valid_y, model, optimizer, scheduler,
          args.n_epochs, saved_state['epoch'])
Example No. 21
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('--train_src', required=True)
    parser.add_argument('--valid_src', required=True)
    parser.add_argument('--max_word_seq_len', type=int, default=100)
    parser.add_argument('--min_word_count', type=int, default=5)
    parser.add_argument('--keep_case', action='store_true')

    parser.add_argument('--epoch', type=int, default=500)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--num_worker', type=int, default=8)

    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('--d_model', type=int, default=512)
    parser.add_argument('--d_inner_hid', type=int, default=2048)
    parser.add_argument('--d_k', type=int, default=64)
    parser.add_argument('--d_v', type=int, default=64)

    parser.add_argument('--n_head', type=int, default=8)
    parser.add_argument('--n_layers', type=int, default=6)
    parser.add_argument('--n_warmup_steps', type=int, default=4000)

    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--embs_share_weight', action='store_true')
    parser.add_argument('--proj_share_weight', action='store_true')

    parser.add_argument('--model', default=None, help='Path to model file')
    parser.add_argument('--log', default=None)
    parser.add_argument('--save_model', default=None)
    parser.add_argument('--save_data', default='./data/word2idx.pth')
    parser.add_argument('--save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('--no_cuda', action='store_true')
    parser.add_argument('--label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    opt.max_token_seq_len = opt.max_word_seq_len + 2
    #========= Loading Dataset =========#
    training_data = torch.utils.data.DataLoader(dataset.TranslationDataset(
        dir_name=opt.train_src,
        max_word_seq_len=opt.max_word_seq_len,
        min_word_count=opt.min_word_count,
        keep_case=opt.keep_case,
        src_word2idx=None,
        tgt_word2idx=None),
                                                num_workers=opt.num_worker,
                                                batch_size=opt.batch_size,
                                                collate_fn=paired_collate_fn,
                                                shuffle=True)
    validation_data = torch.utils.data.DataLoader(dataset.TranslationDataset(
        dir_name=opt.valid_src,
        max_word_seq_len=opt.max_word_seq_len,
        min_word_count=opt.min_word_count,
        keep_case=opt.keep_case,
        src_word2idx=training_data.dataset.src_word2idx,
        tgt_word2idx=training_data.dataset.tgt_word2idx),
                                                  num_workers=opt.num_worker,
                                                  batch_size=opt.batch_size,
                                                  collate_fn=paired_collate_fn,
                                                  shuffle=True)
    data = {
        'dict': {
            'src': training_data.dataset.src_word2idx,
            'tgt': training_data.dataset.tgt_word2idx
        }
    }
    print('[Info] Dumping the processed data to pickle file', opt.save_data)
    torch.save(data, opt.save_data)
    print('[Info] Finish.')
    del data
    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    if opt.model is not None:
        print('[Info] Loading pretrained model.')
        checkpoint = torch.load(opt.model)
        model_opt = checkpoint['settings']
        transformer = Transformer(
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout)
        transformer.load_state_dict(checkpoint['model'])
        transformer = transformer.to(device)

    # Build the optimizer after any pretrained weights are loaded so that it tracks
    # the parameters of the model that will actually be trained.
    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
Example No. 22
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    # parser.add_argument('-data', required=True)

    parser.add_argument('-train_atok', required=True)
    parser.add_argument('-valid_atok', required=True)

    parser.add_argument('-epoch', type=int, default=200)
    parser.add_argument('-batch_size', type=int, default=8)

    parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    train_atok = torch.load(opt.train_atok)
    valid_atok = torch.load(opt.valid_atok)

    train_vocab = vocab.Vocab(train_atok['settings'].vocab)

    training_data = dataset.translation_dataloader(train_atok,
                                                   opt.batch_size,
                                                   shuffle=True)
    validation_data = dataset.translation_dataloader(valid_atok,
                                                     opt.batch_size,
                                                     shuffle=False)

    # data = torch.load(opt.data)
    opt.max_token_seq_len = train_atok['settings'].max_seq_len

    # training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = train_vocab.size()
    opt.tgt_vocab_size = train_vocab.size()

    #========= Preparing Model =========#
    # if opt.embs_share_weight:
    #     assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
    #         'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    if os.path.exists("trained.chkpt"):
        x = torch.load("trained.chkpt")
        # print(type(x["model"]))
        transformer.load_state_dict(x["model"])

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
Example No. 23
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-config', type=str, default='config/rnnt.yaml')
    parser.add_argument('-load_model', type=str, default=None)
    parser.add_argument('-num_workers', type=int, default=0,
                    help='how many subprocesses to use for data loading. '
                    '0 means that the data will be loaded in the main process')
    parser.add_argument('-log', type=str, default='train.log')
    opt = parser.parse_args()

    configfile = open(opt.config)
    config = AttrDict(yaml.load(configfile, Loader=yaml.FullLoader))

    exp_name = config.data.name
    if not os.path.isdir(exp_name):
        os.mkdir(exp_name)
    logger = init_logger(exp_name + '/' + opt.log)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(config.training.seed)
        torch.backends.cudnn.deterministic = True
    else:
        raise NotImplementedError

    #========= Build DataLoader =========#
    train_dataset = AudioDateset(config.data, 'train')
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.data.train.batch_size, shuffle=True, num_workers=opt.num_workers)

    assert train_dataset.vocab_size == config.model.vocab_size

    #========= Build A Model Or Load Pre-trained Model=========#
    model = Transformer(config.model)

    n_params, enc_params, dec_params = count_parameters(model)
    logger.info('# the number of parameters in the whole model: %d' % n_params)
    logger.info('# the number of parameters in encoder: %d' % enc_params)
    logger.info('# the number of parameters in decoder: %d' % dec_params)

    if torch.cuda.is_available():
        model.cuda()

    global global_step
    global_step = 0
    # define an optimizer
    optimizer = ScheduledOptim(model, config.model.d_model, config.optimizer)

    # load pretrain model
    if opt.load_model is not None:
        checkpoint = torch.load(opt.load_model)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info('Loaded pretrained model and previous optimizer!')
    else:
        init_parameters(model)
        logger.info('Initialized all parameters!')

    # define loss function
    crit = nn.CrossEntropyLoss(ignore_index=0)

    # create a visualizer
    if config.training.visualization:
        visualizer = SummaryWriter(exp_name + '/log')
        logger.info('Created a visualizer.')
    else:
        visualizer = None

    for epoch in range(config.training.epoches):
        train(epoch, model, crit, optimizer, train_loader, logger, visualizer, config)
        save_model(epoch, model, optimizer, config, logger)
    
    logger.info('Training process finished')
Example No. 24
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path",
                        type=str,
                        default="",
                        help="Path or url of the dataset.")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--gpt2_model_name",
                        type=str,
                        default="gpt2",
                        help="name of the model ex)openai-gpt")
    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=30,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=4,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--keyword_module",
                        type=str,
                        default="new",
                        help="add, attention, ")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.8,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=30,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')
    args = parser.parse_args()
    args.d_word_vec = args.d_model

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.gpt2_model_name else OpenAIGPTTokenizer  # can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.gpt2_model_name)

    num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(
        ATTR_TO_SPECIAL_TOKEN)  # doesn't add if they are already there

    model = Transformer(
        num_tokens + num_added_tokens,
        num_tokens + num_added_tokens,
        src_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]),
        trg_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]),
        trg_emb_prj_weight_sharing=args.proj_share_weight,
        emb_src_trg_weight_sharing=args.embs_share_weight,
        d_k=args.d_k,
        d_v=args.d_v,
        d_model=args.d_model,
        d_word_vec=args.d_word_vec,
        d_inner=args.d_inner_hid,
        n_layers=args.n_layers,
        n_head=args.n_head,
        dropout=args.dropout,
        n_position=512,
        keyword_module=args.keyword_module).to(args.device)

    model.load_state_dict(torch.load(args.model_checkpoint), strict=False)
    model.eval()

    sourceList, targetList, scoreList = get_test_datasetEN(
        tokenizer, tokenizer, args.dataset_path)
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    f1 = open((args.model_checkpoint + current_time + "_output.txt"), 'w')
    for line in tqdm(zip(sourceList, targetList, scoreList),
                     total=len(sourceList)):
        out_ids = sample_sequence(line[0], line[2], tokenizer, model,
                                  tokenizer, args)
        out_text = tokenizer.decode(out_ids)
        f1.write(out_text.replace('▁', ' ').replace('</s>', ' '))
        """
        for id in out_ids:
            f1.write(str(id))
            f1.write(' ')
        """
        f1.write("\n")
    f1.close()
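sample_sequence above is parameterised by --temperature, --top_k and --top_p, which suggests temperature scaling followed by top-k/nucleus filtering of the output logits before sampling. A minimal filtering sketch under that assumption; the function name and its exact place inside sample_sequence are hypothetical, and it operates on a 1-D logits vector.

import torch
import torch.nn.functional as F

def top_k_top_p_filtering_sketch(logits, top_k=0, top_p=0.0, filter_value=-float('inf')):
    ''' Mask out logits outside the top-k set and outside the top-p probability mass (1-D input). '''
    if top_k > 0:
        # keep only the k largest logits
        kth_value = torch.topk(logits, top_k)[0][-1]
        logits[logits < kth_value] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # drop tokens whose cumulative probability exceeds top_p, but always keep the best one
        to_remove = cumulative_probs > top_p
        to_remove[1:] = to_remove[:-1].clone()
        to_remove[0] = False
        logits[sorted_indices[to_remove]] = filter_value
    return logits

# usage sketch:
# probs = F.softmax(top_k_top_p_filtering_sketch(logits / args.temperature,
#                                                args.top_k, args.top_p), dim=-1)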
Exemplo n.º 25
0
if __name__ == "__main__":
	torch.cuda.set_device(hp.gpu)
	testLoader = dataset.getDataLoader(is_train=False, batch_size=5, shuffle=False)

	net1 = resnet.resnet34()
	net2 = Transformer(len_encoder=hp.enc_input_len, n_tgt_vocab=hp.num_classes, len_max_seq=hp.max_seq_len, n_layers=hp.n_layers)
	net2.word_prob_prj = nn.LogSoftmax(dim=1)
	net1.cuda().eval()
	#net2.cuda().eval()

	path_to_restore = os.path.join(hp.checkpoint_path, hp.model_path_pre+"_"+str(hp.model_path_idx) + ".pth")
	if os.path.exists(path_to_restore):
		print("restore from:", path_to_restore)
		checkpoint = torch.load(path_to_restore)
		net1.load_state_dict(checkpoint["state_dict_net1"])
		net2.load_state_dict(checkpoint["state_dict_net2"])
		print("restore successfully!")
	else:
		print("fail to restore, path don't exist")
	
	translator = Translator(net2, beam_size=hp.beam_size, max_seq_len=hp.max_seq_len, n_best=hp.n_best)

	print("************************begin infer*********************")
	for imgs_name, imgs, length_imgs, labels, length_labels in testLoader:
		imgs = Variable(imgs).cuda()
		length_imgs = Variable(length_imgs).cuda()
      
		enc_img = net1(imgs.float())
		batch_size, channel, height, width = enc_img.shape
		enc_img = enc_img.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, channel)
		batch_pred, batch_prob = translator.translate_batch(enc_img, length_imgs)
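The permute/view step above flattens the ResNet feature map from (batch, channel, height, width) into the (batch, height*width, channel) sequence layout the transformer encoder expects. Isolated as a helper for readability (the helper name is ours, not part of the repository):

def feature_map_to_sequence(feature_map):
    ''' (B, C, H, W) CNN features -> (B, H*W, C) token sequence for the transformer encoder. '''
    batch_size, channels, height, width = feature_map.shape
    # move the channel axis last, then merge the two spatial axes into one sequence axis
    return feature_map.permute(0, 2, 3, 1).contiguous().view(batch_size, height * width, channels)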
Exemplo n.º 26
0
def main():
    ''' 
    Usage:
    python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 256 -warmup 128000
    '''

    parser = argparse.ArgumentParser()

    parser.add_argument('-data_pkl', default=None)  # all-in-1 data pickle or bpe field

    parser.add_argument('-train_path', default=None)  # bpe encoded data
    parser.add_argument('-val_path', default=None)  # bpe encoded data

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)
    parser.add_argument('-lr', '--learning_rate', type=float, default=2.0)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-restore', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    if not opt.log and not opt.save_model:
        print('No experiment result will be saved.')
        raise ValueError('Set -log and/or -save_model to save experiment results.')
    opt.plot_intval = 20
    if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000:
        print('[Warning] The warmup steps may not be enough.\n'
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n'
              'Using a smaller batch without a longer warmup may cause '
              'the warmup stage to end with only a little data trained.')

    device = torch.device('cuda' if opt.cuda else 'cpu')

    # ========= Loading Dataset =========#

    if all((opt.train_path, opt.val_path)):
        training_data, validation_data = prepare_dataloaders_from_bpe_files(opt, device)
    elif opt.data_pkl:
        # training_data, validation_data = prepare_dataloaders(opt, device)
        training_data, validation_data = prepare_mydataloaders(opt, device)
    else:
        raise ValueError('Provide either -train_path and -val_path, or -data_pkl.')

    print(opt)

    transformer = Transformer(
        opt.src_vocab_size,
        opt.trg_vocab_size,
        src_pad_idx=opt.src_pad_idx,
        trg_pad_idx=opt.trg_pad_idx,
        trg_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_trg_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)
    if opt.restore:
        print("loading checkpoint from {}...".format(opt.restore.split("/")[-1]))
        checkpoint = torch.load(opt.restore)
        transformer.load_state_dict(checkpoint['model'])
    optimizer = ScheduledOptim(
        optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
        opt.learning_rate, opt.d_model, opt.n_warmup_steps)
    # optimizer = optim.Adagrad(transformer.parameters(), lr=0.15, initial_accumulator_value=0.1)
    train(transformer, training_data, validation_data, optimizer, device, opt)
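The -label_smoothing flag conventionally replaces plain cross-entropy with a loss against a smoothed target distribution: probability 1 - eps on the gold class and eps spread over the remaining classes, with padding positions ignored. A minimal sketch following that standard formulation; it assumes a pad index trg_pad_idx and is an illustration, not this repository's exact loss code.

import torch
import torch.nn.functional as F

def label_smoothed_loss_sketch(pred, gold, trg_pad_idx, eps=0.1):
    ''' Cross-entropy against a smoothed target; pred is (N, n_class) logits, gold is (N,) indices. '''
    n_class = pred.size(1)
    one_hot = torch.zeros_like(pred).scatter(1, gold.unsqueeze(1), 1)
    smoothed = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
    log_prb = F.log_softmax(pred, dim=1)
    loss = -(smoothed * log_prb).sum(dim=1)   # per-token loss
    non_pad_mask = gold.ne(trg_pad_idx)       # drop padding positions
    return loss.masked_select(non_pad_mask).sum()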