Example #1
import os

import torch
from torch import cuda

# NOTE: import paths for the project-local modules below are assumptions;
# adjust them to this repository's actual layout.
# Required names: Vocab, Constants, DialogueDataset, Optimizer, NLLLoss,
# DialogueTranslator, DialogueSupervisedTrainer, build_dialogue_model.


def main(opt, logger):
    logger.info('My PID is {0}'.format(os.getpid()))
    logger.info('PyTorch version: {0}'.format(str(torch.__version__)))
    logger.info(opt)

    if torch.cuda.is_available() and not opt.gpus:
        logger.info("WARNING: You have a CUDA device, so you should probably run with -gpus 0")
    if opt.seed > 0:
        torch.manual_seed(opt.seed)
    if opt.gpus:
        if opt.cuda_seed > 0:
            torch.cuda.manual_seed(opt.cuda_seed)
        # cuda.set_device(opt.gpus[0])
    logger.info('My seed is {0}'.format(torch.initial_seed()))
    if opt.gpus:  # torch.cuda.initial_seed() raises on CPU-only builds
        logger.info('My cuda seed is {0}'.format(torch.cuda.initial_seed()))
    
    ###### ==================== Loading Dataset ==================== ######
    data = torch.load(opt.data)
    vocabularies = data['dict']
    if isinstance(vocabularies['src'], str):
        assert vocabularies['src'] == opt.pretrained
        options = {'transf':True, 'separate':False, 'tgt':False}
        vocabularies['src'] = Vocab.from_opt(pretrained=opt.pretrained, opt=options)
    train_data, valid_data = data['train'], data['valid']

    ### ===== load pre-trained vocabulary ===== ###
    if opt.pre_trained_vocab:
        if not opt.pretrained:
            opt.pre_trained_src_emb = vocabularies['pre-trained']['src']
        opt.pre_trained_tgt_emb = vocabularies['pre-trained']['tgt']
    
    ### ===== wrap datasets ===== ###
    attn_mask_file = '' if not opt.defined_slf_attn_mask else opt.defined_slf_attn_mask + '.train.npy'
    pad_id = vocabularies['src'].lookup('<|endoftext|>') if opt.pretrained and 'gpt2' in opt.pretrained else Constants.PAD
    trainData = DialogueDataset(train_data, opt.batch_size, copy=opt.copy, 
                                attn_mask_file=attn_mask_file, 
                                opt_cuda=opt.gpus, pad=pad_id)
    validData = DialogueDataset(valid_data, opt.eval_batch_size, copy=opt.copy, 
                                attn_mask_file=attn_mask_file,
                                opt_cuda=opt.gpus, pad=pad_id)
    
    opt.src_vocab_size, opt.tgt_vocab_size = vocabularies['src'].size, vocabularies['tgt'].size
    
    logger.info(' * vocabulary size. source = %d; target = %d' % (opt.src_vocab_size, opt.tgt_vocab_size))
    logger.info(' * number of training batches. %d' % len(trainData))
    logger.info(' * maximum batch size. %d' % opt.batch_size)
    
    ##### =================== Prepare Model =================== #####
    separate = -1
    device = torch.device('cuda:' + str(opt.gpus[0]) if len(opt.gpus) else 'cpu')
    checkpoint = torch.load(opt.checkpoint) if opt.checkpoint else None
    model, parameters_cnt = build_dialogue_model(opt, device, separate=separate, checkpoint=checkpoint)
    logger.info(' * Number of parameters to learn = %d' % parameters_cnt)

    ##### ==================== Prepare Optimizer ==================== #####
    optimizer = Optimizer.from_opt(model, opt)

    ##### ==================== Prepare Loss ==================== #####
    weight = torch.ones(opt.tgt_vocab_size)
    weight[Constants.PAD] = 0
    loss = NLLLoss(opt, weight=weight, size_average=False)
    if opt.gpus:
        cuda.set_device(opt.gpus[0])
        loss.cuda()
        
    ##### ==================== Prepare Translator ==================== #####
    forward_translator = DialogueTranslator(opt, vocabularies['tgt'], data['valid']['tokens'], vocabularies['src'])
    backward_translator = DialogueTranslator(opt, vocabularies['src'], data['valid']['tokens'], vocabularies['tgt'], reverse=True)
    
    ##### ==================== Training ==================== #####
    trainer = DialogueSupervisedTrainer(model, loss, optimizer, 
                                        forward_translator, backward_translator,
                                        logger, opt, trainData, validData)
    trainer.train(device)
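
A minimal driver sketch for this entry point, assuming argparse-style options: every flag name below is hypothetical, inferred from the attributes `main(opt, logger)` reads off `opt`, and only a subset of the options is shown.

import argparse
import logging

# Hypothetical CLI; flag names mirror the `opt` attributes used above.
parser = argparse.ArgumentParser()
parser.add_argument('-data', required=True, help='path to the torch-saved dataset file')
parser.add_argument('-gpus', type=int, nargs='*', default=[], help='GPU ids, e.g. -gpus 0')
parser.add_argument('-seed', type=int, default=-1)
parser.add_argument('-cuda_seed', type=int, default=-1)
parser.add_argument('-batch_size', type=int, default=64)
parser.add_argument('-eval_batch_size', type=int, default=64)
# ... plus the remaining attributes main consumes (copy, pretrained,
# pre_trained_vocab, defined_slf_attn_mask, checkpoint, ...).
opt = parser.parse_args()

logging.basicConfig(level=logging.INFO)
main(opt, logging.getLogger(__name__))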
Example #2
import logging
import math
import os

import torch
from torch import cuda

# NOTE: import paths for the project-local modules below are assumptions;
# adjust them to this repository's actual layout.
# Required names: Constants, Optimizer, NLLLoss, Translator,
# SupervisedTrainer, build_model.


def main(opt):
    logging.info('My PID is {0}'.format(os.getpid()))
    logging.info('PyTorch version: {0}'.format(str(torch.__version__)))
    logging.info(opt)

    if torch.cuda.is_available() and not opt.gpus:
        logging.info(
            "WARNING: You have a CUDA device, so you should probably run with -gpus 0"
        )
    if opt.seed > 0:
        torch.manual_seed(opt.seed)
    if opt.gpus:
        if opt.cuda_seed > 0:
            torch.cuda.manual_seed(opt.cuda_seed)
        cuda.set_device(opt.gpus[0])
    logging.info('My seed is {0}'.format(torch.initial_seed()))
    if opt.gpus:  # torch.cuda.initial_seed() raises on CPU-only builds
        logging.info('My cuda seed is {0}'.format(torch.cuda.initial_seed()))

    ###### ==================== Loading Options ==================== ######
    # Default to None so the later reference in "Prepare Model" is always defined.
    checkpoint = torch.load(opt.checkpoint) if opt.checkpoint else None

    ###### ==================== Loading Dataset ==================== ######
    opt.sparse = bool(opt.sparse)

    logging.info('Loading sequential data ......')
    sequences = torch.load(opt.sequence_data)
    seq_vocabularies = sequences['dict']

    ### ===== load pre-trained vocabulary ===== ###
    logging.info('Loading pre-trained vocabulary ......')
    if opt.pre_trained_vocab:
        if not opt.pretrained:
            opt.pre_trained_src_emb = seq_vocabularies['pre-trained']['src']
        opt.pre_trained_tgt_emb = seq_vocabularies['pre-trained']['tgt']
        if opt.answer:
            opt.pre_trained_ans_emb = seq_vocabularies['pre-trained']['src']

    ### ===== wrap datasets ===== ###
    logging.info('Loading Dataset objects ......')
    trainData = torch.load(opt.train_dataset)
    validData = torch.load(opt.valid_dataset)
    trainData.batchSize = validData.batchSize = opt.batch_size
    trainData.numBatches = math.ceil(len(trainData.src) / trainData.batchSize)
    validData.numBatches = math.ceil(len(validData.src) / validData.batchSize)

    logging.info('Preparing vocabularies ......')
    opt.src_vocab_size = seq_vocabularies['src'].size
    opt.tgt_vocab_size = seq_vocabularies['tgt'].size
    opt.feat_vocab = [fv.size for fv in seq_vocabularies['feature']] if opt.feature else None

    logging.info('Loading structural data ......')
    graphs = torch.load(opt.graph_data)
    graph_vocabularies = graphs['dict']
    del graphs

    opt.edge_vocab_size = graph_vocabularies['edge']['in'].size
    opt.node_feat_vocab = [fv.size for fv in graph_vocabularies['feature'][1:-1]] if opt.node_feature else None

    logging.info(' * vocabulary size. source = %d; target = %d' %
                 (opt.src_vocab_size, opt.tgt_vocab_size))
    logging.info(' * number of training batches. %d' % len(trainData))
    logging.info(' * maximum batch size. %d' % opt.batch_size)

    ##### =================== Prepare Model =================== #####
    device = torch.device('cuda' if opt.gpus else 'cpu')
    trainData.device = validData.device = device

    model, parameters_cnt = build_model(opt, device, checkpoint=checkpoint)
    del checkpoint

    logging.info(' * Number of parameters to learn = %d' % parameters_cnt)

    ##### ==================== Prepare Optimizer ==================== #####
    optimizer = Optimizer.from_opt(model, opt)

    ##### ==================== Prepare Loss ==================== #####
    weight = torch.ones(opt.tgt_vocab_size)
    weight[Constants.PAD] = 0
    loss = NLLLoss(opt, weight, size_average=False)
    if opt.gpus:
        loss.cuda()

    ##### ==================== Prepare Translator ==================== #####
    translator = Translator(opt, seq_vocabularies['tgt'],
                            sequences['valid']['tokens'],
                            seq_vocabularies['src'])

    ##### ==================== Training ==================== #####
    trainer = SupervisedTrainer(model, loss, optimizer, translator, opt,
                                trainData, validData, seq_vocabularies['src'],
                                graph_vocabularies['feature'])
    del model
    del trainData
    del validData
    del seq_vocabularies['src']
    del graph_vocabularies['feature']
    trainer.train(device)
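
All four examples build the loss from a class-weight vector with the PAD index zeroed. A small self-contained sketch with stock `torch.nn.NLLLoss` (the `NLLLoss` in these scripts is a project-specific wrapper, so this is an analogy, not its actual implementation) shows the effect: padded target positions contribute nothing to the loss.

import torch
import torch.nn as nn

PAD = 0                                 # assumed PAD id, mirroring Constants.PAD
vocab_size = 5

weight = torch.ones(vocab_size)
weight[PAD] = 0                         # zero weight: PAD targets are ignored

criterion = nn.NLLLoss(weight=weight, reduction='sum')

log_probs = torch.log_softmax(torch.randn(4, vocab_size), dim=1)
gold = torch.tensor([2, 3, PAD, PAD])   # last two positions are padding

# The loss over all four positions equals the loss over the two real ones.
assert torch.isclose(criterion(log_probs, gold), criterion(log_probs[:2], gold[:2]))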
Example #3
import os

import torch
from torch import cuda

# NOTE: import paths for the project-local modules below are assumptions;
# adjust them to this repository's actual layout.
# Required names: Vocab, Constants, Dataset, Optimizer, NLLLoss, Translator,
# build_model, load_rl_model, RLTrainer, SupervisedTrainer.


def main(opt, logger):
    logger.info('My PID is {0}'.format(os.getpid()))
    logger.info('PyTorch version: {0}'.format(str(torch.__version__)))
    logger.info(opt)

    if torch.cuda.is_available() and not opt.gpus:
        logger.info("WARNING: You have a CUDA device, so you should probably run with -gpus 0")
    if opt.seed > 0:
        torch.manual_seed(opt.seed)
    if opt.gpus:
        if opt.cuda_seed > 0:
            torch.cuda.manual_seed(opt.cuda_seed)
        # cuda.set_device(opt.gpus[0])
    logger.info('My seed is {0}'.format(torch.initial_seed()))
    if opt.gpus:  # torch.cuda.initial_seed() raises on CPU-only builds
        logger.info('My cuda seed is {0}'.format(torch.cuda.initial_seed()))
    
    ###### ==================== Loading Dataset ==================== ######
    data = torch.load(opt.data)
    vocabularies = data['dict']
    if isinstance(vocabularies['src'], str):
        assert vocabularies['src'] == opt.pretrained
        sep = opt.answer == 'sep'
        options = {'transf':opt.answer != 'enc', 'separate':sep, 'tgt':False}
        vocabularies['src'] = Vocab.from_opt(pretrained=opt.pretrained, opt=options)
    train_data, valid_data = data['train'], data['valid']

    ### ===== load pre-trained vocabulary ===== ###
    if opt.pre_trained_vocab:
        if not opt.pretrained:
            opt.pre_trained_src_emb = vocabularies['pre-trained']['src']
        opt.pre_trained_tgt_emb = vocabularies['pre-trained']['tgt']
        if opt.answer == 'enc':
            opt.pre_trained_ans_emb = vocabularies['pre-trained']['ans']
    
    ### ===== wrap datasets ===== ###
    attn_mask_file = '' if not opt.defined_slf_attn_mask else opt.defined_slf_attn_mask + '.train.npy'
    pad_id = vocabularies['src'].lookup('<|endoftext|>') if opt.pretrained and 'gpt2' in opt.pretrained else Constants.PAD
    trainData = Dataset(train_data, opt.batch_size, copy=opt.copy, 
                        answer=opt.answer == 'enc', ans_feature=opt.ans_feature, 
                        feature=opt.feature, attn_mask_file=attn_mask_file,
                        opt_cuda=opt.gpus, pad=pad_id)
    validData = Dataset(valid_data, opt.eval_batch_size, copy=opt.copy, 
                        answer=opt.answer == 'enc', ans_feature=opt.ans_feature, 
                        feature=opt.feature, attn_mask_file=attn_mask_file,
                        opt_cuda=opt.gpus, pad=pad_id)
    
    opt.src_vocab_size = vocabularies['src'].size
    opt.tgt_vocab_size = vocabularies['tgt'].size
    opt.feat_vocab = [fv.size for fv in vocabularies['feature']] if opt.feature else None
    opt.ans_feat_vocab = [fv.size for fv in vocabularies['ans_feature']] if opt.ans_feature else None

    logger.info(' * vocabulary size. source = %d; target = %d' % (opt.src_vocab_size, opt.tgt_vocab_size))
    logger.info(' * number of training batches. %d' % len(trainData))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    ##### =================== Prepare Model =================== #####
    separate = vocabularies['src'].lookup(Constants.SEP_WORD) if opt.answer == 'sep' else -1
    device = torch.device('cuda:' + str(opt.gpus[0]) if len(opt.gpus) else 'cpu')
    checkpoint = torch.load(opt.checkpoint) if opt.checkpoint else None
    if opt.rl:
        rl_device = [torch.device('cuda:' + str(gpu)) for gpu in opt.rl_gpu]
        rl_device = dict(zip(opt.rl, rl_device))
        opt.rl_device = rl_device
        discriminator = load_rl_model(opt, device, rl_device)
    model, parameters_cnt = build_model(opt, device, separate=separate, checkpoint=checkpoint)
    logger.info(' * Number of parameters to learn = %d' % parameters_cnt)

    ##### ==================== Prepare Optimizer ==================== #####
    optimizer = Optimizer.from_opt(model, opt)

    ##### ==================== Prepare Loss ==================== #####
    weight = torch.ones(opt.tgt_vocab_size)
    weight[Constants.PAD] = 0
    loss = NLLLoss(opt, weight=weight, size_average=False)
    if opt.gpus:
        cuda.set_device(opt.gpus[0])
        loss.cuda()
        
    ##### ==================== Prepare Translator ==================== #####
    translator = Translator(opt, vocabularies['tgt'], data['valid']['tokens'], vocabularies['src'])
    
    ##### ==================== Training ==================== #####
    if opt.rl:
        trainer = RLTrainer(model, discriminator, loss, optimizer, translator, logger, 
                            opt, trainData, validData, vocabularies['src'], vocabularies['tgt'])
    else:
        trainer = SupervisedTrainer(model, loss, optimizer, translator, logger, 
                                    opt, trainData, validData, vocabularies['src'])
    trainer.train(device)
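
Examples #1 and #3 choose the pad id by looking up `<|endoftext|>` whenever `opt.pretrained` names a GPT-2 model. A quick way to see what that lookup resolves to, using the Hugging Face `transformers` tokenizer as a stand-in (an assumption: the repo's own `Vocab.lookup` may index differently):

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
pad_id = tokenizer.convert_tokens_to_ids('<|endoftext|>')
print(pad_id)  # 50256: GPT-2 has no dedicated PAD token, so <|endoftext|> stands in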
Example #4
import math
import os

import torch
import torch.nn as nn
from tqdm import tqdm

# Assumption: BertModel/BertTokenizer come from the pre-`transformers`
# `pytorch_pretrained_bert` package; the same names also exist in `transformers`.
from pytorch_pretrained_bert import BertModel, BertTokenizer

# NOTE: import paths for the project-local modules below are assumptions;
# adjust them to this repository's actual layout.
# Required names: Vocab, Dataset, Optimizer, NLLLoss, NERTagger.


def main(opt):
    tokenizer = BertTokenizer.from_pretrained(opt.pre_model)
    ###========== Load Data ==========###
    train_data = filter_data(opt.train_src, opt.train_tgt, tokenizer)
    valid_data = filter_data(opt.valid_src, opt.valid_tgt, tokenizer)
    ###========== Get Index ==========###
    options = {'transf':False, 'separate':False, 'tgt':False}
    src_vocab = Vocab.from_opt(pretrained=opt.pre_model, opt=options)
    options = {'lower':False, 'mode':'size', 'size':1000, 'frequency':1,
               'transf':False, 'separate':False, 'tgt':False}
    tgt_vocab = Vocab.from_opt(corpus=train_data['tgt'], opt=options)
    train_src_idx = [src_vocab.convertToIdx(sent) for sent in train_data['src']]
    valid_src_idx = [src_vocab.convertToIdx(sent) for sent in valid_data['src']]
    train_tgt_idx = [tgt_vocab.convertToIdx(sent) for sent in train_data['tgt']]
    valid_tgt_idx = [tgt_vocab.convertToIdx(sent) for sent in valid_data['tgt']]
    ###========== Get Data ==========###
    train_data = Dataset({'src':train_src_idx, 'tgt':train_tgt_idx, 'feature':[train_data['idx']]}, 
                         opt.batch_size, feature=True, opt_cuda=opt.gpus)
    valid_data = Dataset({'src':valid_src_idx, 'tgt':valid_tgt_idx, 'feature':[valid_data['idx']]}, 
                         opt.batch_size, feature=True, opt_cuda=opt.gpus)
    opt.tgt_vocab_size = tgt_vocab.size
    ###========== Prepare Model ==========###
    device = torch.device('cuda' if opt.gpus else 'cpu')
    encoder = BertModel.from_pretrained(opt.pre_model)
    classifier = nn.Sequential(
        nn.Linear(768 // opt.maxout_pool_size, opt.tgt_vocab_size),  # TODO: 768 is BERT-base's hidden size; read it from the encoder config instead of hard-coding
        nn.Softmax(dim=1)
    )
    model = NERTagger(encoder, classifier, device).to(device)
    for _, para in model.classifier.named_parameters():
        if para.dim() == 1:
            para.data.normal_(0, math.sqrt(6 / (1 + para.size(0))))
        else:
            nn.init.xavier_normal_(para, math.sqrt(3))  # in-place variant; nn.init.xavier_normal is deprecated
    if len(opt.gpus) > 1:
        model = nn.DataParallel(model, device_ids=opt.gpus)
    ###========== Prepare for training ==========###
    opt.optim = 'adam'
    opt.decay_method = ''
    opt.learning_rate = 3e-5
    opt.learning_rate_decay = 1
    opt.decay_steps = 10000000
    opt.start_decay_steps = 10000000000
    opt.max_grad_norm = 5
    opt.max_weight_value = 20
    opt.decay_bad_cnt = 5
    optimizer = Optimizer.from_opt(model, opt)
    
    weight = torch.ones(opt.tgt_vocab_size)
    weight[0] = 0       # TODO: fix this magic number later (PAD)
    loss = NLLLoss(opt, weight, size_average=False)
    if opt.gpus:
        loss.cuda()
    ###========== Training ==========###
    best_val = 0

    def eval_model(model, data, loss):
        model.eval()

        all_loss, all_accu, all_words = 0, 0, 0
        with torch.no_grad():  # no gradients needed for validation
            for i in tqdm(range(len(data)), mininterval=2, desc='  - (Validation)  ', leave=False):
                batch = data[i]
                src, tgt, src_idx = batch['src'][0], batch['tgt'], batch['feat'][0][0]
                tgt = tgt.transpose(0, 1)
                pred = model(src, src_idx)
                loss_val, gold = loss.cal_loss_ner(pred, tgt)

                all_loss += loss_val.item()
                all_words += pred.size(0)
                pred = pred.max(1)[1]
                all_accu += pred.eq(gold.view(-1)).sum().item()

        return all_loss / all_words, all_accu / all_words

    def save_model(model, score, best_val, opt):
        if score > best_val:
            model_to_save = model.module.encoder if hasattr(model, 'module') else model.encoder  # only save the encoder itself
            output_model_file = os.path.join(opt.output_dir, "pytorch_model_" + str(round(score * 100, 2)) + ".bin")
            torch.save(model_to_save.state_dict(), output_model_file)
        print('validation', score)

    for _ in range(opt.num_train_epochs):
        train_data.shuffle()
        model.train()
        batch_order = torch.randperm(len(train_data))
        loss_print, words_cnt, accuracy = 0, 0, 0
        for idx in tqdm(range(len(train_data)), mininterval=2, desc='  - (Training)  ', leave=False):
            batch_idx = batch_order[idx]
            batch = train_data[batch_idx]

            src, tgt, src_idx = batch['src'][0], batch['tgt'], batch['feat'][0][0]
            tgt = tgt.transpose(0, 1)

            out = model(src, src_idx)
            loss_val, gold = loss.cal_loss_ner(out, tgt)
            if len(opt.gpus) > 1:
                loss_val = loss_val.mean()  # mean() to average on multi-gpu.
            if math.isnan(loss_val.item()) or loss_val.item() > 1e20:
                raise RuntimeError('loss diverged (NaN or overflow) at step {0}'.format(idx))
            loss_val.backward()

            optimizer.step()
            optimizer.zero_grad()

            loss_print += loss_val.item()
            words_cnt += out.size(0)
            pred = out.max(1)[1]
            n_correct = pred.eq(gold.view(-1))
            n_correct = n_correct.sum().item()
            accuracy += n_correct
            if idx % 1000 == 0:
                loss_print /= words_cnt
                accuracy /= words_cnt
                print('loss', loss_print)
                print('accuracy', accuracy)
                loss_print, words_cnt, accuracy = 0, 0, 0
                if idx % 2000 == 0:
                    loss_val, accuracy_val = eval_model(model, valid_data, loss)
                    save_model(model, accuracy_val, best_val, opt)
                    if accuracy_val > best_val:
                        best_val = accuracy_val
                    model.train()  # eval_model left the model in eval mode; switch back for training
    
    model_to_save = model.module.encoder if hasattr(model, 'module') else model.encoder  # only save the encoder itself
    output_model_file = os.path.join(opt.output_dir, "pytorch_model.bin")
    torch.save(model_to_save.state_dict(), output_model_file)
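
Because only the encoder's `state_dict` is written out, restoring the fine-tuned model later means rebuilding the encoder and loading the weights into it. A minimal sketch, assuming the same BERT package as above and a hypothetical output path:

import torch
from pytorch_pretrained_bert import BertModel  # assumption, as in the imports above

# Rebuild the architecture, then overwrite its weights with the fine-tuned ones.
encoder = BertModel.from_pretrained('bert-base-uncased')                  # hypothetical base model
state_dict = torch.load('output/pytorch_model.bin', map_location='cpu')  # hypothetical path
encoder.load_state_dict(state_dict)
encoder.eval()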