예제 #1
0
    def setup_train(self, model_file_path=None):
        """Create the model and its Adagrad optimizer, optionally resuming.

        Args:
            model_file_path: Path of a checkpoint readable by ``torch.load``,
                or ``None`` to start from scratch. The same value is handed
                to ``Model``.

        Returns:
            A ``(start_iter, start_loss)`` tuple holding the ``'iter'`` and
            ``'current_loss'`` entries of the checkpoint, or ``(0, 0)`` when
            no checkpoint path is given.
        """
        self.model = Model(model_file_path)

        # One flat list with the parameters of the three sub-modules.
        trainable = [
            weight
            for part in (self.model.encoder, self.model.decoder,
                         self.model.reduce_state)
            for weight in part.parameters()
        ]
        if config.is_coverage:
            learning_rate = config.lr_coverage
        else:
            learning_rate = config.lr
        self.optimizer = Adagrad(
            trainable,
            lr=learning_rate,
            initial_accumulator_value=config.adagrad_init_acc)

        if model_file_path is None:
            return 0, 0

        # The identity map_location keeps the deserialized storages where
        # torch.load first puts them instead of restoring the saved device.
        checkpoint = torch.load(
            model_file_path,
            map_location=lambda storage, location: storage)
        resume_iter = checkpoint['iter']
        resume_loss = checkpoint['current_loss']

        # The saved optimizer state is only restored when coverage is off.
        if not config.is_coverage:
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            if use_cuda:
                # Move every tensor kept in the optimizer state to the GPU.
                for slot in self.optimizer.state.values():
                    for key, value in slot.items():
                        if torch.is_tensor(value):
                            slot[key] = value.cuda()

        return resume_iter, resume_loss
예제 #2
0
    def setup_train(self, model_file_path=None):
        """Create the model, its Adam optimizer and the checkpoint helper.

        Args:
            model_file_path: Path of a checkpoint readable by ``torch.load``,
                or ``None`` to start from scratch. The same value is handed
                to ``Model``.

        Returns:
            A ``(start_iter, start_training_loss, start_eval_loss)`` tuple
            read from the checkpoint entries ``'iter'``,
            ``'current_train_loss'`` and ``'current_eval_loss'``; all three
            are ``0`` when no checkpoint path is given.
        """
        self.model = Model(model_file_path, vectors=self.vectors)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())

        trainable_count = sum(p.numel() for p in params if p.requires_grad)
        print(f"Parameters count: {trainable_count}")

        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adam(params, lr=initial_lr)
        start_iter, start_training_loss, start_eval_loss = 0, 0, 0

        if model_file_path is not None:
            # The identity map_location keeps the deserialized storages where
            # torch.load first puts them instead of restoring the saved device.
            checkpoint = torch.load(model_file_path, map_location=lambda storage, location: storage)
            start_iter = checkpoint['iter']
            start_training_loss = checkpoint['current_train_loss']
            start_eval_loss = checkpoint['current_eval_loss']

            # The saved optimizer state is only restored when coverage is off.
            if not config.is_coverage:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
                if use_cuda:
                    # Move every tensor kept in the optimizer state to the GPU.
                    # A distinct loop name avoids shadowing the checkpoint dict.
                    for param_state in self.optimizer.state.values():
                        for k, v in param_state.items():
                            if isinstance(v, torch.Tensor):
                                param_state[k] = v.cuda()

        # NOTE: the attribute keeps its original (misspelled) name because
        # code outside this method may read ``self.chechpoint``.
        # A stored eval loss of 0 is treated as "no eval loss yet" and passed
        # on as +inf.
        self.chechpoint = Checkpoint(self.model,
                                     self.optimizer,
                                     self.model_dir,
                                     start_eval_loss if start_eval_loss != 0 else float("inf"))

        return start_iter, start_training_loss, start_eval_loss
예제 #3
0
    def __init__(self, model_file_path, model_type="stem", load_batcher=True):
        """Load the vocabulary, optionally a decode batcher, and the model.

        Args:
            model_file_path: Checkpoint path forwarded to ``Model``.
            model_type: Label stored unchanged on ``self.model_type``.
            load_batcher: When true, build a single-pass ``Batcher`` in
                ``'decode'`` mode over ``config.decode_data_path``; when
                false, ``self.batcher`` is not set at all.
        """
        self.model_type = model_type
        self.vocab = Vocab(config.vocab_path, config.vocab_size)

        if load_batcher:
            batcher_options = dict(mode='decode',
                                   batch_size=config.beam_size,
                                   single_pass=True)
            self.batcher = Batcher(config.decode_data_path, self.vocab,
                                   **batcher_options)
            # Fixed pause after creating the batcher
            # (presumably lets it pre-fill its queue -- TODO confirm).
            time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)
예제 #4
0
    def __init__(self, model_file_path, is_word_level, is_combined, alpha):
        """Prepare vocabulary, validation data, output directory and model.

        Args:
            model_file_path: Checkpoint path forwarded to ``Model``; its base
                name also names the evaluation directory.
            is_word_level: Stored unchanged on ``self.is_word_level``.
            is_combined: Stored unchanged on ``self.is_combined``.
            alpha: Stored unchanged on ``self.alpha``.
        """
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.dataset = DailyMailDataset("val", self.vocab)

        self.is_word_level = is_word_level
        self.is_combined = is_combined
        self.alpha = alpha

        # Create <log_root>/eval_<model name>. makedirs(exist_ok=True) avoids
        # the check-then-create race and also creates a missing log_root.
        model_name = os.path.basename(model_file_path)
        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        os.makedirs(eval_dir, exist_ok=True)

        self.model = Model(model_file_path, is_eval=True)
예제 #5
0
def predict(sentence, model_path):
    """Set up model, vocabulary and a one-example dataset for ``sentence``.

    Only the setup part of this function is visible here; the steps that
    would produce and return a prediction are not shown.

    Args:
        sentence: Source text; stored under the ``'src'`` key of the single
            example that is built.
        model_path: Path to a saved checkpoint. It is passed to ``Model`` and
            also read with ``torch.load`` to obtain the ``'vocab'`` entry.

    Raises:
        Exception: If ``model_path`` does not exist on disk.
    """
    if not os.path.exists(model_path):
        raise Exception("Need to provide model path")
    # NOTE(review): elsewhere in this code base ``Model`` is built from an
    # ``args`` object rather than a path -- confirm which signature applies.
    model = Model(model_path)
    # The identity map_location keeps the deserialized storages where
    # torch.load first puts them instead of restoring the saved device.
    checkpoint = torch.load(model_path,
                            map_location=lambda storage, location: storage)
    vocab = checkpoint['vocab']

    # Target-side field, delimited by the decoding start/stop tokens.
    target_field = Field(sequential=True,
                         init_token=START_DECODING,
                         eos_token=STOP_DECODING,
                         pad_token=PAD_TOKEN,
                         batch_first=True,
                         include_lengths=True,
                         unk_token=UNKNOWN_TOKEN,
                         lower=True)

    # Source-side field, delimited by the sentence start/end tokens.
    source_field = Field(sequential=True,
                         init_token=SENTENCE_START,
                         eos_token=SENTENCE_END,
                         pad_token=PAD_TOKEN,
                         batch_first=True,
                         include_lengths=True,
                         unk_token=UNKNOWN_TOKEN,
                         lower=True)

    # Both fields share the vocabulary stored in the checkpoint.
    source_field.vocab = vocab
    target_field.vocab = vocab
    # Wrap the input as a single example with an empty target.
    data = [{'src': sentence, 'tgt': ''}]
    predict_data = Mydataset(data=data,
                             fields=(('source', source_field), ('target',
                                                                target_field)))

    # NOTE(review): ``args`` and ``vectors`` are not defined in the visible
    # part of this function -- presumably module-level names; confirm.
    setattr(args, 'vectors', source_field.vocab.vectors)
    setattr(args, 'vocab_size', len(source_field.vocab.itos))
    setattr(args, 'emb_dim', vectors.dim)
                        dest="is_word_level",
                        action="store_true")
    parser.add_argument("--combined", dest="is_combined", action="store_true")
    parser.set_defaults(is_word_level=False)
    parser.set_defaults(is_combined=False)

    args = parser.parse_args()

    seq2seq_checkpoint_file = "./Seq2Seq_model_50000"
    pg_losses = [
    ]  #pickle.load(open("/home/lgpu0231/dumps_model_12_16_11_08/pg_losses_350.p", 'rb'))
    run_avg_losses = [
    ]  #pickle.load(open("/home/lgpu0231/dumps_model_12_16_11_08/run_avg_losses_350.p", 'rb'))

    # Model
    model = Model(seq2seq_checkpoint_file)
    # model = Model()

    # Load data
    trainer = TrainSeq2Seq(is_word_level=args.is_word_level,
                           is_combined=args.is_combined)
    # Prepare for training (e.g. optimizer)
    iter, running_avg_loss = trainer.setup(model, model_file_path=None)

    # GENERATOR MLE TRAINING - Pretrain
    print('Starting Generator MLE Training...')
    #trainer.train_nll(MLE_TRAIN_EPOCHS, iter, running_avg_loss)

    # ADVERSARIAL TRAINING
    print('\nStarting PG Training...')
    trainer.train_pg(PG_TRAIN_EPOCHS, iter, running_avg_loss, pg_losses,
예제 #7
0
def train():
    """Train the model on the in-car Alexa data with teacher forcing.

    Steps:
      1. Load the pickled train/dev/test splits with ``load_data``.
      2. Build one vocabulary (train + dev, source and target tokens) with
         pretrained vectors read from a local ``.vec`` file.
      3. Wrap each split in ``Mydataset`` and create bucketed iterators.
      4. Run 500 epochs with Adagrad, logging a running average loss through
         ``calc_running_avg_loss`` and calling ``save_model`` every 300
         optimizer steps.

    The function takes no arguments and returns nothing; all settings come
    from ``config`` and the hard-coded paths below.
    """
    target_field = Field(sequential=True,
                         init_token=START_DECODING,
                         eos_token=STOP_DECODING,
                         pad_token=PAD_TOKEN,
                         batch_first=True,
                         include_lengths=True,
                         unk_token=UNKNOWN_TOKEN,
                         lower=True)

    source_field = Field(sequential=True,
                         init_token=SENTENCE_START,
                         eos_token=SENTENCE_END,
                         pad_token=PAD_TOKEN,
                         batch_first=True,
                         include_lengths=True,
                         unk_token=UNKNOWN_TOKEN,
                         lower=True)
    train_path = '../data/incar_alexa/train_public.pickle'
    dev_path = '../data/incar_alexa/dev_public.pickle'
    test_path = '../data/incar_alexa/test_public.pickle'
    summary_writer = SummaryWriter(config.summary_path)

    train_src, train_tgt, train_id = load_data(train_path)
    dev_src, dev_tgt, dev_id = load_data(dev_path)
    test_src, test_tgt, test_id = load_data(test_path)

    # Only train and dev text feeds the vocabulary; test text is left out.
    train_src_preprocessed = [source_field.preprocess(x) for x in train_src]
    dev_src_preprocessed = [source_field.preprocess(x) for x in dev_src]
    train_tgt_preprocessed = [target_field.preprocess(x) for x in train_tgt]
    dev_tgt_preprocessed = [target_field.preprocess(x) for x in dev_tgt]

    vectors = Vectors(
        name='/home/binhna/Downloads/shared_resources/cc.en.300.vec',
        cache='/home/binhna/Downloads/shared_resources/')

    # Source and target fields are built from the same token collections.
    vocab_corpus = [
        train_src_preprocessed, dev_src_preprocessed, train_tgt_preprocessed,
        dev_tgt_preprocessed
    ]
    source_field.build_vocab(vocab_corpus, vectors=vectors)
    target_field.build_vocab(vocab_corpus, vectors=vectors)

    fields = (('source', source_field), ('target', target_field))

    train_data = [{
        'src': src,
        'tgt': tgt,
        'id': example_id
    } for src, tgt, example_id in zip(train_src, train_tgt, train_id)]
    train_data = Mydataset(data=train_data, fields=fields)

    dev_data = [{
        'src': src,
        'tgt': tgt,
        'id': example_id
    } for src, tgt, example_id in zip(dev_src, dev_tgt, dev_id)]
    dev_data = Mydataset(data=dev_data, fields=fields)

    test_data = [{
        'src': src,
        'tgt': tgt,
        'id': example_id
    } for src, tgt, example_id in zip(test_src, test_tgt, test_id)]
    test_data = Mydataset(data=test_data, fields=fields)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # test_iter and dev_iter are created here but not consumed below.
    train_iter, test_iter, dev_iter = BucketIterator.splits(
        datasets=(train_data, test_data, dev_data),
        batch_sizes=(config.batch_size, config.batch_size, config.batch_size),
        device=device,
        sort_key=lambda x: len(x.source),
        sort_within_batch=True)

    args = ARGS()
    setattr(args, 'vectors', source_field.vocab.vectors)
    setattr(args, 'vocab_size', len(source_field.vocab.itos))
    setattr(args, 'emb_dim', vectors.dim)
    model = Model(args)

    params = list(model.encoder.parameters()) + list(
        model.decoder.parameters()) + list(model.reduce_state.parameters())
    initial_lr = config.lr_coverage if config.is_coverage else config.lr
    optimizer = Adagrad(params,
                        lr=initial_lr,
                        initial_accumulator_value=config.adagrad_init_acc)

    # ``step`` counts optimizer updates across all epochs.
    step, running_avg_loss = 0, 0
    for epoch in range(500):
        print(f"Epoch: {epoch+1}")
        for batch in tqdm(train_iter, total=len(train_iter)):
            # PyTorch accumulates gradients across backward() calls, so they
            # must be cleared before each batch.
            optimizer.zero_grad()

            batch_size = batch.batch_size
            # encoder part
            enc_padding_mask = get_mask(batch.source, device)
            enc_batch = batch.source[0]
            enc_lens = batch.source[1]
            encoder_outputs, encoder_feature, encoder_hidden = model.encoder(
                enc_batch, enc_lens)
            s_t_1 = model.reduce_state(encoder_hidden)
            coverage = torch.zeros(enc_batch.size(), device=device)
            c_t_1 = torch.zeros((batch_size, 2 * config.hidden_dim),
                                device=device)
            extra_zeros, enc_batch_extend_vocab, max_art_oovs = get_extra_features(
                enc_batch, source_field.vocab)
            extra_zeros = extra_zeros.to(device)
            enc_batch_extend_vocab = enc_batch_extend_vocab.to(device)

            # decoder part
            dec_batch = batch.target[0][:, :-1]
            # NOTE(review): with ``0:`` the gold token at step ``di`` is the
            # same token that is fed as decoder input at ``di``. A one-step
            # shift (``1:``, with matching mask/lengths) looks intended --
            # confirm against ``get_mask`` before changing.
            target_batch = batch.target[0][:, 0:]
            dec_lens_var = batch.target[1]
            dec_padding_mask = get_mask(batch.target, device)
            max_dec_len = max(dec_lens_var)

            step_losses = []
            for di in range(min(max_dec_len, config.max_dec_steps) - 1):
                y_t_1 = dec_batch[:, di]  # Teacher forcing
                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = model.decoder(
                    y_t_1, s_t_1, encoder_outputs, encoder_feature,
                    enc_padding_mask, c_t_1, extra_zeros,
                    enc_batch_extend_vocab, coverage, di)
                target = target_batch[:, di]
                # squeeze(1) drops only the gathered axis; a bare squeeze()
                # would also drop the batch axis when batch_size == 1.
                gold_probs = torch.gather(final_dist, 1,
                                          target.unsqueeze(1)).squeeze(1)
                step_loss = -torch.log(gold_probs + config.eps)
                if config.is_coverage:
                    step_coverage_loss = torch.sum(
                        torch.min(attn_dist, coverage), 1)
                    step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                    coverage = next_coverage

                # Mask out the loss at padded positions.
                step_mask = dec_padding_mask[:, di]
                step_loss = step_loss * step_mask
                step_losses.append(step_loss)

            # Per-example loss normalised by target length, then batch mean.
            sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
            batch_avg_loss = sum_losses / dec_lens_var
            loss = torch.mean(batch_avg_loss)

            loss.backward()

            # Each sub-module is clipped separately, as in the original code.
            clip_grad_norm_(model.encoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.decoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.reduce_state.parameters(),
                            config.max_grad_norm)

            optimizer.step()

            running_avg_loss = calc_running_avg_loss(loss.item(),
                                                     running_avg_loss,
                                                     summary_writer, step)
            step += 1
            summary_writer.flush()
            if step % 300 == 0:
                save_model(model, optimizer, running_avg_loss, step,
                           config.model_dir)
예제 #8
0
    dev_data = Mydataset(data=dev_data,
                         fields=(('source', source_field), ('target',
                                                            target_field)))

    test_data = [{
        'src': src,
        'tgt': tgt,
        'id': id
    } for src, tgt, id in zip(test_src, test_tgt, test_id)]
    test_data = Mydataset(data=test_data,
                          fields=(('source', source_field), ('target',
                                                             target_field)))

    setattr(args, 'vectors', source_field.vocab.vectors)
    setattr(args, 'vocab_size', len(source_field.vocab.itos))
    setattr(args, 'emb_dim', vectors.dim)

    model = Model(args)
    trainer = Trainer(model=model,
                      args=args,
                      train_dataset=train_data,
                      eval_dataset=dev_data,
                      test_dataset=test_data,
                      vocab=source_field.vocab,
                      is_train=True)
    trainer.train()
    # for name in ['train', 'dev', 'test']:
    #     process_incar_data(f'../data/incar_alexa/{name}_public.json')
    # vocabs = read_vocabs('../data/finished_files/vocab')
    # print(len(vocabs))