Example #1
    def test(self):
        """
        Run trained model on test set
        """
        #
        # Setup data, logging
        #
        self.dataset = SummDatasetFactory.get(self.opt.dataset)
        test_iter = self.dataset.get_data_loader(split='test',
                                                 sample_reviews=False,
                                                 n_docs=1,
                                                 batch_size=self.hp.batch_size,
                                                 shuffle=False)

        tb_path = os.path.join(self.save_dir, 'tensorboard/test/')
        if not os.path.exists(tb_path):
            os.makedirs(tb_path)
        self.tb_test_writer = SummaryWriter(tb_path)

        #
        # Get model and loss
        #
        self.model = torch.load(self.opt.load_test_model)['model']
        if self.hp.clf_mse:
            self.loss_fn = nn.MSELoss()
        else:
            self.loss_fn = nn.CrossEntropyLoss()
        if torch.cuda.is_available():
            self.model.cuda()
        if len(self.opt.gpus) > 1:
            self.model = nn.DataParallel(self.model)

        n_params = sum([p.nelement() for p in self.model.parameters()])
        print('Number of parameters: {}'.format(n_params))

        #
        # Test
        #
        self.model.eval()
        with torch.no_grad():
            loss_avg, acc_avg, rating_diff_avg, per_rating_acc = self.run_epoch(
                test_iter,
                test_iter.__len__(),
                0,
                'test',
                tb_writer=self.tb_test_writer,
                save_intermediate=False)
        self.tb_test_writer.add_scalar('overall/loss', loss_avg, 0)
        self.tb_test_writer.add_scalar('overall/acc', acc_avg, 0)
        self.tb_test_writer.add_scalar('overall/rating_diff', rating_diff_avg,
                                       0)
        for r, acc in per_rating_acc.items():
            self.tb_test_writer.add_scalar(
                'overall/per_rating_acc_{}_stars'.format(r), acc, 0)
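Below is a minimal, self-contained sketch of how a per-rating accuracy dict like the per_rating_acc logged above could be computed; it is an illustrative assumption, not the repo's actual run_epoch implementation.

from collections import defaultdict

def per_rating_accuracy(preds, labels):
    # Accuracy bucketed by the true star rating (e.g. 1-5)
    correct, total = defaultdict(int), defaultdict(int)
    for p, y in zip(preds, labels):
        total[y] += 1
        correct[y] += int(p == y)
    return {r: correct[r] / total[r] for r in total}

print(per_rating_accuracy([5, 4, 3, 5], [5, 5, 3, 5]))  # {5: 0.67, 3: 1.0} (approx.)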
Example #2
def main(opt):
    if opt.dataset:
        dataset = SummDatasetFactory.get(opt.dataset)
        dl = dataset.get_data_loader(split='lm',
                                     n_docs=opt.n_docs,
                                     sample_reviews=False,
                                     batch_size=1,
                                     num_workers=0,
                                     shuffle=False)
        print('Writing reviews to file')
        with open('/tmp/{}_data.txt'.format(opt.dataset),
                  'w',
                  encoding='utf-8') as f:
            for texts, ratings, metadata in dl:
                f.write('{}\n'.format(texts[0]))
        print('Creating token counts')
        token_counts = tokenizer.corpus_token_counts('/tmp/{}_data.txt'.format(
            opt.dataset),
                                                     opt.corpus_max_lines,
                                                     split_on_newlines=True)
    elif opt.corpus_filepattern:
        token_counts = tokenizer.corpus_token_counts(opt.corpus_filepattern,
                                                     opt.corpus_max_lines,
                                                     split_on_newlines=True)
    else:
        raise ValueError(
            'Must provide --dataset or provide --corpus_filepattern')

    print('Building to target size')
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        opt.target_size, token_counts, 0, 1e9, reserved_tokens=RESERVED_TOKENS)

    print('Saving tokenizer')
    vocab_fp = os.path.join(opt.output_dir,
                            opt.output_fn + '.txt')  # stores vocab counts
    encoder.store_to_file(vocab_fp)
    enc_fp = os.path.join(opt.output_dir, opt.output_fn + '.pkl')
    save_file(encoder, enc_fp, verbose=True)

    pdb.set_trace()
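As a usage note (an assumption, not part of the example): if save_file() above simply pickles the SubwordTextEncoder, the stored encoder can be reloaded and reused as sketched here; the file path is hypothetical.

import pickle

with open('subwordenc.pkl', 'rb') as f:  # hypothetical path to the .pkl saved above
    subwordenc = pickle.load(f)

ids = subwordenc.encode('The battery life is great.')  # list of subword token ids
print(ids)
print(subwordenc.decode(ids))  # decodes back to (approximately) the original text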
Example #3
    def print_original_data_stats(self):
        """
        Calculate and print some statistics on the original dataset
        """
        if self.reviews is None:
            self.reviews, self.item_to_reviews = AmazonDataset.load_all_reviews()
        lens = []
        for rev in self.reviews:
            lens.append(len(self.subwordenc.encode(rev['reviewText'])))

        # Length (in subword tokens) of each review: median, 75th and 90th percentile
        print(np.median(lens))
        print(np.percentile(lens, 75))
        print(np.percentile(lens, 90))
        pdb.set_trace()


if __name__ == '__main__':
    from data_loaders.summ_dataset_factory import SummDatasetFactory

    hp = HParams()
    ds = SummDatasetFactory.get('amazon')
    # ds.save_processed_splits()
    ds.print_original_data_stats()
    # ds.print_filtered_data_stats()

    # test_dl = ds.get_data_loader(split='test', n_docs=8, sample_reviews=True,
    #                              category='Electronics',
    #                              batch_size=4, shuffle=True)
    # for texts, ratings, metadata in test_dl:
    #     x, lengths, labels = ds.prepare_batch(texts, ratings)
    #     pdb.set_trace()
Example #4
    def train(self):
        """
        Main train loop
        """
        #
        # Get data, setup
        #

        self.dataset = SummDatasetFactory.get(self.opt.dataset,
                                              '../datasets/yelp_dataset/')
        subwordenc = self.dataset.subwordenc
        train_iter = self.dataset.get_data_loader(
            split='train',
            n_docs=self.hp.n_docs,
            sample_reviews=True,
            batch_size=self.hp.batch_size,
            shuffle=True)
        train_nbatches = train_iter.__len__()
        val_iter = self.dataset.get_data_loader(split='val',
                                                n_docs=self.hp.n_docs,
                                                sample_reviews=False,
                                                batch_size=self.hp.batch_size,
                                                shuffle=False)
        val_nbatches = val_iter.__len__()

        tb_path = os.path.join(self.save_dir, 'tensorboard/')
        print('Tensorboard events will be logged to: {}'.format(tb_path))
        os.mkdir(tb_path)
        os.mkdir(tb_path + 'train/')
        os.mkdir(tb_path + 'val/')
        self.tb_tr_writer = SummaryWriter(tb_path + 'train/')
        self.tb_val_writer = SummaryWriter(tb_path + 'val/')

        #
        # Get model and loss
        #
        if len(self.opt.load_model) > 0:
            raise NotImplementedError(
                'Need to save run to same directory, handle changes in hp, etc.'
            )
            # checkpoint = torch.load(opt.load_model)
            # self.model = checkpoint['model']
        else:
            if self.hp.model_type == 'mlstm':
                embed = nn.Embedding(subwordenc.vocab_size, self.hp.emb_size)
                lstm = StackedLSTM(mLSTM,
                                   self.hp.lstm_layers,
                                   self.hp.emb_size,
                                   self.hp.hidden_size,
                                   subwordenc.vocab_size,
                                   self.hp.lstm_dropout,
                                   layer_norm=self.hp.lstm_ln)
                self.model = StackedLSTMEncoder(embed, lstm)
                self.loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID)
            elif self.hp.model_type == 'transformer':
                self.model = make_model(subwordenc.vocab_size,
                                        subwordenc.vocab_size,
                                        N=self.hp.tsfr_blocks,
                                        d_model=self.hp.hidden_size,
                                        d_ff=self.hp.tsfr_ff_size,
                                        dropout=self.hp.tsfr_dropout,
                                        tie_embs=self.hp.tsfr_tie_embs,
                                        decoder_only=True)
                self.loss_fn = LabelSmoothing(
                    size=subwordenc.vocab_size,
                    smoothing=self.hp.tsfr_label_smooth)
        if torch.cuda.is_available():
            self.model.cuda()
        self.ngpus = 1
        if len(self.opt.gpus) > 1:
            self.ngpus = len(self.opt.gpus.split(','))
            self.model = DataParallelModel(self.model)
            self.loss_fn = DataParallelCriterion(self.loss_fn)

        n_params = sum([p.nelement() for p in self.model.parameters()])
        print('Number of parameters: {}'.format(n_params))

        #
        # Get optimizer
        #
        if self.hp.optim == 'normal':
            self.optimizer = OptWrapper(
                self.model, self.hp.lm_clip,
                optim.Adam(self.model.parameters(), lr=self.hp.lm_lr))
        elif self.hp.optim == 'noam':
            d_model = self.model.module.tgt_embed[0].d_model if self.ngpus > 1 else \
                self.model.tgt_embed[0].d_model
            self.optimizer = NoamOpt(
                d_model, 2, self.hp.noam_warmup,
                torch.optim.Adam(self.model.parameters(),
                                 lr=0,
                                 betas=(0.9, 0.98),
                                 eps=1e-9))

        #
        # Train epochs
        #
        for e in range(self.hp.max_nepochs):
            try:
                self.model.train()
                loss_avg = self.run_epoch(train_iter,
                                          train_nbatches,
                                          e,
                                          'train',
                                          optimizer=self.optimizer,
                                          tb_writer=self.tb_tr_writer)
                self.tb_tr_writer.add_scalar('overall_stats/loss_avg',
                                             loss_avg, e)

            except KeyboardInterrupt:
                print('Exiting from training early')

            self.model.eval()
            loss_avg = self.run_epoch(val_iter,
                                      val_nbatches,
                                      e,
                                      'val',
                                      optimizer=None)
            self.tb_val_writer.add_scalar('overall_stats/loss_avg', loss_avg,
                                          e)
            save_model(self.save_dir, self.model, self.optimizer, e, self.opt,
                       loss_avg)
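For reference, a minimal sketch of the "noam" learning-rate schedule that NoamOpt above is assumed to implement (as in the Annotated Transformer), with factor=2 matching the call above; the repo's exact NoamOpt internals are not shown here.

def noam_lr(step, d_model, warmup, factor=2):
    # factor * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
    return factor * (d_model ** -0.5) * min(step ** -0.5, step * warmup ** -1.5)

for step in (1, 100, 4000, 10000):
    print(step, noam_lr(step, d_model=512, warmup=4000))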
Example #5
    def train(self):
        """
        Main train loop
        """
        #
        # Get data, setup
        #

        # NOTE: Use n_docs=1 so we can classify one review
        self.dataset = SummDatasetFactory.get(self.opt.dataset, self.opt.dir_path)
        train_iter = self.dataset.get_data_loader(split='train', sample_reviews=True, n_docs=1,
                                                  batch_size=self.hp.batch_size, shuffle=True)
        val_iter = self.dataset.get_data_loader(split='val', sample_reviews=False, n_docs=1,
                                                batch_size=self.hp.batch_size, shuffle=False)

        self.tb_tr_writer = None
        self.tb_val_writer = None
        tb_path = os.path.join(self.save_dir, 'tensorboard/')
        print('Tensorboard events will be logged to: {}'.format(tb_path))
        os.mkdir(tb_path)
        os.mkdir(tb_path + 'train/')
        os.mkdir(tb_path + 'val/')
        self.tb_tr_writer = SummaryWriter(tb_path + 'train/')
        self.tb_val_writer = SummaryWriter(tb_path + 'val/')

        #
        # Get model and loss
        #
        if len(self.opt.load_train_model) > 0:
            raise NotImplementedError('Need to save run to same directory, handle changes in hp, etc.')
            # checkpoint = torch.load(opt.load_model)
            # self.model = checkpoint['model']
        else:
            if self.hp.model_type == 'cnn':
                cnn_output_size = self.hp.cnn_n_feat_maps * len(self.hp.cnn_filter_sizes)
                self.model = TextClassifier(self.dataset.subwordenc.vocab_size, self.hp.emb_size,
                                            self.hp.cnn_filter_sizes, self.hp.cnn_n_feat_maps, self.hp.cnn_dropout,
                                            cnn_output_size, self.dataset.n_ratings_labels,
                                            onehot_inputs=self.hp.clf_onehot, mse=self.hp.clf_mse)

        if self.hp.clf_mse:
            self.loss_fn = nn.MSELoss()
        else:
            self.loss_fn = nn.CrossEntropyLoss()
        if torch.cuda.is_available():
            self.model.cuda()
        if len(self.opt.gpus) > 1:
            self.model = nn.DataParallel(self.model)

        n_params = sum([p.nelement() for p in self.model.parameters()])
        print('Number of parameters: {}'.format(n_params))

        #
        # Get optimizer
        #
        self.optimizer = OptWrapper(
            self.model,
            self.hp.clf_clip,
            optim.Adam(self.model.parameters(), lr=self.hp.clf_lr))

        #
        # Train epochs
        #
        for e in range(self.hp.max_nepochs):
            try:
                self.model.train()
                loss_avg, acc_avg, rating_diff_avg, per_rating_acc = self.run_epoch(
                    train_iter, train_iter.__len__(), e, 'train',
                    optimizer=self.optimizer, tb_writer=self.tb_tr_writer)
                self.tb_tr_writer.add_scalar('overall/loss', loss_avg, e)
                self.tb_tr_writer.add_scalar('overall/acc', acc_avg, e)
                self.tb_tr_writer.add_scalar('overall/rating_diff', rating_diff_avg, e)
                for r, acc in per_rating_acc.items():
                    self.tb_tr_writer.add_scalar('overall/per_rating_acc_{}_stars'.format(r), acc, e)
            except KeyboardInterrupt:
                print('Exiting from training early')

            self.model.eval()
            loss_avg, acc_avg, rating_diff_avg, per_rating_acc = self.run_epoch(
                val_iter, val_iter.__len__(), e, 'val', optimizer=None)
            self.tb_val_writer.add_scalar('overall/loss', loss_avg, e)
            self.tb_val_writer.add_scalar('overall/acc', acc_avg, e)
            self.tb_val_writer.add_scalar('overall/rating_diff', rating_diff_avg, e)
            for r, acc in per_rating_acc.items():
                self.tb_val_writer.add_scalar('overall/per_rating_acc_{}_stars'.format(r), acc, e)
            fn_str = 'l{:.4f}_a{:.4f}_d{:.4f}'.format(loss_avg, acc_avg, rating_diff_avg)
            model_to_save = self.model.module if len(self.opt.gpus) > 1 else self.model
            save_model(self.save_dir, model_to_save, self.optimizer, e, self.opt, fn_str)
Example #6
    def __init__(self, hp, opt):
        self.hp = hp
        self.opt = opt

        self.dataset = SummDatasetFactory.get(opt.dataset)
Example #7
                    rating_to_count[ratings[i].item()] += 1

        print('Number of reviews per star rating:')
        for rating, count in sorted(rating_to_count.items()):
            print('-- {} stars: {} reviews; {:.2f} of dataset'.format(
                rating, count,
                float(count) / len(all_rev_lens)))
        print('Length of review:')
        print('-- mean: {}'.format(np.mean(all_rev_lens)))
        print('-- 75th percentile: {}'.format(np.percentile(all_rev_lens, 75)))
        print('-- 90th percentile: {}'.format(np.percentile(all_rev_lens, 90)))


if __name__ == '__main__':
    from data_loaders.summ_dataset_factory import SummDatasetFactory

    hp = HParams()
    ds = SummDatasetFactory.get('yelp')
    ds.save_processed_splits()
    # ds.print_original_data_stats()
    # ds.print_filtered_data_stats()

    # Variable batch size and n_docs
    # test_dl = ds.get_data_loader(split='test', n_docs_min=4, n_docs_max=16, sample_reviews=True,
    #                              batch_size=1, shuffle=False)
    # test_dl = ds.get_data_loader(split='test', n_docs=8, sample_reviews=False,
    #                              batch_size=1, shuffle=False)
    # for texts, ratings, metadata in test_dl:
    #     x, lengths, labels = ds.prepare_batch(texts, ratings)
    #     pdb.set_trace()