Example #1
    def train(self, training_data):
        x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
            training_data
        )
        self.class_to_i = class_to_i
        self.i_to_class = i_to_class
        self.vocab = vocab

        embeddings, embedding_lookup = load_embeddings(vocab=vocab, expand_glove=True)
        self.embeddings = embeddings
        self.embedding_lookup = embedding_lookup

        x_train = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_train_text]
        for row in x_train:
            if len(row) == 0:
                row.append(embedding_lookup['UNK'])
        x_train = np.array(x_train)
        y_train = np.array(y_train)

        mems_train = load_memories([' '.join(x) for x in x_train_text], self.n_memories)
        mems_indices_train = memories_to_indices(mems_train, embedding_lookup)

        x_test = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_test_text]
        for row in x_test:
            if len(row) == 0:
                row.append(embedding_lookup['UNK'])
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        mems_test = load_memories([' '.join(x) for x in x_test_text], self.n_memories)
        mems_indices_test = memories_to_indices(mems_test, embedding_lookup)

        self.n_classes = compute_n_classes(training_data[1])
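
Every variant in this gallery appends the UNK index to rows that come back empty, which only makes sense if the conversion helper silently drops out-of-vocabulary tokens. A minimal sketch of that assumed behavior (the project's real convert_text_to_embeddings_indices is not shown on this page):

def convert_text_to_embeddings_indices(tokens, embedding_lookup):
    # Assumed behavior: map each token to its embedding row index and drop
    # out-of-vocabulary tokens, leaving the caller to UNK-pad empty rows.
    return [embedding_lookup[t] for t in tokens if t in embedding_lookup]
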
Example #2
File: cnn.py Project: amit2014/qb
    def train(self, training_data: TrainingData) -> None:
        log.info('Preprocessing training data...')
        x_train, y_train, x_test, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
            training_data, create_runs=self.train_on_q_runs, full_question=self.train_on_full_q)
        if self.wiki_data_frac > 0:
            log.info('Using wikipedia with fraction: {}'.format(self.wiki_data_frac))
            wiki_data = FilteredWikipediaDataset().training_data()
            results = preprocess_dataset(
                wiki_data,
                train_size=1,
                vocab=vocab,
                class_to_i=class_to_i,
                i_to_class=i_to_class)
            x_train.extend(results[0])
            y_train.extend(results[1])

        self.class_to_i = class_to_i
        self.i_to_class = i_to_class
        self.vocab = vocab

        log.info('Creating embeddings...')
        embeddings, embedding_lookup = load_embeddings(vocab=vocab, expand_glove=self.expand_we, mask_zero=True)
        self.embeddings = embeddings
        self.embedding_lookup = embedding_lookup

        log.info('Converting dataset to embeddings...')
        x_train = [nn.convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_train]
        x_test = [nn.convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_test]
        self.n_classes = nn.compute_n_classes(training_data[1])
        self.max_len = nn.compute_max_len(training_data)
        x_train = np.array(nn.tf_format(x_train, self.max_len, 0))
        x_test = np.array(nn.tf_format(x_test, self.max_len, 0))

        log.info('Building keras model...')
        self.model = self.build_model()

        log.info('Training model...')
        callbacks = [
            TensorBoard(),
            EarlyStopping(patience=self.max_patience, monitor='val_sparse_categorical_accuracy'),
            ModelCheckpoint(
                safe_path(CNN_MODEL_TMP_TARGET),
                save_best_only=True,
                monitor='val_sparse_categorical_accuracy'
            )
        ]
        if self.decay_lr_on_plateau:
            callbacks.append(ReduceLROnPlateau(monitor='val_sparse_categorical_accuracy', factor=.5, patience=5))
        history = self.model.fit(
            x_train, y_train,
            validation_data=(x_test, y_test),
            batch_size=self.batch_size, epochs=self.max_n_epochs,
            callbacks=callbacks, verbose=2
        )
        self.history = history.history
        log.info('Done training')
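
This Keras variant pads every question to a shared max_len with index 0, matching the mask_zero=True argument to load_embeddings so the embedding layer treats index 0 as padding. A plausible sketch of what nn.tf_format does, inferred from its call site rather than taken from the project:

def tf_format(x, max_len, pad_value):
    # Right-pad (and truncate) each index list to max_len so that
    # np.array(...) yields a dense (n_examples, max_len) matrix.
    return [row[:max_len] + [pad_value] * (max_len - len(row)) for row in x]
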
Example #3
    def train(self, training_data: TrainingData) -> None:

        if self.use_qb:
            x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
                training_data)
            if self.use_wiki:
                wiki_dataset = WikipediaDataset(set(training_data[1]))
                wiki_train_data = wiki_dataset.training_data()
                w_x_train_text, w_train_y, _, _, _, _, _ = preprocess_dataset(
                    wiki_train_data,
                    train_size=1,
                    vocab=vocab,
                    class_to_i=class_to_i,
                    i_to_class=i_to_class)
                x_train_text.extend(w_x_train_text)
                y_train.extend(w_train_y)
        else:
            if self.use_wiki:
                wiki_dataset = WikipediaDataset(set(training_data[1]))
                wiki_train_data = wiki_dataset.training_data()
                x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
                    wiki_train_data)
            else:
                raise ValueError(
                    'use_wiki and use_qb cannot both be false, otherwise there is no training data'
                )

        self.class_to_i = class_to_i
        self.i_to_class = i_to_class
        self.vocab = vocab

        embeddings, embedding_lookup = load_embeddings(vocab=vocab,
                                                       expand_glove=True)
        self.embeddings = embeddings
        self.embedding_lookup = embedding_lookup

        x_train = [
            convert_text_to_embeddings_indices(q, embedding_lookup)
            for q in x_train_text
        ]
        for r in x_train:
            if len(r) == 0:
                r.append(embedding_lookup['UNK'])
        x_train = np.array(x_train)
        y_train = np.array(y_train)

        x_test = [
            convert_text_to_embeddings_indices(q, embedding_lookup)
            for q in x_test_text
        ]
        for r in x_test:
            if len(r) == 0:
                r.append(embedding_lookup['UNK'])
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        self.n_classes = compute_n_classes(training_data[1])

        n_batches_train, t_x_train, t_offset_train, t_y_train = batchify(
            self.batch_size, x_train, y_train, truncate=True)
        n_batches_test, t_x_test, t_offset_test, t_y_test = batchify(
            self.batch_size, x_test, y_test, truncate=False)

        self.vocab_size = embeddings.shape[0]
        self.model = DanModel(self.vocab_size,
                              self.n_classes,
                              dropout_prob=self.dropout_prob,
                              k_softmaxes=self.k_softmaxes,
                              n_hidden_units=self.n_hidden_units,
                              non_linearity=self.non_linearity)
        log.info(f'Parameters:\n{pformat(self.parameters())}')
        log.info(f'Torch Model:\n{self.model}')
        self.model.init_weights(initial_embeddings=embeddings)
        if CUDA:
            self.model = self.model.cuda()

        self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
        # NLLLoss expects log-probabilities, so DanModel's forward pass must
        # end in log_softmax; the commented-out CrossEntropyLoss would instead
        # take raw logits and apply the log-softmax itself.
        self.criterion = nn.NLLLoss()
        # self.criterion = nn.CrossEntropyLoss()
        self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer,
                                                        patience=5,
                                                        verbose=True)
        tb_experiment = ' '.join(
            f'{param}={value}' for param, value in [
                ('model', 'dan'),
                ('n_hidden_units', self.n_hidden_units),
                ('dropout_prob', self.dropout_prob),
                ('k_softmaxes', self.k_softmaxes),
                ('non_linearity', self.non_linearity),
                ('learning_rate', self.learning_rate),
                ('batch_size', self.batch_size),
            ])

        manager = TrainingManager([
            BaseLogger(log_func=log.info),
            TerminateOnNaN(),
            EarlyStopping(monitor='test_loss', patience=10, verbose=1),
            MaxEpochStopping(100),
            ModelCheckpoint(create_save_model(self.model),
                            '/tmp/dan.pt',
                            monitor='test_loss'),
            Tensorboard(tb_experiment)
        ])

        log.info('Starting training...')
        while True:
            self.model.train()
            train_acc, train_loss, train_time = self.run_epoch(n_batches_train,
                                                               t_x_train,
                                                               t_offset_train,
                                                               t_y_train,
                                                               evaluate=False)

            self.model.eval()
            test_acc, test_loss, test_time = self.run_epoch(n_batches_test,
                                                            t_x_test,
                                                            t_offset_test,
                                                            t_y_test,
                                                            evaluate=True)

            stop_training, reasons = manager.instruct(train_time, train_loss,
                                                      train_acc, test_time,
                                                      test_loss, test_acc)

            if stop_training:
                log.info(' '.join(reasons))
                break
            else:
                self.scheduler.step(test_loss)

        log.info('Done training')
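
The (x, offset) pairs that batchify returns here match the input format of PyTorch's nn.EmbeddingBag: one flat index vector per batch plus the start position of every question, which lets a DAN average word vectors with no padding at all. A hypothetical sketch of that layout (flatten_batch is an invented name; the real batchify also shuffles and can truncate the last partial batch):

import torch

def flatten_batch(rows):
    # Flatten variable-length index lists into the (flat, offsets) pair that
    # torch.nn.EmbeddingBag(mode='mean') consumes; offsets[i] is where
    # question i starts inside the flat vector.
    offsets, total = [], 0
    for row in rows:
        offsets.append(total)
        total += len(row)
    flat = [i for row in rows for i in row]
    return torch.tensor(flat, dtype=torch.long), torch.tensor(offsets, dtype=torch.long)
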
Example #4
File: dan.py Project: amit2014/qb
    def train(self, training_data: TrainingData) -> None:
        x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
            training_data
        )

        self.class_to_i = class_to_i
        self.i_to_class = i_to_class
        self.vocab = vocab

        embeddings, embedding_lookup = load_embeddings(vocab=vocab, expand_glove=True)
        self.embeddings = embeddings
        self.embedding_lookup = embedding_lookup

        x_train = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_train_text]
        for r in x_train:
            if len(r) == 0:
                r.append(embedding_lookup['UNK'])
        x_train = np.array(x_train)
        y_train = np.array(y_train)

        x_test = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_test_text]
        for r in x_test:
            if len(r) == 0:
                r.append(embedding_lookup['UNK'])
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        self.n_classes = compute_n_classes(training_data[1])

        n_batches_train, t_x_train, t_offset_train, t_y_train = batchify(
            self.batch_size, x_train, y_train, truncate=True)
        n_batches_test, t_x_test, t_offset_test, t_y_test = batchify(
            self.batch_size, x_test, y_test, truncate=False)

        self.model = DanModel(embeddings.shape[0], self.n_classes)
        self.model.init_weights(initial_embeddings=embeddings)
        self.model.cuda()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.CrossEntropyLoss()

        manager = TrainingManager([
            BaseLogger(log_func=log.info), TerminateOnNaN(),
            EarlyStopping(monitor='test_acc', patience=10, verbose=1), MaxEpochStopping(100),
            ModelCheckpoint(create_save_model(self.model), '/tmp/dan.pt', monitor='test_acc')
            # Tensorboard('dan', log_dir='tb-logs')
        ])

        log.info('Starting training...')
        while True:
            self.model.train()
            train_acc, train_loss, train_time = self.run_epoch(
                n_batches_train,
                t_x_train, t_offset_train, t_y_train, evaluate=False
            )

            self.model.eval()
            test_acc, test_loss, test_time = self.run_epoch(
                n_batches_test,
                t_x_test, t_offset_test, t_y_test, evaluate=True
            )

            stop_training, reasons = manager.instruct(
                train_time, train_loss, train_acc,
                test_time, test_loss, test_acc
            )

            if stop_training:
                log.info(' '.join(reasons))
                break

        log.info('Done training')
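
The open-ended while True loop leaves every stopping decision to TrainingManager, whose callbacks vote through manager.instruct. A minimal sketch of the patience rule a callback like EarlyStopping(monitor='test_acc', patience=10) presumably applies (names and details are assumptions, not the project's code):

class PatienceStopper:
    def __init__(self, patience=10):
        self.patience = patience
        self.best = float('-inf')
        self.bad_epochs = 0

    def should_stop(self, value):
        # Reset the counter on any improvement; otherwise stop once the
        # monitored value has stagnated for `patience` consecutive epochs.
        if value > self.best:
            self.best = value
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1
        return self.bad_epochs >= self.patience
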
Example #5
    def train(self, training_data: TrainingData):
        x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
            training_data)

        if self.use_wiki:
            wiki_dataset = FilteredWikipediaDataset()
            wiki_train_data = wiki_dataset.training_data()
            w_x_train_text, w_train_y, _, _, _, _, _ = preprocess_dataset(
                wiki_train_data,
                train_size=1,
                vocab=vocab,
                class_to_i=class_to_i,
                i_to_class=i_to_class)
            x_train_text.extend(w_x_train_text)
            y_train.extend(w_train_y)

        self.class_to_i = class_to_i
        self.i_to_class = i_to_class
        self.vocab = vocab

        embeddings, embedding_lookup = load_embeddings(vocab=vocab,
                                                       expand_glove=True,
                                                       mask_zero=True)
        self.embeddings = embeddings
        self.embedding_lookup = embedding_lookup

        x_train = [
            convert_text_to_embeddings_indices(q,
                                               embedding_lookup,
                                               random_unk_prob=.05)
            for q in x_train_text
        ]
        for r in x_train:
            if len(r) == 0:
                r.append(embedding_lookup['UNK'])
        x_train = np.array(x_train)
        y_train = np.array(y_train)

        x_test = [
            convert_text_to_embeddings_indices(q,
                                               embedding_lookup,
                                               random_unk_prob=.05)
            for q in x_test_text
        ]
        for r in x_test:
            if len(r) == 0:
                r.append(embedding_lookup['UNK'])
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        self.n_classes = compute_n_classes(training_data[1])

        n_batches_train, t_x_train, lengths_train, t_y_train, _ = batchify(
            self.batch_size, x_train, y_train, truncate=True)
        n_batches_test, t_x_test, lengths_test, t_y_test, _ = batchify(
            self.batch_size, x_test, y_test, truncate=False)

        self.model = RnnModel(embeddings.shape[0], self.n_classes)
        self.model.init_weights(embeddings=embeddings)
        self.model.cuda()
        self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.CrossEntropyLoss()
        self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer,
                                                        'max',
                                                        patience=5,
                                                        verbose=True)

        manager = TrainingManager([
            BaseLogger(log_func=log.info),
            TerminateOnNaN(),
            EarlyStopping(monitor='test_acc', patience=10, verbose=1),
            MaxEpochStopping(100),
            ModelCheckpoint(create_save_model(self.model),
                            '/tmp/rnn.pt',
                            monitor='test_acc')
            # Tensorboard('rnn', log_dir='tb-logs')
        ])

        log.info('Starting training...')
        while True:
            self.model.train()
            train_acc, train_loss, train_time = self.run_epoch(n_batches_train,
                                                               t_x_train,
                                                               lengths_train,
                                                               t_y_train,
                                                               evaluate=False)

            self.model.eval()
            test_acc, test_loss, test_time = self.run_epoch(n_batches_test,
                                                            t_x_test,
                                                            lengths_test,
                                                            t_y_test,
                                                            evaluate=True)

            stop_training, reasons = manager.instruct(train_time, train_loss,
                                                      train_acc, test_time,
                                                      test_loss, test_acc)

            if stop_training:
                log.info(' '.join(reasons))
                break
            else:
                self.scheduler.step(test_acc)

        log.info('Done training')
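
Unlike the DAN examples, this one passes random_unk_prob=.05 when converting text, presumably a word-dropout regularizer; note that it is applied to the test split as well. A sketch of the assumed behavior:

import random

def convert_with_random_unk(tokens, embedding_lookup, random_unk_prob=0.05):
    # Assumed behavior: as in the plain conversion, out-of-vocabulary tokens
    # are dropped, but each kept token is replaced by UNK with probability
    # random_unk_prob so the model cannot over-rely on any single word.
    unk = embedding_lookup['UNK']
    return [
        unk if random.random() < random_unk_prob else embedding_lookup[t]
        for t in tokens if t in embedding_lookup
    ]
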
Example #6
File: dan.py Project: nadesai/qb
    def train(self, training_data: TrainingData) -> None:

        if self.use_qb:
            x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
                training_data
            )
            if self.use_wiki:
                wiki_dataset = WikipediaDataset(set(training_data[1]))
                wiki_train_data = wiki_dataset.training_data()
                w_x_train_text, w_train_y, _, _, _, _, _ = preprocess_dataset(
                    wiki_train_data, train_size=1, vocab=vocab, class_to_i=class_to_i, i_to_class=i_to_class
                )
                x_train_text.extend(w_x_train_text)
                y_train.extend(w_train_y)
        else:
            if self.use_wiki:
                wiki_dataset = WikipediaDataset(set(training_data[1]))
                wiki_train_data = wiki_dataset.training_data()
                x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
                    wiki_train_data
                )
            else:
                raise ValueError('use_wiki and use_qb cannot both be false, otherwise there is no training data')

        self.class_to_i = class_to_i
        self.i_to_class = i_to_class
        self.vocab = vocab

        embeddings, embedding_lookup = load_embeddings(vocab=vocab, expand_glove=True)
        self.embeddings = embeddings
        self.embedding_lookup = embedding_lookup

        x_train = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_train_text]
        for r in x_train:
            if len(r) == 0:
                r.append(embedding_lookup['UNK'])
        x_train = np.array(x_train)
        y_train = np.array(y_train)

        x_test = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_test_text]
        for r in x_test:
            if len(r) == 0:
                r.append(embedding_lookup['UNK'])
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        self.n_classes = compute_n_classes(training_data[1])

        log.info(f'Batching: {len(x_train)} train questions and {len(x_test)} test questions')

        n_batches_train, t_x_train, t_offset_train, t_y_train = batchify(
            self.batch_size, x_train, y_train, truncate=True)
        n_batches_test, t_x_test, t_offset_test, t_y_test = batchify(
            self.batch_size, x_test, y_test, truncate=False)

        self.vocab_size = embeddings.shape[0]
        self.model = DanModel(self.vocab_size, self.n_classes, embeddings=embeddings)
        if CUDA:
            self.model = self.model.cuda()
        log.info(f'Model:\n{self.model}')

        self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.CrossEntropyLoss()
        self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=5, verbose=True, mode='max')

        manager = TrainingManager([
            BaseLogger(log_func=log.info), TerminateOnNaN(),
            EarlyStopping(monitor='test_acc', patience=10, verbose=1), MaxEpochStopping(100),
            ModelCheckpoint(create_save_model(self.model), '/tmp/dan.pt', monitor='test_acc')
        ])

        log.info('Starting training...')
        while True:
            self.model.train()
            train_acc, train_loss, train_time = self.run_epoch(
                n_batches_train,
                t_x_train, t_offset_train, t_y_train, evaluate=False
            )

            self.model.eval()
            test_acc, test_loss, test_time = self.run_epoch(
                n_batches_test,
                t_x_test, t_offset_test, t_y_test, evaluate=True
            )

            stop_training, reasons = manager.instruct(
                train_time, train_loss, train_acc,
                test_time, test_loss, test_acc
            )

            if stop_training:
                log.info(' '.join(reasons))
                break
            else:
                self.scheduler.step(test_acc)

        log.info('Done training')
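
Here ReduceLROnPlateau is built with mode='max' and stepped with test_acc, so the learning rate falls when accuracy stalls rather than when loss stops improving. A standalone illustration of that wiring, using toy metric values:

import torch
from torch.optim import Adam, lr_scheduler

param = torch.nn.Parameter(torch.zeros(1))
optimizer = Adam([param], lr=1e-3)
# mode='max' means larger is better: after `patience` epochs without an
# accuracy improvement, the LR is multiplied by the default factor of 0.1.
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=5)
for test_acc in [0.10, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12]:
    scheduler.step(test_acc)
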
Example #7
    def train(self, training_data: TrainingData):
        log.info('Preprocessing the dataset')
        self.nlp = spacy.load('en', create_pipeline=custom_spacy_pipeline)
        x_train_tokens, y_train, x_test_tokens, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
            self.nlp, training_data)

        self.class_to_i = class_to_i
        self.i_to_class = i_to_class
        self.vocab = vocab

        log.info('Loading word embeddings')
        word_embeddings, multi_embedding_lookup = load_multi_embeddings(
            multi_vocab=vocab)
        self.word_embeddings = word_embeddings
        self.multi_embedding_lookup = multi_embedding_lookup

        log.info('Batching the dataset')
        train_dataset = BatchedDataset(self.batch_size, multi_embedding_lookup,
                                       x_train_tokens, y_train)
        test_dataset = BatchedDataset(self.batch_size, multi_embedding_lookup,
                                      x_test_tokens, y_test)
        self.n_classes = compute_n_classes(training_data[1])

        log.info('Initializing neural model')
        self.model = RnnEntityModel(len(multi_embedding_lookup.word),
                                    len(multi_embedding_lookup.pos),
                                    len(multi_embedding_lookup.iob),
                                    len(multi_embedding_lookup.ent_type),
                                    self.n_classes)
        self.model.init_weights(word_embeddings=word_embeddings)
        self.model.cuda()
        self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.CrossEntropyLoss()
        self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer,
                                                        'max',
                                                        patience=5,
                                                        verbose=True)

        manager = TrainingManager([
            BaseLogger(log_func=log.info),
            TerminateOnNaN(),
            EarlyStopping(monitor='test_acc', patience=10, verbose=1),
            MaxEpochStopping(100),
            ModelCheckpoint(create_save_model(self.model),
                            '/tmp/rnn_entity.pt',
                            monitor='test_acc')
            # Tensorboard('rnn_entity', log_dir='tb-logs')
        ])

        log.info('Starting training...')
        while True:
            self.model.train()
            train_acc, train_loss, train_time = self.run_epoch(train_dataset,
                                                               evaluate=False)

            self.model.eval()
            test_acc, test_loss, test_time = self.run_epoch(test_dataset,
                                                            evaluate=True)

            stop_training, reasons = manager.instruct(train_time, train_loss,
                                                      train_acc, test_time,
                                                      test_loss, test_acc)

            if stop_training:
                log.info(' '.join(reasons))
                break
            else:
                self.scheduler.step(test_acc)

        log.info('Done training')
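
The entity model consumes four parallel feature channels from spaCy (word, part of speech, IOB tag, entity type), each with its own vocabulary. A plausible shape for multi_embedding_lookup, shown only to make the constructor call concrete; the field values here are invented:

from collections import namedtuple

# Hypothetical container: one token-to-index dict per annotation channel, so
# len(lookup.word), len(lookup.pos), etc. are the vocabulary sizes passed to
# RnnEntityModel above.
MultiEmbeddingLookup = namedtuple('MultiEmbeddingLookup', ['word', 'pos', 'iob', 'ent_type'])

lookup = MultiEmbeddingLookup(
    word={'UNK': 0, 'tesla': 1},
    pos={'UNK': 0, 'NOUN': 1, 'PROPN': 2},
    iob={'UNK': 0, 'B': 1, 'I': 2, 'O': 3},
    ent_type={'UNK': 0, 'PERSON': 1, 'GPE': 2},
)
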
Example #8
File: bcn.py Project: nadesai/qb
    def train(self, training_data) -> None:
        log.info('Preprocessing data')
        x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
            training_data
        )
        self.class_to_i = class_to_i
        self.i_to_class = i_to_class
        self.vocab = vocab

        embeddings, embedding_lookup = load_embeddings(vocab=vocab, expand_glove=True, mask_zero=True)
        self.embeddings = embeddings
        self.embedding_lookup = embedding_lookup

        x_train = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_train_text]
        for r in x_train:
            if len(r) == 0:
                r.append(embedding_lookup['UNK'])
        x_train = np.array(x_train)
        y_train = np.array(y_train)

        x_test = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_test_text]
        for r in x_test:
            if len(r) == 0:
                r.append(embedding_lookup['UNK'])
        x_test = np.array(x_test)
        y_test = np.array(y_test)

        log.info('Batching data')
        n_batches_train, t_x_train, lengths_train, masks_train, t_y_train = batchify(
            self.batch_size, x_train, y_train, truncate=True
        )
        n_batches_test, t_x_test, lengths_test, masks_test, t_y_test = batchify(
            self.batch_size, x_test, y_test, truncate=False, shuffle=False
        )

        self.n_classes = compute_n_classes(training_data[1])

        log.info('Creating model')
        self.model = BCN(
            300, 500, embeddings.shape[0], self.n_classes,
            We=torch.from_numpy(embeddings)
        ).cuda()
        self.optimizer = Adam(self.model.parameters())
        self.criterion = nn.NLLLoss()

        log.info(f'Model:\n{self.model}')

        manager = TrainingManager([
            BaseLogger(log_func=log.info), TerminateOnNaN(),
            EarlyStopping(monitor='test_acc', patience=10, verbose=1), MaxEpochStopping(100),
            ModelCheckpoint(create_save_model(self.model), '/tmp/bcn.pt', monitor='test_acc'),
            Tensorboard('bcn')
        ])

        log.info('Starting training...')
        while True:
            self.model.train()
            train_acc, train_loss, train_time = self.run_epoch(
                n_batches_train,
                t_x_train, lengths_train, masks_train, t_y_train, evaluate=False
            )

            self.model.eval()
            test_acc, test_loss, test_time = self.run_epoch(
                n_batches_test,
                t_x_test, lengths_test, masks_test, t_y_test, evaluate=True
            )

            stop_training, reasons = manager.instruct(
                train_time, train_loss, train_acc,
                test_time, test_loss, test_acc
            )

            if stop_training:
                log.info(' '.join(reasons))
                break
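
This batchify variant also returns per-example lengths and masks, which a sequence model such as the BCN needs in order to ignore padded positions. A sketch of mask construction under that assumption (build_masks is an invented helper, not the project's code):

import torch

def build_masks(lengths, max_len):
    # 1.0 over real tokens, 0.0 over padding: broadcast position indices
    # (1, max_len) against per-example lengths (batch, 1).
    positions = torch.arange(max_len).unsqueeze(0)
    lengths = torch.tensor(lengths).unsqueeze(1)
    return (positions < lengths).float()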