Example #1
    def test_reuse_text_processor_by_json(self):
        sample_text = ['sample text sentence']
        processor = text_processor(sample_text)
        target = processor(sample_text)
        json_cfg = processor.to_json()

        # rebuild with json config
        processor_from_json = text_processor(json_cfg, from_json=True)
        processed = processor_from_json(sample_text)

        self.assertTrue(np.all(processed == target))
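
A file-based round-trip sketch built on the test above; it is an assumption, not shown in the source, that to_json() called without arguments returns a JSON string that can be written to disk and passed back with from_json=True, and the file name here is hypothetical.

import numpy as np

sample_text = ['sample text sentence']
processor = text_processor(sample_text)

# assumed: to_json() returns the config as a plain JSON string
with open('tokenizer_config.json', 'w') as f:
    f.write(processor.to_json())

with open('tokenizer_config.json') as f:
    restored = text_processor(f.read(), from_json=True)

# the rebuilt processor should encode text exactly like the original
assert np.all(restored(sample_text) == processor(sample_text))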
Example #2
    def test_text_processor(self):
        sample = ['test sentence', 'new sentence']

        vocab = set([word for sent in sample
                     for word in sent.split()] + ['<pad>', '<unk>'])
        init_size = len(vocab)

        processor = text_processor(sample)
        processed = processor(sample)
        self.assertEqual(processor.vocab_size, init_size)

        # check that each word in the sentences is processed
        for s, p in zip(sample, processed):
            self.assertEqual(len(s.split()), len(p))

        # one new sentence containing unseen words
        new_sample = ['test with new words']
        processed = processor(new_sample)

        # the new sample has words which are not in the original vocabulary,
        # so they should be encoded with the '<unk>' token's index
        self.assertEqual(len(processed[0]), len(new_sample[0].split()))

        for sent in new_sample:
            for word in sent.split():
                vocab.add(word)
        # vocab size must not change until update() is called
        self.assertEqual(processor.vocab_size, init_size)

        processor.update(new_sample)
        self.assertEqual(processor.vocab_size, len(vocab))

        # after the update, the processed length still has to match the sentence length
        processed = processor(new_sample)
        self.assertEqual(len(processed[0]), len(new_sample[0].split()))
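
Condensed from the test above rather than from separate documentation: update() grows the vocabulary in place, so a later call can encode words that were unseen at construction time instead of collapsing them all to '<unk>'.

processor = text_processor(['test sentence', 'new sentence'])
before = processor.vocab_size

# register previously unseen words (behaviour taken from the test above)
processor.update(['test with new words'])
assert processor.vocab_size > before

encoded = processor(['test with new words'])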
Example #3
    def test_text_processor_passing_str(self):
        sample = 'test sentence'

        vocab = set(sample.split() + ['<pad>', '<unk>'])
        init_size = len(vocab)

        processor = text_processor(sample)
        self.assertEqual(processor.vocab_size, init_size)
Example #4
 def test_processed_sentence_lengths_are_fixed_by_given_number(self):
     maxlen = 4
     sample_text = [
         'sample text sentence', 'This is another sentence',
         'This is not processed yet and this is treated as long sentence example for test'
     ]
     processor = text_processor(sample_text, maxlen=maxlen)
     processed = processor(sample_text)
     self.assertEqual(processed.shape, (len(sample_text), maxlen))
Example #5
    def test_angle_brackets_not_filtered(self):
        sample = '<these> <are> <sample> <tags>'
        processor = text_processor(sample)
        processed = processor(sample)
        self.assertEqual(processed.shape, (1, len(sample.split())))

        # check that every registered vocabulary word keeps its <> tag
        for word in processor.word_index.keys():
            self.assertTrue(re.search(r'^<\w+>$', word))
Example #6
async def run_server(host, port):
    global _processor

    _processor = text_processor(num_words=15, from_config=True)

    server = await asyncio.start_server(
        run_prediction, host, port)
    addr = server.sockets[0].getsockname()
    log.info(f'Serving on {addr!r}')

    async with server:
        await server.serve_forever()
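
A companion client sketch that is not part of the source: run_prediction is not shown, so the wire format assumed here (UTF-8 text in, a single short reply out) is only an illustration of how one might talk to the server above.

import asyncio

async def ask(host, port, question):
    # connect to the prediction server started by run_server()
    reader, writer = await asyncio.open_connection(host, port)
    writer.write(question.encode('utf-8'))
    await writer.drain()

    reply = await reader.read(1024)  # assumed: one short reply per request
    writer.close()
    await writer.wait_closed()
    return reply.decode('utf-8')

# e.g. asyncio.run(ask('127.0.0.1', 8888, 'is this a question?'))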
Example #7
 def test_processed_sentence_lengths_are_same(self):
     sample_text = [
         'sample text sentence', 'This is another sentence',
         'This is not processed yet'
     ]
     processor = text_processor(sample_text)
     examples = [
         'this should be processed by tokenizer',
         'this is also be processed'
     ]
     processed = processor(examples)
     self.assertEqual(len(processed[0]), len(processed[1]))
Example #8
    def test_fixed_vocab_size(self):
        num_words = 100
        start = 99
        processor = text_processor(num_words=num_words, from_config=True)

        # only words whose ids are less than num_words (100) are kept
        test_sents = ' '.join(processor.index_word[i]
                              for i in range(start, 200))

        processed = processor(test_sents)
        num_kinds_of_words = len(set(id for id in processed[0]))
        self.assertEqual(num_kinds_of_words, num_words - start + 1)
Example #9
    def __init__(self):
        if PredictionModel.__instance is None:
            self._build_model()
            self._processor = text_processor(
                num_words=20000,
                maxlen=Config.MODELS.get('QTYPE').get('seq_length'),
                from_config=True)

            PredictionModel.__instance = self
        else:
            raise RuntimeError(
                f'This object cannot be instantiated. Use {self.__class__.__name__}.get_model() instead'
            )
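
The error message above refers to a get_model() accessor that this excerpt does not show; the classmethod below is only one plausible shape for it, assumed to live inside PredictionModel.

    @classmethod
    def get_model(cls):
        # hypothetical accessor: lazily build the singleton on first use;
        # __init__ above stores the new instance in PredictionModel.__instance
        if PredictionModel.__instance is None:
            PredictionModel()
        return PredictionModel.__instance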
Example #10
    print()
    print('  Parameters')
    print('  ----------')
    print('  Data size:')
    print(f'      Train: {len(train):>7}')
    print(f'      Val:   {len(val):>7}')
    print()
    print(f'  Epoch:            {epochs}')
    print(f'  Batch Size:       {batch_size}')
    print(f'  Hidden unit size: {units}')
    print(f'  Vocabulary size:  {vocab_size}')
    print()

    # use all the words from the training set that were preprocessed in advance
    processor = text_processor(maxlen=seq_length, from_config=True)
    ans_processor = text_processor(maxlen=ans_length, from_config=True)

    print('Time to setup: {:.4f}s'.format(time.time() - st))
    print()

    run(model_type,
        train,
        val,
        units=units,
        embedding_dim=embedding_dim,
        vocab_size=vocab_size,
        learning_rate=learning_rate,
        sequence_length=ans_length,
        save=save)
    print('Training completed')
Example #11
    save = args.no_save

    st = time.time()
    print('Setting up dataset')
    with open('./data/answer_yes_no.json', 'r') as f:
        dataset = json.load(f)

    print('Total loaded data size:', len(dataset))

    random.shuffle(dataset)

    train, val = dataset[:data_size], dataset[data_size:data_size + val_size]
    print('Data size: Train: {} Val: {}'.format(len(train), len(val)))

    if args.no_config:
        # use only words that appeared in the training set
        words = [d['question'] for d in train]
        processor = text_processor(words, maxlen=pad_max_len)
        assert processor(words).shape[1] == pad_max_len
    else:
        processor = text_processor(num_words=vocab_size,
                                   maxlen=pad_max_len,
                                   from_config=True)

    print('Time to setup: {:.4f}s'.format(time.time() - st))

    main(train, val, save=save)

    print('Training completed')
    print('Total running time: {:.4f}s'.format(time.time() - st))
Example #12
def main(*, training=True, save_to=None, load_from=None, val=0.2):
    global data_size
    global num_classes
    global processor

    vqa = VQA()
    vqa.load_data(num_data=data_size)
    questions, question_types, _, _ = next(vqa.data_generator())
    labels = [
        q2id[q] if q in q2id else q2id['none of the above']
        for q in question_types
    ]

    # build processor based on training dataset
    # if processor is not reused
    if training:
        # preprocessing dataset
        # split train and test set
        train_size = int(data_size * (1 - val))

        # inputs
        inputs_train = questions[:train_size]
        inputs_val = questions[train_size:]

        # process inputs
        # if a tokenizer is not already loaded, create a new one
        if processor is None:
            processor = text_processor(inputs_train)

    # initialize model
    model = QuestionTypeClassification(
        embedding_dim=embedding_dim,
        units=hidden_units,
        vocab_size=vocab_size,  # need to add 1 due to Embedding implementation
        num_classes=num_classes)

    # set initial weights to the model
    if load_from is not None:
        print('Loading weights...')
        model.load_weights(load_from)

    # TRAINING STEP
    if training:
        min_loss_val = 1.0

        print('Start training')

        inputs_train = processor(inputs_train)
        inputs_val = [processor(inputs_val)]

        # labels
        labels = np.array(labels, dtype=np.int32)

        labels_train = labels[:train_size]
        labels_val = labels[train_size:]

        loss = 0
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

        train_cls_step = make_training_cls_model(
            model, optimizer, loss='sparse_categorical_crossentropy')

        # execute training
        for epoch in range(epochs):
            print('=====' * 10)
            print('    Epoch {}'.format(epoch + 1))
            print('=====' * 10)

            dataset = data_generator(inputs_train, labels_train, batch_size)

            for batch, (ins, outs) in enumerate(dataset):
                st = time.time()
                ins = [ins]
                batch_loss, accuracy = train_cls_step(ins, outs)

                end = time.time()

                if batch % 100 == 0:
                    out_val = model(*inputs_val)
                    cost_val = tf.keras.losses.sparse_categorical_crossentropy(
                        labels_val, out_val, from_logits=True)
                    loss_val = tf.reduce_mean(cost_val)
                    acc_val = calculate_accuracy(out_val, labels_val)

                    if DEBUG:
                        print('[DEBUG] Batch:', batch)
                        for layer in model.layers:
                            print('  Layer:', model.name + ':' + layer.name)
                            print('  Weights:')
                            print('    mean:', np.mean(layer.get_weights()[0]))
                            print('     std:', np.std(layer.get_weights()[0]))
                            print()

                    batch_loss = batch_loss.numpy()
                    print('  Batch:', batch)
                    # TODO: add accuracy
                    print(
                        '    Loss: {:.4f}  Accuracy(Train): {:.4f}  Loss(Val): {:.4f}  Accuracy(Val): {:.4f}  Time(batch): {:.4f}s'
                        .format(batch_loss, accuracy, loss_val, acc_val,
                                end - st))

            if loss_val < min_loss_val:
                min_loss_val = loss_val
                print('Saving models...')
                # save tokenizer info for reuse
                processor.to_json('./.env/tokenizer_config.json')
                model.save_weights(save_to)
                print('Saved!!')

        print()
        print('Training completed')

    else:
        # if not in training mode, evaluate with all the given data
        st = time.time()
        inputs = processor(questions)
        out = model(inputs)
        labels = tf.Variable(labels, dtype=tf.int32)
        accuracy = calculate_accuracy(out, labels)
        end = time.time()
        print('Evaluated score: Accuracy: {:.4f} Time: {:.4f}s'.format(
            accuracy, end - st))

    return model
Example #13
import logging

import tensorflow as tf

from main.settings import Config
from main.utils.loader import fetch_question_types, load_image
from main.utils.preprocess import text_processor
from .common import get_mobilenet_encoder
from ._base import BaseModel
from ._models import (
    QuestionTypeClassification,
    ClassificationModel,
    QuestionAnswerModel,
)

log = logging.getLogger(__name__)

processor = text_processor(num_words=20000, from_config=True)
img_encoder = get_mobilenet_encoder()
# TODO: tmp
classes = fetch_question_types()
id2q = [q for q in classes]

# ids of the special tokens used by the processor
_tokens = {
    'bos': processor.word_index['<bos>'],
    'eos': processor.word_index['<eos>'],
    'unk': processor.word_index['<unk>'],
    'pad': processor.word_index['<pad>'],
}
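
A hedged helper sketch, not in the source: inverting the encoding with processor.index_word (seen in Example #8) while skipping the padding id collected in _tokens; the decode name is hypothetical.

def decode(ids):
    # map ids back to words, dropping padding
    words = [processor.index_word[i] for i in ids if i != _tokens['pad']]
    return ' '.join(words)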


class PredictionModel(BaseModel):
Example #14
 def test_from_config(self):
     # use preprocessed config
     processor = text_processor(from_config=True)
     self.assertGreater(processor.vocab_size, 0)
Example #15
 def test_text_processor_load_data_by_config(self):
     processor = text_processor(from_config=True)
     self.assertGreater(processor.vocab_size, 0)
Example #16
            print()
            print(
                '      Validation(approx.): Loss - {:.4f}  Acc - {:.4f}  Time - {:.4f}s'
                .format(loss_val, acc_val, end_val - st_val))
            print('  Total time per epoch: {:.4f}s'.format(time.time() -
                                                           epoch_start))
            print()


if __name__ == '__main__':
    st = time.time()
    print('Setting up dataset')
    with open('./data/answer_yes_no.json', 'r') as f:
        dataset = json.load(f)

    random.shuffle(dataset)

    train, val = dataset[:data_size], dataset[data_size:data_size + val_size]

    # use only words that appeared in the training set
    words = [d['question'] for d in train]

    processor = text_processor(words, maxlen=pad_max_len)

    assert processor(words).shape[1] == pad_max_len

    print('Time to setup: {:.4f}s'.format(time.time() - st))

    main(train, val)
    print('Total running time: {:.4f}s'.format(time.time() - st))