Example #1
def get_dataloaders(batch_size, vocab, train_dataset_size, val_dataset_size):

    # Batchify the fields produced by SQuADTransform: fixed-size fields are
    # stacked, while variable-length sequence fields are padded with the
    # vocabulary's padding token id.
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Stack(),
        nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
        nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
        nlp.data.batchify.Stack('float32'),
        nlp.data.batchify.Stack('float32'),
        nlp.data.batchify.Stack(),
    )

    train_data = SQuAD("train", version='2.0')[:train_dataset_size]

    train_data_transform, _ = preprocess_dataset(
        train_data,
        SQuADTransform(nlp.data.BERTTokenizer(vocab=vocab, lower=True),
                       max_seq_length=384,
                       doc_stride=128,
                       max_query_length=64,
                       is_pad=True,
                       is_training=True))

    train_dataloader = mx.gluon.data.DataLoader(train_data_transform,
                                                batchify_fn=batchify_fn,
                                                batch_size=batch_size,
                                                num_workers=4,
                                                shuffle=True)

    # only keep the first val_dataset_size validation samples
    dev_data = SQuAD("dev", version='2.0')[:val_dataset_size]
    dev_data = mx.gluon.data.SimpleDataset(dev_data)

    dev_dataset = dev_data.transform(
        SQuADTransform(nlp.data.BERTTokenizer(vocab=vocab, lower=True),
                       max_seq_length=384,
                       doc_stride=128,
                       max_query_length=64,
                       is_pad=False,
                       is_training=False)._transform,
        lazy=False)

    dev_data_transform, _ = preprocess_dataset(
        dev_data,
        SQuADTransform(nlp.data.BERTTokenizer(vocab=vocab, lower=True),
                       max_seq_length=384,
                       doc_stride=128,
                       max_query_length=64,
                       is_pad=False,
                       is_training=False))

    dev_dataloader = mx.gluon.data.DataLoader(dev_data_transform,
                                              batchify_fn=batchify_fn,
                                              num_workers=1,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              last_batch='keep')

    return train_dataloader, dev_dataloader, dev_dataset
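
A minimal usage sketch for this helper, assuming `nlp` is GluonNLP and that `preprocess_dataset`, `SQuADTransform` and `SQuAD` come from the surrounding BERT fine-tuning script; the checkpoint name below is illustrative, not prescribed by the code above.

import mxnet as mx
import gluonnlp as nlp

# Fetch a pretrained BERT vocabulary; get_model returns (model, vocab).
_, vocab = nlp.model.get_model('bert_12_768_12',
                               dataset_name='book_corpus_wiki_en_uncased',
                               pretrained=True, use_pooler=False,
                               use_decoder=False, use_classifier=False)

train_dataloader, dev_dataloader, dev_dataset = get_dataloaders(
    batch_size=8, vocab=vocab, train_dataset_size=64, val_dataset_size=4)

for batch in train_dataloader:
    # batch is the 6-tuple assembled by batchify_fn above
    break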
Example #2
    def test_preprocess_dataset_batch2_float_tfrecord(self):
        with self.test_session():
            test_image1 = tf.constant(np.arange(4 * 4 * 3) * 5,
                                      shape=[4, 4, 3],
                                      dtype=tf.uint8)
            encoded = tf.image.encode_png(test_image1)
            image1 = encoded.eval()
            with open(os.path.join("test_files", "test1.png"), "wb") as f:
                f.write(image1)

            test_image2 = tf.constant(np.flip(np.arange(4 * 4 * 3) * 5,
                                              axis=0),
                                      shape=[4, 4, 3],
                                      dtype=tf.uint8)
            encoded = tf.image.encode_png(test_image2)
            image2 = encoded.eval()
            with open(os.path.join("test_files", "test2.png"), "wb") as f:
                f.write(image2)

            files = glob.glob(os.path.join("test_files", "test*.png"))
            dataset = get_dataset(files)

            dataset = preprocess_dataset(dataset,
                                         size=[64, 64],
                                         batch_size=2,
                                         float_pixels=True)

            it = dataset.make_one_shot_iterator()
            data = it.get_next().eval()
            self.assertEqual(data.shape, (2, 64, 64, 3))
            self.assertAllClose(max(data.flatten()),
                                max(test_image1.eval().flatten()) / 127.5 - 1.)
            self.assertAllClose(min(data.flatten()),
                                min(test_image1.eval().flatten()) / 127.5 - 1.)
Example #3
    def test_preprocess_dataset_batch2_float_raw(self):
        with self.test_session():
            test_image1 = tf.constant(np.arange(4 * 4 * 3),
                                      shape=[4, 4, 3],
                                      dtype=tf.uint8)

            test_image2 = tf.constant(np.flip(np.arange(4 * 4 * 3), axis=0),
                                      shape=[4, 4, 3],
                                      dtype=tf.uint8)
            writer = tf.python_io.TFRecordWriter(
                os.path.join("test_files", "test.tfrecords"))
            testimage1_bytes_list = tf.train.BytesList(
                value=[test_image1.eval().tobytes()])
            example1 = tf.train.Example(features=tf.train.Features(
                feature={
                    'data':
                    tf.train.Feature(bytes_list=testimage1_bytes_list),
                    'shape':
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[4, 4, 3]))
                }))
            testimage2_bytes_list = tf.train.BytesList(
                value=[test_image2.eval().tobytes()])
            example2 = tf.train.Example(features=tf.train.Features(
                feature={
                    'data':
                    tf.train.Feature(bytes_list=testimage2_bytes_list),
                    'shape':
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[4, 4, 3]))
                }))
            writer.write(example1.SerializeToString())
            writer.write(example2.SerializeToString())
            writer.close()
            files = glob.glob(os.path.join("test_files", "*.tfrecords"))
            dataset = get_dataset(files)

            dataset = preprocess_dataset(dataset,
                                         size=[64, 64],
                                         batch_size=2,
                                         float_pixels=True)

            it = dataset.make_one_shot_iterator()
            data = it.get_next().eval()
            self.assertEqual(data.shape, (2, 64, 64, 3))
            self.assertAllClose(max(data.flatten()),
                                max(test_image1.eval().flatten()) / 127.5 - 1.)
            self.assertAllClose(min(data.flatten()),
                                min(test_image1.eval().flatten()) / 127.5 - 1.)
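
For context, a minimal sketch of a `preprocess_dataset` that would behave roughly as these two tests expect (resize to `size`, optionally rescale uint8 pixels to [-1, 1], then batch), written against the TF 1.x `tf.data` API and assuming `get_dataset` yields decoded uint8 image tensors; the project's real implementation may differ.

import tensorflow as tf  # TF 1.x API

def preprocess_dataset_sketch(dataset, size, batch_size, float_pixels=False):
    """Resize, optionally map pixel values from [0, 255] to [-1, 1], and batch."""
    def _prepare(image):
        image = tf.image.resize_images(image, size)   # bilinear resize, returns float32
        if float_pixels:
            image = image / 127.5 - 1.0                # [0, 255] -> [-1, 1]
        else:
            image = tf.cast(image, tf.uint8)
        return image
    return dataset.map(_prepare).batch(batch_size)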
Example #4
## NOTE: model, parmfile, otcsv and datafile are assumed to be set earlier in
## the full script; the commented lines below show one possible configuration.
#model = CBOW
#parmfile = './logs/scnwiki-cbow_2019-07-18.params'
#otcsv = './logs/cossim_scnwiki-cbow.csv'

output_dim = 300
batch_size = 1024
num_negatives = 5
subword_function = None
window = 5
frequent_token_subsampling = 1E-4

##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##  ##

##  load the data
data = nlp.data.TSVDataset(datafile)
data, vocab, idx_to_counts = preprocess_dataset(data)

##  load the model
embedding = model(token_to_idx=vocab.token_to_idx,
                  output_dim=output_dim,
                  batch_size=batch_size,
                  num_negatives=num_negatives,
                  negatives_weights=mx.nd.array(idx_to_counts))
embedding.load_parameters(parmfile)

##  get the word vectors
wvecs = embedding.embedding_out.weight.data()

##  "short vectors" -- only the words with at least 100 appearances
slimit = len(np.array(idx_to_counts)[np.array(idx_to_counts) >= 100])
svecs = wvecs[:slimit, ]
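
The commented-out `otcsv` path suggests the script goes on to write cosine similarities; a minimal sketch of computing them over the frequent-word vectors `svecs` (the helper name below is hypothetical):

import mxnet as mx

def cosine_similarities(query_vec, vectors):
    """Cosine similarity of one word vector against a matrix of word vectors."""
    vectors = mx.nd.L2Normalization(vectors)                  # unit-normalize rows
    query_vec = mx.nd.L2Normalization(query_vec.reshape(1, -1))
    return mx.nd.dot(vectors, query_vec.reshape(-1))          # shape: (num_words,)

# e.g. similarities of the word at index 0 against all frequent words
sims = cosine_similarities(svecs[0], svecs)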
Example #5
def train(args):
    """Training helper."""
    if not args.model.lower() in ['cbow', 'skipgram']:
        logging.error('Unsupported model %s.', args.model)
        sys.exit(1)

    if args.data.lower() == 'toy':
        data = mx.gluon.data.SimpleDataset(nlp.data.Text8(segment='train')[:2])
        data, vocab, idx_to_counts = preprocess_dataset(
            data, max_vocab_size=args.max_vocab_size)
    elif args.data.lower() == 'text8':
        data = nlp.data.Text8(segment='train')
        data, vocab, idx_to_counts = preprocess_dataset(
            data, max_vocab_size=args.max_vocab_size)
    elif args.data.lower() == 'fil9':
        data = nlp.data.Fil9(max_sentence_length=10000)
        data, vocab, idx_to_counts = preprocess_dataset(
            data, max_vocab_size=args.max_vocab_size)
    elif args.data.lower() == 'wiki':
        data, vocab, idx_to_counts = wiki(args.wiki_root, args.wiki_date,
                                          args.wiki_language,
                                          args.max_vocab_size)

    if args.ngram_buckets > 0:
        data, batchify_fn, subword_function = transform_data_fasttext(
            data,
            vocab,
            idx_to_counts,
            cbow=args.model.lower() == 'cbow',
            ngram_buckets=args.ngram_buckets,
            ngrams=args.ngrams,
            batch_size=args.batch_size,
            window_size=args.window,
            frequent_token_subsampling=args.frequent_token_subsampling)
    else:
        subword_function = None
        data, batchify_fn = transform_data_word2vec(
            data,
            vocab,
            idx_to_counts,
            cbow=args.model.lower() == 'cbow',
            batch_size=args.batch_size,
            window_size=args.window,
            frequent_token_subsampling=args.frequent_token_subsampling)

    num_tokens = float(sum(idx_to_counts))

    model = CBOW if args.model.lower() == 'cbow' else SG
    embedding = model(token_to_idx=vocab.token_to_idx,
                      output_dim=args.emsize,
                      batch_size=args.batch_size,
                      num_negatives=args.negative,
                      negatives_weights=mx.nd.array(idx_to_counts),
                      subword_function=subword_function)
    context = get_context(args)
    embedding.initialize(ctx=context)
    if not args.no_hybridize:
        embedding.hybridize(static_alloc=True, static_shape=True)

    optimizer_kwargs = dict(learning_rate=args.lr)
    try:
        trainer = mx.gluon.Trainer(embedding.collect_params(), args.optimizer,
                                   optimizer_kwargs)
    except ValueError as e:
        if args.optimizer == 'groupadagrad':
            logging.warning('MXNet <= v1.3 does not contain '
                            'GroupAdaGrad support. Falling back to AdaGrad')
            trainer = mx.gluon.Trainer(embedding.collect_params(), 'adagrad',
                                       optimizer_kwargs)
        else:
            raise e

    try:
        if args.no_prefetch_batch:
            data = data.transform(batchify_fn)
        else:
            from executors import LazyThreadPoolExecutor
            num_cpu = len(os.sched_getaffinity(0))
            ex = LazyThreadPoolExecutor(num_cpu)
    except (ImportError, SyntaxError, AttributeError):
        # Py2 - no async prefetching is supported
        logging.warning(
            'Asynchronous batch prefetching is not supported on Python 2. '
            'Consider upgrading to Python 3 for improved performance.')
        data = data.transform(batchify_fn)

    num_update = 0
    prefetched_iters = []
    for _ in range(min(args.num_prefetch_epoch, args.epochs)):
        prefetched_iters.append(iter(data))
    for epoch in range(args.epochs):
        if epoch + len(prefetched_iters) < args.epochs:
            prefetched_iters.append(iter(data))
        data_iter = prefetched_iters.pop(0)
        try:
            batches = ex.map(batchify_fn, data_iter)
        except NameError:  # Py 2 or batch prefetching disabled
            batches = data_iter

        # Logging variables
        log_wc = 0
        log_start_time = time.time()
        log_avg_loss = 0

        for i, batch in enumerate(batches):
            ctx = context[i % len(context)]
            batch = [array.as_in_context(ctx) for array in batch]
            with mx.autograd.record():
                loss = embedding(*batch)
            loss.backward()

            num_update += loss.shape[0]
            if len(context) == 1 or (i + 1) % len(context) == 0:
                trainer.step(batch_size=1)

            # Logging
            log_wc += loss.shape[0]
            log_avg_loss += loss.mean().as_in_context(context[0])
            if (i + 1) % args.log_interval == 0:
                # Forces waiting for computation by computing loss value
                log_avg_loss = log_avg_loss.asscalar() / args.log_interval
                wps = log_wc / (time.time() - log_start_time)
                # Due to subsampling, the overall number of batches is an upper
                # bound
                num_batches = num_tokens // args.batch_size
                if args.model.lower() == 'skipgram':
                    num_batches = (num_tokens * args.window *
                                   2) // args.batch_size
                else:
                    num_batches = num_tokens // args.batch_size
                logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, '
                             'throughput={:.2f}K wps, wc={:.2f}K'.format(
                                 epoch, i + 1, num_batches, log_avg_loss,
                                 wps / 1000, log_wc / 1000))
                log_start_time = time.time()
                log_avg_loss = 0
                log_wc = 0

            if args.eval_interval and (i + 1) % args.eval_interval == 0:
                with print_time('mx.nd.waitall()'):
                    mx.nd.waitall()
                with print_time('evaluate'):
                    evaluate(args, embedding, vocab, num_update)

    # Evaluate
    with print_time('mx.nd.waitall()'):
        mx.nd.waitall()
    with print_time('evaluate'):
        evaluate(args,
                 embedding,
                 vocab,
                 num_update,
                 eval_analogy=not args.no_eval_analogy)

    # Save params
    with print_time('save parameters'):
        embedding.save_parameters(os.path.join(args.logdir,
                                               'embedding.params'))
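
A hedged sketch of driving this helper with a hand-built namespace; the field names mirror those read inside train(), the values are illustrative, and CBOW, SG, get_context, evaluate, wiki, preprocess_dataset and the transform_data_* helpers are assumed to come from the surrounding GluonNLP word-embedding training script.

from argparse import Namespace

args = Namespace(
    model='skipgram', data='toy', max_vocab_size=None,
    ngram_buckets=2000000, ngrams=[3, 4, 5, 6],
    batch_size=1024, window=5, frequent_token_subsampling=1E-4,
    emsize=300, negative=5, no_hybridize=False,
    optimizer='groupadagrad', lr=0.1,
    no_prefetch_batch=False, num_prefetch_epoch=3, epochs=1,
    log_interval=100, eval_interval=None, no_eval_analogy=True,
    logdir='logs', gpu=None)  # get_context()/evaluate() may read further fields

train(args)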
Example #6
def train(args):
    """Training helper."""
    if not args.model.lower() in ['cbow', 'skipgram']:
        logging.error('Unsupported model %s.', args.model)
        sys.exit(1)

    if args.data.lower() == 'toy':
        data = mx.gluon.data.SimpleDataset(nlp.data.Text8(segment='train')[:2])
        data, vocab, idx_to_counts = preprocess_dataset(
            data, max_vocab_size=args.max_vocab_size)
    elif args.data.lower() == 'text8':
        data = nlp.data.Text8(segment='train')
        data, vocab, idx_to_counts = preprocess_dataset(
            data, max_vocab_size=args.max_vocab_size)
    elif args.data.lower() == 'fil9':
        data = nlp.data.Fil9(max_sentence_length=10000)
        data, vocab, idx_to_counts = preprocess_dataset(
            data, max_vocab_size=args.max_vocab_size)
    elif args.data.lower() == 'wiki':
        data, vocab, idx_to_counts = wiki(args.wiki_root, args.wiki_date,
                                          args.wiki_language,
                                          args.max_vocab_size)

    if args.ngram_buckets > 0:
        data, batchify_fn, subword_function = transform_data_fasttext(
            data, vocab, idx_to_counts, cbow=args.model.lower() == 'cbow',
            ngram_buckets=args.ngram_buckets, ngrams=args.ngrams,
            batch_size=args.batch_size, window_size=args.window,
            frequent_token_subsampling=args.frequent_token_subsampling)
    else:
        subword_function = None
        data, batchify_fn = transform_data_word2vec(
            data, vocab, idx_to_counts, cbow=args.model.lower() == 'cbow',
            batch_size=args.batch_size, window_size=args.window,
            frequent_token_subsampling=args.frequent_token_subsampling)

    num_tokens = float(sum(idx_to_counts))

    model = CBOW if args.model.lower() == 'cbow' else SG
    embedding = model(token_to_idx=vocab.token_to_idx, output_dim=args.emsize,
                      batch_size=args.batch_size, num_negatives=args.negative,
                      negatives_weights=mx.nd.array(idx_to_counts),
                      subword_function=subword_function)
    context = get_context(args)
    embedding.initialize(ctx=context)
    if not args.no_hybridize:
        embedding.hybridize(static_alloc=True, static_shape=True)

    optimizer_kwargs = dict(learning_rate=args.lr)
    try:
        trainer = mx.gluon.Trainer(embedding.collect_params(), args.optimizer,
                                   optimizer_kwargs)
    except ValueError as e:
        if args.optimizer == 'groupadagrad':
            logging.warning('MXNet <= v1.3 does not contain '
                            'GroupAdaGrad support. Falling back to AdaGrad')
            trainer = mx.gluon.Trainer(embedding.collect_params(), 'adagrad',
                                       optimizer_kwargs)
        else:
            raise e

    try:
        if args.no_prefetch_batch:
            data = data.transform(batchify_fn)
        else:
            from executors import LazyThreadPoolExecutor
            num_cpu = len(os.sched_getaffinity(0))
            ex = LazyThreadPoolExecutor(num_cpu)
    except (ImportError, SyntaxError, AttributeError):
        # Py2 - no async prefetching is supported
        logging.warning(
            'Asynchronous batch prefetching is not supported on Python 2. '
            'Consider upgrading to Python 3 for improved performance.')
        data = data.transform(batchify_fn)

    num_update = 0
    prefetched_iters = []
    for _ in range(min(args.num_prefetch_epoch, args.epochs)):
        prefetched_iters.append(iter(data))
    for epoch in range(args.epochs):
        if epoch + len(prefetched_iters) < args.epochs:
            prefetched_iters.append(iter(data))
        data_iter = prefetched_iters.pop(0)
        try:
            batches = ex.map(batchify_fn, data_iter)
        except NameError:  # Py 2 or batch prefetching disabled
            batches = data_iter

        # Logging variables
        log_wc = 0
        log_start_time = time.time()
        log_avg_loss = 0

        for i, batch in enumerate(batches):
            ctx = context[i % len(context)]
            batch = [array.as_in_context(ctx) for array in batch]
            with mx.autograd.record():
                loss = embedding(*batch)
            loss.backward()

            num_update += loss.shape[0]
            if len(context) == 1 or (i + 1) % len(context) == 0:
                trainer.step(batch_size=1)

            # Logging
            log_wc += loss.shape[0]
            log_avg_loss += loss.mean().as_in_context(context[0])
            if (i + 1) % args.log_interval == 0:
                # Forces waiting for computation by computing loss value
                log_avg_loss = log_avg_loss.asscalar() / args.log_interval
                wps = log_wc / (time.time() - log_start_time)
                # Due to subsampling, the overall number of batches is an upper
                # bound
                num_batches = num_tokens // args.batch_size
                if args.model.lower() == 'skipgram':
                    num_batches = (num_tokens * args.window * 2) // args.batch_size
                else:
                    num_batches = num_tokens // args.batch_size
                logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, '
                             'throughput={:.2f}K wps, wc={:.2f}K'.format(
                                 epoch, i + 1, num_batches, log_avg_loss,
                                 wps / 1000, log_wc / 1000))
                log_start_time = time.time()
                log_avg_loss = 0
                log_wc = 0

            if args.eval_interval and (i + 1) % args.eval_interval == 0:
                with print_time('mx.nd.waitall()'):
                    mx.nd.waitall()
                with print_time('evaluate'):
                    evaluate(args, embedding, vocab, num_update)

    # Evaluate
    with print_time('mx.nd.waitall()'):
        mx.nd.waitall()
    with print_time('evaluate'):
        evaluate(args, embedding, vocab, num_update,
                 eval_analogy=not args.no_eval_analogy)

    # Save params
    with print_time('save parameters'):
        embedding.save_parameters(os.path.join(args.logdir, 'embedding.params'))
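
To give a sense of scale for the upper-bound batch count logged above (the token count is a rough figure for text8, used only for illustration):

num_tokens = 17000000            # roughly the text8 token count
window, batch_size = 5, 1024
skipgram_batches = (num_tokens * window * 2) // batch_size   # ~166,000 batches
cbow_batches = num_tokens // batch_size                      # ~16,600 batches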