# Note: SQuADTransform and preprocess_dataset are the SQuAD helpers from
# GluonNLP's BERT fine-tuning example code.
import mxnet as mx
import gluonnlp as nlp
from gluonnlp.data import SQuAD


def get_dataloaders(batch_size, vocab, train_dataset_size, val_dataset_size):
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Stack(),
        nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
        nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
        nlp.data.batchify.Stack('float32'),
        nlp.data.batchify.Stack('float32'),
        nlp.data.batchify.Stack(),
    )

    train_data = SQuAD("train", version='2.0')[:train_dataset_size]
    train_data_transform, _ = preprocess_dataset(
        train_data,
        SQuADTransform(nlp.data.BERTTokenizer(vocab=vocab, lower=True),
                       max_seq_length=384, doc_stride=128, max_query_length=64,
                       is_pad=True, is_training=True))
    train_dataloader = mx.gluon.data.DataLoader(
        train_data_transform, batchify_fn=batchify_fn, batch_size=batch_size,
        num_workers=4, shuffle=True)

    # We only take the first val_dataset_size validation samples (4 in our setup).
    dev_data = SQuAD("dev", version='2.0')[:val_dataset_size]
    dev_data = mx.gluon.data.SimpleDataset(dev_data)
    dev_dataset = dev_data.transform(
        SQuADTransform(nlp.data.BERTTokenizer(vocab=vocab, lower=True),
                       max_seq_length=384, doc_stride=128, max_query_length=64,
                       is_pad=False, is_training=False)._transform,
        lazy=False)
    dev_data_transform, _ = preprocess_dataset(
        dev_data,
        SQuADTransform(nlp.data.BERTTokenizer(vocab=vocab, lower=True),
                       max_seq_length=384, doc_stride=128, max_query_length=64,
                       is_pad=False, is_training=False))
    dev_dataloader = mx.gluon.data.DataLoader(
        dev_data_transform, batchify_fn=batchify_fn, num_workers=1,
        batch_size=batch_size, shuffle=False, last_batch='keep')

    return train_dataloader, dev_dataloader, dev_dataset
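# A hedged usage sketch for get_dataloaders. Assumptions (not taken from the
# function above): the vocabulary comes from GluonNLP's uncased base BERT via
# nlp.model.get_model, and the batch/dataset sizes are arbitrary examples.
_, bert_vocab = nlp.model.get_model('bert_12_768_12',
                                    dataset_name='book_corpus_wiki_en_uncased',
                                    pretrained=False, use_pooler=False,
                                    use_decoder=False, use_classifier=False)
train_dl, dev_dl, dev_dataset = get_dataloaders(batch_size=32,
                                                vocab=bert_vocab,
                                                train_dataset_size=1000,
                                                val_dataset_size=4)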
def test_preprocess_dataset_batch2_float_tfrecord(self):
    with self.test_session():
        test_image1 = tf.constant(np.arange(4 * 4 * 3) * 5,
                                  shape=[4, 4, 3], dtype=tf.uint8)
        encoded = tf.image.encode_png(test_image1)
        image1 = encoded.eval()
        with open(os.path.join("test_files", "test1.png"), "wb") as f:
            f.write(image1)

        test_image2 = tf.constant(np.flip(np.arange(4 * 4 * 3) * 5, axis=0),
                                  shape=[4, 4, 3], dtype=tf.uint8)
        encoded = tf.image.encode_png(test_image2)
        image2 = encoded.eval()
        with open(os.path.join("test_files", "test2.png"), "wb") as f:
            f.write(image2)

        files = glob.glob(os.path.join("test_files", "test*.png"))
        dataset = get_dataset(files)
        dataset = preprocess_dataset(dataset, size=[64, 64], batch_size=2,
                                     float_pixels=True)
        it = dataset.make_one_shot_iterator()
        data = it.get_next().eval()

        self.assertEqual(data.shape, (2, 64, 64, 3))
        self.assertAllClose(max(data.flatten()),
                            max(test_image1.eval().flatten()) / 127.5 - 1.)
        self.assertAllClose(min(data.flatten()),
                            min(test_image1.eval().flatten()) / 127.5 - 1.)
def test_preprocess_dataset_batch2_float_raw(self):
    with self.test_session():
        test_image1 = tf.constant(np.arange(4 * 4 * 3),
                                  shape=[4, 4, 3], dtype=tf.uint8)
        test_image2 = tf.constant(np.flip(np.arange(4 * 4 * 3), axis=0),
                                  shape=[4, 4, 3], dtype=tf.uint8)

        writer = tf.python_io.TFRecordWriter(
            os.path.join("test_files", "test.tfrecords"))
        testimage1_bytes_list = tf.train.BytesList(
            value=[test_image1.eval().tobytes()])
        example1 = tf.train.Example(features=tf.train.Features(
            feature={
                'data': tf.train.Feature(bytes_list=testimage1_bytes_list),
                'shape': tf.train.Feature(int64_list=tf.train.Int64List(
                    value=[4, 4, 3]))
            }))
        testimage2_bytes_list = tf.train.BytesList(
            value=[test_image2.eval().tobytes()])
        example2 = tf.train.Example(features=tf.train.Features(
            feature={
                'data': tf.train.Feature(bytes_list=testimage2_bytes_list),
                'shape': tf.train.Feature(int64_list=tf.train.Int64List(
                    value=[4, 4, 3]))
            }))
        writer.write(example1.SerializeToString())
        writer.write(example2.SerializeToString())
        writer.close()

        files = glob.glob(os.path.join("test_files", "*.tfrecords"))
        dataset = get_dataset(files)
        dataset = preprocess_dataset(dataset, size=[64, 64], batch_size=2,
                                     float_pixels=True)
        it = dataset.make_one_shot_iterator()
        data = it.get_next().eval()

        self.assertEqual(data.shape, (2, 64, 64, 3))
        self.assertAllClose(max(data.flatten()),
                            max(test_image1.eval().flatten()) / 127.5 - 1.)
        self.assertAllClose(min(data.flatten()),
                            min(test_image1.eval().flatten()) / 127.5 - 1.)
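# Context sketches only: get_dataset and preprocess_dataset are not shown in
# these tests. Under stated assumptions, TF1-style versions consistent with
# the assertions above could look like the following; the _sketch names, the
# extension-based dispatch, and the resize method are illustrative, not the
# tested implementation.
import tensorflow as tf

def get_dataset_sketch(files):
    # Assumed behaviour: yield decoded uint8 HWC images from either TFRecord
    # files (raw bytes + shape features) or encoded image files such as PNG.
    if files[0].endswith('.tfrecords'):
        def _parse(record):
            feats = tf.parse_single_example(record, features={
                'data': tf.FixedLenFeature([], tf.string),
                'shape': tf.FixedLenFeature([3], tf.int64)})
            image = tf.decode_raw(feats['data'], tf.uint8)
            return tf.reshape(image, feats['shape'])
        return tf.data.TFRecordDataset(files).map(_parse)
    return (tf.data.Dataset.from_tensor_slices(files)
            .map(lambda path: tf.image.decode_png(tf.read_file(path))))

def preprocess_dataset_sketch(dataset, size, batch_size, float_pixels=False):
    # Assumed behaviour: resize to `size`, optionally rescale uint8 pixels to
    # [-1, 1] (as the / 127.5 - 1. assertions suggest), then batch.
    def _prepare(image):
        image = tf.image.resize_images(image, size)  # float32, still in [0, 255]
        if float_pixels:
            image = image / 127.5 - 1.0              # float32 in [-1, 1]
        return image
    return dataset.map(_prepare).batch(batch_size)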
import numpy as np
import mxnet as mx
import gluonnlp as nlp

# model, parmfile and datafile must be set elsewhere in the script; the
# commented lines below show the values used for the CBOW run.
# model = CBOW
# parmfile = './logs/scnwiki-cbow_2019-07-18.params'
# otcsv = './logs/cossim_scnwiki-cbow.csv'

output_dim = 300
batch_size = 1024
num_negatives = 5
subword_function = None
window = 5
frequent_token_subsampling = 1E-4

## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ##

## load the data
data = nlp.data.TSVDataset(datafile)
data, vocab, idx_to_counts = preprocess_dataset(data)

## load the model
embedding = model(token_to_idx=vocab.token_to_idx, output_dim=output_dim,
                  batch_size=batch_size, num_negatives=num_negatives,
                  negatives_weights=mx.nd.array(idx_to_counts))
embedding.load_parameters(parmfile)

## get the word vectors
wvecs = embedding.embedding_out.weight.data()

## "short vectors" -- only the words with at least 100 appearances
## (idx_to_counts is sorted by decreasing frequency, so these are the first
## slimit rows of the embedding matrix)
slimit = len(np.array(idx_to_counts)[np.array(idx_to_counts) >= 100])
svecs = wvecs[:slimit]
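# A possible next step, sketched under assumptions: the commented-out otcsv
# path suggests cosine similarities are written to a CSV. The helper below is
# illustrative (its name, the top-k output format, and k=10 are not from the
# original script): it L2-normalises the short vectors so a dot product equals
# cosine similarity, then writes each token's nearest neighbours to a CSV row.
import csv

def write_topk_cosine_sims(vecs, vocab, outpath, k=10):
    normed = vecs / (mx.nd.norm(vecs, axis=1, keepdims=True) + 1e-10)
    sims = mx.nd.dot(normed, normed.T).asnumpy()
    with open(outpath, 'w', newline='') as f:
        writer = csv.writer(f)
        for idx in range(sims.shape[0]):
            # Highest-similarity columns first; position 0 is the token itself.
            neighbours = sims[idx].argsort()[::-1][1:k + 1]
            writer.writerow([vocab.idx_to_token[idx]] +
                            [vocab.idx_to_token[j] for j in neighbours])

# e.g. write_topk_cosine_sims(svecs, vocab, './logs/cossim_scnwiki-cbow.csv')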
# Note: CBOW, SG, preprocess_dataset, transform_data_fasttext,
# transform_data_word2vec, wiki, get_context, print_time and evaluate are
# helpers from the surrounding GluonNLP word-embedding training script.
def train(args):
    """Training helper."""
    if not args.model.lower() in ['cbow', 'skipgram']:
        logging.error('Unsupported model %s.', args.model)
        sys.exit(1)

    if args.data.lower() == 'toy':
        data = mx.gluon.data.SimpleDataset(nlp.data.Text8(segment='train')[:2])
        data, vocab, idx_to_counts = preprocess_dataset(
            data, max_vocab_size=args.max_vocab_size)
    elif args.data.lower() == 'text8':
        data = nlp.data.Text8(segment='train')
        data, vocab, idx_to_counts = preprocess_dataset(
            data, max_vocab_size=args.max_vocab_size)
    elif args.data.lower() == 'fil9':
        data = nlp.data.Fil9(max_sentence_length=10000)
        data, vocab, idx_to_counts = preprocess_dataset(
            data, max_vocab_size=args.max_vocab_size)
    elif args.data.lower() == 'wiki':
        data, vocab, idx_to_counts = wiki(args.wiki_root, args.wiki_date,
                                          args.wiki_language,
                                          args.max_vocab_size)

    if args.ngram_buckets > 0:
        data, batchify_fn, subword_function = transform_data_fasttext(
            data, vocab, idx_to_counts, cbow=args.model.lower() == 'cbow',
            ngram_buckets=args.ngram_buckets, ngrams=args.ngrams,
            batch_size=args.batch_size, window_size=args.window,
            frequent_token_subsampling=args.frequent_token_subsampling)
    else:
        subword_function = None
        data, batchify_fn = transform_data_word2vec(
            data, vocab, idx_to_counts, cbow=args.model.lower() == 'cbow',
            batch_size=args.batch_size, window_size=args.window,
            frequent_token_subsampling=args.frequent_token_subsampling)

    num_tokens = float(sum(idx_to_counts))

    model = CBOW if args.model.lower() == 'cbow' else SG
    embedding = model(token_to_idx=vocab.token_to_idx, output_dim=args.emsize,
                      batch_size=args.batch_size, num_negatives=args.negative,
                      negatives_weights=mx.nd.array(idx_to_counts),
                      subword_function=subword_function)
    context = get_context(args)
    embedding.initialize(ctx=context)
    if not args.no_hybridize:
        embedding.hybridize(static_alloc=True, static_shape=True)

    optimizer_kwargs = dict(learning_rate=args.lr)
    try:
        trainer = mx.gluon.Trainer(embedding.collect_params(), args.optimizer,
                                   optimizer_kwargs)
    except ValueError as e:
        if args.optimizer == 'groupadagrad':
            logging.warning('MXNet <= v1.3 does not contain '
                            'GroupAdaGrad support. Falling back to AdaGrad')
            trainer = mx.gluon.Trainer(embedding.collect_params(), 'adagrad',
                                       optimizer_kwargs)
        else:
            raise e

    try:
        if args.no_prefetch_batch:
            data = data.transform(batchify_fn)
        else:
            from executors import LazyThreadPoolExecutor
            num_cpu = len(os.sched_getaffinity(0))
            ex = LazyThreadPoolExecutor(num_cpu)
    except (ImportError, SyntaxError, AttributeError):
        # Py2 - no async prefetching is supported
        logging.warning(
            'Asynchronous batch prefetching is not supported on Python 2. '
            'Consider upgrading to Python 3 for improved performance.')
        data = data.transform(batchify_fn)

    num_update = 0
    prefetched_iters = []
    for _ in range(min(args.num_prefetch_epoch, args.epochs)):
        prefetched_iters.append(iter(data))
    for epoch in range(args.epochs):
        if epoch + len(prefetched_iters) < args.epochs:
            prefetched_iters.append(iter(data))
        data_iter = prefetched_iters.pop(0)
        try:
            batches = ex.map(batchify_fn, data_iter)
        except NameError:
            # Py 2 or batch prefetching disabled
            batches = data_iter

        # Logging variables
        log_wc = 0
        log_start_time = time.time()
        log_avg_loss = 0

        for i, batch in enumerate(batches):
            ctx = context[i % len(context)]
            batch = [array.as_in_context(ctx) for array in batch]
            with mx.autograd.record():
                loss = embedding(*batch)
            loss.backward()

            num_update += loss.shape[0]
            if len(context) == 1 or (i + 1) % len(context) == 0:
                trainer.step(batch_size=1)

            # Logging
            log_wc += loss.shape[0]
            log_avg_loss += loss.mean().as_in_context(context[0])
            if (i + 1) % args.log_interval == 0:
                # Forces waiting for computation by computing loss value
                log_avg_loss = log_avg_loss.asscalar() / args.log_interval
                wps = log_wc / (time.time() - log_start_time)
                # Due to subsampling, the overall number of batches is an
                # upper bound
                if args.model.lower() == 'skipgram':
                    num_batches = (num_tokens * args.window * 2) // args.batch_size
                else:
                    num_batches = num_tokens // args.batch_size
                logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, '
                             'throughput={:.2f}K wps, wc={:.2f}K'.format(
                                 epoch, i + 1, num_batches, log_avg_loss,
                                 wps / 1000, log_wc / 1000))
                log_start_time = time.time()
                log_avg_loss = 0
                log_wc = 0

            if args.eval_interval and (i + 1) % args.eval_interval == 0:
                with print_time('mx.nd.waitall()'):
                    mx.nd.waitall()
                with print_time('evaluate'):
                    evaluate(args, embedding, vocab, num_update)

    # Evaluate
    with print_time('mx.nd.waitall()'):
        mx.nd.waitall()
    with print_time('evaluate'):
        evaluate(args, embedding, vocab, num_update,
                 eval_analogy=not args.no_eval_analogy)

    # Save params
    with print_time('save parameters'):
        embedding.save_parameters(os.path.join(args.logdir, 'embedding.params'))
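# A hedged invocation sketch: train() only needs an object exposing the
# attributes it reads above, so a bare argparse.Namespace is enough for a
# quick experiment. The values below are illustrative, not the script's own
# defaults; `gpu` is an assumption about what get_context() inspects, and
# ngram_buckets > 0 would switch to the fastText (subword) code path.
import argparse

toy_args = argparse.Namespace(
    model='skipgram', data='toy', max_vocab_size=None,
    ngram_buckets=0, ngrams=[3, 4, 5, 6],
    batch_size=1024, window=5, frequent_token_subsampling=1E-4,
    emsize=300, negative=5, no_hybridize=False,
    optimizer='groupadagrad', lr=0.1,
    no_prefetch_batch=True, num_prefetch_epoch=3, epochs=1,
    log_interval=100, eval_interval=None,
    no_eval_analogy=True, logdir='./logs', gpu=None)

# train(toy_args)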