def testBuildDatasetFromSameFile(self):
    files = [utils.get_data_file('classify.seq.label.txt')]
    # Build the tokenizer vocabulary from the raw (unlabeled) corpus.
    x_tokenizer = SpaceTokenizer()
    x_tokenizer.build_from_corpus(
        [utils.get_data_file('classify.seq.txt')])
    config = {
        'train_batch_size': 2,
        'eval_batch_size': 2,
        'predict_batch_size': 2,
        'buffer_size': 100
    }
    dataset = SeqClassifyDataset(x_tokenizer, config)
    # Build train/eval datasets from the labeled file and peek at one batch each.
    train_dataset = dataset.build_train_dataset(files)
    print(next(iter(train_dataset)))
    print('=' * 120)
    eval_dataset = dataset.build_eval_dataset(files)
    print(next(iter(eval_dataset)))
    print('=' * 120)
    # Prediction reads the unlabeled file.
    predict_files = [utils.get_data_file('classify.seq.txt')]
    predict_dataset = dataset.build_predict_dataset(predict_files)
    print(next(iter(predict_dataset)))
    print('=' * 120)
def testBuildDatasetFromSameFile(self):
    files = [
        utils.get_data_file('iwslt15.tst2013.100.envi'),
        utils.get_data_file('iwslt15.tst2013.100.envi'),
    ]
    x_tokenizer = SpaceTokenizer()
    x_tokenizer.build_from_corpus(
        [utils.get_data_file('iwslt15.tst2013.100.en')])
    y_tokenizer = SpaceTokenizer()
    y_tokenizer.build_from_corpus(
        [utils.get_data_file('iwslt15.tst2013.100.vi')])
    config = {
        'train_batch_size': 2,
        'predict_batch_size': 2,
        'eval_batch_size': 2,
        'buffer_size': 100
    }
    dataset = Seq2SeqDataset(x_tokenizer, y_tokenizer, config)
    train_dataset = dataset.build_train_dataset(files)
    print(next(iter(train_dataset)))
    print('=' * 120)
    eval_dataset = dataset.build_eval_dataset(files)
    print(next(iter(eval_dataset)))
    print('=' * 120)
    predict_files = [utils.get_data_file('iwslt15.tst2013.100.envi')]
    predict_dataset = dataset.build_predict_dataset(predict_files)
    print(next(iter(predict_dataset)))
    print('=' * 120)
def testBuildDatasetFromSameFile(self):
    files = [
        utils.get_data_file('dssm.query.doc.label.txt'),
        utils.get_data_file('dssm.query.doc.label.txt'),
    ]
    x_tokenizer = SpaceTokenizer()
    x_tokenizer.build_from_vocab(utils.get_data_file('dssm.vocab.txt'))
    y_tokenizer = SpaceTokenizer()
    y_tokenizer.build_from_vocab(utils.get_data_file('dssm.vocab.txt'))
    config = {
        'train_batch_size': 2,
        'eval_batch_size': 2,
        'predict_batch_size': 2,
        'buffer_size': 100,
    }
    dataset = SeqMatchDataset(x_tokenizer, y_tokenizer, config)
    train_dataset = dataset.build_train_dataset(files)
    print(next(iter(train_dataset)))
    print('=' * 120)
    eval_dataset = dataset.build_eval_dataset(files)
    print(next(iter(eval_dataset)))
    print('=' * 120)
    predict_files = [utils.get_data_file('dssm.query.doc.label.txt')]
    predict_dataset = dataset.build_predict_dataset(predict_files)
    print(next(iter(predict_dataset)))
    print('=' * 120)
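# A hedged helper for the three tests above: instead of printing a single
# batch, it walks a few batches and reports tensor shapes. The take() call
# assumes build_*_dataset returns a tf.data.Dataset (as the iteration above
# suggests); the helper name and the generic structure handling are
# illustrative, not from the source.
import tensorflow as tf

def inspect_batches(tf_dataset, n=2):
    for batch in tf_dataset.take(n):
        # Elements may be nested tuples of tensors; map over the structure.
        print(tf.nest.map_structure(lambda t: t.shape, batch))
    print('=' * 120)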
def testBuildFromVocab(self):
    print('============start build from vocab=============')
    tokenizer = SpaceTokenizer()
    tokenizer.build_from_vocab(data_dir_utils.get_data_file('vocab.test.txt'))
    print('token2id dict: ', tokenizer.token2id_dict)
    print('id2token dict: ', tokenizer.id2token_dict)
    words = tf.constant(['I', 'am', 'a', 'developer'])
    v0 = tokenizer.encode(words)
    print(v0)
    ids = tf.constant([1, 0, 2, 3, 4], dtype=tf.dtypes.int64)
    v1 = tokenizer.decode(ids)
    print(v1)
    print('============end build from vocab=============')
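# A round-trip sketch building on testBuildFromVocab. It assumes, as the test
# suggests, that encode maps a string tensor to integer ids and decode maps
# int64 ids back to byte-string tokens; out-of-vocab tokens would not round
# trip, so in-vocab tokens are assumed. The function name is illustrative.
import tensorflow as tf

def roundtrip_tokens(tokenizer, tokens):
    ids = tokenizer.encode(tf.constant(tokens))
    decoded = tokenizer.decode(tf.cast(ids, tf.dtypes.int64))
    return [t.decode('utf-8') for t in decoded.numpy()]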
def buildTokenizer(self):
    tokenizer = SpaceTokenizer()
    corpus = ['iwslt15.tst2013.100.en']
    corpus = [data_dir_utils.get_data_file(f) for f in corpus]
    tokenizer.build_from_corpus(corpus, token_filters=[EmptyTokenFilter()])
    return tokenizer
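# buildTokenizer is a plain helper method, so sibling tests can reuse it; the
# test name below is illustrative, and vocab_size is the attribute SpaceTokenizer
# exposes elsewhere in these snippets.
def testTokenizerVocab(self):
    tokenizer = self.buildTokenizer()
    print('vocab size:', tokenizer.vocab_size)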
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os

import tensorflow as tf
from easylib.dl import KerasModelDatasetRunner
from nlp_datasets.abstract_dataset import AbstractXYDataset
from nlp_datasets.tokenizers import SpaceTokenizer
from nlp_datasets.xyz_dataset import XYZSameFileDataset

from mp import models, utils

tokenizer = SpaceTokenizer()
tokenizer.build_from_vocab(os.path.join(utils.testdat_dir(), 'vocab.txt'))
config = {
    'x_max_len': 1000,
    'y_max_len': 1000,
    'train_batch_size': 1,
    'predict_batch_size': 32,
    'shuffle_size': -1,
    'num_parallel_calls': tf.data.experimental.AUTOTUNE
}
dataset = XYZSameFileDataset(x_tokenizer=tokenizer, y_tokenizer=tokenizer, config=config)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
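    # args.model is read below to pick a model builder, so a --model flag must
    # exist; its definition is not shown in the source, and the default here is
    # an assumption.
    parser.add_argument('--model', type=str, default='mlp',
                        help='Model to build: mlp or lstm (assumed flag)')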
    # dataset_config and config['vocab_file'] are defined earlier in the
    # original script (not shown in this fragment).
    model_config = {
        'vocab_size': 10,
        'embedding_size': 256,
        'vec_dim': 256,
    }
    runner_config = {
        'ckpt_period': 1,
        'model_dir': '/tmp/dssm'
    }
    config.update(dataset_config)
    config.update(model_config)
    config.update(runner_config)
    if not os.path.exists(config['model_dir']):
        os.makedirs(config['model_dir'])

    tokenizer = SpaceTokenizer()
    tokenizer.build_from_vocab(config['vocab_file'])
    logging.info('Build tokenizer from vocab file: %s' % config['vocab_file'])
    logging.info('vocab size of tokenizer: %d' % tokenizer.vocab_size)
    # Override the placeholder vocab size with the real one from the tokenizer.
    config['vocab_size'] = tokenizer.vocab_size

    args, _ = parser.parse_known_args()
    if 'mlp' == args.model:
        model = models.build_mlp_model(config)
    elif 'lstm' == args.model:
        model = models.build_lstm_model(config)
    else:
        raise ValueError('Invalid model: %s' % args.model)

    # Rebuild the dataset with the fully merged config.
    dataset = XYZSameFileDataset(x_tokenizer=tokenizer, y_tokenizer=tokenizer,
                                 config=config)
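    # A hedged sketch of the step this script leads up to: the source imports
    # KerasModelDatasetRunner, but its API is not visible here, so a plain
    # Keras fit call is shown instead, assuming models.build_*_model returns a
    # compiled tf.keras.Model. The training file list is hypothetical.
    train_files = [os.path.join(utils.testdat_dir(), 'train.txt')]  # hypothetical file
    train_dataset = dataset.build_train_dataset(train_files)
    model.fit(train_dataset, epochs=1)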