def testBuildDatasetFromSameFile(self):
    files = [
        utils.get_data_file('iwslt15.tst2013.100.envi'),
        utils.get_data_file('iwslt15.tst2013.100.envi'),
    ]
    # Build source (English) and target (Vietnamese) tokenizers from their corpora.
    x_tokenizer = SpaceTokenizer()
    x_tokenizer.build_from_corpus([utils.get_data_file('iwslt15.tst2013.100.en')])
    y_tokenizer = SpaceTokenizer()
    y_tokenizer.build_from_corpus([utils.get_data_file('iwslt15.tst2013.100.vi')])
    config = {
        'train_batch_size': 2,
        'predict_batch_size': 2,
        'eval_batch_size': 2,
        'buffer_size': 100
    }
    dataset = Seq2SeqDataset(x_tokenizer, y_tokenizer, config)
    # Build the train, eval and predict datasets and inspect the first batch of each.
    train_dataset = dataset.build_train_dataset(files)
    print(next(iter(train_dataset)))
    print('=' * 120)
    eval_dataset = dataset.build_eval_dataset(files)
    print(next(iter(eval_dataset)))
    print('=' * 120)
    predict_files = [utils.get_data_file('iwslt15.tst2013.100.envi')]
    predict_dataset = dataset.build_predict_dataset(predict_files)
    print(next(iter(predict_dataset)))
    print('=' * 120)
def testBuildDatasetFromSameFile(self):
    files = [utils.get_data_file('classify.seq.label.txt')]
    # Build the tokenizer from the raw sequence corpus (without labels).
    x_tokenizer = SpaceTokenizer()
    x_tokenizer.build_from_corpus([utils.get_data_file('classify.seq.txt')])
    config = {
        'train_batch_size': 2,
        'eval_batch_size': 2,
        'predict_batch_size': 2,
        'buffer_size': 100
    }
    dataset = SeqClassifyDataset(x_tokenizer, config)
    train_dataset = dataset.build_train_dataset(files)
    print(next(iter(train_dataset)))
    print('=' * 120)
    eval_dataset = dataset.build_eval_dataset(files)
    print(next(iter(eval_dataset)))
    print('=' * 120)
    predict_files = [utils.get_data_file('classify.seq.txt')]
    predict_dataset = dataset.build_predict_dataset(predict_files)
    print(next(iter(predict_dataset)))
    print('=' * 120)
def testBuildDatasetFromSameFile(self):
    files = [
        utils.get_data_file('dssm.query.doc.label.txt'),
        utils.get_data_file('dssm.query.doc.label.txt'),
    ]
    # Query and doc sides share the same vocabulary file.
    x_tokenizer = SpaceTokenizer()
    x_tokenizer.build_from_vocab(utils.get_data_file('dssm.vocab.txt'))
    y_tokenizer = SpaceTokenizer()
    y_tokenizer.build_from_vocab(utils.get_data_file('dssm.vocab.txt'))
    config = {
        'train_batch_size': 2,
        'eval_batch_size': 2,
        'predict_batch_size': 2,
        'buffer_size': 100,
    }
    dataset = SeqMatchDataset(x_tokenizer, y_tokenizer, config)
    train_dataset = dataset.build_train_dataset(files)
    print(next(iter(train_dataset)))
    print('=' * 120)
    eval_dataset = dataset.build_eval_dataset(files)
    print(next(iter(eval_dataset)))
    print('=' * 120)
    predict_files = [utils.get_data_file('dssm.query.doc.label.txt')]
    predict_dataset = dataset.build_predict_dataset(predict_files)
    print(next(iter(predict_dataset)))
    print('=' * 120)
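# NOTE: an illustrative sketch, not one of the original tests. It assumes
# build_train_dataset returns a tf.data.Dataset (consistent with the
# next(iter(...)) usage above), so standard Dataset iteration such as take()
# works; the method name is hypothetical.
def testIterateTrainBatches(self):
    files = [utils.get_data_file('dssm.query.doc.label.txt')]
    x_tokenizer = SpaceTokenizer()
    x_tokenizer.build_from_vocab(utils.get_data_file('dssm.vocab.txt'))
    y_tokenizer = SpaceTokenizer()
    y_tokenizer.build_from_vocab(utils.get_data_file('dssm.vocab.txt'))
    config = {
        'train_batch_size': 2,
        'eval_batch_size': 2,
        'predict_batch_size': 2,
        'buffer_size': 100,
    }
    dataset = SeqMatchDataset(x_tokenizer, y_tokenizer, config)
    train_dataset = dataset.build_train_dataset(files)
    # Walk the first two batches instead of only the first one.
    for batch in train_dataset.take(2):
        print(batch)
        print('=' * 120)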
def testBuildFromVocab(self):
    print('============start build from vocab=============')
    tokenizer = SpaceTokenizer()
    tokenizer.build_from_vocab(data_dir_utils.get_data_file('vocab.test.txt'))
    print('token2id dict: ', tokenizer.token2id_dict)
    print('id2token dict: ', tokenizer.id2token_dict)
    # Encode tokens to ids, then decode ids back to tokens.
    words = tf.constant(['I', 'am', 'a', 'developer'])
    v0 = tokenizer.encode(words)
    print(v0)
    ids = tf.constant([1, 0, 2, 3, 4], dtype=tf.dtypes.int64)
    v1 = tokenizer.decode(ids)
    print(v1)
    print('============end build from vocab=============')
def testSaveVocabFile(self):
    tokenizer = self.buildTokenizer()
    tokenizer.save_to_vocab(data_dir_utils.get_data_file('vocab.test.txt'))
    print(tokenizer.token2id_dict)
    print(tokenizer.id2token_dict)
def buildTokenizer(self):
    tokenizer = SpaceTokenizer()
    corpus = ['iwslt15.tst2013.100.en']
    corpus = [data_dir_utils.get_data_file(f) for f in corpus]
    # Drop empty tokens while building the vocabulary from the corpus.
    tokenizer.build_from_corpus(corpus, token_filters=[EmptyTokenFilter()])
    return tokenizer
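# NOTE: a minimal round-trip sketch, not part of the original tests. It reuses
# only the SpaceTokenizer API shown above (build_from_corpus via buildTokenizer,
# save_to_vocab, build_from_vocab, token2id_dict) and assumes the tests run
# under a unittest-style TestCase so assertEqual is available; the method name
# is hypothetical.
def testVocabRoundTrip(self):
    tokenizer = self.buildTokenizer()
    tokenizer.save_to_vocab(data_dir_utils.get_data_file('vocab.test.txt'))
    restored = SpaceTokenizer()
    restored.build_from_vocab(data_dir_utils.get_data_file('vocab.test.txt'))
    # A tokenizer rebuilt from the saved vocab should map tokens to the same ids.
    self.assertEqual(tokenizer.token2id_dict, restored.token2id_dict)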