def test_multi_feat_single_label_parser(self):
    # init token_dicts
    token_dicts = None
    feature_field_list = []
    # param = ["name", "processor", "type", "dtype", "shape", "max_len", "token_dict_name"]
    feature_field_list.append(
        DataSchema(name='query', processor='to_np', dtype='int32',
                   shape=(None,), is_with_len=True))
    feature_field_list.append(
        DataSchema(name='width', processor='to_np', dtype='int32', shape=(4,)))
    label_field = DataSchema(name='label', processor='to_np',
                             dtype='float32', shape=(1,))
    parser = TextlineParser(token_dicts, feature_field_list, label_field)
    line = '12\t2\t1 2 3 4 5\t1 2 3 4'
    label, addinfo, features, _ = parser.parse(line)
    np.testing.assert_array_equal(label, np.asarray([2.0]))
    self.assertListEqual(addinfo, ['12'])
    # features is [query, query_len, width] because 'query' has is_with_len=True.
    np.testing.assert_array_equal(features[0], np.asarray([1, 2, 3, 4, 5]))
    np.testing.assert_array_equal(features[2], np.asarray([1, 2, 3, 4]))

def test_text_seq2seq_dataset(self):
    # init token_dicts
    token_dicts = TokenDicts('tests/data/dicts', {'query': 0})
    data_field_list = []
    # param = ["name", "processor", "type", "dtype", "shape", "max_len", "token_dict_name"]
    data_field_list.append(
        DataSchema(name='query', processor='to_tokenid', dtype='int32',
                   shape=(None,), is_with_len=False, token_dict_name='query'))
    label_field = DataSchema(name='label', processor='to_tokenid', dtype='int32',
                             shape=(None,), is_with_len=True, token_dict_name='query')
    parser = TextlineParser(token_dicts, data_field_list, label_field)
    generator = TFDataset(parser=parser, file_path='tests/data/raw_datasets',
                          file_suffix='text_seq2seq.input')
    dataset = generator.generate_dataset(batch_size=12, num_epochs=1, is_shuffle=False)
    for batch_idx, (inputs, targets) in enumerate(dataset):
        print('batch', batch_idx, 'inputs', inputs, 'targets', targets)

def test_raw_query_float_dataset(self):
    # init token_dicts
    token_dicts = TokenDicts('tests/data/dicts', {'query': 0})
    data_field_list = []
    # param = ["name", "processor", "type", "dtype", "shape", "max_len", "token_dict_name"]
    data_field_list.append(
        DataSchema(name='query', processor='to_tokenid', dtype='int32',
                   shape=(None,), is_with_len=True, token_dict_name='query'))
    data_field_list.append(
        DataSchema(name='width', processor='to_np', dtype='float32', shape=(4,)))
    label_field = DataSchema(name='label', processor='to_np',
                             dtype='float32', shape=(1,))
    parser = TextlineParser(token_dicts, data_field_list, label_field)
    generator = TFDataset(parser=parser, file_path='tests/data/raw_datasets',
                          file_suffix='query_float.input')
    dataset = generator.generate_dataset(batch_size=12, num_epochs=1, is_shuffle=False)
    # Drain the dataset once to verify parsing and batching succeed.
    for _ in dataset:
        pass

def test_tfrecord_dataset_varnum_writer_and_reader(self):
    token_dicts = None
    data_field_list = []
    data_field_list.append(
        DataSchema(name='query', processor='to_np', dtype='int32',
                   shape=(None,), is_with_len=True))
    label_field = DataSchema(name='label', processor='to_np', dtype='float32',
                             shape=(1,), is_with_len=False)
    parser = TextlineParser(token_dicts, data_field_list, label_field)
    generator = TFDataset(parser=parser, file_path='tests/data/raw_datasets',
                          file_suffix='varnum.input')
    if not os.path.exists('outputs'):
        os.mkdir('outputs')
    # Write the raw text dataset out as TFRecords, then read it back.
    generator.to_tfrecords('outputs/file.tfrecord')
    generator = TFDataset(parser=parser, file_path='outputs',
                          file_suffix='file.tfrecord', file_system='tfrecord')
    dataset = generator.generate_dataset(batch_size=1, num_epochs=1)
    for d in dataset.take(4):
        print(d)

    # Repeat the round trip with a constant sample weight attached.
    def weight(_):
        return 1.2

    parser = TextlineParser(token_dicts, data_field_list, label_field,
                            weight_fn=weight)
    generator = TFDataset(parser=parser, file_path='tests/data/raw_datasets',
                          file_suffix='varnum.input')
    generator.to_tfrecords('outputs/file_weight.tfrecord')
    generator = TFDataset(parser=parser, file_path='outputs',
                          file_suffix='file_weight.tfrecord', file_system='tfrecord')
    dataset = generator.generate_dataset(batch_size=1, num_epochs=1)
    for d in dataset.take(4):
        print(d)

def test_raw_dataset_varnum(self):
    token_dicts = None
    data_field_list = []
    data_field_list.append(
        DataSchema(name='query', processor='to_np', dtype='int32',
                   shape=(None,), is_with_len=True))
    label_field = DataSchema(name='label', processor='to_np', dtype='float32',
                             shape=(1,), is_with_len=False)
    parser = TextlineParser(token_dicts, data_field_list, label_field)
    generator = TFDataset(parser=parser, file_path='tests/data/raw_datasets',
                          file_suffix='varnum.input')
    self.pass_allway_dataset(generator, 4)

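# `pass_allway_dataset` is defined elsewhere in the test class and is not
# shown in this excerpt. A minimal sketch of such a helper, assuming it
# simply generates a dataset at the given batch size and drains it once
# (this behavior is an assumption, not taken from the source):
def pass_allway_dataset(self, generator, batch_size):
    dataset = generator.generate_dataset(batch_size=batch_size,
                                         num_epochs=1, is_shuffle=False)
    for _ in dataset:  # iterate once to verify parsing and batching succeed
        pass
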
def test_multi_feat_multi_label_parser(self):
    # init token_dicts
    token_dicts = None
    feature_field_list = []
    # param = ["name", "processor", "type", "dtype", "shape", "max_len", "token_dict_name"]
    feature_field_list.append(
        DataSchema(name='query', processor='to_np', dtype='int32',
                   shape=(None,), is_with_len=True))
    feature_field_list.append(
        DataSchema(name='width', processor='to_np', dtype='int32', shape=(4,)))
    label_field = [
        DataSchema(name='label1', processor='to_np', dtype='float32', shape=(1,)),
        DataSchema(name='label2', processor='to_np', dtype='int32',
                   shape=(None,), is_with_len=True)
    ]
    parser = TextlineParser(token_dicts, feature_field_list, label_field,
                            additive_schema=['id', 'session'])
    line = '12\tcuid\t2\t2 3 4\t1 2 3 4 5\t1 2 3 4'
    label, addinfo, features, _ = parser.parse(line)
    # label is [label1, label2, label2_len] because 'label2' has is_with_len=True.
    assert len(label) == 3
    np.testing.assert_array_equal(label[0], np.asarray([2.0]))
    np.testing.assert_array_equal(label[1], np.asarray([2, 3, 4]))
    self.assertListEqual(addinfo, ['12', 'cuid'])
    # features is [query, query_len, width] for the same reason.
    assert len(features) == 3
    np.testing.assert_array_equal(features[0], np.asarray([1, 2, 3, 4, 5]))
    assert features[1] == 5
    np.testing.assert_array_equal(features[2], np.asarray([1, 2, 3, 4]))

def test_text_classifier_model(self):
    """Train a simple binary text classifier on the sample dataset."""
    # Let TensorFlow grow GPU memory on demand instead of grabbing it all upfront.
    gpus = tf.config.experimental.list_physical_devices('GPU')
    print(gpus)
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    token_dicts = TokenDicts('tests/data/dicts', {'query': 0})
    data_field_list = []
    # param = ["name", "processor", "type", "dtype", "shape", "max_len", "token_dict_name"]
    data_field_list.append(
        DataSchema(name='query', processor='to_tokenid', dtype='int32',
                   shape=(None,), is_with_len=False, token_dict_name='query'))
    label_field = DataSchema(name='label', processor='to_np', dtype='float32',
                             shape=(1,), is_with_len=False)
    parser = TextlineParser(token_dicts, data_field_list, label_field)
    generator = TFDataset(parser=parser, file_path='tests/data',
                          file_suffix='text_classifier.input')
    dataset = generator.generate_dataset(batch_size=16, num_epochs=20, is_shuffle=False)
    # Peek at a few batches before training.
    for batch_idx, (inputs, targets) in enumerate(dataset):
        print('batch', batch_idx, 'inputs', inputs, 'targets', targets)
        if batch_idx > 3:
            break
    query_vocab_size = token_dicts.dict_size_by_name('query')
    print('query_size', query_vocab_size)
    optimizer = tf.keras.optimizers.Adam()
    model = ClassiferModel(optimizer=optimizer, loss='binary_crossentropy',
                           model=self.create_model(query_vocab_size))
    model.fit(dataset, 12, epochs=8, bar_step=10)
    model.model.save_weights('model.h5')

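# `create_model` is defined elsewhere in the test class. A minimal sketch of
# a compatible builder, assuming a small embedding-plus-pooling binary
# classifier (the layer sizes here are illustrative assumptions, not taken
# from the source):
def create_model(self, vocab_size):
    return tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, 64),     # token ids -> vectors
        tf.keras.layers.GlobalAveragePooling1D(),      # variable length -> fixed size
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),  # binary label
    ])
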
def test_text_seq2seq_model(self):
    """Run beam-search prediction with a GRU encoder-decoder seq2seq model."""
    # Let TensorFlow grow GPU memory on demand instead of grabbing it all upfront.
    gpus = tf.config.experimental.list_physical_devices('GPU')
    print(gpus)
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    token_dicts = TokenDicts('tests/data/dicts', {'query': 0})
    data_field_list = []
    # param = ["name", "processor", "type", "dtype", "shape", "max_len", "token_dict_name"]
    data_field_list.append(
        DataSchema(name='query', processor='to_tokenid', dtype='int32',
                   shape=(None,), is_with_len=False, token_dict_name='query'))
    label_field = DataSchema(name='label', processor='to_tokenid', dtype='int32',
                             shape=(None,), is_with_len=True, token_dict_name='query')
    parser = TextlineParser(token_dicts, data_field_list, label_field)
    generator = TFDataset(parser=parser, file_path='tests/data',
                          file_suffix='text_seq2seq.input')
    dataset = generator.generate_dataset(batch_size=64, num_epochs=900, is_shuffle=False)
    query_vocab_size = token_dicts.dict_size_by_name('query')
    print('query_size', query_vocab_size)
    print('<s>', token_dicts.to_id('query', '<s>'))
    print(r'<\s>', token_dicts.to_id('query', r'<\s>'))
    encoder = GRUEncoder(64, query_vocab_size)
    decoder = GRUDecoder(64, query_vocab_size, query_vocab_size)
    optimizer = tf.keras.optimizers.Adam()
    model = Seq2seqModel(optimizer, seq2seq_cross_entropy_loss, encoder, decoder,
                         feature_fields=data_field_list, label_fields=[label_field])
    # Load weights from a previous training run; to retrain instead, use:
    # model.fit(dataset, 64, epochs=20, bar_step=20)
    # model.model.save_weights('model.h5')
    model.model.load_weights('model.h5')
    for batch_idx, (inputs, targets) in enumerate(dataset):
        result = model.predict_beam(inputs)
        print('target', targets, 'predicts', result)
        if batch_idx > 1:
            break

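# `seq2seq_cross_entropy_loss` is imported from the library under test. For
# reference, a typical padding-masked sequence cross-entropy looks roughly
# like the sketch below; this is an assumption about the shape of such a
# loss, not the library's actual implementation (pad id 0 is assumed):
def masked_seq_cross_entropy(y_true, y_pred):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)  # 1 for real tokens
    loss = loss_fn(y_true, y_pred) * mask                # zero out padding
    return tf.reduce_sum(loss) / tf.maximum(tf.reduce_sum(mask), 1.0)
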
def test_session_parser_dataset(self):
    # init token_dicts
    token_dicts = TokenDicts('tests/data/dicts', {'query': 0})
    data_field_list = []
    # param = ["name", "processor", "type", "dtype", "shape", "max_len", "token_dict_name"]
    data_field_list.append(
        DataSchema(name='query', processor='to_tokenid', dtype='int32',
                   shape=(None, None), is_with_len=False,
                   token_dict_name='query', has_session=True))
    data_field_list.append(
        DataSchema(name='width', processor='to_np', dtype='float32',
                   shape=(None, 4), has_session=True))
    label_field = DataSchema(name='label', processor='to_np',
                             dtype='float32', shape=(1,))
    parser = SeqParser(token_dicts, data_field_list, label_field)
    generator = TFDataset(parser=parser, file_path='tests/data/seq_datasets',
                          file_suffix='simple_seq.input')
    print('Shapes', generator._get_shapes(is_training=True))
    dataset = generator.generate_dataset(batch_size=12, num_epochs=1, is_shuffle=False)
    # Drain the dataset once to verify parsing and batching succeed.
    for _ in dataset:
        pass

def processor(filepath, filename, tfrecord_filename):
    token_dicts = None
    data_field_list = []
    data_field_list.append(
        DataSchema(name='query', processor='to_np', type=tf.int32,
                   dtype='int32', shape=(None,), is_with_len=True))
    label_field = DataSchema(name='label', processor='to_np', type=tf.float32,
                             dtype='float32', shape=(1,), is_with_len=False)
    parser = TextlineParser(token_dicts, data_field_list, label_field)
    generator = TFDataset(parser=parser, file_path=filepath, file_suffix=filename)
    generator.to_tfrecords(tfrecord_filename)
    return tfrecord_filename

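# Hypothetical usage of the processor() helper above: convert a raw
# tab-separated input file into a TFRecord file. The paths are
# illustrative assumptions, not part of the shipped test data:
#
#     tfrecord = processor('tests/data/raw_datasets', 'varnum.input',
#                          'outputs/varnum.tfrecord')
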
def test_simple_no_label_parser(self):
    # init token_dicts
    token_dicts = None
    feature_field_list = []
    # param = ["name", "processor", "type", "dtype", "shape", "max_len", "token_dict_name"]
    feature_field_list.append(
        DataSchema(name='query', processor='to_np', dtype='int32',
                   shape=(None,), is_with_len=True))
    parser = TextlineParser(token_dicts, feature_field_list)
    line = '12\t1 2 3 4 5'
    label, addinfo, features, _ = parser.parse(line)
    # No label schema was given, so parse() returns label=None.
    assert label is None
    self.assertListEqual(addinfo, ['12'])
    np.testing.assert_array_equal(features[0], np.asarray([1, 2, 3, 4, 5]))