def test_nested_len(self):
    tds = TextDataSetPrep(nrows=1)
    all_lens = [e for e in tds._len_per_nested_list([[[1, 2], [3]], [[4]]])]
    self.assertListEqual([2, 1], all_lens[0])
    self.assertEqual([2, 1, 1], all_lens[1])
def test_doc_to_pickle(self):
    self._cleanup()
    tds = TextDataSetPrep(nrows=20)
    dataset_dir = tds.csv_to_pickle(target_dir=self.temp_pkl_path)
    first_id = tds.label_df.iloc[0, 0]
    first_label = tds.label_df.iloc[0, 1]
    test_path = os.path.join(dataset_dir, first_label, first_id + '.pkl')
    self.assertTrue(os.path.isfile(test_path))
def test_get_ids_per_dataset(self):
    v1 = TextDataSetPrep._get_ids_per_dataset(
        list(range(10)), [0.5, 0.5], seed=7357)
    v2 = TextDataSetPrep._get_ids_per_dataset(
        list(range(10)), [0.25, 0.25, 0.25, 0.25], seed=7357)
    self.assertEqual(len(v1), 2)
    self.assertEqual(len(v2), 4)
    self.assertListEqual([4, 1, 3, 5, 7], v1[0])
def test_ragged_memory(self):
    tds = TextDataSetPrep(csv_path=IMDB_CSV_TEST, nrows=100)
    x, _ = tds.get_ragged_tensors_dataset()
    self.assertListEqual(list(x.bounding_shape().numpy()), [100, 940])
    x, _ = tds.get_ragged_tensors_dataset(split_characters=True)
    self.assertListEqual(list(x.bounding_shape().numpy()), [100, 940, 24])
    x, _ = tds.get_ragged_tensors_dataset(split_characters=True, split_sentences=True)
    self.assertListEqual(list(x.bounding_shape().numpy()), [100, 46, 188, 24])
def test_text_split(self):
    doc = "this is the first sentence. This is the second one. \n\n This is a new paragraph"
    tds_def = TextDataSetPrep(nrows=1, spacy_model=None)
    tokens = tds_def._text_split(doc)
    self.assertEqual('this', tokens[0])
    self.assertEqual(len(tokens), 17)
    self.assertIsInstance(tokens[0], str)
    tokens_s = tds_def._text_split(doc, split_sentences=True)
    self.assertEqual('this', tokens_s[0][0])
    self.assertEqual(len(tokens_s), 3)
    self.assertIsInstance(tokens_s[0][0], str)
    tokens_s_c = tds_def._text_split(doc, split_sentences=True, split_characters=True)
    self.assertEqual('t', tokens_s_c[0][0][0])
    self.assertEqual(len(tokens_s_c), 3)
    self.assertIsInstance(tokens_s_c[0][0][0], str)
    tds_spacy = TextDataSetPrep(nrows=1)
    tokens_s = tds_spacy._text_split(doc, split_sentences=True)
    self.assertEqual('this', tokens_s[0][0])
    self.assertEqual(len(tokens_s), 3)
    self.assertIsInstance(tokens_s[0][0], str)
def test_serial_deserial(self):
    tds = TextDataSetPrep(csv_path=None, id_col='id', text_col='text', label_col='label')
    data = [
        'abc',
        {'label': 'label', 'text': "this is my text for testing. It has two sentences"}
    ]
    serialized = tds._serialize_tokens_tfr(data)
    with tf.io.TFRecordWriter(self.temp_tfr_path) as writer:
        writer.write(serialized.SerializeToString())
    dataset = tf.data.TFRecordDataset(self.temp_tfr_path)
    for e in dataset:
        _ = tds._deserialize_tokens(e)
def test_write_tfr(self):
    self._cleanup()
    _ = TextDataSetPrep(csv_path=IMDB_CSV_TEST).write_tfr_datasets(
        tfr_names=[self.temp_tfr_path])
    _ = TextDataSetPrep(csv_path=IMDB_CSV_TEST).write_tfr_datasets(
        tfr_names=[self.temp_tfr_path])
def test_get_x_y(self):
    tds = TextDataSetPrep(nrows=40, spacy_model=None, csv_path=IMDB_CSV_TEST)
    df_ids = tds._selected_ids(seed=7357, n_per_label=2)
    self.assertIn('5855_1.txt', df_ids.values)
def test_get_imdb_data(self):
    _ = TextDataSetPrep(nrows=10)
def test_split_all(self):
    tds = TextDataSetPrep(nrows=1)
    text = "This is my first sentence. This is Sparta."
    tokens, w_l, s_l = tds._split_all(text)
    self.assertListEqual([6, 4], s_l)
    self.assertEqual([4, 2, 2, 5, 8, 1, 4, 2, 6, 1], w_l)
import tensorflow as tf

from data.dataset_prep import TextDataSetPrep

tds = TextDataSetPrep()
dataset = tds.read_tfr_dataset('train_small.tfr')

char_vocab_size = 200
word_vector_len = 50


def tfr_to_ragged(example, label):
    nested_0 = tf.RaggedTensor.from_row_lengths(
        values=example['characters'], row_lengths=example['len_level_1'])
    # nested_1 groups the word-level tensor by document but is not used below
    nested_1 = tf.RaggedTensor.from_row_lengths(
        values=nested_0, row_lengths=example['len_level_0'])
    return nested_0, label


dataset_ragged = dataset.map(tfr_to_ragged)
dataset = dataset_ragged.batch(32).prefetch(256)

# x = tf.keras.layers.Input(shape=(None, None, ), ragged=True, name='input_x')
# y = tf.keras.layers.Input(shape=(None,), ragged=False, name='input_y')

for x, y in dataset:
    x_cropped = x
    word_buckets = tf.strings.to_hash_bucket_fast(x_cropped, char_vocab_size)
    embedded = tf.keras.layers.Embedding(char_vocab_size, 8)(word_buckets)
    # need to turn each word into an observation to apply an LSTM
    # TODO: unfinished in the original -- tf.ragged.stack()
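    # A possible continuation (a sketch under assumptions, not the original
    # author's implementation): merge the batch and word dimensions so each
    # word becomes one character sequence, then encode every word with a
    # character-level LSTM of size word_vector_len.
    words = embedded.merge_dims(0, 1)  # ragged, shape (total_words, n_chars, 8)
    word_vectors = tf.keras.layers.LSTM(word_vector_len)(words.to_tensor())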
import tensorflow as tf

from data.dataset_prep import TextDataSetPrep

tds = TextDataSetPrep(nrows=20000)  # need pos and neg examples

MAX_SENT_LEN = 30
MAX_WORD_LEN = 15  # 15
MAX_REVIEW_LEN = 200  # 200
VOCAB_SIZE = 200  # not too many possible characters (but bear in mind accents, upper/lower case, ...)
CHAR_EMBEDD_DIM = 8  # 3
WORD_VECTORS_LEN = 128  # 256
REVIEW_EMBEDDING = 64

x, y = tds.get_ragged_tensors_dataset(split_characters=True)
x_cropped = x[:, :MAX_REVIEW_LEN, :MAX_WORD_LEN]
word_buckets = tf.strings.to_hash_bucket_fast(x_cropped, VOCAB_SIZE).to_tensor()

evaluation = tf.keras.metrics.CategoricalAccuracy()

inputs = tf.keras.layers.Input(shape=(None, None), ragged=False)
embedded_c = tf.keras.layers.Embedding(VOCAB_SIZE, CHAR_EMBEDD_DIM)(inputs)
reshaped = tf.reshape(embedded_c, (-1, MAX_WORD_LEN, CHAR_EMBEDD_DIM))
embedded_w = tf.keras.layers.LSTM(WORD_VECTORS_LEN)(reshaped)
group_by_review = tf.reshape(embedded_w, (-1, MAX_REVIEW_LEN, WORD_VECTORS_LEN))
lstm_review = tf.keras.layers.LSTM(REVIEW_EMBEDDING)(group_by_review)
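# A possible way to finish this script (a sketch with assumed choices, not the
# original author's model): put a sigmoid head on lstm_review, wrap everything
# in a Model, and fit on the padded word_buckets. This assumes y holds binary
# 0/1 labels and that the padded shapes match MAX_REVIEW_LEN / MAX_WORD_LEN
# exactly (to_tensor(shape=...) above can be used to guarantee that).
output = tf.keras.layers.Dense(1, activation='sigmoid')(lstm_review)
model = tf.keras.Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
model.fit(word_buckets, y, batch_size=32, epochs=5)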
from data.dataset_prep import TextDataSetPrep

if __name__ == '__main__':
    tds_small = TextDataSetPrep(nrows=600)
    tds_small.write_tfr_datasets(
        seed=123,
        dataset_split=[0.8, 0.1, 0.1],
        tfr_names=["train_small.tfr", "val_small.tfr", "test_small.tfr"])
""" applying a model on a specific dimentions of a ragged tensor """ import tensorflow as tf from data.dataset_prep import TextDataSetPrep VOCAB_N_L0 = 5 EMBEDDING_SIZE_L0 = 2 ENCODING_SIZE_L1 = 4 ragged_simple_seq = tf.ragged.constant( [['First', 'observation', 'single', 'sentence'], ['Second', 'observation', 'another', 'sentence']]) ds = TextDataSetPrep(nrows=1).read_tfr_dataset("train_small.tfr") ts_list = [] len_l0 = [] len_l1 = [] for item in ds.take(2): ts_list.append(item[0]['characters']) len_l0.append(item[0]['len_level_0']) len_l1.append(item[0]['len_level_1']) input_tokens = tf.keras.layers.Input(shape=(None, ), name='characters', dtype=tf.string) input_grouping = tf.keras.layers.Input(shape=(None, ), name='len_level_0', dtype=tf.int32)
import tensorflow as tf

from data.dataset_prep import TextDataSetPrep
from models.basic_sequence import model
from models.config import vocab_size

tds = TextDataSetPrep(nrows=200)  # need pos and neg examples

MAX_SENT_LEN = 30
MAX_WORD_LEN = 15
MAX_REVIEW_LEN = 200

x, y = tds.get_ragged_tensors_dataset()
x_cropped = x[:, :MAX_REVIEW_LEN]
print(x.bounding_shape())
word_buckets = tf.dtypes.cast(
    tf.strings.to_hash_bucket_fast(x_cropped, vocab_size), tf.float32)

evaluation = tf.keras.metrics.CategoricalAccuracy()

model.summary()
model.fit(word_buckets, y, batch_size=32, epochs=40)
print(f"Categorical accuracy after train: "
      f"{evaluation(model(word_buckets), y).numpy()}")
model.evaluate(word_buckets, y)
from data.dataset_prep import TextDataSetPrep

tds = TextDataSetPrep()
tds.csv_to_pickle(split_characters=True)