Example #1
    def test_nested_len(self):
        tds = TextDataSetPrep(nrows=1)
        # lengths of the sub-lists at each nesting level
        all_lens = list(tds._len_per_nested_list([[[1, 2], [3]], [[4]]]))
        self.assertListEqual([2, 1], all_lens[0])
        self.assertListEqual([2, 1, 1], all_lens[1])
Example #2
    def test_doc_to_pickle(self):
        self._cleanup()
        tds = TextDataSetPrep(nrows=20)
        dataset_dir = tds.csv_to_pickle(target_dir=self.temp_pkl_path)
        first_id = tds.label_df.iloc[0, 0]
        first_label = tds.label_df.iloc[0, 1]
        test_path = os.path.join(dataset_dir, first_label, first_id + '.pkl')
        self.assertTrue(os.path.isfile(test_path))
Example #3
    def test_get_ids_per_dataset(self):
        v1 = TextDataSetPrep._get_ids_per_dataset(list(range(10)), [0.5, 0.5],
                                                  seed=7357)
        v2 = TextDataSetPrep._get_ids_per_dataset(list(range(10)),
                                                  [0.25, 0.25, 0.25, 0.25],
                                                  seed=7357)
        self.assertEqual(len(v1), 2)
        self.assertEqual(len(v2), 4)
        self.assertListEqual([4, 1, 3, 5, 7], v1[0])
Example #4
    def test_ragged_memory(self):
        tds = TextDataSetPrep(csv_path=IMDB_CSV_TEST, nrows=100)
        x, _ = tds.get_ragged_tensors_dataset()
        self.assertListEqual(list(x.bounding_shape().numpy()), [100, 940])

        x, _ = tds.get_ragged_tensors_dataset(split_characters=True)
        self.assertListEqual(list(x.bounding_shape().numpy()), [100, 940, 24])

        x, _ = tds.get_ragged_tensors_dataset(split_characters=True,
                                              split_sentences=True)
        self.assertListEqual(list(x.bounding_shape().numpy()),
                             [100, 46, 188, 24])
Example #5
    def test_text_split(self):
        doc = "this is the first sentence. This is the second one. \n\n This is  a new paragraph"
        tds_def = TextDataSetPrep(nrows=1, spacy_model=None)
        tokens = tds_def._text_split(doc)
        self.assertEqual('this', tokens[0])
        self.assertEqual(len(tokens), 17)
        self.assertIsInstance(tokens[0], str)

        tokens_s = tds_def._text_split(doc, split_sentences=True)

        self.assertEqual('this', tokens_s[0][0])
        self.assertEqual(len(tokens_s), 3)
        self.assertIsInstance(tokens_s[0][0], str)

        tokens_s_c = tds_def._text_split(doc,
                                         split_sentences=True,
                                         split_characters=True)
        self.assertEqual('t', tokens_s_c[0][0][0])
        self.assertEqual(len(tokens_s_c), 3)
        self.assertIsInstance(tokens_s_c[0][0][0], str)

        tds_spacy = TextDataSetPrep(nrows=1)
        tokens_s = tds_spacy._text_split(doc, split_sentences=True)
        self.assertEqual('this', tokens_s[0][0])
        self.assertEqual(len(tokens_s), 3)
        self.assertIsInstance(tokens_s[0][0], str)
Example #6
    def test_serial_deserial(self):
        tds = TextDataSetPrep(csv_path=None,
                              id_col='id',
                              text_col='text',
                              label_col='label')
        data = [
            'abc', {
                'label': 'label',
                'text': "this is my text for testing. It has two sentences"
            }
        ]
        serialized = tds._serialize_tokens_tfr(data)
        with tf.io.TFRecordWriter(self.temp_tfr_path) as writer:
            writer.write(serialized.SerializeToString())
        dataset = tf.data.TFRecordDataset(self.temp_tfr_path)
        for e in dataset:
            _ = tds._deserialize_tokens(e)
Example #7
    def test_write_tfr(self):
        self._cleanup()
        # write the same TFRecord twice: the second call has to succeed even
        # though the file already exists
        _ = TextDataSetPrep(csv_path=IMDB_CSV_TEST).write_tfr_datasets(
            tfr_names=[self.temp_tfr_path])
        _ = TextDataSetPrep(csv_path=IMDB_CSV_TEST).write_tfr_datasets(
            tfr_names=[self.temp_tfr_path])
Example #8
    def test_get_x_y(self):
        tds = TextDataSetPrep(nrows=40,
                              spacy_model=None,
                              csv_path=IMDB_CSV_TEST)
        df_ids = tds._selected_ids(seed=7357, n_per_label=2)
        self.assertIn('5855_1.txt', df_ids.values)
Example #9
    def test_get_imdb_data(self):
        _ = TextDataSetPrep(nrows=10)
Example #10
    def test_split_all(self):
        tds = TextDataSetPrep(nrows=1)
        text = "This is my first sentence. This is Sparta."
        tokens, w_l, s_l = tds._split_all(text)
        self.assertListEqual([6, 4], s_l)
        self.assertListEqual([4, 2, 2, 5, 8, 1, 4, 2, 6, 1], w_l)
Example #11
import tensorflow as tf
from data.dataset_prep import TextDataSetPrep

tds = TextDataSetPrep()

dataset = tds.read_tfr_dataset('train_small.tfr')

char_vocab_size = 200
word_vector_len = 50


def tfr_to_ragged(example, label):
    # rebuild the nesting stored in the TFRecord: the flat characters are
    # grouped by len_level_1, and those groups are grouped again by len_level_0
    nested_0 = tf.RaggedTensor.from_row_lengths(
        values=example['characters'], row_lengths=example['len_level_1'])
    nested_1 = tf.RaggedTensor.from_row_lengths(
        values=nested_0, row_lengths=example['len_level_0'])
    # return the fully nested ragged tensor (nested_0 alone would drop the
    # outer grouping and leave nested_1 unused)
    return nested_1, label


dataset_ragged = dataset.map(tfr_to_ragged)
dataset = dataset_ragged.batch(32).prefetch(256)

# x = tf.keras.layers.Input(shape=(None, None, ), ragged=True, name='input_x')
# y = tf.keras.layers.Input(shape=(None,), ragged=False, name='input_y')

for x, y in dataset:
    x_cropped = x
    word_buckets = tf.strings.to_hash_bucket_fast(x_cropped, char_vocab_size)
    embedded = tf.keras.layers.Embedding(char_vocab_size, 8)(word_buckets)
    # TODO: turn each word into its own observation before applying an LSTM
    # over its characters -- see the sketch below
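One way to finish the step flagged in that TODO is sketched below. It is only an illustration, not part of the original pipeline: it assumes each batch yielded by `dataset` is a ragged string tensor whose innermost dimension holds the characters of one word (any extra outer level is absorbed by `merge_dims`), and it reuses `char_vocab_size` and `word_vector_len` from above; the layer sizes are arbitrary.

# Hypothetical sketch: encode every word of one batch with a character-level LSTM.
char_embedding = tf.keras.layers.Embedding(char_vocab_size, 8)
word_encoder = tf.keras.layers.LSTM(word_vector_len)

for x_batch, _ in dataset.take(1):
    buckets = tf.strings.to_hash_bucket_fast(x_batch, char_vocab_size)
    # collapse every outer dimension so each word becomes one observation
    per_word = buckets.merge_dims(0, -2)       # [total_words, (characters)]
    per_word_dense = per_word.to_tensor()      # pad characters to a rectangle
    word_vectors = word_encoder(char_embedding(per_word_dense))
    print(word_vectors.shape)                  # (total_words, word_vector_len)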
Example #12
import tensorflow as tf
from data.dataset_prep import TextDataSetPrep

tds = TextDataSetPrep(nrows=20000)  # need pos and neg examples
MAX_SENT_LEN = 30
MAX_WORD_LEN = 15  # 15
MAX_REVIEW_LEN = 200  # 200
VOCAB_SIZE = 200  # not too many possible characters (but bear in mind accents, upper/lower case, ...)
CHAR_EMBEDD_DIM = 8  # 3
WORD_VECTORS_LEN = 128  # 256
REVIEW_EMBEDDING = 64

x, y = tds.get_ragged_tensors_dataset(split_characters=True)

x_cropped = x[:, :MAX_REVIEW_LEN, :MAX_WORD_LEN]

word_buckets = tf.strings.to_hash_bucket_fast(x_cropped,
                                              VOCAB_SIZE).to_tensor()
evaluation = tf.keras.metrics.CategoricalAccuracy()

inputs = tf.keras.layers.Input(shape=(
    None,
    None,
), ragged=False)

embedded_c = tf.keras.layers.Embedding(VOCAB_SIZE, CHAR_EMBEDD_DIM)(inputs)
reshaped = tf.reshape(embedded_c, (-1, MAX_WORD_LEN, CHAR_EMBEDD_DIM))
embedded_w = tf.keras.layers.LSTM(WORD_VECTORS_LEN)(reshaped)
group_by_review = tf.reshape(embedded_w,
                             (-1, MAX_REVIEW_LEN, WORD_VECTORS_LEN))
lstm_review = tf.keras.layers.LSTM(REVIEW_EMBEDDING)(group_by_review)
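The graph above is never wrapped into a model or run on data. A minimal sketch of doing so follows; it is an assumption-laden illustration rather than the original training code. The explicit shape= padding is needed because the reshapes above expect exactly MAX_REVIEW_LEN words of MAX_WORD_LEN characters, and the [:32] slice just keeps the eager call small.

# Hypothetical check that the char -> word -> review stack yields one review
# embedding per example.
encoder = tf.keras.Model(inputs=inputs, outputs=lstm_review)
encoder.summary()

word_buckets_fixed = tf.strings.to_hash_bucket_fast(
    x_cropped, VOCAB_SIZE).to_tensor(shape=[None, MAX_REVIEW_LEN, MAX_WORD_LEN])
review_vectors = encoder(tf.cast(word_buckets_fixed[:32], tf.float32))
print(review_vectors.shape)  # (32, REVIEW_EMBEDDING)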
Example #13
from data.dataset_prep import TextDataSetPrep

if __name__ == '__main__':

    tds_small = TextDataSetPrep(nrows=600)
    tds_small.write_tfr_datasets(
        seed=123,
        dataset_split=[0.8, 0.1, 0.1],
        tfr_names=["train_small.tfr", "val_small.tfr", "test_small.tfr"])
Example #14
"""
applying a model on a specific dimentions of a ragged tensor
"""

import tensorflow as tf
from data.dataset_prep import TextDataSetPrep

VOCAB_N_L0 = 5
EMBEDDING_SIZE_L0 = 2
ENCODING_SIZE_L1 = 4

ragged_simple_seq = tf.ragged.constant(
    [['First', 'observation', 'single', 'sentence'],
     ['Second', 'observation', 'another', 'sentence']])

ds = TextDataSetPrep(nrows=1).read_tfr_dataset("train_small.tfr")

ts_list = []
len_l0 = []
len_l1 = []
for item in ds.take(2):
    ts_list.append(item[0]['characters'])
    len_l0.append(item[0]['len_level_0'])
    len_l1.append(item[0]['len_level_1'])

input_tokens = tf.keras.layers.Input(shape=(None, ),
                                     name='characters',
                                     dtype=tf.string)
input_grouping = tf.keras.layers.Input(shape=(None, ),
                                       name='len_level_0',
                                       dtype=tf.int32)
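The script stops after declaring the Keras inputs, and ragged_simple_seq together with the *_L0 / *_L1 constants is left unused. Below is a minimal, purely illustrative sketch of what the docstring describes: embedding the innermost (word) dimension of a ragged tensor while keeping its row structure, then encoding each row. The hash-bucket vocabulary, the random embedding table, and the use of an LSTM on ragged input (supported in recent TF 2.x) are assumptions, not part of the original code.

# Hypothetical sketch: apply a lookup along the innermost ragged dimension.
embedding_table = tf.Variable(
    tf.random.uniform([VOCAB_N_L0, EMBEDDING_SIZE_L0], -0.1, 0.1))
buckets = tf.strings.to_hash_bucket_fast(ragged_simple_seq, VOCAB_N_L0)
# map_flat_values applies the lookup to the flat values and keeps the ragged rows
embedded = tf.ragged.map_flat_values(tf.nn.embedding_lookup, embedding_table,
                                     buckets)
print(embedded.shape)  # (2, None, EMBEDDING_SIZE_L0)

# encode each (ragged) sequence of word embeddings into a fixed-size vector
encoded = tf.keras.layers.LSTM(ENCODING_SIZE_L1)(embedded)
print(encoded.shape)  # (2, ENCODING_SIZE_L1)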
Example #15
import tensorflow as tf
from data.dataset_prep import TextDataSetPrep
from models.basic_sequence import model
from models.config import vocab_size

tds = TextDataSetPrep(nrows=200)  # need pos and neg examples
MAX_SENT_LEN = 30
MAX_WORD_LEN = 15
MAX_REVIEW_LEN = 200

x, y = tds.get_ragged_tensors_dataset()

x_cropped = x[:, :MAX_REVIEW_LEN]
print(x.bounding_shape())

word_buckets = tf.dtypes.cast(
    tf.strings.to_hash_bucket_fast(x_cropped, vocab_size), tf.float32)
evaluation = tf.keras.metrics.CategoricalAccuracy()

model.summary()

model.fit(word_buckets, y, batch_size=32, epochs=40)

# tf.keras metrics take (y_true, y_pred)
print(
    f"Categorical accuracy after train: {evaluation(y, model(word_buckets)).numpy()}"
)

model.evaluate(word_buckets, y)
Example #16
from data.dataset_prep import TextDataSetPrep

tds = TextDataSetPrep()

tds.csv_to_pickle(split_characters=True)