Example No. 1
def testNestedOutputs(self):
  ds = Dataset.zip((Dataset.range(4), Dataset.zip((Dataset.range(4),
                                                   Dataset.range(4)))))
  total = 0
  # The Iterator will return a nested structure of Tensor objects.
  # Some funkiness to compare against simple integers.
  for (i, x) in enumerate(datasets.Iterator(ds)):
    want = (i, (i, i))
    got = (x[0].numpy(), (x[1][0].numpy(), x[1][1].numpy()))
    self.assertEqual(got, want)
    total += 1
  self.assertEqual(4, total)
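
For comparison, the same nested-zip check can be written without the eager Iterator wrapper; a minimal sketch assuming TF 2.x, where a tf.data.Dataset is iterated directly:

import tensorflow as tf

ds = tf.data.Dataset.zip((tf.data.Dataset.range(4),
                          tf.data.Dataset.zip((tf.data.Dataset.range(4),
                                               tf.data.Dataset.range(4)))))
# Elements arrive as nested tuples of scalar Tensors: (i, (i, i)).
for i, (x, (y, z)) in enumerate(ds):
    assert (int(x), (int(y), int(z))) == (i, (i, i))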
Example No. 2
def deploy_dataset_generator(deploy_root_dir='data/Deploy',
                             batch_size=conf.batch_size,
                             input_size=conf.input_size):
    def _decode_image(im_path):
        im_raw = tf.read_file(im_path)
        # convert to a grayscale image and downscale x2
        image = tf.image.decode_jpeg(im_raw, channels=1, ratio=2)
        return image

    def _preprocess(im):
        # Convert to float scaled [0, 1].
        if im.dtype != tf.float32:
            im = tf.image.convert_image_dtype(im, dtype=tf.float32)

        # Resize image to output size.
        im = tf.image.resize_images(im, input_size)

        # H x W x C --> C x H x W
        return tf.transpose(im, perm=(2, 0, 1))

    def _sort(p):
        """'data/Deploy/KLAC/KLAC0003/KLAC0003_86.jpg'
        ==> 'data/Deploy/KLAC/KLAC0003/KLAC0003_0086.jpg'
        """
        prefix, old_name = p.split('_')
        new_name = old_name.zfill(8)
        return '_'.join([prefix, new_name])

    frames_name_list = sorted(glob('{}/*/*/*.jpg'.format(deploy_root_dir)),
                              key=_sort)
    dir_dataset = Dataset.from_tensor_slices(frames_name_list)
    img_dataset = dir_dataset.map(_decode_image)
    img_dataset = img_dataset.map(_preprocess)
    dataset = Dataset.zip((img_dataset, dir_dataset))
    return dataset.batch(batch_size), len(frames_name_list)
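
A minimal usage sketch for the function above, written for TF 1.x graph mode to match the tf.read_file / tf.image.resize_images calls it uses; the conf module and the data/Deploy tree are assumptions from the original project:

deploy_ds, n_frames = deploy_dataset_generator()
images, paths = deploy_ds.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    imgs, names = sess.run([images, paths])
    # imgs: (batch, 1, H, W) float32 in [0, 1]; names: the matching file paths.
    print(n_frames, imgs.shape, names[:3])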
Example No. 3
def get_dataset_encoded(dir='train', batch_size=32):
    # Load encoder.
    encoder = tfds.deprecated.text.SubwordTextEncoder.load_from_file('vocab')
    print('Vocab size is', encoder.vocab_size)
    # Load data.
    with open('dataset/' + dir + '/original.txt') as original:
        # Remove newline at the end.
        data_orig = original.readlines()[:-1]
    with open('dataset/' + dir + '/shuffled.txt') as shuffled:
        data_shuffled = shuffled.readlines()[:-1]
    data = data_orig + data_shuffled
    # Get song with max length to know the size for padding.
    max_len = 0
    longest_song = ''
    count = 0
    for i in range(len(data)):
        count += 1
        data[i] = data[i].strip()
        song = data[i]
        data[i] = encoder.encode(data[i])
        if len(data[i]) > max_len:
            max_len = len(data[i])
            longest_song = song
    print('max len is', max_len)
    print('longest song:', longest_song)
    # Create labels.
    labels = [1] * len(data_orig) + [0] * len(data_shuffled)
    # Shuffle.
    random.seed(42)
    random.shuffle(data)
    random.seed(42)
    random.shuffle(labels)
    # Create Dataset objects from generators.
    data_gen = lambda: (d for d in data)
    label_gen = lambda: ([l] for l in labels)
    dataset_data = tf.data.Dataset.from_generator(data_gen,
                                                  output_types=tf.int32,
                                                  output_shapes=tf.TensorShape(
                                                      [None]))
    dataset_labels = tf.data.Dataset.from_generator(
        label_gen, output_types=tf.int32, output_shapes=tf.TensorShape([1]))
    dataset = Dataset.zip((dataset_data, dataset_labels))
    # Pad every batch to max_len (padded_shapes fixes the sequence length).
    dataset_batched = dataset.padded_batch(batch_size,
                                           padding_values=0,
                                           padded_shapes=(max_len, 1))
    # Debug prints:
    print('{0} dataset: {1}'.format(dir, dataset_batched.cardinality()))
    # for element in dataset:
    #   print(element)
    for text_batch, label_batch in dataset_batched.take(1):
        print(text_batch.shape)
        print(label_batch.shape)
        for i in range(5):
            print(text_batch[i])
            print(label_batch[i])
    return dataset_batched
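
To illustrate why padded_shapes matters in the call above, a small self-contained sketch (assuming a recent TF 2.x release, where padded_batch can infer shapes when padded_shapes is omitted):

import tensorflow as tf

toy = tf.data.Dataset.from_generator(lambda: iter([[1], [1, 2], [1, 2, 3]]),
                                     output_types=tf.int32,
                                     output_shapes=tf.TensorShape([None]))
# Without padded_shapes each batch is padded only to its own longest element.
for batch in toy.padded_batch(2):
    print(batch.shape)  # (2, 2), then (1, 3)
# With a fixed padded_shapes, every batch is padded to the same length.
for batch in toy.padded_batch(2, padded_shapes=5):
    print(batch.shape)  # (2, 5), then (1, 5)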
Example No. 4
def get_dataset(dir='train', batch_size=32, padding_char='_'):
    fixed_len = 275
    # Load data, padding from left to fixed length.
    with open('dataset/' + dir + '/original.txt') as original:
        # Remove newline at the end.
        data_orig = original.readlines()[:-1]
    with open('dataset/' + dir + '/shuffled.txt') as shuffled:
        data_shuffled = shuffled.readlines()[:-1]
    data = data_orig + data_shuffled
    for i in range(len(data)):
        data[i] = data[i].strip()
        # # Add padding
        # data_len = len(data[i].split(' '))
        # padding_len = fixed_len - data_len
        # data[i] = (padding_char + ' ') * padding_len + data[i]
    # Add labels.
    labels = [1] * len(data_orig) + [0] * len(data_shuffled)
    # Shuffle.
    random.seed(42)
    random.shuffle(data)
    random.seed(42)
    random.shuffle(labels)
    # Convert to tensors.
    data_tensor = tf.ragged.constant(data)
    labels_tensor = tf.ragged.constant(labels)
    # Convert to Dataset object.
    features_dataset = Dataset.from_tensor_slices(data_tensor)
    labels_dataset = Dataset.from_tensor_slices(labels_tensor)
    dataset = Dataset.zip((features_dataset, labels_dataset))
    dataset = dataset.batch(batch_size)
    # Debug prints:
    print('{0} dataset: {1}'.format(dir, dataset.cardinality()))
    # for element in dataset:
    #   print(element)
    # for text_batch, label_batch in dataset.take(1):
    #       for i in range(5):
    #          print(text_batch.numpy()[i])
    #          print(label_batch.numpy()[i])
    return dataset
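
Since the features in this variant stay as raw space-separated strings, tokenization can be deferred to the pipeline itself; a sketch assuming a recent TF 2.x release, where tf.strings.split on a batch of strings yields a tf.RaggedTensor that tf.data can carry through map:

tokenized = dataset.map(
    lambda text, label: (tf.strings.split(text, sep=' '), label))
for tokens, labels in tokenized.take(1):
    # tokens: a (batch, None) ragged tensor of tokens; labels: a (batch,) tensor.
    print(tokens.shape, labels.shape)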
Example No. 5
# src_dataset, src_vocab_table, tgt_vocab_table and print_Dataset are defined
# earlier in the original script (not shown in this snippet).
tgt_dataset = Dataset.from_tensor_slices(
    tf.constant(['a b', 'b c', '', 'c c'])
)
# Look up the ids of the special tokens in the vocabulary tables.
src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant('eos')), tf.int32)
tgt_sos_id = tf.cast(tgt_vocab_table.lookup(tf.constant('sos')), tf.int32)
tgt_eos_id = tf.cast(tgt_vocab_table.lookup(tf.constant('eos')), tf.int32)

src_tgt_dataset = Dataset.zip((src_dataset, tgt_dataset))
print('begin')
print_Dataset(src_tgt_dataset)

# Split each sentence string into a vector of tokens.
src_tgt_dataset = src_tgt_dataset.map(
    lambda src, tgt: (tf.string_split([src]).values,
                      tf.string_split([tgt]).values)
)
print('string_split')
print_Dataset(src_tgt_dataset)

# Drop pairs where either side is an empty sequence.
src_tgt_dataset = src_tgt_dataset.filter(
    lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0)
)
print('Filter zero length input sequences')
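
The three special-token ids looked up above are not used in this excerpt; in the NMT-style pipeline this snippet follows, they are typically applied after the tokens are mapped to vocabulary ids (src_eos_id is normally used later as the padding value). A sketch of that next step, assuming the pipeline continues in the same style:

# Map token strings to vocabulary ids.
src_tgt_dataset = src_tgt_dataset.map(
    lambda src, tgt: (tf.cast(src_vocab_table.lookup(src), tf.int32),
                      tf.cast(tgt_vocab_table.lookup(tgt), tf.int32)))
# Build decoder input (prepend sos) and decoder target (append eos).
src_tgt_dataset = src_tgt_dataset.map(
    lambda src, tgt: (src,
                      tf.concat(([tgt_sos_id], tgt), 0),
                      tf.concat((tgt, [tgt_eos_id]), 0)))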
Example No. 6
import os
import pickle

import matplotlib.pyplot as plt
import tensorflow as tf

# examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
#                                as_supervised=True)
# train_examples, val_examples = examples['train'], examples['validation']
from tensorflow.python import keras
from tensorflow.python.data import Dataset
from tensorflow_datasets.core.features.text import TokenTextEncoder

#dataset_x = dataset_x.map(lambda token: token.numpy().decode("utf-8"))

dataset_x = tf.data.TextLineDataset("data/texts_noisy.txt")
dataset_y = tf.data.TextLineDataset("data/texts.txt")
dataset = Dataset.zip((dataset_x, dataset_y))



print("data loaded")

vocab = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
         'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'æ', 'ø', 'å', ' ']

tokenizer_pt = None

tokenizer_en = None

if os.path.isfile('tokenizer_pt.pickle'):
    with open('tokenizer_pt.pickle', 'rb') as handle:
        tokenizer_pt = pickle.load(handle)
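
Related to the commented-out map above: under eager execution (the TF 2.x default) the zipped line pairs can be pulled from the dataset and decoded to Python strings directly, a minimal sketch:

for noisy, clean in dataset.take(2):
    # Each element is a pair of scalar tf.string tensors, one line per file.
    print(noisy.numpy().decode('utf-8'), '->', clean.numpy().decode('utf-8'))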
Example No. 7
def get_dataset(dir='train', batch_size=32):
    # Load encoder.
    encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab')
    # Load data.
    with open('dataset/' + dir + '/original.txt') as original:
        # Remove newline at the end.
        data_orig = original.readlines()[:-1]
    with open('dataset/' + dir + '/shuffled.txt') as shuffled:
        data_shuffled = shuffled.readlines()[:-1]
    data = data_orig + data_shuffled
    max_len = 0
    count = 0
    for i in range(len(data)):
        count += 1
        data[i] = data[i].strip()
        data[i] = encoder.encode(data[i])
        if len(data[i]) > max_len:
            max_len = len(data[i])
    print('max len is', max_len)
    # Add padding.
    # for i in range(len(data)):
    #     data[i] += [0]*(max_len - len(data[i]))
    # Add labels.
    labels = [1] * len(data_orig) + [0] * len(data_shuffled)
    # Shuffle.
    random.seed(42)
    random.shuffle(data)
    random.seed(42)
    random.shuffle(labels)
    # Convert to tensors.
    # data_tensor = tf.ragged.constant(data)
    # labels_tensor = tf.ragged.constant(labels)
    # # Convert to Dataset object.
    # features_dataset = Dataset.from_tensor_slices(data_tensor)
    # labels_dataset = Dataset.from_tensor_slices(labels_tensor)
    # dataset = Dataset.zip((features_dataset, labels_dataset))
    # # Convert to numpy array to create Dataset object.
    # dataset = Dataset.from_tensor_slices((data, labels))
    data_gen = lambda: (d for d in data)
    label_gen = lambda: ([l] for l in labels)
    dataset_data = tf.data.Dataset.from_generator(data_gen,
                                                  output_types=tf.int32,
                                                  output_shapes=tf.TensorShape(
                                                      [None]))
    dataset_labels = tf.data.Dataset.from_generator(
        label_gen, output_types=tf.int32, output_shapes=tf.TensorShape([1]))
    dataset = Dataset.zip((dataset_data, dataset_labels))
    # im_dataset = im_dataset.prefetch(4)
    # print("output data type is ", im_dataset.output_types)
    # print("output data shape is ", im_dataset.output_shapes)
    # iterator = im_dataset.make_initializable_iterator()
    # with tf.Session() as sess:
    #     sess.run(iterator.initializer)
    #     a = sess.run(iterator.get_next())
    # print("shape of the run results are: ")
    # print(a[0].shape)
    # print(a[1].shape)
    # print(a[2].shape)
    # print(a[3].shape)
    # for elem, val in dataset:
    #     print(elem)
    #     print(val)
    #     break
    # Pad every batch to max_len (padded_shapes fixes the sequence length).
    dataset_batched = dataset.padded_batch(batch_size,
                                           padding_values=0,
                                           padded_shapes=(max_len, 1))
    # Debug prints:
    print('{0} dataset: {1}'.format(dir, dataset_batched.cardinality()))
    # for element in dataset:
    #   print(element)
    # for text_batch, label_batch in dataset_batched.take(1):
    #     print(text_batch.shape)
    #     print(label_batch.shape)
    #     for i in range(5):
    #         print(text_batch[i])
    #         print(label_batch[i])
    return dataset_batched