Example #1
def motif_discovery_raw(train_file, test_file):
    subset_size = 690 * 190

    x_shape = len(range(101))
    train_gen = gen_from_fasta(train_file, None)
    test_gen = gen_from_fasta(test_file, None)

    # datasets
    batch_size = 512
    prefetch = tf.data.experimental.AUTOTUNE

    output_shapes = ((), ())
    output_types = (tf.string, tf.float32)

    train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)
    # takes about 30 seconds to skip the training data
    val_ds = train_ds.skip(subset_size).take(690 * 10).map(vectorize_text)
    train_ds = train_ds.take(subset_size).shuffle(500).batch(batch_size).map(
        vectorize_text).prefetch(prefetch)

    test_ds = Dataset.from_generator(test_gen, output_types, output_shapes)
    test_ds = test_ds.take(subset_size).batch(batch_size).map(
        vectorize_text).prefetch(prefetch)

    x_val, y_val = [], []
    for d in val_ds:
        x_val.append(d[0])
        y_val.append(d[1])
    x_val = tf.convert_to_tensor(x_val)
    y_val = tf.convert_to_tensor(y_val)
    validation_data = (x_val, y_val)

    return x_shape, train_ds, validation_data, test_ds
Example #2
def run(data_path, image_size=160, epochs=10, batch_size=32, learning_rate=0.0001, output='model', dataset=None):
    img_shape = (image_size, image_size, 3)

    info('Loading Data Set')
    # load dataset
    train, test, val, labels = load_dataset(data_path, dataset)

    # training data
    train_data, train_labels = zip(*train)
    train_ds = Dataset.zip((Dataset.from_tensor_slices(list(train_data)),
                            Dataset.from_tensor_slices(list(train_labels))))

    train_ds = train_ds.map(map_func=process_image, 
                            num_parallel_calls=5)

    train_ds = train_ds.apply(tf.data.experimental.ignore_errors())

    train_ds = train_ds.batch(batch_size)
    train_ds = train_ds.prefetch(buffer_size=5)
    train_ds = train_ds.repeat()

    # model
    info('Creating Model')
    base_model = tf.keras.applications.ResNet50(input_shape=img_shape,
                                               include_top=False, 
                                               weights='imagenet')
    base_model.trainable = True

    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss='binary_crossentropy', 
              metrics=['accuracy'])

    model.summary()

    # training
    info('Training')
    steps_per_epoch = math.ceil(len(train)/batch_size)
    history = model.fit(train_ds, epochs=epochs, steps_per_epoch=steps_per_epoch)

    # save model
    info('Saving Model')
    
    # check existence of base model folder
    output = check_dir(output)

    print('Serializing into saved_model format')
    tf.saved_model.save(model, str(output))

    # add time prefix folder
    #stamp = datetime.now().strftime('%y_%m_%d_%H_%M.h5')
    #stamped = str(Path(output).joinpath(stamp))
    file_output = str(Path(output).joinpath('latest.h5'))
    #print('Serializing model to:\n{}\n{}'.format(stamped, output)
    model.save(file_output)
Example #3
def make_tf_dataset(file_path='', batch_size=10):
    loaded_data = np.load(file_path)
    X_train = loaded_data['X_train']
    X_test = loaded_data['X_test']
    Y_train = loaded_data['Y_train']
    Y_test = loaded_data['Y_test']

    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, flush=True)

    X_train = tf.cast(X_train, tf.float32)
    X_test = tf.cast(X_test, tf.float32)
    Y_train = tf.cast(Y_train, tf.int32)
    Y_test = tf.cast(Y_test, tf.int32)

    train_dat = Dataset.from_tensor_slices((X_train, Y_train))
    train_dat = train_dat.batch(batch_size)

    test_dat = Dataset.from_tensor_slices((X_test, Y_test))
    test_dat = test_dat.batch(batch_size)
    data_dict = {}

    iterator = Iterator.from_structure(train_dat.output_types,
                                       train_dat.output_shapes)

    data_dict['iterator'] = iterator
    data_dict['train_it_init'] = iterator.make_initializer(train_dat)
    data_dict['test_it_init'] = iterator.make_initializer(test_dat)

    #data_dict['train_it'] = train_iterator
    #data_dict['test_it'] = test_iterator
    #data_dict['train_it_init'] = train_iterator.initializer
    #data_dict['test_it_init'] = test_iterator.initializer

    return data_dict
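
# A minimal usage sketch (not part of the original example), assuming the TF1
# graph-mode APIs used above (Iterator.from_structure, initializable iterators)
# are available. The file name 'data.npz' is a placeholder.
import tensorflow as tf

data = make_tf_dataset('data.npz', batch_size=10)
features, labels = data['iterator'].get_next()  # symbolic tensors shared by both splits

with tf.compat.v1.Session() as sess:
    sess.run(data['train_it_init'])   # point the shared iterator at the training set
    try:
        while True:
            x_batch, y_batch = sess.run([features, labels])
    except tf.errors.OutOfRangeError:
        pass                          # one pass over the training data is done
    sess.run(data['test_it_init'])    # switch the same iterator to the test set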
Example #4
def main():

    dataset_count = 10

    def create_dataset(i):
        return Dataset.range(4 * i, 4 * (i + 1))

    dataset = Dataset.range(dataset_count).map(create_dataset)

    for d in dataset:
        show_dataset(d)

    d = dataset.flat_map(lambda x: x)
    show_dataset(d)

    d = dataset.interleave(lambda x: x, cycle_length=2, block_length=3)
    show_dataset(d)

    # Repeat two datasets of different lengths and interleave them.
    a = Dataset.from_tensor_slices(np.arange(10)).repeat()
    b = Dataset.from_tensor_slices(100 + np.arange(17)).repeat()
    datasets = [a, b]
    n = len(datasets)
    c = Dataset.from_tensor_slices(datasets)
    d = c.interleave(lambda x: x, cycle_length=n).take(50)
    show_dataset(d)
Example #5
    def prepare_train_generator(self):
        image_names = glob.glob(self.dir_name +
                                "/training_data/images/images/*.jpg")
        image_names.extend(
            glob.glob(self.dir_name + "/training_data/images/images/*.png"))
        image_names.extend(
            glob.glob(self.dir_name + "/training_data/images/images/*.bmp"))
        image_names.extend(
            glob.glob(self.dir_name + "/training_data/images/images/*.tif"))
        sample_img = cv2.imread(image_names[0])
        target_shape = (sample_img.shape[0], sample_img.shape[1])

        crop_generator = CropGenerator(self.dir_name, target_shape)

        #image_dataset = tf.data.Dataset.list_files(self.dir_name + '/training_data/images/images/*')
        total_dataset = Dataset.range(1, 8).interleave(
            lambda x: Dataset.from_generator(
                CropGenerator(self.dir_name, target_shape),
                output_types=(tf.float32, tf.float32)),
            cycle_length=8)
        total_dataset = total_dataset.shuffle(buffer_size=20)
        #total_dataset = total_dataset.cache("./data_cache.")
        total_dataset = total_dataset.repeat()
        total_dataset = total_dataset.prefetch(buffer_size=20)
        data_tf = total_dataset.make_one_shot_iterator().get_next()
        return data_tf, crop_generator()
Example #6
def test_input_fn(x_test, y_test, batch_size):
    if y_test is None:
        ds = tds.from_tensor_slices(
            {'input-features': x_test})
    else:
        ds = tds.from_tensor_slices(
            ({'input-features': x_test},
             y_test.reshape(-1, 1)))
    return ds.batch(batch_size)
Example #7
def getData(mypath, config):

    # get list of filepaths
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    file_paths = [join(mypath, s) for s in onlyfiles]

    # create numpy datasets for each stock
    data = []
    for fname in file_paths:
        data.append(
            DataLoader(fname,
                       window=config.experiment.window,
                       threshold=config.experiment.threshold))

    # initialize numpy arrays for training and test data
    X_train = data[0].X_train_std
    Y_train = data[0].Y_train
    X_val = data[0].X_val_std
    Y_val = data[0].Y_val
    X_test = data[0].X_test_std
    Y_test = data[0].Y_test

    # add other stocks to previously initialized numpy arrays
    for i in range(1, len(data)):
        X_train = np.concatenate((X_train, data[i].X_train_std), axis=0)
        Y_train = np.concatenate((Y_train, data[i].Y_train), axis=0)
        X_val = np.concatenate((X_val, data[i].X_val_std), axis=0)
        Y_val = np.concatenate((Y_val, data[i].Y_val), axis=0)
        X_test = np.concatenate((X_test, data[i].X_test_std), axis=0)
        Y_test = np.concatenate((Y_test, data[i].Y_test), axis=0)

    # Save number of features and samples
    num_train_samples = X_train.shape[0]
    num_val_samples = X_val.shape[0]
    num_test_samples = X_test.shape[0]
    num_train_features = X_train.shape[1]

    # Generate TF dataset for Keras model
    logging.info('------Final Training and Test Datasets------')
    logging.info('Size of X_Train: %s', X_train.shape)
    logging.info('Size of Y_Train: %s', Y_train.shape)
    logging.info('Size of X_val: %s', X_val.shape)
    logging.info('Size of Y_val: %s', Y_val.shape)
    logging.info('Size of X_Test: %s', X_test.shape)
    logging.info('Size of Y_Test: %s', Y_test.shape)
    train_dataset = Dataset.from_tensor_slices((X_train, Y_train))
    train_dataset = train_dataset.shuffle(config.model.shuffle).batch(
        config.model.batch_size).repeat()
    val_dataset = Dataset.from_tensor_slices((X_val, Y_val))
    val_dataset = val_dataset.shuffle(config.model.shuffle).batch(
        config.model.batch_size).repeat()
    test_dataset = Dataset.from_tensor_slices((X_test, Y_test))
    test_dataset = test_dataset.shuffle(config.model.shuffle).batch(
        config.model.batch_size).repeat()

    return train_dataset, val_dataset, test_dataset, num_train_features, num_train_samples, num_val_samples, num_test_samples
Example #8
def batch_and_repeat(ds: Dataset, batch_size: int, shuffle: bool,
                     repeat: bool) -> Dataset:
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    if shuffle:
        ds = ds.shuffle(1024, seed=SEED)
    if repeat:
        ds = ds.repeat()
    if batch_size > 0:
        ds = ds.batch(batch_size, drop_remainder=False)
    return ds
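
# A minimal usage sketch (not from the original source): building a shuffled,
# repeated training pipeline and a plain evaluation pipeline from in-memory
# arrays. AUTOTUNE and SEED are assumed to be the module-level constants used
# by batch_and_repeat above; the array shapes are placeholders.
import numpy as np
import tensorflow as tf

features = np.random.rand(1000, 32).astype('float32')
labels = np.random.randint(0, 2, size=1000)

base_ds = tf.data.Dataset.from_tensor_slices((features, labels))
train_ds = batch_and_repeat(base_ds, batch_size=64, shuffle=True, repeat=True)
eval_ds = batch_and_repeat(base_ds, batch_size=64, shuffle=False, repeat=False)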
Example #9
def train_test_split(data: Dataset,
                     test_cnt: int) -> Tuple[Dataset, Dataset]:
    values, labels = next(
        data.as_numpy_iterator())  # both numpy arrays of the same length
    permuted_idxs: np.ndarray = np.random.permutation(len(values))
    pvals, plabs = (values[permuted_idxs], labels[permuted_idxs])
    train: tf.data.Dataset = Dataset.from_tensors(
        (pvals[test_cnt:], plabs[test_cnt:]))
    test: tf.data.Dataset = Dataset.from_tensors(
        (pvals[:test_cnt], plabs[:test_cnt]))
    return train, test
Example #10
    def train(self,
              train_dataset: Dataset,
              valid_dataset: Dataset = None,
              batch_size: int = 256,
              epochs: int = 16,
              checkpoints_path: Path = None):
        print("Training model...")

        ckpt = None
        manager = None
        if checkpoints_path is not None:
            checkpoints_path.mkdir(parents=True, exist_ok=True)
            ckpt = tf.train.Checkpoint(step=tf.Variable(1),
                                       optimizer=self.optimizer,
                                       net=self.network)
            manager = tf.train.CheckpointManager(ckpt,
                                                 checkpoints_path,
                                                 max_to_keep=3)
            ckpt.restore(manager.latest_checkpoint)
            if manager.latest_checkpoint:
                print(f"Restored from {manager.latest_checkpoint}")
            else:
                print("Initializing from scratch.")

        # Batch the datasets
        train_dataset = train_dataset.shuffle(1024).batch(batch_size).prefetch(
            buffer_size=tf.data.experimental.AUTOTUNE)
        valid_dataset = valid_dataset.batch(batch_size)

        # Start training the model.
        for epoch in range(1, epochs + 1):
            for images, labels in train_dataset:
                self._train_step(images, labels)

            for valid_images, valid_labels in valid_dataset:
                self._test_step(valid_images, valid_labels)

            if checkpoints_path is not None:
                ckpt.step.assign_add(1)
                if int(ckpt.step) % 10 == 0:
                    save_path = manager.save()
                    print(
                        f"💾 Saved checkpoint for step {int(ckpt.step)}: {save_path}"
                    )

            print(
                f"Epoch {epoch}, "
                f"Loss: {self.train_loss.result()}, Accuracy: {self.train_accuracy.result() * 100}, "
                f"Valid Loss: {self.test_loss.result()}, Valid Accuracy: {self.test_accuracy.result() * 100}"
            )

        # Save the model.
        self.network.trainable = False
        self.network.save(self.save_path)
Example #11
def make_generator(src_dir, valid_rate, input_size, batch_size):

    # Create the augmenting ImageDataGenerator instance
    train_datagen = ImageDataGenerator(rescale=1. / 255,
                                       rotation_range=30,
                                       width_shift_range=0.2,
                                       height_shift_range=0.2,
                                       shear_range=30,
                                       zoom_range=[0.7, 1.3],
                                       horizontal_flip=True,
                                       vertical_flip=True,
                                       validation_split=valid_rate)

    # Create the generator
    # --- training data
    train_generator = train_datagen.flow_from_directory(
        directory=src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='training')

    # Create the generator
    # --- validation data
    valid_generator = train_datagen.flow_from_directory(
        directory=src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='validation')

    # Wrap as a tf.data.Dataset
    # --- training data generator
    train_ds = Dataset.from_generator(lambda: train_generator,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=([
                                          None, *train_generator.image_shape
                                      ], [None, train_generator.num_classes]))

    # Wrap as a tf.data.Dataset
    # --- validation data generator
    valid_ds = Dataset.from_generator(lambda: valid_generator,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=([
                                          None, *valid_generator.image_shape
                                      ], [None, valid_generator.num_classes]))

    # Make each Dataset repeat indefinitely
    train_ds = train_ds.repeat()
    valid_ds = valid_ds.repeat()

    return train_ds, train_generator.n, valid_ds, valid_generator.n
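
# A minimal usage sketch (not part of the original example): because both
# returned Datasets repeat() forever, Keras needs explicit step counts derived
# from the sample counts the function also returns. 'data/images', the argument
# values, and `model` (a compiled Keras classifier) are placeholders.
batch_size = 32
train_ds, train_n, valid_ds, valid_n = make_generator(
    'data/images', valid_rate=0.2, input_size=(224, 224), batch_size=batch_size)
model.fit(train_ds,
          steps_per_epoch=train_n // batch_size,
          validation_data=valid_ds,
          validation_steps=valid_n // batch_size,
          epochs=10)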
Example #12
def create_final_datasets(X_train, X_valid, y_train, y_valid):
    train = Dataset.from_tensor_slices((X_train, (y_train.identity_hate.values, y_train.insult.values,
                                                  y_train.obscene.values, y_train.severe_toxic.values,
                                                  y_train.threat.values, y_train.toxic.values))).map(
        custom_loss.preprocess_sample).batch(config.BATCH_SIZE).repeat()

    valid = Dataset.from_tensor_slices((X_valid, (y_valid.identity_hate.values, y_valid.insult.values,
                                                  y_valid.obscene.values, y_valid.severe_toxic.values,
                                                  y_valid.threat.values, y_valid.toxic.values))).map(
        custom_loss.preprocess_sample).batch(config.BATCH_SIZE).repeat()

    return train, valid
Example #13
def train_sa_bilstm(pad_to, lstm_hidden, da, r, lr, loss, savefigto):
    (train_x, train_y), (val_x, val_y), (test_x, test_y) = \
        load_ESOL('data/ESOL-solubility.csv', 'data/mol2vec_model_300dim.pkl', pad_to=pad_to)
    _, _, vector_size = train_x.shape
    model = build_sa_bilstm_model(pad_to=pad_to,
                                  vector_size=vector_size,
                                  lstm_hidden=lstm_hidden,
                                  da=da,
                                  r=r)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr),
                  loss=loss,
                  metrics=['mse'])
    print(model.summary())
    print(train_x.shape, train_y.shape)
    train_dataset = Dataset.from_tensor_slices(
        (train_x, train_y)).shuffle(buffer_size=128).batch(64,
                                                           drop_remainder=True)
    val_dataset = Dataset.from_tensor_slices(
        (val_x, val_y)).batch(32, drop_remainder=True)
    test_dataset = Dataset.from_tensor_slices(
        (test_x, test_y)).batch(32, drop_remainder=True)
    # This eats huge HD space!
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir='./logs',
                                                       histogram_freq=1,
                                                       update_freq='batch')
    earlystop_callback = keras.callbacks.EarlyStopping(patience=10)
    checkpoint_callback = keras.callbacks.ModelCheckpoint(
        f'./checkpoints/model-sa-bilstm-{pad_to}-{lstm_hidden}-{da}-{r}-{lr}-{loss}.ckpt',
        save_best_only=True)
    model.fit(train_dataset,
              epochs=100,
              validation_data=val_dataset,
              callbacks=[
                  tensorboard_callback, earlystop_callback, checkpoint_callback
              ])

    # de-normalize using the dataset std (2.0965) and mean (-3.058)
    predict = np.array(model.predict(test_x)).ravel() * 2.0965 - 3.058
    truth = np.array(test_y).ravel() * 2.0965 - 3.058

    plt.figure(figsize=(5, 5))
    plt.scatter(predict, truth)
    plt.plot([-8, 0], [-8, 0], 'r--')
    plt.axis([-8, 0, -8, 0])
    plt.xlabel("Prediction")
    plt.ylabel("Groundtruth")
    MSE = ((predict - truth)**2).mean()
    plt.title(f"MSE = {MSE:.3f}")
    plt.savefig(
        Path(savefigto) /
        f'./solubility_sa_bilstm-{pad_to}-{lstm_hidden}-{da}-{r}-{lr}-{loss}-{MSE:.4f}.png'
    )
    plt.close()
Example #14
def simple_dataset(config):
    batch_size = config["batch_size"]
    x_train, y_train = linear_dataset(size=NUM_TRAIN_SAMPLES)
    x_test, y_test = linear_dataset(size=NUM_TEST_SAMPLES)

    train_dataset = Dataset.from_tensor_slices((x_train, y_train))
    test_dataset = Dataset.from_tensor_slices((x_test, y_test))
    train_dataset = train_dataset.shuffle(NUM_TRAIN_SAMPLES).repeat().batch(
        batch_size)
    test_dataset = test_dataset.repeat().batch(batch_size)

    return train_dataset, test_dataset
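
# A minimal usage sketch (not from the original source): both datasets repeat
# indefinitely, so fit() needs steps_per_epoch / validation_steps.
# NUM_TRAIN_SAMPLES and NUM_TEST_SAMPLES are the module-level constants used
# above; `model` is a placeholder for a compiled Keras model.
config = {"batch_size": 32}
train_dataset, test_dataset = simple_dataset(config)
model.fit(train_dataset,
          epochs=5,
          steps_per_epoch=NUM_TRAIN_SAMPLES // config["batch_size"],
          validation_data=test_dataset,
          validation_steps=NUM_TEST_SAMPLES // config["batch_size"])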
Example #15
def make_generator(src_dir, valid_rate, input_size, batch_size):
    '''Create the Dataset generators.
    The data flows as: directory -> generator -> Dataset.
    Subdirectory names under src_dir automatically become the class names (flow_from_directory).
    Tune the ImageDataGenerator parameters to the target (e.g. road signs do not need vertical flips, etc.).
    '''
    train_datagen = ImageDataGenerator(rescale=1. / 255,
                                       rotation_range=30,
                                       width_shift_range=0.2,
                                       height_shift_range=0.2,
                                       shear_range=30,
                                       zoom_range=[0.7, 1.3],
                                       horizontal_flip=True,
                                       vertical_flip=True,
                                       validation_split=valid_rate)

    # Build the data generators automatically from the directory structure and names
    train_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='training')

    valid_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='validation')

    train_ds = Dataset.from_generator(lambda: train_generator,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=([
                                          None, *train_generator.image_shape
                                      ], [None, train_generator.num_classes]))

    valid_ds = Dataset.from_generator(lambda: valid_generator,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=([
                                          None, *valid_generator.image_shape
                                      ], [None, valid_generator.num_classes]))

    train_ds = train_ds.repeat()
    valid_ds = valid_ds.repeat()

    cls_info = {v: k for k, v in train_generator.class_indices.items()}

    return train_ds, train_generator.n, valid_ds, valid_generator.n, cls_info
Example #16
def _input_fn(directory, config, mode):

    print("Fetching {} data...".format(mode))

    all_features = []
    all_labels = []

    if config["cloud"] == 0:
        all_files = os.listdir(directory)
        for file in all_files:
            features, labels = _load_json_file(os.path.join(directory, file),
                                               config)
            all_features += features
            all_labels += labels
    else:
        s = sagemaker.Session()
        all_files = s.list_s3_files(config["bucket"], directory)
        for file in all_files[1:]:
            features, labels = _load_json_file(
                s.read_s3_file(config["bucket"], file), config)
            all_features += features
            all_labels += labels

    num_data_points = len(all_features)
    num_batches = math.ceil(len(all_features) / config["batch_size"])

    dataset = Dataset.from_tensor_slices((all_features, all_labels))

    if mode == "train":
        # note: batching before shuffling shuffles whole batches, not individual examples
        dataset = dataset.batch(config["batch_size"]).shuffle(
            10000, seed=12345).repeat(config["num_epoch"])

    if mode in ("validation", "eval"):
        dataset = dataset.batch(config["batch_size"]).repeat(
            config["num_epoch"])

    iterator = dataset.make_one_shot_iterator()
    dataset_features, dataset_labels = iterator.get_next()

    return [{
        config["input_tensor_name"]: dataset_features
    }, dataset_labels, {
        "num_data_point": num_data_points,
        "num_batches": num_batches
    }]
Example #17
def make_generator(src_dir, valid_rate, input_size, batch_size):

    # Create the instance
    # --- ImageDataGenerator class
    train_datagen = ImageDataGenerator(rescale=1 / 255,
                                       validation_split=valid_rate)

    # Create the generator
    # --- load the training data
    # --- 250 * (1 - 0.2) = 200
    train_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='training')

    # Create the generator
    # --- load the validation data
    # --- 250 * 0.2 = 50
    valid_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='validation')

    # Wrap as a tf.data.Dataset
    # --- training data generator
    trans_ds = Dataset.from_generator(lambda: train_generator,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=([
                                          None, *train_generator.image_shape
                                      ], [None, train_generator.num_classes]))

    # Wrap as a tf.data.Dataset
    # --- validation data generator
    valid_ds = Dataset.from_generator(lambda: valid_generator,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=([
                                          None, *valid_generator.image_shape
                                      ], [None, valid_generator.num_classes]))

    # Make each Dataset repeat indefinitely
    trans_ds = trans_ds.repeat()
    valid_ds = valid_ds.repeat()

    return trans_ds, train_generator.n, valid_ds, valid_generator.n
Example #18
def generate_tf_data(enc_input: list, dec_input: list, batch_size: int,
                     train_size: int, val_size: int) -> [Dataset]:
    '''Generates a TensorFlow dataset and splits it into training and validation sets.

    Problem: Feeding in three arrays containing almost two million sequences each, requires too much main memory.
    Solution: We use the Tensorflow Dataset, where we can feed the model with slices of the whole dataset.

    Also: shuffles the observations.

    Args:
        enc_input: encoder input ids, token ids for each word and each sentence
        dec_input: used for teacher forcing. Token ids for each word and each sentence in target lang.
            More specific:
                - decoder input, token sequences (index 0 in dec_input)
                - decoder target output, token sequences (for teacher forcing, index 1 in dec_input)
        batch_size: Number of observations passed to the Seq2Seq model during training.
        train_size: Fraction of all observations to be reserved for training the model.
        val_size: Fraction of all observations to be reserved for evaluating the model performance during training.
    Returns:
        train_data: contains encoder_input, decoder_input, decoder_target_output for training the model.
        val_data: contains encoder_input, decoder_input, decoder_target_output for evaluating the model.
    '''

    assert train_size + val_size == 1, "Train and validation sizes don't sum to 1!"

    data_size = enc_input[0].shape[0]

    # Summarize the source language token ids and the decoder input as: model_input
    model_input = Dataset.from_tensor_slices((enc_input[0], dec_input[0]))
    #                                         enc_token_ids dec_token_ids

    # convert decoder_target_output to TF.Dataset
    decoder_target_output = Dataset.from_tensor_slices((dec_input[1]))
    #                                            dec_token_ids used as target output (shifted by one observation)

    # Combine the model_input and the decoder_target_output to a full TF.Dataset, shuffle it
    full_data = Dataset.zip(
        (model_input, decoder_target_output)).shuffle(data_size)

    # Train Val split
    train_size = int(train_size * data_size)
    val_size = int(val_size * data_size)

    train_data = full_data.take(train_size)
    val_data = full_data.skip(train_size)

    train_data = train_data.batch(batch_size, drop_remainder=True)
    val_data = val_data.batch(batch_size, drop_remainder=True)

    return train_data, val_data
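
# A minimal usage sketch (not part of the original example): calling
# generate_tf_data on small dummy token-id arrays. The shapes follow the
# docstring (enc_input / dec_input are lists whose elements are 2-D id
# matrices); the sizes and values are placeholders.
import numpy as np

enc_ids = np.random.randint(1, 100, size=(1000, 20))
dec_in_ids = np.random.randint(1, 100, size=(1000, 22))
dec_out_ids = np.random.randint(1, 100, size=(1000, 22))

train_data, val_data = generate_tf_data(
    enc_input=[enc_ids], dec_input=[dec_in_ids, dec_out_ids],
    batch_size=64, train_size=0.8, val_size=0.2)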
Example #19
def prepare_batch_datasets(x_train, y_train, batch_size):
    logger.info('Preparing train and validation datasets for batches...')
    # Reserve the required samples for validation
    x_val = x_train[-int(len(x_train) * VALIDATION_DATA_SPLIT):]
    y_val = y_train[-int(len(y_train) * VALIDATION_DATA_SPLIT):]
    # Prepare the training dataset with shuffling
    train_dataset = Dataset.from_tensor_slices((x_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)
    # Prepare the validation dataset
    val_dataset = Dataset.from_tensor_slices((x_val, y_val))
    val_dataset = val_dataset.batch(batch_size)
    logger.info(
        'Completed preparing train and validation datasets for batches.')
    return x_val, y_val, train_dataset, val_dataset
Example #20
def augment_ds(ds: Dataset, grayscale: bool) -> Dataset:
    if grayscale:
        ds = ds.map(lambda x, y: (_random_crop_mnist(x), y),
                    num_parallel_calls=AUTOTUNE)
    else:
        ds = ds.map(
            lambda x, y: (_random_hue_saturation_brightness_contrast(x), y),
            num_parallel_calls=AUTOTUNE,
        )
        ds = ds.map(lambda x, y: (_random_crop_cifar(x), y),
                    num_parallel_calls=AUTOTUNE)
    ds = ds.map(lambda x, y: (_random_horizontal_flip(x), y),
                num_parallel_calls=AUTOTUNE)
    return ds
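
# A minimal usage sketch (not from the original source): applying the
# augmentation pipeline to a CIFAR-style dataset of (image, label) pairs. The
# _random_* helpers and AUTOTUNE are assumed to be defined in the same module
# as augment_ds; the tensors below are random placeholders.
import tensorflow as tf

images = tf.random.uniform([256, 32, 32, 3])
labels = tf.random.uniform([256], maxval=10, dtype=tf.int32)
ds = tf.data.Dataset.from_tensor_slices((images, labels))
ds = augment_ds(ds, grayscale=False).batch(64)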
Example #21
def ds_rndm() -> Tuple[Dataset, Dataset, int, int, int]:
    # Hardcoded values taken from MNIST
    num_classes = 10
    m_train = 60000
    m_test = 10000
    # Random noise
    ds_image = Dataset.from_tensor_slices(
        (tf.random_uniform([m_train, 28, 28, 1], maxval=255, dtype=tf.int32)))
    ds_label = Dataset.from_tensor_slices((tf.random_uniform([m_train],
                                                             maxval=9,
                                                             dtype=tf.int64)))
    ds_train = Dataset.zip((ds_image, ds_label))
    ds_test = ds_train.take(m_test)

    return ds_train, ds_test, num_classes, m_train, m_test
Example #22
def h3(file, word_size=3, region_size=0, expand=True):
    sequences, labels = read_fasta(file)
    test_size = 0.15
    val_size = 0.15
    split_options = dict(test_size=test_size,
                         stratify=labels,
                         random_state=3264)
    x_train_val, x_test, y_train_val, y_test = train_test_split(
        sequences, labels, **split_options)
    # normalize val_size and update options
    split_options.update(
        dict(test_size=val_size / (1 - test_size), stratify=y_train_val))
    x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val,
                                                      **split_options)
    del x_train_val, y_train_val

    encode_func = encode(word_size, region_size, expand=expand)
    x_shape = encoded_shape(sequences[0],
                            word_size,
                            region_size,
                            expand=expand)

    train_gen = gen_from_arrays(x_train, y_train, encode_func)
    val_gen = gen_from_arrays(x_val, y_val, encode_func)
    test_gen = gen_from_arrays(x_test, y_test, encode_func)

    # datasets
    batch_size = 32
    prefetch = tf.data.experimental.AUTOTUNE

    output_shapes = (x_shape, ())
    output_types = (tf.float32, tf.float32)

    train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)
    train_ds = train_ds.shuffle(500).batch(batch_size).prefetch(prefetch)

    test_ds = Dataset.from_generator(test_gen, output_types, output_shapes)
    test_ds = test_ds.batch(batch_size).prefetch(prefetch)

    x_val_encode, y_val_encode = [], []
    for x, y in val_gen():
        x_val_encode.append(x)
        y_val_encode.append(y)
    x_val_encode = np.array(x_val_encode)
    y_val_encode = np.array(y_val_encode)
    validation_data = (x_val_encode, y_val_encode)

    return x_shape, train_ds, validation_data, test_ds
Example #23
def simple_test():
    image_path = ['/home/kamerider/Documents/DataBase/1610763/10000.png'] * 5
    label = np.array([1, 2, 3, 4, 5])  # one label per image path (lengths must match for from_tensor_slices)
    data = np.random.uniform(size=(12,3))

    image_path = convert_to_tensor(image_path, dtype=dtypes.string)
    label = convert_to_tensor(label, dtype=dtypes.int32)

    dataset = Dataset.from_tensor_slices((image_path, label))

    
    iterator = dataset.make_one_shot_iterator()
    one_element = iterator.get_next()
    with tf.Session() as sess:
        try:
            while True:
                result = sess.run(one_element)
                #print(result[0])
                image_string = tf.read_file(result[0])
                image_decode = tf.image.decode_png(image_string, channels=3)
                image_resize = tf.image.resize_images(image_decode,[64, 64])
                print (image_resize)
        except tf.errors.OutOfRangeError:
            print("end!")
    
    '''
    with tf.Session() as sess:
        for i in range(3):
            print (sess.run(one_element))
    '''

Example #24
def train(examples, labels,
          features=None, lr=1e-4, steps=100, batch_size=1, model=None):
    '''Create and train a linear regression model.'''
    # Create datasets.
    if not features:
        features = examples.columns
    fcs = [tf.feature_column.numeric_column(feature) for feature in features]

    ds = Ds.from_tensor_slices(
        ({feature: examples[feature] for feature in features}, labels))

    opt = tf.contrib.estimator.clip_gradients_by_norm(
        tf.train.GradientDescentOptimizer(learning_rate=lr),
        5.0)

    if not model:
        model = tf.estimator.LinearRegressor(fcs, optimizer=opt)

    for _ in range(10):
        model.train(
            train_fn(ds, batch_size=batch_size),
            steps=steps//10)
        preds = model.predict(
            lambda: ds.batch(1).make_one_shot_iterator().get_next())
        predictions = np.hstack([pred['predictions'] for pred in preds])
        print("Mean squared error: ", mse(predictions, labels))

    return model
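
# A hedged sketch (an assumption, not the original source) of the train_fn
# helper referenced above: an Estimator-style input_fn factory that shuffles,
# repeats, and batches the dataset in TF1 graph mode.
def train_fn(ds, batch_size=1, shuffle_buffer=10000):
    def input_fn():
        return (ds.shuffle(shuffle_buffer)
                  .repeat()
                  .batch(batch_size)
                  .make_one_shot_iterator()
                  .get_next())
    return input_fn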
Example #25
    def load_data(self):
        data = GFile(self.file_path, 'rb').read().decode(encoding='UTF-8')

        # Get a list of the unique characters in the text
        vocab = list(sorted(set(data)))
        vocab_size = len(vocab)

        chars_to_ids = StringLookup(vocabulary=vocab)
        self.ids_to_chars_layer = StringLookup(
            vocabulary=chars_to_ids.get_vocabulary(), invert=True)

        # Split the entire text by character
        chars = unicode_split(data, 'UTF-8')
        ids_of_chars = chars_to_ids(chars)

        # Group characters to form sequences (+1 since the targets are shifted by one)
        sequences_ds = Dataset.from_tensor_slices(ids_of_chars)
        sequences_ds = sequences_ds.batch(C.SEQUENCE_LENGTH + 1)

        # Batch the sequences
        ds = sequences_ds.padded_batch(C.BATCH_SIZE)
        ds = ds.map(self._to_inputs_and_targets,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
        ds = ds.shuffle(C.BUFFER_SIZE)
        ds = ds.prefetch(tf.data.experimental.AUTOTUNE)

        return ds
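
    # A hedged sketch (an assumption, not the original source) of the
    # _to_inputs_and_targets method referenced above: each padded batch of
    # SEQUENCE_LENGTH + 1 ids is split into inputs (all but the last id) and
    # targets (all but the first id), i.e. targets are the inputs shifted by one.
    def _to_inputs_and_targets(self, sequence_batch):
        inputs = sequence_batch[:, :-1]
        targets = sequence_batch[:, 1:]
        return inputs, targets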
Example #26
def load_validation_data(data_folder, batch_size):
    x_validation = np.load(data_folder + '/validation/images.npy')
    x_validation = np.add(x_validation, -127.5, dtype=np.float32) / 127.5
    y1_validation = np.load(data_folder + '/validation/class_labels.npy')
    y2_validation = np.load(data_folder + '/validation/bounding_box_labels.npy')
    y3_validation = np.load(data_folder + '/validation/landmark_labels.npy')
    return Dataset.from_tensor_slices((x_validation, (y1_validation, y2_validation, y3_validation))).batch(batch_size, drop_remainder=True)
Example #27
def eval_dataset(params: HParams, iterator: ner_data.Generator):
    """ test function for tf estimator """
    data = Dataset.from_generator(iterator.generator(), iterator.datatypes(),
                                  iterator.datashape())

    data = data.padded_batch(params.batch_size, iterator.datashape())
    return data
Example #28
    def __init__(self,
                 txt_file,
                 mode,
                 batch_size,
                 img_size=227,
                 buffer_size=1000):

        self.txt_file = txt_file

        # retrieve the data from the text file
        self._read_txt_file()

        self.img_size = img_size

        # number of samples in the dataset
        self.data_size = len(self.RSAs)
        self.batch_size = batch_size

        # convert lists to TF tensor
        self.img1_paths = convert_to_tensor(self.img1_paths,
                                            dtype=dtypes.string)
        self.img2_paths = convert_to_tensor(self.img2_paths,
                                            dtype=dtypes.string)
        self.RSAs = convert_to_tensor(self.RSAs, dtype=dtypes.float32)

        # create dataset
        data = Dataset.from_tensor_slices(
            (self.img1_paths, self.img2_paths, self.RSAs))

        data = data.map(self._parse_function_train)

        data = data.batch(batch_size)

        self.data = data
Example #29
def load_and_format_images_for_fitting(folder, batch_size=32):
    all_images, all_image_labels = get_all_images(folder)

    ds = Dataset.from_tensor_slices((all_images, all_image_labels))
    ds = ds.shuffle(buffer_size=len(all_images))
    ds = ds.batch(batch_size)
    return ds
Example #30
def validate(model, examples, labels, features=None):
    '''Check the mse on the validation set.'''
    if not features:
        features = examples.columns

    ds = Ds.from_tensor_slices(
        ({feature: examples[feature]
          for feature in features}, labels))
    predictions = get_predictions(model, ds)
    plt.figure()
    plt.subplot(1, 2, 1)
    plt.scatter(examples['longitude'],
                examples['latitude'],
                cmap='coolwarm',
                c=labels.iloc[:, 0])
    plt.subplot(1, 2, 2)
    plt.scatter(examples['longitude'],
                examples['latitude'],
                cmap='coolwarm',
                c=predictions)
    if "classifier" in str(type(model)).casefold():
        print("Validation log loss:", log_loss(labels, predictions))
    else:
        print("Validation mse:", mse(predictions, labels))
    return predictions
Example #31
    def train(self, checkpoints_path: Path, train_dataset: Dataset, valid_dataset: Dataset = None,
              batch_size: int = 256, epochs: int = 16):
        checkpoints_path.mkdir(parents=True, exist_ok=True)
        ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=self.optimizer, net=self.network)
        manager = tf.train.CheckpointManager(ckpt, checkpoints_path, max_to_keep=3)
        ckpt.restore(manager.latest_checkpoint)
        if manager.latest_checkpoint:
            print(f"Restored from {manager.latest_checkpoint}")
        else:
            print("Initializing from scratch.")

        # Batch the datasets
        train_dataset = train_dataset.shuffle(1024).batch(batch_size).prefetch(
            buffer_size=tf.data.experimental.AUTOTUNE)
        valid_dataset = valid_dataset.batch(batch_size)

        # Start training the model.
        for epoch in range(1, epochs + 1):
            for images, labels in train_dataset:
                self._train_step(images, labels)

            for valid_images, valid_labels in valid_dataset:
                self._test_step(valid_images, valid_labels)

            ckpt.step.assign_add(1)
            if int(ckpt.step) % 10 == 0:
                save_path = manager.save()
                print(f"💾 Saved checkpoint for step {int(ckpt.step)}: {save_path}")

            print(f"Epoch {epoch}, "
                  f"Loss: {self.train_loss.result()}, Accuracy: {self.train_accuracy.result() * 100}, "
                  f"Valid Loss: {self.test_loss.result()}, Valid Accuracy: {self.test_accuracy.result() * 100}")

        # Save the model.
        self.network.trainable = False
        self.network.save(self.save_path)