Exemplo n.º 1
0
    def next_batch(self):
        """Return the next mini-batch as a dict of feature arrays.

        When the previous call consumed the last batch, optionally
        reshuffle the dataset and rewind the cursor before slicing.
        """
        # Epoch boundary: reshuffle (if enabled) and start over.
        if self.last_batch():
            if self.should_shuffle:
                self.dataset = shuffle_dict_unison_inplace(self.dataset)
            self.reset()

        # The slice window is the same for every feature, so build it once.
        window = range(
            self.index,
            min(self.index + self.batch_size, self.total_size)
        )
        batch = {
            name: self.dataset.get(name, window)
            for name in self.dataset.features
        }

        self.index += self.batch_size
        return batch
Exemplo n.º 2
0
def load_data(
        hdf5_file_path,
        input_features,
        output_features,
        split_data=True,
        shuffle_training=False
):
    """Load a dataset from an HDF5 file.

    :param hdf5_file_path: path to the HDF5 file produced by preprocessing.
    :param input_features: list of input feature dicts (each with at least
        'name' and 'type'); TEXT features are stored under their
        text_feature_data_field key.
    :param output_features: list of output feature dicts; an optional
        'limit' entry collapses rare labels to that limit.
    :param split_data: when True, split into train/test/validation using
        the 'split' array stored in the file; otherwise return one dict.
    :param shuffle_training: when True, shuffle the training set features
        in unison after splitting.
    :return: a single dataset dict when split_data is False, otherwise a
        (training_set, test_set, validation_set) tuple.
    """
    logger.info('Loading data from: {0}'.format(hdf5_file_path))
    # Context manager guarantees the file is closed even if a read raises
    # (the original leaked the handle on error).
    with h5py.File(hdf5_file_path, 'r') as hdf5_data:
        # NOTE(review): Dataset.value is deprecated and removed in h5py 3.x;
        # consider migrating to `[()]` once the h5py version is pinned.
        dataset = {}
        for input_feature in input_features:
            if input_feature['type'] == TEXT:
                text_data_field = text_feature_data_field(input_feature)
                dataset[text_data_field] = hdf5_data[text_data_field].value
            else:
                dataset[input_feature['name']] = hdf5_data[
                    input_feature['name']
                ].value
        for output_feature in output_features:
            # TEXT outputs live under their text data field, not the raw name.
            if output_feature['type'] == TEXT:
                data_field = text_feature_data_field(output_feature)
            else:
                data_field = output_feature['name']
            dataset[data_field] = hdf5_data[data_field].value
            if 'limit' in output_feature:
                # Bug fix: index by the key the data was actually stored
                # under. The original used output_feature['name'] here,
                # which raises KeyError for TEXT features with a 'limit'.
                dataset[data_field] = collapse_rare_labels(
                    dataset[data_field],
                    output_feature['limit']
                )

        if not split_data:
            return dataset

        split = hdf5_data['split'].value

    training_set, test_set, validation_set = split_dataset_tvt(dataset, split)

    # Shuffle all training features in unison so rows stay aligned.
    if shuffle_training:
        training_set = data_utils.shuffle_dict_unison_inplace(training_set)

    return training_set, test_set, validation_set
Exemplo n.º 3
0
    def next_batch(self):
        """Return the next mini-batch; reshuffles deterministically per epoch.

        On an epoch boundary the dataset is reshuffled with a RandomState
        seeded by the current epoch, so shuffle order is reproducible
        across runs, then the cursor is rewound and the epoch advanced.
        """
        if self.last_batch():
            if self.should_shuffle:
                # Epoch-seeded RNG makes the shuffle order reproducible.
                rng = np.random.RandomState(self.epoch)
                self.dataset = shuffle_dict_unison_inplace(self.dataset, rng)
            self.reset()
            self.epoch += 1

        # One shared slice window for every feature in the dataset.
        window = range(
            self.index,
            min(self.index + self.batch_size, self.max_index)
        )
        batch = {
            name: self.dataset.get(name, window)
            for name in self.dataset.features
        }

        self.index += self.batch_size
        self.step += 1
        return batch