Example #1
    def _category_lookup(self, params: dict):
        key, input_layer = self._get_input_layer(params)
        num_oov_buckets = params.get('num_oov_buckets', 0)
        if input_layer.dtype == 'string':
            if 'vocabulary_file' in params:
                return StringLookup(max_tokens=params['vocabulary_size'],
                                    num_oov_indices=num_oov_buckets,
                                    mask_token=None,
                                    vocabulary=params['vocabulary_file'])(input_layer)
            elif 'vocabulary_list' in params:
                return StringLookup(max_tokens=len(params['vocabulary_list']) + num_oov_buckets,
                                    num_oov_indices=num_oov_buckets,
                                    mask_token=None,
                                    vocabulary=params['vocabulary_list'])(input_layer)
        else:
            if 'vocabulary_file' in params:
                return IntegerLookup(max_values=params['vocabulary_size'] + num_oov_buckets,
                                     num_oov_indices=num_oov_buckets,
                                     mask_value=None,
                                     vocabulary=params['vocabulary_file'])(input_layer)
            elif 'vocabulary_list' in params:
                return IntegerLookup(max_values=len(params['vocabulary_list']) + num_oov_buckets,
                                     num_oov_indices=num_oov_buckets,
                                     mask_value=None,
                                     vocabulary=params['vocabulary_list'])(input_layer)
Example #2
    def __init__(self, model_weight='mn_model_weight.h5', scale_ratio=1):
        self.scale_ratio = scale_ratio
        self.characters = sorted([
            *set("".join(
                sum(ArtsInfo.ArtNames, []) + ArtsInfo.TypeNames +
                list(ArtsInfo.MainAttrNames.values()) +
                list(ArtsInfo.SubAttrNames.values()) + list(".,+%0123456789")))
        ])
        # Mapping characters to integers
        self.char_to_num = StringLookup(vocabulary=list(self.characters),
                                        num_oov_indices=0,
                                        mask_token="")

        # Mapping integers back to original characters
        self.num_to_char = StringLookup(
            vocabulary=self.char_to_num.get_vocabulary(),
            oov_token="",
            mask_token="",
            invert=True)

        self.width = 240
        self.height = 16
        self.max_length = 15
        self.build_model(input_shape=(self.width, self.height))
        self.model.load_weights(model_weight)
Example #3
    def load_data(self):
        data = GFile(self.file_path, 'rb').read().decode(encoding='UTF-8')

        # Get a list of the unique characters in the text
        vocab = list(sorted(set(data)))
        vocab_size = len(vocab)

        chars_to_ids = StringLookup(vocabulary=vocab)
        self.ids_to_chars_layer = StringLookup(
            vocabulary=chars_to_ids.get_vocabulary(), invert=True)

        # Split the entire text by character
        chars = unicode_split(data, 'UTF-8')
        ids_of_chars = chars_to_ids(chars)

        # Group characters to form sequences (+1 since the targets are shifted by one)
        sequences_ds = Dataset.from_tensor_slices(ids_of_chars)
        sequences_ds = sequences_ds.batch(C.SEQUENCE_LENGTH + 1)

        # Batch the sequences
        ds = sequences_ds.padded_batch(C.BATCH_SIZE)
        ds = ds.map(self._to_inputs_and_targets,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
        ds = ds.shuffle(C.BUFFER_SIZE)
        ds = ds.prefetch(tf.data.experimental.AUTOTUNE)

        return ds
Example #4
def encode_inputs(inputs, encoding_size):
    encoded_features = []
    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURES_WITH_VOCABULARY:
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            # Create a lookup to convert string values to integer indices.
            # Since we are not using a mask token nor expecting any out-of-vocabulary
            # (OOV) tokens, we set mask_token to None and num_oov_indices to 0.
            index = StringLookup(vocabulary=vocabulary,
                                 mask_token=None,
                                 num_oov_indices=0)
            # Convert the string input values into integer indices.
            value_index = index(inputs[feature_name])
            # Create an embedding layer with the specified dimensions.
            embedding_encoder = layers.Embedding(input_dim=len(vocabulary),
                                                 output_dim=encoding_size)
            # Convert the index values to embedding representations.
            encoded_feature = embedding_encoder(value_index)
        else:
            # Project the numeric feature to encoding_size using linear transformation.
            encoded_feature = tf.expand_dims(inputs[feature_name], -1)
            encoded_feature = layers.Dense(
                units=encoding_size)(encoded_feature)
        encoded_features.append(encoded_feature)
    return encoded_features
Example #5
def encode_inputs(inputs, use_embedding=False):
    encoded_features = []
    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            # Create a lookup to convert string values to integer indices.
            # Since we are not using a mask token nor expecting any out-of-vocabulary
            # (OOV) tokens, we set mask_token to None and num_oov_indices to 0.
            index = StringLookup(vocabulary=vocabulary,
                                 mask_token=None,
                                 num_oov_indices=0)
            # Convert the string input values into integer indices.
            value_index = index(inputs[feature_name])
            if use_embedding:
                embedding_dims = int(math.sqrt(len(vocabulary)))
                # Create an embedding layer with the specified dimensions.
                embedding_encoder = layers.Embedding(input_dim=len(vocabulary),
                                                     output_dim=embedding_dims)
                # Convert the index values to embedding representations.
                encoded_feature = embedding_encoder(value_index)
            else:
                # Create a one-hot encoder.
                onehot_encoder = CategoryEncoding(output_mode="binary")
                onehot_encoder.adapt(index(vocabulary))
                # Convert the index values to a one-hot representation.
                encoded_feature = onehot_encoder(value_index)
        else:
            # Use the numerical features as-is.
            encoded_feature = tf.expand_dims(inputs[feature_name], -1)

        encoded_features.append(encoded_feature)

    all_features = layers.concatenate(encoded_features)
    return all_features
Example #6
def encode_string_categorical_feature(feature, name, dataset):
    # Create a StringLookup layer which will turn strings into integer indices
    index = StringLookup()

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the set of possible string values and assign them a fixed integer index
    index.adapt(feature_ds)

    # Turn the string input into integer indices
    encoded_feature = index(feature)

    # Create a CategoryEncoding for our integer indices
    encoder = CategoryEncoding(output_mode="binary")

    # Prepare a dataset of indices
    feature_ds = feature_ds.map(index)

    # Learn the space of possible indices
    encoder.adapt(feature_ds)

    # Apply one-hot encoding to our indices
    encoded_feature = encoder(encoded_feature)
    return encoded_feature
Example #7
    def build(self, input_shape=None):
        self.squeeze = False
        if 2 == len(input_shape):
            if 1 != input_shape[-1]:
                raise ValueError(
                    'Input 0 of layer {} is incompatible with the layer: if ndim=2 expected axis[-1]=1, found '
                    'axis[-1]={}. Full shape received: {}'.format(self.name, input_shape[-1], input_shape))

            self.squeeze = True
            input_shape = input_shape[:1]

        self.lookup = StringLookup(vocabulary=self._vocabulary, mask_token=None, oov_token=self.UNK_MARK)
        self.lookup.build(input_shape)

        if 'adapt' == self.embed_type:
            self.embed = AdaptiveEmbedding(
                self.adapt_cutoff, self.lookup.vocabulary_size(), self.output_dim, factor=self.adapt_factor,
                embeddings_initializer=self.embeddings_initializer)
        else:
            self.embed = layers.Embedding(
                self.lookup.vocabulary_size(), self.output_dim, embeddings_initializer=self.embeddings_initializer)
            if 'dense_auto' == self.embed_type:
                self.embed.build(input_shape)
            else:  # 'dense_cpu' == self.embed_type
                with tf.device('cpu:0'):
                    self.embed.build(input_shape)

        super().build(input_shape)
Example #8
    def __init__(self, log_dir):
        self.log_dir = log_dir
        self.START_TOKEN = '[SOS]'
        self.END_TOKEN = '[EOS]'
        self.vocab = list(sorted(set(string.printable))) + [self.START_TOKEN, self.END_TOKEN]
        self.chars_to_ids = StringLookup(vocabulary=self.vocab)
        self.vocab_size = self.chars_to_ids.vocab_size()
Example #9
def encode_inputs(inputs):
    encoded_features = []
    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            # Create a lookup to convert string values to integer indices.
            # Since we are not using a mask token nor expecting any out-of-vocabulary
            # (OOV) tokens, we set mask_token to None and num_oov_indices to 0.
            lookup = StringLookup(vocabulary=vocabulary,
                                  mask_token=None,
                                  num_oov_indices=0)
            # Convert the string input values into integer indices.
            value_index = lookup(inputs[feature_name])
            embedding_dims = int(math.sqrt(lookup.vocabulary_size()))
            # Create an embedding layer with the specified dimensions.
            embedding = layers.Embedding(input_dim=lookup.vocabulary_size(),
                                         output_dim=embedding_dims)
            # Convert the index values to embedding representations.
            encoded_feature = embedding(value_index)
        else:
            # Use the numerical features as-is.
            encoded_feature = inputs[feature_name]
            if inputs[feature_name].shape[-1] is None:
                encoded_feature = tf.expand_dims(encoded_feature, -1)

        encoded_features.append(encoded_feature)

    encoded_features = layers.concatenate(encoded_features)
    return encoded_features
Example #10
    def __init__(self, vocabulary, embedding_dim, num_buckets, name=None):
        super(QREmbedding, self).__init__(name=name)
        self.num_buckets = num_buckets

        self.index_lookup = StringLookup(
            vocabulary=vocabulary, mask_token=None, num_oov_indices=0
        )
        self.q_embeddings = layers.Embedding(num_buckets, embedding_dim,)
        self.r_embeddings = layers.Embedding(num_buckets, embedding_dim,)
Example #11
def load_data():
    data = pd.read_csv(
        "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
    from sklearn.model_selection import train_test_split
    labels = data.pop('survived')
    label_names = ["Not survived", "Survived"]
    features = {}

    # Create a symbolic Keras Input for each column, with the appropriate dtype

    for name, column in data.items():
        dtype = column.dtype
        if dtype == object:
            dtype = string
        else:
            dtype = float32
        features[name] = Input(shape=(1, ), name=name, dtype=dtype)

    # Extracting and normalizing numeric features
    numeric_features = {
        name: feature
        for name, feature in features.items() if feature.dtype == float32
    }

    x = Concatenate()(list(numeric_features.values()))
    norm = Normalization()
    norm.adapt(np.array(data[numeric_features.keys()]))
    numeric_features = norm(x)

    processed_features = [numeric_features]
    # One-hot encoding the non-numeric (string) features

    for name, feature in features.items():
        if feature.dtype == float32:
            continue
        word = StringLookup(vocabulary=np.unique(data[name]))
        one_hot = CategoryEncoding(max_tokens=word.vocab_size())

        x = word(feature)
        x = one_hot(x)
        processed_features.append(x)

    processed_features = Concatenate()(processed_features)
    processed_features = Model(features, processed_features)

    utils.plot_model(model=processed_features,
                     rankdir='LR',
                     dpi=72,
                     show_shapes=True)

    feature_dict = {name: np.array(value) for name, value in data.items()}

    train_features, test_features, train_labels, test_labels = train_test_split(
        processed_features(feature_dict).numpy(), labels, test_size=0.2)
    return train_features, train_labels, test_features, test_labels
Example #12
def embedding_encoder(vocabulary, embedding_dim, num_oov_indices=0, name=None):
    return keras.Sequential(
        [
            StringLookup(
                vocabulary=vocabulary, mask_token=None, num_oov_indices=num_oov_indices
            ),
            layers.Embedding(
                input_dim=len(vocabulary) + num_oov_indices, output_dim=embedding_dim
            ),
        ],
        name=f"{name}_embedding" if name else None,
    )
Example #13
    def get_svg_ds(self):
        data = GFile('datasets/svgs/simpleline.svg',
                     'rb').read().decode(encoding='UTF-8')

        # Get the list of the unique characters in the text
        vocab = ['e', 'g', 'n', 'r', '\n']
        vocab_size = len(vocab)

        # Build the id to char lookup table
        chars_to_ids = StringLookup(vocabulary=vocab)
        self.ids_to_chars_layer = StringLookup(
            vocabulary=chars_to_ids.get_vocabulary(), invert=True)

        # Split the entire text by character
        chars = unicode_split(data, 'UTF-8')
        ids_of_chars = chars_to_ids(chars)

        # Group characters to form sequences
        svg_ds = Dataset.from_tensor_slices(ids_of_chars)
        svg_ds = svg_ds.batch(C.SEQUENCE_LENGTH)
        svg_ds = svg_ds.batch(C.BATCH_SIZE)

        return svg_ds
Example #14
def encode_string_categorical_feature(feature, name, dataset):
    index = StringLookup()

    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    index.adapt(feature_ds)

    encoded_feature = index(feature)
    encoder = CategoryEncoding(output_mode="binary")
    feature_ds = feature_ds.map(index)
    encoder.adapt(feature_ds)
    encoded_feature = encoder(encoded_feature)
    return encoded_feature
Example #15
    def __init__(self, emb_name, vocab):
        super(CustomEmbed, self).__init__()

        self.vocab = vocab
        self.vocab_size = len(vocab)
        self.output_dim = int(math.sqrt(self.vocab_size))

        self.custom_embed = layers.Embedding(input_dim=self.vocab_size,
                                             output_dim=self.output_dim,
                                             name=f"{emb_name}_embedding")
        self.stringLookUp = StringLookup(vocabulary=self.vocab,
                                         mask_token=None,
                                         num_oov_indices=0)
        print(emb_name, self.output_dim)
Example #16
    def build(self, input_shape):
        if self.options & WordShape.SHAPE_CHAR_CAT_FIRST or self.options & WordShape.SHAPE_CHAR_CAT_LAST:
            category_vocab = [
                'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Me', 'Mc', 'Nd', 'Nl',
                'No', 'Zs', 'Zl', 'Zp', 'Cc', 'Cf', 'Co', 'Cs', 'Pd', 'Ps',
                'Pe', 'Pc', 'Po', 'Sm', 'Sc', 'Sk', 'So', 'Pi', 'Pf'
            ]
            self.cat_lookup = StringLookup(num_oov_indices=0,
                                           oov_token='Cn',
                                           vocabulary=category_vocab)
            if self.cat_lookup.vocab_size() != 30:
                raise ValueError('Wrong vocabulary size')

        super(WordShape, self).build(input_shape)
Example #17
def character_encoder(vocab):
    """Character encoder

    Parameters:
        vocab: list, characters to be encoded.

    Returns:
        Character encoder (keras.preprocessing.StringLookup).
    """
    char_to_num = StringLookup(mask_token=None,
                               num_oov_indices=0,
                               vocabulary=list(vocab),
                               invert=False)

    return char_to_num
Example #18
def character_decoder(encoder):
    """Character decoder

    Parameters:
        encoder: keras.preprocessing.StringLookup, character encoder.

    Returns:
        Character decoder (keras.preprocessing.StringLookup).
    """
    num_to_char = StringLookup(mask_token=None,
                               num_oov_indices=1,
                               vocabulary=encoder.get_vocabulary(),
                               invert=True)

    return num_to_char
Example #19
    def _encode_categorical_feature(
        feature: KerasTensor,
        name: str,
        dataset: Optional[BatchDataset],
    ) -> KerasTensor:
        """One-hot encode categorical features.

        Args:
            - feature: The input layer of the feature.
            - name: The feature's name (its column name in the original dataframe).
            - dataset: The training data, if not specified, return a no-op layer.

        Returns:
            The one-hot encoded tensor of the input feature.

        """
        # Return generic layer for the tuner initialization
        if not dataset:
            return KerasTensor(type_spec=TensorSpec(
                shape=(None, 1), dtype=tf.float32, name=None))

        # Create a StringLookup layer which will turn strings into integer indices
        index = StringLookup()

        # Prepare a Dataset that only yields our feature
        feature_ds = dataset.map(lambda x, y: x[name])
        feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

        # Learn the set of possible string values and assign them a fixed integer index
        index.adapt(feature_ds)

        # Turn the string input into integer indices
        encoded_feature = index(feature)

        # Create a CategoryEncoding for our integer indices
        encoder = CategoryEncoding(output_mode="binary")

        # Learn the space of possible indices
        encoder.adapt(np.arange(index.vocab_size()))

        # Apply one-hot encoding to our indices
        encoded_feature = encoder(encoded_feature)

        return encoded_feature
Example #20
    def __init__(self):
        super().__init__(StringLookup())
Example #21
if __name__ == "__main__":
    if len(sys.argv) < 4:
        print("usage: python predict.py [input_path] [model_path] [output_path]")
        sys.exit()
    
    input_path = sys.argv[1]
    model_path = sys.argv[2]
    output_path = sys.argv[3]
    data_loader = DataLoader(input_path, training_ratio=0.7)

    raw_train_ds, raw_val_ds = data_loader.load()

    # Why 'N'? For one-hot encoding purposes, the last character maps to [0, 0, 0, 0]
    VOCAB = ["A", "G", "T", "N"]
    string_lookup = StringLookup(vocabulary=VOCAB)

    AUTOTUNE = tf.data.experimental.AUTOTUNE
    BATCH_SIZE = 256
    SHUFFLE_SIZE = 1000

    encoded_train_ds = raw_train_ds.cache().shuffle(SHUFFLE_SIZE)
    encoded_train_ds = encoded_train_ds.prefetch(buffer_size=AUTOTUNE)
    encoded_train_ds = encoded_train_ds.map(preprocess)
    encoded_val_ds = raw_val_ds.cache().map(preprocess)

    train_ds = encoded_train_ds.cache().batch(BATCH_SIZE)
    train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)
    val_ds = encoded_val_ds.cache().batch(BATCH_SIZE)

    model = TwoTowerModel(RNA_length=33, gRNA_length=23)
Example #22
def encode_input_features(inputs,
                          sequence_length,
                          USER_FEATURES,
                          CATEGORICAL_FEATURES_WITH_VOCABULARY,
                          movies,
                          genres,
                          include_user_id=True,
                          include_user_features=True,
                          include_movie_features=True):
    encoded_transformer_features = []
    encoded_other_features = []

    other_feature_names = []
    if include_user_id:
        other_feature_names.append("user_id")
    if include_user_features:
        other_feature_names.extend(USER_FEATURES)

    # Encode user features.
    for feature_name in other_feature_names:
        # Convert the string input values into integer indices.
        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        idx = StringLookup(vocabulary=vocabulary,
                           mask_token=None,
                           num_oov_indices=0)(inputs[feature_name])

        # Compute embedding dimensions.
        embedding_dims = int(math.sqrt(len(vocabulary)))

        # Create an embedding layer with the specified dimensions.
        embedding_encoder = layers.Embedding(
            input_dim=len(vocabulary),
            output_dim=embedding_dims,
            name=f"{feature_name}_embedding",
        )

        # Convert the index values to embedding representations.
        encoded_other_features.append(embedding_encoder(idx))

    # Create a single embedding vector for the user features.
    if len(encoded_other_features) > 1:
        encoded_other_features = layers.concatenate(encoded_other_features)
    elif len(encoded_other_features) == 1:
        encoded_other_features = encoded_other_features[0]
    else:
        encoded_other_features = None

    # Create a movie embedding encoder.
    movie_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY["movie_id"]
    movie_embedding_dims = int(math.sqrt(len(movie_vocabulary)))

    # Create a lookup to convert string values to integer indices.
    movie_index_lookup = StringLookup(
        vocabulary=movie_vocabulary,
        mask_token=None,
        num_oov_indices=0,
        name="movie_index_lookup",
    )

    # Create an embedding layer with the specified dimensions.
    movie_embedding_encoder = layers.Embedding(
        input_dim=len(movie_vocabulary),
        output_dim=movie_embedding_dims,
        name=f"movie_embedding",
    )

    # Create a vector lookup for movie genres.
    genre_vectors = movies[genres].to_numpy()
    movie_genres_lookup = layers.Embedding(
        input_dim=genre_vectors.shape[0],
        output_dim=genre_vectors.shape[1],
        embeddings_initializer=tf.keras.initializers.Constant(genre_vectors),
        trainable=False,
        name="genres_vector")

    # Create a processing layer for genres.
    movie_embedding_processor = layers.Dense(
        units=movie_embedding_dims,
        activation="relu",
        name="process_movie_embedding_with_genres",
    )

    # Define a function to encode a given movie id.
    def encode_movie(movie_id):
        # Convert the string input values into integer indices.
        movie_idx = movie_index_lookup(movie_id)
        movie_embedding = movie_embedding_encoder(movie_idx)
        encoded_movie = movie_embedding
        if include_movie_features:
            movie_genres_vector = movie_genres_lookup(movie_idx)
            encoded_movie = movie_embedding_processor(
                layers.concatenate([movie_embedding, movie_genres_vector]))
        return encoded_movie

    # Encoded target_movie_id.
    target_movie_id = inputs["target_movie_id"]
    encoded_target_movie = encode_movie(target_movie_id)

    # Encoding sequence movie_ids.
    sequence_movie_ids = inputs["sequence_movie_ids"]
    encoded_sequence_movies = encode_movie(sequence_movie_ids)

    # Create positional embedding.
    positional_embedding_encoder = layers.Embedding(
        input_dim=sequence_length,
        output_dim=movie_embedding_dims,
        name="positional_embedding",
    )
    positions = tf.range(start=0, limit=sequence_length - 1, delta=1)
    encoded_positions = positional_embedding_encoder(positions)

    # Retrieve sequence ratings to incorporate them into the encoding
    # of the movie.
    sequence_ratings = tf.expand_dims(inputs["sequence_ratings"], -1)

    # Add the positional encoding to the movie encodings and multiply
    # them by rating.
    encoded_sequence_movies_with_position_and_rating = layers.Multiply()([
        (encoded_sequence_movies + encoded_positions), sequence_ratings
    ])

    # Construct the transformer inputs.
    for encoded_movie in tf.unstack(
            encoded_sequence_movies_with_position_and_rating, axis=1):
        encoded_transformer_features.append(tf.expand_dims(encoded_movie, 1))
    encoded_transformer_features.append(encoded_target_movie)

    encoded_transformer_features = layers.concatenate(
        encoded_transformer_features, axis=1)

    return encoded_transformer_features, encoded_other_features
Example #23
TARGET_FEATURE_NAME = "income_bracket"
# A list of the labels of the target features.
TARGET_LABELS = [" <=50K", " >50K"]
"""
## Create `tf.data.Dataset` objects for training and validation

We create an input function to read and parse the file, and convert features and labels
into a [`tf.data.Dataset`](https://www.tensorflow.org/guide/datasets)
for training and validation. We also preprocess the input by mapping the target label
to an index.
"""

from tensorflow.keras.layers.experimental.preprocessing import StringLookup

target_label_lookup = StringLookup(vocabulary=TARGET_LABELS,
                                   mask_token=None,
                                   num_oov_indices=0)


def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    ).map(lambda features, target: (features, target_label_lookup(target)))
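"""
A brief usage sketch (an assumption, not part of the original snippet): the file names
and batch size below are placeholders, and `CSV_HEADER`, `COLUMN_DEFAULTS` and
`TARGET_FEATURE_NAME` must already be defined as in the dataset-metadata step.
"""

train_dataset = get_dataset_from_csv("train_data.csv", shuffle=True, batch_size=128)
valid_dataset = get_dataset_from_csv("test_data.csv", batch_size=128)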
"""
### Building the character vocabulary

Keras provides different preprocessing layers to deal with different modalities of data.
[This guide](https://keras.io/guides/preprocessing_layers/) provides a comprehensive introduction.
Our example involves preprocessing labels at the character
level. This means that if there are two labels, e.g. "cat" and "dog", then our character
vocabulary should be {a, c, d, g, o, t} (without any special tokens). We use the
[`StringLookup`](https://keras.io/api/layers/preprocessing_layers/categorical/string_lookup/)
layer for this purpose.
"""

AUTOTUNE = tf.data.AUTOTUNE

# Mapping characters to integers.
char_to_num = StringLookup(vocabulary=list(characters), mask_token=None)

# Mapping integers back to original characters.
num_to_char = StringLookup(vocabulary=char_to_num.get_vocabulary(),
                           mask_token=None,
                           invert=True)
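"""
A minimal round-trip sketch (hypothetical, assuming `characters` was built from the label
strings as described above, e.g. `characters = set(char for label in labels for char in label)`
for a hypothetical `labels` list): the two lookups invert each other.
"""

example_ids = char_to_num(tf.strings.unicode_split("cat", input_encoding="UTF-8"))
recovered_label = tf.strings.reduce_join(num_to_char(example_ids))  # -> b"cat"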
"""
### Resizing images without distortion

Instead of square images, many OCR models work with rectangular images. This will become
clearer in a moment when we visualize a few samples from the dataset. While
aspect-unaware resizing of square images does not introduce a significant amount of
distortion, this is not the case for rectangular images. But resizing images to a uniform
size is a requirement for mini-batching. So we need to perform our resizing such that
the following criteria are met:
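"""
A hedged sketch of one way to do this (the function name and white padding value are
assumptions, not the example's own code): fit the image inside the target box while
preserving the aspect ratio, then pad the remainder up to the target size.
"""


def resize_without_distortion(image, img_width, img_height):
    # Resize so the image fits inside (img_height, img_width) without changing its aspect ratio.
    image = tf.image.resize(image, size=(img_height, img_width), preserve_aspect_ratio=True)
    # Pad the bottom and right with white pixels up to the target size.
    pad_height = img_height - tf.shape(image)[0]
    pad_width = img_width - tf.shape(image)[1]
    image = tf.pad(image,
                   paddings=[[0, pad_height], [0, pad_width], [0, 0]],
                   constant_values=255)
    return image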
Example #25
def main():
    # Prepare the data.
    CSV_HEADER = [
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education_num",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "gender",
        "capital_gain",
        "capital_loss",
        "hours_per_week",
        "native_country",
        "income_bracket",
    ]

    train_data_url = (
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    )
    train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER)

    test_data_url = (
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
    )
    test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)

    print(f"Train dataset shape: {train_data.shape}")
    print(f"Test dataset shape: {test_data.shape}")

    # Remove the first record (because it is not a valid example) and a
    # trailing "dot" in the class labels.
    test_data = test_data[1:]
    test_data.income_bracket = test_data.income_bracket.apply(
        lambda value: value.replace(".", ""))

    # Store the training and test data splits locally as CSV files.
    train_data_file = "train_data.csv"
    test_data_file = "test_data.csv"

    train_data.to_csv(train_data_file, index=False, header=False)
    test_data.to_csv(test_data_file, index=False, header=False)

    # Define dataset metadata.
    # Here, define the metadata of the dataset that will be useful for
    # reading, parsing, and encoding the input features.
    # A list of numerical feature names.
    NUMERICAL_FEATURE_NAMES = [
        "age",
        "education_num",
        "capital_gain",
        "capital_loss",
        "hours_per_week",
    ]

    # A dictionary of the categorical features and their vocabulary.
    CATEGORICAL_FEATURES_WITH_VOCABULARY = {
        "workclass": sorted(list(train_data["workclass"].unique())),
        "education": sorted(list(train_data["education"].unique())),
        "marital_status": sorted(list(train_data["marital_status"].unique())),
        "occupation": sorted(list(train_data["occupation"].unique())),
        "relationship": sorted(list(train_data["relationship"].unique())),
        "race": sorted(list(train_data["race"].unique())),
        "gender": sorted(list(train_data["gender"].unique())),
        "native_country": sorted(list(train_data["native_country"].unique())),
    }

    # A list of the columns to ignore from the dataset.
    IGNORE_COLUMN_NAMES = ["fnlwgt"]

    # A list of the categorical feature names.
    CATEGORICAL_FEATURE_NAMES = list(
        CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())

    # A list of all the input features.
    FEATURE_NAMES = NUMERICAL_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

    # A list of column default values for each feature.
    COLUMN_DEFAULTS = [[0.0] if feature_name in NUMERICAL_FEATURE_NAMES +
                       IGNORE_COLUMN_NAMES else ["NA"]
                       for feature_name in CSV_HEADER]

    # The name of the target feature.
    TARGET_FEATURE_NAME = "income_bracket"

    # A list of the labels of the target features.
    TARGET_LABELS = [" <=50K", " >50K"]

    # Create tf.data.Dataset objects for training and validation.
    target_label_lookup = StringLookup(vocabulary=TARGET_LABELS,
                                       mask_token=None,
                                       num_oov_indices=0)

    # Set up the code that will train and evaluate the model.
    learning_rate = 0.01
    batch_size = 265
    num_epochs = 10
    hidden_units = [64, 64]

    # Experiment 1: Train a decision tree model.
    # In this experiment, train a single neural decision tree model
    # that uses all input features.
    num_trees = 10
    depth = 10
    used_features_rate = 1.0
    num_classes = len(TARGET_LABELS)

    tree_model = create_tree_model(FEATURE_NAMES, NUMERICAL_FEATURE_NAMES,
                                   CATEGORICAL_FEATURE_NAMES,
                                   CATEGORICAL_FEATURES_WITH_VOCABULARY, depth,
                                   used_features_rate, num_classes)
    run_experiment(tree_model, learning_rate, train_data_file, test_data_file,
                   CSV_HEADER, COLUMN_DEFAULTS, TARGET_FEATURE_NAME,
                   target_label_lookup, batch_size, num_epochs)

    # Experiment 2: Train a forest model.
    # In this experiment, train a neural decision forest with num_trees
    # where each tree uses randomly selected 50% of the input features.
    # Can control the number of features to be used in each tree by
    # setting the used_features_rate variable. In addition, set the
    # depth to 5 instead of 10 compared to the previous experiment.
    num_trees = 25
    depth = 5
    used_features_rate = 0.5

    forest_model = create_forest_model(FEATURE_NAMES, NUMERICAL_FEATURE_NAMES,
                                       CATEGORICAL_FEATURE_NAMES,
                                       CATEGORICAL_FEATURES_WITH_VOCABULARY,
                                       num_trees, depth, used_features_rate,
                                       num_classes)
    run_experiment(forest_model, learning_rate, train_data_file,
                   test_data_file, CSV_HEADER, COLUMN_DEFAULTS,
                   TARGET_FEATURE_NAME, target_label_lookup, batch_size,
                   num_epochs)

    # Exit the program.
    exit(0)