def main(save_filename=None,
         load_filename="simple_rnn_custom_model_weights.h5",
         do_train=False,
         num_epochs=2,
         cell_type='gru'):
    """ Entry point """
    if do_train:
        print("Training and saving model...")
        (model, vocab) = train_model(file_name=save_filename, num_epochs=num_epochs)
        ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))
        vocab_size = len(ids_from_chars.get_vocabulary())
    else:
        if load_filename is None:
            print(
                "ERROR: no load file name provided and the training flag is set to False; no model can be used"
            )
            return 1
        # TODO Somehow this vocab should be accessible without needing to read and process this data
        data = open('./archive/drake_lyrics.txt').read()
        print('Length of text: {} characters'.format(len(data)))
        vocab = sorted(set(data))
        ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))
        vocab_size = len(ids_from_chars.get_vocabulary())

        print("Loading model from disk...")
        #cell = custom_models.MyRNNCell(vocab_size)
        cell = custom_models.MyGRUCell(vocab_size)
        model = custom_models.MyCellModelWrapper(cell)
        utils.load_weights(load_filename, model,
                           tf.TensorShape([1, seq_length, vocab_size]))

    print("Generating Bars...please wait")
    seed_texts = [
        "[Verse]", "you", "love", "boy", "I love", "I love you", "Kiki, ",
        "Swanging"
    ]
    for seed in seed_texts:
        num_chars = 400
        output_text = utils.generate_text_one_h(seed,
                                                model,
                                                seq_length,
                                                ids_from_chars,
                                                chars_to_gen=num_chars)
        print(">>>>>>>>>>>>>>>>>>>>")
        print("Input seed: %s" % (seed))
        print("%d character generated sequence:\n%s\n" % (num_chars, output_text))
        print("<<<<<<<<<<<<<<<<<<<<")
        print("End of output for seed: %s" % (seed))
    #Hope you enjoyed :)
    return 0
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    # Create a StringLookup layer which will turn strings into integer indices
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_values=max_tokens)

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Create a CategoryEncoding for our integer indices.
    encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())

    # Prepare a Dataset that only yields our feature.
    feature_ds = feature_ds.map(index)

    # Learn the space of possible indices.
    encoder.adapt(feature_ds)

    # Apply one-hot encoding to our indices. The lambda function captures the
    # layers, so we can use them, or include them in the functional model later.
    return lambda feature: encoder(index(feature))
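# A hypothetical usage sketch for the helper above (not part of the original code).
# It assumes the same TF 2.3/2.4-era experimental preprocessing API the helper relies on
# (CategoryEncoding(max_tokens=...), vocab_size(), adapt on the encoder); the feature
# name "color" and the tiny in-memory dataset are purely illustrative.
import tensorflow as tf

raw_features = {"color": ["red", "blue", "red", "green"]}
labels = [0, 1, 0, 1]
ds = tf.data.Dataset.from_tensor_slices((raw_features, labels)).batch(2)

color_input = tf.keras.Input(shape=(1,), name="color", dtype=tf.string)
encode_color = get_category_encoding_layer("color", ds, dtype="string")
encoded_color = encode_color(color_input)  # one-/multi-hot tensor, ready for a Dense head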
def get_data(data_file):
    # Read, then decode for py2 compat.
    text = open(data_file, 'rb').read().decode(encoding='utf-8')
    vocab = sorted(set(text))

    ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab), mask_token=None)
    chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

    all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
    ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

    seq_length = 100
    sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)
    dataset = sequences.map(split_input_target)
    dataset = (dataset.shuffle(BUFFER_SIZE).batch(
        BATCH_SIZE, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE))

    return dataset, ids_from_chars, chars_from_ids
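# `split_input_target` is referenced above but not defined in this snippet. A minimal
# sketch consistent with the standard character-model setup (an assumption, not
# necessarily the original helper):
def split_input_target(sequence):
    # e.g. "drake" -> input "drak", target "rake"
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text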
def create_alphabet_data(seq_length=30):
    """ Creates a dataset from the alphabet text file
    @return Tuple of (xs, ys, vocab_size, ids_from_chars) as a training set from the alphabet sample file
    """
    data = open('./archive/alphabet2.txt').read()
    #print('Length of text: {} characters'.format(len(data)))
    vocab = sorted(set(data))

    # This function-as-variable setup is weird to me, but whatever
    ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))
    chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=ids_from_chars.get_vocabulary(), invert=True)

    # Preprocess the text into characters
    all_ids = ids_from_chars(tf.strings.unicode_split(data, 'UTF-8'))
    vocab_size = len(ids_from_chars.get_vocabulary())

    # Sanity tests
    vocab_sample = list(range(0, vocab_size))
    tf_vocab = tf.convert_to_tensor(vocab_sample)
    mapped_vocab = chars_from_ids(tf_vocab).numpy()

    # Warning: This is an untested function used as a test dependency
    (xs, ys) = utils.split_data_new(all_ids.numpy(), vocab_size, seq_length)
    return (xs, ys, vocab_size, ids_from_chars)
def train_model(file_name=None, debug=False, num_epochs=2, cell_type='gru'): """ Codepath to process input and train (as opposed to load up and generate)""" # Load Data data = open('./archive/drake_lyrics.txt').read() print('Length of text: {} characters'.format(len(data))) vocab = sorted(set(data)) # Preprocess the text into integers ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab)) chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup( vocabulary=ids_from_chars.get_vocabulary(), invert=True) all_ids = ids_from_chars(tf.strings.unicode_split(data, 'UTF-8')) vocab_size = len(ids_from_chars.get_vocabulary()) # Sanity Check: output vocab mapping vocab_sample = list(range(0, vocab_size)) tf_vocab = tf.convert_to_tensor(vocab_sample) mapped_vocab = chars_from_ids(tf_vocab).numpy() print(vocab_sample) print(mapped_vocab) # Creating dataset from pre-processed text print("Splitting file into dataset") (split_xs, split_ys) = utils.split_data_new(all_ids.numpy(), vocab_size, seq_length, total_splits=char_to_process) # Create the Model if cell_type == 'gru': cell = custom_models.MyGRUCell(vocab_size) model = custom_models.MyCellModelWrapper(cell) elif cell_type == 'rnn' or cell_type == 'simple': cell = custom_models.MyRNNCell(vocab_size) model = custom_models.MyCellModelWrapper(cell) elif cell_type == 'keras' or cell_type == 'keras_gru': cell = keras.layers.SimpleRNNCell(150) model = custom_models.KerasRNNCellWrapper(cell, vocab_size) else: print( "Fatal ERROR: cell_type provided does not match supported options, terminating." ) return -1 my_loss = tf.losses.CategoricalCrossentropy(from_logits=True) model.compile(loss=my_loss, optimizer=keras.optimizers.Adam(lr=0.001), metrics=['accuracy'], run_eagerly=True) # Train the model # TODO run this in a gradient tape loop and play with batch randomization model.fit(x=split_xs, y=split_ys, epochs=num_epochs, verbose=1, batch_size=64) print(model.summary()) if file_name is not None: utils.save_model(file_name, model) return (model, vocab)
def getCategoryEncodingLayer(self, name, dataset, dtype, max_tokens=None): if dtype == 'string': index = preprocessing.StringLookup(max_tokens=max_tokens) else: index = preprocessing.IntegerLookup(max_tokens=max_tokens) feature_ds = dataset.map(lambda x, y: x[name]) index.adapt(feature_ds) encoder = preprocessing.CategoryEncoding( num_tokens=index.vocabulary_size()) return lambda feature: encoder(index(feature))
def __init__(self, encoding, **kwargs): super().__init__(**kwargs) self.encoding = encoding self.encoding_layers = [] for encoding in self.encoding: if encoding == NONE: self.encoding_layers.append(None) elif encoding == INT: self.encoding_layers.append(preprocessing.StringLookup()) elif encoding == ONE_HOT: self.encoding_layers.append(None)
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None): if dtype == 'string': index = preprocessing.StringLookup(max_tokens=max_tokens) else: index = preprocessing.IntegerLookup(max_values=max_tokens) feature_ds = dataset.map(lambda x, y: x[name]) index.adapt(feature_ds) encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size()) feature_ds = feature_ds.map(index) encoder.adapt(feature_ds) return lambda feature: encoder(index(feature))
def processcsv(featurecsv, csv, preprocess): from tensorflow.keras.layers.experimental import preprocessing inputs = {} for name, column in featurecsv.items(): dtype = column.dtype if dtype == object: dtype = tf.string else: dtype = tf.float32 inputs[name] = tf.keras.Input(shape=(1, ), name=name, dtype=dtype) numericInputs = { name: input for name, input in inputs.items() if input.dtype == tf.float32 } x = layers.Concatenate()(list(numericInputs.values())) if preprocess: norm = preprocessing.Normalization() norm.adapt(np.array(csv[numericInputs.keys()])) allNumericInputs = norm(x) preprocessedInputs = [allNumericInputs] else: preprocessedInputs = [x] for name, input in inputs.items(): if input.dtype == tf.float32: continue lookup = preprocessing.StringLookup( vocabulary=np.unique(featurecsv[name])) oneHot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size()) x = lookup(input) x = oneHot(x) preprocessedInputs.append(x) preprocessedInputsCat = layers.Concatenate()(preprocessedInputs) preprocessing = tf.keras.Model(inputs, preprocessedInputsCat) featuresDict = { name: np.array(value) for name, value in featurecsv.items() } return inputs, preprocessing, featuresDict
def train_model(file_name=None, debug=False): """ Codepath to process input and train (as opposed to load up and generate)""" # Load Data data = open('./archive/drake_lyrics.txt').read() print('Length of text: {} characters'.format(len(data))) vocab = sorted(set(data)) # Preprocess the text into integers ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab)) chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup( vocabulary=ids_from_chars.get_vocabulary(), invert=True) all_ids = ids_from_chars(tf.strings.unicode_split(data, 'UTF-8')) vocab_size = len(ids_from_chars.get_vocabulary()) # Sanity Check: output vocab mapping vocab_sample = list(range(0, vocab_size)) tf_vocab = tf.convert_to_tensor(vocab_sample) mapped_vocab = chars_from_ids(tf_vocab).numpy() print(vocab_sample) print(mapped_vocab) # Creating dataset from pre-processed text print("Splitting file into dataset") # TODO Try new split data method (split_xs, split_ys) = utils.split_data(all_ids.numpy(), vocab_size, seq_length, total_splits=char_to_process) # Create the Model my_model = DrakeGRUSequential(vocab_size, embedding_dim) my_loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True) my_model.compile(loss=my_loss, optimizer=keras.optimizers.Adam(lr=0.001), metrics=['accuracy'], run_eagerly=debug) # Train the model # TODO run this in a gradient tape loop and play with batch randomization my_model.fit(x=split_xs, y=split_ys, epochs=2, verbose=1, batch_size=64) print(my_model.summary()) if file_name is not None: utils.save_model(file_name, my_model) return (my_model, vocab)
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Creates everything that's needed for a categorical encoding input pipeline.

    Args:
        name (string): name of the feature
        dataset (tf.data.Dataset): tensorflow dataset
        dtype (string): datatype
        max_tokens (int, optional): maximum number of tokens. Defaults to None.

    Returns:
        lambda function: categorical input pipeline
    """
    # Create a StringLookup layer which will turn strings into integer indices
    if dtype == 'string':
        index = exp_preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = exp_preprocessing.IntegerLookup(max_values=max_tokens)

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Create a CategoryEncoding for our integer indices.
    encoder = exp_preprocessing.CategoryEncoding(max_tokens=index.vocab_size())

    # Prepare a Dataset that only yields our feature.
    feature_ds = feature_ds.map(index)

    # Learn the space of possible indices.
    encoder.adapt(feature_ds)

    # Apply one-hot encoding to our indices. The lambda function captures the
    # layers, so we can use them, or include them in the functional model later.
    return lambda feature: encoder(index(feature))
def feats_encoding(df): # encode numerical variables inputs = {} for name, column in df.items(): dtype = column.dtype if dtype == object: dtype = tf.string else: dtype = tf.float32 inputs[name] = tf.keras.Input(shape=(1, ), name=name, dtype=dtype) numeric_inputs = { name: input for name, input in inputs.items() if input.dtype == tf.float32 } x = layers.Concatenate()(list(numeric_inputs.values())) norm = preprocessing.Normalization() norm.adapt(np.array(df[numeric_inputs.keys()])) all_numeric_inputs = norm(x) preprocessed_inputs = [all_numeric_inputs] # all_numeric_inputs # encode categorial variables for feature in ["directors", "kinds"]: #'movie_id', lookup = preprocessing.StringLookup(vocabulary=np.unique(df[feature])) one_hot = preprocessing.CategoryEncoding( max_tokens=lookup.vocab_size()) x = lookup(inputs[feature]) x = one_hot(x) preprocessed_inputs.append(x) preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs) return tf.keras.Model(inputs, preprocessed_inputs_cat), inputs
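# A hypothetical sketch of wiring the preprocessing model and symbolic inputs returned
# above to a small regression head (the DataFrame `df`, the head architecture, and the
# loss are illustrative assumptions, not part of the original code):
preproc_model, inputs = feats_encoding(df)
x = preproc_model(inputs)
x = layers.Dense(64, activation="relu")(x)
output = layers.Dense(1)(x)
full_model = tf.keras.Model(inputs, output)
full_model.compile(loss=tf.losses.MeanSquaredError(), optimizer=tf.optimizers.Adam())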
def __init__(self, config): super(QuerySchemaEncoder, self).__init__() self.nnlm_embedder = hub.load(config['tf_hub_model']) self.margin = config['contrastive_loss_margin'] dim = config['dim'] self.use_char_embed = config['use_char_embedding'] if self.use_char_embed: self.char_vocab = config['character_vocab'] self.char_embedding_dim = config['char_embedding_dim'] self.ids_from_chars = preprocessing.StringLookup(vocabulary=self.char_vocab, mask_token=None) self.char_encoder = tf.keras.models.Sequential() self.char_encoder.add(tf.keras.layers.Embedding(len(self.char_vocab), self.char_embedding_dim)) self.char_encoder.add(tf.keras.layers.LSTM(self.char_embedding_dim, activation='relu')) self.char_word_combiner = tf.keras.models.Sequential() self.char_word_combiner.add(tf.keras.layers.InputLayer(input_shape=(dim + self.char_embedding_dim,))) self.char_word_combiner.add(tf.keras.layers.Dense(dim, activation='relu')) self.table_encoder_dense = tf.keras.models.Sequential() self.table_encoder_dense.add(tf.keras.layers.InputLayer(input_shape=(2 * dim,))) self.table_encoder_dense.add(tf.keras.layers.Dense(1.5 * dim, activation='relu')) self.table_encoder_dense.add(tf.keras.layers.Dense(dim, activation='relu')) self.use_lstm_query_encoder = config['use_lstm_query_encoder'] if self.use_lstm_query_encoder: self.query_encoder = tf.keras.models.Sequential() self.query_encoder.add(tf.keras.layers.LSTM(dim, activation='relu')) else: self.query_encoder = tf.keras.models.Sequential() self.query_encoder.add(tf.keras.layers.InputLayer(input_shape=(dim,))) self.query_encoder.add(tf.keras.layers.Dense(dim, activation='relu')) self.query_encoder.add(tf.keras.layers.Dense(dim, activation='relu'))
""" In addition, adaptable layers always expose an option to directly set state via constructor arguments or weight assignment. If the intended state values are known at layer construction time, or are calculated outside of the `adapt()` call, they can be set without relying on the layer's internal computation. For instance, if external vocabulary files for the `TextVectorization`, `StringLookup`, or `IntegerLookup` layers already exist, those can be loaded directly into the lookup tables by passing a path to the vocabulary file in the layer's constructor arguments. Here's an example where we instantiate a `StringLookup` layer with precomputed vocabulary: """ vocab = ["a", "b", "c", "d"] data = tf.constant([["a", "c", "d"], ["d", "z", "b"]]) layer = preprocessing.StringLookup(vocabulary=vocab) vectorized_data = layer(data) print(vectorized_data) """ ## Preprocessing data before the model or inside the model There are two ways you could be using preprocessing layers: **Option 1:** Make them part of the model, like this: ```python inputs = keras.Input(shape=input_shape) x = preprocessing_layer(inputs) outputs = rest_of_the_model(x) model = keras.Model(inputs, outputs)
variable_partitioner = ( tf.distribute.experimental.partitioners.FixedShardsPartitioner( num_shards=NUM_PS)) strategy = tf.distribute.experimental.ParameterServerStrategy( cluster_resolver, variable_partitioner=variable_partitioner) # Setup Data feature_vocab = [ "avenger", "ironman", "batman", "hulk", "spiderman", "kingkong", "wonder_woman" ] label_vocab = ["yes", "no"] with strategy.scope(): feature_lookup_layer = kpl.StringLookup(vocabulary=feature_vocab) label_lookup_layer = kpl.StringLookup(vocabulary=label_vocab, num_oov_indices=0, mask_token=None) raw_feature_input = keras.layers.Input(shape=(3, ), dtype=tf.string, name="feature") feature_id_input = feature_lookup_layer(raw_feature_input) feature_preprocess_stage = keras.Model({"features": raw_feature_input}, feature_id_input) raw_label_input = keras.layers.Input(shape=(1, ), dtype=tf.string, name="label")
# Text processing
print("---------- Processing text 2/2 ----------")
vocab = set()
for text in texts:
    vocab = vocab.union(set(text.split()))
vocab = sorted(vocab)

vectorizer = TextVectorization(standardize=None)
text_ds = tf.data.Dataset.from_tensor_slices(vocab).batch(128)
vectorizer.adapt(text_ds)

tokens_from_words = preprocessing.StringLookup(
    vocabulary=vectorizer.get_vocabulary())
words_from_tokens = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=tokens_from_words.get_vocabulary(), invert=True)


def text_from_tokens(ids):
    return tf.strings.reduce_join(words_from_tokens(ids), axis=-1)


print(len(tokens_from_words.get_vocabulary()))

voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))
test = ["pan", "jest", "tu", "i", "tam", "."]
[word_index[w] for w in test]
def processInput(filename): heart_data = pd.read_csv(filename, usecols=range(1, 11)) heart_features = heart_data.copy() heart_labels = heart_features.pop('chd') # Preprocessing inputs = {} for name, column in heart_features.items(): dtype = column.dtype if dtype == object: dtype = tf.string else: dtype = tf.float32 inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype) numeric_inputs = {name:input for name, input in inputs.items() if input.dtype==tf.float32} x = layers.Concatenate()(list(numeric_inputs.values())) norm = preprocessing.Normalization() norm.adapt(np.array(heart_data[numeric_inputs.keys()])) all_numeric_inputs = norm(x) preprocessed_inputs = [all_numeric_inputs] for name, input in inputs.items(): if input.dtype == tf.float32: continue lookup = preprocessing.StringLookup(vocabulary=np.unique(heart_features[name])) one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size()) x = lookup(input) x = one_hot(x) preprocessed_inputs.append(x) preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs) heart_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat) heart_features_dict = {name: np.array(value) for name, value in heart_features.items()} def heart_model(preprocessing_head, inputs): body = tf.keras.Sequential([ layers.Dense(512, kernel_regularizer=regularizers.l2(0.001), activation='elu'), layers.Dense(512, activation='elu'), layers.Dropout(0.3), layers.Dense(1) ]) preprocessed_inputs = preprocessing_head(inputs) result = body(preprocessed_inputs) model = tf.keras.Model(inputs, result) model.compile(loss=tf.losses.BinaryCrossentropy(from_logits=True), optimizer=tf.optimizers.Adam(), metrics=['accuracy']) return model heart_model = heart_model(heart_preprocessing, inputs) return heart_features_dict, heart_labels, heart_model
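# A hypothetical invocation of the pipeline above (the CSV file name is an illustrative
# assumption; the file must contain the same columns, including the 'chd' label):
heart_features_dict, heart_labels, heart_model = processInput("heart.csv")
heart_model.fit(x=heart_features_dict, y=heart_labels, epochs=10, batch_size=32)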
import json

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

from . import models

path_to_file = "./DS_1.txt"
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
vocab = sorted(set(text))
VOCAB_SIZE = len(vocab)

# CHAR ENCODING TO ID
chars = tf.strings.unicode_split(text, input_encoding='UTF-8')
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))
ids = ids_from_chars(chars)

# INVERSION TO CHAR
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True)

with open('./model_arch.json') as f:
    model_archs = json.load(f)


def text_from_ids(ids):
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)


def generate_lyrics(model_name, temp, length, input_text):
# Some other parameters.
BATCH_SIZE = 64
BUFFER_SIZE = 10000
EPOCHS = 5

# Download and load the text. Define its vocabulary.
path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
vocab = sorted(set(text))

# Text Vectorization.
# Create the preprocessing layers which can convert chars and IDs.
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab), mask_token=None)
chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)


def text_from_ids(ids):
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)


# Define the dataset in terms of IDs.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# Define batches from the dataset. Sequences of characters are given as batches of a given size.
sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)
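# Quick sanity check (an illustrative addition, not part of the original snippet):
# decode one (seq_length + 1)-character training sequence back to text.
for seq in sequences.take(1):
    print(text_from_ids(seq).numpy().decode('utf-8'))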
def input_layer(self): return preprocessing.StringLookup(**self.feature_params)(self.inputs)
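# A standalone sketch of what this property produces, with illustrative values standing
# in for self.inputs and self.feature_params (both are assumptions about the owning class):
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

inputs = tf.keras.Input(shape=(1,), dtype=tf.string, name="category")
feature_params = {"vocabulary": ["red", "green", "blue"], "mask_token": None}
indexed = preprocessing.StringLookup(**feature_params)(inputs)  # integer ids per string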
numeric_inputs = {name: input for name, input in inputs.items() if input.dtype == tf.float32} x = layers.Concatenate()(list(numeric_inputs.values())) norm = preprocessing.Normalization() norm.adapt(np.array(ti[numeric_inputs.keys()])) all_numeric_inputs = norm(x) ppi = [all_numeric_inputs] for name, input in inputs.items(): if input.dtype == tf.float32: continue lookup = preprocessing.StringLookup( vocabulary=np.unique(tif[name])) one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size()) x = lookup(input) x = one_hot(x) ppi.append(x) ppic = layers.Concatenate()(ppi) tip = tf.keras.Model(inputs, ppic) # tf.keras.utils.plot_model(model=tip, rankdir='LR', dpi=72, show_shapes=True) tifd = {name: np.array(value) for name, value in tif.items()} # fd = {name: values[:1] for name, values in tifd.items()} # print(tip(fd))
print(f"--------- {i}/{len(texts)} ----------") text = text.translate( str.maketrans(unwanted_whitespaces, "".join([" " for _ in range(len(unwanted_whitespaces))]))) text = text.translate(str.maketrans("", "", unwanted_chars)) # Przetworzenie tekstu print("---------- Przetwarzanie teksty 2/2----------") vocab = set() for text in texts: vocab = vocab.union(set(text)) vocab = sorted(vocab) ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab)) chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup( vocabulary=ids_from_chars.get_vocabulary(), invert=True) def text_from_ids(ids): return tf.strings.reduce_join(chars_from_ids(ids), axis=-1) print("---------- Tokenizacja tekstu ----------") all_ids = [] ids_datasets = [] for i, text in enumerate(texts): print(f"Tokenizacja tekstu {i}/{len(texts)}")
def main(): # In memory data url = 'https://storage.googleapis.com/download.tensorflow.org/data/abalone_train.csv' abalone_train = pd.read_csv(url, names=[ 'Length', 'Diamenter', 'Height', 'Whole weight', 'Viscera weight', 'Shell weight', 'Age' ]) print(abalone_train.head()) abalone_features = abalone_train.copy() abalone_labels = abalone_features.pop('Age') abalone_features = np.array(abalone_features) print(f'Features: {abalone_features}') abalone_model = tf.keras.Sequential([layers.Dense(64), layers.Dense(1)]) abalone_model.compile(loss=tf.losses.MeanSquaredError(), optimizer=tf.optimizers.Adam()) # Basic preprocessing normalize = preprocessing.Normalization() normalize.adapt(abalone_features) norm_abalone_model = tf.keras.Sequential( [normalize, layers.Dense(64), layers.Dense(1)]) norm_abalone_model.compile(loss=tf.losses.MeanSquaredError(), optimizer=tf.optimizers.Adam()) norm_abalone_model.fit(abalone_features, abalone_labels, epochs=10) # Mixed data types url = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv' titanic = pd.read_csv(url) print(titanic.head()) titanic_features = titanic.copy() titanic_labels = titanic_features.pop('survived') # Create a symbolic input input = tf.keras.Input(shape=(), dtype=tf.float32) # Do a calculation using is result = 2 * input + 1 # The result doesn't have a value print(f'Result: {result}') calc = tf.keras.Model(inputs=input, outputs=result) print(f'calc(1) = {calc(1).numpy()}') print(f'calc(2) = {calc(2).numpy()}') inputs = {} for name, column in titanic_features.items(): dtype = column.dtype if dtype == object: dtype = tf.string else: dtype = tf.float32 inputs[name] = tf.keras.Input(shape=(1, ), name=name, dtype=dtype) inputs numeric_inputs = { name: input for name, input in inputs.items() if input.dtype == tf.float32 } x = layers.Concatenate()(list(numeric_inputs.values())) norm = preprocessing.Normalization() norm.adapt(np.array(titanic[numeric_inputs.keys()])) all_numeric_inputs = norm(x) all_numeric_inputs preprocessed_inputs = [all_numeric_inputs] for name, input in inputs.items(): if input.dtype == tf.float32: continue lookup = preprocessing.StringLookup( vocabulary=np.unique(titanic_features[name])) one_hot = preprocessing.CategoryEncoding( max_tokens=lookup.vocab_size()) x = lookup(input) x = one_hot(x) preprocessed_inputs.append(x) preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs) titanic_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat) tf.keras.utils.plot_model(model=titanic_preprocessing, rankdir='LR', dpi=72, show_shapes=True) titanic_features_dict = { name: np.array(value) for name, value in titanic_features.items() } features_dict = { name: values[:1] for name, values in titanic_features_dict.items() } titanic_preprocessing(features_dict) titanic_model = get_titanic_model(titanic_preprocessing, inputs) titanic_model.fit(x=titanic_features_dict, y=titanic_labels, epochs=10) titanic_model.save('test') reloaded = tf.keras.models.load_model('test') features_dict = { name: values[:1] for name, values in titanic_features_dict.items() } before = titanic_model(features_dict) after = reloaded(features_dict) assert (before - after) < 1e-3 print(f'Before: {before}') print(f'After: {after}') # Using tf.data # On in memory datasets for example in slices(titanic_features_dict): for name, value in example.items(): print(f'{name:19s}: {value}') break titanic_ds = tf.data.Dataset.from_tensor_slices( (titanic_features_dict, titanic_labels)) titanic_batches = 
titanic_ds.shuffle(len(titanic_labels)).batch(32) titanic_model.fit(titanic_batches, epochs=5) # From a single file url = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv' titanic_file_path = tf.keras.utils.get_file('train.csv', url) titanic_csv_ds = tf.data.experimental.make_csv_dataset( titanic_file_path, batch_size=5, # Artificiallly small to make examples easier to show. label_name='survived', num_epochs=1, ignore_errors=True, ) for batch, label in titanic_csv_ds.take(1): for key, value in batch.items(): print(f'{key:20s}: value') print() print(f'{"label":20s}: {label}') url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz' traffic_volume_csv_gz = tf.keras.utils.get_file( 'Metro_Interstate_Traffic_Volume.csv.gz', url, cache_dir='.', cache_subdir='traffic') traffic_volume_csv_gz_ds = tf.data.experimental.make_csv_dataset( traffic_volume_csv_gz, batch_size=256, label_name='traffic_volume', num_epochs=1, compression_type='GZIP') for batch, label in traffic_volume_csv_gz_ds.take(1): for key, value in batch.items(): print(f'{key:20s}: {value[:5]}') print() print(f'{"label":20s}: {label[:5]}') #Caching start = time.time() for i, (batch, label) in enumerate(traffic_volume_csv_gz_ds.repeat(20)): if i % 40 == 0: print('.', end='') print(f'Total time: {time.time() - start:.3f}') caching = traffic_volume_csv_gz_ds.cache().shuffle(1000) start = time.time() for i, (batch, label) in enumerate(caching.shuffle(1000).repeat(20)): if i % 40 == 0: print('.', end='') print(f'Total time: {time.time() - start:.3f}') start = time.time() snapshot = tf.data.experimental.snapshot('titanic.tfsnap') snapshotting = traffic_volume_csv_gz_ds.apply(snapshot).shuffle(1000) for i, (batch, label) in enumerate(snapshotting.shuffle(1000).repeat(20)): if i % 40 == 0: print('.', end='') print(f'Total time: {time.time() - start:.3f}') # Multiple files url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00417/fonts.zip' _ = tf.keras.utils.get_file('fonts.zip', url, cache_dir='.', cache_subdir='fonts', extract=True) fonts_csvs = sorted(str(p) for p in pathlib.Path('fonts').glob('*.csv')) print(f'Fonts: {fonts_csvs[:10]}') print(f'Fonts len: {len(fonts_csvs)}') fonts_ds = tf.data.experimental.make_csv_dataset( file_pattern='fonts/*.csv', batch_size=10, num_epochs=1, num_parallel_reads=20, shuffle_buffer_size=10000) for features in fonts_ds.take(1): for i, (name, value) in enumerate(features.items()): if i > 15: break print(f'{name:20s}: {value}') print('...') print(f'[total: {len(features)} features]') # Optional: Packing fields fonts_image_ds = fonts_ds.map(make_images) for features in fonts_image_ds.take(1): break plt.figure(figsize=(6, 6), dpi=120) for n in range(9): plt.subplot(3, 3, n + 1) plt.imshow(features['image'][..., n]) plt.title(chr(features['m_label'][n])) plt.axis('off') plt.show() # Lower level functions # `tf.io.decode_csv` text = pathlib.Path(titanic_file_path).read_text() lines = text.split('\n')[1:-1] all_strings = [str()] * 10 print(f'{all_strings}') features = tf.io.decode_csv(lines, record_defaults=all_strings) for f in features: print(f'type: {f.dtype.name}, shape: {f.shape}') print(f'Sample record: {lines[0]}') titanic_types = [ int(), str(), float(), int(), int(), float(), str(), str(), str(), str() ] print(f'Data types: {titanic_types}') features = tf.io.decode_csv(lines, record_defaults=titanic_types) for f in features: print(f'type: {f.dtype.name}, shape: {f.shape}') # `tf.data.experimental.CsvDataset` simple_titanic 
= tf.data.experimental.CsvDataset( titanic_file_path, record_defaults=titanic_types, header=True) for example in simple_titanic.take(1): print(f'Sample record: {[e.numpy() for e in example]}') def decode_titanic_line(line): return tf.io.decode_csv(line, titanic_types) manual_titanic = ( # Load the lines of text tf.data.TextLineDataset(titanic_file_path) # Skip the header row .skip(1) # Decode the line .map(decode_titanic_line)) for example in manual_titanic.take(1): print(f'Sample record: {[e.numpy() for e in example]}') # Multiple files font_line = pathlib.Path(fonts_csvs[0]).read_text().splitlines()[1] print(f'Sample: {font_line}') num_font_features = font_line.count(',') + 1 font_column_types = [str(), str()] + [float()] * (num_font_features - 2) print(f'Fonts[0]: {fonts_csvs[0]}') simple_font_ds = tf.data.experimental.CsvDataset( fonts_csvs, record_defaults=font_column_types, header=True) for row in simple_font_ds.take(10): print(f'CSV first column: {row[0].numpy()}') font_files = tf.data.Dataset.list_files('fonts/*.csv') print('Epoch 1:') for f in list(font_files)[:5]: print(f' {f.numpy()}') print(' ...') print() print('Epoch 2:') for f in list(font_files)[:5]: print(f' {f.numpy()}') print(' ...') def make_font_csv_ds(path): return tf.data.experimental.CsvDataset( path, record_defaults=font_column_types, header=True) font_rows = font_files.interleave(make_font_csv_ds, cycle_length=3) fonts_dict = {'font_name': [], 'character': []} for row in font_rows.take(10): fonts_dict['font_name'].append(row[0].numpy().decode()) fonts_dict['character'].append(chr(row[2].numpy())) print(pd.DataFrame(fonts_dict)) # Performance BATCH_SIZE = 2048 font_ds = tf.data.experimental.make_csv_dataset(file_pattern='fonts/*.csv', batch_size=BATCH_SIZE, num_epochs=1, num_parallel_reads=100) start = time.time() for i, batch in enumerate(font_ds.take(20)): print('.', end='') print(f'Total time: {time.time() - start:.3f}')
} x = layers.Concatenate()(list(numeric_inputs.values())) norm = preprocessing.Normalization() norm.adapt(np.array(titanic[numeric_inputs.keys()])) all_numeric_inputs = norm(x) all_numeric_inputs preprocessed_inputs = [all_numeric_inputs] for name, input in inputs.items(): if input.dtype == tf.float32: continue lookup = preprocessing.StringLookup( vocabulary=np.unique(titanic_features[name])) one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size()) x = lookup(input) x = one_hot(x) preprocessed_inputs.append(x) preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs) titanic_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat) # tf.keras.utils.plot_model(model = titanic_preprocessing , rankdir="LR", dpi=72, show_shapes=True) titanic_features_dict = { name: np.array(value) for name, value in titanic_features.items()
def main(do_train=True): ## Open and pre-process the data # Verified this works with alphabet now lets make things more interesting # data = open('./archive/alphabet.txt').read() data = open('./archive/drake_lyrics.txt').read() print('Length of text: {} characters'.format(len(data))) vocab = sorted(set(data)) # This function as variable setup is weird to me but whatever ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab)) chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup( vocabulary=ids_from_chars.get_vocabulary(), invert=True) # Preprocess the text into characters all_ids = ids_from_chars(tf.strings.unicode_split(data, 'UTF-8')) vocab_size = len(ids_from_chars.get_vocabulary()) # Output vocab mapping for sanity vocab_sample = list(range(0, vocab_size)) tf_vocab = tf.convert_to_tensor(vocab_sample) mapped_vocab = chars_from_ids(tf_vocab).numpy() print(vocab_sample) print(mapped_vocab) if do_train: (split_xs, split_ys) = split_data(all_ids.numpy(), vocab_size, seq_length, char_to_process) ## Build the model model = tf.keras.models.Sequential() model.add( tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=seq_length)) model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150))) model.add(tf.keras.layers.Dense(vocab_size, activation='softmax')) print(model.summary()) adam = tf.keras.optimizers.Adam(lr=0.01) model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy']) ## Train the model #earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto') history = model.fit(x=split_xs, y=split_ys, epochs=10, verbose=1) # Uncomment to show dope graph #plot_graphs(history, 'accuracy') saved_model_dir = "./models/simple_model/" model_filename = 'simple_bars.h5' model.save(saved_model_dir + model_filename) else: print("Loading model from file.") model = tf.keras.models.load_model( "./models/simple_model/simple_bars.h5") ## Generate text with model # Dank so we trained for 50 iterations on a slice of the data # Let's see what this model generates for a few seeds num_chars = 100 seed_text = '[Verse]\n' output_text = generate_text(seed_text, model, ids_from_chars, chars_to_gen=num_chars) print("Input sequence was: %s" % (seed_text)) print("%d character generated sequence:\n%s" % (num_chars, output_text)) seed_text = 'boy' output_text = generate_text(seed_text, model, ids_from_chars, chars_to_gen=num_chars) print("Input sequence was: %s" % (seed_text)) print("%d character generated sequence:\n%s" % (num_chars, output_text)) seed_text = 'you' output_text = generate_text(seed_text, model, ids_from_chars, chars_to_gen=num_chars) print("Input sequence was: %s" % (seed_text)) print("%d character generated sequence:\n%s" % (num_chars, output_text)) seed_text = 'love' output_text = generate_text(seed_text, model, ids_from_chars, chars_to_gen=num_chars) print("Input sequence was: %s" % (seed_text)) print("%d character generated sequence:\n%s" % (num_chars, output_text))