def main(data_dir):
    x_train, x_val, x_test, y_train, y_val, y_test = load_data(data_dir)
    batch_size = 128
    max_vocab_size = 20000
    max_seq_len = 30
    embedding_dim = 100
    lstm_dim = 128

    vectorizer = TextVectorization(max_tokens=max_vocab_size,
                                   output_sequence_length=max_seq_len)
    text_data = tf.data.Dataset.from_tensor_slices(x_train).batch(batch_size)
    print('Building vocabulary')
    vectorizer.adapt(text_data)
    vocab = vectorizer.get_vocabulary()

    # load pre-trained w2v model
    w2v = Word2Vec.load(os.path.join(data_dir, 'processed/w2v.model'))

    print('Building embedding matrix')
    # This matrix will be used to initialize weights in the embedding layer
    embedding_matrix = build_embedding_mat(data_dir, vocab, w2v)
    print('embedding_matrix.shape => {}'.format(embedding_matrix.shape))

    X_train = vectorizer(np.array([[s] for s in x_train])).numpy()
    X_val = vectorizer(np.array([[s] for s in x_val])).numpy()
    X_test = vectorizer(np.array([[s] for s in x_test])).numpy()
    y_train = np.array(y_train)
    y_val = np.array(y_val)
    y_test = np.array(y_test)

    acc_scores = {}
    dropout = 0.7
    for layer in ['sigmoid', 'relu', 'tanh']:
        print("Building the model with ", layer, " and dropout ", dropout)
        model = Sequential()
        model.add(Embedding(input_dim=max_vocab_size + 3,
                            output_dim=embedding_dim,
                            input_length=max_seq_len,
                            weights=[embedding_matrix],
                            trainable=True))
        model.add(Flatten())
        model.add(Dense(lstm_dim,
                        activation=layer,
                        kernel_regularizer=l2(0.01),
                        bias_regularizer=l2(0.01)))
        model.add(Dropout(dropout))
        model.add(Dense(2, activation='softmax', name='output_layer'))
        print(model.summary())

        print("Compiling the model")
        model.compile(loss="sparse_categorical_crossentropy",
                      optimizer="adam",
                      metrics=["acc"])

        print("Fitting the model")
        model.fit(X_train, y_train,
                  batch_size=batch_size,
                  epochs=10,
                  validation_data=(X_val, y_val))

        scores = model.evaluate(X_val, y_val)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
        acc_scores[layer + "_val" + str(dropout)] = scores[1] * 100

        print("Evaluating model on test data")
        scores = model.evaluate(X_test, y_test)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
        acc_scores[layer + "_test" + str(dropout)] = scores[1] * 100

        # model.save(os.path.join(data_dir, 'processed/' + layer + str(dropout)))
        model.save(os.path.join(data_dir, 'processed/' + layer + '.model'))

    print(acc_scores)
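# NOTE: build_embedding_mat and load_data are defined elsewhere and not shown in this
# snippet. Below is only a minimal sketch of how such an embedding matrix could be
# built from the TextVectorization vocabulary and a gensim Word2Vec model; the function
# name, signature, and the number of extra rows reserved for special tokens are
# assumptions, not the original implementation.
def build_embedding_mat_sketch(vocab, w2v, embedding_dim=100, extra_rows=3):
    # vocab[0] is the padding token and vocab[1] the OOV token; their rows (and any
    # extra reserved rows at the end) are left as zero vectors.
    embedding_matrix = np.zeros((len(vocab) + extra_rows, embedding_dim))
    for i, word in enumerate(vocab):
        if word in w2v.wv:
            embedding_matrix[i] = w2v.wv[word]
    return embedding_matrix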
def vocab_maker(data, max_dic_size, batch_size):
    # Create a vocabulary of the recommended size - 1, leaving room for the
    # padding and out-of-vocabulary tokens
    vectorizer = TextVectorization(max_tokens=max_dic_size - 1, output_mode='int')
    text_data = tf.data.Dataset.from_tensor_slices(data).batch(batch_size)
    vectorizer.adapt(text_data)
    # indices 0 and 1 are reserved for padding and out-of-vocabulary tokens
    vocab = vectorizer.get_vocabulary()
    # in some TF versions get_vocabulary() returns bytes; decode to str
    vocab = [x.decode('utf-8') for x in vocab]
    return vocab
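# Example usage (hedged; the variable names are illustrative):
#   vocab = vocab_maker(x_train, max_dic_size=20000, batch_size=128)
#   # tokens come back ordered by descending frequency, with the padding token ('')
#   # and the OOV token ('[UNK]') occupying indices 0 and 1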
def main(text_path, classifier):
    x_train, _, x_test, _, _, _ = load_data(text_path)
    x_test = x_test[-20:]
    print(x_test)

    model = keras.models.load_model(os.path.join(text_path, classifier))
    print(model.summary())

    vectorizer = TextVectorization(max_tokens=config['max_vocab_size'],
                                   output_sequence_length=config['max_seq_len'])
    train_data = tf.data.Dataset.from_tensor_slices(x_train).batch(config['batch_size'])
    vectorizer.adapt(train_data)

    x_test = vectorizer(np.array([[w] for w in x_test])).numpy()
    prediction = model.predict(x_test)
    print(prediction)
    classes = np.argmax(prediction, axis=-1)
    print(classes)
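# Hedged sketch (not in the original script): re-adapting the vectorizer at inference
# time requires the full training text. Alternatively, the adapted vectorizer and the
# trained classifier can be wrapped into one end-to-end model that accepts raw strings,
# so inference no longer vectorizes by hand. Names below are illustrative.
def build_end_to_end(vectorizer, model):
    inputs = keras.Input(shape=(1,), dtype=tf.string)
    outputs = model(vectorizer(inputs))
    return keras.Model(inputs, outputs)

# e2e = build_end_to_end(vectorizer, model)
# print(e2e.predict(np.array([['some raw input text']])))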
def main(data_dir):
    print('Loading data')
    x_train_val, x_test = load_data(data_dir)

    # decrease dataset size for quick testing
    # x_train_val = x_train_val[:1000]
    # x_test = x_test[:100]

    # build vocab
    # NOTE: this script only considers tokens in the training set to build the
    # vocabulary object.
    vectorizer = TextVectorization(
        max_tokens=config['max_vocab_size'],
        output_sequence_length=config['max_seq_len'])
    text_data = tf.data.Dataset.from_tensor_slices(x_train_val).batch(
        config['batch_size'])
    print('Building vocabulary')
    vectorizer.adapt(text_data)
    # NOTE: in this vocab, index 0 is reserved for padding and 1 is reserved
    # for out-of-vocabulary tokens
    vocab = vectorizer.get_vocabulary()

    # load pre-trained w2v model (this model was trained in tut_1)
    w2v = Word2Vec.load(os.path.join(data_dir, 'w2v.model'))

    print('Building embedding matrix')
    # This matrix will be used to initialize weights in the embedding layer
    embedding_matrix, word2token = build_embedding_mat(data_dir, vocab, w2v)
    print('embedding_matrix.shape => {}'.format(embedding_matrix.shape))

    print('Building Seq2Seq model')
    # build the embedding layer to convert token sequences into embeddings
    # set trainable to True if you wish to further finetune the embeddings.
    # It will increase train time but may yield better results. Try it out
    # on a more complex task (like neural machine translation)!
    embedding_layer = Embedding(
        input_dim=len(vocab) + 4,
        output_dim=config['embedding_dim'],
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )

    # build the encoding layers
    # encoder_inputs accepts padded tokenized sequences as input,
    # which are converted to embeddings by the embedding_layer
    # finally, the embedded sequences are fed to the encoder LSTM to get
    # encodings (or vector representations) of the input sentences
    # you can add dropout to the input/embedding layers to make your model robust
    encoder_inputs = Input((None, ), name='enc_inp')
    enc_embedding = embedding_layer(encoder_inputs)
    # you can choose a GRU/Dense layer as well to keep things simpler
    # note that we are not using the encoder_outputs for the given generative
    # task, but you'll need them for classification
    # Also, the hidden dimension is currently equal to the embedding dimension
    _, state_h, state_c = LSTM(
        config['embedding_dim'],  # try a different value
        return_state=True,
        name='enc_lstm')(enc_embedding)
    encoder_states = [state_h, state_c]

    # build the decoding layers
    # decoder_inputs and dec_embedding serve similar purposes as in the encoding
    # layers. Note that we are using the same embedding_layer to convert
    # token sequences to embeddings while encoding and decoding.
    # In this case, we initialize the decoder using `encoder_states`
    # as its initial state (i.e. the vector representation learned by the encoder).
    decoder_inputs = Input((None, ), name='dec_inp')
    dec_embedding = embedding_layer(decoder_inputs)
    dec_lstm = LSTM(config['embedding_dim'],
                    return_state=True,
                    return_sequences=True,
                    name='dec_lstm')
    dec_outputs, _, _ = dec_lstm(dec_embedding, initial_state=encoder_states)
    # finally, we add a fully connected layer which transforms the decoder
    # outputs into logit vectors
    dec_dense = Dense(len(vocab) + 4, activation='softmax', name='out')
    output = dec_dense(dec_outputs)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = Model([encoder_inputs, decoder_inputs], output)
    print(model.summary())

    # note that decoder_input_data holds the same token sequences as
    # encoder_input_data (and decoder_target_data is just their one-hot encoding)
    # as we are training a vanilla autoencoder
    # we are using np.ones as pad tokens are represented by 1 in our vocab
    # TODO: switch to a generator instead of creating such huge matrices.
    # It will reduce memory consumption a lot.
    encoder_input_data = np.ones((len(x_train_val), config['max_seq_len']),
                                 dtype='float32')
    decoder_input_data = np.ones((len(x_train_val), config['max_seq_len']),
                                 dtype='float32')
    decoder_target_data = np.zeros(
        (len(x_train_val), config['max_seq_len'], len(vocab) + 4),
        dtype='float32')
    for i, input_text in enumerate(x_train_val):
        tokenized_text = tokenize(input_text, word2token)
        for j in range(len(tokenized_text)):
            encoder_input_data[i, j] = tokenized_text[j]
            decoder_input_data[i, j] = tokenized_text[j]
            decoder_target_data[i, j, tokenized_text[j]] = 1.0

    # Run training (will take some time)
    print('Training model')
    # try different optimizers, learning rates, and analyze different metrics
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(
        [encoder_input_data, decoder_input_data],
        decoder_target_data,
        batch_size=config['batch_size'],
        epochs=10,  # try increasing #epochs
        validation_split=0.2)

    # Save model
    # this model is saved inside the tut_3/data folder just to showcase how
    # you can save your models as well inside respective assignment folders
    # and use them later
    model.save('tut_3/data/ae.model')
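# NOTE: tokenize() and word2token come from build_embedding_mat and are not shown in
# this snippet. A minimal sketch, assuming word2token maps a word to its row index in
# embedding_matrix and unknown words fall back to the OOV/pad index 1 used above; the
# real helper may differ.
def tokenize_sketch(text, word2token, max_seq_len=None):
    words = text.split()
    if max_seq_len is not None:
        words = words[:max_seq_len]
    return [word2token.get(w, 1) for w in words]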
lab_dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
lab_dataset = lab_dataset.batch(32)

def custom(input_data):
    lower = tf.strings.lower(input_data)
    lower = tf.strings.regex_replace(lower, '\n', ' ')
    lower = tf.strings.regex_replace(lower,
                                     '[%s]' % re.escape(string.punctuation), '')
    return lower

vector_layer = TextVectorization(standardize=custom,
                                 max_tokens=10000,
                                 output_sequence_length=250)
data_text = lab_dataset.map(lambda x, y: x)
vector_layer.adapt(data_text)

# expand the text dimension so it matches the input format expected by the
# vectorization layer, and keep the label paired with the text
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vector_layer(text), label

ready_train = lab_dataset.map(vectorize_text)
AUTOTUNE = tf.data.AUTOTUNE
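# A typical follow-up (hedged; not part of the original snippet): cache and prefetch
# the vectorized dataset so the input pipeline does not become a training bottleneck.
ready_train = ready_train.cache().prefetch(buffer_size=AUTOTUNE)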
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 15000
sequence_length = 20
batch_size = 64

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase,
                                    "[%s]" % re.escape(strip_chars), "")

eng_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)
spa_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)
train_eng_texts = [pair[0] for pair in train_pairs]
train_spa_texts = [pair[1] for pair in train_pairs]
eng_vectorization.adapt(train_eng_texts)
spa_vectorization.adapt(train_spa_texts)

"""
Next, we'll format our datasets.
"""
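# Hedged sketch of the formatting step referenced above: shift the Spanish sequence by
# one position for teacher forcing. The dict keys "encoder_inputs"/"decoder_inputs" and
# the make_dataset helper are assumptions about the downstream seq2seq model, not
# original code.
def format_dataset(eng, spa):
    eng = eng_vectorization(eng)
    spa = spa_vectorization(spa)
    return ({"encoder_inputs": eng, "decoder_inputs": spa[:, :-1]}, spa[:, 1:])

def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(spa_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()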
text_ds = tf.data.TextLineDataset(path_to_file).filter(
    lambda x: tf.cast(tf.strings.length(x), bool))

# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    return tf.strings.regex_replace(lowercase,
                                    '[%s]' % re.escape(string.punctuation), '')

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Set output_sequence_length to pad all samples to the same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

# Create vocabulary
vectorize_layer.adapt(text_ds.batch(1024))

# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()

# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

# Make sequences
sequences = list(text_vector_ds.as_numpy_iterator())

# Embedding dim
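# Hedged sketch (not part of the original snippet): positive skip-gram pairs for one
# vectorized sentence can be generated with the Keras skipgrams utility; the window
# size here is an illustrative value.
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
    sequences[0],
    vocabulary_size=vocab_size,
    sampling_table=sampling_table,
    window_size=2,
    negative_samples=0)
print('{} positive skip-gram pairs in the first sequence'.format(len(positive_skip_grams)))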
def main(data_dir):
    print('Loading data')
    x_train, x_val, x_test, y_train, y_val, y_test = load_data(data_dir)

    # build vocabulary
    vectorizer = TextVectorization(
        max_tokens=config['max_vocab_size'],
        output_sequence_length=config['max_seq_len'])
    text_data = tf.data.Dataset.from_tensor_slices(x_train).batch(
        config['batch_size'])
    print('Building vocabulary')
    vectorizer.adapt(text_data)
    vocab = vectorizer.get_vocabulary()

    # load pre-trained w2v model
    w2v = Word2Vec.load(os.path.join(data_dir, 'w2v.model'))

    # build embedding matrix
    print('Building embedding matrix')
    embedding_matrix = build_embedding_matrix(vocab, w2v)
    print('embedding_matrix.shape => {}'.format(embedding_matrix.shape))

    print('Building model')
    model = Sequential()
    model.add(
        Embedding(input_dim=len(vocab) + 2,
                  output_dim=config['embedding_dim'],
                  embeddings_initializer=keras.initializers.Constant(
                      embedding_matrix),
                  trainable=False,
                  name='embedding_layer'))
    # add hidden layer with activation, L2 regularization, and dropout
    model.add(
        LSTM(32,
             activation=sys.argv[2],
             kernel_regularizer=l2(0.0001),
             dropout=0.1,
             return_sequences=False,
             name='hidden_layer'))
    # last layer with activation
    model.add(Dense(2, activation='softmax', name='output_layer'))
    model.summary()

    print('train the model')
    # train the model
    # convert words to indices, put them in arrays
    num_classes = 2
    x_train = vectorizer(np.array([[w] for w in x_train])).numpy()
    x_val = vectorizer(np.array([[w] for w in x_val])).numpy()
    y_train = np.array(y_train)
    y_val = np.array(y_val)
    # convert labels to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_val = keras.utils.to_categorical(y_val, num_classes)

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(x_train, y_train,
              batch_size=config['batch_size'],
              epochs=12,
              validation_data=(x_val, y_val))

    model.save(data_dir + 'nn_' + sys.argv[2] + '.model')

    score = model.evaluate(x_val, y_val)
    print("Accuracy: {0: .2f}%".format(score[1] * 100))
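# Example invocation (hedged: the script name is illustrative; sys.argv[2] is read above
# as the activation function, and sys.argv[1] is presumably the data directory passed to
# main elsewhere in the script):
#   python train_classifier.py data/ tanh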