def construct(self, text_ds, label_num):
    max_features = 20000
    embedding_dim = 128
    sequence_length = 200

    vectorize_layer = TextVectorization(
        standardize=custom_standardization,
        max_tokens=max_features,
        output_mode="int",
        output_sequence_length=sequence_length,
    )
    vectorize_layer.adapt(text_ds)

    inputs = tf.keras.Input(shape=(1,), dtype="string")
    indices = vectorize_layer(inputs)
    x = layers.Embedding(max_features + 1, embedding_dim)(indices)
    x = layers.Dropout(0.5)(x)
    # Global max pooling over the sequence dimension
    x = layers.GlobalMaxPooling1D()(x)
    predictions = layers.Dense(label_num, activation="sigmoid",
                               name="predictions")(x)

    model = tf.keras.Model(inputs, predictions)
    model.compile(loss="binary_crossentropy", optimizer="adam",
                  metrics=["accuracy"])
    return model
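# Several snippets in this collection pass standardize=custom_standardization
# without defining it. A minimal sketch, assuming the usual Keras
# text-classification recipe (lowercase, drop HTML <br /> tags, strip
# punctuation) and that tensorflow is imported as tf as in the surrounding
# code; the exact rules used by the original helpers may differ.
import re
import string

def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, "[%s]" % re.escape(string.punctuation), "")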
def trainWordVectorEncoder(trainText, VOCAB_SIZE=None):
    # https://www.tensorflow.org/tutorials/text/text_classification_rnn
    encoder = (TextVectorization() if VOCAB_SIZE is None
               else TextVectorization(max_tokens=VOCAB_SIZE))
    encoder.adapt(tf.data.Dataset.from_tensor_slices(trainText))
    return encoder
class TFVectTokenizer:
    def __init__(self, seqlen, step, freq_threshold):
        self.freq_threshold = freq_threshold
        self.seqlen = seqlen
        self.step = step
        self.vocab_size = 20000
        self.vectorize_layer = TextVectorization(
            standardize=custom_standardization,
            max_tokens=self.vocab_size - 1,
            output_mode="int",
            output_sequence_length=self.seqlen + 1,
        )

    def tokenize(self, text_ds):
        # Adapt the vectorization layer to the text
        self.vectorize_layer.adapt(text_ds)
        vocab = self.vectorize_layer.get_vocabulary()
        # To get words back from token indices
        reverse_token_map = {t: i for i, t in enumerate(vocab)}
        return text_ds, vocab, reverse_token_map

    def get_input_sequences(self, text_ds, reverse_token_map):
        text_ds = text_ds.map(
            lambda text: prepare_lm_inputs_labels(text, self.vectorize_layer))
        text_ds = text_ds.prefetch(tf.data.experimental.AUTOTUNE)
        return text_ds
def get_vectorize_layer(texts, vocab_size, max_seq):
    """Build text vectorization layer.

    Args:
        texts (list): List of strings, i.e., input texts.
        vocab_size (int): Vocabulary size.
        max_seq (int): Maximum sequence length.

    Returns:
        layers.Layer: A TextVectorization Keras layer.
    """
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Insert the mask token into the vocabulary.
    # adapt() reserves indices 0 and 1: '' is the padding token and '[UNK]' is
    # the out-of-vocabulary token; the rest of the vocabulary is lower-cased
    # and ordered by frequency, e.g. ['', '[UNK]', 'the', 'and', 'a', 'of', ...].
    vocab = vectorize_layer.get_vocabulary()
    # Drop the two reserved tokens (set_vocabulary re-adds them) and the least
    # frequent word, then append '[mask]' as the last vocabulary entry.
    vocab = vocab[2:len(vocab) - 1] + ["[mask]"]
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
def build_vocab(directories, batch_size, vocab_size, maxlen):
    global vectorize_layer

    # Create a list of all files
    filenames = []
    for dir in directories:
        for f in os.listdir(dir):
            filenames.append(os.path.join(dir, f))
    print(f"{len(filenames)} files")

    # Create dataset from text files
    random.shuffle(filenames)
    text_ds = tf.data.TextLineDataset(filenames)
    text_ds = text_ds.shuffle(buffer_size=256)
    text_ds = text_ds.batch(batch_size)

    # Create vectorization layer and adapt it to the text
    vectorize_layer = TextVectorization(
        standardize=custom_standardization,
        max_tokens=vocab_size - 1,
        output_mode="int",
        output_sequence_length=maxlen + 1,
    )
    vectorize_layer.adapt(text_ds)
    vocab = vectorize_layer.get_vocabulary()

    word_to_index = {}
    for index, word in enumerate(vocab):
        word_to_index[word] = index

    text_ds = text_ds.map(prepare_lm_inputs_labels)
    text_ds = text_ds.prefetch(tf.data.experimental.AUTOTUNE)
    return (text_ds, vocab, word_to_index)
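# build_vocab (and TFVectTokenizer.get_input_sequences above) maps the dataset
# through prepare_lm_inputs_labels, which is not shown in these snippets. A
# minimal sketch, following the standard Keras language-modelling recipe of
# shifting each token sequence by one so the label at position i is the token
# at position i + 1; it relies on the global vectorize_layer set by
# build_vocab, while the TFVectTokenizer variant would take the layer as a
# second argument instead.
def prepare_lm_inputs_labels(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]  # inputs: all tokens except the last
    y = tokenized_sentences[:, 1:]   # labels: the same tokens shifted left by one
    return x, y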
def makePrediction(messages_as_string):
    print("Running prediction function...")
    messages = list(messages_as_string.split('s3cur!tywh@l3'))

    vocab_size = 12612
    sequence_length = 1000
    embedding_layer = tf.keras.layers.Embedding(vocab_size, sequence_length)

    # Use the text vectorization layer to normalize, split, and map strings to
    # integers. Note that the layer uses the custom standardization defined above.
    # Set the maximum sequence length, as the samples are not all the same length.
    vectorizer = TextVectorization(max_tokens=vocab_size,
                                   output_sequence_length=sequence_length)
    text_ds = tf.data.Dataset.from_tensor_slices(messages).batch(32)
    vectorizer.adapt(text_ds)

    path = './assets/models/model.h5'
    print("trying to load model at: " + path)
    model = load_model(path)
    print("I loaded a model")

    string_input = keras.Input(shape=(1,), dtype="string")
    x = vectorizer(string_input)
    preds = model(x)
    end_to_end_model = keras.Model(string_input, preds)

    count = 0
    Vuln = 0
    vulnLengthSum = 0
    nonVuln = 0
    nonVulnLengthSum = 0
    for message in messages:
        count = count + 1
        probabilities = end_to_end_model.predict([[message]])
        if probabilities[0][1] > 0.5:
            vulnLengthSum = vulnLengthSum + len(message)
            Vuln = Vuln + 1
        if probabilities[0][0] > 0.5:
            nonVulnLengthSum = nonVulnLengthSum + len(message)
            nonVuln = nonVuln + 1

    vuln = str(Vuln)
    avg_vuln = '0' if vulnLengthSum == 0 else str(vulnLengthSum / Vuln)
    isVuln = 'true' if Vuln > nonVuln else 'false'
    non_vuln = str(nonVuln)
    avg_non_vuln = '0' if nonVulnLengthSum == 0 else str(nonVulnLengthSum / nonVuln)

    return_string = vuln + "," + non_vuln + "," + isVuln
    print("Response body: \n" + return_string)
    return return_string
def get_text_vec_model(train_samples):
    # Taken from: https://github.com/mlflow/mlflow/issues/3910
    # pylint: disable=no-name-in-module
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

    VOCAB_SIZE = 10
    SEQUENCE_LENGTH = 16
    EMBEDDING_DIM = 16

    vectorizer_layer = TextVectorization(
        input_shape=(1,),
        max_tokens=VOCAB_SIZE,
        output_mode="int",
        output_sequence_length=SEQUENCE_LENGTH,
    )
    vectorizer_layer.adapt(train_samples)
    model = tf.keras.Sequential([
        vectorizer_layer,
        tf.keras.layers.Embedding(
            VOCAB_SIZE,
            EMBEDDING_DIM,
            name="embedding",
            mask_zero=True,
            input_shape=(1,),
        ),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(1, activation="tanh"),
    ])
    model.compile(optimizer="adam", loss="mse", metrics="mae")
    return model
def build_classifier(text):
    MAX_VOCAB_SIZE = 20000
    encoder = TextVectorization(max_tokens=MAX_VOCAB_SIZE)
    encoder.adapt(text)

    vocabset = set(encoder.get_vocabulary())
    vocab_size = len(encoder.get_vocabulary())
    word2idx, weights = get_glove_embeddings(vocabset)

    # Build an embedding matrix aligned with the encoder's vocabulary,
    # leaving zero rows for words without a pretrained GloVe vector.
    embedding_matrix = np.zeros((vocab_size, weights.shape[1]))
    for i, word in enumerate(encoder.get_vocabulary()):
        vec = word2idx.get(word)
        if vec is not None:
            embedding_matrix[i] = weights[vec]

    model = tf.keras.Sequential([
        encoder,
        tf.keras.layers.Embedding(
            input_dim=embedding_matrix.shape[0],
            output_dim=embedding_matrix.shape[1],
            weights=[embedding_matrix],  # Embedding expects a list of weight arrays
            mask_zero=True,
            trainable=True
        ),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    return model
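# build_classifier relies on get_glove_embeddings, which is not part of this
# snippet. A minimal sketch, assuming pretrained GloVe vectors in a plain-text
# file (the path glove.6B.100d.txt is a placeholder): it returns a
# word -> row-index map and a matrix holding the vectors of in-vocabulary words.
def get_glove_embeddings(vocabset, glove_path="glove.6B.100d.txt"):
    word2idx = {}
    vectors = []
    with open(glove_path, encoding="utf-8") as f:
        for line in f:
            # Each line is "<word> <v1> <v2> ... <vN>", space-separated
            word, *values = line.rstrip().split(" ")
            if word in vocabset:
                word2idx[word] = len(vectors)
                vectors.append(np.asarray(values, dtype="float32"))
    return word2idx, np.vstack(vectors)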
def prepare_data_set_for_training(raw_train_ds, raw_val_ds, raw_test_ds):
    max_features = 10000
    sequence_length = 250

    # Create vectorization layer
    vectorize_layer = TextVectorization(standardize=custom_standardization,
                                        max_tokens=max_features,
                                        output_mode='int',
                                        output_sequence_length=sequence_length)

    # Make a text-only dataset (without labels), then call adapt
    train_text = raw_train_ds.map(lambda x, y: x)
    vectorize_layer.adapt(train_text)

    def vectorize_text(text, label):
        text = tf.expand_dims(text, -1)
        return vectorize_layer(text), label

    # Retrieve a batch (of 32 reviews and labels) from the dataset
    text_batch, label_batch = next(iter(raw_train_ds))
    first_review, first_label = text_batch[0], label_batch[0]
    print("Review", first_review)
    print("Label", raw_train_ds.class_names[first_label])
    print("Vectorized review", vectorize_text(first_review, first_label))

    # Explore the vocabulary
    print("1287 ---> ", vectorize_layer.get_vocabulary()[1287])
    print(" 313 ---> ", vectorize_layer.get_vocabulary()[313])
    print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

    train_ds = raw_train_ds.map(vectorize_text)
    val_ds = raw_val_ds.map(vectorize_text)
    test_ds = raw_test_ds.map(vectorize_text)
    return max_features, train_ds, val_ds, test_ds, vectorize_layer
def get_vectorize_layer(self, texts, special_tokens=["mask"]):
    """Build text vectorization layer.

    Args:
        texts (list): List of strings, i.e., input texts.
        special_tokens (list, optional): List of special tokens.
            Defaults to ["mask"].

    Returns:
        layers.Layer: A TextVectorization Keras layer.
    """
    vectorize_layer = TextVectorization(
        max_tokens=self.config.VOCAB_SIZE,
        output_mode="int",
        ngrams=None,
        standardize="lower_and_strip_punctuation",
        output_sequence_length=self.config.MAX_LEN,
    )
    vectorize_layer.adapt(texts)

    # Insert the mask token into the vocabulary: drop the reserved padding and
    # OOV entries (set_vocabulary re-adds them) plus enough of the least
    # frequent words to make room, then append "mask" as the last entry.
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2:self.config.VOCAB_SIZE - len(special_tokens)] + ["mask"]
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
def load_dataset(self):
    raw_train_ds, raw_val_ds, raw_test_ds = self.load_raw_data()

    sequence_length = 250
    vectorize_layer = TextVectorization(
        standardize=self.custom_standardization,
        max_tokens=self.max_tokens,
        output_mode='int',
        output_sequence_length=sequence_length)

    # Make a text-only dataset (without labels), then call adapt
    train_text = raw_train_ds.map(lambda x, y: x)
    vectorize_layer.adapt(train_text)
    self.vocab = vectorize_layer.get_vocabulary()

    train_ds = raw_train_ds.map(
        lambda x, y: (vectorize_layer(tf.expand_dims(x, -1)), y))
    val_ds = raw_val_ds.map(
        lambda x, y: (vectorize_layer(tf.expand_dims(x, -1)), y))
    test_ds = raw_test_ds.map(
        lambda x, y: (vectorize_layer(tf.expand_dims(x, -1)), y))

    train_ds = train_ds.cache().prefetch(buffer_size=self.AUTOTUNE)
    val_ds = val_ds.cache().prefetch(buffer_size=self.AUTOTUNE)
    test_ds = test_ds.cache().prefetch(buffer_size=self.AUTOTUNE)
    return train_ds, val_ds, test_ds
def make_model(vector_train, max_tokens, output_seq_len, num_hidden, size_hidden,
               hidden_activ='relu', output_activ='sigmoid',
               loss='binary_crossentropy', optimizer='adam', embed=True):
    vectorizer = TextVectorization(max_tokens=max_tokens,
                                   output_sequence_length=output_seq_len)
    vectorizer.adapt(vector_train)

    model = keras.Sequential()
    model.add(layers.Input(shape=(1,), dtype=tf.string))
    model.add(vectorizer)  # Vectorizer layer
    if embed:
        model.add(layers.Embedding(max_tokens + 1, size_hidden))  # Embedding layer
    for i in range(num_hidden):
        model.add(layers.Dense(size_hidden, activation=hidden_activ))  # Hidden layers
    model.add(layers.Dense(1, activation=output_activ))  # Output layer
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model
def build_model(train_dataset: PrefetchDataset) -> Sequential:
    """
    Initializes a Sequential model and adds text vectorization, word embedding,
    LSTM, and densely connected layers.

    :param train_dataset: The dataset to adapt the vocabulary on.
    :return: A Sequential object.
    """
    # Initialize the TextVectorization layer which assigns integers to each token
    encoder = TextVectorization(max_tokens=VOCAB_SIZE)

    # Set the vocabulary for the encoding layer. This will be used to initialize
    # a lookup table of word embeddings.
    # The code for this and subsequent layers is adapted from:
    # https://www.tensorflow.org/tutorials/text/text_classification_rnn#create_the_text_encoder
    encoder.adapt(train_dataset.map(lambda text, label: text))

    model = Sequential()
    model.add(encoder)
    # Next we add our word embedding layer, which converts token indices into dense vectors
    model.add(Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=8,
                        activity_regularizer=l2(0.001), mask_zero=True))
    # The Bidirectional wrapper for LSTM allows data to be processed forwards and
    # backwards and then concatenated into one output
    model.add(Bidirectional(LSTM(8)))
    # Densely connected layers with L2 regularization to reduce over-fitting
    model.add(Dense(8, activation="relu", kernel_regularizer=l2(0.001),
                    activity_regularizer=l2(0.001)))
    model.add(Dense(1, activation="sigmoid"))
    return model
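# build_model returns an uncompiled model. A minimal usage sketch, assuming
# binary labels and that train_dataset / val_dataset are (text, label) tf.data
# datasets (both names are placeholders):
model = build_model(train_dataset)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(train_dataset, validation_data=val_dataset, epochs=5)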
def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):
    """Build text vectorization layer.

    Args:
        texts (list): List of strings, i.e., input texts.
        vocab_size (int): Vocabulary size.
        max_seq (int): Maximum sequence length.
        special_tokens (list, optional): List of special tokens.
            Defaults to ['[MASK]'].

    Returns:
        layers.Layer: A TextVectorization Keras layer.
    """
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Insert the mask token into the vocabulary: drop the reserved padding and
    # OOV entries (set_vocabulary re-adds them) plus the least frequent words,
    # then append '[mask]' as the last entry.
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2 : vocab_size - len(special_tokens)] + ["[mask]"]
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
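# Typical follow-up for the masked-language-modelling setup above: after
# set_vocabulary, '[mask]' is the last vocabulary entry, so its token id can be
# recovered when building MLM masks. A sketch, assuming `texts` holds the
# training corpus (placeholder name and sizes):
vectorize_layer = get_vectorize_layer(texts, vocab_size=30000, max_seq=256)
mask_token_id = vectorize_layer.get_vocabulary().index("[mask]")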
def create_text_vectorization_model(
        text_vectorization_filepath: str,
        dataset_all_tokens: tf.data.Dataset) -> tf.keras.models.Sequential:
    """
    Create the text vectorization model.
    This vectorizer converts an array of strings to an array of integers.
    """
    if exists(text_vectorization_filepath):
        logger.info('found text vectorization model')
        return tf.keras.models.load_model(text_vectorization_filepath,
                                          compile=False)
    vectorize_layer = TextVectorization(max_tokens=vocab_size, output_mode='int')
    logger.success('created text vectorization layer')
    # Batch the dataset to make it easier to store in memory
    vectorize_layer.adapt(dataset_all_tokens.batch(batch_size))
    logger.success('adapted vectorization to training dataset')
    text_vectorization_model = tf.keras.models.Sequential(
        [tf.keras.Input(shape=(1,), dtype=tf.string), vectorize_layer])
    # Simple text vectorization test
    logger.info(text_vectorization_model.predict(["this is a test"]))
    text_vectorization_model.save(text_vectorization_filepath)
    return text_vectorization_model
def init_vectorize_layer(self, text_dataset: np.ndarray) -> TextVectorization:
    text_vectorizer = TextVectorization(max_tokens=self.max_features,
                                        standardize=self.custom_preprocessing,
                                        output_mode='int',
                                        output_sequence_length=self.max_len)
    text_vectorizer.adapt(text_dataset)
    return text_vectorizer
class TextVectorizer(Vectorizer):
    """Text vectorizer containing an instance of TextVectorization from TF."""

    def __init__(self, name, feature_number=10000, length=500):
        Vectorizer.__init__(self, name, feature_number)
        self.length = length

    def vectorize(self, text):
        text = tf.expand_dims(text, -1)
        return self.vectorizer(text)

    def vectorize_set(self, data):
        data = [self.vectorize(t) for t in data]
        return data

    def fit_transform(self, data):
        print(f'Training {self.name}')
        data = np.array(data)
        self.vectorizer = TextVectorization(
            max_tokens=self.feature_number,
            output_mode='int',
            output_sequence_length=self.length)
        self.vectorizer.adapt(data)
        return self.transform(data)
def initialize_vectorizer_layer(text, pad_length, max_tokens=None):
    # Create vectorizer
    vectorizer = TextVectorization(output_sequence_length=pad_length,
                                   standardize=None,
                                   max_tokens=max_tokens)
    vectorizer.adapt(text)
    vocab = vectorizer.get_vocabulary()
    return vectorizer, vocab
def create_encoder(list_of_texts):
    """
    Creates an encoder that builds a vocabulary from the given list of texts.
    It can be used as a parameter for the create_model() function.
    """
    encoder = TextVectorization(max_tokens=NUM_WORDS)
    encoder.adapt(list_of_texts)
    return encoder
def vectorizer(raw_train_ds):
    vectorizer = TextVectorization(standardize=custom_standardization,
                                   max_tokens=max_tokens,
                                   output_sequence_length=sequence_length)
    text_ds = raw_train_ds.map(lambda x, y: x)
    vectorizer.adapt(text_ds)
    np.savetxt('voc.out', vectorizer.get_vocabulary(), fmt='%s')
    return vectorizer
def runRNN():
    # Assumes you're in the root level of the dataset directory.
    # If you aren't, you'll need to change the relative paths here.
    train_data = prepareData('./train')
    test_data = prepareData('./test')

    for text_batch, label_batch in train_data.take(1):
        print(text_batch.numpy()[0])
        print(label_batch.numpy()[0])  # 0 = negative, 1 = positive

    model = Sequential()

    # ----- 1. INPUT
    # We need this to use the TextVectorization layer next.
    model.add(Input(shape=(1,), dtype="string"))

    # ----- 2. TEXT VECTORIZATION
    # This layer processes the input string and turns it into a sequence of
    # max_len integers, each of which maps to a certain token.
    max_tokens = 1000
    max_len = 100
    vectorize_layer = TextVectorization(
        # Max vocab size. Any words outside of the max_tokens most common ones
        # will be treated the same way: as "out of vocabulary" (OOV) tokens.
        max_tokens=max_tokens,
        # Output integer indices, one per string token
        output_mode="int",
        # Always pad or truncate to exactly this many tokens
        output_sequence_length=max_len,
    )

    # Call adapt(), which fits the TextVectorization layer to our text dataset.
    # This is when the max_tokens most common words (i.e. the vocabulary) are selected.
    train_texts = train_data.map(lambda text, label: text)
    vectorize_layer.adapt(train_texts)
    model.add(vectorize_layer)

    # ----- 3. EMBEDDING
    # This layer turns each integer (representing a token) from the previous layer
    # into an embedding. Note that we're using max_tokens + 1 here, since there's
    # an out-of-vocabulary (OOV) token that gets added to the vocab.
    model.add(Embedding(max_tokens + 1, 128))

    # ----- 4. RECURRENT LAYER
    model.add(LSTM(64))

    # ----- 5. DENSE HIDDEN LAYER
    model.add(Dense(64, activation="relu"))

    # ----- 6. OUTPUT
    model.add(Dense(1, activation="sigmoid"))

    # Compile and train the model.
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.fit(train_data, epochs=1)
    model.save_weights('rnn')
def get_vectorizer(df_train, df_test):
    # Vectorizes and pads the dataset.
    # Also lowercases and strips punctuation (the default standardization).
    vectorizer = TextVectorization(max_tokens=7500, output_sequence_length=200)
    text_ds = tf.data.Dataset.from_tensor_slices(df_train['text']).batch(32)
    vectorizer.adapt(text_ds)
    return vectorizer
def build_text_layer(raw_vocab):
    vocabulary = tf.data.Dataset.from_tensor_slices(list(raw_vocab))
    embed_layer = TextVectorization(
        max_tokens=100,
        # standardize=custom_standardization,
        output_mode='int',
        output_sequence_length=100)
    embed_layer.adapt(vocabulary.batch(64))
    return embed_layer
def main():
    train_dataset, test_dataset = generate_data()
    train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    encoder = TextVectorization(max_tokens=VOCAB_SIZE)
    encoder.adapt(train_dataset.map(lambda text, label: text))
    vocab = np.array(encoder.get_vocabulary())
    # print(vocab[:20])

    LSTM_model(train_dataset, test_dataset, encoder)
def reviews_encoding(df, max_features, sequence_length):
    vectorize_layer = TextVectorization(
        standardize=None,
        max_tokens=max_features,
        output_mode="int",
        output_sequence_length=sequence_length,
    )
    # vectorize_layer.adapt(np.array(df['commentaire']))
    vectorize_layer.adapt(np.array(df))
    return vectorize_layer
def __init__(self, model_dir):
    # Load the artifacts
    self.artifacts = pickle.load(
        open(os.path.join(model_dir, 'model_artifacts.pkl'), 'rb'))

    # Create the vectorizers
    train_src = np.load(os.path.join(model_dir, 'train_src.npy'),
                        allow_pickle=True)
    train_tgt = np.load(os.path.join(model_dir, 'train_tgt.npy'),
                        allow_pickle=True)

    vectorizer_src = TextVectorization()
    vectorizer_src.adapt(train_src)
    train_seq = vectorizer_src(train_src)
    self.vectorizer_src = vectorizer_src

    vectorizer_tgt = TextVectorization()
    vectorizer_tgt.adapt(train_tgt)
    self.vectorizer_tgt = vectorizer_tgt

    # Load models
    vocab_src = vectorizer_src.get_vocabulary()
    self.encoder = MyEncoder(
        len(vocab_src),
        embedding_dim=self.artifacts['embedding_size'],
        enc_units=self.artifacts['bottleneck_units'],
        batch_size=self.artifacts['batch_size'])
    # Call the model first to create the variables
    sample_hidden = self.encoder.initialize_hidden_state()
    sample_output, sample_hidden = self.encoder(
        tf.zeros((self.artifacts['batch_size'], train_seq.numpy().shape[1])),
        sample_hidden)
    self.encoder.load_weights(
        os.path.join(model_dir,
                     f'encoder_weights_e{self.artifacts["epochs"]}.h5'))
    print(self.encoder.summary())

    vocab_tgt = vectorizer_tgt.get_vocabulary()
    self.decoder = MyDecoder(
        len(vocab_tgt),
        embedding_dim=self.artifacts['embedding_size'],
        dec_units=self.artifacts['bottleneck_units'],
        batch_size=self.artifacts['batch_size'])
    # Call the model first to create the variables
    _ = self.decoder(tf.random.uniform((self.artifacts['batch_size'], 1)),
                     sample_hidden, sample_output)
    self.decoder.load_weights(
        os.path.join(model_dir,
                     f'decoder_weights_e{self.artifacts["epochs"]}.h5'))
    print(self.decoder.summary())
def make_vektorizer(vocab, max_features=10000, max_len=None, ngrams_size=None):
    vect_layer = TextVectorization(
        max_tokens=max_features,
        output_mode="int",
        ngrams=ngrams_size,
        output_sequence_length=max_len,
    )
    logging.info("Starting to adapt...")
    vect_layer.adapt(vocab)
    logging.info("Adapted to Corpus")

    input = Input(shape=(1,), dtype=tf.string)
    output = vect_layer(input)
    return Model(inputs=[input], outputs=[output])
def create_vectorize_text(ds):
    vectorize_layer = TextVectorization(max_tokens=max_features,
                                        output_mode='int',
                                        output_sequence_length=sequence_length)
    text_ds = ds.map(lambda x, y: x)
    vectorize_layer.adapt(text_ds)

    def vectorize_text(text, label):
        text = tf.expand_dims(text, -1)
        return vectorize_layer(text), label

    return vectorize_text
def makePrediction(messages_as_string, modelSelection):
    print("Running prediction function...")
    messages = list(messages_as_string.split('s3cur!tywh@l3'))

    vocab_size = 15613        # number of words that appear in commit messages
    sequence_length = 1000    # arbitrary vector length
    embedding_layer = tf.keras.layers.Embedding(vocab_size, sequence_length)

    # Use the text vectorization layer to normalize, split, and map strings to
    # integers. Note that the layer uses the custom standardization defined above.
    # Set the maximum sequence length, as the samples are not all the same length.
    vectorizer = TextVectorization(max_tokens=vocab_size,
                                   output_sequence_length=sequence_length)
    text_ds = tf.data.Dataset.from_tensor_slices(messages).batch(32)
    vectorizer.adapt(text_ds)

    # Resolve the model path from the selection
    path = switch(modelSelection)
    if path == 'invalidModel':
        return path
    print("trying to load model at: " + path)
    model = load_model(path)

    string_input = keras.Input(shape=(1,), dtype="string")
    x = vectorizer(string_input)
    preds = model(x)
    end_to_end_model = keras.Model(string_input, preds)

    count = 0
    vulnProbabilitySum = 0
    nonVulnProbabilitySum = 0
    for message in messages:
        count = count + 1
        probabilities = end_to_end_model.predict([[message]])
        print(message)
        vulnProbabilitySum = vulnProbabilitySum + probabilities[0][1]
        print('vuln:', probabilities[0][1])
        nonVulnProbabilitySum = nonVulnProbabilitySum + probabilities[0][0]
        print('nonvuln:', probabilities[0][0])

    vulnLikelyHoodStr = '0' if count == 0 else str(vulnProbabilitySum / count)
    nonVulnLikelyHoodStr = '0' if count == 0 else str(nonVulnProbabilitySum / count)
    # The two averages should sum to approximately 1
    confidence = '0' if count == 0 else str(vulnProbabilitySum / count +
                                            nonVulnProbabilitySum / count)

    return_string = vulnLikelyHoodStr + ',' + nonVulnLikelyHoodStr + ',' + confidence
    print("Response body: \n" + return_string)
    return return_string
def fit(self, x, y=None):
    vectorize_layer = TextVectorization(
        max_tokens=self.max_tokens,
        output_mode=self.output_mode,
        output_sequence_length=self.output_sequence_length)
    vectorize_layer.adapt(list(x))

    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
    model.add(vectorize_layer)

    self.model = model
    self.vocab_processor = vectorize_layer
    return self
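# The fit above only adapts and stores the vectorization model; the companion
# transform is not part of the snippet. A minimal sketch of how it might look,
# assuming a scikit-learn style transformer that runs raw strings through the
# stored Keras model to obtain padded integer sequences (hypothetical method):
def transform(self, x, y=None):
    # predict() returns one integer sequence per input string
    return self.model.predict(list(x))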