def construct(self, text_ds, label_num):
    max_features = 20000
    embedding_dim = 128
    sequence_length = 200

    vectorize_layer = TextVectorization(
        standardize=custom_standardization,
        max_tokens=max_features,
        output_mode="int",
        output_sequence_length=sequence_length,
    )
    vectorize_layer.adapt(text_ds)

    inputs = tf.keras.Input(shape=(1,), dtype="string")
    indices = vectorize_layer(inputs)
    x = layers.Embedding(max_features + 1, embedding_dim)(indices)
    x = layers.Dropout(0.5)(x)
    # Global max pooling over the sequence dimension
    x = layers.GlobalMaxPooling1D()(x)
    predictions = layers.Dense(label_num, activation="sigmoid",
                               name="predictions")(x)

    model = tf.keras.Model(inputs, predictions)
    model.compile(loss="binary_crossentropy", optimizer="adam",
                  metrics=["accuracy"])
    return model
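# Several snippets in this collection pass standardize=custom_standardization
# without defining it. A minimal sketch, assuming the usual Keras
# text-classification recipe (lowercase, drop HTML <br /> tags, strip
# punctuation) and that tensorflow is imported as tf as in the surrounding
# code; the exact rules used by the original helpers may differ.
import re
import string

def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, "[%s]" % re.escape(string.punctuation), "")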
def trainWordVectorEncoder(trainText, VOCAB_SIZE=None):
    # https://www.tensorflow.org/tutorials/text/text_classification_rnn
    encoder = (TextVectorization() if VOCAB_SIZE is None
               else TextVectorization(max_tokens=VOCAB_SIZE))
    encoder.adapt(tf.data.Dataset.from_tensor_slices(trainText))
    return encoder
class TFVectTokenizer:
    def __init__(self, seqlen, step, freq_threshold):
        self.freq_threshold = freq_threshold
        self.seqlen = seqlen
        self.step = step
        self.vocab_size = 20000
        self.vectorize_layer = TextVectorization(
            standardize=custom_standardization,
            max_tokens=self.vocab_size - 1,
            output_mode="int",
            output_sequence_length=self.seqlen + 1,
        )

    def tokenize(self, text_ds):
        # Adapt the vectorization layer to the text
        self.vectorize_layer.adapt(text_ds)
        vocab = self.vectorize_layer.get_vocabulary()
        # To get words back from token indices
        reverse_token_map = {t: i for i, t in enumerate(vocab)}
        return text_ds, vocab, reverse_token_map

    def get_input_sequences(self, text_ds, reverse_token_map):
        text_ds = text_ds.map(
            lambda text: prepare_lm_inputs_labels(text, self.vectorize_layer))
        text_ds = text_ds.prefetch(tf.data.experimental.AUTOTUNE)
        return text_ds
def get_vectorize_layer(texts, vocab_size, max_seq):
    """Build text vectorization layer.

    Args:
        texts (list): List of strings, i.e., input texts.
        vocab_size (int): Vocabulary size.
        max_seq (int): Maximum sequence length.

    Returns:
        layers.Layer: A TextVectorization Keras layer.
    """
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Insert the mask token into the vocabulary.
    # adapt() reserves indices 0 and 1: '' is the padding token and '[UNK]' is
    # the out-of-vocabulary token; the rest of the vocabulary is lower-cased
    # and ordered by frequency, e.g. ['', '[UNK]', 'the', 'and', 'a', 'of', ...].
    vocab = vectorize_layer.get_vocabulary()
    # Drop the two reserved tokens (set_vocabulary re-adds them) and the least
    # frequent word, then append '[mask]' as the last vocabulary entry.
    vocab = vocab[2:len(vocab) - 1] + ["[mask]"]
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
def build_vocab(directories, batch_size, vocab_size, maxlen):
    global vectorize_layer

    # Create a list of all files
    filenames = []
    for dir in directories:
        for f in os.listdir(dir):
            filenames.append(os.path.join(dir, f))
    print(f"{len(filenames)} files")

    # Create dataset from text files
    random.shuffle(filenames)
    text_ds = tf.data.TextLineDataset(filenames)
    text_ds = text_ds.shuffle(buffer_size=256)
    text_ds = text_ds.batch(batch_size)

    # Create vectorization layer and adapt it to the text
    vectorize_layer = TextVectorization(
        standardize=custom_standardization,
        max_tokens=vocab_size - 1,
        output_mode="int",
        output_sequence_length=maxlen + 1,
    )
    vectorize_layer.adapt(text_ds)
    vocab = vectorize_layer.get_vocabulary()

    word_to_index = {}
    for index, word in enumerate(vocab):
        word_to_index[word] = index

    text_ds = text_ds.map(prepare_lm_inputs_labels)
    text_ds = text_ds.prefetch(tf.data.experimental.AUTOTUNE)
    return (text_ds, vocab, word_to_index)
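# build_vocab (and TFVectTokenizer.get_input_sequences above) maps the dataset
# through prepare_lm_inputs_labels, which is not shown in these snippets. A
# minimal sketch, following the standard Keras language-modelling recipe of
# shifting each token sequence by one so the label at position i is the token
# at position i + 1; it relies on the global vectorize_layer set by
# build_vocab, while the TFVectTokenizer variant would take the layer as a
# second argument instead.
def prepare_lm_inputs_labels(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]  # inputs: all tokens except the last
    y = tokenized_sentences[:, 1:]   # labels: the same tokens shifted left by one
    return x, y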
def makePrediction(messages_as_string):
    print("Running prediction function...")
    messages = list(messages_as_string.split('s3cur!tywh@l3'))

    vocab_size = 12612
    sequence_length = 1000
    embedding_layer = tf.keras.layers.Embedding(vocab_size, sequence_length)

    # Use the text vectorization layer to normalize, split, and map strings to
    # integers. Note that the layer uses the custom standardization defined above.
    # Set the maximum sequence length, as the samples are not all the same length.
    vectorizer = TextVectorization(max_tokens=vocab_size,
                                   output_sequence_length=sequence_length)
    text_ds = tf.data.Dataset.from_tensor_slices(messages).batch(32)
    vectorizer.adapt(text_ds)

    path = './assets/models/model.h5'
    print("trying to load model at: " + path)
    model = load_model(path)
    print("I loaded a model")

    string_input = keras.Input(shape=(1,), dtype="string")
    x = vectorizer(string_input)
    preds = model(x)
    end_to_end_model = keras.Model(string_input, preds)

    count = 0
    Vuln = 0
    vulnLengthSum = 0
    nonVuln = 0
    nonVulnLengthSum = 0
    for message in messages:
        count = count + 1
        probabilities = end_to_end_model.predict([[message]])
        if probabilities[0][1] > 0.5:
            vulnLengthSum = vulnLengthSum + len(message)
            Vuln = Vuln + 1
        if probabilities[0][0] > 0.5:
            nonVulnLengthSum = nonVulnLengthSum + len(message)
            nonVuln = nonVuln + 1

    vuln = str(Vuln)
    avg_vuln = '0' if vulnLengthSum == 0 else str(vulnLengthSum / Vuln)
    isVuln = 'true' if Vuln > nonVuln else 'false'
    non_vuln = str(nonVuln)
    avg_non_vuln = '0' if nonVulnLengthSum == 0 else str(nonVulnLengthSum / nonVuln)

    return_string = vuln + "," + non_vuln + "," + isVuln
    print("Response body: \n" + return_string)
    return return_string
def get_text_vec_model(train_samples):
    # Taken from: https://github.com/mlflow/mlflow/issues/3910
    # pylint: disable=no-name-in-module
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

    VOCAB_SIZE = 10
    SEQUENCE_LENGTH = 16
    EMBEDDING_DIM = 16

    vectorizer_layer = TextVectorization(
        input_shape=(1,),
        max_tokens=VOCAB_SIZE,
        output_mode="int",
        output_sequence_length=SEQUENCE_LENGTH,
    )
    vectorizer_layer.adapt(train_samples)
    model = tf.keras.Sequential([
        vectorizer_layer,
        tf.keras.layers.Embedding(
            VOCAB_SIZE,
            EMBEDDING_DIM,
            name="embedding",
            mask_zero=True,
            input_shape=(1,),
        ),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(1, activation="tanh"),
    ])
    model.compile(optimizer="adam", loss="mse", metrics="mae")
    return model
def build_classifier(text):
    MAX_VOCAB_SIZE = 20000
    encoder = TextVectorization(max_tokens=MAX_VOCAB_SIZE)
    encoder.adapt(text)

    vocabset = set(encoder.get_vocabulary())
    vocab_size = len(encoder.get_vocabulary())
    word2idx, weights = get_glove_embeddings(vocabset)

    # Build an embedding matrix aligned with the encoder's vocabulary,
    # leaving zero rows for words without a pretrained GloVe vector.
    embedding_matrix = np.zeros((vocab_size, weights.shape[1]))
    for i, word in enumerate(encoder.get_vocabulary()):
        vec = word2idx.get(word)
        if vec is not None:
            embedding_matrix[i] = weights[vec]

    model = tf.keras.Sequential([
        encoder,
        tf.keras.layers.Embedding(
            input_dim=embedding_matrix.shape[0],
            output_dim=embedding_matrix.shape[1],
            weights=[embedding_matrix],  # Embedding expects a list of weight arrays
            mask_zero=True,
            trainable=True
        ),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    return model
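# build_classifier relies on get_glove_embeddings, which is not part of this
# snippet. A minimal sketch, assuming pretrained GloVe vectors in a plain-text
# file (the path glove.6B.100d.txt is a placeholder): it returns a
# word -> row-index map and a matrix holding the vectors of in-vocabulary words.
def get_glove_embeddings(vocabset, glove_path="glove.6B.100d.txt"):
    word2idx = {}
    vectors = []
    with open(glove_path, encoding="utf-8") as f:
        for line in f:
            # Each line is "<word> <v1> <v2> ... <vN>", space-separated
            word, *values = line.rstrip().split(" ")
            if word in vocabset:
                word2idx[word] = len(vectors)
                vectors.append(np.asarray(values, dtype="float32"))
    return word2idx, np.vstack(vectors)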
def prepare_data_set_for_training(raw_train_ds, raw_val_ds, raw_test_ds):
    max_features = 10000
    sequence_length = 250

    # Create vectorization layer
    vectorize_layer = TextVectorization(standardize=custom_standardization,
                                        max_tokens=max_features,
                                        output_mode='int',
                                        output_sequence_length=sequence_length)

    # Make a text-only dataset (without labels), then call adapt
    train_text = raw_train_ds.map(lambda x, y: x)
    vectorize_layer.adapt(train_text)

    def vectorize_text(text, label):
        text = tf.expand_dims(text, -1)
        return vectorize_layer(text), label

    # Retrieve a batch (of 32 reviews and labels) from the dataset
    text_batch, label_batch = next(iter(raw_train_ds))
    first_review, first_label = text_batch[0], label_batch[0]
    print("Review", first_review)
    print("Label", raw_train_ds.class_names[first_label])
    print("Vectorized review", vectorize_text(first_review, first_label))

    # Explore the vocabulary
    print("1287 ---> ", vectorize_layer.get_vocabulary()[1287])
    print(" 313 ---> ", vectorize_layer.get_vocabulary()[313])
    print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

    train_ds = raw_train_ds.map(vectorize_text)
    val_ds = raw_val_ds.map(vectorize_text)
    test_ds = raw_test_ds.map(vectorize_text)
    return max_features, train_ds, val_ds, test_ds, vectorize_layer
def get_vectorize_layer(self, texts, special_tokens=["mask"]):
    """Build text vectorization layer.

    Args:
        texts (list): List of strings, i.e., input texts.
        special_tokens (list, optional): List of special tokens.
            Defaults to ["mask"].

    Returns:
        layers.Layer: A TextVectorization Keras layer.
    """
    vectorize_layer = TextVectorization(
        max_tokens=self.config.VOCAB_SIZE,
        output_mode="int",
        ngrams=None,
        standardize="lower_and_strip_punctuation",
        output_sequence_length=self.config.MAX_LEN,
    )
    vectorize_layer.adapt(texts)

    # Insert the mask token into the vocabulary: drop the reserved padding and
    # OOV entries (set_vocabulary re-adds them) plus enough of the least
    # frequent words to make room, then append "mask" as the last entry.
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2:self.config.VOCAB_SIZE - len(special_tokens)] + ["mask"]
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
def load_dataset(self):
    raw_train_ds, raw_val_ds, raw_test_ds = self.load_raw_data()

    sequence_length = 250
    vectorize_layer = TextVectorization(
        standardize=self.custom_standardization,
        max_tokens=self.max_tokens,
        output_mode='int',
        output_sequence_length=sequence_length)

    # Make a text-only dataset (without labels), then call adapt
    train_text = raw_train_ds.map(lambda x, y: x)
    vectorize_layer.adapt(train_text)
    self.vocab = vectorize_layer.get_vocabulary()

    train_ds = raw_train_ds.map(
        lambda x, y: (vectorize_layer(tf.expand_dims(x, -1)), y))
    val_ds = raw_val_ds.map(
        lambda x, y: (vectorize_layer(tf.expand_dims(x, -1)), y))
    test_ds = raw_test_ds.map(
        lambda x, y: (vectorize_layer(tf.expand_dims(x, -1)), y))

    train_ds = train_ds.cache().prefetch(buffer_size=self.AUTOTUNE)
    val_ds = val_ds.cache().prefetch(buffer_size=self.AUTOTUNE)
    test_ds = test_ds.cache().prefetch(buffer_size=self.AUTOTUNE)
    return train_ds, val_ds, test_ds
def make_model(vector_train, max_tokens, output_seq_len, num_hidden, size_hidden,
               hidden_activ='relu', output_activ='sigmoid',
               loss='binary_crossentropy', optimizer='adam', embed=True):
    vectorizer = TextVectorization(max_tokens=max_tokens,
                                   output_sequence_length=output_seq_len)
    vectorizer.adapt(vector_train)

    model = keras.Sequential()
    model.add(layers.Input(shape=(1,), dtype=tf.string))
    model.add(vectorizer)  # Vectorizer layer
    if embed:
        model.add(layers.Embedding(max_tokens + 1, size_hidden))  # Embedding layer
    for i in range(num_hidden):
        model.add(layers.Dense(size_hidden, activation=hidden_activ))  # Hidden layers
    model.add(layers.Dense(1, activation=output_activ))  # Output layer
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model
def build_model(train_dataset: PrefetchDataset) -> Sequential:
    """
    Initializes a Sequential model and adds text vectorization, word embedding,
    LSTM, and densely connected layers.

    :param train_dataset: The dataset to adapt the vocabulary on.
    :return: A Sequential object.
    """
    # Initialize the TextVectorization layer which assigns integers to each token
    encoder = TextVectorization(max_tokens=VOCAB_SIZE)

    # Set the vocabulary for the encoding layer. This will be used to initialize
    # a lookup table of word embeddings.
    # The code for this and subsequent layers is adapted from:
    # https://www.tensorflow.org/tutorials/text/text_classification_rnn#create_the_text_encoder
    encoder.adapt(train_dataset.map(lambda text, label: text))

    model = Sequential()
    model.add(encoder)
    # Next we add our word embedding layer, which converts token indices into dense vectors
    model.add(Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=8,
                        activity_regularizer=l2(0.001), mask_zero=True))
    # The Bidirectional wrapper for LSTM allows data to be processed forwards and
    # backwards and then concatenated into one output
    model.add(Bidirectional(LSTM(8)))
    # Densely connected layers with L2 regularization to reduce over-fitting
    model.add(Dense(8, activation="relu", kernel_regularizer=l2(0.001),
                    activity_regularizer=l2(0.001)))
    model.add(Dense(1, activation="sigmoid"))
    return model
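# build_model returns an uncompiled model. A minimal usage sketch, assuming
# binary labels and that train_dataset / val_dataset are (text, label) tf.data
# datasets (both names are placeholders):
model = build_model(train_dataset)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(train_dataset, validation_data=val_dataset, epochs=5)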
def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):
    """Build text vectorization layer.

    Args:
        texts (list): List of strings, i.e., input texts.
        vocab_size (int): Vocabulary size.
        max_seq (int): Maximum sequence length.
        special_tokens (list, optional): List of special tokens.
            Defaults to ['[MASK]'].

    Returns:
        layers.Layer: A TextVectorization Keras layer.
    """
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Insert the mask token into the vocabulary: drop the reserved padding and
    # OOV entries (set_vocabulary re-adds them) plus the least frequent words,
    # then append '[mask]' as the last entry.
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2 : vocab_size - len(special_tokens)] + ["[mask]"]
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
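# Typical follow-up for the masked-language-modelling setup above: after
# set_vocabulary, '[mask]' is the last vocabulary entry, so its token id can be
# recovered when building MLM masks. A sketch, assuming `texts` holds the
# training corpus (placeholder name and sizes):
vectorize_layer = get_vectorize_layer(texts, vocab_size=30000, max_seq=256)
mask_token_id = vectorize_layer.get_vocabulary().index("[mask]")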
def create_text_vectorization_model(
        text_vectorization_filepath: str,
        dataset_all_tokens: tf.data.Dataset) -> tf.keras.models.Sequential:
    """
    Create the text vectorization model.
    This vectorizer converts an array of strings to an array of integers.
    """
    if exists(text_vectorization_filepath):
        logger.info('found text vectorization model')
        return tf.keras.models.load_model(text_vectorization_filepath,
                                          compile=False)
    vectorize_layer = TextVectorization(max_tokens=vocab_size, output_mode='int')
    logger.success('created text vectorization layer')
    # Batch the dataset to make it easier to store in memory
    vectorize_layer.adapt(dataset_all_tokens.batch(batch_size))
    logger.success('adapted vectorization to training dataset')
    text_vectorization_model = tf.keras.models.Sequential(
        [tf.keras.Input(shape=(1,), dtype=tf.string), vectorize_layer])
    # Simple text vectorization test
    logger.info(text_vectorization_model.predict(["this is a test"]))
    text_vectorization_model.save(text_vectorization_filepath)
    return text_vectorization_model
def init_vectorize_layer(self, text_dataset: np.ndarray) -> TextVectorization:
    text_vectorizer = TextVectorization(max_tokens=self.max_features,
                                        standardize=self.custom_preprocessing,
                                        output_mode='int',
                                        output_sequence_length=self.max_len)
    text_vectorizer.adapt(text_dataset)
    return text_vectorizer
class TextVectorizer(Vectorizer):
    """Text vectorizer containing an instance of TextVectorization from TF."""

    def __init__(self, name, feature_number=10000, length=500):
        Vectorizer.__init__(self, name, feature_number)
        self.length = length

    def vectorize(self, text):
        text = tf.expand_dims(text, -1)
        return self.vectorizer(text)

    def vectorize_set(self, data):
        data = [self.vectorize(t) for t in data]
        return data

    def fit_transform(self, data):
        print(f'Training {self.name}')
        data = np.array(data)
        self.vectorizer = TextVectorization(
            max_tokens=self.feature_number,
            output_mode='int',
            output_sequence_length=self.length)
        self.vectorizer.adapt(data)
        return self.transform(data)
def initialize_vectorizer_layer(text, pad_length, max_tokens=None):
    # Create vectorizer
    vectorizer = TextVectorization(output_sequence_length=pad_length,
                                   standardize=None,
                                   max_tokens=max_tokens)
    vectorizer.adapt(text)
    vocab = vectorizer.get_vocabulary()
    return vectorizer, vocab
def create_encoder(list_of_texts):
    """
    Creates an encoder that builds a vocabulary from the given list of texts.
    It can be used as a parameter for the create_model() function.
    """
    encoder = TextVectorization(max_tokens=NUM_WORDS)
    encoder.adapt(list_of_texts)
    return encoder
def vectorizer(raw_train_ds):
    vectorizer = TextVectorization(standardize=custom_standardization,
                                   max_tokens=max_tokens,
                                   output_sequence_length=sequence_length)
    text_ds = raw_train_ds.map(lambda x, y: x)
    vectorizer.adapt(text_ds)
    np.savetxt('voc.out', vectorizer.get_vocabulary(), fmt='%s')
    return vectorizer
def runRNN():
    # Assumes you're in the root level of the dataset directory.
    # If you aren't, you'll need to change the relative paths here.
    train_data = prepareData('./train')
    test_data = prepareData('./test')

    for text_batch, label_batch in train_data.take(1):
        print(text_batch.numpy()[0])
        print(label_batch.numpy()[0])  # 0 = negative, 1 = positive

    model = Sequential()

    # ----- 1. INPUT
    # We need this to use the TextVectorization layer next.
    model.add(Input(shape=(1,), dtype="string"))

    # ----- 2. TEXT VECTORIZATION
    # This layer processes the input string and turns it into a sequence of
    # max_len integers, each of which maps to a certain token.
    max_tokens = 1000
    max_len = 100
    vectorize_layer = TextVectorization(
        # Max vocab size. Any words outside of the max_tokens most common ones
        # will be treated the same way: as "out of vocabulary" (OOV) tokens.
        max_tokens=max_tokens,
        # Output integer indices, one per string token
        output_mode="int",
        # Always pad or truncate to exactly this many tokens
        output_sequence_length=max_len,
    )

    # Call adapt(), which fits the TextVectorization layer to our text dataset.
    # This is when the max_tokens most common words (i.e. the vocabulary) are selected.
    train_texts = train_data.map(lambda text, label: text)
    vectorize_layer.adapt(train_texts)
    model.add(vectorize_layer)

    # ----- 3. EMBEDDING
    # This layer turns each integer (representing a token) from the previous layer
    # into an embedding. Note that we're using max_tokens + 1 here, since there's
    # an out-of-vocabulary (OOV) token that gets added to the vocab.
    model.add(Embedding(max_tokens + 1, 128))

    # ----- 4. RECURRENT LAYER
    model.add(LSTM(64))

    # ----- 5. DENSE HIDDEN LAYER
    model.add(Dense(64, activation="relu"))

    # ----- 6. OUTPUT
    model.add(Dense(1, activation="sigmoid"))

    # Compile and train the model.
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.fit(train_data, epochs=1)
    model.save_weights('rnn')
def get_vectorizer(df_train, df_test):
    # Vectorizes and pads the dataset.
    # Also lowercases and strips punctuation (the default standardization).
    vectorizer = TextVectorization(max_tokens=7500, output_sequence_length=200)
    text_ds = tf.data.Dataset.from_tensor_slices(df_train['text']).batch(32)
    vectorizer.adapt(text_ds)
    return vectorizer
def build_text_layer(raw_vocab):
    vocabulary = tf.data.Dataset.from_tensor_slices(list(raw_vocab))
    embed_layer = TextVectorization(
        max_tokens=100,
        # standardize=custom_standardization,
        output_mode='int',
        output_sequence_length=100)
    embed_layer.adapt(vocabulary.batch(64))
    return embed_layer
def main():
    train_dataset, test_dataset = generate_data()
    train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    encoder = TextVectorization(max_tokens=VOCAB_SIZE)
    encoder.adapt(train_dataset.map(lambda text, label: text))
    vocab = np.array(encoder.get_vocabulary())
    # print(vocab[:20])

    LSTM_model(train_dataset, test_dataset, encoder)
def reviews_encoding(df, max_features, sequence_length):
    vectorize_layer = TextVectorization(
        standardize=None,
        max_tokens=max_features,
        output_mode="int",
        output_sequence_length=sequence_length,
    )
    # vectorize_layer.adapt(np.array(df['commentaire']))
    vectorize_layer.adapt(np.array(df))
    return vectorize_layer
def __init__(self, model_dir):
    # Load the artifacts
    self.artifacts = pickle.load(
        open(os.path.join(model_dir, 'model_artifacts.pkl'), 'rb'))

    # Create the vectorizers
    train_src = np.load(os.path.join(model_dir, 'train_src.npy'),
                        allow_pickle=True)
    train_tgt = np.load(os.path.join(model_dir, 'train_tgt.npy'),
                        allow_pickle=True)

    vectorizer_src = TextVectorization()
    vectorizer_src.adapt(train_src)
    train_seq = vectorizer_src(train_src)
    self.vectorizer_src = vectorizer_src

    vectorizer_tgt = TextVectorization()
    vectorizer_tgt.adapt(train_tgt)
    self.vectorizer_tgt = vectorizer_tgt

    # Load models
    vocab_src = vectorizer_src.get_vocabulary()
    self.encoder = MyEncoder(
        len(vocab_src),
        embedding_dim=self.artifacts['embedding_size'],
        enc_units=self.artifacts['bottleneck_units'],
        batch_size=self.artifacts['batch_size'])
    # Call the model first to create the variables
    sample_hidden = self.encoder.initialize_hidden_state()
    sample_output, sample_hidden = self.encoder(
        tf.zeros((self.artifacts['batch_size'], train_seq.numpy().shape[1])),
        sample_hidden)
    self.encoder.load_weights(
        os.path.join(model_dir,
                     f'encoder_weights_e{self.artifacts["epochs"]}.h5'))
    print(self.encoder.summary())

    vocab_tgt = vectorizer_tgt.get_vocabulary()
    self.decoder = MyDecoder(
        len(vocab_tgt),
        embedding_dim=self.artifacts['embedding_size'],
        dec_units=self.artifacts['bottleneck_units'],
        batch_size=self.artifacts['batch_size'])
    # Call the model first to create the variables
    _ = self.decoder(tf.random.uniform((self.artifacts['batch_size'], 1)),
                     sample_hidden, sample_output)
    self.decoder.load_weights(
        os.path.join(model_dir,
                     f'decoder_weights_e{self.artifacts["epochs"]}.h5'))
    print(self.decoder.summary())
def make_vektorizer(vocab, max_features=10000, max_len=None, ngrams_size=None):
    vect_layer = TextVectorization(
        max_tokens=max_features,
        output_mode="int",
        ngrams=ngrams_size,
        output_sequence_length=max_len,
    )
    logging.info("Starting to adapt...")
    vect_layer.adapt(vocab)
    logging.info("Adapted to Corpus")

    input = Input(shape=(1,), dtype=tf.string)
    output = vect_layer(input)
    return Model(inputs=[input], outputs=[output])
def create_vectorize_text(ds):
    vectorize_layer = TextVectorization(max_tokens=max_features,
                                        output_mode='int',
                                        output_sequence_length=sequence_length)
    text_ds = ds.map(lambda x, y: x)
    vectorize_layer.adapt(text_ds)

    def vectorize_text(text, label):
        text = tf.expand_dims(text, -1)
        return vectorize_layer(text), label

    return vectorize_text
def makePrediction(messages_as_string, modelSelection):
    print("Running prediction function...")
    messages = list(messages_as_string.split('s3cur!tywh@l3'))

    vocab_size = 15613        # number of words that appear in commit messages
    sequence_length = 1000    # arbitrary vector length
    embedding_layer = tf.keras.layers.Embedding(vocab_size, sequence_length)

    # Use the text vectorization layer to normalize, split, and map strings to
    # integers. Note that the layer uses the custom standardization defined above.
    # Set the maximum sequence length, as the samples are not all the same length.
    vectorizer = TextVectorization(max_tokens=vocab_size,
                                   output_sequence_length=sequence_length)
    text_ds = tf.data.Dataset.from_tensor_slices(messages).batch(32)
    vectorizer.adapt(text_ds)

    # Resolve the model path from the selection
    path = switch(modelSelection)
    if path == 'invalidModel':
        return path
    print("trying to load model at: " + path)
    model = load_model(path)

    string_input = keras.Input(shape=(1,), dtype="string")
    x = vectorizer(string_input)
    preds = model(x)
    end_to_end_model = keras.Model(string_input, preds)

    count = 0
    vulnProbabilitySum = 0
    nonVulnProbabilitySum = 0
    for message in messages:
        count = count + 1
        probabilities = end_to_end_model.predict([[message]])
        print(message)
        vulnProbabilitySum = vulnProbabilitySum + probabilities[0][1]
        print('vuln:', probabilities[0][1])
        nonVulnProbabilitySum = nonVulnProbabilitySum + probabilities[0][0]
        print('nonvuln:', probabilities[0][0])

    vulnLikelyHoodStr = '0' if count == 0 else str(vulnProbabilitySum / count)
    nonVulnLikelyHoodStr = '0' if count == 0 else str(nonVulnProbabilitySum / count)
    # The two averages should sum to approximately 1
    confidence = '0' if count == 0 else str(vulnProbabilitySum / count +
                                            nonVulnProbabilitySum / count)

    return_string = vulnLikelyHoodStr + ',' + nonVulnLikelyHoodStr + ',' + confidence
    print("Response body: \n" + return_string)
    return return_string
def fit(self, x, y=None):
    vectorize_layer = TextVectorization(
        max_tokens=self.max_tokens,
        output_mode=self.output_mode,
        output_sequence_length=self.output_sequence_length)
    vectorize_layer.adapt(list(x))

    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
    model.add(vectorize_layer)

    self.model = model
    self.vocab_processor = vectorize_layer
    return self
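# The fit above only adapts and stores the vectorization model; the companion
# transform is not part of the snippet. A minimal sketch of how it might look,
# assuming a scikit-learn style transformer that runs raw strings through the
# stored Keras model to obtain padded integer sequences (hypothetical method):
def transform(self, x, y=None):
    # predict() returns one integer sequence per input string
    return self.model.predict(list(x))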