def main(data_dir):
    x_train, x_val, x_test, y_train, y_val, y_test = load_data(data_dir)
    batch_size = 128
    max_vocab_size = 20000
    max_seq_len = 30
    embedding_dim = 100
    lstm_dim = 128

    vectorizer = TextVectorization(max_tokens=max_vocab_size,
                                   output_sequence_length=max_seq_len)
    text_data = tf.data.Dataset.from_tensor_slices(x_train).batch(batch_size)
    print('Building vocabulary')
    vectorizer.adapt(text_data)
    vocab = vectorizer.get_vocabulary()

    # load pre-trained w2v model
    w2v = Word2Vec.load(os.path.join(data_dir, 'processed/w2v.model'))

    print('Building embedding matrix')
    # This matrix will be used to initialize weights in the embedding layer
    embedding_matrix = build_embedding_mat(data_dir, vocab, w2v)
    print('embedding_matrix.shape => {}'.format(embedding_matrix.shape))

    X_train = vectorizer(np.array([[s] for s in x_train])).numpy()
    X_val = vectorizer(np.array([[s] for s in x_val])).numpy()
    X_test = vectorizer(np.array([[s] for s in x_test])).numpy()
    y_train = np.array(y_train)
    y_val = np.array(y_val)
    y_test = np.array(y_test)

    acc_scores = {}
    dropout = 0.7
    for layer in ['sigmoid', 'relu', 'tanh']:
        print("Building the model with ", layer, " and dropout ", dropout)
        model = Sequential()
        model.add(Embedding(input_dim=max_vocab_size + 3,
                            output_dim=embedding_dim,
                            input_length=max_seq_len,
                            weights=[embedding_matrix],
                            trainable=True))
        model.add(Flatten())
        model.add(Dense(lstm_dim,
                        activation=layer,
                        kernel_regularizer=l2(0.01),
                        bias_regularizer=l2(0.01)))
        model.add(Dropout(dropout))
        model.add(Dense(2, activation='softmax', name='output_layer'))
        print(model.summary())

        print("Compiling the model")
        model.compile(loss="sparse_categorical_crossentropy",
                      optimizer="adam",
                      metrics=["acc"])

        print("Fitting the model")
        model.fit(X_train, y_train,
                  batch_size=batch_size,
                  epochs=10,
                  validation_data=(X_val, y_val))

        scores = model.evaluate(X_val, y_val)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
        acc_scores[layer + "_val" + str(dropout)] = scores[1] * 100

        print("Evaluating model on test data")
        scores = model.evaluate(X_test, y_test)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
        acc_scores[layer + "_test" + str(dropout)] = scores[1] * 100

        # model.save(os.path.join(data_dir, 'processed/' + layer + str(dropout)))
        model.save(os.path.join(data_dir, 'processed/' + layer + '.model'))

    print(acc_scores)
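# NOTE: build_embedding_mat and load_data are defined elsewhere and not shown in this
# snippet. Below is only a minimal sketch of how such an embedding matrix could be
# built from the TextVectorization vocabulary and a gensim Word2Vec model; the function
# name, signature, and the number of extra rows reserved for special tokens are
# assumptions, not the original implementation.
def build_embedding_mat_sketch(vocab, w2v, embedding_dim=100, extra_rows=3):
    # vocab[0] is the padding token and vocab[1] the OOV token; their rows (and any
    # extra reserved rows at the end) are left as zero vectors.
    embedding_matrix = np.zeros((len(vocab) + extra_rows, embedding_dim))
    for i, word in enumerate(vocab):
        if word in w2v.wv:
            embedding_matrix[i] = w2v.wv[word]
    return embedding_matrix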
def vocab_maker(data, max_dic_size, batch_size):
    # Create a vocabulary of the recommended size - 1, leaving room for the
    # padding and out-of-vocabulary tokens
    vectorizer = TextVectorization(max_tokens=max_dic_size - 1, output_mode='int')
    text_data = tf.data.Dataset.from_tensor_slices(data).batch(batch_size)
    vectorizer.adapt(text_data)
    # indices 0 and 1 are reserved for padding and out-of-vocabulary tokens
    vocab = vectorizer.get_vocabulary()
    # in some TF versions get_vocabulary() returns bytes; decode to str
    vocab = [x.decode('utf-8') for x in vocab]
    return vocab
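# Example usage (hedged; the variable names are illustrative):
#   vocab = vocab_maker(x_train, max_dic_size=20000, batch_size=128)
#   # tokens come back ordered by descending frequency, with the padding token ('')
#   # and the OOV token ('[UNK]') occupying indices 0 and 1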
def main(text_path, classifier):
    x_train, _, x_test, _, _, _ = load_data(text_path)
    x_test = x_test[-20:]
    print(x_test)

    model = keras.models.load_model(os.path.join(text_path, classifier))
    print(model.summary())

    vectorizer = TextVectorization(max_tokens=config['max_vocab_size'],
                                   output_sequence_length=config['max_seq_len'])
    train_data = tf.data.Dataset.from_tensor_slices(x_train).batch(config['batch_size'])
    vectorizer.adapt(train_data)

    x_test = vectorizer(np.array([[w] for w in x_test])).numpy()
    prediction = model.predict(x_test)
    print(prediction)
    classes = np.argmax(prediction, axis=-1)
    print(classes)
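# Hedged sketch (not in the original script): re-adapting the vectorizer at inference
# time requires the full training text. Alternatively, the adapted vectorizer and the
# trained classifier can be wrapped into one end-to-end model that accepts raw strings,
# so inference no longer vectorizes by hand. Names below are illustrative.
def build_end_to_end(vectorizer, model):
    inputs = keras.Input(shape=(1,), dtype=tf.string)
    outputs = model(vectorizer(inputs))
    return keras.Model(inputs, outputs)

# e2e = build_end_to_end(vectorizer, model)
# print(e2e.predict(np.array([['some raw input text']])))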
def main(data_dir):
    print('Loading data')
    x_train_val, x_test = load_data(data_dir)

    # decrease dataset size for quick testing
    # x_train_val = x_train_val[:1000]
    # x_test = x_test[:100]

    # build vocab
    # NOTE: this script only considers tokens in the training set to build the
    # vocabulary object.
    vectorizer = TextVectorization(
        max_tokens=config['max_vocab_size'],
        output_sequence_length=config['max_seq_len'])
    text_data = tf.data.Dataset.from_tensor_slices(x_train_val).batch(
        config['batch_size'])
    print('Building vocabulary')
    vectorizer.adapt(text_data)
    # NOTE: in this vocab, index 0 is reserved for padding and 1 is reserved
    # for out-of-vocabulary tokens
    vocab = vectorizer.get_vocabulary()

    # load pre-trained w2v model (this model was trained in tut_1)
    w2v = Word2Vec.load(os.path.join(data_dir, 'w2v.model'))

    print('Building embedding matrix')
    # This matrix will be used to initialize weights in the embedding layer
    embedding_matrix, word2token = build_embedding_mat(data_dir, vocab, w2v)
    print('embedding_matrix.shape => {}'.format(embedding_matrix.shape))

    print('Building Seq2Seq model')
    # build the embedding layer to convert token sequences into embeddings
    # set trainable to True if you wish to further finetune the embeddings.
    # It will increase train time but may yield better results. Try it out
    # on a more complex task (like neural machine translation)!
    embedding_layer = Embedding(
        input_dim=len(vocab) + 4,
        output_dim=config['embedding_dim'],
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )

    # build the encoding layers
    # encoder_inputs accepts padded tokenized sequences as input,
    # which are converted to embeddings by the embedding_layer
    # finally, the embedded sequences are fed to the encoder LSTM to get
    # encodings (or vector representations) of the input sentences
    # you can add dropout to the input/embedding layers to make your model robust
    encoder_inputs = Input((None, ), name='enc_inp')
    enc_embedding = embedding_layer(encoder_inputs)
    # you can choose a GRU/Dense layer as well to keep things simpler
    # note that we are not using the encoder_outputs for the given generative
    # task, but you'll need them for classification
    # Also, the hidden dimension is currently equal to the embedding dimension
    _, state_h, state_c = LSTM(
        config['embedding_dim'],  # try a different value
        return_state=True,
        name='enc_lstm')(enc_embedding)
    encoder_states = [state_h, state_c]

    # build the decoding layers
    # decoder_inputs and dec_embedding serve similar purposes as in the encoding
    # layers. Note that we are using the same embedding_layer to convert
    # token sequences to embeddings while encoding and decoding.
    # In this case, we initialize the decoder using `encoder_states`
    # as its initial state (i.e. the vector representation learned by the encoder).
    decoder_inputs = Input((None, ), name='dec_inp')
    dec_embedding = embedding_layer(decoder_inputs)
    dec_lstm = LSTM(config['embedding_dim'],
                    return_state=True,
                    return_sequences=True,
                    name='dec_lstm')
    dec_outputs, _, _ = dec_lstm(dec_embedding, initial_state=encoder_states)
    # finally, we add a fully connected layer which transforms the decoder
    # outputs into logit vectors
    dec_dense = Dense(len(vocab) + 4, activation='softmax', name='out')
    output = dec_dense(dec_outputs)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = Model([encoder_inputs, decoder_inputs], output)
    print(model.summary())

    # note that decoder_input_data holds the same token sequences as
    # encoder_input_data (and decoder_target_data is just their one-hot encoding)
    # as we are training a vanilla autoencoder
    # we are using np.ones as pad tokens are represented by 1 in our vocab
    # TODO: switch to a generator instead of creating such huge matrices.
    # It will reduce memory consumption a lot.
    encoder_input_data = np.ones((len(x_train_val), config['max_seq_len']),
                                 dtype='float32')
    decoder_input_data = np.ones((len(x_train_val), config['max_seq_len']),
                                 dtype='float32')
    decoder_target_data = np.zeros(
        (len(x_train_val), config['max_seq_len'], len(vocab) + 4),
        dtype='float32')
    for i, input_text in enumerate(x_train_val):
        tokenized_text = tokenize(input_text, word2token)
        for j in range(len(tokenized_text)):
            encoder_input_data[i, j] = tokenized_text[j]
            decoder_input_data[i, j] = tokenized_text[j]
            decoder_target_data[i, j, tokenized_text[j]] = 1.0

    # Run training (will take some time)
    print('Training model')
    # try different optimizers, learning rates, and analyze different metrics
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(
        [encoder_input_data, decoder_input_data],
        decoder_target_data,
        batch_size=config['batch_size'],
        epochs=10,  # try increasing #epochs
        validation_split=0.2)

    # Save model
    # this model is saved inside the tut_3/data folder just to showcase how
    # you can save your models as well inside respective assignment folders
    # and use them later
    model.save('tut_3/data/ae.model')
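# NOTE: tokenize() and word2token come from build_embedding_mat and are not shown in
# this snippet. A minimal sketch, assuming word2token maps a word to its row index in
# embedding_matrix and unknown words fall back to the OOV/pad index 1 used above; the
# real helper may differ.
def tokenize_sketch(text, word2token, max_seq_len=None):
    words = text.split()
    if max_seq_len is not None:
        words = words[:max_seq_len]
    return [word2token.get(w, 1) for w in words]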
lab_dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
lab_dataset = lab_dataset.batch(32)

def custom(input_data):
    lower = tf.strings.lower(input_data)
    lower = tf.strings.regex_replace(lower, '\n', ' ')
    lower = tf.strings.regex_replace(lower,
                                     '[%s]' % re.escape(string.punctuation), '')
    return lower

vector_layer = TextVectorization(standardize=custom,
                                 max_tokens=10000,
                                 output_sequence_length=250)
data_text = lab_dataset.map(lambda x, y: x)
vector_layer.adapt(data_text)

# expand the text dimension so it matches the input format expected by the
# vectorization layer, and keep the label paired with the text
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vector_layer(text), label

ready_train = lab_dataset.map(vectorize_text)
AUTOTUNE = tf.data.AUTOTUNE
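# A typical follow-up (hedged; not part of the original snippet): cache and prefetch
# the vectorized dataset so the input pipeline does not become a training bottleneck.
ready_train = ready_train.cache().prefetch(buffer_size=AUTOTUNE)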
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 15000
sequence_length = 20
batch_size = 64

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase,
                                    "[%s]" % re.escape(strip_chars), "")

eng_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)
spa_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)
train_eng_texts = [pair[0] for pair in train_pairs]
train_spa_texts = [pair[1] for pair in train_pairs]
eng_vectorization.adapt(train_eng_texts)
spa_vectorization.adapt(train_spa_texts)

"""
Next, we'll format our datasets.
"""
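# Hedged sketch of the formatting step referenced above: shift the Spanish sequence by
# one position for teacher forcing. The dict keys "encoder_inputs"/"decoder_inputs" and
# the make_dataset helper are assumptions about the downstream seq2seq model, not
# original code.
def format_dataset(eng, spa):
    eng = eng_vectorization(eng)
    spa = spa_vectorization(spa)
    return ({"encoder_inputs": eng, "decoder_inputs": spa[:, :-1]}, spa[:, 1:])

def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(spa_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()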
text_ds = tf.data.TextLineDataset(path_to_file).filter(
    lambda x: tf.cast(tf.strings.length(x), bool))

# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    return tf.strings.regex_replace(lowercase,
                                    '[%s]' % re.escape(string.punctuation), '')

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Set output_sequence_length to pad all samples to the same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

# Create vocabulary
vectorize_layer.adapt(text_ds.batch(1024))

# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()

# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

# Make sequences
sequences = list(text_vector_ds.as_numpy_iterator())

# Embedding dim
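# Hedged sketch (not part of the original snippet): positive skip-gram pairs for one
# vectorized sentence can be generated with the Keras skipgrams utility; the window
# size here is an illustrative value.
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
    sequences[0],
    vocabulary_size=vocab_size,
    sampling_table=sampling_table,
    window_size=2,
    negative_samples=0)
print('{} positive skip-gram pairs in the first sequence'.format(len(positive_skip_grams)))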
def main(data_dir):
    print('Loading data')
    x_train, x_val, x_test, y_train, y_val, y_test = load_data(data_dir)

    # build vocabulary
    vectorizer = TextVectorization(
        max_tokens=config['max_vocab_size'],
        output_sequence_length=config['max_seq_len'])
    text_data = tf.data.Dataset.from_tensor_slices(x_train).batch(
        config['batch_size'])
    print('Building vocabulary')
    vectorizer.adapt(text_data)
    vocab = vectorizer.get_vocabulary()

    # load pre-trained w2v model
    w2v = Word2Vec.load(os.path.join(data_dir, 'w2v.model'))

    # build embedding matrix
    print('Building embedding matrix')
    embedding_matrix = build_embedding_matrix(vocab, w2v)
    print('embedding_matrix.shape => {}'.format(embedding_matrix.shape))

    print('Building model')
    model = Sequential()
    model.add(
        Embedding(input_dim=len(vocab) + 2,
                  output_dim=config['embedding_dim'],
                  embeddings_initializer=keras.initializers.Constant(
                      embedding_matrix),
                  trainable=False,
                  name='embedding_layer'))
    # add hidden layer with activation, L2 regularization, and dropout
    model.add(
        LSTM(32,
             activation=sys.argv[2],
             kernel_regularizer=l2(0.0001),
             dropout=0.1,
             return_sequences=False,
             name='hidden_layer'))
    # last layer with activation
    model.add(Dense(2, activation='softmax', name='output_layer'))
    model.summary()

    print('train the model')
    # train the model
    # convert words to indices, put them in arrays
    num_classes = 2
    x_train = vectorizer(np.array([[w] for w in x_train])).numpy()
    x_val = vectorizer(np.array([[w] for w in x_val])).numpy()
    y_train = np.array(y_train)
    y_val = np.array(y_val)
    # convert labels to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_val = keras.utils.to_categorical(y_val, num_classes)

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(x_train, y_train,
              batch_size=config['batch_size'],
              epochs=12,
              validation_data=(x_val, y_val))

    model.save(data_dir + 'nn_' + sys.argv[2] + '.model')

    score = model.evaluate(x_val, y_val)
    print("Accuracy: {0: .2f}%".format(score[1] * 100))
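# Example invocation (hedged: the script name is illustrative; sys.argv[2] is read above
# as the activation function, and sys.argv[1] is presumably the data directory passed to
# main elsewhere in the script):
#   python train_classifier.py data/ tanh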