Example #1
# Keras imports assumed by this snippet (the original listing omits them)
from keras.layers import Input, Embedding, LSTM, RepeatVector
from keras.models import Model


def lstm():
    data, targets, filenames, embedding_matrix, word_index = preprocess_embedding()
    EMBEDDING_DIM = 300
    MAX_SEQUENCE_LENGTH = 50
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False,
                                name='layer_embedding') #mask_zero=True,


    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # Encoder: compress each padded sequence down to a 30-dim code
    x1 = LSTM(150, return_sequences=True, name='lstm_1')(embedded_sequences)
    # x2 = LSTM(75, return_sequences=True, name='lstm_2')(x1)
    encoded = LSTM(30, name='lstm_3')(x1)

    # Decoder: repeat the code and expand back to the 300-dim word-vector space
    x3 = RepeatVector(MAX_SEQUENCE_LENGTH, name='layer_repeat')(encoded)
    # x4 = LSTM(75, return_sequences=True, name='lstm_4')(x3)
    x5 = LSTM(150, return_sequences=True, name='lstm_5')(x3)
    decoded = LSTM(300, return_sequences=True, activation='linear', name='lstm_6')(x5)

    sequence_autoencoder = Model(sequence_input, decoded)
    #print sequence_autoencoder.get_layer('lstm_6').output
    encoder = Model(sequence_input, encoded)
    sequence_autoencoder.compile(loss='cosine_proximity',
                                 optimizer='sgd')  # , metrics=['acc']

    # Sub-model that outputs the frozen embeddings; its predictions serve as the
    # reconstruction targets for the autoencoder
    embedding_layer = Model(inputs=sequence_autoencoder.input,
                            outputs=sequence_autoencoder.get_layer('layer_embedding').output)

    sequence_autoencoder.fit(data, embedding_layer.predict(data), epochs=5)


    # (Optional) inspect individual layer weights, e.g.:
    # print(sequence_autoencoder.layers[3].get_weights()[0].shape)

    csvname = 'lstm_autoencoder_weight'
    write_vec_to_csv(sequence_autoencoder.layers[3].get_weights()[0], targets, filenames, csvname)
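
The snippet also relies on project-specific helpers (preprocess_embedding, write_vec_to_csv) that the listing does not show. A minimal sketch of what write_vec_to_csv could look like, assuming it writes one row per item with the filename, target label, and vector components (the column layout here is an assumption, not the author's code):

import csv

def write_vec_to_csv(vectors, targets, filenames, csvname):
    # One CSV row per item: filename, target label, then the vector components.
    with open(csvname + '.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        for vec, target, filename in zip(vectors, targets, filenames):
            writer.writerow([filename, target] + list(vec))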
Example #2
    #Custom metric calculations
    metrics = Metrics_Approx(tag_index=all_labels, k=k)

    #Train model
    if not trans_learn:
        history = model.fit(X_train,
                            np.array(y_train),
                            batch_size=batch_size,
                            epochs=int(args.num_epochs),
                            verbose=1,
                            callbacks=[metrics],
                            validation_data=(X_test, np.array(y_test)))

    #Perform prediction to calculate scores for K iteration
    y_pred = np.asarray(model.predict(X_test))
    y_pred = np.argmax(y_pred, axis=-1)
    targ = y_test
    targ = np.argmax(targ, axis=-1)
    k_scores = evaluate_approx_match(y_pred, targ, all_labels)
    k_metrics_val = model.evaluate(X_test, y_test)
    k_metrics_train = model.evaluate(X_train, y_train)

    #Get timestamp for log
    timestamp = str(datetime.datetime.now())

    #Write scores for the current K iteration to the log file
    with open(args.log_name, 'a', encoding='UTF-8') as writer:
        writer.write(timestamp + "|" + str(k) + "|" + str(len(X_train)) + "|" +
                     str(k_scores["p"]) + "|" + str(k_scores["r"]) + "|" +
                     str(k_scores["f1"]) + "|" + str(k_metrics_val[0]) + "|" +
Example #3
            seq.append(word2index["UNK"])

    XX = sequence.pad_sequences(sequences=[seq], maxlen=MAX_SENTENCE_LENGTH)  # pad_sequences expects a list of sequences, so wrap the single seq in a list
    label_pred = int(round(model_LSTM.predict(XX)[0][0]))
    label2word = {1: 'Positive', 0: 'Negative'}
    print("Your sentence: '{}' belongs to {} emotion.".format(INPUT_SENTENCES, label2word[label_pred]))

interact_pred()


# Multiple sentences
INPUT_SENTENCES = ['I love reading.', 'You are so boring.']
XX = np.empty(len(INPUT_SENTENCES), dtype=list)
i = 0
for sentence in INPUT_SENTENCES:
    words = nltk.word_tokenize(sentence.lower())
    seq = []
    for word in words:
        if word in word2index:
            seq.append(word2index[word])
        else:
            seq.append(word2index['UNK'])
    XX[i] = seq
    i += 1

XX = sequence.pad_sequences(XX, maxlen=MAX_SENTENCE_LENGTH)
labels = [int(round(x[0])) for x in model_LSTM.predict(XX)]
label2word = {1: 'Positive', 0: 'Negative'}
for i in range(len(INPUT_SENTENCES)):
    print('{}   {}'.format(label2word[labels[i]], INPUT_SENTENCES[i]))
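
Both prediction blocks above assume that word2index, MAX_SENTENCE_LENGTH and the trained sentiment model already exist from an earlier training step. A minimal sketch of how such a vocabulary is commonly built for this kind of example, using a placeholder corpus; the vocabulary cap, the PAD/UNK indices, and the padding length are assumptions:

import collections
import nltk

training_sentences = ['I love reading.', 'You are so boring.']  # placeholder corpus

MAX_FEATURES = 2000          # assumed vocabulary cap
MAX_SENTENCE_LENGTH = 40     # assumed padding length

# Count word frequencies over the (placeholder) training corpus
word_freqs = collections.Counter()
for sentence in training_sentences:
    for word in nltk.word_tokenize(sentence.lower()):
        word_freqs[word] += 1

# Reserve 0 for padding and 1 for unknown words, as the prediction code expects
word2index = {w: i + 2 for i, (w, _) in enumerate(word_freqs.most_common(MAX_FEATURES))}
word2index['PAD'] = 0
word2index['UNK'] = 1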
Example #4
# Build the model; the imports below are assumed since the listing omits them
import os
from keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import classification_report  # assumed source

input = Input(shape=(X_train.shape[1],))

model = Embedding(input_dim=vocab_size, output_dim=word_embed_size,
                  input_length=seq_maxlen, weights=[embedding_weight_matrix],
                  trainable=False)(input)
model = LSTM(100, return_sequences=True)(model)
# One softmax over the tag set at every timestep (sequence labelling)
output = TimeDistributed(Dense(num_tags, activation="softmax"))(model)

model = Model(inputs=input, outputs=output)
print(model.summary())

model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

save_weights = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)

model.fit(x=X_train, y=y_train, batch_size=batch_size,
          epochs=epochs, validation_data=(X_validation, y_validation), callbacks=[save_weights])

sentences_test, tags_test = load_data(os.path.join(data_dir, 'test.txt'))               
X_test = getSequences(sentences_test, tokenizer_words, seq_maxlen, 0)
y_test = getSequences(tags_test, tokenizer_tags, seq_maxlen, tokenizer_tags.word_index['o'])

y_pred = model.predict(X_test)

y_pred_tags, y_test_tags = decode_output(X_test, y_pred, y_test, tokenizer_words, tokenizer_tags, seq_maxlen)

report = classification_report(y_pred=y_pred_tags, y_true=y_test_tags)
print(report)
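
decode_output is another helper the listing does not include. A minimal sketch of what it could do, assuming index 0 in X_test marks padding and that y_pred/y_test hold one softmax/one-hot vector per timestep (all of this is an assumption about the missing helper, not the author's implementation):

import numpy as np

def decode_output(X_test, y_pred, y_test, tokenizer_words, tokenizer_tags, seq_maxlen):
    # Invert the tag tokenizer: index -> tag string.
    index2tag = {i: t for t, i in tokenizer_tags.word_index.items()}
    pred_tags, true_tags = [], []
    for word_ids, pred, true in zip(X_test, y_pred, y_test):
        for pos in range(seq_maxlen):
            if word_ids[pos] == 0:      # assumed padding index for words
                continue
            pred_tags.append(index2tag.get(int(np.argmax(pred[pos])), 'o'))
            true_tags.append(index2tag.get(int(np.argmax(true[pos])), 'o'))
    return pred_tags, true_tags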
# Imports assumed by this snippet (the original listing omits them)
import keras
from keras.layers import Input, Embedding, LSTM, RepeatVector
from keras.models import Model


def lstm():
    data, targets, filenames, embedding_matrix, word_index = preprocess_embedding()
    EMBEDDING_DIM = 300
    MAX_SEQUENCE_LENGTH = 500
    # TensorBoard callback; embeddings_layer_names expects a list of layer names
    tbCallBack = keras.callbacks.TensorBoard(
        log_dir='./Graph_lstm_embedding',
        histogram_freq=10,
        embeddings_layer_names=['layer_embedding'],
        embeddings_freq=100,
        write_graph=True,
        write_images=True)
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False,
                                name='layer_embedding')  #mask_zero=True,

    # Step 1: training the autoencoder
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    #x1 = LSTM(150, return_sequences=True,name='lstm_1')(embedded_sequences) # [ batchsize, timesteps, input_dimension ]

    # [batchsize, timesteps, output_dimension ]

    #x2 = LSTM(75, return_sequences=True,name='lstm_2')(x1)
    encoded = LSTM(300, name='lstm_3')(embedded_sequences)

    x3 = RepeatVector(MAX_SEQUENCE_LENGTH, name='layer_repeat')(encoded)
    # x4 = LSTM(75, return_sequences=True,name='lstm_4')(x3)
    #x5 = LSTM(150, return_sequences=True,name='lstm_5')(x3)
    decoded = LSTM(300,
                   return_sequences=True,
                   activation='softmax',
                   name='lstm_6')(x3)

    sequence_autoencoder = Model(sequence_input, decoded)
    #print sequence_autoencoder.get_layer('lstm_6').output
    encoder = Model(sequence_input, encoded)  # the encoder is the second function learned alongside the autoencoder

    sequence_autoencoder.compile(loss='cosine_proximity',
                                 optimizer='sgd')  #, metrics=['acc'])
    embedding_layer = Model(
        inputs=sequence_autoencoder.input,
        outputs=sequence_autoencoder.get_layer('layer_embedding').output)

    sequence_autoencoder.fit(data,
                             embedding_layer.predict(data),
                             epochs=20,
                             callbacks=[tbCallBack])
    # Training is done

    # define the encoding function using the trained encoded weights

    #encoder = Model(sequence_input, encoded) # a function that you learn
    print('**************************************************')
    encoded_data = encoder.predict(data)
    print(encoded_data)
    print('**************************************************')
    print(encoded_data.shape)

    csvname = 'lstm_autoencoder_representation'
    write_vec_to_csv(encoded_data, targets, filenames, csvname)
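
Both lstm() variants call preprocess_embedding(), which the listing does not show. A minimal sketch of the kind of preprocessing it might perform, assuming padded index sequences and a GloVe-style embedding matrix; the signature, file path, and return shapes are assumptions, not the author's code:

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def preprocess_embedding(texts=None, labels=None, filenames=None,
                         glove_path='glove.6B.300d.txt',
                         max_len=500, embedding_dim=300):
    # Placeholder inputs so the sketch runs on its own.
    texts = texts or ['first document text', 'second document text']
    labels = labels or [0, 1]
    filenames = filenames or ['doc1.txt', 'doc2.txt']

    # Index the corpus and pad every document to max_len word ids.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    word_index = tokenizer.word_index
    data = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)

    # Load pre-trained word vectors (the GloVe file path is an assumption).
    embeddings = {}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

    # Row i of the matrix holds the vector for the word with index i.
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        if word in embeddings:
            embedding_matrix[i] = embeddings[word]

    return data, np.array(labels), filenames, embedding_matrix, word_index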