network.py
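"""Pre-train an LSTM language model on tweets stored in the database, seeding
the embedding layer with word2vec vectors produced during data preparation."""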
import numpy as np
import keras
from keras.callbacks import LambdaCallback
from keras.layers import Dense, Activation, Embedding, LSTM
from keras.models import Sequential

from prepare_data import get_tokens, sp
from variables import tweet_limit, use_nltk, sos, eos


def pre_train_model(cursor, word_model):
    """Build and pre-train the LSTM on the cleaned tweets in the database."""
    checkpoint_path = "data/cp.ckpt"

    # Number of training tweets, capped by tweet_limit.
    cursor.execute("SELECT count(cleaned) FROM tweets")
    (tweet_count,) = cursor.fetchone()
    tweet_count = min(tweet_count, tweet_limit)

    # Longest tokenized sentence recorded by the latest preprocessing run.
    cursor.execute("""
        SELECT max_sentence_length
        FROM metadata
        WHERE id = (SELECT MAX(id) FROM metadata)
    """)
    (max_sentence_len,) = cursor.fetchone()

    # Reuse the gensim word2vec vectors as the embedding layer's weights.
    pretrained_weights = word_model.wv.vectors
    vocab_size, embedding_size = pretrained_weights.shape
    print('Result embedding shape:', pretrained_weights.shape)
    # print('Checking similar words:')
    # for word in ['model', 'network', 'train', 'learn']:
    #     most_similar = ', '.join(
    #         '%s (%.2f)' % (similar, dist) for similar, dist in word_model.wv.most_similar(word)[:8])
    #     print('  %s -> %s' % (word, most_similar))
    # Translate between vocabulary words and their integer indices in the
    # word2vec model (gensim < 4.0 API).
    def word2idx(word):
        return word_model.wv.vocab[word].index

    def idx2word(idx):
        return word_model.wv.index2word[idx]
    print('\nPreparing the data for LSTM...')
    # Each training example is one tweet: the inputs are all tokens but the
    # last, and the target is the final token.
    train_x = np.zeros([tweet_count, max_sentence_len], dtype=np.int32)
    train_y = np.zeros([tweet_count], dtype=np.int32)
    for i, sentence in enumerate(get_tokens(cursor)):
        for t, word in enumerate(sentence[:-1]):
            train_x[i, t] = word2idx(word)
        train_y[i] = word2idx(sentence[-1])
    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)
    print('\nTraining LSTM...')
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[pretrained_weights]))
    model.add(LSTM(units=embedding_size))
    model.add(Dense(units=vocab_size))
    model.add(Activation('softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    def sample(preds, temperature=1.0):
        # Sample a word index from the predicted distribution; lower
        # temperatures make the choice greedier, temperature <= 0 is argmax.
        if temperature <= 0:
            return np.argmax(preds)
        preds = np.asarray(preds).astype('float64')
        preds = np.log(preds) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        probas = np.random.multinomial(1, preds, 1)
        return np.argmax(probas)
    def generate_next(text, num_generated=15):
        word_idxs = [word2idx(word) for word in text.lower().split()]
        for _ in range(num_generated):
            prediction = model.predict(x=np.array(word_idxs))
            # Sample the next word from the prediction for the most recent token.
            idx = sample(prediction[-1], temperature=0.7)
            word_idxs.append(idx)
            # Stop early once the end-of-sentence token is generated.
            if idx2word(idx) == eos:
                break
        pieces = [idx2word(idx) for idx in word_idxs]
        if use_nltk:
            return ' '.join(pieces)
        return sp.decode_pieces(pieces)
    def on_epoch_end(epoch, _):
        # Preview the model by generating a few samples from the start-of-sentence token.
        print('\nGenerating text after epoch: %d' % epoch)
        texts = [sos, sos, sos]
        for text in texts:
            generated = generate_next(text)
            print('%s... -> %s' % (text, generated))
    # Create a callback that saves the model's weights
    cp_callback = keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                  save_weights_only=True,
                                                  verbose=1)

    model.summary()
    model.fit(train_x, train_y,
              batch_size=128,
              epochs=20,
              callbacks=[cp_callback, LambdaCallback(on_epoch_end=on_epoch_end)])
    return model
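

if __name__ == "__main__":
    # Usage sketch, not part of the original module. It assumes the tweet
    # database is SQLite and that the word2vec model was saved with gensim;
    # the paths "data/tweets.db" and "data/word2vec.model" are illustrative
    # placeholders, not confirmed project files.
    import sqlite3
    from gensim.models import Word2Vec

    connection = sqlite3.connect("data/tweets.db")        # placeholder path
    word_model = Word2Vec.load("data/word2vec.model")     # placeholder path
    lstm_model = pre_train_model(connection.cursor(), word_model)
    # The checkpoint callback stores weights at data/cp.ckpt; they can be
    # restored later with lstm_model.load_weights("data/cp.ckpt").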