Example #1
x_train = dataset['x_train']
y_train = dataset['y_train']
x_test = dataset['x_test']
y_test = dataset['y_test']

x_train = x_train[:25000]
y_train = y_train[:25000]
x_test = x_test[:25000]
y_test = y_test[:25000]

print('Training data size is: ', x_train.shape)
print('Validation data size is: ', x_test.shape)

# Load vocab
bpe = BPE("./pre-trained-model/en.wiki.bpe.op25000.vocab")
# Build vocab, {token: index}
vocab = {}
for i, token in enumerate(bpe.words):
    vocab[token] = i + 1
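
# Indices start at 1 so that 0 stays free as the padding value
# (pad_sequences pads with 0 by default).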

# Embedding Initialization
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format(
    "./pre-trained-model/en.wiki.bpe.op25000.d50.w2v.bin", binary=True)

from keras.layers import Embedding

input_size = 364
embedding_dim = 50
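
# A minimal sketch of the step that typically follows here (not part of the
# original snippet): build an embedding matrix from the pre-trained BPEmb
# vectors and wire it into a Keras Embedding layer. Row 0 is left as zeros
# for the padding index; tokens missing from the pre-trained vectors also
# stay zero.
import numpy as np

embedding_matrix = np.zeros((len(vocab) + 1, embedding_dim))
for token, i in vocab.items():
    if token in model:
        embedding_matrix[i] = model[token]

embedding_layer = Embedding(len(vocab) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=input_size,
                            trainable=False)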
Example #2
# Extract the text column and lowercase; train_df/test_df are assumed to
# have been loaded above
train_texts = train_df[1].values
train_texts = [s.lower() for s in train_texts]
test_texts = test_df[1].values
test_texts = [s.lower() for s in test_texts]

# replace all digits with 0
import re
train_texts = [re.sub(r'\d', '0', s) for s in train_texts]
test_texts = [re.sub(r'\d', '0', s) for s in test_texts]

# replace all URLs with <url>
url_reg = r'(https|http)?://(\w|\.|/|\?|=|&|%)*\b'
train_texts = [re.sub(url_reg, '<url>', s) for s in train_texts]
test_texts = [re.sub(url_reg, '<url>', s) for s in test_texts]
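
# Quick illustration of the two substitutions above (sample strings are
# made up for this example):
#   re.sub(r'\d', '0', 'order 66 at 9am')  -> 'order 00 at 0am'
#   re.sub(url_reg, '<url>', 'see http://example.com/a?b=1 here')
#     -> 'see <url> here'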

# Convert strings to subwords; this step may take several minutes
bpe = BPE("../pre-trained-model/en.wiki.bpe.op25000.vocab")
train_texts = [bpe.encode(s) for s in train_texts]
test_texts = [bpe.encode(s) for s in test_texts]
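# bpe.encode returns each text as one space-separated string of subword
# units; subword2index below splits it back into tokens.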

# Build vocab, {token: index}
vocab = {}
for i, token in enumerate(bpe.words):
    vocab[token] = i + 1
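
# Indices start at 1 so that 0 stays free as the padding value
# (pad_sequences pads with 0 by default).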


# Convert subword to index, function version
def subword2index(texts, vocab):
    sentences = []
    for s in texts:
        s = s.split()
        one_line = []