x_train = dataset['x_train']
y_train = dataset['y_train']
x_test = dataset['x_test']
y_test = dataset['y_test']

# Keep the first 25,000 examples of each split
x_train = x_train[:25000]
y_train = y_train[:25000]
x_test = x_test[:25000]
y_test = y_test[:25000]

print('Training data size is: ', x_train.shape)
print('Validation data size is: ', x_test.shape)

# Load vocab (`BPE` is assumed to be imported earlier in the tutorial)
bpe = BPE("./pre-trained-model/en.wiki.bpe.op25000.vocab")

# Build vocab, {token: index}; indices start at 1, leaving 0 free for padding
vocab = {}
for i, token in enumerate(bpe.words):
    vocab[token] = i + 1

# Embedding initialization: load the pre-trained 50-dimensional BPE vectors
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format(
    "./pre-trained-model/en.wiki.bpe.op25000.d50.w2v.bin",
    binary=True)

from keras.layers import Embedding

input_size = 364
embedding_dim = 50
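# A minimal sketch of wiring the pre-trained vectors into a Keras Embedding
# layer, assuming the `vocab` and `model` objects built above. Row 0 stays
# all-zero for padding, and subwords missing from the word2vec file keep the
# zero vector. Freezing the embeddings with `trainable=False` is a choice
# made here for illustration, not a requirement.
import numpy as np

embedding_matrix = np.zeros((len(vocab) + 1, embedding_dim))
for token, i in vocab.items():
    if token in model:
        embedding_matrix[i] = model[token]

embedding_layer = Embedding(len(vocab) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=input_size,
                            trainable=False)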
train_texts = [s.lower() for s in train_texts]
test_texts = test_df[1].values
test_texts = [s.lower() for s in test_texts]

# Replace all digits with 0 (raw strings avoid invalid-escape warnings)
import re
train_texts = [re.sub(r'\d', '0', s) for s in train_texts]
test_texts = [re.sub(r'\d', '0', s) for s in test_texts]

# Replace all URLs with <url>
url_reg = r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b'
train_texts = [re.sub(url_reg, '<url>', s) for s in train_texts]
test_texts = [re.sub(url_reg, '<url>', s) for s in test_texts]

# Convert string to subword; this process may take several minutes
bpe = BPE("../pre-trained-model/en.wiki.bpe.op25000.vocab")
train_texts = [bpe.encode(s) for s in train_texts]
test_texts = [bpe.encode(s) for s in test_texts]

# Build vocab, {token: index}; indices start at 1, leaving 0 free for padding
vocab = {}
for i, token in enumerate(bpe.words):
    vocab[token] = i + 1

# Convert subword to index, function version
def subword2index(texts, vocab):
    sentences = []
    for s in texts:
        s = s.split()
        one_line = []
        # (completion sketch: the original listing breaks off here; mapping
        # out-of-vocabulary subwords to the reserved index 0 is an assumption)
        for word in s:
            one_line.append(vocab.get(word, 0))
        sentences.append(one_line)
    return sentences
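# A minimal usage sketch, assuming the splits prepared above: convert both
# splits to index sequences, then pad/truncate them to `input_size` (defined
# earlier) so they match the Embedding layer's expected input length. The
# `pad_sequences` step and `padding='post'` are illustrative assumptions,
# not part of the original listing.
from keras.preprocessing.sequence import pad_sequences

train_data = subword2index(train_texts, vocab)
test_data = subword2index(test_texts, vocab)

train_data = pad_sequences(train_data, maxlen=input_size, padding='post')
test_data = pad_sequences(test_data, maxlen=input_size, padding='post')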