Example #1
# Keep only the num_frequent_words most frequent terms: rank the columns of x_train
# by their total weight and restrict both the matrix and the vocabulary to the top ones.
word_freqs = x_train.sum(axis=0)
word_freqs = np.squeeze(np.asarray(word_freqs))
freq_idx = np.argsort(word_freqs)[::-1]
freq_idx = freq_idx[:num_frequent_words]
x_train = x_train[:, freq_idx]
train_vocab = [train_vocab[i] for i in freq_idx]
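
# Toy illustration of the selection above: with column totals [3, 10, 1, 7] and
# num_frequent_words = 2, argsort()[::-1][:2] picks columns [1, 3].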

print("Vocabulary Size (Reduced): {}".format(len(train_vocab)))

# Construct reverse lookup vocabulary
reverse_vocab = {w: i for i, w in enumerate(train_vocab)}

# Process Google News word2vec file (in a memory-friendly way) and store relevant embeddings.
print("Loading pre-trained embeddings from {}...".format(embedding_file))
embeddings = data.load_word2vec(embedding_file,
                                reverse_vocab,
                                embedding_dim,
                                tf_VP=False)
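
# For reference, a minimal sketch of what such a memory-friendly loader might look
# like (an illustrative assumption, not the project's actual data.load_word2vec):
# stream the binary Google News file word by word and keep only the vectors for
# words in our vocabulary, so the file's full matrix is never built in memory.
def _load_word2vec_sketch(path, vocab, dim):
    """`vocab` maps word -> column index; words missing from the file keep a random init."""
    import numpy as np
    vectors = np.random.uniform(-0.25, 0.25, (len(vocab), dim)).astype(np.float32)
    with open(path, "rb") as f:
        file_vocab_size, file_dim = map(int, f.readline().split())
        assert file_dim == dim, "embedding_dim does not match the file"
        vec_bytes = np.dtype(np.float32).itemsize * dim
        for _ in range(file_vocab_size):
            # Each entry is the word as raw bytes terminated by a space, followed by
            # dim float32 values; some files also emit a stray newline between entries.
            chars = []
            while True:
                ch = f.read(1)
                if ch in (b" ", b""):
                    break
                if ch != b"\n":
                    chars.append(ch)
            word = b"".join(chars).decode("utf-8", errors="ignore")
            vec = np.frombuffer(f.read(vec_bytes), dtype=np.float32)
            if word in vocab:
                vectors[vocab[word]] = vec
    return vectors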

# Process test data using the reduced train vocabulary. (Note that with a fixed
# vocabulary, fit_transform below still learns its IDF weights from the test
# documents rather than reusing the training IDF.)
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    vocabulary=train_vocab)
x_test = vectorizer.fit_transform(x_test)

# L1-normalize each document (row) so its term weights sum to 1
x_train = x_train.astype(np.float64)
x_train = sklearn.preprocessing.normalize(x_train, axis=1, norm="l1")
x_train = x_train.astype(np.float32)
x_test = x_test.astype(np.float64)
x_test = sklearn.preprocessing.normalize(x_test, axis=1, norm="l1")
x_test = x_test.astype(np.float32)
# (The dataset-loading call is truncated here; only its final keyword argument,
#  maxlen=seq_len, survives. It returns `train` and `test` objects with .data,
#  .labels and .vocab attributes.)

x_train = train.data.astype(np.int32)
x_test = test.data.astype(np.int32)
y_train = train.labels
y_test = test.labels

# Correct sequence length if seq_len was originally None
seq_len = x_train.shape[1]

# Construct reverse lookup vocabulary
reverse_vocab = {w: i for i, w in enumerate(train.vocab)}

# Process Google News word2vec file (in a memory-friendly way) and store relevant embeddings
print("Loading pre-trained embeddings from {}...".format(embedding_file))
embeddings = data.load_word2vec(embedding_file, reverse_vocab, embedding_dim)

# Print information about the dataset
utils.print_data_info(train, x_train, x_test, y_train, y_test)

# Data description string to record in results.csv
data_str = "{{format: 'word2ind', vocab_size: {}, seq_len: {}}}".format(
    len(train.vocab), seq_len)
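# The doubled braces above escape to literal braces, so with hypothetical values
# (e.g. a 20,000-word vocabulary and sequences of length 100) data_str renders as:
#   {format: 'word2ind', vocab_size: 20000, seq_len: 100}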

# Training
# ==================================================

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=allow_soft_placement,
                                  log_device_placement=log_device_placement)
    sess = tf.Session(config=session_conf)
# Split into train and test sets (the original split line is truncated; the usual
# shuffled split at test_sample_index is assumed here)
x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:]
y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:]

del x, y, x_shuffled, y_shuffled  # don't need these anymore

print("Vocabulary Size: {}".format(len(vocab_processor.vocabulary_)))
print("Train/Test Split: {}/{}".format(len(y_train), len(y_test)))

# Randomly initialize the embedding matrix; 0.25 is chosen so that words missing
# from the pre-trained word2vec file end up with vectors of (approximately) the
# same variance as the pre-trained ones.
embeddings = np.random.uniform(
    -0.25, 0.25, (len(vocab_processor.vocabulary_), embedding_dim))
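
# Quick check of the 0.25 bound (illustrative only): the variance of U(-a, a) is
# a**2 / 3, so a = 0.25 gives roughly 0.021 (std ~0.14).
_a = 0.25
print("analytic variance:", _a ** 2 / 3)                                  # ~0.0208
print("empirical variance:", np.random.uniform(-_a, _a, 10 ** 6).var())  # ~0.0208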

# Process Google News word2vec file (in a memory-friendly way) and store relevant embeddings.
print("Loading pre-trained embeddings from {}...".format(embedding_file))
embeddings = data.load_word2vec(embedding_file,
                                vocab_processor.vocabulary_,
                                embedding_dim,
                                tf_VP=True)

# Represent each example by the mean of its word embeddings (mean pooling over the sequence)
x_train = np.array([
    np.mean([embeddings[idx] for idx in sentence], axis=0)
    for sentence in x_train
])
x_test = np.array([
    np.mean([embeddings[idx] for idx in sentence], axis=0)
    for sentence in x_test
])

# Convert one-hot target arrays to integer class labels
y_train = np.argmax(y_train, 1)
y_test = np.argmax(y_test, 1)
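
# The original script presumably trains its own TensorFlow model on these mean-pooled
# features; purely as an illustrative sanity check (an assumption, not part of the
# original pipeline), a simple scikit-learn classifier can be fit on them directly:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(x_train, y_train)
print("Sanity-check test accuracy: {:.3f}".format(clf.score(x_test, y_test)))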