# See https://code.google.com/p/word2vec/ and http://radimrehurek.com/gensim/models/word2vec.html
# Note that the former is in C and not well written; it required manual debugging to run on my machine.
# 1. Install gensim, which includes the Python implementation of word2vec
# 2. Install cython (may be problematic for Windows users; without it, training is ~70x slower)
# 3. Download the data
#
# This script assumes you're already in the directory containing the data files
import logging
from gensim.models import word2vec
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
# This controls word2vec output
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def review_to_wordlist(review, remove_stopwords=False):
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    review_text = re.sub(r'(.)\1+', r'\1\1', review_text)  # collapse runs of repeated letters down to two (e.g. "soooo" -> "soo")
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return words
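# A quick illustration of the cleaning steps above, on a hypothetical input:
#
#   review_to_wordlist("I loooooved it!!! <br />")  # -> ['i', 'looved', 'it']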
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences
# Load the pre-trained punkt tokenizer for splitting reviews into sentences
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Read data from files
train = pd.read_csv("labeledTrainData.tsv",header=0,delimiter="\t",quoting=3)
test = pd.read_csv("testData.tsv",header=0,delimiter="\t",quoting=3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv",header=0,delimiter="\t",quoting=3)
print "Read %d labeled train reviews, %d labeled test reviews, and %d unlabeled "\
"reviews\n" % (train["review"].size, test["review"].size, unlabeled_train["review"].size )
sentences = []
print "Parsing sentences from training set"
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)
print "Parsing sentences from unlabeled set"
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)
num_features = 300    # Word vector dimensionality; a multiple of 4 is optimal for speed, but any value works. Lower -> faster
min_word_count = 40   # Minimum word count; set to at least some reasonable value like 10. Higher -> faster
num_workers = 4       # Number of threads to run in parallel. Varies by machine, but 4 is a safe bet
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words
# You can verify that the parallelization is working with `top -o cpu`: the Python process should
# spin up to around num_workers * 100% CPU usage.
# The num_workers parameter has NO EFFECT if cython is not installed and working properly!
# Train on the smaller set first -- to illustrate the difference in accuracy between the 25k and 75k models
# min_count affects vocabulary size: the minimum number of times a word must appear to be included in the model
# size is the number of features each word vector will have (optimized if a multiple of 4)
# workers indicates cores to use for parallelization. Only takes effect if cython is installed
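# A quick sanity check for the point above (assumes your gensim build exposes FAST_VERSION,
# which is -1 when the compiled cython routines failed to load):
from gensim.models.word2vec import FAST_VERSION
if FAST_VERSION == -1:
    print "WARNING: gensim is using the slow pure-Python training path (~70x slower)"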
print "Training model..."
model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features,
                          min_count=min_word_count, window=context, sample=downsampling)
# init_sims(replace=True) trims internal weights to make the model more memory-efficient,
# but seals it off from further training
model.init_sims(replace=True)
model_name = "%dfeatures_%dmin_word_count_%dcontext" % (num_features, min_word_count, context)
model.save(model_name)
# Some examples
print model.doesnt_match("man woman child kitchen".split())
print model.most_similar("france")
print model.most_similar("awful")
# In the tutorial, also make a note that the model can be saved / loaded, and trained
# further later -- though only if init_sims(replace=True), called above, is skipped
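# A minimal save / load / continue-training sketch ('more_sentences' is a hypothetical
# extra corpus; note that newer gensim versions also require total_examples and epochs
# arguments to train()):
#
#   model = word2vec.Word2Vec.load(model_name)
#   model.train(more_sentences)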
# ************************************
def makeFeatureVec(words, model, num_features):
    # Utility function to create an average word vector for a given review
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    # index2word is the vocabulary list of the model; convert it to a set, for speed
    index2word_set = set(model.index2word)
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model[word])
    if nwords == 0.:
        return featureVec  # avoid dividing by zero when no words are in the vocabulary
    featureVec = np.divide(featureVec, nwords)
    return featureVec
def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate the average feature
    # vector for each and return a 2D numpy array
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    for review in reviews:
        if counter % 1000 == 0:
            print "Review %d of %d" % (counter, len(reviews))
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        counter = counter + 1
    return reviewFeatureVecs
# NOTE: The vector averaging is a bit slow (despite some minor optimizations such as matrix preallocation)
#
# Note that this operation is 'embarrassingly parallel' and is a good candidate for multiprocessing
# if the tutorial wants to go into that; could use the python package pp (or the standard
# library, as sketched below)
# http://www.parallelpython.com/content/view/15/30/#QUICKSMP
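# A rough sketch of the same idea with the standard library's multiprocessing module
# instead of pp (untested assumptions: a Unix-style fork so workers inherit the globals
# 'model' and 'num_features', and a module-level helper so pool.map can pickle it):
#
#   from multiprocessing import Pool
#
#   def _vec_for_review(review):
#       return makeFeatureVec(review, model, num_features)
#
#   pool = Pool(processes=num_workers)
#   trainDataVecs = np.array(pool.map(_vec_for_review, clean_train_reviews),
#                            dtype="float32")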
print "Creating average feature vectors for labeled reviews..."
# Unlike the first step, we now need to parse the reviews as a whole, not as individual sentences
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append(review_to_wordlist(review, remove_stopwords=True))
trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features)
# Fit a simple classifier such as logreg or RF
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)
print "Fitting a random forest to labeled training data..."
forest = forest.fit(trainDataVecs,train["sentiment"])
print "Creating average feature vecs for test reviews"
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(review_to_wordlist(review, remove_stopwords=True))
# Slowish; see comments above - good candidate for parallelizing if we want to go that route
testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )
# Test & extract results
result = forest.predict(testDataVecs)
# Write the test results
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("Word2Vec.csv", index=False, quoting=3)
# **********************
# Compare to known results (internal use only)
known_result = pd.read_csv("testData-TRUTH.csv",header=0)
percent_correct = sum(known_result["sentiment"] == output["sentiment"]) / float(len(known_result))
print "Fraction correct = %f" % percent_correct
# Maybe also output a confusion matrix here
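# A minimal sketch of that confusion matrix with scikit-learn:
#
#   from sklearn.metrics import confusion_matrix
#   print confusion_matrix(known_result["sentiment"], output["sentiment"])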
# ***********************
#
# With 4096 features and stopword removal (supervised portion only): 82.4% correct