rnn.py
import sys, os
from numpy import *
import matplotlib
from matplotlib.pyplot import *
%matplotlib inline
matplotlib.rcParams['savefig.dpi'] = 100
%load_ext autoreload
%autoreload 2
from rnnlm import RNNLM
# Gradient check on toy data, for speed
random.seed(10)
wv_dummy = random.randn(10,50)
model = RNNLM(L0=wv_dummy, U0=wv_dummy,
              alpha=0.005, rseed=10, bptt=4)
model.grad_check(array([1,2,3]), array([2,3,4]))
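# grad_check compares the analytic gradients from backpropagation against
# numerical finite-difference estimates on the toy batch above; if the two
# agree to a small tolerance, the backprop implementation is likely correct.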
from data_utils import utils as du
import pandas as pd
# Load the vocabulary
#vocab = pd.read_table("data/lm/vocab.ptb.txt", header=None, sep="\s+",
#                      index_col=0, names=['count', 'freq'], )
# Assumes worddic.txt has the same two-column (count, freq) layout as
# vocab.ptb.txt above; the 'count' column is needed for fraction_lost below.
vocab2 = pd.read_table("worddic.txt", header=None, sep="\s+",
                       index_col=0, names=['count', 'freq'])
# Choose how many top words to keep
#vocabsize = 2000
vocabsize2 = 58868  # remove for implementation
#num_to_word = dict(enumerate(vocab.index[:vocabsize]))
num_to_word2 = dict(enumerate(vocab2.index[:vocabsize2]))
#word_to_num = du.invert_dict(num_to_word)
word_to_num2 = du.invert_dict(num_to_word2)
#print word_to_num2
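# Sanity check: the two maps should be inverses of each other.
assert word_to_num2[num_to_word2[0]] == 0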
##############
filename = 'reviews_plain.txt'
print "Opening the file..."
X_train = []
f = open(filename, 'r')
count = 0  # number of out-of-vocabulary tokens skipped
for line in f.readlines():
    sentence = []
    line = line.strip()
    if not line: continue
    for word in line.split():
        word = word.strip()
        idx = word_to_num2.get(word)
        if idx is not None:
            sentence.append(idx)
        else:
            count += 1  # dict.get never raises, so count OOV words here
    X_train.append(sentence)
print "And finally, we close the file."
f.close()
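# Y_train is used below but never built above. A minimal sketch for a
# language model (an assumption, not the original pipeline): the target
# at each position is the next word, so shift each sentence by one token.
Y_train = [s[1:] for s in X_train if len(s) > 1]
X_train = [s[:-1] for s in X_train if len(s) > 1]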
################
##
# Below needed for 'adj_loss': DO NOT CHANGE
fraction_lost = float(sum([vocab2['count'][word] for word in vocab2.index
                           if (not word in word_to_num2)
                           and (not word == "UUUNKKK")]))
fraction_lost /= sum([vocab2['count'][word] for word in vocab2.index
                      if (not word == "UUUNKKK")])
print "Retained %d words from %d (%.02f%% of all tokens)" % (vocabsize2, len(vocab2),
                                                             100*(1-fraction_lost))
hdim = 100  # dimension of hidden layer = dimension of word vectors
random.seed(10)
L0 = 0.1 * random.randn(vocabsize2, hdim)  # small random init for the word vectors
model = RNNLM(L0, U0=L0, alpha=0.08, rseed=10, bptt=2)
print len(Y_train)
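# bptt=2 truncates backpropagation through time: gradients flow back at
# most 2 steps from each position, trading gradient fidelity for speed.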
# Gradient check is going to take a *long* time here
# since it's quadratic-time in the number of parameters.
# run at your own risk...
# model.grad_check(array([1,2,3]), array([2,3,4]))
#### YOUR CODE HERE ####
##
# Pare down to a smaller dataset, for speed (optional)
ntrain = len(Y_train)
ntrain = 5000
X = X_train[:ntrain]
Y = Y_train[:ntrain]
nepoch = 5
# Build one random visiting order per epoch and concatenate them,
# so train_sgd makes nepoch full passes over the data.
idxiter_random = random.permutation(len(Y))
for i in range(1, nepoch):
    permut = random.permutation(len(Y))
    idxiter_random = concatenate((idxiter_random, permut), axis=0)
curve = model.train_sgd(X, Y, idxiter_random, None, 400, 1000)  #minibatch_sgd(X,Y,0.1)
#clf3.train_sgd(X_train,y_train,idxiter,None,400,1000)
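# X_dev / Y_dev are not constructed above. A minimal sketch (an assumed
# split, not the original one): hold out the 1000 sentences that follow
# the training slice.
X_dev = X_train[ntrain:ntrain + 1000]
Y_dev = Y_train[ntrain:ntrain + 1000]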
## Evaluate cross-entropy loss on the dev set,
## then convert to perplexity for your writeup
dev_loss = model.compute_mean_loss(X_dev, Y_dev)
print dev_loss
## DO NOT CHANGE THIS CELL ##
# Report your numbers, after computing dev_loss above.
def adjust_loss(loss, funk):
    return (loss + funk * log(funk)) / (1 - funk)
print "Unadjusted: %.03f" % exp(dev_loss)
print "Adjusted for missing vocab: %.03f" % exp(adjust_loss(dev_loss, fraction_lost))