/
lauren.py
149 lines (121 loc) · 5 KB
/
lauren.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
from sklearn import svm
from nltk.corpus import dependency_treebank as dt
import numpy as np
from sklearn.feature_extraction import DictVectorizer as dv
import pickle
import os
'''import data'''
def import_data(path=None):
    """Parse one example treebank file and print its first parsed sentence.

    Args:
        path: Directory containing the dependency treebank. When omitted,
            the original hard-coded development location is used.
    """
    if path is None:
        path = "../../../../../Users/lurke/Documents/Harvard/Senior/CS187/final/dep_treebank"

    # TODO: iterate through the directory to get all the data, e.g.
    #     for subdir, dirs, files in os.walk('./dep_treebank'):
    #         for row in files:
    #             for f in row:
    #                 print(f)

    # example of what to do for each file
    f = path + "/00/wsj_0001.mrg"
    t = dt.parsed_sents(f)[0]
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # whereas the bare `print t` statement is a SyntaxError on Python 3.
    print(t)
    # TODO: represent this tree in the best format to gather the features
    # TODO: split data by directory into train and test
'''train'''
def train():
    """Fit an SVM on two hand-written example feature rows and pickle it.

    The real training loop over parsed sentences is still a TODO; for now
    the classifier is fitted on ``example_features`` / ``example_answers``
    so the rest of the pipeline has a model to work with.

    Returns:
        The fitted ``svm.SVC`` classifier. The same object is also written
        to ``svm.pkl`` in the current working directory.
    """
    # for each sentence in the dependency parses, construct the tree:
    features = []
    answers = []
    data = []
    for sentence in data:
        # TODO: format sentence to pass into tree maker
        # iterate through sentence until tree is formed - on each iteration,
        # will consider multiple pairs of words
        # each time we consider two words, get features, save as row in
        # array with dictvectorize, roughly:
        #     sent_features, sent_answers = make_tree('', sentence, True)
        #     features.extend(sent_features)
        #     answers.extend(sent_answers)
        pass

    # example so that can do svm
    # order of features -2,-1,0-,0+,1,2:
    #     pos, lex, ch-L-pos, ch-L-lex, ch-R-pos, ch-R-lex
    # The last POS key is 'pos2' (parallel to 'lex2' below); it was
    # previously a second 'pos1', which silently overwrote the 'WP' entry.
    example_features1 = {'pos-2': ':', 'pos-1': 'NNS', 'pos-0': 'IN',
                         'pos0': 'NN', 'pos1': 'WP', 'pos2': ':'}
    example_features1.update({'lex-2': '-', 'lex-1': 'sellers', 'lex-0': 'of',
                              'lex0': 'resort', 'lex1': 'who', 'lex2': '-'})
    # NOTE(review): the right-child keys mix the prefixes 'chrrpos' and
    # 'chrlex' -- confirm the intended naming before building real features.
    example_features1.update({'chrrpos-1': 'DT', 'chrrpos0': 'JJ',
                              'chrlex-1': 'the', 'chrlex0': 'last'})
    example_features1.update({'chlpos1': 'VBD', 'chllex1': 'were'})

    example_features2 = {'pos-2': 'NNS', 'pos-1': 'IN', 'pos-0': 'NN',
                         'pos0': 'WP', 'pos1': ':'}
    example_features2.update({'lex-2': 'sellers', 'lex-1': 'of',
                              'lex-0': 'resort', 'lex0': 'who', 'lex1': '-'})
    example_features2.update({'chrrpos-2': 'DT', 'chrrpos-0': 'JJ',
                              'chrlex-2': 'the', 'chrlex-0': 'last'})
    example_features2.update({'chlpos0': 'VBD', 'chllex0': 'were'})

    # this feature matrix has two entries
    example_features = [example_features1, example_features2]
    # each time we consider two words, get correct action
    # (shift, right, left), put in the answer vector
    example_answers = np.array(['right', 'left'])

    # dictvectorize to turn strings into numerical values
    vec = dv()
    example_array = vec.fit_transform(example_features).toarray()
    # later: array = vec.fit_transform(features).toarray()

    # TODO: a later goal: once we have the matrix, sort and split label
    # (left, right, shift) data so that we can run three different models

    # use sklearn svm to come up with a model
    clf = svm.SVC()
    clf.fit(example_array, example_answers)
    # later: clf.fit(array, answers)

    # persist the model in a pickle; the context manager closes the file
    # even if pickle.dump raises
    with open('svm.pkl', 'wb') as pkl:
        pickle.dump(clf, pkl)
    return clf
'''test'''
def test(model, data):
    """Predict a tree for every datum in ``data``.

    Args:
        model: Fitted classifier handed on to ``make_tree``. If ``None``,
            the classifier is loaded from ``svm.pkl`` in the current working
            directory instead.
        data: Iterable of test items, one per sentence.

    Returns:
        A list with the result of ``make_tree`` for each datum, in order.
    """
    if model is None:
        # pickle.load expects an open binary file object, not a path string,
        # which is why the earlier `pickle.load('svm.pkl')` attempt failed.
        # Only unpickle files this project wrote itself: unpickling
        # untrusted data can execute arbitrary code.
        with open('svm.pkl', 'rb') as pkl:
            model = pickle.load(pkl)

    trees = []
    # for each datum in the test data, estimate what the tree should be
    for datum in data:
        # TODO: format sentence in proper way from datum; until then the
        # datum is passed through unchanged (previously `sentence` was never
        # assigned, so this loop raised NameError).
        sentence = datum
        trees.append(make_tree(model, sentence))
    return trees
'''function for both train and test that iterates over a sentence, constructing the tree and either
train: outputting the features and answers as it goes along to construct the matrix for svm or
test: outputting the predicted tree'''
def make_tree(model, sentence, train=False):
    """Sweep over a sentence, applying one parser action per adjacent pair.

    Used by both training and testing. ``i`` is a 1-based focus position in
    ``T``; repeated left-to-right passes are made until a whole pass finishes
    without any 'left' or 'right' action.

    Args:
        model: Object with a ``predict`` method; only used when ``train`` is
            false.
        sentence: Sequence of nodes. It is NOT copied: ``T`` aliases it and
            is passed to ``construction`` as-is.
        train: When true, return ``(features, answers)`` instead of the tree.

    Returns:
        ``(features, answers)`` if ``train`` is true, otherwise ``T``.

    Raises:
        NotImplementedError: In train mode, once a pair of nodes has to be
            processed, because the gold-action lookup is still a TODO.
    """
    # NOTE(review): get_contextual_features is not defined in the visible
    # part of this file -- confirm where it is supposed to come from.
    i = 1
    T = sentence
    no_construction = True  # was the undefined name `true` (NameError)
    features = []
    answers = []
    while len(T) >= 1:
        if i == len(T):
            # End of a pass: stop if nothing was built, else start over.
            if no_construction:
                break
            no_construction = True
            i = 1
        else:
            x = get_contextual_features(T, i)
            if train:
                # TODO: get the action from the answer key
                # TODO: append feature rows and answer vector
                # Fail explicitly: falling through would hit an unbound `y`.
                raise NotImplementedError(
                    "make_tree(train=True) cannot yet look up the gold action")
            else:
                y = model.predict(x)
            construction(T, i, y)
            if y == 'left' or y == 'right':
                # i stays put so the same position is examined again on the
                # next iteration. Termination relies on construction()
                # removing a node from T for these actions.
                no_construction = False
            else:
                # Any other action (shift) moves the focus one to the right;
                # without this the loop would never advance.
                i += 1
    if train:
        return features, answers
    return T
'''the function that does the actual tree construction after an action has been selected for
the current two words'''
def construction(T, i, y):
    """Placeholder for applying action ``y`` at position ``i`` of ``T``.

    Currently does nothing and returns ``None``.
    """
    # TODO: make the nodes at i and i+1 in T take the action recommended by y
    return None
'''evaluate: the function that will evaluate the test results'''
def evaluate(trees):
    """Walk over the predicted trees; the scoring itself is still a TODO.

    Args:
        trees: Iterable of predicted trees.

    Returns:
        ``None`` for now; no statistics are computed yet.
    """
    # Iterate the `trees` argument. The loop previously read `tree in tree`,
    # which raised UnboundLocalError before the first iteration.
    for tree in trees:
        # TODO: do evaluation to see if the tree is correct
        # TODO: add these stats to overall stats
        pass
    # TODO: return overall stats
def main():
    """Script entry point: load the example data, then train the model."""
    import_data()
    classifier = train()
    # The test and evaluation stages are not wired up yet:
    # trees = test(classifier, test_data)
    # stats = evaluate(trees)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()