"""
This module extracts the sentences from the SNLI corpus:
Samuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning. 2015.
A large annotated corpus for learning natural language inference.
In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing (EMNLP).
http://nlp.stanford.edu/projects/snli/
"""
__author__ = 'david_torrejon'
from __future__ import print_function  # keep print() consistent across Python 2/3

import re
import string

import numpy as np
import pandas as pd
# note: text_to_word_sequence/base_filter as used here come from a pre-2.0
# version of Keras; base_filter() was removed in later releases.
from keras.preprocessing.text import text_to_word_sequence, base_filter
from keras.preprocessing.sequence import pad_sequences
from random import seed, uniform
from sys import stdout

tokenizing_errors = 0

def build_glove_dictionary():
    """
    Builds a dictionary from the pretrained GloVe model:
    http://nlp.stanford.edu/projects/glove/
    The dictionary has the form key = token, value = numpy array with the
    pretrained values. The choice of GloVe file matters a lot: the large
    840B-token model covers nearly every token, while the smallest one
    misses many.
    """
    print('building glove dictionary...')
    glove_file = '../TBIR/glove.840B.300d.txt'
    glove_dict = {}
    with open(glove_file) as fd_glove:
        for i, line in enumerate(fd_glove):
            stdout.write("\rloading glove dictionary: %d" % i)
            stdout.flush()
            line_split = line.split(" ")
            key = line_split[0]  # the token itself
            values = [float(value) for value in line_split[1:]]  # its pretrained weights
            glove_dict[key] = np.asarray(values)
    print("")
    return glove_dict
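
# Illustrative sketch, not part of the original pipeline: it shows the GloVe
# line format the loader above assumes. The sample line is fabricated and far
# shorter than the real 300 dimensions.
def _demo_glove_line():
    sample_line = "the 0.418 0.24968 -0.41242"
    parts = sample_line.split(" ")
    token, vector = parts[0], np.asarray([float(v) for v in parts[1:]])
    print(token, vector)  # -> the [ 0.418    0.24968 -0.41242]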

def return_sparse_vector(sentence, vocab_size):
    """
    @params:
    sentence: array with the encoded sentence, e.g. [1 534 232 ... 3 ... 0]
    returns an array of len(sentence) one-hot encoded vectors of size vocab_size
    """
    sparse_vector = []
    for item in sentence:
        one_hot_vector = np.zeros(vocab_size)
        one_hot_vector[item] = 1
        sparse_vector.append(one_hot_vector)
    return np.asarray(sparse_vector)
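
# A minimal usage sketch (the vocabulary size and indices below are made up):
# encode a two-token sentence over a 5-word vocabulary and inspect the rows.
def _demo_sparse_vector():
    encoded = np.array([1, 3])  # hypothetical word indices
    one_hot = return_sparse_vector(encoded, vocab_size=5)
    print(one_hot[0])  # -> [0. 1. 0. 0. 0.]
    print(one_hot[1])  # -> [0. 0. 0. 1. 0.]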

def read_json_file():
    file_to_read = 'snli_1.0/snli_1.0_train.jsonl'
    try:
        # read the whole file into a python list, one JSON object per line
        print('Opening File ' + file_to_read)
        with open(file_to_read, 'r') as f:
            data = f.readlines()
        '''
        Each element of 'data' is an individual JSON object.
        To load everything at once, convert it into an *array* of JSON
        objects, which is itself one large JSON object: add square brackets
        at the beginning and end, and separate the individual objects
        with commas.
        '''
        data_json_str = "[" + ','.join(data) + "]"
        # now, load it into pandas
        data_df = pd.read_json(data_json_str)
        # Only sentence1, sentence2, pairID and the labels are needed.
        """
        sample
        {"annotator_labels": ["neutral"], "captionID": "3416050480.jpg#4", "gold_label": "neutral", "pairID": "3416050480.jpg#4r1n", "sentence1": "A person on a horse jumps over a broken down airplane.",
        "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )",
        "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))",
        "sentence2": "A person is training his horse for a competition.", "sentence2_binary_parse": "( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )",
        "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. .)))"}
        """
        del data_df['captionID']
        del data_df['sentence1_binary_parse']
        del data_df['sentence1_parse']
        del data_df['sentence2_binary_parse']
        del data_df['sentence2_parse']
        # count how often the annotators fully agree, or produce 2 or 3 distinct labels
        annotator_labels = data_df['annotator_labels'].tolist()
        counter_1 = 0
        counter_2 = 0
        counter_3 = 0
        for dp in annotator_labels:
            if len(set(dp)) == 1:  # '==' rather than 'is': identity checks on ints are unreliable
                counter_1 += 1
            elif len(set(dp)) == 2:
                counter_2 += 1
            elif len(set(dp)) == 3:
                counter_3 += 1
        print(counter_1, counter_2, counter_3)
        return data_df
    except IOError as e:
        print(e)

def make_unicode(source_text):
    """
    Despite the name, this replaces punctuation characters with spaces
    (it also turns SNLI's '-' gold label into ' ').
    """
    not_letters_or_digits = u'!"#%\'()*+,-./:;<=>?@[\\]^_`{|}~'
    cleaned = ''
    for char in source_text:
        if char in not_letters_or_digits:
            char = " "
        cleaned += char
    return cleaned

def label_output_data(label):
    # ' ' is what make_unicode produces from SNLI's '-' (no gold consensus)
    labels = {'neutral': np.array([0, 1, 0]),
              'entailment': np.array([1, 0, 0]),
              'contradiction': np.array([0, 0, 1]),
              ' ': np.array([0, 0, 0])
              }
    return labels[label]
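
# Small sanity-check sketch showing how the two functions above interact:
# SNLI marks pairs without annotator consensus with gold_label '-', which
# make_unicode turns into ' ' and label_output_data maps to the zero vector.
def _demo_labels():
    print(label_output_data(make_unicode('-')))  # -> [0 0 0]
    print(label_output_data('entailment'))       # -> [1 0 0]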
"""
Generates a encoded vector, each cell 1 numbered refering to a word in the word2idx dictionary
could use keras text to word, but gives some trouble with nonunicode tokens...have to convert to unicode all the time...see glove.py
"""
def create_vectorized_sentence(sentence, word2idx):
vectorized_sentence = []
regex = re.compile('[%s]' % re.escape(string.punctuation))
sentence = regex.sub('', sentence).lower()
tokenized_sentence = sentence.split(" ")
for token in tokenized_sentence:
if word2idx.get(token):
vectorized_sentence.append(word2idx[token])
else:
vectorized_sentence.append(0) # dunno how to deal with mistakes, ask!
return np.asarray(vectorized_sentence)
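
# A minimal sketch with a made-up three-word vocabulary: punctuation is
# stripped, tokens are lowercased, and out-of-vocabulary words map to 0.
def _demo_vectorize():
    word2idx = {'a': 1, 'dog': 2, 'runs': 3}  # toy vocabulary
    print(create_vectorized_sentence("A dog runs fast.", word2idx))
    # -> [1 2 3 0]  ('fast' is out of vocabulary)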

def pad_sentence(sentence, max_len=35, pad_with=0):
    '''
    Pads a single sentence (a vectorized list of tokens, here a numpy array).
    If the sentence is shorter than max_len, returns a vector of length
    max_len padded with pad_with on the right; if it is longer, truncates it
    to the first max_len values.
    '''
    padded_sentence = np.full(max_len, pad_with, dtype=int)
    if len(sentence) < max_len:
        # pad
        for i, value in enumerate(sentence):
            padded_sentence[i] = value
    else:
        # truncate
        for i in range(len(padded_sentence)):
            padded_sentence[i] = sentence[i]
    return padded_sentence
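
# Quick illustration of both branches, with made-up index vectors: a short
# sentence is right-padded, a long one is truncated to the first max_len values.
def _demo_pad_sentence():
    print(pad_sentence(np.array([4, 8, 15]), max_len=5))  # -> [ 4  8 15  0  0]
    print(pad_sentence(np.arange(40), max_len=5))         # -> [0 1 2 3 4]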

def create_sentence_ds(sentences_df, word2idx, cut_ds, maxlen=35):
    # creates pairs of the form [[s1, s2], [encoded s1, encoded s2], label]
    seed(1337)  # great seed; fixed so the random cut is reproducible
    data_set = []
    print('Generating dataset')
    list_premises = sentences_df['sentence1'].tolist()
    list_hypothesis = sentences_df['sentence2'].tolist()
    list_label = sentences_df['gold_label'].tolist()
    for premise, hypothesis, label_text in zip(list_premises, list_hypothesis, list_label):
        num = uniform(0.0, 1.0)
        if num < cut_ds:  # keep roughly a cut_ds fraction of the pairs
            label_no_unicode = make_unicode(label_text)
            numpy_label = label_output_data(label_no_unicode)
            premise_encoded = create_vectorized_sentence(premise, word2idx)
            hypothesis_encoded = create_vectorized_sentence(hypothesis, word2idx)
            padded_premise = pad_sentence(premise_encoded, max_len=maxlen)
            padded_hypothesis = pad_sentence(hypothesis_encoded, max_len=maxlen)
            # ([premise, hypothesis], [encoded premise, hypothesis], one-hot label);
            # the first pair of values is unnecessary right now (27/02), kept for debug purposes
            data_set.append([[premise, hypothesis], [padded_premise, padded_hypothesis], numpy_label])
    return data_set
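
# A toy end-to-end sketch of the dataset builder (the dataframe and word2idx
# below are fabricated; cut_ds=1.0 keeps effectively every pair).
def _demo_dataset():
    toy_df = pd.DataFrame({'sentence1': ['A dog runs.'],
                           'sentence2': ['An animal moves.'],
                           'gold_label': ['entailment']})
    word2idx = {'a': 1, 'dog': 2, 'runs': 3, 'an': 4, 'animal': 5, 'moves': 6}
    ds = create_sentence_ds(toy_df, word2idx, cut_ds=1.0, maxlen=5)
    print(ds[0][1][0])  # padded premise   -> [1 2 3 0 0]
    print(ds[0][2])     # entailment label -> [1 0 0]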

def give_vocabulary(sentences_df):
    '''
    @parameter: the dataframe from the json file with the 5 columns we need
    @returns: the vocabulary as a set, its size, and a word -> index dictionary
    '''
    list_of_sentences1 = sentences_df['sentence1'].tolist()
    list_of_sentences2 = sentences_df['sentence2'].tolist()
    # (a plain-python alternative would lowercase each sentence, split on " "
    # and collect unseen tokens; the Keras tokenizer is used here instead)
    list_sentence_word_tmp = []
    for s1, s2 in zip(list_of_sentences1, list_of_sentences2):
        sentence_unicode1 = make_unicode(s1)
        sentence_unicode2 = make_unicode(s2)
        list_sentence_word_tmp += text_to_word_sequence(sentence_unicode1.encode('ascii'), filters=base_filter(), lower=True, split=" ")
        list_sentence_word_tmp += text_to_word_sequence(sentence_unicode2.encode('ascii'), filters=base_filter(), lower=True, split=" ")
    set_words = set(list_sentence_word_tmp)
    word2idx = {}
    for i, word in enumerate(set_words):
        word2idx[word] = int(i)
    print("length of vocabulary: %d" % len(set_words))
    return set_words, len(set_words), word2idx
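
# A toy sketch of the vocabulary builder (assumes the same pre-2.0 Keras
# tokenizer as above; the two sentences are fabricated).
def _demo_vocabulary():
    toy_df = pd.DataFrame({'sentence1': ['A dog runs.'],
                           'sentence2': ['An animal is outside.']})
    vocab, vocab_size, word2idx = give_vocabulary(toy_df)
    print(vocab_size)  # -> 7 distinct lowercase tokens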

# simple test: extract the json file and show the size of the vocabulary
def test():
    df_data = read_json_file()
    vocab, len_vocab, word2idx = give_vocabulary(df_data)

def test_labeling():
    labels = ['neutral', 'entailment', 'contradiction']
    for label in labels:
        print(label_output_data(label))
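
# Assuming the module is meant to be exercised directly (the tests above are
# defined but never invoked), a guard like this runs the label check without
# needing the corpus on disk:
if __name__ == '__main__':
    test_labeling()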