/
fig_detect.py
227 lines (150 loc) · 5.91 KB
/
fig_detect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#!/usr/bin/python
"""
figurative speech detection
By: Dominic Doyle
05.04.2016
"""
from pattern.en import wordnet , lexeme , parsetree , singularize
from pattern.search import search
from sys import argv
def upward_chain(synsets, recur=False):
    """
    Gather the hypernyms of every synset in ``synsets`` into one flat list.

    synsets: iterable of objects exposing a ``hypernyms()`` method
    recur:   False -> ``hypernyms()`` is called on each synset and the
                      combined "first roots" are printed
             True  -> ``hypernyms(recursive=True)`` is called instead and
                      nothing is printed
    returns: list with the hypernyms of all synsets, in input order
    """
    if recur:
        chains = [syn.hypernyms(recursive=True) for syn in synsets]
    else:
        chains = [syn.hypernyms() for syn in synsets]

    # Flatten the per-synset results while keeping their order.
    roots = [match for chain in chains for match in chain]

    if not recur:
        print("first roots {0} for {1}".format(roots, synsets))
    return roots
def is_descendant(NP1, NP2, ancest=False):
    """
    Tell whether the first synset of NP2 is a hypernym of a synset in NP1.

    Synsets are compared through their first synonym (``synonyms[0]``).
    When a match is found, a message naming the kind of hypernym
    (immediate or ancestral) is printed.

    NP1:    list of synsets whose hypernyms are searched
    NP2:    list of synsets; only ``NP2[0]`` is looked for
    ancest: False -> only the hypernyms returned by upward_chain(NP1, False)
            True  -> the recursive hypernyms, upward_chain(NP1, True)
    returns: True when NP2[0] is found among the hypernyms of NP1,
             False otherwise (including when NP2 is empty)
    """
    # Nothing to look for; this also avoids an IndexError on NP2[0].
    if not NP2:
        return False

    target = NP2[0].synonyms[0]
    chain_names = [entry.synonyms[0] for entry in upward_chain(NP1, ancest)]
    if target not in chain_names:
        return False

    if ancest:
        print("{n2} is an ancestral hypernym of {n1}".format(n2=NP2, n1=NP1))
    else:
        print("{n2} is an immediate hypernym of {n1}".format(n2=NP2, n1=NP1))
    return True
def detect_figa(phrase):
    """
    Classify a 'simple' phrase of the form NP1 + is/are/was/were + NP2 as
    figurative speech, fact, falsehood or generalization.

    The phrase is parsed with parsetree() and searched for the pattern;
    both noun phrases of a match are resolved with unify_query() and the
    resulting synsets are handed to determine_relation().

    phrase:  the text to analyse
    returns: the relation string of the last match, or "" when the
             pattern does not occur in the phrase
    """
    # "a type of" / "an example of" are not part of the pattern for now.
    criteria = '{NP} {is|are|was|were} {NP}'
    parset = parsetree(phrase)
    hits = search(criteria, parset)
    print("Parse tree: {pt}".format(pt=parset))

    status = ""
    # Every match is examined; a later match overwrites an earlier status.
    for hit in hits:
        first_np = hit.group(1)   # word objects of the subject
        verb = hit.group(2)
        second_np = hit.group(3)  # word objects of the complement
        print("NP1: {grp1}".format(grp1=first_np))
        print("conj: {conj}".format(conj=verb))
        print("NP2: {grp2}".format(grp2=second_np))

        subject_synsets = unify_query(first_np)
        complement_synsets = unify_query(second_np)
        status = determine_relation(subject_synsets, complement_synsets)
    return status
def determine_relation(syn1, syn2):
    """
    Describe how the synsets of two noun phrases relate to each other.

    syn1, syn2: lists of synsets for the first and second noun phrase
    returns:    "undetermined, entries not in wordnet" when either list is
                empty; otherwise the label of the first hypernym check
                that succeeds, or the figurative-speech label when none
                of them does

    When both lists are non-empty, the common ancestor and the similarity
    of the two leading synsets are printed before the checks run.
    """
    if not (len(syn1) > 0 and len(syn2) > 0):
        return "undetermined, entries not in wordnet"

    lead1, lead2 = syn1[0], syn2[0]
    print('Common Ancestor: {0}'.format(wordnet.ancestor(lead1, lead2)))
    print('Similarity measure between synsets: {0}'.format(wordnet.similarity(lead1, lead2)))

    # Checked strictly in this order; the first hit decides the label.
    checks = (
        (syn1, syn2, False, "is a verifiable fact"),
        (syn1, syn2, True, "is a verifiable true over-generalization"),
        (syn2, syn1, False, "is a verifiable falsehood"),
        (syn2, syn1, True, "is a verifiable false over-generalization"),
    )
    for lower, upper, deep, verdict in checks:
        if is_descendant(lower, upper, deep):
            return verdict
    return "figurative speech (two entries, with no roots)"
def build_wn_query(wds):
    """
    Build a wordnet query string from the words of a noun phrase.

    1) determiners, adjectives and every other non-noun word are ignored:
       only words tagged NN, NNP, NNS, NNP-PERS or NNP-LOC are kept
    2) the remaining nouns are joined with single spaces, so compound
       nouns survive as one query
    Singularization is not done here; the caller has to take care of it.

    Because of the tagger, chunks made only of JJ words sometimes show up
    as NPs. Such a chunk contains no noun, and the query is then the empty
    string.

    wds:     iterable of word objects exposing ``type`` (POS tag) and
             ``string`` (the token text)
    returns: the query string, '' when the phrase holds no noun
    """
    noun_tags = ('NN', 'NNP', 'NNS', 'NNP-PERS', 'NNP-LOC')
    nouns = [wd for wd in wds if wd.type in noun_tags]
    print(nouns)
    # join() copes with zero, one or many nouns, so no nouns[0] lookup
    # (and no IndexError on a noun-less chunk) is needed.
    return ' '.join(noun.string for noun in nouns).strip()
def unify_query(query):
    """
    Look up a noun phrase in wordnet.

    The phrase is reduced to a query string by build_wn_query(), then
    singularized and searched as a noun. If that finds nothing and the
    query has several words, the last word alone is tried, on the
    assumption that it is the head of a compound noun (e.g. "thrill ride"
    fails, "ride" doesn't).

    query:   list of Word objects forming one noun phrase
    returns: the synsets found by wordnet.synsets(); an empty list when
             nothing matches or the phrase yields an empty query
    """
    query = build_wn_query(query)
    print('wordnet query: {0}'.format(query))
    if not query:
        # Nothing to search for.
        return []

    s = wordnet.synsets(singularize(query), pos=wordnet.NOUN)
    if s:
        return s

    print('no entry for {0}..'.format(query))
    words = query.split()
    if len(words) < 2:
        # A single word has no head noun to fall back on; indexing the
        # second word here used to raise IndexError.
        return s

    # This is a bit hacky: it assumes the last word is the head noun.
    head = words[-1]
    s = wordnet.synsets(singularize(head), pos=wordnet.NOUN)
    if not s:
        print('no entry for {0}'.format(head))
    return s
def main(args):
    """
    Run detect_figa() on every line of an input file and append the
    numbered results to an output file.

    args: command-line style list:
          args[1] -> path of the input file, one example per line
          args[2] -> output name; ".txt" is appended and the file is
                     opened in append mode
    raises: SystemExit with a usage message when fewer than two
            arguments follow the program name

    Known results:
        her eyes were jewels : figurative detected (correct)
        a car is a vehicle   : non figurative detected (correct)
        success is a bastard : figurative (incorrect)
    """
    if len(args) < 3:
        prog = args[0] if args else "fig_detect.py"
        raise SystemExit("usage: {0} <input file> <output name>".format(prog))

    # Both files are managed by the with-statement, so they are closed
    # even when detect_figa() raises. The input is opened first so that
    # a missing input file does not leave an empty output file behind.
    with open(args[1], 'r') as f, open(args[2] + ".txt", 'a+') as wout:
        for num, ex in enumerate(f):
            print("example: {0}".format(ex))
            # ex still carries its line ending, so the verdict lands on
            # the line below the example.
            wout.write(str(num) + ") " + ex)
            wout.write(" -> " + detect_figa(ex))
            wout.write("\n")
# Run as a script: hand the raw command-line argument list to main().
if __name__ == '__main__':
    main(argv)