-
Notifications
You must be signed in to change notification settings - Fork 0
/
nominal_features.py
90 lines (77 loc) · 2.64 KB
/
nominal_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from nltk.corpus import wordnet as wn
from nltk.metrics import jaccard_distance
from nltk.corpus import brown
from nltk.util import ngrams
from collections import Counter
import string
"""
NOTE: synsets 'pos' argument accepts only four possible attributes.
They are as follows:
VERB
NOUN
ADV
ADJ
POS tags of the words have to be classified into the above categories
before the functions can be used.
"""
"""
Dictionary that stores POS of POS tags
"""
pos = {
'JJ': 'ADJ',
'NN': 'NOUN',
'RB': 'ADV',
'VB': 'VERB'
}
"""
hypernym takes input of a list containg (word,pos_tag) tuple pairs
and return a list of hypernyms.
"""
def hypernym(words):
hypernyms = []
for word in words:
arg = wn.synsets(word[0], pos=getattr(wn, pos[word[1][:2]]))[0]
hypernyms.append(arg.hypernyms()[0].name().split('.')[0])
return hypernyms
"""
lowest_common_hypernym takes input a list containing the
[(nominal1,pos_tag),(nominal2,pos_tag)] tuple pairs and
returns lowest common hypernym.
"""
def lowest_common_hypernym(words):
lch = []
for word in words:
arg_1 = wn.synsets(words[0][0], pos=getattr(wn, pos[words[0][1]]))[0]
arg_2 = wn.synsets(words[1][0], pos=getattr(wn, pos[words[1][1]]))[0]
lch.append(arg_1.lowest_common_hypernyms(arg_2)[0].name().split('.')[0])
return lch
"""
jaccard_common returns two values. First value is a list containing the four most
common objects of each nominal. Second value is the Jaccard distance of the two
nominal sets.
"""
def jaccard_common(nominals):
sents = brown.sents()
sents_no_punct = []
for sent in sents:
sents_no_punct.append(
[''.join(c for c in s if c not in string.punctuation) for s in sent])
sents_no_punct = [words for sent in sents_no_punct for words in sent]
sents_no_punct = [word for word in sents_no_punct if word]
five_grams = ngrams(sents_no_punct, 5)
e1_words, e2_words = [], []
for five_gram in five_grams:
if nominals[0] in five_gram:
for word in five_gram:
if word != nominals[0]:
e1_words.append(word)
elif nominals[1] in five_gram:
for word in five_gram:
if word != nominals[1]:
e2_words.append(word)
e1_top, e2_top = [], []
e1_count = Counter(e1_words)
e2_count = Counter(e2_words)
e1_top = [word[0] for word in e1_count.most_common(4)]
e2_top = [word[0] for word in e2_count.most_common(4)]
return [e1_top, e2_top], jaccard_distance(set(Counter(e1_words).keys()), set(Counter(e2_words).keys()))