/
datamap.py
229 lines (207 loc) · 7.16 KB
/
datamap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import requests
from typing import Tuple, FrozenSet, Set, Union
from jellyfish import soundex
from enum import Enum
from wiktionaryparser import WiktionaryParser
import regex as re
from wordnik import swagger, WordApi
from config import config
from joblib import Memory
from morph import flatten
from time import sleep
from urllib.error import HTTPError
class Relation(Enum):
    """
    The possible types of relationship between two embeddings.

    The numeric values double as a priority: DataMap.relation sorts its
    candidate relations by value and keeps the lowest one.
    """
    SYNONYM = 7
    COHYPO = 5
    HYPO = 2
    RELATED = 8
    MORPHO = 9
    PARTOF = 1
    HYPERNYM = 4
class DataMap:
    """
    Labels the semantic relationship between two words.

    Evidence comes from three remote sources: the Wikipedia API (page id,
    links and categories), Wordnik (related words) and Wiktionary
    (definitions, through WiktionaryParser). Most lookups are wrapped in a
    joblib ``Memory`` cache stored under ``CACHE_DIRECTORY``.
    """

    # Shape of the value returned by get_metadata:
    # (Wikipedia page id, link titles, category titles).
    Metadata = Tuple[int, FrozenSet[str], FrozenSet[str]]

    # Wordnik relationship types accepted as evidence for each Relation.
    relations = {
        Relation.SYNONYM: frozenset(['etymologically-related-term', 'synonym']),
        Relation.RELATED: frozenset(['cross-reference']),
        Relation.HYPERNYM: frozenset(['hypernym']),
        Relation.MORPHO: frozenset(['rhyme']),
        Relation.COHYPO: frozenset(['same-context'])
    }

    # URL for the Wikipedia API
    WIKIPEDIA_API_URL = 'http://en.wikipedia.org/w/api.php'
    WORDNIK_API_URL = 'http://api.wordnik.com/v4'
    WORDNIK_API_KEY = config.get('wordnik', 'API key')

    # MediaWiki namespace ids of the links kept by get_metadata. Despite the
    # attribute name these are namespaces, not categories.
    WANTED_CATEGORIES = [
        14,  # Category namespace
        100,  # Portal namespace
    ]

    # Seconds to wait for a Wikipedia response before giving up, so that a
    # stalled connection cannot block a labelling run forever.
    REQUEST_TIMEOUT = 10

    CACHE_DIRECTORY = config.get('files', 'data cache')
    memory = Memory(CACHE_DIRECTORY, verbose=0)
    parser = WiktionaryParser()

    @classmethod
    @memory.cache
    def get_metadata(cls, word: str) -> Tuple[int, FrozenSet[str], FrozenSet[str]]:
        """
        Fetches information from Wikipedia based on a given word.

        The word is searched on Wikipedia and the first hit is parsed.

        :param word: The provided word.
        :return: A tuple made of the page id of the first search hit, the
            titles of its links whose namespace is in WANTED_CATEGORIES, and
            the titles of its categories that carry no ``hidden`` key.
        :raises IndexError: If the search returns no page at all.
        """
        search = requests.get(
            url=cls.WIKIPEDIA_API_URL,
            params=dict(
                action='query',
                srsearch=word,
                format='json',
                list='search'),
            timeout=cls.REQUEST_TIMEOUT)
        # Only the best-ranked search hit is considered.
        page_id = search.json()['query']['search'][0]['pageid']

        parsed = requests.get(
            url=cls.WIKIPEDIA_API_URL,
            params=dict(
                action='parse',
                pageid=page_id,
                format='json',
                prop='categories|links'),
            timeout=cls.REQUEST_TIMEOUT)
        page = parsed.json()['parse']

        links = frozenset(
            link['*'] for link in page['links']
            if link['ns'] in cls.WANTED_CATEGORIES)
        categories = frozenset(
            category['*'] for category in page['categories']
            if "hidden" not in category)
        return page_id, links, categories

    @staticmethod
    @memory.cache
    def is_synonym(a: Metadata, b: Metadata) -> bool:
        """
        :param a: Metadata of a word, as returned by get_metadata.
        :param b: Metadata of another word.
        :return: True if both words resolved to the same Wikipedia page.
        """
        return a[0] == b[0]

    @staticmethod
    @memory.cache
    def cohyponyme(a: Metadata, b: Metadata) -> bool:
        """
        :param a: Metadata of a word, as returned by get_metadata.
        :param b: Metadata of another word.
        :return: True if the two pages share at least one link or at least
            one category.
        """
        return bool(a[1] & b[1]) or bool(a[2] & b[2])

    @classmethod
    @memory.cache
    def hyponyme(cls, word: str, a: Metadata, b: Metadata) -> bool:
        """
        :param word: The source word.
        :param a: Metadata of the source word.
        :param b: Metadata of the target word.
        :return: True if a and b are co-hyponyms and the source word appears
            inside one of the link or category titles of b.
        """
        if not cls.cohyponyme(a, b):
            return False
        return any(word in part for part in b[1] | b[2])

    @classmethod
    @memory.cache
    def part_of(cls, a: str, b: str) -> bool:
        """
        Returns whether word a is semantically a part of b.

        The Wiktionary definitions of b are scanned for a phrase such as
        "part of" or "element of" followed by a or one of its synonyms.

        :param a: a word
        :param b: a word
        :return: True if one definition of b matches, False otherwise.
        """
        entries = cls.parser.fetch(b)
        # get_synonyms returns an empty set when Wordnik does not know the
        # word; fall back to the word itself so the pattern never ends in an
        # empty alternative, which would match every "part of" sentence.
        terms = {term for term in (cls.get_synonyms(a) or {a}) if term}
        if not terms:
            return False
        # Words are data, not regex syntax: escape them before splicing.
        alternatives = '|'.join(re.escape(term) for term in terms)
        pattern = re.compile(
            f'(?:parts? of|elements? of|portions? of|in an?).*(?:{alternatives})')
        try:
            senses = flatten([entry['definitions'] for entry in entries])
            texts = flatten([sense['text'] for sense in senses])
        except IndexError:
            return False
        return any(pattern.search(text) is not None for text in texts)

    @staticmethod
    @memory.cache
    def morphologique(a: str, b: str) -> bool:
        """
        :param a: a word
        :param b: a word
        :return: True if both a and b have the same Soundex code.
        """
        return soundex(a) == soundex(b)

    @classmethod
    @memory.cache
    def defintion(cls, word: str):
        """
        Fetches the related words of a word from Wordnik.

        The misspelt name is kept because it is part of the class interface.

        :param word: The word to look up.
        :return: Whatever ``WordApi.getRelatedWords`` returns; the callers in
            this class read ``relationshipType`` and ``words`` on each item.
            NOTE(review): the client appears to return None when Wordnik
            sends an empty response - callers here treat None as empty.
        """
        client = swagger.ApiClient(cls.WORDNIK_API_KEY, cls.WORDNIK_API_URL)
        word_api = WordApi.WordApi(client)
        return word_api.getRelatedWords(word)

    @classmethod
    @memory.cache
    def get_synonyms(cls, word: str) -> Set[str]:
        """
        Collects the synonyms of a word according to Wordnik.

        :param word: The word to look up.
        :return: A set holding the word and its synonyms, or an empty set
            when Wordnik answers 404 for the word.
        :raises HTTPError: For any Wordnik HTTP error other than 404.
        """
        try:
            groups = cls.defintion(word)
        except HTTPError as e:
            if e.code == 404:
                return set()
            raise
        synonyms = {word}
        for group in groups or ():
            # NOTE(review): this keeps only groups that already list the word
            # itself, which may discard most Wordnik results - confirm that
            # this filter is intended.
            if word not in group.words:
                continue
            if group.relationshipType in cls.relations[Relation.SYNONYM]:
                synonyms.update(group.words)
        return synonyms

    @classmethod
    @memory.cache
    def relation(cls, a: str, b: str) -> Union[bool, Relation]:
        """
        Tries to use the Wordnik related words of b to deduce a relationship
        from word a to word b.

        :param a: Source word.
        :param b: Target word.
        :return: The matching relationship with the lowest enum value, or
            False when none is found or Wordnik answers 404 for b.
        :raises HTTPError: For any Wordnik HTTP error other than 404.
        """
        try:
            groups = cls.defintion(b)
        except HTTPError as e:
            if e.code == 404:
                return False
            raise
        candidates = []
        for group in groups or ():
            # Substring match: a only has to appear inside a related word.
            if not any(a in word for word in group.words):
                continue
            candidates.extend(
                relation for relation, types in cls.relations.items()
                if group.relationshipType in types)
        if not candidates:
            return False
        return min(candidates, key=lambda r: r.value)

    @classmethod
    def étiqueter(cls, a: str, b: str, *, delay: float = 10) -> Relation:
        """
        Tries to label a semantic relationship from word a to word b
        automatically ("étiqueter" is French for "to label"). Very
        approximate at best. Inaccurate at worst.

        The checks run from the most to the least specific; the first one
        that succeeds decides the label.

        :param a: Source word.
        :param b: Target word.
        :param delay: Seconds to sleep before doing any lookup (presumably to
            throttle calls to the remote APIs); zero or less disables it.
        :return: Relationship. Relation.RELATED is the fallback, including
            when Wikipedia has no page for either word.
        """
        if delay > 0:
            sleep(delay)
        try:
            m_a = cls.get_metadata(a)
            m_b = cls.get_metadata(b)
        except IndexError:
            # get_metadata found no Wikipedia page for one of the words.
            return Relation.RELATED
        if cls.is_synonym(m_a, m_b):
            return Relation.SYNONYM
        result = cls.relation(a, b)
        if result:
            return result
        if cls.part_of(a, b):
            return Relation.PARTOF
        if cls.hyponyme(a, m_a, m_b):
            return Relation.HYPO
        if cls.cohyponyme(m_a, m_b):
            return Relation.COHYPO
        if cls.morphologique(a, b):
            return Relation.MORPHO
        return Relation.RELATED