/
wiki_analysis.py
36 lines (24 loc) · 1.06 KB
/
wiki_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
"""Special routines powered by NLP analysis on Wikipedia pages / using
Wikipedia pages."""
import re
from lxml import etree
import wikipedia
PARENTHETICAL_EXPRESSION = re.compile(r'(?<=\s)\([^)]+?\)\s*(\.?)')
def get_definition(term, document=None):
"""Get a definition for a term. Caller can optionally provide the
`lxml` document in which the term was first found (and thus where a
definition may also be found)."""
# First stab: try to fetch a relevant article
page_name = wikipedia.get_page_for_query(term)
document = wikipedia.fetch_page((None, page_name, None))
if document is None:
# TODO: Other extraction method
return
content_el = document.xpath('//div[@id="mw-content-text"]/p[1]')[0]
text_content = etree.tostring(content_el, method='text',
encoding='utf-8')
text_content = PARENTHETICAL_EXPRESSION.sub(r'\1', text_content)
# Grab first sentence.
# TODO: Better sentence segmentation
sentence = text_content.split('.')[0]
return sentence.strip() + '.'