-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
96 lines (80 loc) · 3.2 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import wikipedia
# Language names: the values compared against Scraper.language / the `lang`
# argument, and returned as the third element of Scraper.get_query's result.
ENGLISH = 'english'
INDONESIAN = 'indonesian'
# Language codes handed to wikipedia.set_lang() for the names above.
EN = 'en'
ID = 'id'
class Scraper:
    """Look up Wikipedia articles in English or Indonesian.

    All network access goes through the ``wikipedia`` package. The language
    is selected with ``wikipedia.set_lang`` before every lookup, which is a
    module-level setting of that package rather than per-instance state.
    """

    def __init__(self, language):
        """Remember the default language name (``ENGLISH`` or ``INDONESIAN``)."""
        self.language = language

    @staticmethod
    def _set_language(lang):
        """Point the wikipedia package at Indonesian for INDONESIAN, else English."""
        wikipedia.set_lang(ID if lang == INDONESIAN else EN)

    @staticmethod
    def _extract_intro(content, strip_chars=None):
        """Return the text before the first ``==`` heading marker.

        Args:
            content: full article text, or None.
            strip_chars: characters to strip from the right end; None strips
                all trailing whitespace (``str.rstrip`` default).

        Returns:
            The intro text, or None when ``content`` is None.
        """
        if content is None:
            return None
        # Section headings look like "== Title ==", so everything before the
        # first "==" is the lead section.
        return content.split('==', 1)[0].rstrip(strip_chars)

    def get_first_page(self, possible_page_titles):
        """Return the content of the page named by the first title.

        Args:
            possible_page_titles: sequence of candidate page titles.

        Returns:
            The page content, or None when no candidate title is given.

        Exceptions raised by ``wikipedia.page`` are not caught here.
        """
        if not possible_page_titles:
            # Callers already treat None as "nothing found"; indexing an
            # empty list would raise IndexError instead.
            return None
        return wikipedia.page(possible_page_titles[0]).content

    def get(self, query):
        """Return the content of the best-matching page for ``query``.

        Searches in ``self.language``; when the search has no hits, retries
        once with the suggestion offered by ``wikipedia.suggest``.

        Returns:
            The page content, or None when neither the query nor its
            suggestion yields a title.

        Only ``DisambiguationError`` is handled (by loading the first listed
        option); any other exception from the wikipedia package propagates,
        including one raised while loading that first option.
        """
        self._set_language(self.language)
        try:
            titles = wikipedia.search(query)
            if not titles:
                suggestion = wikipedia.suggest(query)
                if not suggestion:
                    # No suggestion available: do not search for None.
                    return None
                titles = wikipedia.search(suggestion)
            return self.get_first_page(titles)
        except wikipedia.exceptions.DisambiguationError as e:
            return self.get_first_page(e.options)

    def get_intro(self, query):
        """Return the lead section of the page found by :meth:`get`.

        Trailing newlines and carriage returns are removed. Returns None
        when :meth:`get` finds nothing.
        """
        return self._extract_intro(self.get(query), '\n\r')

    def get_query(self, query, isInverse=False):
        """Resolve ``query`` to a page title without fetching the page.

        Args:
            query: search text.
            isInverse: when true, search English even if ``self.language``
                is INDONESIAN.
                NOTE(review): when ``self.language`` is not INDONESIAN this
                flag has no effect (English is used either way) - confirm
                that this is intended.

        Returns:
            A tuple ``(title, status, lang)``:
            ``(first_title, 0, lang)`` for a direct search hit or the first
            disambiguation option; ``(first_title, -1, lang)`` when the hit
            came from the suggested query; ``([], -1, lang)`` when nothing
            was found. ``lang`` is ``INDONESIAN`` or ``ENGLISH``.
        """
        use_indonesian = self.language == INDONESIAN and not isInverse
        lang = INDONESIAN if use_indonesian else ENGLISH
        self._set_language(lang)
        try:
            titles = wikipedia.search(query)
            if titles:
                return titles[0], 0, lang
            suggestion = wikipedia.suggest(query)
            if not suggestion:
                return [], -1, lang
            titles = wikipedia.search(suggestion)
            if titles:
                return titles[0], -1, lang
        except wikipedia.exceptions.DisambiguationError as e:
            if e.options:
                return e.options[0], 0, lang
        return [], -1, lang

    def get_intro_lang(self, query, lang):
        """Return the lead section of the page titled ``query`` in ``lang``.

        Unlike :meth:`get_intro`, ``query`` is used directly as the page
        title (no search) and the language comes from ``lang`` rather than
        ``self.language``. On ``DisambiguationError`` the first listed
        option is loaded instead.

        Returns:
            The intro with all trailing whitespace removed, or None when no
            content is available.
        """
        self._set_language(lang)
        try:
            content = self.get_first_page([query])
        except wikipedia.exceptions.DisambiguationError as e:
            content = self.get_first_page(e.options)
        # Strip whitespace only: a strip set spelled '\n\r\s' would also eat
        # trailing backslashes and the letter "s", because '\s' is not a
        # Python escape sequence.
        return self._extract_intro(content)