-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
284 lines (221 loc) · 9.49 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
import json
import requests
import sys
from episode import Episode
from bs4 import BeautifulSoup
from person import PersonOrChain
from rating import Rating
base_url = "https://doughboys.fandom.com"  # wiki root for all episode pages
episodes = "/wiki/Episodes"  # path of the episode index table
rss_feed = "https://rss.art19.com/doughboys"  # podcast RSS feed (used for durations)
# Fork ratings of the episode most recently fetched by get_episode();
# read later by get_ratings().
current_scores = None
def scrub_string(text):
    """Collapse every run of whitespace in *text* into a single space."""
    pieces = text.split()
    return " ".join(pieces)
# finding generic episode info inside article
def find_gen_info(article, attr_val):
    """Return the text of the infobox field whose data-source is *attr_val*.

    *article* is a BeautifulSoup tag for the page's <article> element.
    """
    field = article.find('div', {"data-source": attr_val})
    return field.find('div', {"class": "pi-data-value pi-font"}).text
def get_duration(number):
    """Return the runtime (in seconds) of episode *number* from the RSS feed.

    Looks up the <itunes:episode> tag whose text equals *number* and converts
    the sibling <itunes:duration> (H:M:S) via get_sec().
    """
    feed = BeautifulSoup(requests.get(rss_feed).text, 'html.parser')
    episode_tag = feed.find('itunes:episode', text=number)
    raw_duration = episode_tag.parent.find('itunes:duration').text
    return get_sec(raw_duration)
def get_sec(time_str):
    """Convert an "H:M:S" string into a total number of seconds."""
    hours, minutes, seconds = (int(part) for part in time_str.split(':'))
    return 3600 * hours + 60 * minutes + seconds
def get_latest_rating(episode_num):
    """Return the highest numeric rating key currently stored in Firebase.

    Rating keys have the form "_<n>"; the largest <n> is returned so the
    caller can number the next batch of ratings. *episode_num* is currently
    unused — a sanity check comparing it against the latest rated episode
    was disabled (it used to return None on a mismatch).
    """
    fb = requests.get("https://doughpedia.firebaseio.com/ratings.json")
    highest_value = 0
    episode = ""
    for key, value in fb.json().items():
        num = int(key.strip("_"))
        if num > highest_value:
            highest_value = num
            # remember which episode the current highest rating belongs to
            episode = int(value['episode'].strip("_"))
    print(highest_value)
    print(episode)
    return highest_value
# retrieves fork rating for a given episode
def get_fork_ratings(url):
    """Scrape the per-person fork-rating table from an episode wiki page.

    Returns a dict mapping person name -> rating string (first token of the
    rating cell). Rows whose person cell contains "shared" and rows with an
    empty rating are skipped.
    """
    html_contents = requests.get(url).text
    soup = BeautifulSoup(html_contents, 'html.parser')
    article = soup.find('article')
    fork_table = article.find('table', {"class": "article-table"})
    fork_table_rows = fork_table.find_all('tr')[1:]  # drop the header row
    fork_ratings = {}
    for row in fork_table_rows:
        entries = row.find_all('td')
        person = scrub_string(entries[0].text)
        if "shared" in person:
            continue
        # the rating cell may contain trailing text; keep the leading token
        rating = scrub_string(entries[-1].text).split(" ")[0]
        # BUG FIX: original tested `rating is ""` — identity comparison with
        # a literal is unreliable; use truthiness to skip empty ratings.
        if not rating:
            continue
        fork_ratings[person] = rating
    return fork_ratings
def get_synopsis(url):
    """Return the first paragraph following the "Synopsis" heading at *url*."""
    page = BeautifulSoup(requests.get(url).text, 'html.parser')
    heading = page.find('article').find('span', {"id": "Synopsis"})
    return heading.findNext('p').text
def get_image(url):
    """Return the episode infobox image URL with its /revision/ suffix stripped."""
    page = BeautifulSoup(requests.get(url).text, 'html.parser')
    figure = page.find('article').find('figure', {"class": "pi-item pi-image"})
    full_href = figure.find('a').attrs['href']
    return full_href.split("/revision/")[0]
# returns list of all regular, numbered episodes
def get_episode_list():
    """Scrape the episode index table and build an Episode per regular episode.

    Rows whose number is not an integer or whose average score is not numeric
    (doubles, tournaments) are skipped.
    """
    soup = BeautifulSoup(requests.get(base_url + episodes).text, 'html.parser')
    rows = soup.find('table').find_all('tr')[1:]
    episode_list = []
    for row in rows:
        cells = row.find_all('td')
        number = scrub_string(cells[0].text)
        avg_score = scrub_string(cells[2].text)
        # filter out double & tournament episodes
        if not (number.isdigit() and avg_score.replace('.', '', 1).isdigit()):
            continue
        link = cells[1].find('a')
        title = scrub_string(link.text)
        print("Fetching Episode " + number + " - " + title)
        href = scrub_string(link.attrs['href'])
        date = scrub_string(cells[3].text)
        fork_ratings = get_fork_ratings(base_url + href)
        episode_list.append(Episode(title, number, href, date, fork_ratings))
    return episode_list
def get_episode(number, new_rating):
    """Scrape full details for episode *number* from the wiki episode index.

    Side effect: stores the scraped fork ratings in the module-level
    ``current_scores`` so get_ratings() can read them afterwards.

    Returns a populated Episode, or None when the matching row is not a
    regular numbered episode (tournament / double).
    """
    html_contents = requests.get(base_url + episodes).text
    soup = BeautifulSoup(html_contents, 'html.parser')
    ep_table = soup.find('table').find_all('tr')[1:]  # drop the header row
    for entry in ep_table:
        columns = entry.find_all('td')
        numFound = scrub_string(columns[0].text)
        avg_score = scrub_string(columns[2].text)
        if numFound == str(number):
            # BUG FIX: the original used `and`, which only rejected rows where
            # BOTH checks failed; a row is irregular when EITHER check fails
            # (De Morgan of the inclusion test used in get_episode_list).
            if not numFound.isdigit() or not avg_score.replace('.', '', 1).isdigit():
                return None
            a_tag = columns[1].find('a')
            # strip special-event title prefixes
            title = scrub_string(a_tag.text).replace("Rockaroundtheclockdoughberfest: ", "")
            title = title.replace("Tropical Freeze: ", "")
            print("Fetching Episode " + numFound + " - " + title)
            href = scrub_string(a_tag.attrs['href'])
            date = scrub_string(columns[3].text)
            fork_ratings = get_fork_ratings(base_url + href)
            global current_scores
            current_scores = fork_ratings
            image = scrub_string(get_image(base_url + href))
            synopsis = scrub_string(get_synopsis(base_url + href))
            duration = get_duration(numFound.strip())
            current_ep = Episode(title, numFound, date, duration, fork_ratings, image, new_rating, synopsis)
            print(json.dumps(current_ep.__dict__, indent=4))
            return current_ep
def get_ratings(episode, next_rating_number):
    """Build one Rating per person found in the module-level current_scores.

    Ratings are keyed by consecutive Firebase keys "_<n>" starting at
    *next_rating_number*.
    """
    global current_scores
    ratings = {}
    counter = next_rating_number
    for person, score in current_scores.items():
        key = "_" + str(counter)
        ratings[key] = Rating(episode.number, episode.date, episode.epoch, person, episode.restaurant, score)
        counter += 1
    return ratings
def get_people(episode):
    # NOTE(review): unfinished stub — it creates an empty PersonOrChain per
    # person but never populates `people` or returns anything; the only
    # visible effect is printing each person's name. Confirm intent before use.
    people = []
    for person in (episode.people.keys()):
        new_person = PersonOrChain()
        print(person)
def check_person_existence(name):
    """Return True when /people/<name> exists in Firebase (non-null JSON)."""
    response = requests.get("https://doughpedia.firebaseio.com/people/" + name + ".json")
    return response.json() is not None
def check_restaurant_existence(name):
    """Return True when /restaurants/<name> exists in Firebase (non-null JSON)."""
    response = requests.get("https://doughpedia.firebaseio.com/restaurants/" + name + ".json")
    return response.json() is not None
def set_flag_true(url):
    # PUT the JSON literal `true` at *url* — Firebase's REST API replaces the
    # value at that path, so this marks a boolean flag node.
    requests.put(url, data="true")
def add_episode_number(episode):
    """Persist *episode* under /episodes/_<number> in Firebase.

    Delegates to make_put_call instead of duplicating its serialize-and-PUT
    logic, keeping all Firebase writes on one code path.
    """
    make_put_call("https://doughpedia.firebaseio.com/episodes/_" + episode.number + ".json", episode)
def make_put_call(url, pojo):
    """Serialize *pojo*'s attributes to JSON and PUT the payload to *url*."""
    payload = json.dumps(pojo.__dict__)
    requests.put(url, data=payload)
if __name__ == "__main__":
    # BUG FIX: the original read sys.argv[1] unconditionally (IndexError when
    # run with no arguments) and then tested `sys.argv[1] is not None`, which
    # is always True for a present argument — so the episode-listing `else`
    # branch below was unreachable. Guard with len(sys.argv) instead.
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        # quick smoke test of the Firebase latest-rating lookup
        get_latest_rating("5")
        sys.exit()
    if len(sys.argv) > 1:
        # any other argument: scrape and upload a fixed range of episodes
        testing = False  # True would print everything without writing to Firebase
        for num in range(276, 280):
            latest_rating = get_latest_rating(num)
            if latest_rating is None:
                print("numbering is off")
                sys.exit(0)
            next_rating_number = latest_rating + 1
            # add episode obj
            new_episode = get_episode(num, next_rating_number)
            if new_episode is None:
                print("tourney or DD called")
                sys.exit()
            if not testing:
                make_put_call("https://doughpedia.firebaseio.com/episodes/_" + str(new_episode.number) + ".json",
                              new_episode)
            # add ratings and cross-link them to each person and restaurant
            new_ratings = get_ratings(new_episode, next_rating_number)
            for index, rating in new_ratings.items():
                print(index)
                print(json.dumps(rating.__dict__, indent=4))
                rating_url = "https://doughpedia.firebaseio.com/ratings/" + index + ".json"
                print(rating_url)
                if not testing:
                    make_put_call(rating_url, rating)
                # flag the rating on the person
                person_url = "https://doughpedia.firebaseio.com/people/" + rating.person + "/ratings/" + index + ".json"
                print(person_url)
                if not testing:
                    set_flag_true(person_url)
                # flag the episode appearance on the person
                person_ep_url = "https://doughpedia.firebaseio.com/people/" + rating.person + "/episodes/_" + str(
                    new_episode.number) + ".json"
                print(person_ep_url)
                if not testing:
                    set_flag_true(person_ep_url)
                # flag the rating on the restaurant chain
                chain_url = "https://doughpedia.firebaseio.com/restaurants/" + rating.restaurant + "/ratings/" + index + ".json"
                print(chain_url)
                if not testing:
                    set_flag_true(chain_url)
                # flag the episode appearance on the restaurant chain
                chain_ep_url = "https://doughpedia.firebaseio.com/restaurants/" + rating.restaurant + "/episodes/_" + str(
                    new_episode.number) + ".json"
                print(chain_ep_url)
                if not testing:
                    set_flag_true(chain_ep_url)
    else:
        # no arguments: just list every regular episode from the wiki
        for ep in get_episode_list():
            print("=====================")
            print(str(ep))