scraper.py
import re

from dateutil.parser import parse
from xml.sax.saxutils import unescape

import requests
import bs4
from celery import Celery

from app import app, db
from models import Result

base_scraper_url = app.config.get('SCRAPER_BASE_URL')
base_url = app.config.get('BASE_URL')

# Module-level dict shared by the task below; holds today's listing URL
# and its date.
base_dict = {}

celery = Celery(app.name, broker=app.config['CELERY_BROKER_URL'])
celery.conf.update(app.config)
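
# The configuration keys above are assumed to be set on the Flask app;
# hypothetical example values:
#   SCRAPER_BASE_URL = 'https://example.gov/index'  # index of daily listings
#   BASE_URL = 'https://example.gov'                # prefix for relative hrefs
#   CELERY_BROKER_URL = 'redis://localhost:6379/0'  # any Celery-supported broker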


def clean_html(html):
    """Collapse every run of whitespace in ``html`` to a single space."""
    return ' '.join(re.findall(r'\S+', html))
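
# For example, clean_html('<td>\n  Foo   Bar\n</td>') returns
# '<td> Foo Bar </td>'.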


def get_rows(url):
    """Fetch ``url`` and return the <tr> elements of its <table class="data">."""
    page = requests.get(url)
    rows = []
    if page.ok:
        # Use page.text (not page.content) so clean_html's regex operates
        # on str rather than bytes.
        content = clean_html(page.text)
        tree = bs4.BeautifulSoup(content, 'lxml')
        table = tree.find('table', class_='data')
        if table is not None:
            rows = table.find_all('tr')
    return rows
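
# The listing pages are assumed to contain markup along these lines
# (hypothetical structure inferred from the parsing above):
#   <table class="data">
#     <tr><td><a href="/daily/2016-01-04">January 4, 2016</a></td> ...</tr>
#   </table>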


def get_article(url):
    """Fetch ``url`` and return the page's <body> element, or None on failure."""
    page = requests.get(url)
    if page.ok:
        content = clean_html(page.text)
        tree = bs4.BeautifulSoup(content, 'lxml')
        return tree.body
    return None


def get_body(file):
    """Return the article body: the first <h4> plus all of its following siblings."""
    h4 = file.find('h4')
    data = str(h4)
    for sibling in h4.find_next_siblings():
        data += str(sibling)
    return data
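
# For example, given <body><h4>Title</h4><p>One</p><p>Two</p></body>,
# get_body returns '<h4>Title</h4><p>One</p><p>Two</p>'.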


@celery.task
def scraper():
    """Scrape today's listing page and store each linked article as a Result."""
    print('About to start the scraper.')
    # The first row of the base table links to today's listing page.
    base_rows = get_rows(base_scraper_url)
    base_for_today = base_rows[0]
    base_datacells = base_for_today.find_all('td')
    today_url_cell = base_datacells[0].find('a')
    base_dict['url'] = unescape(base_url + today_url_cell.attrs['href'])
    base_dict['date'] = parse(today_url_cell.text)
    todays_rows = get_rows(base_dict.get('url'))
    # Initialise so the return below cannot raise NameError on an empty page.
    article = None
    for row in todays_rows:
        datacells = row.find_all('td')
        if len(datacells) < 6:
            # Skip header or otherwise malformed rows.
            continue
        doc = {
            'url': base_dict.get('url'),
            'date': base_dict.get('date'),
            'source': datacells[1].text,
            'agency': datacells[2].text,
            'fsg': datacells[3].text,
            'title': datacells[4].text,
            'keywords': datacells[5].text,
            'url_2': unescape(
                base_url + datacells[4].find('a').attrs.get('href')),
        }
        article = get_article(doc.get('url_2'))
        if article:
            try:
                doc['description'] = get_body(article)
            except Exception:
                # If the body cannot be parsed, store the row without a
                # description rather than dropping it.
                pass
            result = Result(**doc)
            db.session.add(result)
            db.session.commit()
    print('Scraper completed execution')
    return article


# Enqueue the task as soon as this module is imported by a worker or web process.
scraper.apply_async()
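
# For the task to actually run, a Celery worker must be started against this
# module, e.g. (hypothetical invocation):
#   celery -A scraper.celery worker --loglevel=info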