/
CBSScraper.py
63 lines (59 loc) · 2.8 KB
/
CBSScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import urllib, datetime
from Utilities import checkVal, checkElement, getTime
from bs4 import BeautifulSoup
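# Scrapes the given article and returns a dictionary with title, author, body text, URL, date, timestamp and source (plus image when present); returns None when fewer than two paragraphs are found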
def scrapeArticle(URL):
    """Scrape a single CBS article page into a dictionary.

    Args:
        URL: Address of the article page. It is fetched with
            ``urllib.urlopen`` and stored unchanged under the ``'URL'`` key.

    Returns:
        ``None`` when fewer than two usable paragraphs are found inside
        ``<div class="entry">`` (including when that div is missing).
        Otherwise a dict with the keys ``'title'``, ``'author'``,
        ``'body_text'``, ``'URL'``, ``'date'``, ``'timestamp'`` and
        ``'source'``, plus ``'image'`` when the page has a
        ``<div class="article-image">`` that contains an ``<img>``.
    """
    # NOTE: urllib.urlopen is the Python 2 spelling of this call.
    response = urllib.urlopen(URL)
    try:
        soup = BeautifulSoup(response)
    finally:
        # Release the connection even when parsing raises.
        response.close()

    # Search the entry div directly; a missing div simply yields no paragraphs.
    entry = soup.find('div', attrs={'class': 'entry'})
    paragraphs = []
    if entry is not None:
        for paragraph in entry.find_all('p'):
            # Paragraphs carrying a class attribute are skipped
            # (presumably non-body markup such as captions -- TODO confirm).
            if paragraph.has_attr('class'):
                continue
            text = paragraph.get_text()
            if text.strip() == '':
                continue
            # Swap the network name for a neutral word and collapse every
            # run of whitespace into a single space.
            paragraphs.append(' '.join(text.replace('CBS', 'Source').split()))

    # A page with at most one paragraph is not treated as an article.
    if len(paragraphs) <= 1:
        return None

    # Joining avoids a trailing separator, so nothing has to be sliced off
    # the end (the old slice also removed the final character of the text).
    body = '\n\n'.join(paragraphs)

    # NOTE(review): assumes checkElement returns a string here; a None result
    # would make the .replace() chain below fail -- confirm in Utilities.
    date = checkElement(soup.find('span', attrs={'class': 'time'}), 'date')
    ArticleDict = {
        'title': checkElement(soup.find('h1', attrs={'class': 'title'}), 'title'),
        'author': checkElement(soup.find('span', attrs={'class': 'author'}), 'author'),
        'body_text': body,
        'URL': URL,
        'date': date,
        # Surround AM/PM with spaces so the marker stands alone before the
        # string is handed to getTime together with the strptime-style format.
        'timestamp': getTime(
            date.replace('AM', ' AM ').replace('PM', ' PM ').strip(),
            [',', ':'], [], '%B %d %Y %I %M %p'),
    }

    # Fall back to a fixed source name when the page does not state one.
    source = soup.find('span', attrs={'class': 'source'})
    if source is not None:
        ArticleDict['source'] = source.get_text()
    else:
        ArticleDict['source'] = 'CBS News'

    # The image key is only present when an image was actually found.
    image = soup.find('div', attrs={'class': 'article-image'})
    if image is not None and image.img is not None:
        ArticleDict['image'] = image.img.get('src')
    return ArticleDict
#Scrapes the main page of CBSnews.com and returns a list of article links
def scrapeMainPage(URL):
    """Collect article links from the CBS main page.

    Args:
        URL: Address of the main page, fetched with ``urllib.urlopen``.

    Returns:
        A list of ``href`` values: first the link inside
        ``<h1 class="title">`` (when present), then every ``<li>`` link
        inside ``<div class="col-5">`` whose ``href`` starts with ``/`` and
        contains ``news``. The list is empty when nothing matches.
    """
    # NOTE: urllib.urlopen is the Python 2 spelling of this call.
    response = urllib.urlopen(URL)
    try:
        soup = BeautifulSoup(response)
    finally:
        # Release the connection even when parsing raises.
        response.close()

    linkList = []

    # The headline link is taken as-is, without the '/' and 'news' filter.
    header = soup.find('h1', attrs={'class': 'title'})
    if header is not None and header.a is not None:
        headline = header.a.get('href')
        if headline:
            linkList.append(headline)

    column = soup.find('div', attrs={'class': 'col-5'})
    if column is None:
        return linkList

    for story in column.find_all('li'):
        anchor = story.a
        # List items without a link, or with a missing/empty href, are
        # skipped instead of raising.
        if anchor is None:
            continue
        href = anchor.get('href')
        if href and href.startswith('/') and 'news' in href:
            linkList.append(href)
    return linkList
#Scrapes given section of CBSnews.com and returns a list of article links
def scrapeSection(URL):
    """Collect article links from one section page of the CBS site.

    Args:
        URL: Address of the section page, fetched with ``urllib.urlopen``.

    Returns:
        A list of ``href`` values: first the link inside
        ``<h3 class="title">`` (when present, encoded as UTF-8), then every
        ``<li>`` link inside ``<div class="col-5">`` whose ``href`` starts
        with ``/`` and contains ``news``. The list is empty when nothing
        matches.
    """
    # NOTE: urllib.urlopen is the Python 2 spelling of this call.
    response = urllib.urlopen(URL)
    try:
        soup = BeautifulSoup(response)
    finally:
        # Release the connection even when parsing raises.
        response.close()

    linkList = []

    # Check for the heading and its anchor *before* dereferencing them; a
    # section page without this heading yields no headline link.
    header = soup.find('h3', attrs={'class': 'title'})
    if header is not None and header.a is not None:
        headline = header.a.get('href')
        if headline:
            # Only the headline link is UTF-8 encoded; list links below are
            # appended exactly as BeautifulSoup returns them.
            linkList.append(headline.encode('utf-8'))

    column = soup.find('div', attrs={'class': 'col-5'})
    if column is None:
        return linkList

    for story in column.find_all('li'):
        anchor = story.a
        # List items without a link, or with a missing/empty href, are
        # skipped instead of raising.
        if anchor is None:
            continue
        href = anchor.get('href')
        if href and href.startswith('/') and 'news' in href:
            linkList.append(href)
    return linkList