run.py
import argparse
from html import unescape  # stdlib replacement for HTMLParser.unescape, removed in Python 3.9

import requests
from bs4 import BeautifulSoup  # the 'xml' parser mode below also requires lxml to be installed
class Crawler:
    """Fetches a site's RSS feed and returns its parsed contents."""

    def __init__(self, url):
        self.url = url

    def crawl(self):
        return self.getRssUrl()

    def getRssUrl(self):
        # Append the conventional /rss suffix if the URL does not already have it
        if '/rss' not in self.url:
            self.url = self.url + '/rss'
        # GET request for the feed
        page = requests.get(self.url)
        # Bail out on anything other than a successful response
        # (the original only handled 404 and fell through on other errors)
        if page.status_code != 200:
            return None
        content = page.content
        markup = self.getMarkup(page.headers['content-type'])
        # Parse the feed items
        feeds = Feed(content, markup)
        return feeds.getFeeds()

    def getMarkup(self, content_type):
        # Choose the BeautifulSoup parser from the Content-Type header.
        # The original left `markup` unset for other content types (an
        # UnboundLocalError); default to 'xml', since RSS feeds are XML.
        if 'xml' in content_type:
            return 'xml'
        if 'html' in content_type:
            return 'html'
        return 'xml'
class Feed:
    """Parses the fetched feed and extracts channel and item fields."""

    def __init__(self, data, markup):
        self.obj = BeautifulSoup(data, markup)

    def getFeeds(self):
        # Collect the channel-level fields plus the list of items
        feeds = {}
        feeds['title'] = self.getTitle()
        feeds['link'] = self.getLink()
        feeds['items'] = self.setupItems()
        return feeds

    def getTitle(self):
        return self.obj.title.string

    def getLink(self):
        return self.obj.find('link').string

    def getItems(self):
        return self.obj.find_all('item')

    def setupItems(self):
        # Normalize each <item> element into a plain dict
        data = []
        for item in self.getItems():
            # <comments> may be absent; the original also stored the tag
            # object itself instead of its text
            comments = item.find('comments')
            new_item = {
                'title': unescape(item.title.string),
                'link': item.find('link').string,
                'comments_link': comments.string if comments else None,
                'publication_date': item.find('pubDate').text,
                'author': unescape(item.find('creator').text),
            }
            data.append(new_item)
        return data
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('url', metavar='U', type=str, help="url for the script")
    args = parser.parse_args()

    crawler = Crawler(args.url)
    res = crawler.crawl()
    print(res)
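# Example invocation (hypothetical URL; assumes the target site serves a
# WordPress-style feed at <url>/rss):
#
#   python run.py https://example-blog.com
#
# which prints a dict of the form
# {'title': ..., 'link': ..., 'items': [{'title': ..., 'link': ..., ...}, ...]}
# or None if the feed could not be fetched.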