-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
38 lines (32 loc) · 1.18 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import os

import newspaper
from newspaper import news_pool
# Directory (relative to the working directory) that receives the saved pages.
OUTPUT_DIR = "html"

# (status message, section URL) for every site crawled on each pass.  The
# message is printed verbatim once the corresponding source has been built.
# NOTE(review): the "google built" message belongs to the usnews.com source;
# it is kept as-is so existing log output does not change.
SOURCES = [
    ("yahoo built", 'http://news.yahoo.com/us/'),
    ("google built", 'http://www.usnews.com/'),
    ("bbc built", 'http://www.bbc.com/news/world/us_and_canada/'),
    ("nbcbuild", 'http://www.nbcnews.com/news/us-news'),
    ("cnn", 'http://www.cnn.com/US/'),
    ("abc built", 'http://abcnews.go.com/US/'),
    ("fox built", 'http://www.foxnews.com/us/index.html'),
]


def _safe_stem(title, url):
    """Return a file-name stem for an article that cannot escape OUTPUT_DIR.

    The title is used when it is non-blank; otherwise the article URL is used
    so that untitled articles do not all collapse onto the same file.
    Apostrophes are dropped (as before) and path separators / NUL bytes are
    replaced with "_" so the stem is always a single path component.
    """
    stem = title or ""
    if not stem.strip():
        stem = url or "untitled"
    stem = stem.replace("'", "")
    for bad in ("/", "\\", "\0"):
        stem = stem.replace(bad, "_")
    return stem


def _save_article(article):
    """Write the downloaded HTML of *article* to OUTPUT_DIR/<title>.html.

    Prints the article URL and the target file name.  Articles without any
    HTML are skipped, and a failed write is reported instead of aborting the
    whole crawl.
    """
    url = article.url
    html = article.html
    print(url)
    if not html:
        # Nothing was downloaded for this article; do not create an empty file.
        return
    path = os.path.join(OUTPUT_DIR, _safe_stem(article.title, url) + ".html")
    encoded_path = path.encode('utf-8')
    print(encoded_path)
    try:
        # "with" guarantees the handle is closed even when write() fails.
        with open(encoded_path, "wb") as handle:
            handle.write(html.encode('utf-8'))
    except (IOError, OSError) as err:
        # e.g. name too long for the filesystem; keep crawling.
        print("could not write %r: %s" % (path, err))
    # TODO: render the saved page to PNG
    # (was: HTML(filename).write_png(pngfilename)).


# Crawl forever: build every source, download all of their articles through
# the shared thread pool, then dump each article's HTML to disk.
# NOTE(review): there is no pause between passes - confirm that re-crawling
# immediately is intended.
while True:
    papers = []
    for message, section_url in SOURCES:
        papers.append(newspaper.build(section_url))
        print(message)

    news_pool.set(papers, threads_per_source=2)
    news_pool.join()

    # open() does not create missing directories.
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    for paper in papers:
        for article in paper.articles:
            _save_article(article)