#!/usr/bin/python
# -*- coding: utf-8 -*-
# Author: legend
# Mail: kygx.legend@gmail.com
# File: grab.py
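# Usage:
#   python grab.py -s 1   # step 1: cache article titles and links in links.tmp
#   python grab.py -s 2   # step 2: fetch each cached article and store it in MongoDB
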
import argparse
import requests
from bs4 import BeautifulSoup as soup
from pymongo import MongoClient as mc

# Site root for building article URLs and the MongoDB connection string.
head = 'http://www.nba.com'
host = 'mongodb://172.16.104.62:20001'


def getLinks():
    # Step 1: walk the 2016 weekly news archive pages and append every
    # article title and link (on alternating lines) to links.tmp.
    cache = open('links.tmp', 'a')
    navigation = '{}/2016/news/archive/{}/index.html'
    for week in xrange(10, 20):
        req = requests.get(navigation.format(head, week))
        page = soup(req.text, 'html.parser')
        # The archive listing lives in the div with class "nbaNAContent";
        # re-wrap it so only that fragment is parsed for links.
        div = page.find('div', class_='nbaNAContent')
        div = '<html><body>{}</body></html>'.format(str(div))
        links = soup(div, 'html.parser').find_all('a')
        for l in links:
            cache.write(l.get_text() + '\n')
            cache.write(l.get('href') + '\n')
    cache.close()


def getNews():
    # Step 2: read the cached title/link pairs, download each article,
    # extract its paragraph text, and insert one document per article.
    client = mc(host)
    collection = client['mydb']['nba_news']
    lines = open('links.tmp', 'r').readlines()
    # links.tmp alternates between a title line and a link line.
    toggle, title, link = True, None, None
    for l in lines:
        if toggle:
            title = l.strip()
        else:
            link = l.strip()
            req = requests.get('{}/{}'.format(head, link))
            page = soup(req.text, 'html.parser')
            section = page.find('section')
            section = '<html><body>{}</body></html>'.format(str(section))
            article = soup(section, 'html.parser').find_all('p')
            content = ''.join([p.text for p in article])
            print title, link, content
            doc = {
                "title": title,
                "link": '{}/{}'.format(head, link),
                "content": content.replace('"', "'")
            }
            collection.insert_one(doc)
        toggle = not toggle
    print collection.count()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Grab tools')
    parser.add_argument('-s', '--step', help='Choose step to run: 1 = grab links, 2 = grab news')
    args = parser.parse_args()
    if args.step == '1':
        getLinks()
    elif args.step == '2':
        getNews()