forked from twtrubiks/PTT_Beauty_Spider
-
Notifications
You must be signed in to change notification settings - Fork 0
/
beauty_spider2.py
125 lines (110 loc) · 4.55 KB
/
beauty_spider2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import download_beauty
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import sys
import time
import datetime
from bs4 import BeautifulSoup
# Suppress the InsecureRequestWarning spam triggered by verify=False requests below.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# One shared session so cookies (notably the over-18 confirmation) persist across requests.
rs = requests.session()
def get_page_number(content):
    """Return the page number that follows the one encoded in a PTT index URL.

    e.g. '/bbs/Beauty/index3900.html' -> 3901
    """
    # The number sits between the literal 'index' and the '.html' suffix.
    head = content.find('index') + len('index')
    tail = content.find('.html')
    return int(content[head:tail]) + 1
def over18(board):
    """Fetch a board's newest index page, passing PTT's age gate when needed.

    Returns the page parsed as a BeautifulSoup object. When the board is
    age-restricted, the confirmation POST stores a cookie on the shared
    session `rs`, so later requests in this run are not redirected again.
    """
    index_url = 'https://www.ptt.cc/bbs/{}/index.html'.format(board)
    res = rs.get(index_url, verify=False)
    # Age-restricted boards redirect to a URL containing 'over18'.
    if 'over18' in res.url:
        print("18禁網頁")
        payload = {
            'from': '/bbs/{}/index.html'.format(board),
            'yes': 'yes',
        }
        res = rs.post('https://www.ptt.cc/ask/over18', verify=False, data=payload)
    return BeautifulSoup(res.text, 'html.parser')
def craw_page(res, push_rate):
    """Parse one board index page and collect articles with enough pushes.

    Args:
        res: requests Response whose .text is a board index page.
        push_rate: minimum push count an article must reach to be kept.

    Returns:
        List of dicts with keys 'title', 'url', 'rate'.
    """
    soup_ = BeautifulSoup(res.text, 'html.parser')
    article_seq = []
    for r_ent in soup_.find_all(class_="r-ent"):
        try:
            # Each listed article's permalink lives in the entry's <a> tag.
            link = r_ent.find('a')['href']
        except TypeError:
            # Deleted articles keep the .r-ent div but have no <a>, so
            # find('a') is None and the ['href'] subscript raises TypeError.
            # (The original blanket `except Exception` also swallowed real
            # parse errors; only this specific case means "deleted".)
            print('本文已被刪除')
            continue
        if not link:
            continue
        title = r_ent.find(class_="title").text.strip()
        rate = r_ent.find(class_="nrec").text
        url = 'https://www.ptt.cc' + link
        # PTT push markers: '爆' = 100 or more pushes, 'X1'..'X9' = that many
        # tens of boos, 'XX' = even lower; plain digits are the literal count.
        if not rate:
            rate = 0
        elif rate.startswith('爆'):
            rate = 100
        elif rate.startswith('X'):
            # BUG FIX: 'XX' has no digit, so the original int(rate[1])
            # raised ValueError and the article was wrongly reported as
            # deleted. Treat 'XX' as the lowest bucket instead.
            rate = -1 * int(rate[1]) if rate[1].isdigit() else -100
        # Keep only articles that meet the requested push threshold.
        if int(rate) >= push_rate:
            article_seq.append({
                'title': title,
                'url': url,
                'rate': rate,
            })
    return article_seq
def main():
    """Entry point.

    Usage: python beauty_spider2.py [board] [start page] [pages to crawl] [min pushes]
    A start page of -1 begins from the board's newest page; 0 is rejected.
    """
    board, start_page, page_term, push_rate = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4])
    start_time = time.time()
    datetime_format = '%Y%m%d%H%M%S'
    # Timestamped folder/tag name for this crawl run, e.g. _PttImg_20240101120000.
    crawler_time = '_PttImg_{:{}}'.format(datetime.datetime.now(), datetime_format)
    if start_page == 0:
        print("請輸入有效數字")
        sys.exit()
    # Visit the index once regardless of start page: over18() also stores the
    # age-gate cookie on the shared session for restricted boards.
    soup = over18(board)
    if start_page == -1:
        # BUG FIX: the original unconditionally overwrote the user-supplied
        # start page with the newest one; only -1 should trigger that.
        # The second '.btn.wide' button ("上頁") links to the previous index,
        # so its page number + 1 is the newest page.
        all_page_url = soup.select('.btn.wide')[1]['href']
        start_page = get_page_number(all_page_url)
    print("Analytical download page...")
    # Build the index-page URLs, walking backwards from start_page.
    index_list = [
        'https://www.ptt.cc/bbs/{}/index{}.html'.format(board, page)
        for page in range(start_page, start_page - page_term, -1)
    ]
    article_list = []
    # Collect title / url / push count from every index page.
    while index_list:
        index = index_list.pop(0)
        res = rs.get(index, verify=False)
        if res.status_code != 200:
            # Server busy: requeue this page and back off for a second.
            index_list.append(index)
            time.sleep(1)
        else:
            # BUG FIX: the original assigned (=) here, discarding all but the
            # last page's articles; extend() accumulates every page's results.
            article_list.extend(craw_page(res, push_rate))
            time.sleep(0.05)
    total = len(article_list)
    count = 0
    # Visit each collected article and download its images.
    while article_list:
        article = article_list.pop(0)
        res = rs.get(article['url'], verify=False)
        if res.status_code != 200:
            # Server busy: requeue the article and back off for a second.
            article_list.append(article)
            time.sleep(1)
        else:
            count += 1
            download_beauty.store_pic(crawler_time, article['url'], article['rate'], article['title'])
            print('download: {:.2%}'.format(count / total))
            time.sleep(0.05)
    print("下載完畢...")
    print('execution time: {:.3}s'.format(time.time() - start_time))
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()