/
aliex_search_details.py
103 lines (84 loc) · 2.79 KB
/
aliex_search_details.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# -*- coding: utf-8 -*-
__author__='wys'
# Given a search keyword, return the list of Aliexpress search results
import json
import time
import threading
import random
import crawler
from aliex_onepage_detail import get_details
from aliex_onepage_detail import is_login_page
from aliex_search_itemurl_list import get_pages_url
from aliex_search_itemurl_list import searchword2list
from config import SEARCH_WORDS
################# Global variables: hold the list of URLs waiting to be crawled
details_urls = []
# Keyword of the search currently in progress; assigned by search_details and
# read by after_process to build the output file name.
SEARCH_WORD = ''
########################
def get_pagedetail_callback(res):
    """Classify a fetched page and return ``(status, details)``.

    ``get_details(res=res)`` is tried first; a truthy result is returned as
    ``('success', details)``. Otherwise the (falsy) result is returned
    together with ``'proxy_unsable'`` when ``is_login_page(res)`` is truthy,
    or ``'other_error'`` when it is not.
    """
    details = get_details(res=res)
    if details:
        return 'success', details
    # Extraction failed: tell a login redirect apart from any other failure.
    status = 'proxy_unsable' if is_login_page(res) else 'other_error'
    return status, details
def after_process(url,res):
    """Mark *url* as done and append its detail record to the keyword's file.

    Removes *url* from the module-level ``details_urls`` list, then appends
    *res* (via ``detail_output``) to ``output/<keyword>.txt``, where
    ``<keyword>`` is ``SEARCH_WORD`` with whitespace runs replaced by ``_``.

    Args:
        url: the detail-page URL that was just crawled.
        res: the detail record to persist; must be JSON-serializable because
            ``detail_output`` passes it to ``json.dumps``.
    """
    try:
        # Fixed typo: the original called the non-existent ``remve``.
        details_urls.remove(url)
    except ValueError:
        # The URL is no longer pending. Workers pick URLs at random from the
        # shared list, so another worker may already have processed it; skip
        # the write so the record is not duplicated.
        return
    search_word = '_'.join(SEARCH_WORD.split())
    outputfile_name = 'output/' + search_word + '.txt'
    detail_output(res, outputfile_name)
def detail_output(detail,file='sports_outdoor_details_2.txt'):
    """Append *detail* to *file* as one line of JSON.

    Args:
        detail: JSON-serializable object to store.
        file: path of the output file; opened in append mode as UTF-8.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    with open(file, mode='a', encoding='utf-8') as sink:
        record = json.dumps(detail, ensure_ascii=False)
        sink.write(record + '\n')
def work_thread():
    """Worker loop: crawl randomly chosen pending URLs until none are left.

    While the module-level ``details_urls`` list is non-empty, pick a random
    URL from it, sleep a random 0-6 seconds, fetch the page through
    ``get_by_proxy`` with ``get_pagedetail_callback`` and, when the returned
    flag is truthy, pass the result to ``after_process``. A URL whose fetch
    fails or raises is left in the list and may be picked again later.

    Any exception raised while handling one URL is printed and the loop
    continues with the next pick.
    """
    worker_name = threading.current_thread().name
    # Fixed: the original referenced the undefined name ``detail_urls``.
    while details_urls:
        try:
            # Fixed: lists have no ``get_random_item``; use random.choice.
            url = random.choice(details_urls)
        except IndexError:
            # Another worker emptied the list between the check and the pick.
            break
        try:
            # Random pause before each request.
            time.sleep(random.random() * 6)
            # NOTE(review): ``crawler`` is brought in with ``import crawler``
            # at the top of the file, so calling it directly looks like it
            # raises TypeError (module not callable) -- confirm the intended
            # class, e.g. ``crawler.<ClassName>()``.
            cra = crawler()
            flag, res = cra.get_by_proxy(url=url, call_back=get_pagedetail_callback)
            if flag:
                print(worker_name, 'get detail done')
                after_process(url, res)
            else:
                print(worker_name, 'get detail failed')
        except Exception as e:
            print(worker_name, 'get detail done,error ending')
            print(worker_name, e)
def search_details(searchword, num_workers=2):
    """Crawl the detail page of every item found for *searchword*.

    Stores the result of ``searchword2list(searchword)`` in the module-level
    ``details_urls`` and the keyword in ``SEARCH_WORD``, then starts
    ``num_workers`` threads running ``work_thread`` and waits for all of them
    to finish.

    Args:
        searchword: search keyword handed to ``searchword2list``.
        num_workers: number of worker threads to run; they are named
            ``worker1`` .. ``workerN``. Defaults to 2, the number the
            original hard-coded.

    Returns:
        None. When no URLs are found, ``'urls is none'`` is printed and the
        function returns without starting any thread.
    """
    global details_urls, SEARCH_WORD
    details_urls = searchword2list(searchword)
    SEARCH_WORD = searchword
    if not details_urls:
        print('urls is none')
        return None
    workers = [
        threading.Thread(target=work_thread, name='worker%d' % index)
        for index in range(1, num_workers + 1)
    ]
    for worker in workers:
        worker.start()
    # Block until every worker has drained the shared URL list.
    for worker in workers:
        worker.join()
if __name__ == '__main__':
    # Script entry point: process each configured search keyword in turn.
    for keyword in SEARCH_WORDS:
        search_details(keyword)