def process_start_built_url(self):
    url = 'https://www.to8to.com/index.html'
    while 1:
        try:
            response = requests.get(url,
                                    headers={'user-agent': random_useragent()},
                                    proxies=AbuyunSpider.returnRequestProxies())
            response.encoding = response.apparent_encoding
            if response.status_code == 200:
                break
        except Exception as e:
            print(e)
            time.sleep(random.randint(2, 5))
    document = pq(response.text)
    res = []
    # Collect every city link from the city-selection block on the homepage.
    for x in document('div.xzcs_dt > a').items():
        item = {}
        # Hrefs are protocol-relative, "//<city_code>.to8to.com/...": grab the subdomain.
        pattern = re.compile(r'//(.*?)\.')
        item['city'] = x.text()
        item['city_num'] = re.search(pattern, x.attr('href')).group(1)
        res.append(item)
    return res
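# A minimal, standalone sketch (defined but not called anywhere) of the subdomain
# extraction performed in process_start_built_url above. The href value is a made-up
# example; it only assumes the "//<city_code>.to8to.com/..." shape the regex targets.
def _demo_to8to_city_code():
    import re
    href = '//sh.to8to.com/'                   # hypothetical city-link href
    pattern = re.compile(r'//(.*?)\.')
    return re.search(pattern, href).group(1)   # -> 'sh'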
def process_request(self, session, url):
    while 1:
        try:
            response = session.get(url,
                                   headers={'user-agent': random_useragent()},
                                   proxies=AbuyunSpider.returnRequestProxies(),
                                   timeout=6)
            if response.status_code == 200:
                break
        except Exception as e:
            print(e)
            time.sleep(random.randint(2, 5))
    return session, response
def get_cityid(self, url):
    while 1:
        print('############### Fetching city ID for {}'.format(url))
        try:
            response = requests.get(url,
                                    headers={'user-agent': random_useragent()},
                                    proxies=AbuyunSpider.returnRequestProxies())
            if response.status_code == 200:
                break
        except Exception as e:
            print(e)
            time.sleep(random.randint(2, 5))
    document = pq(response.text)
    cityID = document('#cityId').attr('value')
    return cityID
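# A standalone sketch (defined but not called anywhere) of the #cityId lookup done in
# get_cityid, run against a made-up HTML fragment instead of a live response.
def _demo_cityid_lookup():
    from pyquery import PyQuery as pq                          # same parser the spider uses as `pq`
    html = '<input type="hidden" id="cityId" value="13"/>'     # hypothetical markup
    return pq(html)('#cityId').attr('value')                   # -> '13'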
def returnRequestsProxies(self):
    """
    :return: the proxies dict for requests
    """
    return AbuyunSpider.returnRequestProxies()
class chenYangSpider(object):
    start_urls = "http://www.chenyang.com/index.php?m=content&c=index&a=lists&catid=27&p={p}&city={city}"
    next_urls = "http://www.chenyang.com/index.php?m=content&c=index&a=lists&catid=27&p={p}&city={city}&page={page}"
    built_url = "http://www.chenyang.com/"
    returnRequestsProxies = AbuyunSpider.returnRequestProxies()

    def __init__(self, setting=Setting()):
        self.db_name = setting.mysql_table_jinpin_dbname

    def returnBuiltHeaders(self, path, RefererUrl=None):
        """
        Build the request headers
        :return:
        """
        headers = {
            "Accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "www.chenyang.com",
            "Upgrade-Insecure-Requests": "1",
            "user-agent": random_useragent(),
        }
        if RefererUrl:
            headers['Referer'] = RefererUrl
        return headers

    def start_requests(self):
        citys = address_json.address
        for i in citys:
            pro = i['p']
            p = parse.quote(i['p'])
            for c in i['city']:
                cit = c
                c = parse.quote(c)
                url = self.start_urls.format(p=p, city=c)
                response = self.process_request(nextPage=url)
                if response:
                    self.parse(response, meta={'page': 1, 'p': p, "c": c, 'pro': pro, 'cit': cit})

    # TODO: handle the requests GET for a URL, retrying on failure
    def process_request(self, nextPage, meta=None, Referer=None):
        path_params = '/' + '/'.join(nextPage.split('/')[-3:])
        count = 0
        while 1:
            try:
                response = requests.get(url=nextPage,
                                        headers=self.returnBuiltHeaders(path=path_params, RefererUrl=Referer),
                                        timeout=3,
                                        allow_redirects=False,
                                        proxies=self.returnRequestsProxies)
                if response.status_code == 200:
                    print('######### Successfully fetched URL: {} ########'.format(response.url))
                    return response
                else:
                    print(nextPage)
                    print(response)
                    if count > 20:
                        return False
                    count += 1
            except Exception as e:
                print(e)
                print(nextPage)
                time.sleep(random.randint(2, 5))

    def parse(self, response, meta):
        document_pq = pq(response.text)
        value = document_pq('.list-zmd > li')
        if value:
            pro = meta['pro']
            city = meta['cit']
            io_data = {self.db_name: []}
            for i in value.items():
                name = i('div').eq(0)('p > strong').text()
                address = i('div').eq(1)('p').text()
                item = {}
                item['name'] = name
                item['address'] = address
                item['province'] = pro
                item['city'] = city
                item['area'] = ""
                item['numbers'] = ""
                item['telphone'] = ""
                item['types'] = 3
                print(item)
                io_data[self.db_name].append(item)
            server = Pipline_to_redis_server()
            server.sadd(io_data)
            if document_pq('.pages > li'):
                len_num = len(document_pq('.pages > li'))
                result_li = document_pq('.pages > li').eq(len_num - 1)('a').attr('href')
                pattern = re.compile('page=(.*?)&')
                page_li = re.search(pattern, result_li).group()
                try:
                    page_url = re.search(pattern, response.url).group()
                except Exception as e:
                    page_url = ''
                # When the "last page" link points at the current page, stop paginating.
                if page_li == page_url:
                    result_li = ''
                page = meta['page']
                page += 1
                c = meta['c']
                p = meta['p']
                if result_li:
                    url = self.built_url + result_li
                    response = self.process_request(nextPage=url)
                    if response:
                        self.parse(response=response,
                                   meta={'page': page, 'p': p, "c": c, 'pro': pro, 'cit': city})
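# A minimal usage sketch for chenYangSpider (defined but not called anywhere), assuming
# the project-local pieces it depends on (Setting, AbuyunSpider, address_json,
# Pipline_to_redis_server, random_useragent) are importable and a proxy/redis backend
# is configured.
def _demo_run_chenyang():
    spider = chenYangSpider()
    spider.start_requests()   # walks every province/city pair and pushes items to redis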
class DuoleshiSpider(object):
    name = 'duoleshi'
    allowed_domains = ['www.dulux.com.cn']
    # start_urls = "https://www.dulux.com.cn/find/store-ajax?address={address}&attributes=&language=zh&pagenum={page}"
    start_urls = "https://www.dulux.com.cn/ajax/stores-api/select/all-id?flds=id,latitude,longitude,companyName,companyName_zh,address,address_zh,city,city_zh,zipcode,zipcode_zh,attributeCodes,brands,region,region_zh,phone,phone_zh,district,district_zh,country,countryCode_zh,country_zh"
    returnRequestsProxies = AbuyunSpider.returnRequestProxies()

    def __init__(self, setting=Setting()):
        self.db_name = setting.mysql_table_jinpin_dbname

    def returnBuiltHeaders(self, path, RefererUrl=None):
        """
        Build the request headers
        :return:
        """
        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "if-none-match": "1591637021-1",
            "referer": "https://www.dulux.com.cn/zh/find-a-stockist",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": random_useragent(),
        }
        if RefererUrl:
            headers['Referer'] = RefererUrl
        return headers

    # TODO: handle the requests GET for a URL, retrying on failure
    def process_request(self, nextPage, meta=None, Referer=None):
        path_params = '/' + '/'.join(nextPage.split('/')[-3:])
        count = 0
        while 1:
            try:
                response = requests.get(url=nextPage,
                                        headers=self.returnBuiltHeaders(path=path_params),
                                        timeout=3,
                                        allow_redirects=False,
                                        proxies=self.returnRequestsProxies)
                if response.status_code == 200:
                    print('######### Successfully fetched URL: {} ########'.format(response.url))
                    return response
                else:
                    print(nextPage)
                    print(response)
                    if count > 20:
                        return False
                    count += 1
            except Exception as e:
                print(e)
                print(nextPage)
                time.sleep(random.randint(2, 5))

    def start_requests(self):
        url = self.start_urls
        response = self.process_request(nextPage=url)
        if response:
            self.parse(response)

    def parse(self, response):
        result = json.loads(response.text)['response']['docs']
        print('#################### Scraped {} records from the Dulux site ##########'.format(len(result)))
        if result:
            io_data = {self.db_name: []}
            for i in result:
                try:
                    name = i['companyName_zh']
                    # The province is whatever precedes the city name in the address.
                    pattern_params = re.compile(r'(.*?){}|(\s)+省'.format(i['city_zh']))
                    try:
                        region = re.search(pattern_params, i['address_zh']).group(1)  # province
                    except Exception as e:
                        print(e)
                        region = ''
                    city = i['city_zh']   # city
                    subtitle = ''         # authorization number
                    address = i['address_zh']
                    phone = i.get('phone_zh', '')
                    item = {}
                    item['name'] = name
                    item['address'] = address
                    item['province'] = region
                    item['city'] = city
                    item['area'] = ""
                    item['numbers'] = subtitle
                    item['telphone'] = phone
                    item['types'] = 2
                    io_data[self.db_name].append(item)
                except Exception as e:
                    print(e)
            server = Pipline_to_redis_server()
            server.sadd(io_data)
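# A standalone sketch (defined but not called anywhere) of the province extraction used
# in DuoleshiSpider.parse, with made-up city/address values for illustration.
def _demo_dulux_province():
    import re
    city_zh = '杭州'                                    # hypothetical city field
    address_zh = '浙江省杭州市西湖区某路100号'            # hypothetical address field
    pattern = re.compile(r'(.*?){}|(\s)+省'.format(city_zh))
    return re.search(pattern, address_zh).group(1)      # -> '浙江省'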
class HuarunqiSpider(object):
    name = 'huarunqi'
    allowed_domains = ['www.huarun.com']
    start_urls = "http://www.huarun.com/service/store/search?province={}&city=&area=&address=&design=off&page=1&limit=10000"
    returnRequestsProxies = AbuyunSpider.returnRequestProxies()

    def __init__(self, setting=Setting()):
        self.db_name = setting.mysql_table_jinpin_dbname

    def returnBuiltHeaders(self, path, RefererUrl=None):
        """
        Build the request headers
        :return:
        """
        headers = {
            "Accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "www.huarun.com",
            "Upgrade-Insecure-Requests": "1",
            "user-agent": random_useragent(),
        }
        if RefererUrl:
            headers['Referer'] = RefererUrl
        return headers

    # TODO: handle the requests GET for a URL, retrying on failure
    def process_request(self, nextPage, meta=None, Referer=None):
        path_params = '/' + '/'.join(nextPage.split('/')[-3:])
        count = 0
        while 1:
            try:
                response = requests.get(url=nextPage,
                                        headers=self.returnBuiltHeaders(path=path_params, RefererUrl=Referer),
                                        timeout=3,
                                        allow_redirects=False,
                                        proxies=self.returnRequestsProxies)
                if response.status_code == 200:
                    print('######### Successfully fetched URL: {} ########'.format(response.url))
                    return response
                else:
                    print(nextPage)
                    print(response)
                    if count > 20:
                        return False
                    count += 1
            except Exception as e:
                print(e)
                print(nextPage)
                time.sleep(random.randint(2, 5))

    def start_requests(self):
        citys = address_json.address
        for i in citys:
            cit = parse.quote(i['p'])
            url = self.start_urls.format(cit)
            response = self.process_request(nextPage=url)
            if response:
                self.parse(response)

    def parse(self, response):
        result = json.loads(response.text)
        if 'datas' in result['result']:
            io_data = {self.db_name: []}
            for j in result['result']['datas']:
                item = {}
                item['name'] = j['name']
                item['address'] = j['address']
                item['province'] = j['province']
                item['city'] = j['city']
                item['area'] = j['area']
                item['numbers'] = j['numbers']
                item['telphone'] = j['telphone']
                item['types'] = 1
                io_data[self.db_name].append(item)
            print('############ Saving to redis: {} ##############'.format(io_data))
            server = Pipline_to_redis_server()
            server.sadd(io_data)
        else:
            print('######################## No results ########', result)
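# A minimal end-to-end usage sketch, assuming the project-local dependencies are
# configured. The entry-point guard and the choice of spider to run are assumptions,
# not part of the original module.
if __name__ == '__main__':
    from urllib import parse
    # HuarunqiSpider builds one search URL per province, e.g. (made-up province):
    #   http://www.huarun.com/service/store/search?province=%E5%B9%BF%E4%B8%9C%E7%9C%81&...
    print(HuarunqiSpider.start_urls.format(parse.quote('广东省')))
    HuarunqiSpider().start_requests()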