def next_requests(self):
    bizTypes = self.server.lrange(bizType_set_key, 0, -1)
    bizType_titles = self.server.lrange(bizType_title_set_key, 0, -1)
    inds = self.server.lrange(ind_set_key, 0, -1)
    ind_names = self.server.lrange(ind_name_set_key, 0, -1)
    if (not bizTypes) or (not inds):
        if self.idling:
            self.closed()
        else:
            return  # return and wait for the next call if not idling
    for ib, bizType in enumerate(bizTypes):
        for ii, ind in enumerate(inds):
            bizType_id = bytes_to_str(bizType, self.redis_encoding)
            bizType_title = bytes_to_str(bizType_titles[ib], self.redis_encoding)
            ind_id = bytes_to_str(ind, self.redis_encoding)
            ind_name = bytes_to_str(ind_names[ii], self.redis_encoding)
            req = self.make_request_from_data(bizType_id, bizType_title,
                                              ind_id, ind_name)
            if req:
                yield req
    self.server.delete(bizType_set_key, bizType_title_set_key,
                       ind_set_key, ind_name_set_key)
def make_request_from_data(self, data):
    localtime = time.localtime(time.time())
    self.year = localtime[0]
    self.month = localtime[1]
    self.day = localtime[2]
    house_id = bytes_to_str(data, self.redis_encoding)
    url = self.urlJoint(house_id)
    meta = {'house_id': house_id, 'url': url, 'handle_httpstatus_all': True}
    headers = {
        # 'Host': 'www.airbnb.cn',
        # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
        # 'X-Airbnb-Supports-Airlock-V2': 'true',
        # 'X-Airbnb-GraphQL-Platform-Client': 'apollo-niobe',
        # 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        # 'Accept': '*/*',
        'X-Airbnb-API-Key': 'd306zoyjsyarp7ifhu67rjxn52tv0t20',
    }
    return Request(url=url,
                   callback=self.calendarParse,
                   errback=self.calendarErrback,
                   meta=meta,
                   dont_filter=True,
                   headers=headers)
def make_request_from_data(self, data):
    str_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(str_seed)
    if seed.type == 0:
        cate_id, brand_id, name = seed.value
        if brand_id:
            cid1, cid2, cid3 = cate_id.split(',')
            url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&click=1'.format(
                urllib.parse.urlencode({"cat": cate_id}),
                urllib.parse.urlencode({"ev": "exbrand_" + name}))
        else:
            url = 'https://list.jd.com/list.html?{0}&psort=4&click=1'.format(
                urllib.parse.urlencode({"cat": cate_id}))
        return Request(url=url,
                       meta={"_seed": str_seed,
                             "headers": {"Referer": "https://www.jd.com/"}},
                       priority=0,
                       callback=self.parse)
    elif seed.type == 3:
        return Request.deserialize(seed.value, self)
def make_request_from_data(self, data):
    str_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(str_seed)
    print(seed)
    if seed.type == 0:
        skuid = seed.value
        url = ("https://club.jd.com/comment/getSkuPageFoldComments.action"
               "?callback=jQuery2675603&productId={0}&score=0&sortType=6"
               "&page=0&pageSize=10").format(skuid)
        return Request(url=url,
                       meta={"_seed": str_seed,
                             "current_page": 0,
                             "headers": {
                                 "Connection": "close",
                                 "Referer": "https://item.m.jd.com/{0}.html".format(skuid),
                             }},
                       priority=0,
                       callback=self.parse)
    elif seed.type == 3:
        return Request.deserialize(seed.value, self)
def make_request_from_data(self, data): """Returns a Request instance from data coming from Redis. By default, ``data`` is an encoded URL. You can override this method to provide your own message decoding. Parameters ---------- data : bytes Message from redis. """ url = bytes_to_str(data, self.redis_encoding) opt = URL(url) if opt.domain().find('eastmoney') == -1: print('eastmoney false') return been_flag = self.rlink.get(self.redis_name + 'been_url:' + url) if opt.domain() == 'iguba.eastmoney.com': self.rlink.rpush(self.redis_key, url) if been_flag: print(self.redis_name + 'been_url:' + url) RedisMixin.site_no_add_content_count += 1 time.sleep(RedisMixin.site_no_add_content_count * RedisMixin.site_no_add_content_count) return else: RedisMixin.site_no_add_content_count = 0 else: if been_flag: return print(url) return self.make_requests_from_url(url)
def next_request(self, cookie_dict):
    """Returns a request to be scheduled or none."""
    use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', START_URLS_AS_SET)
    fetch_one = self.server.spop if use_set else self.server.lpop
    # XXX: Do we need to use a timeout here?
    found = 0
    # TODO: Use redis pipeline execution.
    while found < self.redis_batch_size:
        data = fetch_one(self.redis_key)
        if not data:
            # Queue empty.
            break
        req = self.make_request_from_data(data)
        if req:
            url = bytes_to_str(data, self.redis_encoding)
            yield scrapy.Request(url=url,
                                 headers=self.headers,
                                 cookies=cookie_dict,
                                 callback=self.parse)
            found += 1
        else:
            self.logger.debug("Request not made from data: %r", data)
    if found:
        self.logger.debug("Read %s requests from '%s'", found, self.redis_key)
def next_requests(self): """ Replaces the default method. Closes spider when tickers are crawled and queue empty. """ use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET) fetch_one = self.server.spop if use_set else self.server.lpop found = 0 while found < self.redis_batch_size: data = fetch_one(self.redis_key) if not data: break params = bytes_to_str(data, self.redis_encoding).split(";") ticker = params[0] self.idling = False try: pageSize = params[1] req = self.make_request_from_data(ticker, pageSize) if req: yield req else: self.logger.info("Request not made from data: %r", data) except: count_data["formdata"]["code"] = ticker count_data["formdata"]["tradingdate"] = self.date count_data["meta"]["ticker"] = ticker count_data["meta"]["counted"] = "0" req = FormRequest(url=count_data["url"], formdata=count_data["formdata"], headers=count_data["headers"], cookies=count_data["cookies"], meta=count_data["meta"], callback=self.parse, errback=self.handle_error) if req: yield req self.logger.info( f'Counting number of associates of {ticker}') else: self.logger.info("Request not made from data: %r", data) found += 1 if found: self.logger.debug("Read %s params from '%s'", found, self.redis_key) # Close spider if corpAZ is closed and none in queue and spider is idling # Print off requests with errors, then delete all keys related to this Spider if self.r.get(self.corpAZ_closed_key) == "1" and self.r.llen( self.redis_key) == 0 and self.idling == True: self.logger.info(self.r.smembers(self.error_set_key)) keys = self.r.keys(f'{self.name}*') for k in keys: self.r.delete(k) self.crawler.engine.close_spider( spider=self, reason="CorpAZ is closed; Queue is empty; Processed everything" ) self.close_status()
def make_request_from_data(self, data): """ scrapy-redis自帶去重功能失效,實現去重 """ url = bytes_to_str(data, self.redis_encoding) if self.redis_cli.sismember('shops_baseinfo:urls_successed', url): return None return self.make_requests_from_url(url)
def process_decode(self, item):
    data_str = bytes_to_str(item)
    data = json.loads(data_str)
    new_data = {
        'os_number': data.get('company_os_number'),
        'name': data.get('company_name'),
        'url': data.get('company_url'),
    }
    return new_data
def make_request_from_data(self, data): """ 解析任务 :param data: str redis任务数据 :return: """ data = bytes_to_str(data, self.redis_encoding) _data = json.loads(data) return self.make_requests_from_url(_data)
def make_request_from_data(self, data):
    house_id = bytes_to_str(data, self.redis_encoding)
    meta = {'house_id': house_id, 'handle_httpstatus_all': True}
    return Request(url=self.urlJoint(house_id),
                   callback=self.calendarParse,
                   errback=self.calendarErrback,
                   meta=meta,
                   dont_filter=True,
                   headers={'User-Agent': 'Mozilla/5.0'})
def make_request_from_data(self, data): """ 如果客户端传输不仅仅是url,需要带一些其他参数信息,就需要重写这个方法 """ param = json.loads(bytes_to_str(data, self.redis_encoding)) return scrapy.Request( url=param["url"], dont_filter=True, method="GET", )
def make_request_from_data(self, data): """ json_input supports all members of Request """ req_data = bytes_to_str(data, self.redis_encoding) if self.json_input: req_json = self.process_req_json(json.loads(req_data)) return self.request_from_json(req_json) return self.request_cls(url=req_data)
def make_request_from_data(self, data): """ 重写RedisCrawlSpider中的该算法, 以实现更多参数的传递 :param data: redis数据库中获取的数据 :return: """ # 从redis数据库中获取详情页的url redis_data = json.loads(bytes_to_str(data, self.redis_encoding)) url = redis_data.get('url') return scrapy.Request(url=str(url), callback=self.parse_detail)
def make_request_from_data(self, data):
    str_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(str_seed)
    if seed.type == 0:
        return Request(url=seed.value,
                       meta={"_seed": str_seed},
                       priority=0,
                       callback=self.parse)
    elif seed.type == 3:
        return Request.deserialize(seed.value, self)
def make_request_from_data(self, data): """ 重写RedisCrawlSpider中的该算法, 以实现更多参数的传递 :param data: redis数据库中获取的数据 :return: """ # 从redis数据库中获取详情页的url redis_data = json.loads(bytes_to_str(data, self.redis_encoding)) url = redis_data.get('url') "在mysql中查询" exist = self.session.query(Content).filter_by(url=url).first() if not exist: return scrapy.Request(url=str(url), callback=self.parse)
def make_request_from_data(self, data):
    keyword = bytes_to_str(data)
    url_keyword = urllib.parse.quote(keyword)
    for page in range(1, 101):
        url = ('https://s.taobao.com/list?data-key=s&data-value={}'
               '&ajax=true&q={}&cat=16&style=grid&seller_type=taobao').format(
                   (page - 1) * 60, url_keyword)
        yield Request(url=url, dont_filter=True)
def make_request_from_data(self, data):
    import json
    url = bytes_to_str(data, self.redis_encoding)
    meta = None
    # The message may also be JSON: {'url': 'xxxxx', 'meta': {...}},
    # e.g. meta = {'task': 123456}
    try:
        jurl = json.loads(url)
        url = jurl['url']
        meta = jurl['meta']
    except (ValueError, KeyError, TypeError):
        # Not JSON (or missing keys): treat the message as a plain URL.
        pass
    return Request(url, dont_filter=True, meta=meta)
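# The snippet above accepts either message shape, so a producer can push
# both interchangeably. The key name is hypothetical.
import json
import redis

r = redis.Redis()
r.lpush('myspider:start_urls', 'https://example.com/plain')
r.lpush('myspider:start_urls',
        json.dumps({'url': 'https://example.com/task', 'meta': {'task': 123456}}))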
def next_requests(self): """Replaces the default method. Closes spider when tickers are crawled and queue empty. This method customized from scraperVSRedis Spider because it has the Page param. in formdata. """ use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET) fetch_one = self.server.spop if use_set else self.server.lpop found = 0 while found < self.redis_batch_size: d = fetch_one(self.redis_key) if not d: break params = bytes_to_str(d, self.redis_encoding).split(";") ticker = params[0] self.idling = False # If page param. is pushed in, crawl that page # Otherwise begin with page 1 try: page = params[1] req = self.make_request_from_data(ticker, page) if req: yield req else: self.logger.info("Request not made from params: %r", d) except: req = self.make_request_from_data(ticker, "1") if req: yield req else: self.logger.info("Request not made from params: %r", d) found += 1 if found: self.logger.debug("Read %s params from '%s'", found, self.redis_key) # Close spider if corpAZ is closed and none in queue and spider is idling # Print off requests with errors, then delete all keys related to this Spider if self.r.get(self.corpAZ_closed_key) == "1" and self.r.llen( self.redis_key) == 0 and self.idling == True: self.logger.info(self.r.smembers(self.error_set_key)) keys = self.r.keys(f'{self.name}*') for k in keys: self.r.delete(k) self.crawler.engine.close_spider( spider=self, reason="CorpAZ is closed; Queue is empty; Processed everything" ) self.close_status()
def make_request_from_data(self, data):
    str_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(str_seed)
    if seed.type == 0:
        phonenumber = seed.value.strip()
        url = "http://shouji.xpcha.com/{0}.html".format(phonenumber)
        return Request(url=url,
                       meta={"_seed": str_seed},
                       priority=0,
                       callback=self.parse)
    elif seed.type == 3:
        return Request.deserialize(seed.value, self)
def make_request_from_data(self, data):
    keyword = bytes_to_str(data)
    url_keyword = urllib.parse.quote(keyword)
    for page in range(1, 101):
        url = 'https://s.taobao.com/search?q={}&js=1&s={}'.format(
            url_keyword, (page - 1) * 44)
        yield Request(url=url, dont_filter=True)
def make_request_from_data(self, data): """Returns a Request instance from data coming from Redis. By default, ``data`` is an encoded URL. You can override this method to provide your own message decoding. Parameters ---------- data : bytes Message from redis. """ url = bytes_to_str(data, self.redis_encoding) return self.make_requests_from_url(url)
def start_requests(self):
    items = redis_client.lrange("wanda_plaza:url", 0, -1)
    for item in items:
        item = bytes_to_str(item)
        home_page, plaza_name = item.split("|")
        business_url = home_page + "/shanghu"
        yield scrapy.Request(
            url=business_url,
            callback=self.parse_detail_page,
            meta={"name": plaza_name, "detail_page": home_page},
        )
def make_request_from_data(self, data): """Returns a Request instance from data coming from Redis. By default, ``data`` is an encoded URL. You can override this method to provide your own message decoding. Parameters ---------- data : bytes Message from redis. """ # pattern = r'https?\:\/\/sofifa.com\/team\/[0-9]+\/[a-zA-Z0-9-\ ]+\/' # url = bytes_to_str(data, self.redis_encoding) # url = re.findall(pattern, url)[0] url = bytes_to_str(data, self.redis_encoding) return self.make_requests_from_url(url)
def make_request_from_data(self, data): """Overrides make_request_from_data of RedisSpider, and then we could push post request parameters to Redis. :param data: :return: """ house_id = bytes_to_str(data, self.redis_encoding) payload = json.dumps(self.get_payload(house_id)) meta = {'house_id': house_id} return Request(url=self.url, method="POST", callback=self.parse, meta=meta, body=payload, headers=self.headers, dont_filter=True, errback=self.errback)
def make_request_from_data(self, data): """Returns a Request instance from data coming from Redis. By default, ``data`` is an encoded URL. You can override this method to provide your own message decoding. Parameters ---------- data : bytes Message from redis. Note: override RedisSpider to make it support to extract url from dictionary """ data = bytes_to_str(data, self.redis_encoding) data = json.loads(data) url = data['job_url'] request = Request(url) request.meta['job_info'] = data return request
def make_request_from_data(self, data):
    house_id = bytes_to_str(data, self.redis_encoding)
    meta = {'house_id': house_id, 'handle_httpstatus_all': True}
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'X-Airbnb-GraphQL-Platform-Client': 'apollo-niobe',
        'X-CSRF-Token': 'V4$.airbnb.cn$wPXtlwZHCOo$axPFMN6Y-FIzrRCv6IQQXLdHUIV6r9b9VWqFPO791kA=',
        'X-Airbnb-API-Key': 'd306zoyjsyarp7ifhu67rjxn52tv0t20',
    }
    return Request(url=self.urlJoint(house_id),
                   callback=self.detailParse,
                   errback=self.detailErrback,
                   meta=meta,
                   dont_filter=True,
                   headers=headers)
def make_request_from_data(self, data): """override method""" print("start to process") datastr = bytes_to_str(data, self.redis_encoding) base_url, fishtype = [a.strip() for a in datastr.split(",")] formdata = { 'currentMethod': 'imgs', 'fromSearch': 'yes', 'displayCount': '200', 'query': fishtype, '-Search': 'Search' } return scrapy.FormRequest(url=base_url, meta={'type': fishtype}, callback=self.parse, formdata=formdata, dont_filter=True)
def make_request_from_data(self, data):
    house_id = bytes_to_str(data, self.redis_encoding)
    meta = {'house_id': house_id, 'handle_httpstatus_all': True}
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'X-Airbnb-GraphQL-Platform-Client': 'apollo-niobe',
        'X-CSRF-Token': 'V4$.airbnb.com$N08Lvly9so8$9FCbNS_kWV_D2v-DNX8_ErpCxUhGEOx_x6zCLNui514=',
        'X-Airbnb-API-Key': 'd306zoyjsyarp7ifhu67rjxn52tv0t20',
    }
    return Request(url=self.urlJoint(house_id),
                   callback=self.detailParse,
                   errback=self.detailErrback,
                   meta=meta,
                   dont_filter=True,
                   headers=headers)
def make_request_from_data(self, data):
    str_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(str_seed)
    if seed.type == 0:
        skuid = seed.value
        # url = "https://wq.jd.com/commodity/comment/getcommentlist?callback=fetchJSON_comment98&pagesize=10&sceneval=2&skucomment=1&score=0&sku={0}&sorttype=6&page=0".format(skuid)
        url = ("https://wq.jd.com/commodity/comment/getcommentlist"
               "?callback=skuJDEvalB&version=v2&pagesize=10&sceneval=2"
               "&skucomment=1&score=0&sku={}&sorttype=6&page=1"
               "&t=0.5156075450518778").format(skuid)
        headers = {
            'Connection': 'close',
            'Host': 'wq.jd.com',
            'accept': '*/*',
            'sec-fetch-site': 'same-site',
            'sec-fetch-mode': 'no-cors',
            'sec-fetch-dest': 'script',
            'Referer': "https://item.m.jd.com/ware/view.action?wareId={}&sid=null".format(skuid),
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 10; HRY-AL00a; HMSCore 5.1.1.303) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 HuaweiBrowser/11.0.7.303 Mobile Safari/537.36',
            'cookie': "__jdc=122270672; mba_muid=16087105855231456793479; shshshfpa=b86c237d-b506-9cc9-730d-39db2f5ea48c-1608710586; shshshfpb=aW2xjA0PZevBiTvJrQ6rk4A%3D%3D; retina=1; webp=1; visitkey=31140776387466944; sbx_hot_h=null; deviceVersion=83.0.4103.106; deviceOS=android; deviceOSVersion=10; deviceName=Chrome; rurl=https%3A%2F%2Fwqs.jd.com%2Ffaqs%2Findex.html%3Fsceneval%3D2%26ptag%3D7001.1.124%26productId%3D12991458%26ispg%3D%26_fd%3Djdm%26jxsid%3D16109541564584400343; equipmentId=A75Q6PQS36IHI62HBEUGC44IVLERE7257UWVYTGEXPMR6NOKARSVVF2Q6EBPSVGNR537LK6GQN3ENW47JREOEXNAVI; __jdv=122270672%7Cdirect%7C-%7Cnone%7C-%7C1614224630058; sc_width=360; shshshfp=c6774e911e47825ddd51cefc23f9b157; wxa_level=1; cid=9; jxsid=16145705280303310338; __jda=122270672.16087105855231456793479.1608710585.1614224630.1614570529.10; wq_ug=14; fingerprint=794164a430090764096f40466260c718; mt_xid=V2_52007VwMVU1ReUlsbQB1YBmUDF1ZaXlpYGk8RbFVuBEBVWV9RRkhIGw4ZYlcRWkFQWwlIVR5aAjAAR1BZX1tZHnkaXQZnHxNQQVlSSx9JElgFbAEbYl9oUmoXSB5dDWYKE1BZXlNeF08cVQNvMxJbWV8%3D; wq_logid=1614571192.282863947; wqmnx1=MDEyNjM5M3AuL3d3MiY2NjQ1eGQtTTFBaSBsby8zd3IzZTUyNy00UkghKQ%3D%3D; __jdb=122270672.9.16087105855231456793479|10.1614570529; mba_sid=16145705290954323095988279117.9; __wga=1614571199267.1614570547761.1614225998734.1610954174749.5.6; PPRD_P=UUID.16087105855231456793479-LOGID.1614571199300.300139660; jxsid_s_t=1614571199496; jxsid_s_u=https%3A//item.m.jd.com/ware/view.action; sk_history=70241615154%2C101609%2C615036%2C54761686610%2C1399903%2C10024515889185%2C10381689654%2C12991458%2C100010062010%2C58070892025%2C100007627009%2C; shshshsID=e45b3b58ca53b7ab42489de6ebc02d6b_5_1614571200418"
        }
        return Request(url=url,
                       meta={"_seed": str_seed,
                             "dydmc_delay": 0.15 + random.random() * 0.1,
                             "headers": headers},
                       priority=0,
                       callback=self.parse)
    elif seed.type == 3:
        return Request.deserialize(seed.value, self)
def make_request_from_data(self, data): """ 如果客户端传输不仅仅是url,需要带一些其他参数信息,就需要重写这个方法 """ param = json.loads(bytes_to_str(data, self.redis_encoding)) return scrapy.Request(url=param["url"],dont_filter=True, method="GET",)
def test_bytes_to_str():
    assert bytes_to_str(b'foo') == 'foo'
    # This char is the same in bytes or latin1.
    assert bytes_to_str(b'\xc1', 'latin1') == '\xc1'