def make_request_from_data(self, data):
    """Turn a queued seed into the request for a product's folded comments.

    Args:
        data: Raw queue payload; decoded with ``self.redis_encoding`` and
            parsed by ``Seed.parse_seed``.

    Returns:
        For a type-0 seed, a request for page 0 of the folded-comments
        endpoint that uses the seed value as ``productId``. For a type-3
        seed, whatever ``Request.deserialize`` builds from the seed value.
        ``None`` for any other seed type.
    """
    raw_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(raw_seed)
    print(seed)

    if seed.type == 0:
        product_id = seed.value
        comments_url = (
            "https://club.jd.com/comment/getSkuPageFoldComments.action?callback=jQuery2675603&productId={0}&score=0&sortType=6&page=0&pageSize=10"
        ).format(product_id)
        request_meta = {
            "_seed": raw_seed,
            "current_page": 0,
            "headers": {
                "Connection": "close",
                "Referer": "https://item.m.jd.com/{0}.html".format(product_id),
            },
        }
        return Request(url=comments_url,
                       meta=request_meta,
                       priority=0,
                       callback=self.parse)

    if seed.type == 3:
        # The seed value itself is handed to Request.deserialize.
        return Request.deserialize(seed.value, self)

    return None
def parse3(self, response):
    """Yield one shop record per non-empty entry of the JSON payload.

    The first match of ``self.json_pettern`` in the response text is decoded
    as JSON and iterated. Every entry is printed; falsy entries are skipped
    after printing. ``cate_id`` and ``brand_id`` come from the seed stored in
    ``response.meta["_seed"]``.
    """
    seed = Seed.parse_seed(response.meta["_seed"])
    # The seed value must unpack into exactly four parts; only two are used.
    cate_id, brand_id, _page, _s = seed.value

    payload = json.loads(self.json_pettern.findall(response.text)[0])
    for entry in payload:
        print(entry)
        if not entry:
            continue
        seller = entry.get("seller")
        # Flag is 1 when the seller name contains "自营", otherwise 0.
        is_self_run = 1 if seller and seller.find("自营") != -1 else 0
        yield {
            "skuid": entry.get("pid"),
            "cate_id": cate_id,
            "brand_id": brand_id,
            "shopid": entry.get("shopId"),
            "venderid": entry.get("venderId"),
            "shop_name": seller,
            "ziying": is_self_run,
        }
def parse1(self, response):
    """Request the follow-up listing page for the current category/brand.

    The first match of ``self.first_pettern`` is forwarded as ``show_items``.
    Nothing is yielded when the pattern does not match or its first match is
    empty. The follow-up request goes to ``self.parse2`` with a type-2 seed
    holding the advanced ``page`` / ``s`` values.
    """
    seed = Seed.parse_seed(response.meta["_seed"])
    cate_id, brand_id, page, s = seed.value

    matches = self.first_pettern.findall(response.text)
    if not matches:
        return
    pids = matches[0]
    if not pids:
        return

    # Advance to the next page; ``s`` moves in steps of 30.
    page = page + 1
    s = s + 30

    cat_query = urllib.parse.urlencode({"cat": cate_id})
    if brand_id:
        brand_query = urllib.parse.urlencode({"ev": "exbrand_" + brand_id})
        url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&page={2}&s={3}&scrolling=y&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={4}'.format(
            cat_query, brand_query, page, s, pids)
        request = Request(url=url, callback=self.parse2, priority=2)
        referer = "https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1".format(
            cat_query, brand_query, page - 1, s - 30)
    else:
        url = 'https://list.jd.com/list.html?{0}&psort=4&page={1}&s={2}&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={3}'.format(
            cat_query, page, s, pids)
        request = Request(url=url, callback=self.parse2, priority=2)
        referer = "https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1".format(
            cat_query, page - 1, s - 30)

    # The Referer points at the page this response came from.
    request.headers["Referer"] = referer
    request.meta["_seed"] = str(Seed((cate_id, brand_id, page, s), type=2))
    request.meta["last_page_pids"] = pids
    yield request
def make_request_from_data(self, data):
    """Turn a queued seed into the first listing request for a category.

    Args:
        data: Raw queue payload; decoded with ``self.redis_encoding`` and
            parsed by ``Seed.parse_seed``.

    Returns:
        For a type-0 seed whose value is ``(cate_id, brand_id, name)``, a
        request for the category listing, filtered by ``exbrand_<name>`` when
        ``brand_id`` is truthy. For a type-3 seed, whatever
        ``Request.deserialize`` builds from the seed value. ``None`` for any
        other seed type.
    """
    raw_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(raw_seed)

    if seed.type == 0:
        cate_id, brand_id, name = seed.value
        if brand_id:
            # The parts are unused, but the unpacking raises unless the
            # category id has exactly three comma-separated parts.
            _cid1, _cid2, _cid3 = re.split(',', cate_id)
            cat_query = urllib.parse.urlencode({"cat": cate_id})
            brand_query = urllib.parse.urlencode({"ev": "exbrand_" + name})
            url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&click=1'.format(
                cat_query, brand_query)
        else:
            cat_query = urllib.parse.urlencode({"cat": cate_id})
            url = 'https://list.jd.com/list.html?{0}&psort=4&click=1'.format(
                cat_query)
        request_meta = {
            "_seed": raw_seed,
            "headers": {"Referer": "https://www.jd.com/"},
        }
        return Request(url=url,
                       meta=request_meta,
                       priority=0,
                       callback=self.parse)

    if seed.type == 3:
        return Request.deserialize(seed.value, self)

    return None
def parse5(self, response):
    """Merge price, comment count and product info into one record.

    The record starts from ``response.meta["prices"]``, gains a ``comment``
    key holding the first match of ``self.allcnt_pattern``, and is then
    updated with the ``response.meta["info"]`` entry keyed by the integer
    form of the price record's ``id``. The ``id`` key is removed before the
    record is yielded.
    """
    # Parsed but not otherwise used; it still fails on a malformed seed.
    seed = Seed.parse_seed(response.meta["_seed"])
    counts = self.allcnt_pattern.findall(response.text)
    prices = response.meta["prices"]
    sku_key = prices["id"]

    record = dict(prices)
    record["comment"] = counts[0]
    record.update(response.meta["info"][int(sku_key)])
    record.pop("id")
    yield record
def parse(self, response):
    """Inspect the total page count of a response (pagination placeholder).

    The first match of ``self.totalpage_perttern`` is read as the page
    count. Both branches are still placeholders, so nothing is produced.

    The emptiness check now runs before the first match is indexed. It used
    to run afterwards, so a response without a match raised ``IndexError``.

    Returns:
        None.
    """
    # Parsed but not otherwise used; it still fails on a malformed seed.
    seed = Seed.parse_seed(response.meta["_seed"])
    page_strs = self.totalpage_perttern.findall(response.text)
    if not page_strs:
        # No page count on this page: nothing to decide.
        return

    page_strs = page_strs[0]
    if int(page_strs) < 100:
        # Fewer than 100 pages: no need to flip through pages.
        pass
    else:
        # 100 pages or more: paging is needed (not implemented here).
        pass
def make_request_from_data(self, data):
    """Turn a queued seed into a request.

    Args:
        data: Raw queue payload; decoded with ``self.redis_encoding`` and
            parsed by ``Seed.parse_seed``.

    Returns:
        For a type-0 seed, a request whose URL is the seed value itself.
        For a type-3 seed, whatever ``Request.deserialize`` builds from the
        seed value. ``None`` for any other seed type.
    """
    raw_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(raw_seed)

    if seed.type == 0:
        return Request(url=seed.value,
                       meta={"_seed": raw_seed},
                       priority=0,
                       callback=self.parse)

    if seed.type == 3:
        return Request.deserialize(seed.value, self)

    return None
def make_request_from_data(self, data):
    """Turn a queued seed into a phone-number lookup request.

    Args:
        data: Raw queue payload; decoded with ``self.redis_encoding`` and
            parsed by ``Seed.parse_seed``.

    Returns:
        For a type-0 seed, a request for the lookup page of the stripped
        seed value. For a type-3 seed, whatever ``Request.deserialize``
        builds from the seed value. ``None`` for any other seed type.
    """
    raw_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(raw_seed)

    if seed.type == 0:
        number = seed.value.strip()
        lookup_url = "http://shouji.xpcha.com/{0}.html".format(number)
        return Request(url=lookup_url,
                       meta={"_seed": raw_seed},
                       priority=0,
                       callback=self.parse)

    if seed.type == 3:
        return Request.deserialize(seed.value, self)

    return None
def parse3(self, response):
    """Yield one shop record per non-empty entry of the JSON payload.

    The first match of ``self.json_pettern`` is decoded as JSON. Titles are
    looked up in ``response.meta["sku2title"]`` by the string form of each
    entry's ``pid``.

    Raises:
        Exception: carrying the request URL when the decoded payload is
            empty.
    """
    seed = Seed.parse_seed(response.meta["_seed"])
    # The seed value must unpack into exactly four parts; only two are used.
    cate_id, brand_id, _page, _s = seed.value

    entries = json.loads(self.json_pettern.findall(response.text)[0])
    if not entries:
        raise Exception(response.request.url)

    for entry in entries:
        if not entry:
            self.logger.info("jd_skuids 2")
            continue

        self.logger.info("jd_skuids 1")
        pid = entry.get("pid")
        seller = entry.get("seller")
        # Flag is 1 when the seller name contains "自营", otherwise 0.
        is_self_run = 1 if seller and seller.find("自营") != -1 else 0
        title = response.meta["sku2title"][str(pid)]
        yield {
            "skuid": pid,
            "cate_id": cate_id,
            "brand_id": brand_id,
            "shopid": entry.get("shopId"),
            "venderid": entry.get("venderId"),
            "shop_name": seller,
            "ziying": is_self_run,
            "title": title,
            # Flag is 1 when the title mentions "京东超市", otherwise 0.
            "chaoshi": 1 if "京东超市" in title else 0,
        }
def parse3(self, response):
    """Collect shop info per SKU, then request prices for the whole batch.

    The first match of ``self.json_pettern`` is decoded as JSON and turned
    into a mapping from ``pid`` to a shop record. The mapping is stored in
    ``response.meta["info"]`` and a single price request covering every key
    of ``response.meta["sku2title"]`` is yielded with ``self.parse4`` as its
    callback. ``response.meta`` is modified in place and forwarded.

    Raises:
        Exception: carrying the request URL when the decoded payload is
            empty.
    """
    seed = Seed.parse_seed(response.meta["_seed"])
    # The seed value must unpack into exactly four parts; only two are used.
    cate_id, brand_id, _page, _s = seed.value

    entries = json.loads(self.json_pettern.findall(response.text)[0])
    if not entries:
        raise Exception(response.request.url)

    info_by_pid = {}
    for entry in entries:
        if not entry:
            continue
        pid = entry.get("pid")
        seller = entry.get("seller")
        title = response.meta["sku2title"][str(pid)]
        info_by_pid[pid] = {
            "skuid": pid,
            "cate_id": cate_id,
            "brand_id": brand_id,
            "shopid": entry.get("shopId"),
            "venderid": entry.get("venderId"),
            "shop_name": seller,
            # Flag is 1 when the seller name contains "自营", otherwise 0.
            "ziying": 1 if seller and seller.find("自营") != -1 else 0,
            "title": title,
            # Flag is 1 when the title mentions "京东超市", otherwise 0.
            "chaoshi": 1 if "京东超市" in title else 0,
        }

    meta = response.meta
    meta["info"] = info_by_pid
    meta["dydmc_delay"] = 1
    meta["headers"] = {
        "Connection": "close",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36",
    }

    # Every SKU id is prefixed with "J_" and joined by a URL-encoded comma.
    sku_part = "%2CJ_".join(meta["sku2title"])
    pduid = str(random.randint(100000000, 999999999))
    price_url = ("https://p.3.cn/prices/mgets?&type=1&skuIds=J_" + sku_part +
                 '&pduid=' + pduid)
    yield Request(url=price_url,
                  callback=self.parse4,
                  meta=meta,
                  priority=4)
def make_request_from_data(self, data):
    """Turn a queued seed into a mobile comment-list request for a SKU.

    Args:
        data: Raw queue payload; decoded with ``self.redis_encoding`` and
            parsed by ``Seed.parse_seed``.

    Returns:
        For a type-0 seed, a request for the comment list of the SKU in the
        seed value, carrying a fixed header set and a randomized
        ``dydmc_delay`` between 0.15 and 0.25. For a type-3 seed, whatever
        ``Request.deserialize`` builds from the seed value. ``None`` for any
        other seed type.
    """
    raw_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(raw_seed)

    if seed.type == 0:
        sku = seed.value
        comments_url = (
            "https://wq.jd.com/commodity/comment/getcommentlist?callback=skuJDEvalB&version=v2&pagesize=10&sceneval=2&skucomment=1&score=0&sku={}&sorttype=6&page=1&t=0.5156075450518778"
        ).format(sku)
        referer = "https://item.m.jd.com/ware/view.action?wareId={}&sid=null".format(
            sku)
        # NOTE(review): the cookie below is a hard-coded captured browser
        # session; consider moving it to configuration.
        request_headers = {
            'Connection': 'close',
            'Host': 'wq.jd.com',
            'accept': '*/*',
            'sec-fetch-site': 'same-site',
            'sec-fetch-mode': 'no-cors',
            'sec-fetch-dest': 'script',
            "Referer": referer,
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 10; HRY-AL00a; HMSCore 5.1.1.303) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 HuaweiBrowser/11.0.7.303 Mobile Safari/537.36',
            "cookie": "__jdc=122270672; mba_muid=16087105855231456793479; shshshfpa=b86c237d-b506-9cc9-730d-39db2f5ea48c-1608710586; shshshfpb=aW2xjA0PZevBiTvJrQ6rk4A%3D%3D; retina=1; webp=1; visitkey=31140776387466944; sbx_hot_h=null; deviceVersion=83.0.4103.106; deviceOS=android; deviceOSVersion=10; deviceName=Chrome; rurl=https%3A%2F%2Fwqs.jd.com%2Ffaqs%2Findex.html%3Fsceneval%3D2%26ptag%3D7001.1.124%26productId%3D12991458%26ispg%3D%26_fd%3Djdm%26jxsid%3D16109541564584400343; equipmentId=A75Q6PQS36IHI62HBEUGC44IVLERE7257UWVYTGEXPMR6NOKARSVVF2Q6EBPSVGNR537LK6GQN3ENW47JREOEXNAVI; __jdv=122270672%7Cdirect%7C-%7Cnone%7C-%7C1614224630058; sc_width=360; shshshfp=c6774e911e47825ddd51cefc23f9b157; wxa_level=1; cid=9; jxsid=16145705280303310338; __jda=122270672.16087105855231456793479.1608710585.1614224630.1614570529.10; wq_ug=14; fingerprint=794164a430090764096f40466260c718; mt_xid=V2_52007VwMVU1ReUlsbQB1YBmUDF1ZaXlpYGk8RbFVuBEBVWV9RRkhIGw4ZYlcRWkFQWwlIVR5aAjAAR1BZX1tZHnkaXQZnHxNQQVlSSx9JElgFbAEbYl9oUmoXSB5dDWYKE1BZXlNeF08cVQNvMxJbWV8%3D; wq_logid=1614571192.282863947; wqmnx1=MDEyNjM5M3AuL3d3MiY2NjQ1eGQtTTFBaSBsby8zd3IzZTUyNy00UkghKQ%3D%3D; __jdb=122270672.9.16087105855231456793479|10.1614570529; mba_sid=16145705290954323095988279117.9; __wga=1614571199267.1614570547761.1614225998734.1610954174749.5.6; PPRD_P=UUID.16087105855231456793479-LOGID.1614571199300.300139660; jxsid_s_t=1614571199496; jxsid_s_u=https%3A//item.m.jd.com/ware/view.action; sk_history=70241615154%2C101609%2C615036%2C54761686610%2C1399903%2C10024515889185%2C10381689654%2C12991458%2C100010062010%2C58070892025%2C100007627009%2C; shshshsID=e45b3b58ca53b7ab42489de6ebc02d6b_5_1614571200418",
        }
        request_meta = {
            "_seed": raw_seed,
            "dydmc_delay": 0.15 + random.random() * 0.1,
            "headers": request_headers,
        }
        return Request(url=comments_url,
                       meta=request_meta,
                       priority=0,
                       callback=self.parse)

    if seed.type == 3:
        return Request.deserialize(seed.value, self)

    return None
def parse1(self, response):
    """Collect SKU titles and request the follow-up listing page.

    Titles come from ``self.sku_pattern1`` matches (group 0 is the key,
    group 1 the raw title) with markup tags, tabs and newlines removed. The
    first match of ``self.first_pettern`` is forwarded as ``show_items``
    to the follow-up request, which is handled by ``self.parse2``.

    The tag-stripping regex is a raw string, so ``\\s`` / ``\\S`` are no
    longer invalid string escapes; it matches exactly as before.

    Raises:
        Exception: carrying the request URL when ``self.first_pettern`` does
            not match or its first match is empty.
    """
    seed = Seed.parse_seed(response.meta["_seed"])
    cate_id, brand_id, name, page, s = seed.value

    sku2title = {}
    for sku in self.sku_pattern1.findall(response.text):
        sku2title[sku[0]] = re.sub(r"<[\s\S]*?>|\t|\n", "", sku[1])

    r1 = self.first_pettern.findall(response.text)
    if not r1:
        raise Exception(response.request.url)
    r1 = r1[0]
    if not r1:
        raise Exception(response.request.url)

    # Advance to the next page; ``s`` moves in steps of 30.
    page, s, items = page + 1, s + 30, r1

    en_cate_id = urllib.parse.urlencode({"cat": cate_id})
    if brand_id:
        ename = urllib.parse.urlencode({"ev": "exbrand_" + name})
        url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&page={2}&s={3}&scrolling=y&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={4}'.format(
            en_cate_id, ename, page, s, items)
        request = Request(url=url, callback=self.parse2, priority=2)
        request.headers["Referer"] = "https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1".format(
            en_cate_id, ename, page - 1, s - 30)
    else:
        url = 'https://list.jd.com/list.html?{0}&psort=4&page={1}&s={2}&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={3}'.format(
            en_cate_id, page, s, items)
        request = Request(url=url, callback=self.parse2, priority=2)
        request.headers["Referer"] = "https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1".format(
            en_cate_id, page - 1, s - 30)

    # The forwarded seed is a 4-tuple: ``name`` is not carried over.
    request.meta["_seed"] = str(Seed((cate_id, brand_id, page, s), type=2))
    request.meta["last_page_pids"] = r1
    request.meta["sku2title"] = sku2title
    request.meta["totalpage"] = response.meta["totalpage"]
    request.meta["currentpage"] = page - 1
    request.meta["dydmc_delay"] = 2.5
    yield request
def make_request_from_data(self, data):
    """Turn a queued category seed into a brand/publisher listing request.

    Args:
        data: Raw queue payload; decoded with ``self.redis_encoding`` and
            parsed by ``Seed.parse_seed``.

    Returns:
        For a type-0 seed, a request for the category's list page. It uses
        ``md=2`` / ``list_pub`` when the first comma-separated part of the
        seed value is ``'1713'``, and ``md=1`` / ``list_brand`` otherwise.
        For a type-3 seed, whatever ``Request.deserialize`` builds from the
        seed value. ``None`` for any other seed type.
    """
    raw_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(raw_seed)

    if seed.type == 0:
        top_level = re.split(',', seed.value)[0]
        if top_level == '1713':
            md, list_kind = 2, "pub"
        else:
            md, list_kind = 1, "brand"
        url = 'https://list.jd.com/list.html?cat={0}&trans=1&md={1}&my=list_{2}'.format(
            seed.value, md, list_kind)
        return Request(url=url,
                       meta={"_seed": raw_seed},
                       priority=0,
                       callback=self.parse)

    if seed.type == 3:
        return Request.deserialize(seed.value, self)

    return None
def parse(self, response):
    """Yield the comments on this page, then request the next page.

    The first match of ``self.comments_pattern`` is decoded as JSON unless
    it is the literal ``'[]'``, in which case nothing is yielded. After the
    comment records, one request for the following page is yielded while
    the current page index is below the last page index (capped at 99,
    ten comments per page).
    """
    seed = Seed.parse_seed(response.meta["_seed"])
    skuid = seed.value
    counts = self.allcnt_pattern.findall(response.text)
    raw_comments = self.comments_pattern.findall(response.text)[0]
    if raw_comments == '[]':
        return

    copied_fields = (
        "id", "creationTime", "isTop", "isMobile", "userLevelName",
        "userClientShow", "plusAvailable", "firstCategory",
        "secondCategory", "thirdCategory", "discussionId", "referenceId",
        "referenceTime", "nickname",
    )
    for comment in json.loads(raw_comments):
        record = {field: comment.get(field) for field in copied_fields}
        # ``isTop`` is emitted as a string; reassigning keeps its position.
        record["isTop"] = str(record["isTop"])
        record["commentcout"] = counts[0]
        record["current_page"] = response.meta["current_page"]
        yield record

    last_page_index = max(0, min((int(counts[0]) - 1) // 10, 99))
    current_page = response.meta["current_page"]
    if current_page < last_page_index:
        next_url = "https://club.jd.com/comment/getSkuPageFoldComments.action?callback=jQuery2675603&productId={0}&score=0&sortType=6&page={1}&pageSize=10".format(
            skuid, current_page + 1)
        next_meta = {
            "_seed": response.meta["_seed"],
            "commentcount": counts[0],
            "current_page": current_page + 1,
            "headers": {
                "Connection": "close",
                "Referer": "https://item.m.jd.com/{0}.html".format(skuid),
            },
        }
        yield Request(url=next_url,
                      meta=next_meta,
                      priority=1,
                      callback=self.parse)
def parse1(self, response):
    """Collect SKU titles, then request the second half page or shop info.

    Titles come from ``self.sku_pattern1`` matches (group 0 is the key,
    group 1 the raw title) with markup tags, tabs and newlines removed. When
    the first match of ``self.first_pettern`` lists exactly 30
    comma-separated ids, the follow-up listing request is yielded for
    ``self.parse2``. Otherwise the ids go straight to the chat-check
    endpoint, handled by ``self.parse3``.

    The tag-stripping regex is a raw string, so ``\\s`` / ``\\S`` are no
    longer invalid string escapes; it matches exactly as before.

    Raises:
        Exception: carrying the request URL when ``self.first_pettern`` does
            not match or its first match is empty.
    """
    seed = Seed.parse_seed(response.meta["_seed"])
    cate_id, brand_id, page, s = seed.value

    sku2title = {}
    for sku in self.sku_pattern1.findall(response.text):
        sku2title[sku[0]] = re.sub(r"<[\s\S]*?>|\t|\n", "", sku[1])

    r1 = self.first_pettern.findall(response.text)
    if not r1:
        raise Exception(response.request.url)
    r1 = r1[0]
    if not r1:
        raise Exception(response.request.url)

    if len(r1.split(",")) != 30:
        # Fewer than 30 ids means there is no second half page, e.g.
        # "https://chat1.jd.com/api/checkChat?pidList=10020242230938,1999899692,72276507174,19999997645,1999899692,100000002015,100000002686,200134637813&callback=jQuery8117083"
        # NOTE(review): this forwards response.meta unchanged, so the
        # sku2title mapping built above is not passed along on this path.
        # Confirm that the parse3 callback does not need it here.
        yield Request(
            url="https://chat1.jd.com/api/checkChat?pidList={0}&callback=jQuery8117083".format(r1),
            callback=self.parse3,
            meta=response.meta)
        return

    # Advance to the next page; ``s`` moves in steps of 30.
    page, s, items = page + 1, s + 30, r1

    en_cate_id = urllib.parse.urlencode({"cat": cate_id})
    if brand_id:
        en_brand_id = urllib.parse.urlencode({"ev": "exbrand_" + brand_id})
        url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&page={2}&s={3}&scrolling=y&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={4}'.format(
            en_cate_id, en_brand_id, page, s, items)
        request = Request(url=url, callback=self.parse2, priority=2)
        request.headers["Referer"] = "https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1".format(
            en_cate_id, en_brand_id, page - 1, s - 30)
        self.logger.info("jd_skuids 6")
    else:
        url = 'https://list.jd.com/list.html?{0}&psort=4&page={1}&s={2}&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={3}'.format(
            en_cate_id, page, s, items)
        request = Request(url=url, callback=self.parse2, priority=2)
        request.headers["Referer"] = "https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1".format(
            en_cate_id, page - 1, s - 30)
        self.logger.info("jd_skuids 7")

    request.meta["_seed"] = str(Seed((cate_id, brand_id, page, s), type=2))
    request.meta["last_page_pids"] = r1
    request.meta["sku2title"] = sku2title
    request.meta["totalpage"] = response.meta["totalpage"]
    request.meta["currentpage"] = page - 1
    yield request
def parse(self, response):
    """Fan out one listing request per result page.

    The first match of ``self.totalpage_perttern`` is read as the page
    count. For ``i`` from 1 to that count, a request is yielded for page
    ``2 * i - 1`` with ``s = 60 * (i - 1) + 1``, handled by ``self.parse1``
    and carrying a type-1 seed. The Referer is the site home page for the
    first request and the URL built from ``i - 1`` for the others.

    Raises:
        Exception: carrying the request URL when no page count is found.
    """
    seed = Seed.parse_seed(response.meta["_seed"])
    found = self.totalpage_perttern.findall(response.text)
    if not found:
        raise Exception(response.request.url)

    total_pages = int(found[0])
    for i in range(1, total_pages + 1):
        page = 2 * i - 1
        s = 60 * (i - 1) + 1
        cate_id, brand_id, name = seed.value

        en_cate_id = urllib.parse.urlencode({"cat": cate_id})
        if brand_id:
            ename = urllib.parse.urlencode({"ev": "exbrand_" + name})
            template = 'https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1'
            url = template.format(en_cate_id, ename, page, s)
            if i == 1:
                refer = "https://www.jd.com/"
            else:
                refer = template.format(en_cate_id, ename,
                                        2 * (i - 1) - 1, 60 * (i - 2) + 1)
        else:
            template = 'https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1'
            url = template.format(en_cate_id, page, s)
            if i == 1:
                refer = "https://www.jd.com/"
            else:
                refer = template.format(en_cate_id,
                                        2 * (i - 1) - 1, 60 * (i - 2) + 1)

        request_meta = {
            "dydmc_delay": 2.5,
            "totalpage": total_pages,
            "currentpage": page,
            "_seed": str(Seed((cate_id, brand_id, name, page, s), type=1)),
            "headers": {
                "Connection": "close",
                "Referer": refer,
            },
        }
        yield Request(url=url,
                      callback=self.parse1,
                      meta=request_meta,
                      priority=1)
def parse1(self, response):
    """Yield the location record for a phone number from the fallback page.

    When ``self.pattern`` matches, its first match supplies the city,
    company and phone number, and the province is derived with
    ``city2prov``. Otherwise the seed's phone number is reported with every
    other field set to "未发现".
    """
    seed = Seed.parse_seed(response.meta["_seed"])
    phonenumber = seed.value
    province = city = company = "未发现"

    matches = self.pattern.findall(response.text)
    if matches:
        # The matched number replaces the one taken from the seed.
        city, company, phonenumber = matches[0]
        province = city2prov(city)

    yield {
        "phonenumber": phonenumber,
        "province": province,
        "city": city,
        "company": company,
    }
def make_request_from_data(self, data):
    """Turn a queued seed into a price lookup request.

    Args:
        data: Raw queue payload; decoded with ``self.redis_encoding`` and
            parsed by ``Seed.parse_seed``.

    Returns:
        For a type-0 seed, a request for the price endpoint with the seed
        value appended after ``skuIds=J_`` and ``self.usrid`` as ``pduid``.
        For a type-3 seed, whatever ``Request.deserialize`` builds from the
        seed value. ``None`` for any other seed type.
    """
    raw_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(raw_seed)

    if seed.type == 0:
        sku_ids = seed.value
        price_url = ("http://p.3.cn/prices/mgets?&type=1&skuIds=J_" +
                     sku_ids + '&pduid=' + self.usrid)
        request_meta = {
            "_seed": raw_seed,
            "headers": {"Connection": "keep-alive"},
        }
        return Request(url=price_url,
                       meta=request_meta,
                       priority=0,
                       callback=self.parse)

    if seed.type == 3:
        return Request.deserialize(seed.value, self)

    return None
def parse(self, response):
    """Yield this page's comments, then fan out requests for every page.

    The first match of ``self.comments_pattern`` is rewritten into a Python
    literal (whitespace stripped, trailing commas before ``}`` removed,
    booleans capitalized and then quoted where followed by a comma) and
    parsed with ``literal_eval``. After the comment records, one request per
    page index from 0 to the last page index (capped at 99, ten comments per
    page) is yielded with ``self.parse1`` as the callback.
    """
    seed = Seed.parse_seed(response.meta["_seed"])
    skuid = seed.value
    counts = self.allcnt_pattern.findall(response.text)

    # Applied strictly in this order; later rules depend on earlier ones.
    rewrites = (
        ("\t", ""),
        ("\n", ""),
        (",}", "}"),
        ("false", 'False'),
        ("true", 'True'),
        ('False,', '"False",'),
        ('True,', '"True",'),
    )
    literal_text = self.comments_pattern.findall(response.text)[0]
    for old, new in rewrites:
        literal_text = literal_text.replace(old, new)

    copied_fields = (
        "id", "creationTime", "isTop", "isMobile", "userLevelName",
        "userClientShow", "plusAvailable", "firstCategory",
        "secondCategory", "thirdCategory", "discussionId", "referenceId",
        "referenceTime", "nickname",
    )
    for comment in literal_eval(literal_text):
        record = {field: comment.get(field) for field in copied_fields}
        # ``isTop`` is emitted as a string; reassigning keeps its position.
        record["isTop"] = str(record["isTop"])
        record["commentcout"] = counts[0]
        yield record

    last_page_index = max(0, min((int(counts[0]) - 1) // 10, 99))
    for page_index in range(last_page_index + 1):
        page_url = "https://wq.jd.com/commodity/comment/getcommentlist?callback=fetchJSON_comment98&pagesize=10&sceneval=2&skucomment=1&score=0&sku={0}&sorttype=6&page={1}".format(
            skuid, page_index)
        page_meta = {
            "_seed": response.meta["_seed"],
            "commentcount": counts[0],
            "headers": {
                "Connection": "close",
                "Referer": "https://item.m.jd.com/{0}.html".format(skuid),
            },
        }
        yield Request(url=page_url,
                      meta=page_meta,
                      priority=1,
                      callback=self.parse1)
def parse(self, response):
    """Yield the location record for a phone number, or fall back.

    When ``self.pro_city_pattern`` matches and the matched province is not
    "未知", a record is yielded; the city falls back to the province when
    the matched city is empty. In every other case a fallback lookup
    request is yielded with ``self.parse1`` as the callback.
    """
    seed = Seed.parse_seed(response.meta["_seed"])
    phonenumber = seed.value
    locations = self.pro_city_pattern.findall(response.text)
    carriers = self.telcompany_pattern.findall(response.text)

    if locations and locations[0][0] != "未知":
        province, city = locations[0][0], locations[0][1]
        if city == "":
            city = province
        yield {
            "phonenumber": phonenumber,
            "province": province,
            "city": city,
            "company": carriers[0],
        }
        return

    # No usable result on this page: try the fallback lookup instead.
    fallback_url = "https://haoma.baidu.com/phoneSearch?search={0}".format(
        phonenumber)
    fallback_meta = {
        "_seed": response.meta["_seed"],
        "headers": {"Referer": "https://www.baidu.com/"},
    }
    yield Request(url=fallback_url,
                  meta=fallback_meta,
                  priority=1,
                  callback=self.parse1)
def parse(self, response):
    """Yield the brands of a category, or a status record when none exist.

    Each ``self.pattern`` match yields a record with status 0. Without any
    match, a single record is yielded: status -1 when the page does not
    contain the "no products found" banner (a category without brands),
    status -2 when it does (a category that does not exist).
    """
    seed = Seed.parse_seed(response.meta["_seed"])
    brands = self.pattern.findall(response.text)

    if brands:
        for brand in brands:
            yield {
                "brand_id": brand[0],
                "name": brand[1],
                "cate_id": seed.value,
                "_seed": seed.value,
                "status": 0,
            }
        return

    not_found_banner = """<span class="result">抱歉,没有找到与“<em></em>”相关的商品</span>"""
    if not_found_banner not in response.text:
        # Category without brands.
        status = -1
    else:
        # Category that does not exist.
        status = -2
    yield {"cate_id": seed.value, "_seed": seed.value, "status": status}
def make_request_from_data(self, data):
    """Turn a queued seed into the first product-comments page request.

    Args:
        data: Raw queue payload; decoded with ``self.redis_encoding`` and
            parsed by ``Seed.parse_seed``.

    Returns:
        For a type-0 seed, a request for page 0 of the product-comments
        endpoint that uses the seed value as ``productId``. For a type-3
        seed, whatever ``Request.deserialize`` builds from the seed value.
        ``None`` for any other seed type.
    """
    raw_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(raw_seed)

    if seed.type == 0:
        product_id = seed.value
        url_template = (
            "https://club.jd.com/comment/skuProductPageComments.action?callback=fetchJSON_comment98"
            "&productId={0}&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1"
        )
        request_meta = {
            "_seed": raw_seed,
            "headers": {
                "Connection": "close",
                "Referer": "https://item.m.jd.com/{0}.html".format(product_id),
            },
        }
        return Request(url=url_template.format(product_id),
                       meta=request_meta,
                       priority=0,
                       callback=self.parse)

    if seed.type == 3:
        return Request.deserialize(seed.value, self)

    return None
def parse0(self, response):
    """Yield one listing request per brand found on a category page.

    Each ``self.pattern`` match supplies ``(brand_id, name)``; the category
    id is the seed value. The listing URL is filtered by ``exbrand_<name>``
    when ``brand_id`` is truthy. Every request carries a new seed built from
    ``(cate_id, brand_id, name)`` and is handled by ``self.parse``.
    """
    seed = Seed.parse_seed(response.meta["_seed"])

    for match in self.pattern.findall(response.text):
        cate_id = seed.value
        brand_id = match[0]
        name = match[1]

        cat_query = urllib.parse.urlencode({"cat": cate_id})
        if brand_id:
            brand_query = urllib.parse.urlencode({"ev": "exbrand_" + name})
            url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&click=1'.format(
                cat_query, brand_query)
        else:
            url = 'https://list.jd.com/list.html?{0}&psort=4&click=1'.format(
                cat_query)

        request_meta = {
            "dydmc_delay": 2.5,
            "_seed": str(Seed(value=(cate_id, brand_id, name))),
            "headers": {"Referer": "https://www.jd.com/"},
        }
        yield Request(url=url,
                      meta=request_meta,
                      priority=0,
                      callback=self.parse)
def make_request_from_data(self, data):
    """Turn a queued seed into the first mobile comment-list request.

    Args:
        data: Raw queue payload; decoded with ``self.redis_encoding`` and
            parsed by ``Seed.parse_seed``.

    Returns:
        For a type-0 seed, a request for page 0 of the comment list of the
        SKU in the seed value. For a type-3 seed, whatever
        ``Request.deserialize`` builds from the seed value. ``None`` for any
        other seed type.
    """
    raw_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(raw_seed)

    if seed.type == 0:
        sku = seed.value
        comments_url = "https://wq.jd.com/commodity/comment/getcommentlist?callback=fetchJSON_comment98&pagesize=10&sceneval=2&skucomment=1&score=0&sku={0}&sorttype=6&page=0".format(
            sku)
        request_meta = {
            "_seed": raw_seed,
            "headers": {
                "Connection": "close",
                "Referer": "https://item.m.jd.com/{0}.html".format(sku),
            },
        }
        return Request(url=comments_url,
                       meta=request_meta,
                       priority=0,
                       callback=self.parse)

    if seed.type == 3:
        return Request.deserialize(seed.value, self)

    return None
def parse(self, response):
    """Yield the comment count for the SKU named by the seed.

    The count is the first match of ``self.allcnt_pattern`` in the response
    text; the SKU id is the value of the seed in ``response.meta["_seed"]``.
    """
    seed = Seed.parse_seed(response.meta["_seed"])
    counts = self.allcnt_pattern.findall(response.text)
    yield {"skuid": seed.value, "comment": counts[0]}