def parse1(self, response):
    seed = Seed.parse_seed(response.meta["_seed"])
    cate_id, brand_id, page, s = seed.value
    r1 = self.first_pettern.findall(response.text)
    if r1:
        r1 = r1[0]
        if r1:
            # advance to the bottom half of the list page: +1 page, +30 items
            cate_id, brand_id, page, s, items = cate_id, brand_id, page + 1, s + 30, r1
            if brand_id:
                en_cate_id, en_brand_id = urllib.parse.urlencode({"cat": cate_id}), urllib.parse.urlencode({"ev": "exbrand_" + brand_id})
                url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&page={2}&s={3}&scrolling=y&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={4}'.format(
                    en_cate_id, en_brand_id, page, s, items)
                request = Request(url=url, callback=self.parse2, priority=2)
                request.headers["Referer"] = "https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1".format(
                    en_cate_id, en_brand_id, page - 1, s - 30)
            else:
                en_cate_id = urllib.parse.urlencode({"cat": cate_id})
                url = 'https://list.jd.com/list.html?{0}&psort=4&page={1}&s={2}&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={3}'.format(
                    en_cate_id, page, s, items)
                request = Request(url=url, callback=self.parse2, priority=2)
                request.headers["Referer"] = "https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1".format(
                    en_cate_id, page - 1, s - 30)
            request.meta["_seed"] = str(Seed((cate_id, brand_id, page, s), type=2))
            request.meta["last_page_pids"] = r1
            yield request
def __init__(self, seeds_file, **kwargs):
    super(JDPrice, self).__init__(**kwargs)
    self.proxies = list(map(lambda x: ("http://u{}:[email protected]:3128".format(x)), range(28)))
    self.ua = UserAgent()
    strr = ""  # guard against an empty seeds file
    with open(seeds_file) as infile:
        for i, seed in enumerate(infile):
            current = seed.strip('\n').split("\t")[0]
            if i % 60 == 0:
                if i != 0:
                    self.seeds_queue.put(Seed(strr, kwargs["retries"]))
                strr = current
            else:
                strr = strr + '%2CJ_' + current
        if strr:
            self.seeds_queue.put(Seed(strr, kwargs["retries"]))
    self.price_ad = 'http://p.3.cn/prices/mgets?&type=1&skuIds=J_'
    self.block_pattern = re.compile(r'{.*?}')
    self.innerid_pattern = re.compile(r'\d+')
    self.innerprice_pattern = re.compile(r'"\d+.\d+"')
    self.op_pattern = re.compile(r'"op":"(\d+.\d+)"')
    self.p_pattern = re.compile(r'(\d+.\d+)"')
    self.p2_pattern = re.compile(r'(-\d+.\d+)')
    self.p1 = re.compile(r'id":.*?p":".*?"}')
    self.id_pattern = re.compile(r'id:"(\d+)"')
    self.first_pattern = re.compile(r'([a-zA-Z]*)":')
    self.rid = random.randint(100000000, 999999999)
    self.usrid = str(self.rid)
    self.up_pattern = re.compile('"up":"tpp"')
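# The 60-id batching above ("%2CJ_"-joined skuid blocks) recurs in several
# __init__ and init_start_urls methods below. A minimal, self-contained sketch
# of the same idea as a reusable helper; the name `batch_skuids` is
# hypothetical and not part of this codebase:
def batch_skuids(skuids, batch_size=60, sep='%2CJ_'):
    """Yield sep-joined batches of skuids, flushing the final partial batch."""
    batch = []
    for skuid in skuids:
        batch.append(skuid.strip())
        if len(batch) == batch_size:
            yield sep.join(batch)
            batch = []
    if batch:  # flush whatever is left over
        yield sep.join(batch)

# Usage sketch (assuming the queue/Seed API seen above):
#     for block in batch_skuids(open(seeds_file)):
#         self.seeds_queue.put(Seed(block, kwargs["retries"]))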
def __init__(self, **kwargs):
    super(JDPriceMiss, self).__init__(**kwargs)
    self.ua = UserAgent()
    with op.DBManger() as m:
        table = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^summary_201905_20\d\d\d\d$"}})
        skuid_set = set()
        for item in m.read_from(db_collect=("jingdong", table), out_field=("skuid",)):
            skuid_set.add(item[0])
    strr = ""  # guard against an empty skuid set
    for i, seed in enumerate(skuid_set):
        current = seed.strip()
        if i % 60 == 0:
            if i != 0:
                self.seeds_queue.put(Seed(strr, kwargs["retries"]))
            strr = current
        else:
            strr = strr + '%2CJ_' + current
    if strr:
        self.seeds_queue.put(Seed(strr, kwargs["retries"]))
    self.block_pattern = re.compile(r'{.*?}')
    self.innerid_pattern = re.compile(r'\d+')
    self.innerprice_pattern = re.compile(r'"\d+.\d+"')
    self.op_pattern = re.compile(r'"op":"(\d+.\d+)"')
    self.p_pattern = re.compile(r'"(\d+.\d+)"')
    self.p2_pattern = re.compile(r'"(-\d+.\d+)"')
    self.p1 = re.compile(r'"id":.*?"}')
    self.id_pattern = re.compile(r'id:"J_(\d+)"')
    self.first_pattern = re.compile(r'([a-zA-Z]*)":')
    self.rid = random.randint(100000000, 999999999)
    self.usrid = str(self.rid)
    self.up_pattern = re.compile('"up":"tpp"')
    self.price_pattern = re.compile(r'^\d+\.\d\d$')
def init_start_urls(self):
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    buffer = []
    buffer_size = 1024
    for i, seed in enumerate(open("shoujiguishudi/resource/buyer_phone.3")):
        seed = Seed(value=seed.strip(), type=0)
        buffer.append(str(seed))
        if len(buffer) % buffer_size == 0:
            self.redis.sadd(self.start_urls_redis_key, *buffer)
            buffer = []
    if buffer:
        self.redis.sadd(self.start_urls_redis_key, *buffer)
def init_start_urls(self):
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    with op.DBManger() as m:
        pipeline = [
            {"$match": {"skuid": {"$ne": None}}},
        ]
        skuid_set = set()
        for item in m.read_from(db_collect=("jingdong", "jdprice_miss_seed"),
                                out_field=("skuid",),
                                pipeline=pipeline):
            skuid_set.add(int(item[0]))
    self.logger.info("total new skuid of comment larger than 0 is: {}".format(len(skuid_set)))
    buffer = []
    strr = ""  # guard against an empty skuid set
    for i, seed in enumerate(skuid_set):
        seed = str(seed)
        current = seed.strip()
        if i % 60 == 0:
            if i != 0:
                seed = Seed(value=strr, type=0)
                buffer.append(str(seed))
            strr = current
        else:
            strr = strr + '%2CJ_' + current
    if strr:
        seed = Seed(value=strr, type=0)
        buffer.append(str(seed))
    if buffer:
        buffer1 = []
        buffer_size = 10000
        for i, seed in enumerate(buffer):
            buffer1.append(str(seed))
            if len(buffer1) % buffer_size == 0:
                random.shuffle(buffer1)
                self.redis.sadd(self.start_urls_redis_key, *buffer1)
                buffer1 = []
        if buffer1:
            random.shuffle(buffer1)
            self.redis.sadd(self.start_urls_redis_key, *buffer1)
def make_request_from_data(self, data):
    str_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(str_seed)
    print(seed)
    if seed.type == 0:
        skuid = seed.value
        url = "https://club.jd.com/comment/getSkuPageFoldComments.action?callback=jQuery2675603&productId={0}&score=0&sortType=6&page=0&pageSize=10".format(skuid)
        return Request(url=url,
                       meta={
                           "_seed": str_seed,
                           "current_page": 0,
                           "headers": {
                               "Connection": "close",
                               "Referer": "https://item.m.jd.com/{0}.html".format(skuid)
                           }
                       },
                       priority=0,
                       callback=self.parse)
    elif seed.type == 3:
        str_seed = seed.value
        request = Request.deserialize(str_seed, self)
        return request
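# make_request_from_data relies on a round-trip contract: str(seed) pushed to
# redis must be recoverable via Seed.parse_seed, and type distinguishes fresh
# seeds (0) from serialized retry requests (3). The real Seed class is not
# shown in this section; this is a minimal sketch of that contract assuming a
# flat JSON encoding (the encoding itself is an assumption):
import json

class SeedSketch:
    def __init__(self, value, type=0):
        self.value, self.type = value, type

    def __str__(self):
        # serialize to a single-line JSON string suitable for a redis set
        return json.dumps({"value": self.value, "type": self.type})

    @staticmethod
    def parse_seed(s):
        d = json.loads(s)
        return SeedSketch(d["value"], d["type"])

# Round trip: SeedSketch.parse_seed(str(SeedSketch(12345, type=0))).value == 12345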
def parse3(self, response):
    seed = Seed.parse_seed(response.meta["_seed"])
    cate_id, brand_id, page, s = seed.value
    for item in json.loads(self.json_pettern.findall(response.text)[0]):
        if item:
            print(item)
            yield {
                "skuid": item.get("pid"),
                "cate_id": cate_id,
                "brand_id": brand_id,
                "shopid": item.get("shopId"),
                "venderid": item.get("venderId", None),
                "shop_name": item.get("seller"),
                "ziying": 1 if item.get("seller") and item.get("seller").find("自营") != -1 else 0
            }
        else:
            print(item)
def make_request_from_data(self, data):
    str_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(str_seed)
    if seed.type == 0:
        cate_id, brand_id, name = seed.value
        if brand_id:
            cid1, cid2, cid3 = re.split(',', cate_id)
            # if cid1 == "1713":
            #     en_cate_id, en_brand_id = urllib.parse.urlencode({"cat": cate_id}), urllib.parse.urlencode(
            #         {"ev": "expublishers_" + brand_id})
            # else:
            #     en_cate_id, en_brand_id = urllib.parse.urlencode({"cat": cate_id}), urllib.parse.urlencode({"ev": "exbrand_" + name})
            url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&click=1'.format(
                urllib.parse.urlencode({"cat": cate_id}),
                urllib.parse.urlencode({"ev": "exbrand_" + name}))
        else:
            url = 'https://list.jd.com/list.html?{0}&psort=4&click=1'.format(
                urllib.parse.urlencode({"cat": cate_id}))
        return Request(url=url,
                       meta={
                           "_seed": str_seed,
                           "headers": {"Referer": "https://www.jd.com/"}
                       },
                       priority=0,
                       callback=self.parse)
    elif seed.type == 3:
        str_seed = seed.value
        request = Request.deserialize(str_seed, self)
        return request
def init_start_urls(self):
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    buffer_size = 1024
    with op.DBManger() as m:
        m.create_db_collection(db_collection=("jingdong", "jdchaoshi{0}_sep".format(current_date)))
    buffer = []
    import requests
    from ast import literal_eval
    request = {
        "url": "https://chaoshi.jd.com/",
        "headers": {
            'Connection': 'close',
            "Referer": "https://www.jd.com",
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
        }
    }
    nav_pattern = re.compile(r'navThird[1-9]: (\[.*\])')
    src = requests.get(**request).text
    for i in nav_pattern.findall(src):
        for j in literal_eval(i):
            for k in j["children"]:
                seed = Seed(value=k["URL"].replace("\\", ""), type=0)
                buffer.append(str(seed))
                if len(buffer) % buffer_size == 0:
                    self.redis.sadd(self.start_urls_redis_key, *buffer)
                    buffer = []
    if buffer:
        self.redis.sadd(self.start_urls_redis_key, *buffer)
def init_start_urls(self):
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    buffer = []
    buffer_size = 1024
    with op.DBManger() as m:
        pipeline = [
            {"$match": {"_status": 3}},
        ]
        data_set = collections.DataSet(
            m.read_from(db_collect=("jingdong", self.last_retry_collect),
                        out_field=("_seed", "_status"),
                        pipeline=pipeline))
        should_exit = True
        for i, (seed, status) in enumerate(data_set.distinct()):
            should_exit = False
            seed = Seed(value=seed, type=3)
            buffer.append(str(seed))
            if len(buffer) % buffer_size == 0:
                self.redis.sadd(self.start_urls_redis_key, *buffer)
                buffer = []
        if buffer:
            self.redis.sadd(self.start_urls_redis_key, *buffer)
        if should_exit:
            import sys
            sys.exit(0)
def __init__(self, **kwargs):
    super(JDPrice, self).__init__(**kwargs)
    self.ua = UserAgent()
    with op.DBManger() as m:
        # create a temporary collection marking the boundary of this month's job
        m.create_db_collection(db_collection=("jingdong", "jdcommentdetail{0}_sep".format(current_date)))
        skuid_set = {}
        top1000w = TopK(1)
        # skuids in last result
        last_result = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^summary_201905_20\d\d\d\d$"}})
        pipeline = [
            {
                "$project": {
                    "skuid": "$skuid",
                    "comment_{}".format(last_result[-6:]): "$comment_{}".format(last_result[-6:])
                }
            },
            {"$limit": 100}
        ]
        for item, comments in m.read_from(db_collect=("jingdong", last_result),
                                          out_field=("skuid", "comment_{}".format(last_result[-6:])),
                                          pipeline=pipeline):
            if int(item) not in skuid_set:
                top1000w.push(int(comments))
                skuid_set[int(item)] = int(comments)
    top1000w = set(top1000w.get_topk())
    for i, seed in enumerate(skuid_set):
        if skuid_set[seed] in top1000w:
            seed = Seed(value=seed, type=0)
            self.seeds_queue.put(seed)
def init_start_urls(self):
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    buffer = []
    buffer_size = 1024
    with op.DBManger() as m:
        m.create_db_collection(db_collection=("jingdong", "jdskuid{0}_sep".format(current_date)))
        last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdbrand20\d\d\d\d\d\d_sep$"}})
        seed_set = set()
        for table in m.list_tables("jingdong", filter={"name": {"$regex": r"^jdbrand20\d\d\d\d\d\dretry\d*$"}}):
            if not last_sep or table > last_sep:
                self.logger.info("valid table : {}".format(table))
                pipeline = [
                    {"$match": {"$and": [{"_status": 0}, {"$or": [{"status": 0}, {"status": -1}]}]}}
                ]
                for seed in m.read_from(db_collect=("jingdong", table),
                                        out_field=("cate_id", "brand_id"),
                                        pipeline=pipeline):
                    seed_set.add(seed)
    for i, seed in enumerate(seed_set):
        seed = Seed(value=seed, type=0)
        buffer.append(str(seed))
        if len(buffer) % buffer_size == 0:
            self.redis.sadd(self.start_urls_redis_key, *buffer)
            buffer = []
    if buffer:
        self.redis.sadd(self.start_urls_redis_key, *buffer)
def __init__(self, current_date, **kwargs):
    super(SecooWeekJob, self).__init__(**kwargs)
    self.proxies = list(map(lambda x: ("http://u{}:[email protected]:3128".format(x)), range(28)))
    self.ua = UserAgent()
    self.current_date = current_date
    space = np.linspace(0, 5800000, kwargs["spider_num"] + 1)
    ranges = [(int(space[i]), int(space[i + 1])) for i in range(len(space) - 1)]
    totalpages_pattern = re.compile(r'<strong>共<i>(\d+)</i>页,到第 <b>')
    self.block_pattern = re.compile(r'dlProId=[\s\W\w]*?</dl>')
    self.pid_pattern = re.compile(r'ProId="\d+"')
    self.name_pattern = re.compile(r'title=".*?"')
    self.lo_pattern = re.compile(r'"s1"[\s\W\w]*?</span>')
    self.price_pattern = re.compile(r'secoo_price.*?</span>')
    self.br_pattern = re.compile(r'</i>.*?</span')
    for r in ranges:
        request = {
            "url": "http://list.secoo.com/all/0-0-0-0-0-7-0-0-0-10-{0}_{1}-0-100-0.shtml".format(r[0], r[1]),
            "proxies": {"http": random.choice(self.proxies)},
            "headers": {
                "Connection": "close",
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134'
            }
        }
        page = self.get_request(request)
        tmp = totalpages_pattern.findall(page)
        if tmp:
            page_num = int(tmp[0])
            self.log.info((page_num, r[0], r[1]))
            for pageindex in range(1, page_num + 1):
                self.seeds_queue.put(Seed((pageindex, r[0], r[1]), kwargs["retries"]))
        else:
            self.log.info((0, r[0], r[1]))
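# The np.linspace call above splits the Secoo product-id space [0, 5800000]
# into spider_num contiguous, near-equal ranges, one per worker, which are
# then baked into the list URL. A standalone illustration (spider_num = 4 is
# an example value, not taken from a real run):
import numpy as np

space = np.linspace(0, 5800000, 4 + 1)
ranges = [(int(space[i]), int(space[i + 1])) for i in range(len(space) - 1)]
# ranges == [(0, 1450000), (1450000, 2900000), (2900000, 4350000), (4350000, 5800000)]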
def parse1(self, response):
    seed = Seed.parse_seed(response.meta["_seed"])
    cate_id, brand_id, name, page, s = seed.value
    sku2title = {}
    for sku in self.sku_pattern1.findall(response.text):
        sku2title[sku[0]] = re.sub(r"<[\s\S]*?>|\t|\n", "", sku[1])
    r1 = self.first_pettern.findall(response.text)
    if r1:
        r1 = r1[0]
        if r1:
            cate_id, brand_id, page, s, items = cate_id, brand_id, page + 1, s + 30, r1
            if brand_id:
                en_cate_id, ename = urllib.parse.urlencode({"cat": cate_id}), urllib.parse.urlencode({"ev": "exbrand_" + name})
                url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&page={2}&s={3}&scrolling=y&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={4}'.format(
                    en_cate_id, ename, page, s, items)
                request = Request(url=url, callback=self.parse2, priority=2)
                request.headers["Referer"] = "https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1".format(
                    en_cate_id, ename, page - 1, s - 30)
            else:
                en_cate_id = urllib.parse.urlencode({"cat": cate_id})
                url = 'https://list.jd.com/list.html?{0}&psort=4&page={1}&s={2}&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={3}'.format(
                    en_cate_id, page, s, items)
                request = Request(url=url, callback=self.parse2, priority=2)
                request.headers["Referer"] = "https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1".format(
                    en_cate_id, page - 1, s - 30)
            request.meta["_seed"] = str(Seed((cate_id, brand_id, page, s), type=2))
            request.meta["last_page_pids"] = r1
            request.meta["sku2title"] = sku2title
            request.meta["totalpage"] = response.meta["totalpage"]
            request.meta["currentpage"] = page - 1
            request.meta["dydmc_delay"] = 2.5
            yield request
        else:
            raise Exception(response.request.url)
    else:
        raise Exception(response.request.url)
def __init__(self, seeds_file, **kwargs):
    super(GetBrands, self).__init__(**kwargs)
    self.ua = UserAgent()
    with open(seeds_file) as infile:
        data_set = collections.DataSet(infile)
        for i, seed in enumerate(
                data_set.map(lambda line: line.strip('\n').split("\t")[0].replace('-', ',')).shuffle(1024)):
            self.seeds_queue.put(Seed(seed, kwargs["retries"]))
    self.pattern = re.compile(r'<li id="brand-(\d+)[\s\S]*?品牌::([\s\S]*?)\'\)"')
def parse5(self, response):
    seed = Seed.parse_seed(response.meta["_seed"])
    count = self.allcnt_pattern.findall(response.text)
    skuid = response.meta["prices"]["id"]
    result = {}
    result.update(response.meta["prices"])
    result.update({"comment": count[0]})
    result.update(response.meta["info"][int(skuid)])
    result.pop("id")
    yield result
def parse1(self, response):
    seed = Seed.parse_seed(response.meta["_seed"])
    cate_id, brand_id, page, s = seed.value
    sku2title = {}
    for sku in self.sku_pattern1.findall(response.text):
        sku2title[sku[0]] = re.sub(r"<[\s\S]*?>|\t|\n", "", sku[1])
    r1 = self.first_pettern.findall(response.text)
    if r1:
        r1 = r1[0]
        if r1:
            if len(r1.split(",")) == 30:
                cate_id, brand_id, page, s, items = cate_id, brand_id, page + 1, s + 30, r1
                if brand_id:
                    en_cate_id, en_brand_id = urllib.parse.urlencode({"cat": cate_id}), urllib.parse.urlencode({"ev": "exbrand_" + brand_id})
                    url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&page={2}&s={3}&scrolling=y&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={4}'.format(
                        en_cate_id, en_brand_id, page, s, items)
                    request = Request(url=url, callback=self.parse2, priority=2)
                    request.headers["Referer"] = "https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1".format(
                        en_cate_id, en_brand_id, page - 1, s - 30)
                    self.logger.info("jd_skuids 6")
                else:
                    en_cate_id = urllib.parse.urlencode({"cat": cate_id})
                    url = 'https://list.jd.com/list.html?{0}&psort=4&page={1}&s={2}&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={3}'.format(
                        en_cate_id, page, s, items)
                    request = Request(url=url, callback=self.parse2, priority=2)
                    request.headers["Referer"] = "https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1".format(
                        en_cate_id, page - 1, s - 30)
                    self.logger.info("jd_skuids 7")
                request.meta["_seed"] = str(Seed((cate_id, brand_id, page, s), type=2))
                request.meta["last_page_pids"] = r1
                request.meta["sku2title"] = sku2title
                request.meta["totalpage"] = response.meta["totalpage"]
                request.meta["currentpage"] = page - 1
                yield request
            else:
                # fewer than 30 ids means there is no bottom half-page; query the shop info directly, e.g.
                # "https://chat1.jd.com/api/checkChat?pidList=10020242230938,1999899692,72276507174,19999997645,1999899692,100000002015,100000002686,200134637813&callback=jQuery8117083"
                yield Request(url="https://chat1.jd.com/api/checkChat?pidList={0}&callback=jQuery8117083".format(r1),
                              callback=self.parse3,
                              meta=response.meta)
        else:
            raise Exception(response.request.url)
    else:
        raise Exception(response.request.url)
def parse(self, response):
    seed = Seed.parse_seed(response.meta["_seed"])
    page_strs = self.totalpage_perttern.findall(response.text)
    if page_strs:
        page_strs = page_strs[0]
        for i in range(1, int(page_strs) + 1):
            page, s = 2 * i - 1, 60 * (i - 1) + 1
            cate_id, brand_id, name = seed.value
            if brand_id:
                en_cate_id, ename = urllib.parse.urlencode({"cat": cate_id}), urllib.parse.urlencode({"ev": "exbrand_" + name})
                url = 'https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1'.format(
                    en_cate_id, ename, page, s)
                refer = "https://www.jd.com/" if i == 1 else 'https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1'.format(
                    en_cate_id, ename, 2 * (i - 1) - 1, 60 * (i - 2) + 1)
            else:
                en_cate_id = urllib.parse.urlencode({"cat": cate_id})
                url = 'https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1'.format(
                    en_cate_id, page, s)
                refer = "https://www.jd.com/" if i == 1 else 'https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1'.format(
                    en_cate_id, 2 * (i - 1) - 1, 60 * (i - 2) + 1)
            yield Request(url=url,
                          callback=self.parse1,
                          meta={
                              "dydmc_delay": 2.5,
                              "totalpage": int(page_strs),
                              "currentpage": page,
                              "_seed": str(Seed((cate_id, brand_id, name, page, s), type=1)),
                              "headers": {
                                  "Connection": "close",
                                  "Referer": refer
                              }
                          },
                          priority=1)
    else:
        raise Exception(response.request.url)
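# parse() above maps the i-th full 60-item list page onto JD's half-page URL
# parameters: page is the odd-numbered top half (2*i - 1) and s is the 1-based
# offset of that page's first item; parse1 then adds +1 page / +30 items for
# the bottom half. A quick check of the mapping:
for i in range(1, 4):
    page, s = 2 * i - 1, 60 * (i - 1) + 1
    print((i, page, s))  # -> (1, 1, 1), (2, 3, 61), (3, 5, 121)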
def __init__(self, seeds_file, dateindex, **kwargs):
    super(GetComment, self).__init__(**kwargs)
    self.ua = UserAgent()
    with open(seeds_file) as infile:
        data_set = collections.DataSet(infile)
        for i, seed in enumerate(
                data_set.map(lambda line: line.strip('\n').split("\t")[0]).shuffle(2048)):
            self.seeds_queue.put(Seed(seed, kwargs["retries"]))
    self.allcnt_pattern = re.compile(r'"commentCount":(\d+),')
    self.dateindex = dateindex
def parse(self, response):
    seed = Seed.parse_seed(response.meta["_seed"])
    page_strs = self.totalpage_perttern.findall(response.text)
    if page_strs:  # check for a match before indexing into the result
        page_strs = page_strs[0]
        if int(page_strs) < 100:
            # no need to flip pages
            pass
        else:
            # need to flip pages
            pass
def make_request_from_data(self, data):
    str_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(str_seed)
    if seed.type == 0:
        return Request(url=seed.value, meta={"_seed": str_seed}, priority=0, callback=self.parse)
    elif seed.type == 3:
        str_seed = seed.value
        request = Request.deserialize(str_seed, self)
        return request
def __init__(self, seeds_file, **kwargs):
    super(GetBrands1, self).__init__(**kwargs)
    self.proxies = list(map(lambda x: ("http://u{}:[email protected]:3128".format(x)), range(28)))
    self.ua = UserAgent()
    with open(seeds_file) as infile:
        data_set = collections.DataSet(infile)
        for i, seed in enumerate(
                data_set.map(lambda line: line.strip('\n').split("\t")[0].replace('-', ',')).shuffle(1024)):
            self.seeds_queue.put(Seed(seed, kwargs["retries"]))
    self.pattern = re.compile(r'"id":.*?"name":".*?"')
def make_request_from_data(self, data):
    str_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(str_seed)
    if seed.type == 0:
        phonenumber = seed.value.strip()
        url = "http://shouji.xpcha.com/{0}.html".format(phonenumber)
        return Request(url=url, meta={"_seed": str_seed}, priority=0, callback=self.parse)
    elif seed.type == 3:
        str_seed = seed.value
        request = Request.deserialize(str_seed, self)
        return request
def __init__(self, seeds_file, dateindex, **kwargs):
    super(GetComment1, self).__init__(**kwargs)
    self.proxies = list(map(lambda x: ("http://u{}:[email protected]:3128".format(x)), range(28)))
    self.ua = UserAgent()
    with open(seeds_file) as infile:
        data_set = collections.DataSet(infile)
        for i, seed in enumerate(
                data_set.map(lambda line: line.strip('\n').split("\t")[0]).shuffle(2048)):
            self.seeds_queue.put(Seed(seed, kwargs["retries"]))
    self.allcnt_pattern = re.compile(r'"CommentCount": "(\d+)"')
    self.dateindex = dateindex
def parse0(self, response):
    seed = Seed.parse_seed(response.meta["_seed"])
    tuples = self.pattern.findall(response.text)
    for item in tuples:
        cate_id, brand_id, name = seed.value, item[0], item[1]
        if brand_id:
            url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&click=1'.format(
                urllib.parse.urlencode({"cat": cate_id}),
                urllib.parse.urlencode({"ev": "exbrand_" + name}))
        else:
            url = 'https://list.jd.com/list.html?{0}&psort=4&click=1'.format(
                urllib.parse.urlencode({"cat": cate_id}))
        yield Request(url=url,
                      meta={
                          "dydmc_delay": 2.5,
                          "_seed": str(Seed(value=(cate_id, brand_id, name))),
                          "headers": {"Referer": "https://www.jd.com/"}
                      },
                      priority=0,
                      callback=self.parse)
def init_start_urls(self):
    self.redis.delete(self.start_urls_redis_key)
    self.redis.delete(self.items_redis_key)
    with op.DBManger() as m:
        # create a temporary collection marking the boundary of this month's job
        m.create_db_collection(db_collection=("jingdong", "jdcommentdetail{0}_sep".format(current_date)))
        skuid_set = {}
        top1000w = TopK(1000000)
        # skuids in last result
        last_result = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^summary_201905_20\d\d\d\d$"}})
        pipeline = [
            {
                "$project": {
                    "skuid": "$skuid",
                    "comment_{}".format(last_result[-6:]): "$comment_{}".format(last_result[-6:])
                }
            },
            # {"$limit": 1000}
        ]
        for item, comments in m.read_from(db_collect=("jingdong", last_result),
                                          out_field=("skuid", "comment_{}".format(last_result[-6:])),
                                          pipeline=pipeline):
            if int(item) not in skuid_set:
                top1000w.push(int(comments))
                skuid_set[int(item)] = int(comments)
    top1000w = set(top1000w.get_topk())
    buffer = []
    buffer_size = 10000
    for i, seed in enumerate(skuid_set):
        if skuid_set[seed] in top1000w:
            seed = Seed(value=seed, type=0)
            buffer.append(str(seed))
            if len(buffer) % buffer_size == 0:
                random.shuffle(buffer)
                self.redis.sadd(self.start_urls_redis_key, *buffer)
                buffer = []
    if buffer:
        random.shuffle(buffer)
        self.redis.sadd(self.start_urls_redis_key, *buffer)
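# init_start_urls above keeps only skuids whose comment count lands in the
# top 1,000,000 via TopK. The TopK class itself is not shown in this section;
# a minimal heap-based sketch matching the push()/get_topk() interface used
# above (an assumption about the real implementation):
import heapq

class TopKSketch:
    def __init__(self, k):
        self.k, self.heap = k, []  # min-heap holding the k largest values seen

    def push(self, value):
        if len(self.heap) < self.k:
            heapq.heappush(self.heap, value)
        elif value > self.heap[0]:
            heapq.heapreplace(self.heap, value)  # evict the current minimum

    def get_topk(self):
        return list(self.heap)

# Caveat: with duplicate comment counts, `skuid_set[seed] in top1000w` can
# admit more than k skuids, since membership is tested by value, not by skuid.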
def parse3(self, response):
    seed = Seed.parse_seed(response.meta["_seed"])
    cate_id, brand_id, page, s = seed.value
    r = json.loads(self.json_pettern.findall(response.text)[0])
    if r:
        for item in r:
            if item:
                self.logger.info("jd_skuids 1")
                yield {
                    "skuid": item.get("pid"),
                    "cate_id": cate_id,
                    "brand_id": brand_id,
                    "shopid": item.get("shopId"),
                    "venderid": item.get("venderId", None),
                    "shop_name": item.get("seller"),
                    "ziying": 1 if item.get("seller") and item.get("seller").find("自营") != -1 else 0,
                    "title": response.meta["sku2title"][str(item.get("pid"))],
                    "chaoshi": 1 if "京东超市" in response.meta["sku2title"][str(item.get("pid"))] else 0
                }
            else:
                self.logger.info("jd_skuids 2")
    else:
        raise Exception(response.request.url)
def __init__(self, seeds_file, **kwargs):
    super(NewPhone, self).__init__(**kwargs)
    self.ua = UserAgent()
    self.phone_regx = re.compile(r'^\d{11}$')
    self.phone_number_checker = stringUtils.check_legality(pattern=r'^\d{11}$')
    for seed in open(seeds_file):
        seed = seed.strip("\n")
        if self.phone_number_checker(seed):
            self.seeds_queue.put(Seed(seed, kwargs["retries"]))
        else:
            self.log.info("illegal_format: " + seed)
    self.pro_city_pattern = re.compile(r'<dd><span>号码归属地:</span>(.*?) (.*?)</dd>')
    self.telcompany_pattern = re.compile(r'<dd><span>手机卡类型:</span>(.*?)</dd>')
def parse3(self, response):
    seed = Seed.parse_seed(response.meta["_seed"])
    cate_id, brand_id, page, s = seed.value
    r = json.loads(self.json_pettern.findall(response.text)[0])
    if r:
        tmp = {}
        for item in r:
            if item:
                tmp[item.get("pid")] = {
                    "skuid": item.get("pid"),
                    "cate_id": cate_id,
                    "brand_id": brand_id,
                    "shopid": item.get("shopId"),
                    "venderid": item.get("venderId", None),
                    "shop_name": item.get("seller"),
                    "ziying": 1 if item.get("seller") and item.get("seller").find("自营") != -1 else 0,
                    "title": response.meta["sku2title"][str(item.get("pid"))],
                    "chaoshi": 1 if "京东超市" in response.meta["sku2title"][str(item.get("pid"))] else 0
                }
        response.meta["info"] = tmp
        response.meta["dydmc_delay"] = 1
        response.meta["headers"] = {
            "Connection": "close",
            "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36"
        }
        yield Request(url="https://p.3.cn/prices/mgets?&type=1&skuIds=J_" + "%2CJ_".join([sku for sku in response.meta["sku2title"]]) + '&pduid=' + str(random.randint(100000000, 999999999)),
                      callback=self.parse4,
                      meta=response.meta,
                      priority=4)
    else:
        raise Exception(response.request.url)
def make_request_from_data(self, data):
    str_seed = bytes_to_str(data, self.redis_encoding)
    seed = Seed.parse_seed(str_seed)
    if seed.type == 0:
        skuid = seed.value
        #url = "https://wq.jd.com/commodity/comment/getcommentlist?callback=fetchJSON_comment98&pagesize=10&sceneval=2&skucomment=1&score=0&sku={0}&sorttype=6&page=0".format(skuid)
        url = "https://wq.jd.com/commodity/comment/getcommentlist?callback=skuJDEvalB&version=v2&pagesize=10&sceneval=2&skucomment=1&score=0&sku={}&sorttype=6&page=1&t=0.5156075450518778".format(skuid)
        headers = {
            'Connection': 'close',
            'Host': 'wq.jd.com',
            'accept': '*/*',
            'sec-fetch-site': 'same-site',
            'sec-fetch-mode': 'no-cors',
            'sec-fetch-dest': 'script',
            "Referer": "https://item.m.jd.com/ware/view.action?wareId={}&sid=null".format(skuid),
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 10; HRY-AL00a; HMSCore 5.1.1.303) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 HuaweiBrowser/11.0.7.303 Mobile Safari/537.36',
            "cookie": "__jdc=122270672; mba_muid=16087105855231456793479; shshshfpa=b86c237d-b506-9cc9-730d-39db2f5ea48c-1608710586; shshshfpb=aW2xjA0PZevBiTvJrQ6rk4A%3D%3D; retina=1; webp=1; visitkey=31140776387466944; sbx_hot_h=null; deviceVersion=83.0.4103.106; deviceOS=android; deviceOSVersion=10; deviceName=Chrome; rurl=https%3A%2F%2Fwqs.jd.com%2Ffaqs%2Findex.html%3Fsceneval%3D2%26ptag%3D7001.1.124%26productId%3D12991458%26ispg%3D%26_fd%3Djdm%26jxsid%3D16109541564584400343; equipmentId=A75Q6PQS36IHI62HBEUGC44IVLERE7257UWVYTGEXPMR6NOKARSVVF2Q6EBPSVGNR537LK6GQN3ENW47JREOEXNAVI; __jdv=122270672%7Cdirect%7C-%7Cnone%7C-%7C1614224630058; sc_width=360; shshshfp=c6774e911e47825ddd51cefc23f9b157; wxa_level=1; cid=9; jxsid=16145705280303310338; __jda=122270672.16087105855231456793479.1608710585.1614224630.1614570529.10; wq_ug=14; fingerprint=794164a430090764096f40466260c718; mt_xid=V2_52007VwMVU1ReUlsbQB1YBmUDF1ZaXlpYGk8RbFVuBEBVWV9RRkhIGw4ZYlcRWkFQWwlIVR5aAjAAR1BZX1tZHnkaXQZnHxNQQVlSSx9JElgFbAEbYl9oUmoXSB5dDWYKE1BZXlNeF08cVQNvMxJbWV8%3D; wq_logid=1614571192.282863947; wqmnx1=MDEyNjM5M3AuL3d3MiY2NjQ1eGQtTTFBaSBsby8zd3IzZTUyNy00UkghKQ%3D%3D; __jdb=122270672.9.16087105855231456793479|10.1614570529; mba_sid=16145705290954323095988279117.9; __wga=1614571199267.1614570547761.1614225998734.1610954174749.5.6; PPRD_P=UUID.16087105855231456793479-LOGID.1614571199300.300139660; jxsid_s_t=1614571199496; jxsid_s_u=https%3A//item.m.jd.com/ware/view.action; sk_history=70241615154%2C101609%2C615036%2C54761686610%2C1399903%2C10024515889185%2C10381689654%2C12991458%2C100010062010%2C58070892025%2C100007627009%2C; shshshsID=e45b3b58ca53b7ab42489de6ebc02d6b_5_1614571200418"
        }
        return Request(url=url,
                       meta={
                           "_seed": str_seed,
                           "dydmc_delay": 0.15 + random.random() * 0.1,
                           "headers": headers
                       },
                       priority=0,
                       callback=self.parse)
    elif seed.type == 3:
        str_seed = seed.value
        request = Request.deserialize(str_seed, self)
        return request