def parse_item(self, response):
    """Parse a JD category listing page (mall '2').

    Yields a Good item for each of the first 20 product titles when the
    breadcrumb shows two distinct category levels, then schedules further
    SplashRequests for normal links and for normalised category-filter links.
    """
    hxs = Selector(response)
    item_titles = extract(hxs, "//div[@class='gl-i-wrap j-sku-item']//a/em/text()")
    top_id = extract_one(hxs, '//*[@id="J_crumbsBar"]/div/div/div/div[1]/a/text()')
    # Run the breadcrumb-trigger XPath once and take both ends
    # (the original executed the identical query twice).
    trigger_texts = extract(hxs, '//*[@id="J_crumbsBar"]//div[@class="trigger"]/span/text()')
    type_id1 = trigger_texts[0]
    type_id2 = trigger_texts[-1]
    if type_id1 != type_id2:
        # Only the first 20 ranked items are recorded.
        for i, t in enumerate(item_titles[:20]):
            good = {
                'mall': '2',
                'rank': str(i + 1),
                'title': t,
                'price': '0',
                'turnover_index': '0',
                'top_id': top_id,
                'type_id1': type_id1,
                'type_id2': type_id2,
                'url': response.url
            }
            yield Good(good)
    for link in self.normal_url_extractor.extract_links(response):
        yield SplashRequest(link.url, callback=self.parse_url, args={'wait': 0.5, 'html': 1, })
    for link in self.needed_url_extractor.extract_links(response):
        if 'ev' not in link.url:
            # Normalise paging/filter query parameters so each category page
            # is requested in exactly one canonical form.
            url = re.sub(r'page=.*&', 'page=1&', link.url)
            url = re.sub(r'stock=.*&', 'stock=0&', url)
            url = re.sub(r'delivery_daofu=.*&', 'delivery_daofu=0&', url)
            url = re.sub(r'delivery=.*&', 'delivery=0&', url)
            yield SplashRequest(url, callback=self.parse_item, args={'wait': 0.5, 'html': 1, })
def parse_item(self, response):
    """Parse a Taobao top-selling ranking page (mall '0') and yield one Good per row.

    top_id comes from the ``topId`` query parameter of the page URL; rank,
    title, price and turnover index are parallel column extractions zipped
    together row by row.
    """
    hxs = Selector(response)
    # NOTE(review): raises IndexError when the URL lacks a &topId=..&type..
    # segment — confirm all inbound URLs are guaranteed to match.
    top_id = re.findall(r'.*&topId=(\S+_\S+)&type.*', response.url)[0]
    type_id1 = extract_one(
        hxs,
        "//div[@class='block-body ']/div[@class='params-cont']/a[@class='param-item icon-tag param-item-selected']/text()"
    )
    ranks_tuple = extract(
        hxs,
        '//*[@class="rank-num rank-focus"]/text()|//*[@class="rank-num rank-important"]/text()|//*[@class="rank-num rank-"]/text()'
    )
    # Drop whitespace-only rank cells (comprehension instead of a manual append loop).
    ranks = [r for r in ranks_tuple if r.strip() != '']
    titles = extract(hxs, '//*[@class="title"]/a/text()')
    # The first price cell is skipped (header/placeholder row).
    prices = extract(hxs, '//*[@class="col3 col"]/text()')[1:]
    turnover_indexs = extract(hxs, '//*[@class="focus-bar"]/span/text()')
    # zip truncates to the shortest column, so ragged extractions cannot misalign rows.
    for r, t, p, i in zip(ranks, titles, prices, turnover_indexs):
        good = {
            'mall': '0',
            'rank': r.strip(),
            'title': t.strip(),
            'price': p.split('¥')[-1].strip(),
            'turnover_index': i.strip(),
            'top_id': top_id.strip(),
            'type_id1': type_id1.strip(),
            'type_id2': '',
            'url': response.url
        }
        yield Good(good)
def parse_item(self, response):
    """Extract ranked goods from a Taobao top-list page and yield Good items (mall '0')."""
    sel = Selector(response)
    # The topId segment of the URL identifies the top-level category.
    top_id = re.findall(r'.*&topId=(\S+_\S+)&type.*', response.url)[0]
    type_id1 = extract_one(
        sel,
        "//div[@class='block-body ']/div[@class='params-cont']/a[@class='param-item icon-tag param-item-selected']/text()"
    )
    raw_ranks = extract(
        sel,
        '//*[@class="rank-num rank-focus"]/text()|//*[@class="rank-num rank-important"]/text()|//*[@class="rank-num rank-"]/text()'
    )
    # Keep only rank cells that contain non-whitespace text.
    ranks = [value for value in raw_ranks if value.strip() != '']
    titles = extract(sel, '//*[@class="title"]/a/text()')
    prices = extract(sel, '//*[@class="col3 col"]/text()')[1:]
    indexes = extract(sel, '//*[@class="focus-bar"]/span/text()')
    for rank, title, price, idx in zip(ranks, titles, prices, indexes):
        yield Good({
            'mall': '0',
            'rank': rank.strip(),
            'title': title.strip(),
            'price': price.split('¥')[-1].strip(),
            'turnover_index': idx.strip(),
            'top_id': top_id.strip(),
            'type_id1': type_id1.strip(),
            'type_id2': '',
            'url': response.url
        })
def parse_item(self, response):
    """Parse a JD category listing page (mall '2') and yield top-20 Good items.

    Also schedules SplashRequests for normal links and for category links
    whose filter parameters are normalised to a canonical form.
    """
    hxs = Selector(response)
    item_titles = extract(hxs, "//div[@class='gl-i-wrap j-sku-item']//a/em/text()")
    top_id = extract_one(hxs, '//*[@id="J_crumbsBar"]/div/div/div/div[1]/a/text()')
    # Execute the breadcrumb XPath once instead of twice and take both ends.
    crumb_spans = extract(hxs, '//*[@id="J_crumbsBar"]//div[@class="trigger"]/span/text()')
    type_id1 = crumb_spans[0]
    type_id2 = crumb_spans[-1]
    if type_id1 != type_id2:
        # Only the first 20 ranked items are recorded.
        for i, t in enumerate(item_titles[:20]):
            good = {
                "mall": "2",
                "rank": str(i + 1),
                "title": t,
                "price": "0",
                "turnover_index": "0",
                "top_id": top_id,
                "type_id1": type_id1,
                "type_id2": type_id2,
                "url": response.url,
            }
            yield Good(good)
    for link in self.normal_url_extractor.extract_links(response):
        yield SplashRequest(link.url, callback=self.parse_url, args={"wait": 0.5, "html": 1})
    for link in self.needed_url_extractor.extract_links(response):
        if "ev" not in link.url:
            # Canonicalise paging/filter parameters to avoid duplicate crawls.
            url = re.sub(r"page=.*&", "page=1&", link.url)
            url = re.sub(r"stock=.*&", "stock=0&", url)
            url = re.sub(r"delivery_daofu=.*&", "delivery_daofu=0&", url)
            url = re.sub(r"delivery=.*&", "delivery=0&", url)
            yield SplashRequest(url, callback=self.parse_item, args={"wait": 0.5, "html": 1})
def parse_item(self, response):
    """Parse a Tmall search-result page (mall '1') and yield the first 20 Goods.

    Items are emitted only when the page has a full result list (>19 assembled
    titles) and the search condition differs from the category name; matching
    filtered category links are then re-scheduled through Splash.
    """
    hxs = Selector(response)
    search_condition = extract_one(hxs, '//*[@id="J_CrumbSearchInuput"]/@value')
    item_titles = extract(hxs, "//div[@id='J_ItemList']//p[@class='productTitle']/a/text()")
    top_id = extract_one(hxs, '//*[@id="J_CrumbSlideCon"]/li[2]/a/text()')
    type_id1 = extract_one(hxs, '//*[@id="J_CrumbSlideCon"]//div[@class="crumbDrop j_CrumbDrop"]/a/text()')
    if type_id1 is not None and search_condition is not None:
        # NOTE(review): '/n' looks like a typo for '\n' but is kept unchanged —
        # it appears consistently in the sibling spiders; confirm before fixing.
        type_id1 = type_id1.split('/n')[0]
        # Titles arrive as text fragments; a fragment ending in a newline closes
        # one title once more than 5 characters have accumulated.
        titles = []
        title = ''
        for t in item_titles:
            title += t.strip()
            if t.endswith('\n') and len(title) > 5:
                titles.append(title.strip())
                title = ''
        if len(titles) > 19 and search_condition != type_id1:
            for i, t in enumerate(titles[:20]):
                good = {
                    'mall': '1',
                    'rank': str(i + 1),
                    'title': t.strip(),
                    'price': '0',
                    'turnover_index': '0',
                    'top_id': top_id.strip(),
                    'type_id1': type_id1.strip(),
                    'type_id2': search_condition.strip(),
                    'url': response.url
                }
                yield Good(good)
    for link in self.needed_url_extractor.extract_links(response):
        # BUG FIX: the original condition was
        #     'industryCatId' and 'cat' in link.url and 'post_fee' and 'brand' not in link.url
        # Bare string literals are always truthy, so only the `'cat' in` and
        # `'brand' not in` tests were actually applied. Spell out all four
        # membership tests as clearly intended.
        if ('industryCatId' in link.url and 'cat' in link.url
                and 'post_fee' not in link.url and 'brand' not in link.url):
            url = re.sub(r'sort=.*&', 'sort=d&', link.url)
            # NOTE(review): this replacement lacks a trailing '&', so any
            # parameters consumed by the greedy '.*&' are dropped — confirm intended.
            url = re.sub(r'search_condition=.*&', 'search_condition=7', url)
            url = re.sub(r'miaosha=.*&', 'miaosha=0&', url)
            url = re.sub(r'wwonline=.*&', 'wwonline=0&', url)
            yield SplashRequest(url, callback=self.parse_item, args={'wait': 0.5, 'html': 1, })
def parse_item(self, response):
    """Parse a Tmall category listing page (mall '1') and yield the first 20 Goods.

    Category ids come from the breadcrumb; a full page (>19 assembled titles)
    is required before anything is emitted. Normal links are re-scheduled
    through Splash with ``self.parse`` as the callback.
    """
    hxs = Selector(response)
    item_titles = extract(hxs, "//div[@id='J_ItemList']//p[@class='productTitle']/a/text()")
    top_id = extract_one(hxs, '//*[@id="J_CrumbSlideCon"]/li[2]/a/text()')
    # BUG FIX: was extract(), whose result is indexed like a list elsewhere in
    # this project — the type_id1.split('/n') calls below would then fail on a
    # list. The sibling spider uses extract_one() with this exact XPath, so do
    # the same here. TODO(review): confirm the extract()/extract_one() helper
    # return types against their definitions.
    type_id1 = extract_one(hxs, '//*[@id="J_CrumbSlideCon"]//div[@class="crumbDrop j_CrumbDrop"]/a/text()')
    if type_id1 is not None:
        # '/n' is kept as-is (possibly a typo for '\n'); it is used
        # consistently across the sibling spiders.
        if len(type_id1) > 1:
            type_id2 = type_id1.split('/n')[-1]
        else:
            type_id2 = ''
        type_id1 = type_id1.split('/n')[0]
        # Titles arrive as text fragments; a fragment ending in '\n' closes one
        # title once more than 5 characters have accumulated.
        titles = []
        title = ''
        for t in item_titles:
            title += t.strip()
            if t.endswith('\n') and len(title) > 5:
                titles.append(title.strip())
                title = ''
        if len(titles) > 19:
            for i, t in enumerate(titles[:20]):
                good = {
                    'mall': '1',
                    'rank': str(i + 1),
                    'title': t.strip(),
                    'price': '0',
                    'turnover_index': '0',
                    'top_id': top_id.strip(),
                    'type_id1': type_id1.strip(),
                    'type_id2': type_id2.strip(),
                    'url': response.url
                }
                yield Good(good)
    for link in self.normal_url_extractor.extract_links(response):
        yield SplashRequest(link.url, callback=self.parse, args={'wait': 0.5, 'html': 1, })
def parse_item(self, response):
    """Yield the first 20 Good items (mall '1') from a Tmall category listing,
    then follow normal links via Splash with ``self.parse`` as the callback.
    """
    sel = Selector(response)
    fragments = extract(sel, "//div[@id='J_ItemList']//p[@class='productTitle']/a/text()")
    top_id = extract_one(sel, '//*[@id="J_CrumbSlideCon"]/li[2]/a/text()')
    # NOTE(review): sibling spiders call extract_one() for this XPath; if
    # extract() returns a list, the .split('/n') calls below would raise
    # AttributeError — confirm the helper's return type.
    type_id1 = extract(sel, '//*[@id="J_CrumbSlideCon"]//div[@class="crumbDrop j_CrumbDrop"]/a/text()')
    if type_id1 is not None:
        type_id2 = type_id1.split('/n')[-1] if len(type_id1) > 1 else ''
        type_id1 = type_id1.split('/n')[0]
        assembled = []
        buffer = ''
        for piece in fragments:
            buffer += piece.strip()
            # A fragment ending in a newline terminates one title, provided
            # enough text has accumulated.
            if piece.endswith('\n') and len(buffer) > 5:
                assembled.append(buffer.strip())
                buffer = ''
        if len(assembled) > 19:
            for idx, name in enumerate(assembled[:20]):
                yield Good({
                    'mall': '1',
                    'rank': str(idx + 1),
                    'title': name.strip(),
                    'price': '0',
                    'turnover_index': '0',
                    'top_id': top_id.strip(),
                    'type_id1': type_id1.strip(),
                    'type_id2': type_id2.strip(),
                    'url': response.url
                })
    for link in self.normal_url_extractor.extract_links(response):
        yield SplashRequest(link.url, callback=self.parse, args={'wait': 0.5, 'html': 1, })