def goods(self, response):
    item = response.meta['item']
    sel = scrapy.Selector(response)
    url = response.url
    body = response.body
    ProductID = item['ProductID']
    PreferentialPrice = item['PreferentialPrice']
    price = item['price']

    if "error" in url or "2017?t" in url or "/?" in url:
        # 302-redirected page: rebuild the original product URL and reprocess it
        url = "https://item.jd.com/" + str(ProductID) + ".html"
        item = XiwanjiItem(ProductID=ProductID, PreferentialPrice=PreferentialPrice, price=price)
        yield scrapy.Request(url, callback=self.goods, meta={'item': item})
        return

    # -------------------- JD Worldwide (global) product pages --------------------
    elif "hk" in url:
        print("全球购:", url)
        # JD "product introduction" section
        detail_info = sel.xpath(".//div[@class='p-parameter']")  # contains the product detail content
        detail = detail_info.xpath(".//li/text()").extract()
        if detail[0] == '品牌: ':
            detail_brand = detail_info.xpath(".//li[1]/@title").extract()[0]
            detail[0] = detail[0] + detail_brand
        product_detail = '\"' + ' '.join(detail).replace('\t', '').replace('\n', '').replace(' ', '') + '\"'
        detail_1 = detail_info.extract()

        # Narrow the scope: take the wanted fields from the introduction section
        # Product name
        try:
            p_Name = sel.xpath(".//div[@class='sku-name']/text()").extract()[-1] \
                .strip('\"').strip('\n').strip().replace('\t', '')
            print(p_Name)
        except:
            p_Name = None
        # detail_info = sel.xpath(".//div[@class='p-parameter']/text()").extract()

        # Shop name
        try:
            shop_name = sel.xpath(".//div[@class='shopName']/strong/span/a/text()").extract()[0]
        except:
            try:
                shop = sel.xpath(".//div[@class='p-parameter']/ul[@class='parameter2']/li[3]/@title").extract()[0]
                if '店' in shop:
                    shop_name = shop
                else:
                    shop_name = None
            except:
                shop_name = None

        # JD "specifications and packaging" section (read into dict form; x is the dict)
        try:
            s = BeautifulSoup(body, 'lxml')
            guige = s.find('div', id='specifications')
            x = {}
            guige2 = guige.find_all('td', class_='tdTitle')
            guige3 = guige.find_all('td', class_=None)
            for i in range(len(guige2)):
                dt = re.findall(">(.*?)<", str(guige2[i]))
                dd = re.findall(">(.*?)<", str(guige3[i]))
                x.setdefault(dt[0], dd[0])
        except:
            x = None

        # Brand
        try:
            brand = x['品牌']
        except:
            brand = p_Name.split(" ")[0]
        if brand != p_Name:
            if "（" in brand and "）" in brand:
                dd = re.findall("（.*?）", brand)[0]
                brand = brand.replace(dd, '').replace(' ', '')
            if "(" in brand and ")" in brand:
                dd = re.findall(r"\(.*?\)", brand)[0]
                brand = brand.replace(dd, '').replace(' ', '')
            # Map English brand names to their Chinese equivalents
            brand_aliases = {"Panasonic": "松下", "CHEBLO": "樱花", "MBO": "美博", "YAIR": "扬子",
                             "PHLGCO": "飞歌", "FZM": "方米", "inyan": "迎燕", "JENSANY": "金三洋"}
            brand = brand_aliases.get(brand, brand)

        # Product name (model number)
        try:
            try:
                X_name = re.findall(">货号:(.*?)<", detail_1[0])[0].strip().replace(brand, '')
                if p_Name is None:
                    p_Name = X_name
            except:
                try:
                    X_name = x['型号'].replace(brand, '')
                    if p_Name is None:
                        p_Name = X_name
                except:
                    X_name = re.findall(">商品名称:(.*?)<", detail_1[0])[0].strip().replace('\t', '').replace(brand, '')
                    if len(X_name) == 0:
                        X_name = p_Name
                    if p_Name is None:
                        p_Name = X_name
        except:
            X_name = p_Name
        if X_name == p_Name:
            if brand and brand != p_Name:
                if brand in X_name:
                    X_name = re.sub(brand, '', X_name)
            X_name = re.sub(r'（.*?）', '', X_name)
            X_name = re.sub(r'\(.*?\)', '', X_name)
            X_name = re.sub(r'[\u4e00-\u9fa5]+', '', X_name)
            X_name = X_name.replace('/', '').strip()

        # Individual specification fields: try the introduction text first, then the spec dict (or vice versa)
        try: open_method = re.findall(">开合方式:(.*?)<", detail_1[0])[0].strip()
        except:
            try: open_method = x['开合方式']
            except: open_method = None
        try: laundry = re.findall(">洗碗方式:(.*?)<", detail_1[0])[0].strip()
        except:
            try: laundry = x['洗涤方式']
            except: laundry = None
        try: capacity = re.findall(">总容积:(.*?)<", detail_1[0])[0].strip()
        except:
            try: capacity = x['餐具容量(套)']
            except: capacity = None
        try: control = re.findall(">控制方式:(.*?)<", detail_1[0])[0].strip()
        except:
            try: control = x['控制方式']
            except: control = None
        try: dry_method = x['干燥方式']
        except:
            try: dry_method = re.findall(">干燥方式:(.*?)<", detail_1[0])[0].strip()
            except: dry_method = None
        try: disinfection = x['消毒方式']
        except:
            try: disinfection = re.findall(">消毒方式:(.*?)<", detail_1[0])[0].strip()
            except: disinfection = None
        try: consump = x['耗水量(L)']
        except:
            try: consump = re.findall(">耗水量:(.*?)<", detail_1[0])[0].strip()
            except: consump = None
        try: color = x['颜色']
        except:
            try: color = re.findall(">颜色:(.*?)<", detail_1[0])[0].strip()
            except: color = None

        # price_web = "https://p.3.cn/prices/mgets?pduid=15107253217849152442&skuIds=J_" + str(ProductID)
        comment_web = ("https://sclub.jd.com/comment/productPageComments.action?productId="
                       + str(ProductID) + "&score=0&sortType=5&page=0&pageSize=10")

    # -------------------- Regular product pages --------------------
    else:
        # Product name (1. from the sku-name block; 2. from the header title)
        try:
            p_Name = sel.xpath(".//div[@class='sku-name']/text()").extract()[0] \
                .strip('\"').strip('\n').strip().replace('\t', '')
            if len(p_Name) == 0:  # in case the product name is read back empty
                p_Name = sel.xpath(".//div[@class='item ellipsis']/@title").extract()[0].replace('\t', '')
        except:
            try:
                p_Name = sel.xpath(".//div[@class='item ellipsis']/@title").extract()[0].replace('\t', '')
            except:
                p_Name = None

        # JD "product introduction" section
        detail_info = sel.xpath(".//div[@class='p-parameter']")  # contains the product detail content
        detail = detail_info.xpath(".//li/text()").extract()
        if detail[0] == '品牌: ':
            detail_brand = detail_info.xpath(".//li[1]/@title").extract()[0]
            detail[0] = detail[0] + detail_brand
        product_detail = '\"' + ' '.join(detail).replace('\t', '').replace('\n', '').replace(' ', '') + '\"'
        detail_1 = detail_info.extract()

        # JD "specifications and packaging" section (read into dict form; x is the dict)
        try:
            s = BeautifulSoup(body, 'lxml')
            # print(s)
            guige = s.find('div', class_='Ptable')
            # print(guige)
            guige1 = guige.find_all('div', class_='Ptable-item')
            # print(guige1)
            x = {}
            for gg in guige1:
                guige2 = gg.find_all('dt', class_=None)
                guige3 = gg.find_all('dd', class_=None)
                for i in range(len(guige2)):
                    dt = re.findall(">(.*?)<", str(guige2[i]))
                    dd = re.findall(">(.*?)<", str(guige3[i]))
                    x.setdefault(dt[0], dd[0])
        except:
            x = None

        # Shop name
        try:
            try:
                shop_name = sel.xpath(".//div[@class='name']/a/text()").extract()[0]
            except:
                shop_name = re.findall(">店铺:(.*?)<", detail_1[0])[0].strip()
        except:
            shop_name = "京东自营"

        # Brand: not in the "品牌:**" form here, so no find() is needed
        try:
            brand = detail_info.xpath(".//ul[@id='parameter-brand']/li/a/text()").extract()[0].strip()
        except:
            try:
                brand = x['品牌']
            except:
                brand = None
        if brand:
            if "（" in brand and "）" in brand:
                dd = re.findall("（.*?）", brand)[0]
                brand = brand.replace(dd, '').replace(' ', '')
            if "(" in brand and ")" in brand:
                dd = re.findall(r"\(.*?\)", brand)[0]
                brand = brand.replace(dd, '').replace(' ', '')
            # Map English brand names to their Chinese equivalents
            brand_aliases = {"Panasonic": "松下", "CHEBLO": "樱花", "MBO": "美博", "YAIR": "扬子",
                             "PHLGCO": "飞歌", "FZM": "方米", "inyan": "迎燕", "JENSANY": "金三洋"}
            brand = brand_aliases.get(brand, brand)

        # Product name (model number)
        try:
            try:
                X_name = re.findall(">货号:(.*?)<", detail_1[0])[0].strip().replace(brand, '')
            except:
                try:
                    X_name = x['型号'].replace(brand, '')
                except:
                    X_name = re.findall(">商品名称:(.*?)<", detail_1[0])[0].strip().replace('\t', '').replace(brand, '')
                    if len(X_name) == 0:
                        X_name = p_Name
                    if p_Name is None:
                        p_Name = X_name
        except:
            X_name = p_Name
        if X_name == p_Name:
            if brand and brand != p_Name:
                if brand in X_name:
                    X_name = re.sub(brand, '', X_name)
            X_name = re.sub(r'（.*?）', '', X_name)
            X_name = re.sub(r'\(.*?\)', '', X_name)
            X_name = re.sub(r'[\u4e00-\u9fa5]+', '', X_name)
            X_name = X_name.replace('/', '').strip()

        # Individual specification fields
        try: open_method = re.findall(">开合方式:(.*?)<", detail_1[0])[0].strip()
        except:
            try: open_method = x['开合方式']
            except: open_method = None
        try: laundry = re.findall(">洗碗方式:(.*?)<", detail_1[0])[0].strip()
        except:
            try: laundry = x['洗涤方式']
            except: laundry = None
        try: capacity = re.findall(">总容积:(.*?)<", detail_1[0])[0].strip()
        except:
            try: capacity = x['餐具容量(套)']
            except: capacity = None
        try: control = re.findall(">控制方式:(.*?)<", detail_1[0])[0].strip()
        except:
            try: control = x['控制方式']
            except: control = None
        try: dry_method = x['干燥方式']
        except:
            try: dry_method = re.findall(">干燥方式:(.*?)<", detail_1[0])[0].strip()
            except: dry_method = None
        try: disinfection = x['消毒方式']
        except:
            try: disinfection = re.findall(">消毒方式:(.*?)<", detail_1[0])[0].strip()
            except: disinfection = None
        try: consump = x['耗水量(L)']
        except:
            try: consump = re.findall(">耗水量:(.*?)<", detail_1[0])[0].strip()
            except: consump = None
        try: color = x['颜色']
        except:
            try: color = re.findall(">颜色:(.*?)<", detail_1[0])[0].strip()
            except: color = None

        # price_web = "https://p.3.cn/prices/mgets?pduid=1508741337887922929012&skuIds=J_" + str(ProductID)
        comment_web = ("https://sclub.jd.com/comment/productPageComments.action?productId="
                       + str(ProductID) + "&score=0&sortType=5&page=0&pageSize=10")

    # price_web = "https://p.3.cn/prices/mgets?pduid=1508741337887922929012&skuIds=J_" + str(ProductID)
    # price_web = "https://p.3.cn/prices/mgets?ext=11000000&pin=&type=1&area=1_72_4137_0&skuIds=J_" + str(ProductID) + "&pdbp=0&pdtk=vJSo%2BcN%2B1Ot1ULpZg6kb4jfma6jcULJ1G2ulutvvlxgL3fj5JLFWweQbLYhUVX2E&pdpin=&pduid=1508741337887922929012&source=list_pc_front&_=1510210566056"
    # Product reviews, JSON format
    # comment_web = "https://sclub.jd.com/comment/productPageComments.action?productId=" + str(ProductID) + "&score=0&sortType=5&page=0&pageSize=10"
    # comment_web = "https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds=" + str(ProductID)
    # comment_webs = requests.get(comment_web, timeout=1000).text
    # urls = json.loads(comment_webs)
    urls = requests.get(comment_web, timeout=1000).json()
    try:
        comment = urls['hotCommentTagStatistics']
        keyword_list = []
        for i in range(len(comment)):
            keyword_list.append(comment[i]['name'])
        if len(keyword_list) == 0:
            keyword = None
        else:
            keyword = ' '.join(keyword_list)  # review tag keywords
    except:
        keyword = None
    rate = urls['productCommentSummary']
    try: CommentCount = rate['commentCount']    # total review count
    except: CommentCount = None
    print("评价总数", CommentCount)
    try: GoodRateShow = rate['goodRateShow']    # positive-review rate
    except: GoodRateShow = None
    try: GoodCount = rate['goodCount']          # number of positive reviews
    except: GoodCount = None
    try: GeneralCount = rate['generalCount']    # number of neutral reviews
    except: GeneralCount = None
    try: PoorCount = rate['poorCount']          # number of negative reviews
    except: PoorCount = None

    # -------------------- Method 1 (kept for reference) --------------------
    # search_web = "https://search.jd.com/Search?keyword=" + str(p_Name) + "&enc=utf-8&wq=" + str(p_Name)
    # # print("search页面:", search_web)
    # search_webs = requests.get(search_web, timeout=1000).text
    # soup = BeautifulSoup(search_webs, 'lxml')
    # skuid = "J_" + str(ProductID)
    # try:
    #     price_info = soup('strong', class_=skuid)
    #     PreferentialPrice = re.findall("<em>￥</em><i>(.*?)</i>", str(price_info[0]))[0]
    #     # A tag like <strong class="J_10108922808" data-done="1" data-price="639.00"><em>￥</em><i></i></strong> may appear,
    #     # e.g. for id=10108922808 p_Name=柏翠(petrus) 38L电烤箱家用多功能 精准控温 PE7338 升级版
    #     if len(PreferentialPrice) == 0:
    #         PreferentialPrice = re.findall('data-price=\"(.*?)\"', str(price_info[0]))[0]
    #     price = PreferentialPrice
    # except:
    #     try:
    #         print("价格:", price_web)
    #         price_webs = requests.get(price_web, timeout=1000).text
    #         price_json = json.loads(price_webs)[0]
    #         PreferentialPrice = price_json['p']
    #         price = price_json['m']
    #     except:
    #         price = None
    #         PreferentialPrice = None
    # print(price, PreferentialPrice)

    if float(PreferentialPrice) > 0.00:
        item = XiwanjiItem()
        item['ProductID'] = ProductID
        item['p_Name'] = p_Name
        item['shop_name'] = shop_name
        item['price'] = price
        item['PreferentialPrice'] = PreferentialPrice
        item['CommentCount'] = CommentCount
        item['GoodRateShow'] = GoodRateShow
        item['GoodCount'] = GoodCount
        item['GeneralCount'] = GeneralCount
        item['PoorCount'] = PoorCount
        item['keyword'] = keyword
        item['type'] = product_detail
        item['brand'] = brand
        item['X_name'] = X_name
        item['open_method'] = open_method
        item['laundry'] = laundry
        item['capacity'] = capacity
        item['control'] = control
        item['dry_method'] = dry_method
        item['disinfection'] = disinfection
        item['consump'] = consump
        item['color'] = color
        item['product_url'] = url
        item['source'] = "京东"
        item['ProgramStarttime'] = self.ProgramStarttime
        yield item
    else:
        print('广告及无效页面:', url)
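# NOTE: a minimal standalone sketch (not part of the original spider) of the price fallback
# that the commented-out "Method 1" block above describes: query the p.3.cn "mgets" endpoint
# for one SKU and read the 'p' (current price) and 'm' (list price) fields. The endpoint URL
# and field names are taken from the code in this file; whether the endpoint still responds
# this way is an assumption.
def fetch_price(product_id):
    price_web = ("https://p.3.cn/prices/mgets?ext=11000000&pin=&type=1&area=1_72_4137_0"
                 "&skuIds=J_" + str(product_id))
    try:
        price_json = requests.get(price_web, timeout=1000).json()[0]  # list with one entry per SKU
        return price_json.get('p'), price_json.get('m')  # (PreferentialPrice, price)
    except Exception:
        return None, None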
def start_requests(self):
    yield scrapy.Request(self.url)
def start_requests(self):
    start_urls = ['https://www.radiotavisupleba.ge/a/31333277.html']
    for url in start_urls:
        yield scrapy.Request(url=url, callback=self.parse)
def parseCategory(self, response):
    for pageUrl in response.css('li.parent-cate a::attr(href)').extract():
        yield scrapy.Request(url=pageUrl, callback=self.parse)
def start_requests(self):
    url_list = ['https://www.matrimonio.com/forum/prova-trucco--t750941']
    for url in url_list:
        yield scrapy.Request(url=url, callback=self.parse)
def start_requests(self):
    urls = ['http://www.bjda.gov.cn/eportal/ui?pageId=331184']
    yield scrapy.Request(urls[0], dont_filter=True)
def start_requests(self):
    yield scrapy.Request(url='https://www.dhanak.com.pk/', callback=self.link)
def start_requests(self):
    urls = ['http://www.codeforces.com/contests']
    for url in urls:
        yield scrapy.Request(url, self.parse)
def get_media_requests(self, item, info):
    image_link = item['imagelink']
    yield scrapy.Request(image_link)
def parse(self, response):
    info = response.body.decode('utf-8')
    info = json.loads(info)
    if 'items' not in info.keys():
        self.err_after(response.meta)
        return None
    item_info = info['items']
    flip = info['flip']
    keyword = response.meta['keyword']
    sort = response.meta['sort']  # rank of the last product on the previous page
    p_time = response.meta['p_time']
    item_list = []
    page = response.meta['page']
    proxy = response.meta['proxy']
    # print('parse_before', sort, len(item_info), keyword)
    # The response contains data: process it
    if len(item_info) > 0:
        for value in item_info:
            sort = sort + 1
            # Check whether this entry is a promoted (ad) listing
            if 'ad' in value.keys():
                mall_id = value['ad']['mall_id']
                is_ad = 1
                suggest_keyword = ''
            else:
                mall_id = 0
                is_ad = 0
                suggest_keyword = ''
            goods_info = value
            goods_info['keyword'] = keyword
            goods_info['sort'] = sort
            goods_info['p_time'] = p_time
            goods_info['mall_id'] = mall_id
            goods_info['is_ad'] = is_ad
            goods_info['suggest_keyword'] = suggest_keyword
            item_list.append(goods_info)
        # Emit the ranking of every product under this single keyword
        item = KeywordGoodsList()
        item['goods_list'] = item_list
        item['page'] = page
        item['keyword'] = keyword
        # print('parse_middle', sort, len(item_info), keyword)
        yield item
        page += 1  # data came back, so advance to the next page; otherwise re-crawl the same page
    # print('parse_after', sort, len(item_info), keyword)
    if page <= self.max_page:
        url = self.build_search_url(page, self.size, keyword, flip)
        headers = self.make_headers(keyword)
        meta = {
            'flip': flip,
            'proxy': proxy,
            'page': page,
            'keyword': keyword,
            'sort': sort,
            'p_time': p_time
        }
        yield scrapy.Request(url, meta=meta, callback=self.parse, headers=headers,
                             dont_filter=True, errback=self.errback_httpbin)
def start_requests(self):
    urls = ['http://sz.to8to.com/zwj/']
    return [scrapy.Request(url=url, callback=self.parse) for url in urls]
def parse_list(self, response):
    # The response is JSONP ("_jqjsp(...)"); strip the callback wrapper before parsing
    json_text = json.loads(response.text[7:-1])
    print(json_text)
    for data in json_text['data']:
        item = Zqz510Item()
        # Copy the expected fields, falling back to empty_word when a key is missing
        for key in ('agS', 'agidS', 'an', 'anDest', 'apS', 'apidS', 'cid', 'docid',
                    'law', 'link', 'litem', 'ltid', 'pd', 'psty', 'rid', 'ti', 'ty'):
            item[key] = data[key] if key in data else empty_word
        item['anList'] = str(data['anList']) if 'anList' in data else empty_word
        detail_url = 'http://api.zqz510.com/tmof/detail?docid={}&callback=_jqjsp&_{}='.format(
            item['docid'], str(int(time.time() * 1000)))
        yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                             meta={'item': item}, cookies=self.cookie)
def parse_detail_info(self, response):
    detail_info = response.text
    cust = response.meta['cust']
    # The API returns JSON; null/false/true are mapped to None so the payload can be eval()'d
    data = eval(detail_info.replace('null', 'None').replace('false', 'None').replace('true', 'None'))
    if data.get('msg'):
        self.logger.warning('%s', cust + ' occurred err!')
        self.logger.warning('%s', data)
    results = data['data']['results']
    credit_info_detail_url = "http://www.creditchina.gov.cn/api/credit_info_detail?"
    pub_permissions_name_url = 'http://www.creditchina.gov.cn/api/pub_permissions_name?'
    pub_penalty_name_url = 'http://www.creditchina.gov.cn/api/pub_penalty_name?'
    record_param_url = 'http://www.creditchina.gov.cn/api/record_param?'
    credit_info_detail_list = []
    pub_permissions_name_list = []
    pub_penalty_name_list = []
    record_param_list_2 = []
    record_param_list_4 = []
    record_param_list_8 = []
    for result in results:
        self.logger.info('Searching name is %s, result name is %s' % (cust, result['name']))
        if result['name'] == cust:
            self.logger.warning('%s', cust + ' has results.')
            # summary
            credit_info_detail_url_append = {'encryStr': result['encryStr'].replace('\n', '')}
            credit_info_detail_url_append = urllib.urlencode(credit_info_detail_url_append)
            credit_info_detail_list.append(credit_info_detail_url + credit_info_detail_url_append)
            credit_info_detail_list = list(set(credit_info_detail_list))
            # pub_permissions
            pub_permissions_name_url_append = {'name': cust, 'page': 1, 'pageSize': 50}
            pub_permissions_name_url_append = urllib.urlencode(pub_permissions_name_url_append)
            pub_permissions_name_list.append(pub_permissions_name_url + pub_permissions_name_url_append)
            pub_permissions_name_list = list(set(pub_permissions_name_list))
            # pub_penalty
            pub_penalty_name_url_append = {'name': cust, 'page': 1, 'pageSize': 50}
            pub_penalty_name_url_append = urllib.urlencode(pub_penalty_name_url_append)
            pub_penalty_name_list.append(pub_penalty_name_url + pub_penalty_name_url_append)
            pub_penalty_name_list = list(set(pub_penalty_name_list))
            # creditType=2 red list, creditType=4 attention list, creditType=8 black list
            record_param_url_append_2 = {'encryStr': result['encryStr'].replace('\n', ''), 'creditType': 2,
                                         'dataSource': 0, 'pageNum': 1, 'pageSize': 50}
            record_param_url_append_2 = urllib.urlencode(record_param_url_append_2)
            record_param_list_2.append(record_param_url + record_param_url_append_2)
            record_param_list_2 = list(set(record_param_list_2))
            record_param_url_append_4 = {'encryStr': result['encryStr'].replace('\n', ''), 'creditType': 4,
                                         'dataSource': 0, 'pageNum': 1, 'pageSize': 50}
            record_param_url_append_4 = urllib.urlencode(record_param_url_append_4)
            record_param_list_4.append(record_param_url + record_param_url_append_4)
            record_param_list_4 = list(set(record_param_list_4))
            record_param_url_append_8 = {'encryStr': result['encryStr'].replace('\n', ''), 'creditType': 8,
                                         'dataSource': 0, 'pageNum': 1, 'pageSize': 50}
            record_param_url_append_8 = urllib.urlencode(record_param_url_append_8)
            record_param_list_8.append(record_param_url + record_param_url_append_8)
            record_param_list_8 = list(set(record_param_list_8))
    if pub_permissions_name_list != []:
        for url in pub_permissions_name_list:
            time.sleep(random.uniform(1, 3))
            yield scrapy.Request(url=url, callback=self.parse_pub_permissions_name,
                                 meta={'cust': cust}, dont_filter=True)
    if record_param_list_2 != []:
        for url in record_param_list_2:
            time.sleep(random.uniform(1, 3))
            yield scrapy.Request(url=url, callback=self.parse_record_param_url_append_2,
                                 meta={'cust': cust}, dont_filter=True)
    if record_param_list_4 != []:
        for url in record_param_list_4:
            time.sleep(random.uniform(1, 3))
            yield scrapy.Request(url=url, callback=self.parse_record_param_url_append_4,
                                 meta={'cust': cust}, dont_filter=True)
    if record_param_list_8 != []:
        for url in record_param_list_8:
            time.sleep(random.uniform(1, 3))
            yield scrapy.Request(url=url, callback=self.parse_record_param_url_append_8,
                                 meta={'cust': cust}, dont_filter=True)
def parse(self, response):
    sel = scrapy.Selector(response)

    # -------------------- Method 2 --------------------
    productid_list1 = sel.xpath(
        ".//div[@id='plist']/ul/li/div[contains(@class,'gl-i-wrap')]/@data-sku").extract()
    # Single items, bundles, ...
    productid_list2 = sel.xpath(
        ".//div[@class='gl-i-tab-content']/div[@class='tab-content-item tab-cnt-i-selected j-sku-item']/@data-sku").extract()
    productid_list = productid_list1 + productid_list2
    print(productid_list)
    print(len(productid_list))
    productid_str = '%2CJ_'.join(productid_list)
    # time.sleep(random.randint(60, 120))
    price_web = ("https://p.3.cn/prices/mgets?ext=11000000&pin=&type=1&area=1_72_4137_0&skuIds=J_"
                 + str(productid_str))
    price_webs = requests.get(price_web, timeout=1000).text
    price_jsons = json.loads(price_webs)
    if len(price_jsons) > 50:
        self.pagenum = self.pagenum + 1
        print("第" + str(self.pagenum) + "页")
    for price_json in price_jsons:
        try:
            id = price_json['id']
            ProductID = id[2:]
            PreferentialPrice = price_json['p']
            price = price_json['m']
        except:
            ProductID = None
            PreferentialPrice = None
            price = None
        # Product price
        if ProductID:
            item = XiwanjiItem()
            with open("price.csv", "a") as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow([ProductID, PreferentialPrice, price])
            item['ProductID'] = ProductID
            item['PreferentialPrice'] = PreferentialPrice
            item['price'] = price
            goods_web = "https://item.jd.com/" + str(ProductID) + ".html"
            request = scrapy.Request(url=goods_web, callback=self.goods,
                                     meta={'item': item}, dont_filter=True)
            yield request
        else:
            print("ProductID未获取到")
            self.num = self.num + 1
            if self.num > 60:
                print("ProductID多次未获取到")
                exit()

    # -------------------- Method 1 (kept for reference) --------------------
    # # url = "https://item.jd.hk/18739277759.html"  # JD Worldwide URLs (item.jd.hk) differ from regular ones such as "https://item.jd.com/4251335.html"
    # goods_info = sel.xpath(".//div[@id='plist']/ul/li")
    # for goods in goods_info:
    #     ProductID_info = goods.xpath(".//div[@class='gl-i-wrap j-sku-item']/@data-sku").extract()  # product SKU id
    #     if len(ProductID_info) == 0:
    #         ProductID_info = goods.xpath(".//div[@class='gl-i-tab-content']/div[@class='tab-content-item tab-cnt-i-selected j-sku-item']/@data-sku").extract()
    #         ProductID = ProductID_info[0]
    #     else:
    #         ProductID = ProductID_info[0]
    #     # print(ProductID)
    #     if len(ProductID) != 0:
    #         goods_web = "https://item.jd.com/" + str(ProductID) + ".html"  # product page with model, shop name, category, brand, etc.
    #         item = JdItem(ProductID=ProductID)
    #         request = scrapy.Request(url=goods_web, callback=self.goods, meta={'item': item}, dont_filter=True)
    #         yield request
    #     else:
    #         print("parse中ProductID为空 没有读到")
    # # For testing
    # productid_list1 = sel.xpath(".//div[@id='plist']/ul/li/div[contains(@class,'gl-i-wrap')]/@data-sku").extract()
    # # Single items, bundles, ...
    # productid_list2 = sel.xpath(".//div[@class='gl-i-tab-content']/div[@class='tab-content-item tab-cnt-i-selected j-sku-item']/@data-sku").extract()
    # productid_list = productid_list1 + productid_list2
    # print(productid_list)
    # print(len(productid_list))
    # for ProductID in productid_list:
    #     item = JinghuaqiItem(ProductID=ProductID, price=2.00, PreferentialPrice=2.00)
    #     # url = "https://item.jd.hk/1971910764.html"
    #     url = "https://item.jd.com/" + str(ProductID) + ".html"
    #     request = scrapy.Request(url=url, callback=self.goods, meta={'item': item}, dont_filter=True)
    #     yield request

    # Pagination
    time.sleep(random.randint(60, 120))
    next_page = sel.xpath(
        ".//div[@class='p-wrap']/span[@class='p-num']/a[@class='pn-next']/@href").extract()
    if next_page:
        next = "https://list.jd.com/" + next_page[0]
        yield scrapy.Request(next, callback=self.parse)
def start_requests(self):
    urls = ['https://pittsburghpa.gov/mayor/mayor-contact',
            'https://pittsburghpa.gov/controller/controller-contact']
    urls += ["https://pittsburghpa.gov/council/d{}-contacts".format(i + 1) for i in range(9)]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
    # `urls` is assumed to be defined elsewhere (e.g. company links extracted from the listing page)
    for url in urls:
        yield scrapy.Request("https://www.indeed.com" + url + "/reviews?fcountry=CN",
                             callback=self.parse_single)
def parse(self, response):
    manager_response = response.css('.jl_intro')
    funds_response = response.css('.jl_office')
    company = response.css('.bs_gl').xpath('./p/label/a[@href]/text()').extract()[-1]
    num = len(manager_response)
    if isinstance(manager_response, SelectorList):
        assert num == len(funds_response)
    else:
        manager_response = [manager_response]
        funds_response = [funds_response]
    for i in range(num):
        manager = Manager()
        intro_list = manager_response[i].xpath('.//text()').extract()
        manager['name'] = intro_list[1]
        manager['appointment_date'] = intro_list[3]
        manager['introduction'] = intro_list[4]
        manager['url'] = 'http:' + manager_response[i].xpath('./a/@href').extract_first()
        manager['image_urls'] = manager_response[i].xpath('./a/img/@src').extract()
        manager['_id'] = manager['url'][-13:-5]
        try:
            funds_table_list = funds_response[i].xpath('.//text()').extract()
            funds_table = numpy.array(funds_table_list[2:]).reshape(-1, 9)
            manager_name = funds_table_list[0]
        except Exception:
            def parse_line(tr):
                return [item.xpath('.//text()').extract_first() for item in tr.xpath('./td')]
            funds_table = numpy.array([parse_line(tr)
                                       for tr in funds_response[i].xpath('./table/tbody/tr')])
            manager_name = funds_response[0].xpath('./div/label/a/text()').extract_first()
        manager['funds'] = funds_table[1:, 0].tolist()
        yield scrapy.Request(manager['url'], callback=self.parse_manager, meta={'manager': manager})
        for fund_list in funds_table[1:, ]:
            yield Fund(_id=manager['_id'] + '#' + fund_list[0],
                       code=fund_list[0],
                       name=fund_list[1],
                       type=fund_list[2],
                       start_date=fund_list[3],
                       end_date=fund_list[4],
                       duty_days=fund_list[5],
                       duty_return=fund_list[6],
                       average=fund_list[7],
                       rank=fund_list[8],
                       manager=manager_name,
                       company=company)
def parse_single(self, response):
    reviews = response.xpath('//div[@class = "cmp-Review"]')
    company = response.xpath('//div[@class = "cmp-CompactHeaderLayout-nameContainer"]//text()').extract_first()
    for review in reviews:
        item = IndeedItem()
        item['company'] = company
        item['rating'] = review.xpath(".//div[@class = 'cmp-ReviewRating-text']/text()").extract_first()
        item['date'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()]").extract_first()
        item['content'] = review.xpath(".//span[@itemprop = 'reviewBody']//span[@class = 'cmp-NewLineToBr-text']/text()").extract()
        item['position'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']//meta[@itemprop='name']/@content").extract_first()
        if len(review.xpath(".//div[@class = 'cmp-Review-title']/text()")):
            item['title'] = review.xpath(".//div[@class = 'cmp-Review-title']/text()").extract_first()
        else:
            item['title'] = review.xpath(".//a[@class = 'cmp-Review-titleLink']/text()").extract_first()
        if len(review.xpath(".//a[@class = 'cmp-ReviewAuthor-link']")) == 2:
            item['location'] = review.xpath(".//a[@class = 'cmp-ReviewAuthor-link'][2]/text()").extract_first()
            item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[2]").extract_first()
        elif len(review.xpath(".//a[@class = 'cmp-ReviewAuthor-link']")) == 1:
            if review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()-2]").extract_first() != ' - ':
                item['location'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()-2]").extract_first()
                item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[last()-4]").extract_first()
            else:
                item['location'] = review.xpath(".//a[@class = 'cmp-ReviewAuthor-link'][1]/text()").extract_first()
                item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[3]").extract_first()
        else:
            item['location'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[5]").extract_first()
            item['status'] = review.xpath(".//span[@class = 'cmp-ReviewAuthor']/text()[3]").extract_first()
        subrating = review.xpath(".//div[@class = 'cmp-SubRating']//div[@class = 'cmp-RatingStars-starsFilled']/@style").extract()
        item['work_life_rating'] = subrating[0]
        item['benefits_rating'] = subrating[1]
        item['security_rating'] = subrating[2]
        item['management_rating'] = subrating[3]
        item['culture_rating'] = subrating[4]
        # The width of the filled-stars bar encodes the sub-rating:
        # 3px=0, 15px=1, 27px=2, 39px=3, 51px=4, 63px=5
        # (a small conversion helper is sketched after this method)
        if len(review.xpath(".//div[@class = 'cmp-ReviewProsCons-prosText']//span[@class = 'cmp-NewLineToBr-text']/text()")):
            item['Pros'] = review.xpath(".//div[@class = 'cmp-ReviewProsCons-prosText']//span[@class = 'cmp-NewLineToBr-text']/text()").extract_first()
        else:
            item['Pros'] = 'NaN'
        if len(review.xpath(".//div[@class = 'cmp-ReviewProsCons-consText']//span[@class = 'cmp-NewLineToBr-text']/text()")):
            item['Cons'] = review.xpath(".//div[@class = 'cmp-ReviewProsCons-consText']//span[@class = 'cmp-NewLineToBr-text']/text()").extract_first()
        else:
            item['Cons'] = 'NaN'
        if len(review.xpath(".//span[text()='Yes']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()")):
            item['helpful'] = review.xpath(".//span[text()='Yes']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()").extract_first()
        else:
            item['helpful'] = 0
        if len(review.xpath(".//span[text()='No']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()")):
            item['helpless'] = review.xpath(".//span[text()='No']//span[@class = 'cmp-StatelessReviewFeedbackButtons-count']/text()").extract_first()
        else:
            item['helpless'] = 0
        yield item
    if len(response.xpath("//a[@data-tn-element = 'next-page']/@href")):
        next_url = response.xpath("//a[@data-tn-element = 'next-page']/@href").extract_first()
        yield scrapy.Request("https://www.indeed.com" + next_url, callback=self.parse_single)
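# NOTE: a minimal helper sketch (not part of the original spider) for the pixel table noted
# in the method above: the filled-stars bar grows 12px per star on top of a 3px base
# (3px=0 ... 63px=5). The exact style string format (e.g. "width: 51px;") is an assumption.
def style_to_stars(style):
    match = re.search(r'([\d.]+)\s*px', style or '')
    if not match:
        return None
    return round((float(match.group(1)) - 3) / 12, 1)  # 3px -> 0 stars, 63px -> 5 stars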
def start_requests(self): data1 = [ { "category": "m.图书.音像.游戏", "url": "http://list.jd.com/list.html?cat=mvd.jd.com/theme/4053-7.html&go=0" }, { "category": "m.图书.少儿.0-2岁", "url": "http://list.jd.com/list.html?cat=book.jd.com/children0-2.html&go=0" }, { "category": "m.图书.少儿.3-6岁", "url": "http://list.jd.com/list.html?cat=book.jd.com/children3-6.html&go=0" }, { "category": "m.图书.少儿.7-10岁", "url": "http://list.jd.com/list.html?cat=book.jd.com/children7-10.html&go=0" }, { "category": "m.图书.少儿.11-14岁", "url": "http://list.jd.com/list.html?cat=book.jd.com/children11-14.html&go=0" }, { "category": "m.图书.少儿.儿童文学", "url": "http://list.jd.com/list.html?cat=1713-3263-3394&go=0" }, { "category": "m.图书.少儿.绘本", "url": "http://list.jd.com/list.html?cat=1713-3263-4761&go=0" }, { "category": "m.图书.少儿.科普", "url": "http://list.jd.com/list.html?cat=1713-3263-3399&go=0" }, { "category": "m.图书.少儿.幼儿启蒙", "url": "http://list.jd.com/list.html?cat=1713-3263-3395&go=0" }, { "category": "m.图书.少儿.手工游戏", "url": "http://list.jd.com/list.html?cat=1713-3263-3396&go=0" }, { "category": "m.图书.少儿.智力开发", "url": "http://list.jd.com/list.html?cat=1713-3263-3398&go=0" }, { "category": "m.图书.教育.教材", "url": "http://list.jd.com/list.html?cat=1713-11047&go=0" }, { "category": "m.图书.教育.中小学教辅", "url": "http://list.jd.com/list.html?cat=1713-3289&go=0" }, { "category": "m.图书.教育.考试", "url": "http://list.jd.com/list.html?cat=1713-3290&go=0" }, { "category": "m.图书.教育.外语学习", "url": "http://list.jd.com/list.html?cat=1713-3291&go=0" }, { "category": "m.图书.教育.字典词典", "url": "http://list.jd.com/list.html?cat=1713-3294&go=0" }, { "category": "m.图书.文艺.小说", "url": "http://list.jd.com/list.html?cat=1713-3258&go=0" }, { "category": "m.图书.文艺.文学", "url": "http://list.jd.com/list.html?cat=1713-3259&go=0" }, { "category": "m.图书.文艺.青春文学", "url": "http://list.jd.com/list.html?cat=1713-3260&go=0" }, { "category": "m.图书.文艺.传记", "url": "http://list.jd.com/list.html?cat=1713-3261&go=0" }, { "category": "m.图书.文艺.动漫", "url": "http://list.jd.com/list.html?cat=1713-3272&go=0" }, { "category": "m.图书.文艺.艺术", "url": "http://list.jd.com/list.html?cat=1713-3262&go=0" }, { "category": "m.图书.文艺.摄影", "url": "http://list.jd.com/list.html?cat=1713-12776&go=0" }, { "category": "m.图书.经管励志.管理", "url": "http://list.jd.com/list.html?cat=1713-3266&go=0" }, { "category": "m.图书.经管励志.金融与投资", "url": "http://list.jd.com/list.html?cat=1713-3265&go=0" }, { "category": "m.图书.经管励志.经济", "url": "http://list.jd.com/list.html?cat=1713-3264&go=0" }, { "category": "m.图书.经管励志.励志与成功", "url": "http://list.jd.com/list.html?cat=1713-3267&go=0" }, { "category": "m.图书.人文社科.历史", "url": "http://list.jd.com/list.html?cat=1713-3273&go=0" }, { "category": "m.图书.人文社科.心理学", "url": "http://list.jd.com/list.html?cat=1713-3279&go=0" }, { "category": "m.图书.人文社科.政治/军事", "url": "http://list.jd.com/list.html?cat=1713-3276&go=0" }, { "category": "m.图书.人文社科.社会科学", "url": "http://list.jd.com/list.html?cat=1713-3281&go=0" }, { "category": "m.图书.人文社科.法律", "url": "http://list.jd.com/list.html?cat=1713-3277&go=0" }, { "category": "m.图书.人文社科.文化", "url": "http://list.jd.com/list.html?cat=1713-3280&go=0" }, { "category": "m.图书.生活.家教与育儿", "url": "http://list.jd.com/list.html?cat=1713-3270&go=0" }, { "category": "m.图书.生活.孕产", "url": "http://list.jd.com/list.html?cat=1713-3270-3509&go=0" }, { "category": "m.图书.生活.健身保健", "url": "http://list.jd.com/list.html?cat=1713-3269&go=0" }, { "category": "m.图书.生活.旅游/地图", "url": "http://list.jd.com/list.html?cat=1713-3271&go=0" }, { "category": "m.图书.生活.美食", "url": 
"http://list.jd.com/list.html?cat=1713-9278&go=0" }, { "category": "m.图书.生活.时尚美妆", "url": "http://list.jd.com/list.html?cat=1713-9291&go=0" }, { "category": "m.图书.生活.家居", "url": "http://list.jd.com/list.html?cat=1713-9301&go=0" }, { "category": "m.图书.生活.手工DIY", "url": "http://list.jd.com/list.html?cat=1713-9314-9315&go=0" }, { "category": "m.图书.生活.两性", "url": "http://list.jd.com/list.html?cat=1713-9309&go=0" }, { "category": "m.图书.生活.体育", "url": "http://list.jd.com/list.html?cat=1713-3288&go=0" }, { "category": "m.图书.科技.计算机与互联网", "url": "http://list.jd.com/list.html?cat=1713-3287&go=0" }, { "category": "m.图书.科技.建筑", "url": "http://list.jd.com/list.html?cat=1713-3284&go=0" }, { "category": "m.图书.科技.工业技术", "url": "http://list.jd.com/list.html?cat=1713-3282&go=0" }, { "category": "m.图书.科技.电子/通信", "url": "http://list.jd.com/list.html?cat=1713-9351&go=0" }, { "category": "m.图书.科技.医学", "url": "http://list.jd.com/list.html?cat=1713-3285&go=0" }, { "category": "m.图书.科技.科学与自然", "url": "http://list.jd.com/list.html?cat=1713-3286&go=0" }, { "category": "m.图书.科技.农林", "url": "http://list.jd.com/list.html?cat=1713-9368&go=0" }, { "category": "m.图书.刊/原版.杂志/期刊", "url": "http://list.jd.com/list.html?cat=1713-4758&go=0" }, { "category": "m.图书.刊/原版.英文原版书", "url": "http://list.jd.com/list.html?cat=1713-4855&go=0" }, { "category": "m.图书.刊/原版.港台图书", "url": "http://list.jd.com/list.html?cat=1713-6929&go=0" }, { "category": "m.图书.电子书.小说", "url": "http://list.jd.com/list.html?cat=e.jd.com/products/5272-5278.html&go=0" }, { "category": "m.图书.电子书.励志与成功", "url": "http://list.jd.com/list.html?cat=e.jd.com/products/5272-5287.html&go=0" }, { "category": "m.图书.电子书.经济金融", "url": "http://list.jd.com/list.html?cat=e.jd.com/products/5272-12438.html&go=0" }, { "category": "m.图书.电子书.文学", "url": "http://list.jd.com/list.html?cat=e.jd.com/products/5272-5279.html&go=0" }, { "category": "m.图书.电子书.社科", "url": "http://list.jd.com/list.html?cat=e.jd.com/products/5272-5301.html&go=0" }, { "category": "m.图书.电子书.婚恋两性", "url": "http://list.jd.com/list.html?cat=e.jd.com/products/5272-10884.html&go=0" }, { "category": "m.图书.电子书.外文原版", "url": "http://list.jd.com/list.html?cat=e.jd.com/products/5272-6828.html&go=0" }, { "category": "m.图书.电子书.免费", "url": "http://list.jd.com/list.html?cat=e.jd.com/products/5272-5276.html&go=0" }, ] for data in data1: category = data['category'] url = data['url'] request = scrapy.Request(url, self.parse) request.meta['category'] = category yield request
def parse(self, response):
    print(response.text)
    yield scrapy.Request(self.start_urls[0], dont_filter=True)
def get_media_requests(self, item, info):
    for image_url in item['image_urls']:
        yield scrapy.Request(image_url, meta={'image_name': item["image_name"] + '.jpg'})
def start_requests(self): """Start parsing from first page of questions""" yield scrapy.Request(url=f"{self.domain}/questions?tab=newest&page=1", callback=self.parse_pages)
def start_requests(self): urls = [ "http://www.wandoujia.com/apps", ] for url in urls: yield scrapy.Request(url=url, callback=self.parseCategory)
def start_requests(self):
    yield scrapy.Request(url=self.start_urls[0], callback=self.parse1)
def start_requests(self):
    urls = ['http://www.theguardian.com']
    for url in urls:
        yield scrapy.Request(url=url, callback=self.frontpage)
def start_requests(self):
    urls = ['https://en.wikipedia.org/wiki/Web_page',
            'https://en.wikipedia.org/wiki/Web_browser',
            'https://en.wikipedia.org/wiki/WorldWideWeb']
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)
def get_media_requests(self, item, info):
    for image_url in item['image_urls']:
        yield scrapy.Request(image_url)
def parse_page(self, response):
    url = 'http://www.renren.com/880151247/profile'
    request = scrapy.Request(url=url, callback=self.parse_project)
    yield request
def start_requests(self):
    url = 'https://news.pts.org.tw/list/0'
    meta = {'iter_time': 0}
    yield scrapy.Request(url, callback=self.parse_news_list, meta=meta)
def start_requests(self):
    for page in range(1, 12):
        url = 'http://blog.eastmoney.com/5185188/bloglist_0_%d.html' % page
        self.logger.debug('parsing page: ' + url)
        yield scrapy.Request(url, callback=self.parse_outer_page)