def country_history_data_parse(self, response):
    # Parse the returned result via response.json()
    data = response.json()
    # Historical epidemic data
    data = data.get('data')
    # Read the country_name and countryShortCode passed through the request meta
    country_name = response.meta['country_name']
    countryShortCode = response.meta['countryShortCode']
    # Tag every history record with the country it belongs to
    for d in data:
        d['country_name'] = country_name
        d['countryShortCode'] = countryShortCode
    # file_path = f'datas/countries/{country_name}.json'
    # # Parent directory of the file
    # parent_path = pathlib.PosixPath(file_path).parent
    # # Create the parent directory (recursively) if it does not exist
    # if not parent_path.exists():
    #     parent_path.mkdir(parents=True)
    #
    # # Save the data
    # with open(file_path, 'w+') as f:
    #     json.dump(data, f, ensure_ascii=False)
    # Save the data uniformly through the item pipeline instead
    item = SpiderItem()
    item['is_last_updated'] = False
    item['data'] = data
    yield item
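# Both the callback above and the parse() method further below populate the same
# two item fields. A minimal sketch of the SpiderItem definition they assume;
# the real items.py may declare additional fields for other spiders:
import scrapy

class SpiderItem(scrapy.Item):
    is_last_updated = scrapy.Field()  # True for the latest snapshot, False for per-country history
    data = scrapy.Field()             # list of statistics dicts (tagged with country_name / countryShortCode)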
def parse_info(self, response):
    item = SpiderItem()
    soup = BeautifulSoup(response.body, 'html.parser')
    soup_type = soup.find(id='lastfont')
    item['category'] = soup_type.string.strip()
    soup_title = soup.find(id='tdTitle').div
    item['title'] = soup_title.font.b.string.strip()
    soup_title = soup_title.next_sibling.next_sibling
    item['date'] = soup_title.get_text().split('\r\n')[1].strip()
    soup_content = soup.find(id='TDContent')
    item['content'] = soup_content.get_text()
    item['file_urls'] = []
    item['file_names'] = []
    soup_files = soup.find(id='filedown').find_all('a')
    for soup_file in soup_files:
        item['file_urls'].append(response.urljoin(soup_file.attrs['href']))
        item['file_names'].append(soup_file.get_text().strip())
    item['url'] = response.url
    return item
def parse(self, response):
    content = response.body.decode('utf-8')
    a = json.loads(content)  # re.search('window\.rawData= (.*)\;\s*\<\/script\>', content)
    if a:
        content = a
    goods_data = SpiderItem()
    content_goods = content['goods']
    goods_data['goods_id'] = content_goods["goods_id"]
    goods_data['mall_id'] = content_goods['mall_id']
    goods_data['goods_type'] = content_goods['goods_type']
    goods_data['category1'] = str(content_goods['cat_id_1'])
    goods_data['category2'] = str(content_goods['cat_id_2'])
    goods_data['category3'] = str(content_goods['cat_id_3'])
    goods_data['goods_name'] = content_goods['goods_name']
    goods_data['market_price'] = float(content_goods['market_price'] / 100)  # Unit: yuan (same below)
    goods_data['max_group_price'] = float(content['price']['max_on_sale_group_price'] / 100)
    goods_data['min_group_price'] = float(content['price']['min_on_sale_group_price'] / 100)
    goods_data['max_normal_price'] = float(content['price']['max_on_sale_normal_price'] / 100)
    goods_data['min_normal_price'] = float(content['price']['min_on_sale_normal_price'] / 100)
    goods_data['thumb_url'] = content_goods['thumb_url']
    # goods_data['publish_date'] = goods['created_at']
    goods_data['total_sales'] = int(content_goods['sold_quantity'])  # Total sales volume
    goods_data['is_on_sale'] = content_goods['is_onsale']
    # Derive the reference price
    goods_data['price'] = goods_data['min_group_price']
    goods_data['total_amount'] = float(goods_data['total_sales'] * float(goods_data['price']))  # Total sales amount
    # print(goods_data)
    yield goods_data
def parse_list(self, response):
    soup = BeautifulSoup(response.body, 'html.parser')
    soup_list = soup.find(id='MoreInfoList1_tdcontent') or soup.find(id='DataGrid1')
    soup_list = soup_list.find_all('a')
    soup_type = soup.find(id='lastfont')
    for i in soup_list:
        if 'infodetail' in i.attrs['href'].lower():
            yield scrapy.Request(url=response.urljoin(i.attrs['href']),
                                 callback=self.parse_info)
        elif 'buyi_list' in i.attrs['href'].lower():
            yield scrapy.Request(url=response.urljoin(i.attrs['href']),
                                 callback=self.parse_parameters)
        else:
            item = SpiderItem()
            item['category'] = soup_type.string.strip()
            item['title'] = i.string.strip()
            item['date'] = i.parent.next_sibling.string.strip().replace('-', '/')
            item['content'] = ''
            item['file_urls'] = [response.urljoin(i.attrs['href'])]
            item['file_names'] = ['test.txt']
            item['url'] = response.urljoin(i.attrs['href'])
            yield item
def parse(self, response):
    driver = webdriver.Chrome()
    reviews = response.xpath(
        "//div[contains(@class,'lister-item mode-detail imdb-user-review with-spoiler')]")
    driver.get(response.url)
    right_index = response.url.rfind('/')
    left_index = response.url[:right_index].rfind('/')
    movieID = response.url[left_index + 1:right_index]
    while len(reviews) <= 100:
        driver.find_element_by_class_name("ipl-load-more").click()
        html = driver.page_source
        reviews = scrapy.Selector(text=html).xpath(
            "//div[contains(@class,'lister-item mode-detail imdb-user-review with-spoiler')]")
    for review in reviews[:100].extract():
        element = scrapy.Selector(text=review)
        spiderItem = SpiderItem()
        spiderItem['url'] = response.url
        spiderItem['review'] = element.xpath(
            "//div[contains(@class, 'text show-more__control')]/text()").extract_first()
        spiderItem['movieID'] = movieID
        if len(element.xpath("//span[contains(@class, 'spoiler-warning')]")) != 0:
            spiderItem['spoiler'] = "true"
        else:
            spiderItem['spoiler'] = "false"
        score = element.xpath(
            "//div[contains(@class,'ipl-ratings-bar')]/span/span/text()")
        if len(score) != 0:
            spiderItem['score'] = score.extract_first()
        else:
            spiderItem['score'] = ""
        yield spiderItem
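# The callback above starts a new Chrome instance for every response and never
# quits it, so browsers accumulate. A hedged sketch of the same "click until
# enough reviews are loaded" loop with guaranteed cleanup; load_all_reviews is
# a hypothetical helper, and the class names are taken from the selectors above:
from selenium import webdriver

def load_all_reviews(url, target=100):
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Keep clicking the "load more" button until enough review blocks exist
        while len(driver.find_elements_by_class_name("lister-item")) < target:
            driver.find_element_by_class_name("ipl-load-more").click()
        return driver.page_source
    finally:
        driver.quit()  # released even if the button disappears or a click fails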
def parse4(self, response):
    for link in response.xpath('//div/a[contains(@class,"action")]/@href').extract():
        loader = ItemLoader(item=SpiderItem(), selector=link)
        urlabsoluta = response.urljoin(link)
        loader.add_value('file_urls', urlabsoluta)
        yield loader.load_item()
def parse_link(self, response):
    item = SpiderItem()
    unique_name = response.meta['unique_name']
    full_name = response.meta['full_name']
    content = response.meta['content']
    category = response.meta['category']
    post_time = response.meta['post_time']
    file_urls = response.meta['file_urls']
    screen_urls = response.meta['screen_urls']
    image_urls = response.meta['image_urls']
    tag = response.meta['tag']
    print unique_name
    print '------'
    sel = Selector(response)
    try:
        link1 = sel.xpath(
            '//div[contains(@class, "downloadlink")]//a/@href')[0].extract()
        link1_text = sel.xpath(
            '//div[contains(@class, "downloadlink")]//a/text()')[0].extract()
    except IndexError as e:
        # No download link found on the page; record the product name for review
        link1 = ''
        link1_text = ''
        f = open('no_link.html', 'a')
        f.write('%s\n' % full_name)
        f.close()
def parse(self, response):
    reviews = response.xpath(
        "//div[contains(@class,'lister-item mode-detail imdb-user-review with-spoiler')]")
    group = response.url.split('/')
    movieID = group[4]
    for review in reviews.extract():
        element = scrapy.Selector(text=review)
        spiderItem = SpiderItem()
        spiderItem['url'] = response.url
        spiderItem['review'] = element.xpath(
            "//div[contains(@class, 'text show-more__control')]/text()"
        ).extract_first().replace('\n', ' ')
        spiderItem['movieID'] = movieID
        if len(element.xpath("//span[contains(@class, 'spoiler-warning')]")) != 0:
            spiderItem['spoiler'] = "true"
        else:
            spiderItem['spoiler'] = "false"
        score = element.xpath(
            "//div[contains(@class,'ipl-ratings-bar')]/span/span/text()")
        if len(score) != 0:
            spiderItem['score'] = score.extract_first()
        else:
            spiderItem['score'] = ""
        yield spiderItem
    if len(self.start_urls) <= 40000:
        loader = response.xpath(
            "//div[contains(@class,'load-more-data')]/@data-key").extract_first()
        if loader is not None:
            url = ('https://www.imdb.com/title/' + movieID +
                   '/reviews/_ajax?paginationKey=' + loader)
            self.start_urls.append(url)
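# Appending the AJAX pagination URL to self.start_urls only works because Scrapy
# consumes start_requests() lazily. A more conventional sketch is to yield the
# follow-up request from the callback itself; follow_next_page and
# self.pages_crawled are hypothetical names, not part of the original spider.
# parse() would then end with: yield from self.follow_next_page(response, movieID)
def follow_next_page(self, response, movieID):
    loader = response.xpath(
        "//div[contains(@class,'load-more-data')]/@data-key").extract_first()
    if loader is not None and self.pages_crawled <= 40000:
        self.pages_crawled += 1
        next_page = ('https://www.imdb.com/title/' + movieID +
                     '/reviews/_ajax?paginationKey=' + loader)
        yield scrapy.Request(next_page, callback=self.parse)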
def parse(self, response):
    # This would print 40 "==" separators
    # response offers both xpath() and css() extraction, but xpath is the usual choice
    # print("==" * 40)
    # The first div is the overall container, the inner divs are its children.
    # contentLeft is a SelectorList; note that the outer id of the div is used.
    # contentLeft = response.xpath("//div[@id='content']/div")
    contentLeft = response.xpath("//div[@id='content']/div")
    # Each entry is a Selector
    for content in contentLeft:
        # How to get a single record (e.g. the author) when there is only one tag:
        # author = content.xpath(".//li/text()").get().strip()
        # author = content.xpath(".//div[@class='line']//text()").getall()
        author = content.xpath(".//div[@class='line']//text()").getall()
        # Join the text nodes into one string
        author = "".join(author).strip()
        # print(author)
        # Option 1: yield a plain dict for the pipeline to consume
        # text = {'text': author}
        # yield text
        # Option 2 (recommended): use the Item model
        item = SpiderItem(author=author)
        # Yield the current item
        yield item
def scrape_home_page(self, response):
    open_in_browser(response)
    l = ItemLoader(item=SpiderItem(), response=response)
    h1_tag = response.xpath('//h1/a/text()').extract_first()
    tags = response.xpath('//*[@class="tag-item"]/a/text()').extract()
    l.add_value('h1_tag', h1_tag)
    l.add_value('tags', tags)
    return l.load_item()
def parse_item(self, response):
    self.logger.info('parse url %s' % response.url)
    content = response.css('.detail').extract_first()
    content = re.sub('<.*?>|\t', '', content)
    duilian = re.findall('(.*?);(.*?)。', content)
    item = SpiderItem()
    item['duilian'] = duilian
    item['content'] = content
    item['url'] = response.url
    return item
def parse(self, response):
    movie = Selector(response=response).xpath(
        '//div[@class="channel-detail movie-item-title"]')
    for m in movie[:10]:
        item = SpiderItem()
        url = m.xpath('./a/@href').extract_first().strip()
        link = 'https://maoyan.com' + url
        item['link'] = link
        yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)
def parse(self, response): items = [] movies = Selector(response=response).xpath('//*[@id="app"]/div/div/div[1]/dl/dd[2]/div/div/div[1]') for movie in movies: item = SpiderItem() link = movie.xpath('./a/@href') title = movie.xpath('./a/text()') time = movie.xpath('//*[@id="app"]/div/div/div[1]/dl/dd[2]/div/div/div[1]/p[3]') item['title'] = title item['link'] = link item['time'] = time items.append(item) yield scrapy.Request(url=link,meta={'item':item},callback=self.parse2)
def parse_item(self, response):
    urls = response.css('a::attr(href)').extract()
    # Iterate over the extracted hrefs; only detail pages are processed
    for url in urls:
        if "detail" in url:
            # yield scrapy.Request('https:' + url, callback=self.parse,
            #                      errback=self.errback)
            newres = self.webpage('https:' + url)
            dt = ItemLoader(item=SpiderItem(), response=newres)
            dt.add_xpath('name', '//*[@id="J_AttrUL"]/li[1]')
            dt.add_xpath('parse', '//*[@id="J_AttrUL"]/li[3]')
            # dt.add_xpath('price', '//*[@id="J_StrPriceModBox"]/dd/span')
            dt.add_xpath('price', '//*[@id="J_StrPriceModBox"]/dd/span')
            return dt.load_item()
def parse_product(self, response):
    sel = Selector(response)
    item = SpiderItem()
    category = sel.xpath(
        '//div[contains(@class, "category-list")]//a/text()')[-1].extract()
    tag = sel.xpath(
        '//div[contains(@class,"post-tags-wrapper")]//div[contains(@class,"post-tags")]//a/text()'
    ).extract()
    content = sel.xpath(
        '//div[contains(@class, "the-content")]/*[not(@class="nmac-before-content" '
        'or self::a or self::script or @class="nmac-after-content" '
        'or @class="adsbygoogle" or @id="aswift_2_expand" '
        'or @class="alert fade in alert-error" or @class="wp-image-3333" '
        'or @style="text-align: center; width: 40%; margin-left: 30%;" '
        'or @style="text-align: center" or @style="text-align: center;" '
        'or @class="alert fade in alert-error" or @style="text-align: left;" '
        'or @class="alert fade in alert-error " '
        'or @style="text-align: center; width: 100%;" or @class="size-full")]'
    ).extract()
    download_url = sel.xpath(
        '//div[contains(@class, "the-content")]//a[contains(@class,"btn-block")]/@href'
    ).extract()
    name = sel.xpath(
        '//div[contains(@class, "main-content")]//h1/text()').extract()
    unique_name = name[0].replace(u'\u2013', '-').split('-')[0].strip()
    full_name = name[0].replace(u'\u2013', '-').strip()
    post_time = sel.xpath(
        '//div[contains(@class,"meta-data")]//span[contains(@class,"date")]/text()'
    )[-1].extract().split('\n')[-1].strip()
    image_urls = sel.xpath(
        '//div[contains(@class, "the-content")]//img[contains(@class,"alignright")]/@src'
    ).extract()
    item['unique_name'] = unique_name
    item['full_name'] = full_name
    item['content'] = content
    item['category'] = [category]
    item['image_urls'] = image_urls
    item['tag'] = tag
    item['post_time'] = post_time
    for index, d_url in enumerate(download_url):
        if index == 0:
            request = Request(d_url, callback=self.parse_download_link_1,
                              meta={"download_url": download_url, 'item': item})
            yield request
def parse(self, response):
    items = []
    shorts = Selector(response=response).xpath("//span[@class='short']")
    stars = Selector(response=response).xpath("//span[starts-with(@class, 'allstar')]")
    votes = Selector(response=response).xpath("//span[@class='votes']")
    for i in range(len(shorts)):
        item = SpiderItem()
        item['short'] = shorts[i].xpath('./text()').extract()[0]
        item['star'] = stars[i].xpath('./@class').extract()[0][7:9]
        item['recommend'] = stars[i].xpath('./@title').extract()[0]
        item['vote'] = votes[i].xpath('./text()').extract()[0]
        items.append(item)
    print(items)
    return items
def parse_item(self, response):
    self.logger.info('A response from %s just arrived!', response.url)
    item = SpiderItem()
    item['url'] = response.url
    # extract_first() returns None instead of raising when the title is missing
    title = response.xpath('//div[@id="article"]/h1[@id="title"]/text()').extract_first()
    if title:
        item['title'] = title
    else:
        item['title'] = ''
    text = response.xpath('//div[@id="article"]/div[2]/p/text()').extract()
    if text:
        item['text'] = ' '.join(text)
    else:
        item['text'] = ''
    return item
def parse(self, response):
    # Node list for every record on the page
    node_list = response.xpath("//div[@class='li_txt']")
    for node in node_list:
        item = SpiderItem()
        # .extract() converts the xpath selectors into Unicode strings
        name = node.xpath("./h3/text()").extract()
        title = node.xpath("./h4/text()").extract()
        info = node.xpath("./p/text()").extract()
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]
        # yield: pause the loop after building one item, hand it to the
        # pipeline, then continue with the next node
        yield item
def parse(self, response):
    items = []
    movies = Selector(response=response).xpath('//div[@class="movie-hover-info"]')
    for movie in movies:
        item = SpiderItem()
        movie_title = movie.xpath('./div/span/text()').extract_first()
        item['movie_title'] = movie_title
        movie_info_list = movie.xpath(
            './div[@class="movie-hover-title"]/text()').extract()
        movie_info_list_new = []
        for x in movie_info_list:
            x = x.replace('\n', '').replace(' ', '')
            if x != '':
                movie_info_list_new.append(x)
        movie_type = movie_info_list_new[0]
        item['movie_type'] = movie_type
        movie_time_list = movie.xpath(
            './div[@class="movie-hover-title movie-hover-brief"]/text()').extract()
        movie_time_list_new = []
        for y in movie_time_list:
            y = y.replace('\n', '').replace(' ', '')
            if y != '':
                movie_time_list_new.append(y)
        movie_time = ''.join(movie_time_list_new)
        if movie_time == '':
            movie_time = '暂无'  # "暂无" = not available
        item['movie_time'] = movie_time
        items.append(item)
    return items
def parse(self, response):
    movies = Selector(response=response).xpath(
        '//div[contains(@class, "movie-hover-info")]')
    print(movies)
    for movie in movies[0:10]:
        name = movie.xpath('./div/span[@class="name "]/text()')
        print(name.extract_first())
        # "类型" = genre
        category = movie.xpath('./div/span[contains(text(), "类型")]/parent::*/text()')
        print(category.extract()[-1].strip())
        # "上映时间" = release date
        show_time = movie.xpath('./div/span[contains(text(), "上映时间")]/parent::*/text()')
        print(show_time.extract()[-1].strip())
        item = SpiderItem()
        item['name'] = name.extract_first()
        item['category'] = category.extract()[-1].strip()
        item['show_time'] = show_time.extract()[-1].strip()
        yield item
def parse(self, response):
    '''
    extract title content url
    '''
    print '>' * 50
    print 'response url: ', response.url
    hxs = HtmlXPathSelector(response)
    print '>>>> response.url: ', response.url
    # get urls
    content_urls = hxs.select(content_url_format).extract()
    list_urls = hxs.select(list_url_format).extract()
    list_urls = [up.urljoin(response.url, url) for url in list_urls]
    content_urls = [up.urljoin(response.url, url) for url in content_urls]
    print "@" * 60
    time.sleep(self.sleep_time)
    self.start_urls.extend(list_urls)
    for url in list_urls:
        yield Request(url, self.parse)
    # http://www.pcconnection.com/IPA/Shop/Product/Detail.htm?sku=16037879&cac=Result
    content_re = re.compile(r'http://www[.]pcconnection[.]com/.*cac=Result')
    for url in content_urls:
        if content_re.match(url):
            if len(self.dic) > 160:
                self.start_urls = []
                raise CloseSpider('reach pages limit, end the spider.')
            self.count += 1
            self.dic.add(hash(url))
            # extract data
            item = SpiderItem()
            item['url'] = url
            item['kind'] = self.name
            yield item
        else:
            print "!!!!!!! not match content url:"
            print url
def parse(self, response):
    item = SpiderItem()
    if self.counter < self.goal:
        print(self.counter)
        self.counter += 1
        # h1, h2, h3, h4, h5, h6, li, a, span
        title = response.xpath("//h1/text()").extract()
        p = response.xpath('//p/text()').extract()
        span = response.xpath('//span/text()').extract()
        li = response.xpath('//li/text()').extract()
        a = response.xpath('//a/text()').extract()
        # h1 = response.xpath('//h1/text()').extract()
        h2 = response.xpath('//h2/text()').extract()
        h3 = response.xpath('//h3/text()').extract()
        h4 = response.xpath('//h4/text()').extract()
        h5 = response.xpath('//h5/text()').extract()
        h6 = response.xpath('//h6/text()').extract()
        text = str(p).strip() + str(span).strip() + str(a).strip() \
            + str(h2).strip() + str(h3).strip() \
            + str(h4).strip() + str(h5).strip() + str(h6).strip() \
            + str(li) + str(title)
        text = text.replace('\\r', '').replace('\\n', '').replace('\\t', '')
        if str(response.url) not in self.dict_page.keys():
            self.doc_id += 1
            self.dict_page[str(response.url)] = [[self.doc_id], [text]]
            item['text'] = text
            item['title'] = title
    else:
        print("Writing dictionary into file. " +
              "Dictionary size: " + str(len(self.dict_page)))
        with open("/Users/lekangdu/Desktop/my_spider/spider/res/ai_res" +
                  str(self.goal) + ".json", 'w') as f:
            json.dump(self.dict_page, f)
        self.crawler.engine.close_spider(self, 'Spider closed.')
    links = response.xpath('.//a/@href').extract()
    for url in links:
        if url.endswith('.html'):
            next_url = response.urljoin(url)
            yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    content = response.body.decode('utf-8')
    a = re.search('window\.rawData= (.*)\;\s*\<\/script\>', content)
    if a:
        content = json.loads(a.group(1))
        print(content)
    if 'goods' not in content.keys():
        return None
    goods = content['goods']
    goods_data = SpiderItem()
    goods_data['goods_id'] = goods['goodsID']
    goods_data['mall_id'] = goods['mallID']
    goods_data['goods_type'] = goods['goodsType']
    goods_data['category1'] = str(goods['catID1'])
    goods_data['category2'] = str(goods['catID2'])
    goods_data['category3'] = str(goods['catID3'])
    goods_data['goods_name'] = goods['goodsName']
    goods_data['market_price'] = goods['marketPrice']
    goods_data['max_group_price'] = goods['maxOnSaleGroupPrice']
    goods_data['min_group_price'] = goods['minOnSaleGroupPrice']
    goods_data['max_normal_price'] = goods['maxOnSaleNormalPrice']
    goods_data['min_normal_price'] = goods['minOnSaleNormalPrice']
    goods_data['thumb_url'] = goods['thumbUrl']
    goods_data['publish_date'] = self.get_goods_publish_date(
        goods['topGallery'], goods['detailGallery'], goods['skus'])
    goods_data['total_sales'] = int(goods['sales'])  # Total sales volume
    if goods['isOnSale'] and goods['isGoodsOnSale']:
        goods_data['is_on_sale'] = 1
    else:
        goods_data['is_on_sale'] = 0
    # Derive the reference price
    goods_data['price'] = self.get_goods_price(goods['skus'], goods['sales'])
    goods_data['total_amount'] = float(
        goods_data['total_sales'] * float(goods_data['price']))  # Total sales amount
    yield goods_data
def parse(self, response): ''' extract title content url ''' print '>' * 50 print 'response url: ', response.url hxs = HtmlXPathSelector(response) print '>>>> repsonse.url: ', response.url #get urls content_urls = hxs.select(content_url_format).extract() list_urls = hxs.select( '//ul[contains(@class,"pagination")]/li/a[contains(@href,"query=camera")]/@href' ).extract() list_urls = [up.urljoin(response.url, url) for url in list_urls] content_urls = [up.urljoin(response.url, url) for url in content_urls] print "@" * 60 time.sleep(self.sleep_time) self.start_urls.extend(list_urls) for url in list_urls: yield Request(url, self.parse) content_re = re.compile( r'http://.*[.]cnet[.]com/.*camera.*/.*[.]html$') for url in content_urls: if content_re.match(url): if self.count > 450: self.start_urls = [] raise CloseSpider('reach pages limit, end the spider.') self.count += 1 #extract data item = SpiderItem() item['url'] = url item['kind'] = self.name yield item
def parse(self, response):
    movie_list = response.xpath(
        "//div[@class='article']//ol[@class='grid_view']//li")
    # Loop over the movie entries
    for i_item in movie_list:
        # Instantiate the Item defined in items.py
        spider_item = SpiderItem()
        # Detailed xpath expressions for each field
        spider_item['serial_number'] = i_item.xpath(
            ".//div[@class='item']//em/text()").extract_first()
        spider_item['movie_name'] = i_item.xpath(
            ".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first()
        content = i_item.xpath(
            ".//div[@class='info']//div[@class='bd']/p[1]/text()").extract()
        # Clean up the introduction text
        for i_content in content:
            content_s = "".join(i_content.split())
            spider_item['introduce'] = content_s
        # print(spider_item['introduce'])
        spider_item['star'] = i_item.xpath(
            ".//span[@class='rating_num']/text()").extract_first()
        spider_item['evaluate'] = i_item.xpath(
            ".//div[@class='star']//span[4]/text()").extract_first()
        spider_item['describe'] = i_item.xpath(
            ".//p[@class='quote']/span/text()").extract_first()
        # The item needs to be yielded to the pipelines
        # yield spider_item
    # Parse the next page: take the href of the "next" link
    next_link = response.xpath("//span[@class='next']/a/@href").extract()
    # Continue only if a next-page button exists
    if next_link:
        next_link = next_link[0]
        yield scrapy.Request("https://movie.douban.com/top250" + next_link,
                             callback=self.parse)  # callback for the next page
def parse(self, response):
    # 1. Extract values from the response object; either of the two
    #    extraction methods below works:
    # 1.1 response.xpath()
    # data_txt = response.xpath('//script[@id="getListByCountryTypeService2true"]/text()').get()
    # print(data_txt)
    # 1.2 response.css()
    data_txt = response.css('#getListByCountryTypeService2true::text').get()
    # print(f'Working directory: {os.getcwd()}')
    # 2. Clean the data
    # 2.1 Use a regex to pull the "[...]" JSON array out of the script text
    data_txt = re.findall('\[.+\]', data_txt)[0]
    # 2.2 Use the json module to turn the string into a Python object (a list here)
    data = json.loads(data_txt)
    item = SpiderItem()
    item['is_last_updated'] = True
    item['data'] = data
    yield item
    # # 3. Save the data
    # with open('datas/last_updated_dxy_datas.json', 'w+') as f:
    #     json.dump(data, f, ensure_ascii=False)
    # Loop over data to get each country's historical epidemic data URL
    for country_data in data:
        # URL of the country's historical epidemic data
        url = country_data['statisticsData']
        # Name of the country or region
        country_name = country_data['provinceName']
        countryShortCode = country_data['countryShortCode']
        # Issue the request, carrying the country context in meta
        yield scrapy.Request(url,
                             callback=self.country_history_data_parse,
                             meta={'country_name': country_name,
                                   'countryShortCode': countryShortCode})
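# The per-country context above travels through response.meta. A sketch of the
# equivalent request using cb_kwargs (available in Scrapy 1.7+), which keeps
# callback arguments separate from middleware-facing metadata; the callback
# would then accept country_name and countryShortCode as parameters instead of
# reading response.meta:
yield scrapy.Request(url,
                     callback=self.country_history_data_parse,
                     cb_kwargs={'country_name': country_name,
                                'countryShortCode': countryShortCode})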
def parse(self, response):
    item = SpiderItem()
    pro_char = 'http://www.baidu.com/'
    for num in range(1, 11):
        pro_hot = response.xpath(
            "//div[@class='FYB_RD']/table/tbody[1]/tr[%d]" % num)
        hotname = pro_hot.xpath('td/span/a/text()').extract()[0]
        hoturl = pro_hot.xpath('td/span/a/@href').extract()[0]
        theurl = pro_char + hoturl
        item['bdname'] = hotname
        item['bdurl'] = theurl
        print hotname, theurl
        yield scrapy.Request(
            url='https://www.so.com/s?ie=utf-8&fr=none&src=360sou_newhome&q=%E7%83%AD%E7%82%B9',
            meta={'item': item},
            callback=self.get360,
            dont_filter=True)
def parse(self, response): ''' extract title content url ''' print '>' * 50 print 'response url: ', response.url hxs = HtmlXPathSelector(response) print '>>>> repsonse.url: ', response.url #get urls content_urls = hxs.select(content_url_format).extract() list_urls = hxs.select( '//span[contains(@class,"pagnLink")]/a[contains(@href,"keywords=notebook")]/@href' ).extract() list_urls = [up.urljoin(response.url, url) for url in list_urls] print "@" * 60 time.sleep(self.sleep_time) self.start_urls.extend(list_urls) for url in list_urls: yield Request(url, self.parse) content_re = re.compile( r'http://www.amazon.com/[^s]+.*&keywords=notebook$') for url in content_urls: if content_re.match(url): if len(self.dic) > 450: self.start_urls = [] raise CloseSpider('reach pages limit, end the spider.') self.count += 1 self.dic.add(hash(url)) #extract data item = SpiderItem() item['url'] = url item['kind'] = 'amazon_notebook' yield item
def parse(self, response):
    goods_id = response.meta['goods_id']
    self.ssdb_client.hdel(self.hash_name, goods_id)
    content = response.body.decode('utf-8')
    a = json.loads(content)  # re.search('window\.rawData= (.*)\;\s*\<\/script\>', content)
    if a:
        goods = a
    goods_data = SpiderItem()
    goods_data['goods_id'] = goods['goods_id']
    goods_data['mall_id'] = goods['mall_id']
    goods_data['goods_type'] = goods['goods_type']
    goods_data['category1'] = str(goods['cat_id_1'])
    goods_data['category2'] = str(goods['cat_id_2'])
    goods_data['category3'] = str(goods['cat_id_3'])
    goods_data['goods_name'] = goods['goods_name']
    goods_data['market_price'] = float(goods['market_price'] / 100)  # Unit: yuan (same below)
    goods_data['max_group_price'] = float(goods['max_on_sale_group_price'] / 100)
    goods_data['min_group_price'] = float(goods['min_on_sale_group_price'] / 100)
    goods_data['max_normal_price'] = float(goods['max_on_sale_normal_price'] / 100)
    goods_data['min_normal_price'] = float(goods['min_on_sale_normal_price'] / 100)
    goods_data['thumb_url'] = goods['thumb_url']
    goods_data['publish_date'] = goods['created_at']
    goods_data['total_sales'] = int(goods['sales'])  # Total sales volume
    goods_data['is_on_sale'] = goods['is_onsale']
    # Derive the reference price
    goods_data['price'] = goods_data['min_group_price']
    goods_data['total_amount'] = float(
        goods_data['total_sales'] * float(goods_data['price']))  # Total sales amount
    yield goods_data
def parse(self, response):
    global maxnum
    item = SpiderItem()
    selector = Selector(response)
    article = selector.css('.post')
    for arc in article:
        name = arc.xpath('.//h2[@class="entry-title"]//text()').extract_first()
        link = arc.css('.entry-title a::attr(href)').extract_first()
        author = arc.xpath('.//h5[@class="entry-author"]/a/text()').extract()
        description = arc.xpath('.//div[@class="entry-summary"]/p/text()').extract_first()
        authors = ''
        for eachAuthor in author:
            eachAuthor += ' '
            authors += eachAuthor
        item['name'] = name
        item['link'] = link
        item['author'] = authors
        item['description'] = description
        yield item
    url = 'http://www.allitebooks.com/page'
    digits = re.findall(r'/(\d+)/', response.url)
    digit = 0
    if bool(self.maxnum) is False:
        self.maxnum = selector.css(
            "#content .pagination a:last-child").xpath(".//text()").extract_first()
        self.maxnum = int(self.maxnum)
        print("\nmaxnum:" + str(self.maxnum) + '\n')
    if digits:
        digit = int(digits[0]) + 1
        nextURL = url + '/' + str(digit) + '/?s=python'
    else:
        nextURL = url + '/2/?s=python'
    if digit <= self.maxnum:
        print("\nnextURL:" + nextURL + "\n")
        yield scrapy.Request(nextURL, callback=self.parse)