def video_page(self, response: HtmlResponse):
    video_title = response.css('h1.title').css('span::text').get()
    video_channel = response.css('div.video-actions-container').css(
        'div.usernameWrap.clearfix').css('a::text').get()
    js = response.css('div.video-wrapper').css('#player').css('script').get()
    data_video_id = response.css('div.video-wrapper').css(
        '#player::attr(data-video-id)').get()
    # evaluate the inline player script to recover the quality/url list
    prepare_js = js.split('<script type="text/javascript">')[1].split(
        'loadScriptUniqueId')[0]
    exec_js = '{0}\nqualityItems_{1};'.format(prepare_js, data_video_id)
    js_result = js2py.eval_js(exec_js)  # type: js2py.base.JsObjectWrapper
    quality_items = js_result.to_list()  # type: list
    quality = quality_items[-1]['text'].split('p')[0]
    if int(quality) >= 720:
        video_url = quality_items[-1]['url']
        self.logger.info('parse [%s] success, url: %s', video_title, video_url)
        if self.settings.get('ENABLE_SQL'):
            result = self.data_base.select_all_by_title_my_follow(video_title)
            if len(result) != 0:
                for line in result:
                    self.logger.error('has duplicate record: %s', line)
            else:
                self.data_base.save_my_follow(video_title, video_channel,
                                              video_url, response.url)
        yield PornhubItem(file_urls=video_url,
                          file_name=video_title,
                          file_channel=video_channel)
def get_desc(cls, value):
    try:
        # Step 1: build the description API url
        url_desc = ingram.desc_api % (value)
        # Step 2: fetch the page and wrap it in an HtmlResponse for XPath access
        data = urllib.urlopen(url_desc)
        response = HtmlResponse(url=url_desc, body=data.read())
        data = None
        all_xpath = [
            "substring-before(//div[@class='training_details_content'],'Language')",
            "substring-before(//div[@class='training_details_content'],'Please bring your')",
            "//div[@class='training_details_content']/p/text()",
            "//div[@class='training_details_content']/div[1]/text()",
            "//div[@class='training_details_content']/span/text()",
            "//div[@class='training_details_content']//text()"
        ]
        # try the XPath expressions in order and return the first non-empty description
        for xpath in all_xpath:
            data = response.xpath(xpath).extract()
            desc = html_to_text(data)
            if desc:
                return desc
        return None
    except Exception:
        return None
def most_popular_page(self, response: HtmlResponse):
    description_list = response.css('div.descriptionContainer')
    for item in description_list:
        title = item.css('a::text').extract_first()
        sub_link = item.css('a::attr(href)').extract_first()
        channel_url = response.urljoin(sub_link)
        self.logger.warning('get channel:{0} ,link is:{1}'.format(
            title, channel_url))
        yield scrapy.Request(channel_url, callback=self.channel_page_see_all)
    # determine whether there is a next page
    next_page_li = response.css('li.page.next.wl-page')
    if next_page_li:
        next_page_sub_link = next_page_li.css('a::attr(href)').extract_first()
        page_number = int(next_page_sub_link.split('page=')[1])
        page_number_start = self.settings.get('PAGE_NUMBER_START')
        page_number_end = self.settings.get('PAGE_NUMBER_END')
        if page_number_end is not None:
            if page_number_start < page_number <= page_number_end:
                next_page_url = response.urljoin(next_page_sub_link)
                self.logger.warning(
                    'has next page, url is:{0}'.format(next_page_url))
                yield scrapy.Request(next_page_url,
                                     callback=self.most_popular_page)
            else:
                self.logger.warning('has next page, but is in limit')
        else:
            next_page_url = response.urljoin(next_page_sub_link)
            self.logger.warning(
                'has next page, url is:{0}'.format(next_page_url))
            yield scrapy.Request(next_page_url,
                                 callback=self.most_popular_page)
def test_extract_repeated_field(self):
    sample = {
        'plugins': {'annotations-plugin': {}},
        'url': 'https://stackoverflow.com',
        'original_body': re.sub(
            'data-scrapy-annotate=".*"', '', html_page._body),
        'scrapes': 'default',
        'version': '0.13.0'
    }
    data = _open_spec('so_annotations.json')
    annos, items, results = data['annos'], data['items'], data['results']
    sample['plugins']['annotations-plugin']['extracts'] = annos
    spider = IblSpider('so', _spider(sample=sample), items, {}, Settings())
    page = HtmlResponse('http://url', body=sample['original_body'],
                        encoding='utf-8')
    items = [i for i in spider.parse(page) if not isinstance(i, Request)]
    keys = {(u'_index', u'_template', u'_type', u'answered', u'tags',
             u'title', 'url')}
    self.assertEqual({tuple(sorted(i.keys())) for i in items}, keys)
    self.assertEqual(items[0], results[0])
    self.assertEqual(items[52], results[1])
    self.assertEqual(items[-1], results[2])
    self.assertEqual(len(items), 96)

    data = _open_spec('autoevolution.json')
    schemas = data['schemas']
    results = data['results']
    page = HtmlResponse('http://url', body=data['original_body'],
                        encoding='utf-8')
    spider = IblSpider('ae', _spider(sample=data), schemas, {}, Settings())
    items = [i for i in spider.parse(page) if not isinstance(i, Request)]
    self.assertEqual(items, results)
def process_request(self, request, spider):
    try:
        self.browser.get(request.url)
        time.sleep(1)
        if request.url == "http://www.jianshu_selenium.com/":
            # scroll down in steps so lazily loaded content is rendered
            for i in range(20):
                js = 'window.scrollTo(0,%s)' % (i * 300)
                self.browser.execute_script(js)
                time.sleep(0.5)
            # while self.browser.execute_script('alert("To Bottom")'):
            #     self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            #     self.browser.execute_script('alert("To Bottom")')
            #     time.sleep(1)
            try:
                # keep clicking the "load more" button until it disappears
                while True:
                    show_more = self.browser.find_element_by_class_name('load-more')
                    show_more.click()
                    time.sleep(1)
                    if not show_more:
                        break
            except Exception:
                pass
        return HtmlResponse(url=request.url,
                            body=self.browser.page_source,
                            request=request,
                            encoding='utf-8',
                            status=200)
    except TimeoutException:
        return HtmlResponse(url=request.url, status=500, request=request)
def page(cls, response: HtmlResponse):
    current = url_query(response.url)
    current_page = int(current['p'])
    tag = current['word']
    item_type = response.meta['item_type']
    _search = demjson.decode(response.text)['body'][item_type['type']]
    _pages = math.ceil(_search['total'] / item_type['page_count'])
    cls.spider_log.info(
        "Search :%s Type:%s Total :%s Pages: %s Current :%s" %
        (tag, item_type['type'], _search['total'], _pages, current_page))
    _datas = _search['data']
    response.meta['word'] = tag
    _cls = cls
    _space = cls.settings().get('FILES_STORE')

    def _filter(id):
        # skip ids already recorded in the per-group database
        _database = os.path.join(_space, response.meta['group'],
                                 '%s_main.db' % cls.script_name())
        cls.space.set(_database, MainSpace.space(_database))
        _has = _cls.space.get(_database).skip_complete({'id': id})
        if _has is True:
            _cls.spider_log.info("Skip Item :%s" % str(id))
        return _has

    for _data in _datas:
        if _filter(_data['id']):
            continue
        if item_type['type'] in ['manga', 'illust']:
            artworks = "https://www.pixiv.net/ajax/illust/%s" % _data['id']
            referer = 'https://www.pixiv.net/artworks/%s' % _data['id']
            cls.spider_log.info("Illust Title :%s" % _data['title'])
            author_item = AuthorItem()
            author_item['id'] = _data['userId']
            author_item['name'] = _data['userName']
            response.meta['author'] = author_item
            yield Request(url=artworks,
                          callback=cls.illust_detail,
                          meta=response.meta,
                          headers={'Referer': referer})
        if item_type['type'] in ['novel']:
            _novel_url = "https://www.pixiv.net/ajax/novel/%s" % _data['id']
            cls.spider_log.info("Novel Title :%s" % _data['title'])
            author_item = AuthorItem()
            author_item['id'] = _data['userId']
            author_item['name'] = _data['userName']
            response.meta['author'] = author_item
            yield Request(url=_novel_url,
                          callback=cls.novels_metas,
                          meta=response.meta)
    if current_page < _pages:
        _item_url = "https://www.pixiv.net/ajax/search/%s/%s?word=%s&order=date_d&mode=all&p=%s&s_mode=s_tag_full&lang=zh" % (
            item_type['url'], tag, tag, current_page + 1)
        yield Request(url=_item_url, callback=cls.page, meta=response.meta)
def parse(self, response: HtmlResponse, **kwargs):
    url = 'https://www.news.gov.hk/jsp/NewsArticle.jsp'
    new_url = 'https://sc.news.gov.hk/TuniS/www.news.gov.hk/jsp/NewsArticle.jsp'
    category_list = [
        'finance', 'school_work', 'health', 'environment', 'law_order',
        'infrastructure', 'admin', 'city_life', 'record'
    ]
    language_list = ['eng', 'chi']
    params = {
        'language': 'chi',
        'category': 'finance',
        'date': '',
    }
    for date in get_date():
        for category in category_list:
            if date == '202102':
                break
            for language in language_list:
                params['date'] = date
                params['language'] = language
                params['category'] = category
                yield response.follow(url=url + '?' + urlencode(params),
                                      callback=self.get_news_list)
            params['language'] = 'chi'
            yield response.follow(url=new_url + '?' + urlencode(params),
                                  callback=self.get_news_list)
def model_page(self, response: HtmlResponse):
    video_sum_element = response.css('div.showingInfo').css('span.totalSpan')
    # some model pages do not show a total video count
    page_number = 1
    if video_sum_element:
        video_sum = video_sum_element.css('::text').get()
        sum_number = int(video_sum)
        page_number = math.ceil(sum_number / 40)
    # a url containing 'page' already lists all videos; page_number == 1 means
    # there is only one page, so start parsing directly
    if 'page' in response.url or page_number == 1:
        li_list = response.css('div.videoUList').css('ul').css('li')
        for li_tag in li_list:  # type: SelectorList
            a_tag = li_tag.css('span.title').css('a')
            video_title = a_tag.css('::text').get()
            video_url = a_tag.css('::attr(href)').get()
            real_url = 'https://www.pornhubpremium.com' + video_url
            self.logger.info('send [%s] ,url: %s', video_title, video_url)
            yield scrapy.Request(real_url, callback=self.video_page,
                                 priority=100)
    else:
        # url without 'page' and more than one page: request the page that
        # loads all videos
        new_link = '{0}?page={1}'.format(response.url, page_number)
        yield scrapy.Request(new_link, callback=self.model_page, priority=10)
def _create_product_data_dictionary(
    self,
    response: HtmlResponse,
    name: str,
    brand: Optional[str] = None,
    model_number: Optional[str] = None,
    upc: Optional[str] = None,
    data: Optional[Dict] = None,
) -> Dict:
    breadcrumbs = response.css(
        'ul.nav.breadcrumb '
        '> li[itemtype="http://data-vocabulary.org/Breadcrumb"] '
        '> a[itemprop="url"] '
        '> span[itemprop="title"]::text').getall()
    item = product_data_item_loader \
        .ProductDataItemLoader(response=response) \
        .add_language_data(
            response=response,
            brand=brand,
            images=response.css(
                'meta[property="og:image"]::attr(content)'
            ).extract(),
            name=name,
            url=response.url,
            breadcrumbs=breadcrumbs
        ).add_sku(sku=upc) \
        .add_upc(response=response, upc=upc) \
        .add_store_id(store_id=self.store_id) \
        .add_sold_by(sold_by=self.sold_by) \
        .add_version(version=self.version) \
        .load_item()
    return item.get_dictionary()
def parse_task(self, response: HtmlResponse, subsection='empty'):
    # Source
    task_name = response.css(
        'table.viewingtable div.componentboxheader::text').extract_first().strip()
    source = TaskSourceItem()
    source['name'] = f'{task_name} (problems.ru)'
    source['url'] = response.url

    content = response.css('table.viewingtable .componentboxcontents')

    # Themes
    info = content.css('table.problemdetailscaptiontable')
    themes = [theme.strip() for theme in info.css(
        '.problemdetailssubject .problemdetailssubjecttablecell '
        'a.componentboxlink::text').extract()]

    # Grades
    _, grades = info.css('.problemdetailsdifficulty nobr::text').extract()
    grades = list(map(int, re.findall(r'\d+', grades)))

    # Task
    task_dict, image_urls, tex_used = self.extract_task(content, response)

    yield ParseResultItem(
        source=source,
        themes=themes,
        grades=grades,
        task=task_dict,
        section=SECTION,
        subsection=subsection,
        image_urls=image_urls,
        tex_used=tex_used
    )
def parse_region(self, response: HtmlResponse):
    """Parse regions.

    Nordbayern -> Frankenjura Nord
    Example: https://www.frankenjura.com/klettern/region/2
    """
    item = SectorItem()
    item["name"] = response.meta["region_name"]
    item["fk_sector"] = response.meta["parent"]
    item["source"] = response.url
    item["description"] = response.css(
        'div[class="location-head"]+p ::text').get()
    yield item

    region = item.django_model.objects.get(**item)
    sub_regions = response.css('div[class="column"]').css('a[href*="region"]')
    for sub_region in sub_regions:
        meta = {
            "sub_region_name": sub_region.css("::text").get(),
            "parent": region,
        }
        yield response.follow(sub_region, self.parse_sub_region, meta=meta)
def detail_xpath(response: HtmlResponse):
    data = JobDetail()
    url = response.url
    job_top_detail = response.xpath("//div[@class='cn']")
    job_name = job_top_detail.xpath("./h1/@title").extract()[0]
    job_salary = job_top_detail.xpath("./strong//text()").extract()[0]
    job_company = job_top_detail.xpath(
        "./p[@class='cname']/a/@title").extract()[0]
    job_tag = job_top_detail.xpath(
        "./p[contains(@class,'msg')]/@title").extract()[0]
    job_position_information = response.xpath(
        "//div[contains(@class,'bmsg job_msg')]/p//text()").extract()
    job_position_information: str = ''.join(job_position_information)
    company_detail = response.xpath("//div[@class='com_tag']")
    company_category = company_detail.xpath("./p[1]/@title").extract()[0]
    company_number_of_people = company_detail.xpath(
        "./p[2]/@title").extract()[0]
    company_tag = company_detail.xpath("./p[3]/@title").extract()[0]
    data['url'] = url
    data['job_name'] = job_name.replace('\xa0', '')
    data['job_salary'] = job_salary.replace('\xa0', '')
    data['job_company'] = job_company.replace('\xa0', '')
    data['job_tag'] = job_tag.replace('\xa0', '')
    data['job_position_information'] = job_position_information.replace(
        '\r\n', '').replace('\xa0', '')
    data['company_category'] = company_category.replace('\xa0', '')
    data['company_number_of_people'] = company_number_of_people.replace(
        '\xa0', '')
    data['company_tag'] = company_tag.replace('\xa0', '')
    return data
def parse(self, response):
    """Extract the total number of pages in each category and dispatch a
    request for every page."""
    # print("[url: %s || status: %s]" % (response.url, response.status))
    retitem = ExporterItem()
    urlprefix = "http://mp.aiweibang.com/asyn/categoryarticleList?uid=311487&cid=75967&pageindex="
    page_num = 1
    driver = webdriver.PhantomJS()
    # essay_urls = []
    while True:
        _url = urlprefix + str(page_num)
        driver.get(_url)
        # wrap the rendered page in an HtmlResponse purely to reuse its
        # XPath support
        resp = HR("", 200, {}, driver.page_source.encode("utf8"))
        dic = eval(resp.xpath("//pre/text()").extract()[0])
        if len(dic["list"]) == 0:
            break
        else:
            for i in dic["list"]:
                essay = i["url"]
                if essay.split("&sn")[0] not in self.all_urls:
                    retitem.set_record(essay)
        page_num += 1
    return retitem
def parse(self, response: HtmlResponse):
    log.info(f"Parsing {response.url}")
    title = response.xpath("//title/text()").extract_first()
    log.info(f"Visiting {title}")

    start_url = TestSpider.start_urls[0]
    p_url = f"{start_url}?p="
    for link_href in response.xpath("//link[@href]"):
        url = link_href.xpath("@href").extract_first()
        if p_url in url:
            log.info(f"Recording page ID URL: {url}")
            yield {
                "title": title,
                "long_url": response.url,
                "short_url": url
            }

    # https://github.com/dkmiller/tidbits/blob/graph-algorithms/2020/2020-12-15_graph-algorithms/Graph.Algorithms/Web.cs
    for link in response.xpath("//a[@href]"):
        url = link.xpath("@href").extract_first()
        if start_url in url and ("#comment-" not in url) and ("mailto:" not in url):
            log.info(f"Queuing {url} to visit.")
            yield scrapy.Request(url, callback=self.parse)
def parse_wall(self, response: HtmlResponse):
    """Parse walls.

    ... -> Region Wattendorf -> Falkenwand
    Example: https://www.frankenjura.com/klettern/poi/21
    """
    item = SectorItem()
    item["name"] = response.meta["wall_name"]
    item["fk_sector"] = response.meta["parent"]
    item["source"] = response.url
    item["internal_rating"] = _parse_stars(response)
    item["max_height_in_m"] = _parse_wall_max_height(response)
    item["rain_protected"] = _parse_rain_protected(response)
    item["child_friendly"] = _parse_child_friendly(response)
    item["description"] = _parse_wall_description(response)
    item["approach"] = _parse_wall_approach(response)
    item["approach_road"] = _parse_wall_approach_road(response)
    item["fk_orientation"] = _parse_orientation(response)
    item["latitude"], item["longitude"] = _parse_lat_lon(response)
    yield item

    wall = item.django_model.objects.get(name=item["name"],
                                         fk_sector=item["fk_sector"])
    routes = response.css('div[class="poi-link-container"]').css("a")
    for route in routes:
        meta = {"route_name": route.css("::text").get(), "parent": wall}
        yield response.follow(route, self.parse_route, meta=meta)
def parse(self, response: HtmlResponse):
    print(type(response), '+++++++++++++++++++++++++')
    print(response.encoding)
    print(response.status)
    with open('books.html', 'w', encoding='utf8') as f:
        f.write(response.text)
    subjects = response.xpath('//li[@class="subject-item"]')
    for subject in subjects:
        item = DoubanbookItem()
        title = subject.xpath('.//h2/a/text()').extract_first()
        item['title'] = title.strip()
        rate = subject.xpath(
            './/span[@class="rating_nums"]/text()').extract_first()
        item['rate'] = rate
        publish = subject.xpath(
            './/div[@class="pub"]/text()').extract_first()
        item['publish'] = publish.strip()
        yield item
    for i in range(2):
        next_pag = response.xpath(
            '//div[@class="paginator"]/a/@href').extract_first()
        url = response.urljoin(next_pag)
        yield scrapy.Request(url=url, callback=self.parse)
def parse_category(self, response: HtmlResponse) -> HtmlResponse:
    """List a category and traverse its product pages."""
    products_query = response.css(
        "section#bc-sf-filter-products > div.product-grid-item")
    if not products_query:
        raise IgnoreRequest('Product items not found')
    self.logger.info(f'parse product_categories len: {len(products_query)}')
    for pdp in products_query.css('div.product-grid-item'):
        item_loader = ProductLoader(item=UrgeItem(), selector=pdp)
        item_loader.add_css('product_name',
                            'div.product-text > p.title::text')
        item_loader.add_css('product_brand',
                            'div.product-text > h2.vendor.h5::text')
        # get the regular product price through an OR (,) of selectors
        item_loader.add_css(
            'product_price',
            'div.product-text p.price s::text , span[itemprop="price"]::text')
        item_loader.add_css(
            'product_sale_price',
            'div.product-text p.sale span[itemprop="price"]::text')
        if 'href' in pdp.css('a').attrib:
            product_url = pdp.css('a').attrib['href']
            yield response.follow(product_url,
                                  callback=self.product_page,
                                  meta={'item': item_loader.load_item()})
def parse_video_page(self, response: HtmlResponse):
    self.logger.warn('start parsing real video page: {0}'.format(response.url))
    title = response.css('#viewvideo-title::text').extract_first().strip()
    author = response.css('a[href*="uprofile.php"]').css(
        'span::text').extract_first().strip()
    # some videos share the same title and author; only the viewkey in the
    # url distinguishes them
    view_key = response.url.split('viewkey=')[1].split('&')[0]
    # a '/' in the title would be treated as a directory separator when the
    # file is saved, so strip it out
    if '/' in title:
        title = title.replace('/', '')
    encrypted_url = response.css('video').extract_first().split(
        'strencode("')[1].split('"))')[0]
    first_encrypted = encrypted_url.split('"')[0]
    second_encrypted = encrypted_url.split('"')[2]
    video_link = ParseRealUrl.get_url(first_encrypted, second_encrypted)
    if video_link:
        # normalise urls like http://185.38.13.130//mp43/2998... that contain
        # a doubled slash
        video_link_list = video_link.split('//')
        real_video_link = (video_link_list[0] + '//' + video_link_list[1] +
                           '/' + video_link_list[2])
        self.logger.warn('got download link, pushing to download queue')
        down_file_name = title + '-' + author + '-' + view_key
        yield DownloadVideoItem(file_urls=real_video_link,
                                file_name=down_file_name)
        self.logger.warn('queued for download, updating database')
        yield UpdateMovieLinkItem(movie_page_url=response.url,
                                  movie_real_url=real_video_link)
    else:
        self.logger.warn('failed to get video download url: {0}'.format(
            response.url))
def ajax_model_page(self, response: HtmlResponse):
    model_info_list = response.css('li.pcVideoListItem')
    for item in model_info_list:  # type: SelectorList
        video_url = item.css('span.title').css('a::attr(href)').get()
        yield scrapy.Request(response.urljoin(video_url),
                             callback=self.video_page,
                             priority=100)
def process_request(self, request, spider):
    url = request.url
    print("1. process_request(): " +
          datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') +
          " -> " + url)
    self.driver.get(url)
    source = self.driver.page_source
    if str('currentPage') not in url:
        print("3. if finish process_request(): " +
              datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') +
              " -> " + url)
        # if str('REPORT_NDOC_006051') in url or str('REPORT_NDOC_006010') in url:
        #     print(">>> debug: " + url)
        #     print(source)
        response = HtmlResponse(url=url, body=source, request=request,
                                encoding="utf-8")
        return response
    else:
        next_page = self.driver.find_element_by_xpath(
            "//*[@id='4864']/table/tbody/tr/td/table/tbody/tr/td[8]/a")
        url = str(next_page.find_element_by_xpath("./a").get_attribute('href'))
        print("3. else finish process_request(): " +
              datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') +
              " -> " + url)
        response = HtmlResponse(url=url, body=source, request=request,
                                encoding="utf-8")
        return response
def channel_page(self, response: HtmlResponse):
    video_css = response.css('span.title')
    for item in video_css:
        video_sub_link = item.css('a::attr(href)').extract_first()
        video_url = response.urljoin(video_sub_link)
        self.logger.warning(
            'send to parse real video, url is:{0}'.format(video_url))
        yield scrapy.Request(video_url, callback=self.video_page)
def _create_product_dictionary(
    self,
    response: HtmlResponse,
    data: Optional[Dict] = None,
) -> product.Product:
    try:
        upc = (universal_product_code.UniversalProductCode(
            upc=data.get('ProductId').replace('_', ''))).value
    except Exception:
        # TODO: Log issue and return nothing.
        return None

    title1 = response.css(
        'meta[property="og:title"]::attr(content)').extract()[0].split('|')[0]
    title2 = response.css('title::text').get()
    name = title1 or title2
    if not name:
        pass  # TODO: Log error and return none.
    elif name == 'Grocery Product' or name == 'Produit épicerie en ligne':
        pass  # TODO: Log error and return none.

    brand = data.get('BrandName')
    if not name:
        pass  # TODO: Log error and return none.

    item_loader = product_item_loader.ProductItemLoader(
        response=response
    ).add_name(
        response=response,
        name=name,  # TODO: What about if it's none.
        language=self.language,
    ).add_brand(
        response=response,
        brand=brand,  # TODO: What about if it's none.
        language=self.language,
    ).add_upc(
        response=response, upc=upc
    ).add_product_data_dictionary(
        product_data_dictionary=self._create_product_data_dictionary(
            response=response,
            data=data,
            name=name,
            brand=brand,
            upc=upc,
        ),
    ).add_offer_dictionary(
        offer_dictionary=self._create_offer_dictionary(
            response=response,
            data=data,
        ),
    ).add_store_dictionary(
        store_dictionary=self._create_store_dictionary(
            response=response,
        ),
    ).add_supported_language(language=self.language)

    return item_loader.load_item()
def parse_chapter(response: HtmlResponse):
    title = response.xpath(
        "//div[@class='bookname']/h1//text()").getall()[0].split()
    text = response.xpath("//div[@id='content']//text()").getall()
    chapter_index = re.findall("\\d+", title[0])[0]
    chapter_title = title[1]
    chapter_content = "".join([x.strip() for x in text]).strip()
    yield BookItem(index=chapter_index,
                   title=chapter_title,
                   content=chapter_content)
def get_image_url(cls, response: HtmlResponse) -> Union[str, None]:
    """Extract an image url from the html response."""
    image_p = response.css("p > img")
    image_figure = response.css("figure > img")
    image_selectors = image_p if image_p else image_figure
    images_re = image_selectors.re(r'src="(http.*?)\"')
    images = [img for img in images_re if img.split(".")[-1] != "svg"]
    sorted_by_length = sorted(images, key=len, reverse=True)
    return sorted_by_length[0] if sorted_by_length else None
def parse_list_of_tasks(self, response: HtmlResponse, max_number=0,
                        next_number=0, step=5, subsection: str = ''):
    task_urls = response.css(
        '.problemsmallnumber .componentboxlink::attr(href)').extract()
    for task_url in task_urls:
        callback = partial(self.parse_task, subsection=subsection)
        yield response.follow(response.urljoin(task_url), callback=callback)

    if next_number < max_number:
        url = set_get_parameter(response.url, 'start', next_number)
        callback = partial(self.parse_list_of_tasks,
                           max_number=max_number,
                           next_number=next_number + step,
                           subsection=subsection)
        yield response.follow(url, callback=callback)
def video_parse(self, response: HtmlResponse, category):
    title = response.css('h2.title.big::text').get()
    for item in response.css('ul.video-downloads-buttons').css('li'):
        if '1080p' in item.css('a::text').get().strip():
            link = item.css('a::attr(href)').get()
            req_cookie = response.request.headers.get('Cookie').decode()
            resp_cookie = response.headers.get(
                'Set-Cookie').decode().split(';')[0]
            yield ArtPornItem(name=title,
                              link=link,
                              category=category,
                              cookie='{0};{1}'.format(req_cookie, resp_cookie))
def parse_next_link(self, response: HtmlResponse) -> str:
    next_page_tag = response.css('a[href*="?category=long&viewtype=basic"]')
    next_link = None
    for item in next_page_tag:
        if '»' == item.css('a::text').extract_first():
            ori_link = item.css('a::attr(href)').extract_first()
            next_link = response.urljoin(ori_link)
    return next_link
def video_parse(self, response: HtmlResponse, category):
    link = response.urljoin(response.css(
        "a.full_download_link[onclick*='mp43000']::attr(href)").get())
    title = ''
    for i in response.css('div.title_bar::text').getall():
        i = i.strip()
        if i:
            title = i
            break
    if link != 'http://www.hotwiferio.com/members/':
        yield HotItem(name=title, link=link, category=category)
def get_contents_list(self, response: HtmlResponse):
    meta = response.meta
    contents_list = response.json().get('list')
    with open('Khala/spider_params/lenovo/language.txt', 'r+') as languages:
        for language in languages:
            language = language.replace('\n', '')
            for contents in contents_list:
                url = f'https://pcsupport.lenovo.com/us/{language}/products/{meta["model"]}/solutions/{contents["docid"]}'
                yield response.follow(url=url, callback=self.out_item)
def get_response(url, headers=None, cookies=None, delay=30,
                 response_type="html"):
    num_retries = 0
    response = None
    if cookies is None:
        cookies = {}
    while num_retries < MAX_NUM_RETRY:
        try:
            response = None
            if headers is not None:
                response = requests.get(url, headers=headers, timeout=delay,
                                        verify=False, cookies=cookies)
            else:
                response = requests.get(url, timeout=delay, verify=False,
                                        cookies=cookies)
            num_retries += 1
            if response.status_code >= 200:
                if response_type == "html":
                    ret_obj = HtmlResponse(url, status=response.status_code,
                                           body=response.content,
                                           encoding='utf-8')
                    return ret_obj
                elif response_type == "xml":
                    ret_obj = XmlResponse(url, status=response.status_code,
                                          body=response.content,
                                          encoding='utf-8')
                    return ret_obj
                else:
                    raise Exception("Invalid response type")
        except Exception as e:
            logging.error("Exception %s" % e)
            num_retries += 1
    logging.error("Could not fetch the url")
    if response_type == "html":
        err_obj = HtmlResponse(url, status=110,
                               body="<html><body>Failure</body></html>",
                               encoding='utf-8')
    else:
        err_obj = XmlResponse(url, status=110,
                              body="<html><body>Failure</body></html>",
                              encoding='utf-8')
    return err_obj
def parse_country_links(self, response: HtmlResponse) -> Request:
    table = response.css(self.config.countries_table)
    all_link_tags = table.css('a')
    country_links = [link.attrib['href'] for link in all_link_tags]
    for country_link in country_links:
        full_country_link = response.urljoin(country_link)
        current_country = country_link.split('/')[1]
        yield scrapy.Request(full_country_link,
                             callback=self.parse_country,
                             cb_kwargs={"country": current_country})
def test_attrib():
    with open('test.html', 'rb') as f:
        test_response = HtmlResponse("http://example.com", body=f.read())
    print(test_response.xpath('/html/body/div[@id="001"]/p/text()').extract())
    print(test_response.xpath('/html/body/div[@id="A001"]/p/text()').extract())
def test_basic():
    with open('test.html', 'rb') as f:
        test_response = HtmlResponse('http://www.example.com', body=f.read())
    print(test_response.xpath('/html/body/div[1]/p/text()').extract())
    print(test_response.xpath('/html/body1/div[1]/p/text()').extract())
def get_response(filepath, encoding='utf-8'):
    with open(filepath, 'rb') as f:
        body = f.read()
    # Response.body is read-only, so pass the body to the constructor instead
    # of assigning it after construction.
    response = HtmlResponse('test', body=body, encoding=encoding)
    return response