def parse(self, response: Response):
    """Yield the page's long text snippets and absolute image URLs, then
    follow up to 15 links from the start page.

    :param response: downloaded page supplied by Scrapy
    """
    pictures = response.xpath("//img/@src[starts-with(., 'http')]")
    strings = response.xpath(
        "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
    )
    yield {
        'url': response.url,
        'payload': [{
            'type': 'text',
            'data': text.get().strip()
        } for text in strings] + [{
            'type': 'image',
            'data': image.get()
        } for image in pictures]
    }
    # Only fan out from the start page to keep the crawl bounded.
    if response.url == self.start_urls[0]:
        refs = response.xpath("//a/@href")
        # BUG FIX: the original comprehension called refs.get() (on the
        # whole SelectorList), which returns the FIRST href 15 times over;
        # r.get() takes each individual href.
        ref = [r.get() for r in refs][:15]
        for r in ref:
            yield scrapy.Request('http://www.posolstva.org.ua' + r, self.parse)
def parse_movie(self, response: Response):
    """Scrape a Douban-style movie page into a flat item dict.

    NOTE(review): every ``.get()`` below may return ``None`` when the page
    layout differs, which would raise AttributeError/TypeError on the
    chained calls — confirm this callback only runs on valid movie pages.

    :param response: downloaded movie detail page
    :return: dict with title, year, score, people, genre, country,
        runtime, rank and poster URL
    """
    item = {}
    item['entity'] = 'movie'
    # First whitespace-separated token of the <h1> title.
    item['movie'] = response.xpath(
        '//h1/span[@property="v:itemreviewed"]/text()').get().split()[0]
    # Strip the surrounding parentheses, e.g. "(1994)" -> "1994".
    item['year'] = response.xpath(
        '//h1/span[@class="year"]/text()').get()[1:-1]
    item['score'] = response.xpath('//strong/text()').get()
    item['director'] = response.xpath(
        '//a[@rel="v:directedBy"]/text()').getall()
    item['actor'] = response.xpath(
        '//a[@rel="v:starring"]/text()').getall()
    item['genre'] = response.xpath(
        '//span[@property="v:genre"]/text()').getall()
    # The info block is loose text; after removing slashes, its first
    # token is taken as the production country.
    info = ''.join(response.xpath('//div[@id="info"]/text()').getall())
    item['country'] = info.replace('/', '').split()[0]
    # Keep only the digits of the runtime text.
    item['length'] = re.search(
        r'\d+', response.xpath(
            '//span[@property="v:runtime"]/text()').get()).group()
    # Numeric Top-250 rank extracted from e.g. "No.2".
    item['rank'] = re.search(
        r'\d+',
        response.xpath('//span[@class="top250-no"]/text()').get()).group()
    item['img_url'] = response.xpath(
        '//div[@id="mainpic"]//img/@src').get()
    # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
    # item['name'] = response.xpath('//div[@id="name"]').get()
    # item['description'] = response.xpath('//div[@id="description"]').get()
    return item
def parse(self, response: Response):
    """Emit a text/image payload for the page; from the start page only,
    follow up to 19 site-internal links."""
    img_elems = response.xpath("//img/@data-src[starts-with(., 'http')]")
    text_elems = response.xpath(
        "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 20]/text()"
    )
    # Texts first, then images — same order as the crawl contract expects.
    payload = []
    for node in text_elems:
        payload.append({'type': 'text', 'data': node.get().strip()})
    for node in img_elems:
        payload.append({'type': 'image', 'data': node.get()})
    yield {'url': response.url, 'payload': payload}
    if response.url == self.start_urls[0]:
        candidates = response.xpath(
            "//a/@href[starts-with(., 'https://isport.ua/') or starts-with(., '/')]"
        )
        hrefs = [c.get() for c in candidates if c.get() != "/"]
        for href in hrefs[:19]:
            target = "https://isport.ua" + href if href.startswith("/") else href
            yield scrapy.Request(target, self.parse)
def parse(self, response: Response):
    """Convert the gallery page into image items and follow pagination.

    :param response: downloaded gallery page
    :return: yields SecretSpiderItem objects and, when a "next page" link
        exists, a Request for it
    """
    # All images inside the main listing container.
    image_lists = response.xpath('.//div[@id = "list_img"]//img')
    # from scrapy.shell import inspect_response
    # inspect_response(response, self)
    for image in image_lists:
        # ROBUSTNESS: extract_first() returns None instead of raising
        # IndexError when the alt attribute is missing.
        description = image.xpath('.//@alt').extract_first()
        src = image.xpath('.//@src').extract_first()
        item = SecretSpiderItem(image_description=description)
        # 'https' also starts with 'http', so one check covers both schemes.
        if src.startswith('http'):
            item['image_urls'] = [src]
        else:
            item['image_urls'] = [SITE_BASE_URL + src]
        yield item
    # Decide whether to construct the next-page URL by scanning the pager
    # links for the "下一页" (next page) label — the response alone does
    # not indicate whether a following page exists.
    pages = response.xpath('//div[@class="page_num"]//a')
    next_page_url = ''
    for page in pages:
        page_text = page.xpath('./text()').extract_first()
        page_url = page.xpath('./@href').extract_first()
        if page_text == '下一页':
            next_page_url = page_url
    # BUG FIX: the original used "is not ''" — an identity test on a
    # string, which is unreliable and a SyntaxWarning on modern Python;
    # compare by value instead.
    if next_page_url != '':
        yield Request(url=next_page_url, callback=self.parse)
def parse(self, response: Response):
    """Yield all text snippets and (absolutized) image URLs of the page;
    from the start page only, follow up to 20 site links."""
    image_elements = response.xpath("//img/@src")
    text_elements = response.xpath(
        "//*[not(self::script)][not(self::style)][not(self::title)][string-length(normalize-space(text())) > 0]/text()"
    )
    # BUG FIX: the original yielded lazy `map` objects in the item dict —
    # they are single-use iterators and not serializable by Scrapy feed
    # exporters; materialize both fields as lists.
    yield {
        'url': response.url,
        'text_elements': [text.get().strip() for text in text_elements],
        'image_elements': [
            'https://kpi.ua' + image.get()
            if image.get().startswith('/') else image.get()
            for image in image_elements
        ],
    }
    if response.url == self.start_urls[0]:
        link_elems = response.xpath(
            "//a/@href[starts-with(., 'https://kpi.ua/') or starts-with(., '/')]"
        )
        links = [link.get() for link in link_elems if link.get() != "/"]
        for link in links[:20]:
            if link.startswith("/"):
                link = "https://kpi.ua" + link
            yield scrapy.Request(link, self.parse)
def parse(self, response: Response):
    """Yield long text snippets and absolute image URLs; from the start
    page only, follow up to 19 ukr.net article links."""
    all_images = response.xpath("//img/@src[starts-with(., 'http')]")
    all_text = response.xpath(
        "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
    )
    yield {
        'url': response.url,
        'payload': [{
            'type': 'text',
            'data': text.get().strip()
        } for text in all_text] + [{
            'type': 'image',
            'data': image.get()
        } for image in all_images]
    }
    # FIX: dropped the unused variable `n` that merely duplicated this
    # condition.
    if response.url == self.start_urls[0]:
        # Protocol-relative ukr.net hrefs ending in ".html".
        all_links = response.xpath(
            "//a/@href[starts-with(., '//www.ukr.net/')][substring(., string-length() - 4) = '.html']"
        )
        selected_links = [link.get() for link in all_links][:19]
        for link in selected_links:
            link = 'https:' + link
            yield scrapy.Request(link, self.parse)
def parse(self, response: Response):
    """Yield the page's texts, images and hyperlinks; from the start page
    only, follow up to 19 distinct links."""
    images = response.xpath("//img/@src")
    texts = response.xpath(
        "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > "
        "30]/text()")
    hyperlinks = response.xpath("//a/@href")
    yield {
        'url': response.url,
        'payload': [{
            'type': 'text',
            'data': text.get()
        } for text in texts] + [{
            'type': 'image',
            'data': image.get()
        } for image in images] + [{
            'type': 'hyperlink',
            'data': hyperlink.get()
        } for hyperlink in hyperlinks]
    }
    if response.url == self.start_urls[0]:
        links = response.xpath("//a/@href")
        # FIX: list(set(...))[:19] selected 19 *arbitrary* links because
        # set iteration order is unspecified; dict.fromkeys deduplicates
        # while preserving document order, making the crawl deterministic.
        selected_links = list(dict.fromkeys(link.get() for link in links))[:19]
        for link in selected_links:
            yield scrapy.Request('http://basketball365.ru' + link, self.parse)
def parse(self, response: Response):
    """Yield long text snippets and absolute image URLs; from the start
    page only, follow up to 19 kpi.ua links."""
    all_images = response.xpath("//img/@src[starts-with(., 'http')]")
    all_text = response.xpath(
        "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
    )
    yield {
        'url': response.url,
        'payload': [{
            'type': 'text',
            'data': text.get().strip()
        } for text in all_text] + [{
            'type': 'image',
            'data': image.get()
        } for image in all_images]
    }
    if response.url == self.start_urls[0]:
        link_elems = response.xpath(
            "//a/@href[starts-with(., 'https://kpi.ua/') or starts-with(., '/')]"
        )
        links = [
            link.get() for link in link_elems
            if link.get() != "https://kpi.ua/"
        ][:19]
        for href in links:
            # BUG FIX: the XPath also matches already-absolute hrefs, yet
            # the original unconditionally prefixed 'https://kpi.ua/',
            # producing 'https://kpi.ua/https://kpi.ua/...' (and a doubled
            # slash for relative paths). Prefix only relative hrefs.
            if href.startswith('/'):
                href = 'https://kpi.ua' + href
            yield scrapy.Request(href, self.parse)
def parse(self, response: Response):
    """Yield long text snippets plus photo URLs recovered from inline
    background-image styles; from the start page, follow up to 20 /rus/
    links."""
    photo_styles = response.xpath(
        "//div[@class='foto']/@style[starts-with(., 'background-image: url(/')]")
    long_texts = response.xpath(
        "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()")
    payload = [{'type': 'text', 'data': t.get().strip()} for t in long_texts]
    for style in photo_styles:
        raw = style.get()
        # Trim the 22-char "background-image: url(" prefix and the two
        # trailing characters to isolate the image path.
        payload.append({
            'type': 'image',
            'data': 'https://stejka.com' + raw[22:-2],
        })
    yield {'url': response.url, 'payload': payload}
    if response.url == self.start_urls[0]:
        rus_links = response.xpath("//a/@href[starts-with(., '/rus/')]")
        for href in [l.get() for l in rus_links][:20]:
            yield scrapy.Request('https://stejka.com' + href, self.parse)
def parse(self, response: Response):
    """Yield long text snippets and lazy-loaded image URLs; from the
    start page, follow up to 20 relative links."""
    lazy_images = response.xpath("//img/@data-src[starts-with(., 'http')]")
    long_texts = response.xpath(
        "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > "
        "30]/text()")
    payload = [{'type': 'text', 'data': t.get().strip()} for t in long_texts]
    payload += [{'type': 'image', 'data': img.get()} for img in lazy_images]
    yield {'url': response.url, 'payload': payload}
    if response.url == self.start_urls[0]:
        relative_links = response.xpath("//a/@href[starts-with(., '/')]")
        for href in [l.get() for l in relative_links][:20]:
            yield scrapy.Request('https://isport.ua' + href, self.parse)
def extract_market(response: Response) -> dict:
    """Collect heading/region text pairs from the referral-overview block.

    :param response: downloaded page
    :return: dict mapping each heading's joined button text to the joined
        paragraph text of its region
    """
    result: dict = {}
    rows = response.xpath(
        "//div[contains(@class, 'group-ema-referral-overview')]/dl/dl")
    for row in rows:
        heading_parts = row.xpath("dt[@role='heading']/button/text()").getall()
        body_parts = row.xpath("dd[@role='region']/div/p/text()").getall()
        result['\n'.join(heading_parts)] = '\n'.join(body_parts)
    return result
def parse(self, response: Response):
    """Yield every https image URL on the page; from the start page only,
    follow up to 19 uahotels.info links."""
    https_images = response.xpath("//img/@src[starts-with(., 'https')]")
    payload = []
    for img in https_images:
        payload.append({'type': 'image', 'data': img.get()})
    yield {'url': response.url, 'payload': payload}
    if response.url == self.start_urls[0]:
        site_links = response.xpath(
            "//a/@href[starts-with(., 'https://uahotels.info/')]")
        for href in [l.get() for l in site_links][:19]:
            yield scrapy.Request(href, self.parse)
def parse(self, response: Response) -> Iterator[Union[Request, Dict]]:
    """Handle an HTTP response from the site.

    Parses the insider-trades data table and schedules requests for the
    following result pages (capped at MAX_PAGE).

    :param response: response object supplied by Scrapy
    :return: iterator over parsed-row dicts and over requests for
        subsequent pages
    """
    symbol = response.meta['symbol']
    # Only follow pagination links belonging to this symbol's listing.
    link_extractor = LinkExtractor(
        allow=
        rf'https://www\.nasdaq\.com/symbol/{symbol.lower()}/insider-trades\?page=\d+'
    )
    link: Link
    for link in link_extractor.extract_links(response):
        match_page_number: Optional[Match] = re.search(
            r'page=(\d+)', link.url)
        if match_page_number is not None:
            page_number: int = int(match_page_number.group(1))
            # Cap the crawl depth per symbol.
            if page_number <= MAX_PAGE:
                yield Request(link.url, meta={'symbol': symbol})
    for row in response.xpath(
        '//div[@id="content_main"]//div[@class="genTable"]/table[@class="certain-width"]/tr'
    ):
        raw_row = RawRow.from_selector(row, symbol)
        try:
            yield ParsedRow.from_raw_row(raw_row).as_dict()
        except ValueError:
            # Log and keep going: one malformed table row must not abort
            # the whole crawl.
            logging.exception(
                'Ошибка при парсинге строки таблицы с инсайдерскими сделками.'
            )
def _load_model(response: Response) -> Dict:
    """Locate the inline <script> that assigns ``window.jsonModel`` and
    return the parsed JSON object it carries."""
    prefix = "window.jsonModel = "
    script = response.xpath(
        "/html/body/script[text()[contains(.,'window.jsonModel = ')]]/text()"
    ).extract_first()
    # Everything after the JS assignment prefix is a JSON document.
    return json.loads(script[len(prefix):])
def parse(self, response: Response):
    """Yield description/price/image for the first 20 catalogue cells."""
    cells = response.xpath("//div[contains(@class, 'cell item')]")
    for cell in cells[:20]:
        name = cell.xpath("./h3/a[@class='b1c-name-uk']/text()").get()
        # The price text ends with ' грн.'; substring-before drops it.
        amount = cell.xpath(
            "substring-before(./p[contains(@class, 'b1c-withoutprice')]/text(),' грн.')"
        ).get()
        picture = cell.xpath(
            "./div/a/img[@id='product']/@src[starts-with(., 'https')]").get()
        yield {'description': name, 'price': amount, 'img': picture}
def parse(self, response: Response):
    """Yield description/price/img for up to the first 20 table rows.

    FIX: the original iterated range(20) and indexed into document-wide
    extract() lists, which raises IndexError whenever the page holds
    fewer than 20 entries (and its `//` XPaths ignored the per-row slice
    anyway). Zipping the extracted lists bounds the loop by the data
    actually present.
    """
    names = response.xpath("//a[contains(@class, 'pnameh')]/text()").extract()
    prices = response.xpath("//div[contains(@class, 'pprice')]/text()").extract()
    imgs = response.xpath("//img[contains(@class, 'thumbnail')]/@src").extract()
    for name, price, img in list(zip(names, prices, imgs))[:20]:
        yield {
            'description': name,
            'price': price,
            'img': 'https://odissey.kiev.ua/' + img,
        }
def _get_floorplan_images(self, response: Response) -> List[str]:
    """Return the floorplan image URL (as a one-element list) parsed out
    of the gallery asset's inline style, or [] when absent."""
    style_attr = response.xpath(
        "//div[@id = 'floorplan-1']//div[contains(@class, 'ui-modal-gallery__asset')]/@style"
    ).extract_first()
    if not style_attr:
        return []
    # The URL sits inside url('...') within the style attribute.
    found = re.match(r".*url\('(.*)'\).*", style_attr)
    return [found.group(1)] if found else []
def parse(self, response: Response):
    """Yield name/price/image data attributes for up to 20 product tiles."""
    tiles = response.xpath(
        "//section[contains(@class, 'product-tile_product')]")
    for tile in tiles[:20]:
        yield {
            'description': tile.xpath("./@data-name").get(),
            'price': tile.xpath("./@data-price").get(),
            'img': tile.xpath("./@data-img").get(),
        }
def parse(self, response: Response, **kwargs): articles = response.xpath('//div[@class="pad5 english_article persian_article small_font"]') for article in articles: download_url = article.css('.article_links').xpath('(.//a)[3]/@href').get() download_url = response.urljoin(download_url) info_url = article.css('.article_links').xpath('(.//a)[4]/@href').get() info_url = response.urljoin(info_url) yield Request(info_url, cb_kwargs={'download_url': download_url}, callback=self.parse_info)
def _load_property_page_model(response: Response) -> Dict:
    """Parse and return the JSON object assigned to ``window.PAGE_MODEL``
    in the page's inline script."""
    marker = "window.PAGE_MODEL = "
    raw = response.xpath(
        "/html/body/script[text()[contains(.,'window.PAGE_MODEL = ')]]/text()"
    ).extract_first().strip()
    # Drop the JS assignment marker; the rest is a JSON document.
    return json.loads(raw[len(marker):])
def parse(self, response: Response):
    """Yield title/price/image for up to 19 product blocks."""
    blocks = response.xpath("//div[contains(@class, 'product-block')]")
    for block in blocks[:19]:
        yield {
            'description':
            block.xpath(".//img[@class='img-responsive']/@title").get(),
            'price':
            block.xpath(".//span[@class='special-price']/text()").get(),
            'img':
            block.xpath(".//img[@class='img-responsive']/@src").get(),
        }
def parse(self, response: Response):
    """Yield title/price/image for up to 20 grid-list items."""
    grid_items = response.xpath(
        "//div[contains(@class, 'ypi-grid-list__item_body')]")
    for entry in grid_items[:20]:
        yield {
            'description':
            entry.xpath(".//a[@class='product-title']/@title").get(),
            'price':
            entry.xpath(".//span[@class='ty-price-num']/text()").get(),
            'img':
            entry.xpath(".//img[@class='ty-pict cm-image']/@src").get(),
        }
def parse(self, response: Response):
    """Yield title/price/image for up to 20 product-list entries."""
    entries = response.xpath("//ul[@id=\"product_list\"]/li")
    for entry in entries[:20]:
        title = entry.xpath(".//a[@class='b1c-name-uk']/@title").get()
        cost = entry.xpath(".//div[@class='content_price']/span/text()").get()
        picture = entry.xpath(".//img[@class='b1c-img']/@src").get()
        yield {'description': title, 'price': cost, 'img': picture}
def parse_main_page(self, response: Response):
    """Schedule detail requests for every book on the main page and for
    the first two genres.

    FIX: the original derived `genres` via list(set(...)), whose order is
    arbitrary, so `genre_urls[k]` and `genres[k]` referred to *different*
    genres; it also assumed at least two genres exist (IndexError
    otherwise). Dedupe preserving document order and bound the loop by
    what is actually available.
    """
    book_urls = response.xpath(BOOK_URL)
    genre_url_nodes = response.xpath(GENRE_URL)
    for url in book_urls:
        short_name = get_book_name_from_url(url.get())
        yield Request(
            url=BASE_URL.format(short_name),
            callback=self.parse_book_info,
            cb_kwargs=dict(short_name=short_name)
        )
    # Deduplicate while keeping document order so each URL stays aligned
    # with the genre slug derived from it.
    genre_urls = list(dict.fromkeys(x.get() for x in genre_url_nodes))
    for genre_url in genre_urls[:2]:
        genre = genre_url.replace('/', '')
        yield Request(
            url=response.urljoin(genre_url),
            callback=self.parse_books_in_page
        )
        yield Request(
            url=GENRE_LIST_URL.format(genre),
            callback=self.parse_genre_list,
            cb_kwargs=dict(genre=genre)
        )
def parse(self, response: Response):
    """Yield title/price/image for up to 20 product-item entries."""
    product_items = response.xpath("//li[contains(@class, 'product-item')]")
    for entry in product_items[:20]:
        title = entry.xpath(
            ".//div[@class='item-info']/p[@class='h4']/a/text()").get()
        cost = entry.xpath(".//span[@class='value']/text()").get()
        picture = entry.xpath(".//img[@class='img-product']/@src").get()
        yield {'description': title, 'price': cost, 'img': picture}
def parse(self, response: Response):
    """Yield price/description/image for up to 20 furniture blocks."""
    blocks = response.xpath("//div[contains(@class, 'product-block')]")
    for block in blocks[:20]:
        cost = block.xpath(
            "./div[@class='product-meta']//span[@class='special-price']/text()"
        ).get()
        title = block.xpath(".//a[@class='img']/@title").get()
        picture = block.xpath(".//img/@src").get()
        yield {'price': cost, 'description': title, 'image': picture}
def parse(self, response: Response):
    """Yield title/price/image for up to 20 catalogue portlets."""
    portlets = response.xpath("//div[contains(@class, 'port-i')]")
    for entry in portlets[:20]:
        title = entry.xpath(
            ".//img[@class='UI-CATALOG-PRODUCT-IMAGE']/@title").get()
        # Price is carried in the @content attribute of the price span.
        cost = entry.xpath(
            ".//span[@class='price-value UAH']/@content").get()
        picture = entry.xpath(
            ".//img[@class='UI-CATALOG-PRODUCT-IMAGE']/@src").get()
        yield {'description': title, 'price': cost, 'img': picture}
def parse(self, response: Response):
    """Yield background-image styles and long text snippets; from the
    start page only, follow up to 19 '/rus' links.

    FIX: the original passed relative '/rus...' hrefs straight to
    scrapy.Request, which rejects non-absolute URLs; resolve them against
    the current response first. The stray debug print was removed.
    """
    all_images = response.xpath(
        "//div/@style[starts-with(., 'background-image')]")
    all_text = response.xpath(
        "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
    )
    yield {
        'url': response.url,
        'payload': [{
            'type': 'text',
            'data': text.get().strip()
        } for text in all_text] + [{
            'type': 'image',
            'data': image.get()
        } for image in all_images]
    }
    if response.url == self.start_urls[0]:
        all_links = response.xpath("//a/@href[starts-with(., '/rus')]")
        selected_links = [link.get() for link in all_links][:19]
        for link in selected_links:
            yield scrapy.Request(response.urljoin(link), self.parse)
def parse(self, response: Response):
    """Yield name/price/img for up to the first 20 listed items.

    FIX: the original looped range(20) and indexed into document-wide
    extract() lists, raising IndexError when the page holds fewer than
    20 entries (its `//` XPaths also ignored the per-item slice). Zipping
    the extracted lists bounds the iteration by the data present.
    """
    names = response.xpath(
        "//div[contains(@class, 'item_nazvanie')]/a/text()").extract()
    prices = response.xpath(
        "//div[contains(@class, 'price fl')]/text()").extract()
    imgs = response.xpath("//img/@src").extract()
    for name, price, img in list(zip(names, prices, imgs))[:20]:
        yield {'description': name, 'price': price, 'img': img}
def parse(self, response: Response):
    """Yield title/price/image for up to 20 product containers."""
    containers = response.xpath(
        "//div[contains(@class, 'product-container')]")
    for container in containers[:20]:
        yield {
            'description': container.xpath(
                ".//img[@class='replace-2x img-responsive']/@title").get(),
            'price': container.xpath(
                ".//span[@class='price product-price']/text()").get(),
            'img': container.xpath(
                ".//img[@class='replace-2x img-responsive']/@src").get(),
        }