def parse_sv_links(self, response: Response) -> FoundLink:
    """
    Yields FoundLink items for links that point at vufind or sfx resources and
    are not caught by an error or exception, so the broken_link_detector
    pipeline can decide whether they are broken.

    :param response: A response produced by a Rule
    :return: A FoundLink item to be passed to the pipeline
    """
    title = response.css('title::text').get()
    if self.css:
        links = response.css(self.css)
    else:
        links = response
    links = links.xpath('./descendant::*[@href]')
    for link in links:
        if 'vufind' in link.attrib['href'] or 'sfx' in link.attrib['href']:
            link_obj = FoundLink()
            link_obj['a_origin'] = response.url
            link_obj['b_title'] = title
            link_obj['c_url'] = assemble_absolute_link(
                response.url, link.attrib['href'])
            link_obj['d_text'] = link.xpath('./text()').get()
            yield link_obj
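# assemble_absolute_link is not defined in this module. A minimal sketch of
# what it is assumed to do (resolve a possibly relative href against the page
# it was found on), using only the standard library -- an illustration, not
# the project's actual helper:
from urllib.parse import urljoin

def assemble_absolute_link(origin_url: str, href: str) -> str:
    # Relative hrefs ("/path", "page.html") are resolved against the origin
    # page; absolute hrefs are returned unchanged.
    return urljoin(origin_url, href)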
def parse_forum_page(self, response: Response, forum_url: str = None) -> None:
    """
    Forum page callback. Parses TopicItem. Follows the next forum page and threads.

    :param forum_url: forum url, from the first page. Extracted from response
        meta if not provided.
    :param response: scrapy crawl response
    """
    if forum_url is None:
        forum_url = response.meta['forum_url']
    threads = response.css(
        'div.topic_read,div.topic_read_hot,div.topic_read_locked,div.topic_moved,div.sticky_read,'
        'div.sticky_read_locked,div.announce_read,div.announce_read_locked'
    )
    too_old_thread_found = False
    for thread_container in threads:
        thread = thread_container.css('a.topictitle')
        topic_loader = ItemLoader(item=TopicItem(), response=response)
        thread_href_selector = thread.css('a::attr(href)')
        thread_link = response.urljoin(thread_href_selector.get())
        topic_loader.add_value('id', thread_href_selector.re(r'-(t[0-9]*).html'))
        topic_loader.add_value('thread_link', thread_link)
        topic_loader.add_value('forum_link', forum_url)
        topic_loader.add_value('name', thread.css('a::text').get())
        yield topic_loader.load_item()
        if not self.full_crawl:
            last_post_date_candidates = thread_container.css(
                'span.post-date::text').getall()
            last_post_date = max(map(parse_date, last_post_date_candidates))
            if last_post_date < self.start_date:
                too_old_thread_found = True
                continue
        yield scrapy.Request(thread_link + "?sd=d", callback=self.parse_thread)
    next_page = response.css('a[rel=next]::attr(href)').get()
    if next_page and not too_old_thread_found:
        next_request = response.urljoin(next_page)
        yield scrapy.Request(next_request, callback=self.parse_forum_page,
                             meta={'forum_url': forum_url})
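# parse_date is imported from elsewhere in the project. A minimal sketch of
# its assumed behaviour (turn the forum's post-date string into a comparable
# datetime), implemented here with python-dateutil purely as an illustration;
# the real helper may use an explicit strptime format instead:
from datetime import datetime
from dateutil import parser as date_parser

def parse_date(date_string: str) -> datetime:
    # dayfirst=True matches the European "DD Mon YYYY, HH:MM" style used by
    # many phpBB boards (assumption).
    return date_parser.parse(date_string, dayfirst=True)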
async def parse_book(self, response: Response) -> dict:
    url_sha256 = hashlib.sha256(response.url.encode("utf-8")).hexdigest()
    page = response.meta["playwright_page"]
    await page.screenshot(
        path=Path(__file__).parent / "books" / f"{url_sha256}.png",
        full_page=True,
    )
    await page.close()
    return {
        "url": response.url,
        "title": response.css("h1::text").get(),
        "price": response.css("p.price_color::text").get(),
        "breadcrumbs": response.css(".breadcrumb a::text").getall(),
        "image": f"books/{url_sha256}.png",
    }
def parse_word(self, response: Response) -> dict:
    """
    Parses the word page and extracts the type (f, m, adj, v or v*), the url
    and the message to send.

    :param response: scrapy.http.response.Response
    :return: dict
    """
    # extract type, one of: (f, m, adj, v or v*)
    l_items = response.css(
        "tr>td[colspan='2'][valign='TOP'][width='650']>font>i::text"
    ).extract()
    l_items = [item.strip() for item in l_items]
    type_possibilities = ["m", "f", "adj", "adv", "v", "v*", "pl", "símb"]
    l_type = [item for item in l_items if item in type_possibilities]
    # there should be at least 1 type; if not, raise because this is a case we do not control
    try:
        s_type = l_type[0]
    except IndexError:
        str_err = "Something wrong with these l_items: '{}' in url: '{}'".format(
            l_items, response.url)
        logger.error(str_err)
        raise IndexError(str_err)
    # if the type is plural, then add an "s" to the type
    if len(l_type) > 1:
        if "pl" == l_type[1]:
            s_type += "s"
    # get the word from the title
    word = response.css("span[class='enc']::text").extract()[0].strip()
    data = {
        'word': word,  # it's only 1 element
        'type': s_type,
        'url': response.url,
        'used': False,
        'next_dict_id': self.start_id
    }
    # create the message to send to twitter depending on the type of the word
    data["msg"] = return_twitter_msg(data)
    logger.debug(data)
    yield data
def _parse_sections(self, response: Response, folder_root: pathlib.Path):
    section_name = response.css(".ds-section-headline::text").get()
    meta = {'folder_root': folder_root}
    yield from response.follow_all(
        css=".layout-weekly-edition-section .teaser a.headline-link",
        callback=self._parse_article,
        meta=meta)
def parse(self, response: Response, **kwargs):
    for product_container in response.css('div.product'):
        self.data_read_callback({
            'name': product_container.css(
                'div.productTitleContent a ::text').get().strip(),
            'price': self._get_price_from_string(
                product_container.css(
                    'span.product_price_text ::text').get().strip()),
            'link': response.request.url + product_container.css(
                'div.productTitleContent a ::attr(href)').get().strip()
        })
    next_page = response.css('div.pagination a.next ::attr(href)').get()
    if next_page:
        yield response.follow(next_page, self.parse)
def parse(self, response: Response, current_page: Optional[int] = None) -> Generator:
    page_count = response.css(".pager .current::text").re_first(r"Page \d+ of (\d+)")
    page_count = int(page_count)
    for page in range(2, page_count + 1):
        yield response.follow(f"/catalogue/page-{page}.html",
                              cb_kwargs={"current_page": page})
    current_page = current_page or 1
    for book in response.css("article.product_pod a"):
        yield response.follow(
            book,
            callback=self.parse_book,
            meta={
                "playwright": True,
                "playwright_include_page": True,
                "playwright_context": f"page-{current_page}",
            },
        )
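# The parse / parse_book pair above relies on scrapy-playwright: the
# "playwright" and "playwright_include_page" meta keys only take effect when
# the Playwright download handler is installed. A minimal sketch of the
# settings that enable it, following the scrapy-playwright documentation --
# the project's real settings may differ:
custom_settings = {
    "DOWNLOAD_HANDLERS": {
        "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    },
    # scrapy-playwright requires the asyncio-based Twisted reactor.
    "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
}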
def parse(self, response: Response, **kwargs):
    for product_container in response.css('div.product-container'):
        self.data_read_callback({
            'name': product_container.css('a.product-name ::text').get().strip(),
            'price': self._get_price_from_string(
                product_container.css(
                    'span.product-price ::text').get().strip()),
            'link': product_container.css(
                'a.product-name ::attr(href)').get().strip()
        })
    next_page = response.css('li.pagination_next a::attr(href)').get()
    if next_page:
        request_url_split = urlsplit(response.request.url)
        yield response.follow(
            f"{request_url_split.scheme}://{request_url_split.netloc}{next_page}",
            self.parse)
def parse_word(self, response: Response) -> dict:
    # extract the definition text from the entry table
    l_items = response.css(
        "tr>td[colspan='2'][valign='TOP'][align='left'][width='650']>font::text"
    ).extract()
    l_items = [item.strip() for item in l_items]
    l_items = [item for item in l_items if item != ""]
    first_def = l_items[0]
    # get the word from the title
    word = response.css("span[class='enc']::text").extract()[0].strip()
    data = {
        'word': word,  # it's only 1 element
        'definition': first_def,
        'url': response.url
    }
    yield data
def parse(self, response: Response) -> dict:
    """
    Parses the search page from the dictionary, saves the id of the next
    search page and follows every word URL found.

    :param response: response from the scrapy request
    :type response: scrapy.http.response.Response
    :return: yields a dictionary with word, type, url and msg for each dictionary word
    """
    # get the <a> elements that link to word pages
    tag_urls = response.css("a[href*='GECART']::attr(href)")
    # get the next page GECART id and save it
    js_next = response.css("a[class='SEGUENTS']::attr(href)")[0].get()
    start_id_next = js_next.split("(")[-1].split(")")[0]
    self.start_id = start_id_next
    for tag in tag_urls:
        url = tag.get()
        yield scrapy.Request(url, callback=self.parse_word)
        time.sleep(1)
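# Note: time.sleep() inside a callback blocks Scrapy's single-threaded reactor,
# so the pause above stalls the whole crawl rather than just these requests.
# The idiomatic way to throttle is the DOWNLOAD_DELAY setting; a sketch of the
# equivalent spider-level configuration (an assumption, not taken from the project):
custom_settings = {
    "DOWNLOAD_DELAY": 1,  # seconds to wait between consecutive requests
}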
def _parse_article(self, response: Response):
    folder_root = response.meta["folder_root"]
    title = response.css(".article__headline::text").get() + ".pdf"
    soup = BeautifulSoup(response.text, 'lxml')
    self._remove_html_node(soup.find("header", class_="ds-masthead"))
    self._remove_html_node(soup.find("div", class_="article__section"))
    self._remove_html_node(soup.find("div", class_="layout-article-links"))
    self._remove_html_node(soup.find("div", class_="newsletter-signup"))
    self._remove_html_node(soup.find("aside", class_="article__aside"))
    self._remove_html_node(
        soup.find("div", class_="layout-related-articles"))
    self._remove_html_node(soup.find("footer"))
    with open(folder_root / title, mode='w+b') as dest:
        pisa_status = pisa.CreatePDF(str(soup), dest=dest)
def parse_thread(self, response: Response) -> None:
    """
    Thread page callback. Parses PostItem. Follows the next thread page.

    :param response: scrapy crawl response
    """
    posts = response.css('div.post.panel-body')
    post_number = 1
    too_old_post_found = False
    for post in posts:
        post_loader = ItemLoader(item=PostItem(), selector=post)
        post_loader.add_value(
            'username',
            post.css('a.username-coloured::text,a.username::text').get())
        post_date_selectors = post.css('div.post-date::text')
        if len(post_date_selectors) < 2:
            # no parseable post date: skip this post
            continue
        post_date_string = post_date_selectors[1].get()[3:-1]
        post_date = parse_date(post_date_string)
        post_loader.add_value('date', str(post_date))
        post_loader.add_value(
            'post_id',
            post.css('div.post-date > a::attr(href)').re(r'.html#(.*)'))
        post_loader.add_value('thread_url', response.request.url)
        post_loader.add_value('post_number', post_number)
        post_number += 1
        post_loader.add_value('content', post.css('div.content').get())
        if not self.full_crawl:
            if post_date < self.start_date:
                too_old_post_found = True
                continue
        yield post_loader.load_item()
    next_page = response.css('a[rel=next]::attr(href)').get()
    if next_page and not too_old_post_found:
        next_request = response.urljoin(next_page)
        yield scrapy.Request(next_request, callback=self.parse_thread)
def parse(self, response: Response, **kwargs):
    for product_container in response.css('div.product'):
        availability = product_container.css(
            "span.p-cat-availability ::text").get().strip()
        # skip products that are not in stock ("Skladem" = "in stock" in Czech)
        if not availability.startswith("Skladem"):
            continue
        self.data_read_callback({
            'name': product_container.css('a.p-name span ::text').get().strip(),
            'price': self._get_price_from_string(
                product_container.css(
                    'span.p-det-main-price ::text').get().strip()),
            'link': response.request.url + product_container.css(
                'a.p-name ::attr(href)').get().strip()[1:]
        })
    next_page = response.css(
        'div.pagination a.s-page.pagination-page ::attr(href)').get()
    if next_page:
        request_url_split = urlsplit(response.request.url)
        yield response.follow(
            f"{request_url_split.scheme}://{request_url_split.netloc}{next_page}",
            self.parse)
def request_associator(self, request: Request, response: Response):
    """
    Persists the URL and title of the page on which a link was found by
    stashing them in the resulting request's meta dictionary, so that the
    information survives past the point where it would normally be lost.

    :param request: The Request object currently passed to the Rule's LinkExtractor
    :param response: The Response object which produced request
    :return: the same request as input
    """
    title = response.css('title::text').get()
    request.meta['origin'] = response.url
    request.meta['origin_title'] = title
    return request
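# request_associator is meant to be plugged into a CrawlSpider Rule via the
# process_request hook, which receives (request, response) and can be given as
# the name of a spider method. A minimal sketch of that wiring -- the rule
# arguments and callback name are assumptions, not taken from the project:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

rules = (
    Rule(
        LinkExtractor(),
        callback="parse_sv_links",
        process_request="request_associator",  # tag each request with its origin page
        follow=True,
    ),
)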
def parse_top_books(self, response: Response):
    for selector in response.css(BOOK_PATH):
        short_name = selector.xpath(SHORT_NAME_PATH)
        last_chapter = selector.xpath(LAST_CHAPTER_PATH)
        is_vip = selector.xpath(IS_VIP_PATH)
        try:
            short_name = short_name.get().split('/')[2]
            last_chapter = int(last_chapter.get().replace(',', ''))
            # an empty is_vip selection means the book is free
            is_free = not is_vip
            yield {
                SHORT_NAME: short_name,
                LAST_CHAPTER: last_chapter,
                IS_FREE: is_free
            }
        except (AttributeError, IndexError, ValueError):
            # malformed entry (missing link, chapter count, etc.): skip it
            continue
def parse_forum(self, response: Response) -> None:
    """
    Forum callback. Parses ForumItem. Follows subforum links and thread links
    (through the self.parse_forum_page() method).

    :param response: scrapy crawl response
    """
    forum_loader = ItemLoader(item=ForumItem(), response=response)
    forum_loader.add_value('link', response.request.url)
    forum_loader.add_css('name', 'h2 > a::text')
    yield forum_loader.load_item()
    subforums = response.css('a.forumtitle::attr(href)').getall()
    for forum in subforums:
        next_request = response.urljoin(forum)
        yield scrapy.Request(next_request, callback=self.parse_forum)
    yield from self.parse_forum_page(response, response.url)
def parse(self, response: Response) -> None:
    """
    Default scrapy callback. To be used on the forum main page. Follows subforum links.

    :param response: scrapy crawl response
    :returns: :class:`hyperreal.crawler.hypercrawler.items.PostItem`,
        :class:`hyperreal.crawler.hypercrawler.items.ForumItem`,
        :class:`hypercrawler.items.TopicItem`
    """
    date = self.settings.get('START_DATE')
    self.full_crawl = date is None
    if not self.full_crawl:
        self.start_date = date
    subforums = response.css('a.forumtitle::attr(href)').getall()
    for forum in subforums:
        next_request = response.urljoin(forum)
        yield scrapy.Request(next_request, callback=self.parse_forum)
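# START_DATE is read from the Scrapy settings and later compared against
# datetime objects returned by parse_date, so it is assumed to be a datetime
# defined in the project's settings module rather than a string passed on the
# command line. A sketch of that assumption (hypothetical settings.py entry):
from datetime import datetime

# Crawl only content newer than this date; leave it unset (None) for a full crawl.
START_DATE = datetime(2020, 1, 1)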
def parse(self, response: Response):
    current_code1 = None
    for code in response.css(
            "#main > #content > #content-inner > span.text"):
        if len(code.css('b')) != 0:
            # a bold element starts a new top-level code group
            current_code1 = code.css('b::text').extract_first().strip()
            continue
        current_item = IarcItem()
        current_item['code1'] = current_code1
        current_item['code2'] = code.css('::text').extract_first().strip()
        current_item['code2_name'] = code.css('a::text').extract_first().strip()
        yield Request(response.urljoin(
            code.css('a').xpath('@href').extract_first()),
            self.parse_code2,
            meta={'item': current_item})
def parse(self, response: Response, **kwargs):
    is_empty_page = True
    for product_container in response.css('div.ramecekshop'):
        is_empty_page = False
        self.data_read_callback({
            'name': product_container.css('a.nadpisramecek ::text').get().strip(),
            'price': self._get_price_from_string(
                product_container.css(
                    'a.objednejkosobr ::text').get().strip()),
            'link': self._get_product_link(
                response,
                product_container.css(
                    'a.nadpisramecek ::attr(href)').get().strip())
        })
    if not is_empty_page:
        yield response.follow(self._get_next_page_url(response), self.parse)
def parse(self, response: Response):
    """Parse URLs.

    :param Response response: HTTP response returned by URL requested
    """
    arts = Selector(response).css("article")
    logging.info("Page {} contains {} articles".format(
        response.url, len(arts)))
    for art in arts:
        # title
        art_title_section = art.css("div.title-and-meta")
        # url
        art_rel_url = art_title_section.css(
            "h2.node__title a::attr(href)").get()
        if art_rel_url is not None:
            yield response.follow(art_rel_url, callback=self.parse_article)
    # get next page from bottom pagination to iterate over pages
    next_page = response.css("li.pager-next a::attr(href)").get()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)
def parse(self, response: Response):
    # list of jokes on the main page
    l_jokes = response.css('article[class="chiste mb10"]')
    # an empty list means we have reached the last joke page
    if l_jokes:
        # collect every joke as a single string
        for joke in l_jokes:
            l_strings = [x.get() for x in joke.css("p[itemprop='articleBody']::text")]
            s_joke = "".join(l_strings)
            url_id = joke.css("a[class='compartir']::attr('href')")[0].get()
            d_joke = {"hash_id": url_id,
                      "user_str_id": "1000Chistes",
                      "user_name": "1000Chistes",
                      "joke": s_joke}
            yield d_joke
        time.sleep(5)
        # follow onto the next page
        new_page_number = int(response.url.split("/")[-1]) + 1
        new_url = "{url}/{page_num}".format(url="/".join(response.url.split("/")[:-1]),
                                            page_num=new_page_number)
        self.logger.debug(new_url)
        yield response.follow(new_url, self.parse)
def parse_article(self, response: Response):
    """Specific parsing logic for Geotribu articles.

    :param Response response: HTTP response returned by URL requested
    """
    logging.info("Start parsing ARTICLE: {}".format(
        response.css("title::text").getall()[0]))
    item = ArticleItem()
    # article content
    art = response.css("article")[0]
    # title
    art_title_section = art.css("div.title-and-meta")
    art_title = art_title_section.css("h2.node__title a::text").get()
    item["title"] = art_title
    # article kind - until 2013, press reviews ("revues de presse") were
    # ordinary articles and were not as structured
    if "revue de presse" in art_title.lower():
        item["kind"] = "rdp"
    else:
        item["kind"] = "art"
    # url
    art_rel_url = art_title_section.css(
        "h2.node__title a::attr(href)").get()
    item["url_full"] = art_rel_url
    # publication date
    art_date = art.css("div.date")
    art_date_day = art_date.css("span.day::text").get()
    art_date_month = art_date.css("span.month::text").get()
    art_date_year = art_date.css("span.year::text").get()
    item["published_date"] = (art_date_day, art_date_month, art_date_year)
    # tags
    item["tags"] = art_title_section.css(
        "span.taxonomy-tag a::text").getall()
    # introduction
    try:
        item["intro"] = art.css(
            "div.field-name-field-introduction").getall()[0]
    except IndexError:
        logging.debug("Article doesn't have an introduction.")
        item["intro"] = None
    # body
    art_raw_body = art.css("div.field-name-body")
    item["body"] = [el.get() for el in art_raw_body]
    # image URLs (converted into absolute)
    item["image_urls"] = [
        response.urljoin(i) for i in art.css("img").xpath("@src").getall()
    ]
    # author
    author_block = art.css("div.view.view-about-author")
    if author_block:
        # author thumbnail
        thumbnails = author_block.css("img").xpath("@src").getall()
        thumbnail = thumbnails[0] if thumbnails else "?"
        # author name
        names = author_block.css(
            "div.views-field.views-field-field-nom-complet").css(
                "div.field-content::text").getall()
        name = names[0] if names else "?"
        item["author"] = {
            "thumbnail": thumbnail,
            "name": name,
            "description": author_block.css(
                "div.views-field.views-field-field-description p").getall(),
        }
    else:
        item["author"] = {
            "thumbnail": "?",
            "name": art_title_section.css("span.username a::text").get(),
            "description": "",
        }
    yield item
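# ArticleItem is imported from the project's items module. Its fields can be
# inferred from the assignments above; a matching declaration would look like
# the following sketch (the real items.py may define more fields or loaders):
import scrapy

class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    kind = scrapy.Field()            # "art" or "rdp"
    url_full = scrapy.Field()
    published_date = scrapy.Field()  # (day, month, year) strings
    tags = scrapy.Field()
    intro = scrapy.Field()
    body = scrapy.Field()
    image_urls = scrapy.Field()
    author = scrapy.Field()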