    def parse(self, response: Response, **kwargs):
        articles = response.xpath('//div[@class="pad5 english_article persian_article small_font"]')
        for article in articles:
            # The third link in each article block is the download link.
            download_url = article.css('.article_links').xpath('(.//a)[3]/@href').get()
            download_url = response.urljoin(download_url)

            # The fourth link leads to the article's info page.
            info_url = article.css('.article_links').xpath('(.//a)[4]/@href').get()
            info_url = response.urljoin(info_url)

            # Forward the download URL so parse_info can attach it to the item.
            yield Request(info_url, cb_kwargs={'download_url': download_url}, callback=self.parse_info)
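The Request above hands download_url to a parse_info callback that is not shown here. A minimal sketch of what that callback could look like, assuming hypothetical field names (title, download_url) rather than the original schema:

    def parse_info(self, response: Response, download_url: str):
        # Sketch only: the real callback is not part of this example.
        # Field names below are assumptions, not the original schema.
        yield {
            'title': response.css('h1::text').get(),
            'info_url': response.url,
            'download_url': download_url,
        }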
Example #2
    def parse_forum_page(self,
                         response: Response,
                         forum_url: str = None):
        """
        Forum page callback. Parses TopicItem.
        Follows the next forum page and the threads themselves.
        :param response: scrapy crawl response
        :param forum_url: forum URL from the first page; extracted from
            response.meta if not provided
        """
        if forum_url is None:
            forum_url = response.meta['forum_url']

        threads = response.css(
            'div.topic_read,div.topic_read_hot,div.topic_read_locked,div.topic_moved,div.sticky_read,'
            'div.sticky_read_locked,div.announce_read,div.announce_read_locked'
        )
        too_old_thread_found = False
        for thread_container in threads:
            thread = thread_container.css('a.topictitle')
            topic_loader = ItemLoader(item=TopicItem(), response=response)
            thread_href_selector = thread.css('a::attr(href)')
            thread_link = response.urljoin(thread_href_selector.get())
            topic_loader.add_value('id',
                                   thread_href_selector.re(r'-(t[0-9]*).html'))
            topic_loader.add_value('thread_link', thread_link)
            topic_loader.add_value('forum_link', forum_url)
            topic_loader.add_value('name', thread.css('a::text').get())
            yield topic_loader.load_item()

            if not self.full_crawl:
                last_post_date_candidates = thread_container.css(
                    'span.post-date::text').getall()
                last_post_date = max(
                    map(parse_date, last_post_date_candidates))
                if last_post_date < self.start_date:
                    too_old_thread_found = True
                    continue

            yield scrapy.Request(thread_link + "?sd=d",
                                 callback=self.parse_thread)

        next_page = response.css('a[rel=next]::attr(href)').get()
        if next_page and not too_old_thread_found:
            next_request = response.urljoin(next_page)
            yield scrapy.Request(next_request,
                                 callback=self.parse_forum_page,
                                 meta={'forum_url': forum_url})
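Examples #2 and #7 both call a parse_date helper that is defined elsewhere. A minimal sketch, assuming the forum prints dates in a single fixed format (the format string below is an assumption):

from datetime import datetime

def parse_date(raw: str) -> datetime:
    # Assumed format; the real helper may accept several formats and
    # locale-specific month names.
    return datetime.strptime(raw.strip(), '%d %b %Y, %H:%M')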
Example #3
    def parse_forum(self, response: Response):
        """
        Forum callback. Parses ForumItem.
        Follows subforum links and thread links (via self.parse_forum_page()).
        :param response: scrapy crawl response
        """
        forum_loader = ItemLoader(item=ForumItem(), response=response)
        forum_loader.add_value('link', response.request.url)
        forum_loader.add_css('name', 'h2 > a::text')
        yield forum_loader.load_item()

        subforums = response.css('a.forumtitle::attr(href)').getall()
        for forum in subforums:
            next_request = response.urljoin(forum)
            yield scrapy.Request(next_request, callback=self.parse_forum)

        yield from self.parse_forum_page(response, response.url)
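ForumItem and TopicItem are defined elsewhere; their fields can be read off the add_value/add_css calls in examples #2 and #3. A sketch under that assumption:

import scrapy


class ForumItem(scrapy.Item):
    link = scrapy.Field()
    name = scrapy.Field()


class TopicItem(scrapy.Item):
    id = scrapy.Field()
    thread_link = scrapy.Field()
    forum_link = scrapy.Field()
    name = scrapy.Field()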
Example #4
    def parse(self, response: Response):
        """
        Default scrapy callback. To be used on the forum main page.
        Follows subforum links.

        :param response: scrapy crawl response
        :returns: :class:`hyperreal.crawler.hypercrawler.items.PostItem`,
            :class:`hyperreal.crawler.hypercrawler.items.ForumItem`,
            :class:`hyperreal.crawler.hypercrawler.items.TopicItem`
        """
        date = self.settings.get('START_DATE')
        self.full_crawl = date is None
        if not self.full_crawl:
            self.start_date = date

        subforums = response.css('a.forumtitle::attr(href)').getall()
        for forum in subforums:
            next_request = response.urljoin(forum)
            yield scrapy.Request(next_request, callback=self.parse_forum)
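START_DATE is compared against dates produced by parse_date, so the setting must hold a comparable date/datetime object rather than a raw string. A sketch of how it might be supplied, assuming a project settings.py:

# settings.py (sketch): threads and posts older than this are skipped;
# leaving the setting unset triggers a full crawl instead.
from datetime import datetime

START_DATE = datetime(2020, 1, 1)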
Example #5
    def parse(self, response: Response):
        current_code1 = None
        for code in response.css(
                "#main > #content > #content-inner > span.text"):
            if len(code.css('b')) != 0:
                # Bold entries introduce a new top-level code; remember it
                # for the items that follow.
                current_code1 = code.css('b::text').get().strip()
                continue
            current_item = IarcItem()
            current_item['code1'] = current_code1
            current_item['code2'] = code.css('::text').get().strip()
            current_item['code2_name'] = code.css('a::text').get().strip()
            yield Request(response.urljoin(code.css('a::attr(href)').get()),
                          self.parse_code2,
                          meta={'item': current_item})
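IarcItem is not shown, but its three fields follow directly from the assignments above. A minimal sketch:

import scrapy


class IarcItem(scrapy.Item):
    code1 = scrapy.Field()
    code2 = scrapy.Field()
    code2_name = scrapy.Field()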
Example #6
    def parse_main_page(self, response: Response):
        book_urls = response.xpath(BOOK_URL)
        genre_urls = response.xpath(GENRE_URL)
        for url in book_urls:
            short_name = get_book_name_from_url(url.get())
            yield Request(
                url=BASE_URL.format(short_name),
                callback=self.parse_book_info,
                cb_kwargs=dict(short_name=short_name)
            )
        genre_urls = [x.get() for x in genre_urls]
        # Sort so the two genres picked below are deterministic; a bare
        # set() has no stable order.
        genres = sorted({x.replace('/', '') for x in genre_urls})
        for k in range(2):
            yield Request(
                url=response.urljoin(genre_urls[k]),
                callback=self.parse_books_in_page
            )
            yield Request(
                url=GENRE_LIST_URL.format(genres[k]),
                callback=self.parse_genre_list,
                cb_kwargs=dict(genre=genres[k])
            )
Example #7
    def parse_thread(self, response: Response):
        """
        Thread page callback. Parses PostItem.
        Follows the next thread page.
        :param response: scrapy crawl response
        """

        posts = response.css('div.post.panel-body')
        post_number = 1
        too_old_post_found = False
        for post in posts:
            post_loader = ItemLoader(item=PostItem(), selector=post)
            post_loader.add_value(
                'username',
                post.css('a.username-coloured::text,a.username::text').get())
            post_date_string = post.css('div.post-date::text')[1].get()
            if post_date_string is None:
                # No date text found for this post; skip it. (The original
                # checked for None after slicing, which could never trigger.)
                continue
            # Trim the fixed prefix and trailing character around the raw date.
            post_date = parse_date(post_date_string[3:-1])
            post_loader.add_value('date', str(post_date))
            post_loader.add_value(
                'post_id',
                post.css('div.post-date > a::attr(href)').re(r'.html#(.*)'))
            post_loader.add_value('thread_url', response.request.url)
            post_loader.add_value('post_number', post_number)
            post_number += 1
            post_loader.add_value('content', post.css('div.content').get())
            if not self.full_crawl and post_date < self.start_date:
                too_old_post_found = True
                continue
            yield post_loader.load_item()

        next_page = response.css('a[rel=next]::attr(href)').get()
        if next_page and not too_old_post_found:
            next_request = response.urljoin(next_page)
            yield scrapy.Request(next_request, callback=self.parse_thread)
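PostItem's fields mirror the add_value calls above. A sketch:

import scrapy


class PostItem(scrapy.Item):
    username = scrapy.Field()
    date = scrapy.Field()
    post_id = scrapy.Field()
    thread_url = scrapy.Field()
    post_number = scrapy.Field()
    content = scrapy.Field()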
Example #8
    def parse_book_info(self, response: Response, short_name):
        # Get book's full name and author
        loader = ItemLoader(item=BookInfo(), response=response)
        # Find elements
        loader.add_css(FULL_NAME, BOOK_FULL_NAME_PATH)
        loader.add_css(AUTHOR, BOOK_AUTHOR_PATH)
        loader.add_css(LAST_CHAPTER, BOOK_LAST_CHAPTER_PATH)
        loader.add_css(CHAPTERS, BOOK_CHAPTER_PATH)

        # Extracting data
        page = loader.load_item()
        last_chapter = int(page.get(LAST_CHAPTER))

        yield {
            SHORT_NAME: short_name,
            FULL_NAME: page.get(FULL_NAME),
            AUTHOR: page.get(AUTHOR),
            LAST_CHAPTER: last_chapter
        }

        urls = tuple(response.urljoin(x) for x in page.get(CHAPTERS, ()))

        for url in urls:
            try:
                # Chapter URLs contain "chuong-<index>" ("chuong" is
                # Vietnamese for chapter); pull out the numeric index.
                chapter_index = int(url.split('chuong-')[1].split('-')[0])
                yield SplashRequest(
                    url=url,
                    callback=self.parse_chapter,
                    cb_kwargs=dict(short_name=short_name,
                                   chapter_index=chapter_index),
                    args={
                        'lua_source': WAIT_FOR_ELEMENT.format('#borderchapter')
                    })
            except Exception as e:
                logging.error(str(e))
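WAIT_FOR_ELEMENT is a Lua script template rendered by Splash so that the chapter body ('#borderchapter') exists before the HTML is returned. The original template is not shown; a sketch of one way to write it (the polling loop is an assumption):

# Hypothetical Splash script: poll until the CSS selector filled into
# '{0}' matches, then return the rendered HTML.
WAIT_FOR_ELEMENT = """
function main(splash)
    splash:go(splash.args.url)
    while not splash:select('{0}') do
        splash:wait(0.1)
    end
    return splash:html()
end
"""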
Example #9
    def parse_article(self, response: Response):
        """Specific parsing logic for Geotribu articles

        :param Response response: HTTP response returned for the requested URL
        """
        logging.info("Start parsing ARTICLE: {}".format(
            response.css("title::text").get()))
        item = ArticleItem()

        # article content
        art = response.css("article")[0]

        # title
        art_title_section = art.css("div.title-and-meta")
        art_title = art_title_section.css("h2.node__title a::text").get()
        item["title"] = art_title

        # article kind - until 2013, press reviews were articles like any
        # other and were not as structured
        if "revue de presse" in art_title.lower():
            item["kind"] = "rdp"
        else:
            item["kind"] = "art"

        # url
        art_rel_url = art_title_section.css(
            "h2.node__title a::attr(href)").get()
        item["url_full"] = art_rel_url

        # publication date
        art_date = art.css("div.date")
        art_date_day = art_date.css("span.day::text").get()
        art_date_month = art_date.css("span.month::text").get()
        art_date_year = art_date.css("span.year::text").get()
        item["published_date"] = (art_date_day, art_date_month, art_date_year)

        # tags
        item["tags"] = art_title_section.css(
            "span.taxonomy-tag a::text").getall()

        # grab the intro
        try:
            item["intro"] = art.css(
                "div.field-name-field-introduction").getall()[0]
        except IndexError:
            logging.debug("Article doesn't have introduction.")
            item["intro"] = None

        # body
        art_raw_body = art.css("div.field-name-body")
        art_out_body = []
        for el in art_raw_body:
            art_out_body.append(el.get())

        item["body"] = art_out_body

        # images URLS (converted into absolute)
        item["image_urls"] = [
            response.urljoin(i) for i in art.css("img").xpath("@src").getall()
        ]

        # author
        author_block = art.css("div.view.view-about-author")
        if author_block:
            # author thumbnail, falling back to "?" when absent
            thumbnail = author_block.css("img").xpath("@src").get() or "?"

            # author name, falling back to "?" when absent (the original
            # re-ran the query here and discarded the result)
            names = author_block.css(
                "div.views-field.views-field-field-nom-complet").css(
                    "div.field-content::text").getall()
            name = names[0] if names else "?"

            item["author"] = {
                "thumbnail": thumbnail,
                "name": name,
                "description": author_block.css(
                    "div.views-field.views-field-field-description p"
                ).getall(),
            }
        else:
            item["author"] = {
                "thumbnail": "?",
                "name": art_title_section.css("span.username a::text").get(),
                "description": "",
            }

        yield item
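ArticleItem's fields follow from the keys assigned above; the image_urls key suggests the standard scrapy ImagesPipeline. A sketch:

import scrapy


class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    kind = scrapy.Field()
    url_full = scrapy.Field()
    published_date = scrapy.Field()
    tags = scrapy.Field()
    intro = scrapy.Field()
    body = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()  # assumed: populated by ImagesPipeline
    author = scrapy.Field()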
Example #10
    def parse(self, response: Response):
        for i in range(3, 9):
            for j in range(1, 3):
                # Fill the two slots of BASE_PIECE_XPATH (i: 3-8, j: 1-2).
                piece_url = response.xpath(BASE_PIECE_XPATH.format(i, j)).get()
                yield Request(url=response.urljoin(piece_url),
                              callback=self.parse_chess_piece)
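BASE_PIECE_XPATH is a template with two positional slots, apparently addressing piece links by row and column indexes. A purely hypothetical example of its shape:

# Hypothetical shape only; the real constant is defined elsewhere.
BASE_PIECE_XPATH = '//table//tr[{0}]/td[{1}]//a/@href'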