Example #1
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for link in response.xpath("//a[@class='lead-in']/@href").getall():
         patent_links.append(link)
     # for next page
     next_page = response.xpath(
         "//div[@class='nav-previous']/a/@href").get()
     if next_page is not None:
         self.log('process page {}'.format(next_page), level=logging.INFO)
         yield response.follow(
             url=next_page,
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
     for p in patent_links:
         name = p.split('/')[-2]
         if os.path.exists(os.path.join(self.work_directory,
                                        name + '.json')):
             self.log('{} already parsed and will skip'.format(p),
                      level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
Example #2
 def follow_pages(self, response: Response) -> Iterable[Request]:
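     # follow every letter link in the A-Z index keyboard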
     yield from (
         response.follow(letter.get(), callback=self.parse)  # type: ignore
         for letter in response.css(".az-keyboard ul li a::attr(href)"))
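     # follow every page link in the numbered pagination list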
     yield from (
         response.follow(number.get(), callback=self.parse)  # type: ignore
         for number in response.css("ul.pagination__list li a::attr(href)"))
Example #3
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for link in response.xpath("//h4[@class='result-title']/a"):
         text = link.xpath("text()").get()
         url = link.xpath("@href").get()
         self.log("find technology {}/{}".format(text, url), level=logging.INFO)
         patent_links.append(url)
     # for next page
     current_page, total_page = self.statictics(response)
     if current_page < total_page:
         self.log('process page {}'.format(self.page), level=logging.INFO)
         yield response.follow(
             url='https://otd.harvard.edu/explore-innovation/technologies/results/P{}/'.format(current_page * 10),
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
     for p in patent_links:
         name = self.parse_name_from_url(p)
         if os.path.exists(os.path.join(self.work_directory, name + '.json')):
             self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
Example #4
    def parse_forum(self, response: Response):
        forum_url_query = urllib.parse.urlparse(response.url).query
        forum_id = int(urllib.parse.parse_qs(forum_url_query)['id'][0])
        forum_title = response.xpath(
            '//div[@id="pun-main"]/h1/span/text()').get()

        section_id = self.forums[forum_id]

        yield ForumItem(id=forum_id, title=forum_title, section_id=section_id)

        for topic in response.xpath(
                '//div[@class="forum"]/div[@class="container"]//tbody//tr/td[@class="tcl"]/div[@class="tclcon"]'
        ):
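            # extract the topic id from its URL, remember its parent forum, then crawl the topic page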
            topic_url = topic.xpath('a/@href').get()
            topic_url_query = urllib.parse.urlparse(topic_url).query
            topic_id = int(urllib.parse.parse_qs(topic_url_query)['id'][0])

            self.topics[topic_id] = forum_id

            yield response.follow(topic_url, self.parse_topic)

        next_page_url = response.xpath(
            '//div[@class="pagelink"]/a[@class="next"]/@href').get()
        if next_page_url:
            yield response.follow(next_page_url, callback=self.parse_forum)
Example #5
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for link in response.xpath(
             '//*[@id="formTechPub1"]/div/table[2]/tr/td/a'):
         text = link.xpath("text()").get()
         url = link.xpath("@href").get()
         self.log("find technology {}/{}".format(text, url),
                  level=logging.INFO)
         patent_links.append(url)
     # for next page
     total_result = self.statictics(response)
     self.page += 1
     if self.page * self.item_per_page < total_result:
         self.log('process page {}'.format(self.page), level=logging.INFO)
         yield response.follow(
             url=self.next_page_template.format(self.page),
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
     for p in patent_links:
         name = self.parse_name_from_url(p)
         if os.path.exists(os.path.join(self.work_directory,
                                        name + '.json')):
             self.log('{} already parsed and will skip'.format(p),
                      level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
Example #6
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for link in response.xpath(
             "//div[@class='view-content']/div[contains(@class,'views-row')]/div/h3/a"
     ):
         text = link.xpath("text()").get()
         url = link.xpath("@href").get()
         self.log("find technology {}/{}".format(text, url),
                  level=logging.INFO)
         patent_links.append(url)
     # for next page
     next_page = response.xpath("//li[@class='pager-next']/a/@href").get()
     if next_page is not None:
         self.log('process page {}'.format(next_page), level=logging.INFO)
         yield response.follow(
             url=next_page,
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
     for p in patent_links:
         name = self.parse_name_from_url(p)
         if os.path.exists(os.path.join(self.work_directory,
                                        name + '.json')):
             self.log('{} already parsed and will skip'.format(p),
                      level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
Example #7
    def parse(self, response: Response):
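        # follow every user profile linked from the member table, then move on to the next page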
        for row in response.xpath(
                '//div[@class="usertable"]//div[@class="container"]//tbody/tr'
        ):
            profile_url = row.xpath('td[@class="tcl"]//a/@href').get()
            yield response.follow(profile_url, callback=self.parse_profile)

        next_page_url = response.xpath(
            '//div[@class="pagelink"]/a[@class="next"]/@href').get()
        if next_page_url:
            yield response.follow(next_page_url, callback=self.parse)
Example #8
    def parse(self, response: Response, **kwargs):
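        # either start from a preconfigured URL or follow every unique residence link, then the next page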
        if self.url_to_crawl:
            yield response.follow(url=self.url_to_crawl, callback=self.parse_residences)
        else:
            residences = response.xpath("//a[contains(@class,'detalii-proprietate')][contains(.,'Vezi detalii')]/@href").getall()
            residences = list(set(residences))

            yield from response.follow_all(urls=residences, callback=self.parse_residences)

            next_page = response.xpath("//a[@class='inainte butonpaginare']/@href").get()
            if next_page:
                yield response.follow(url=next_page, callback=self.parse)
Example #9
    def parse(self, response: Response):
        """
        Parse symbol directory pages such as
        https://www.set.or.th/set/commonslookup.do?language=en&country=TH&prefix=A
        """
        symbol_rows = response.css("tr[valign=top]")
        for row in symbol_rows:
            comp_profile_page = row.css("td a::attr(href)").get()
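            # the shareholders page URL only differs from the profile URL by its action name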
            comp_holders_page = comp_profile_page \
                .replace("companyprofile", "companyholder")

            yield response.follow(comp_profile_page,
                                  callback=self.parse_comp_profile)
            yield response.follow(comp_holders_page,
                                  callback=self.parse_comp_holders)
Example #10
    def parse(self, response: Response) -> Generator[Request, None, None]:
        """
        Parse whoscored statistic page to get all tournaments url
        :param response:
        :type response: Response
        :return:
        :rtype: Generator[Request, None, None]

        # Scrapy check - because of settings missing, use Premier League
        # (England) only for test purpose
        @url https://www.whoscored.com/Statistics
        @returns requests 1
        """
        js_script: str = response.css(
            "#layout-wrapper > script::text").extract_first()
        tournaments: str = P_TOUR.search(js_script).group("tournaments")

        ctx = MiniRacer()
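        # walk every region's tournaments and follow only those configured in REGIONS (default: Premier League)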
        for region in ctx.eval(tournaments):
            for tournament in filter(lambda x: x["name"],
                                     region["tournaments"]):
                if (region["id"], tournament["id"]) in self.settings.get(
                        "REGIONS",
                    {(252, 2)}  # pylint: disable=bad-continuation
                ):  # England, Premier League (as default)
                    yield response.follow(
                        tournament["url"],
                        callback=self.parse_tournaments,
                        meta={"waitForSelector": "#layout-content-wrapper"},
                    )
Example #11
    def parse(self, response: Response) -> Iterable[Union[Request, Mapping]]:
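        # queue further listing pages, then recurse into every promo link with this same callback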
        yield from self.follow_pages(response)
        for recipe_url in response.css("a.promo::attr(href)"):
            yield response.follow(recipe_url.get(), callback=self.parse)

        recipe = response.css("div.recipe-main-info")
        if recipe:
            ingredients = [
                self._get_ingredient(response, ingredient) for ingredient in
                recipe.css("li.recipe-ingredients__list-item")
            ]
            if all(ingredient["url"] is not None
                   for ingredient in ingredients):
                chef_name_parts = recipe.css(".chef__name *::text").getall()
                chef_name = chef_name_parts[-1] if len(
                    chef_name_parts) > 0 else None
                image_urls = recipe.css(
                    ".recipe-media__image img::attr(src)").getall()
                yield {
                    "title": recipe.css("h1::text").get(),
                    "url": response.url,
                    "chef_name": chef_name,
                    "ingredients": ingredients,
                    "image_urls": image_urls,
                }
Example #12
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     if os.path.exists(os.path.join(self.work_directory, 'links.json')):
         patent_links = json.load(open(os.path.join(self.work_directory, 'links.json'), 'r'))
     else:
         # the id of each product is provided in a <script></script> tag
         for code in response.xpath("//script").getall():
             if 'id_list' in code:
                 ids = re.findall(r'[0-9]+', re.findall(r'\[[0-9,]+\]', code)[0])
                 patent_links = [response.url + '/public/project/{}'.format(patentId) for patentId in ids]
         with open(os.path.join(self.work_directory, 'links.json'), 'w') as fo:
             json.dump(patent_links, fo)
     for p in patent_links:
         name = p.split('/')[-1]
         if os.path.exists(os.path.join(self.work_directory, name + '.json')):
             self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
Example #13
    def parse_team(self, response: Response) -> Generator[Request, None, None]:
        """

        :param response:
        :type response: Response
        :return:
        :rtype: Generator[Request, None, None]

        @url https://www.whoscored.com/Teams/167/Fixtures/England-Manchester-City
        @returns requests 0
        """
        js_script: str = response.xpath(
            '//*[@id="layout-content-wrapper"]/div[2]/script[3]'
        ).extract_first()

        ctx = MiniRacer()

        fixture: List
        for fixture in ctx.eval(
                P_TEAM_FIXTURES.search(js_script).group("fixtures")):
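            # wrap each raw fixture row in a Match and follow its match page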
            fixture_: Match = Match(*fixture)
            yield response.follow(
                url="https://www.whoscored.com/Matches/{id}/".format(
                    id=fixture_.id),
                callback=self.parse_match,
            )
Example #14
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for link in response.xpath("//ul[@id='tech-licensing']/li/a"):
         text = link.xpath("text()").get()
         url = link.xpath("@href").get()
         if url is None:
             continue
         self.log("find technology {}/{}".format(text, url),
                  level=logging.INFO)
         patent_links.append(url)
     for p in patent_links:
         name = self.parse_name_from_url(p)
         if os.path.exists(os.path.join(self.work_directory,
                                        name + '.json')):
             self.log('{} already parsed and will skip'.format(p),
                      level=logging.INFO)
             continue
         yield response.follow(
             url=p,
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
Example #15
    def parse(self, response: Response):
        # Parse politics pages
        if self.is_politics_page(response.url):
            yield self.parse_politics_page(response)

        for href in response.css('a::attr(href)'):
            if self.is_in_domain(href.get(), response.url):
                yield response.follow(href, self.parse)
Example #16
    def parse(self, response: Response) -> Generator[Generator, None, None]:
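        # collect links that sit under h2/h3 headings and whose href starts with "collection"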
        links_declassified = response.xpath(
            '//a[starts-with(@href,"collection") and (parent::h3 | parent::h2)]/@href'
        ).getall()

        for link in links_declassified:
            yield response.follow(link,
                                  callback=self._document,
                                  cb_kwargs={'url': response.urljoin(link)})
Example #17
    def parse(self, response: Response):
        if self.is_mobile(response.url):
            # Parse mobile politics page
            yield self.parse_politics_page(response)
        else:
            # Go to mobile page instead
            if self.is_politics_page(response.url):
                yield response.follow(response.url.replace('www', 'm', 1),
                                      self.parse)

            for href in response.css('a::attr(href)'):
                absolute_url = urljoin(
                    response.url, href.get())  # Make relative links absolute
                if self.is_in_domain(href.get(), response.url):
                    # Remove parameters
                    o: SplitResult = urlsplit(absolute_url)
                    base_href = urlunsplit(
                        (o.scheme, o.netloc, o.path, '', ''))
                    yield response.follow(base_href, self.parse)
Example #18
    def parse(self, response: Response):
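        # follow every item link on the page, then queue the next page of results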
        yield from response.follow_all(
            xpath='//*[starts-with(@id, "item_")]/div[1]/a/@href',
            callback=self.parse_item,
        )

        link = response.xpath(
            '//*[@id="navigation-bar-bottom"]/div[2]/ul/'
            'li[contains(@class, "next-page")]/a/@href').get()

        yield response.follow(link, callback=self.parse)
Example #19
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for row in response.xpath(
             "//div[@id='nouvant-portfolio-content']/div[@class='technology']"
     ):
         title = row.xpath("h2/a/text()").get()
         link = row.xpath("h2/a/@href").get()
         abstract = row.xpath("p/span/text()").get()
         self.log('found patent {}'.format(title), level=logging.INFO)
         patent_links.append({
             'title': title,
             'link': link,
             'abstract': abstract
         })
     statistics = self.statistics(response)
     self.log('found {}/{} patents'.format(statistics['end'],
                                           statistics['total']),
              level=logging.INFO)
     if statistics['end'] < statistics['total']:
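         # request the next batch of results via the offset query parameter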
         yield response.follow(
             url='/technologies?limit=50&offset={}&query='.format(
                 statistics['end']),
             callback=self.parse_list,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
     for p in patent_links:
         name = p['link'].split('/')[-1]
         if os.path.exists(os.path.join(self.work_directory,
                                        name + '.json')):
             self.log('{} already parsed and will skip'.format(p['link']),
                      level=logging.INFO)
             continue
         yield response.follow(
             url=p['link'],
             callback=self.parse,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             errback=self.handle_failure)
Example #20
    def parse(self, response: Response):
        XPATH_URL = "//body//div[@class='childinfo']//div//div[*]//a[1]/@href"
        urls = response.xpath(XPATH_URL).getall()

        for i in range(len(urls)):
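            # meta['n'] carries the absolute index of the video across paginated result pages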
            yield response.follow(
                urls[i],
                callback=self.parse_detail,
                meta={
                    "n":
                    (response.meta["page"] - 1) * self.VIDEOS_PER_PAGE + i + 1
                })
Example #21
 def parse_playlist_items(self, response: Response):
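     # the response body is the JSON payload of a playlistItems API call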
     body = json.loads(response.body)
     assert body["kind"] == "youtube#playlistItemListResponse"
     ids = [item["snippet"]["resourceId"]["videoId"] for item in body["items"]]
     yield self.request_videos(ids, response.meta)
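     # if the API reports another page, follow it by appending the pageToken query parameter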
     if "nextPageToken" in body:
         request_url = YoutubeSpider.update_url_query(
             response.url, {"pageToken": body["nextPageToken"]}
         )
         yield response.follow(
             request_url, meta=response.meta, callback=self.parse_playlist_items
         )
Example #22
 def parse_category(self, response: Response):
     # with javascript it would be //div[@class='split-taxonomy-4']/ul/li/a/@href
     for row in response.xpath(
             "//section[@id='block-taxonomy-menu-block-1']/ul/li/a/@href"
     ).getall():
         self.log('find category {}'.format(row), level=logging.INFO)
         yield response.follow(
             url=row,
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             callback=self.parse_list,
             errback=self.handle_failure)
Example #23
 def parse_list(self, response: Response):
     # wait for page to load
     # wait for the redirect to finish.
     patent_links = []
     for row in response.xpath(
             "//section[@id='block-system-main']/div[@class='node node-technology node-teaser clearfix']/h2/a"
     ):
         name = row.xpath("text()").get()
         link = row.xpath("@href").get()
         patent_links.append({'name': name, 'link': link})
         self.log('found patents {}'.format(name), level=logging.INFO)
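     # paginate while a 'last page' link exists and the current URL is not already that page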
     if response.xpath("//li[@class='pager-last']/a/@href").get() is not None and\
             response.url != response.xpath("//li[@class='pager-last']/a/@href").get():
         # have next page
         if '?page=' in response.url:
             elements = response.url.split("=")
             page = (int(elements[-1]) + 1)
             self.log('go to page {}'.format(page), level=logging.INFO)
             yield response.follow(
                 url='='.join(elements[:-1]) + '={}'.format(page),
                 dont_filter=True,
                 meta={'proxy': POOL.get()} if self.with_proxy else {},
                 callback=self.parse_list,
                 errback=self.handle_failure)
         else:
             self.log('go to page 2', level=logging.INFO)
             yield response.follow(
                 url=response.url + '?page=1',
                 dont_filter=True,
                 meta={'proxy': POOL.get()} if self.with_proxy else {},
                 callback=self.parse_list,
                 errback=self.handle_failure)
     for p in patent_links:
         yield response.follow(
             url=p['link'],
             dont_filter=True,
             meta={'proxy': POOL.get()} if self.with_proxy else {},
             callback=self.parse,
             errback=self.handle_failure)
Example #24
    def parse(self, response: Response):
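        # every category block becomes a SectionItem; the forums it contains are then crawled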
        for section in response.xpath(
                '//div[@id="pun-main"]/div[@class="category"]'):
            section_id = int(section.xpath('@id').get()[len('pun-category'):])
            section_title = section.xpath('h2/span/text()').get()

            yield SectionItem(id=section_id, title=section_title)

            for forum in section.xpath(
                    'div[@class="container"]//div[@class="intd"]//h3'):
                forum_url = forum.xpath('a/@href').get()
                forum_url_query = urllib.parse.urlparse(forum_url).query
                forum_id = int(urllib.parse.parse_qs(forum_url_query)['id'][0])

                self.forums[forum_id] = section_id

                yield response.follow(forum_url, self.parse_forum)
Example #25
    def parse_book_contents(self, response: Response):
        # get book name
        book = response.css('div.book-title h1').extract_first()

        # get version titles
        ver_titles = []
        for ver_title in response.css('div.chapter h4'):
            ver_titles.append(ver_title.css('::text').get())

        # get version chapter lists
        branchs = response.css('div.chapter div.chapter-list')
        for ver_title, ver_chapters in zip(ver_titles, branchs):
            chapter_no = 0
            for sub_chapter_list in ver_chapters.css('ul'):
                # the latest chapter appears first; reverse to get chapters in increasing order
                for chapter in reversed(sub_chapter_list.css('li a')):
                    # <a href="/comic/xxxx/xxxxxx.html" title="第xx回" ...>
                    #   <span>第xx回<i>xxp</i></span>
                    # </a>
                    chapter_no += 1
                    chapter_url = chapter.css('::attr(href)').extract_first()
                    title, pages = chapter.css('::text').extract()
                    pages = int(pages[:-1])
                    chapter_item = MangaChapterItem({
                        'book': book,
                        'version': ver_title,
                        'chapter_no': chapter_no,
                        'title': title,
                        'pages': pages,
                        'chapter_url': chapter_url,
                        'store_dir': self.out_dir
                    })
                    yield chapter_item

                    # request for each page of chapter
                    for page in range(1, pages+1):
                        page_item = {
                            'chapter': dict(chapter_item),
                            'page_no': page
                        }

                        page_url = '%s#%d' % (chapter_url, page)
                        chapter_request = response.follow(
                            page_url, partial(self.parse_book_page, page_item))
                        yield chapter_request
Example #26
 def parse(self, response: Response):
     visited_links = set()
     for link_element in response.css("li > a"):
         link = link_element.xpath("@href").extract_first()
         if link in visited_links:
             continue
         visited_links.add(link)
         if "prologue" in link:
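             # a "prologue" link marks the start of a new book; create its output directories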
             self.book_count += 1
             os.makedirs(
                 os.path.dirname(f"chapters/Book {self.book_count}/blah"),
                 exist_ok=True)
             os.makedirs(
                 os.path.dirname(f"sources/Book {self.book_count}/blah"),
                 exist_ok=True)
         if link.startswith("https://practicalguidetoevil.wordpress.com/20"
                            ):  # 2015, 2016...
             request = response.follow(link_element, scrape_chapter)
             request.meta['link'] = link
             request.meta['book_num'] = self.book_count
             yield request
Example #27
    def parse_topic(self, response: Response):
        topic_url_query = urllib.parse.urlparse(response.url).query
        topic_id = int(urllib.parse.parse_qs(topic_url_query)['id'][0])

        forum_id = self.topics[topic_id]

        topic_title = response.xpath(
            '//div[@id="pun-main"]/h1/span/text()').get()

        yield TopicItem(id=topic_id, title=topic_title, forum_id=forum_id)

        for post in response.xpath(
                '//div[@class="topic"]/div[contains(@class, "post")]'):
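            # each post exposes its id, timestamp, ordinal number and author via attributes and links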
            post_id = int(post.xpath('@id').get()[1:])
            post_date = int(post.xpath('@data-posted').get())
            post_number = int(post.xpath('h3/span/strong/text()').get())
            post_author_link = post.xpath(
                'div[@class="container"]/div[@class="post-author"]/ul/li[@class="pa-author"]/a/@href'
            ).get()
            post_author_link_query = urllib.parse.urlparse(
                post_author_link).query
            post_author = int(
                urllib.parse.parse_qs(post_author_link_query)['id'][0])
            post_text_elements = post.xpath(
                'div[@class="container"]/div[@class="post-body"]/div[@class="post-box"]/div[@class="post-content"]/*'
            ).getall()
            post_text = ''.join(post_text_elements)

            yield PostItem(id=post_id,
                           topic=topic_id,
                           number=post_number,
                           date=post_date,
                           author=post_author,
                           text=post_text)

        next_page_url = response.xpath(
            '//div[@class="pagelink"]/a[@class="next"]/@href').get()
        if next_page_url:
            yield response.follow(next_page_url, callback=self.parse_topic)
Example #28
    def parse(self, response: Response):
        data = {
            name: response.css(selector).extract_first()
            for name, selector in self.state.extract_fields.items()
        }

        # Follow next links
        if self.state.follow_next:
            rel_next_url = response.css(
                'link[rel="next"]::attr(href), a[rel="next"]::attr(href)'
            ).extract_first()
            if rel_next_url is not None:
                data['rel_next_url'] = rel_next_url
                yield response.follow(rel_next_url, callback=self.parse)

        # Strip cacheserver from the url if possible
        url = response.url[len(self.state.cacheserver_url):].lstrip('/')
        url = urllib.parse.urlparse(url)
        url = urllib.parse.urlunparse(('', '', url.path, url.params, url.query, ''))

        # Build page entity for dashboard
        cached = bytes_to_str(response.headers.get('Rendertron-Cached', None))
        cached_at = bytes_to_str(
            response.headers.get('Rendertron-Cached-At', None))
        yield {
            'address': url,
            'content_type': bytes_to_str(
                response.headers.get('Content-Type', None)),
            'status_code': response.status,
            'cache_status': ('cached' if cached == '1' or response.status == 200
                             else 'not-cached'),
            'cached_at': cached_at,
            'extract_fields': data,
        }
Example #29
    def parse(self, response: Response, **cb_kwargs):
        selection = response.css('.vipped-apartments~ .items_list > .items-i')
        # selection = response.css('.items-i >.item_link')
        for bina_listing in selection:
            yield ListingItem({
                'name': bina_listing.css(
                    '.items-i .card_params .location::text').get(),
                'url': bina_listing.css('.item_link').attrib['href'],
            })

        next_url_selector = response.css(".next a")

        if len(next_url_selector) <= 0:
            return

        next_url_href = next_url_selector.attrib["href"]
        next_page = int(next_url_href.split('=')[1])

        if not self.crawling_limit_reached(next_page):
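            # keep paginating only while the configured crawl limit has not been reached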
            self.logger.info("Next URL: %s", next_url_href)
            yield response.follow(next_url_href, callback=self.parse)
Example #30
    def parse_tournaments(
            self, response: Response) -> Generator[Request, None, None]:
        """
        TODO: here are two methods:

         1. go to seasons, and fetch all fixtures in one season
         2. go to team statistics, and fetch all games for each of teams
        :param response:
        :type response: Response
        :return:
        :rtype: Generator[Request, None, None]

        @url https://www.whoscored.com/Regions/252/Tournaments/2/England-Premier-League
        @returns requests 0
        """
        # This is to go for method 1
        # for season in response.css("#seasons option"):
        #     url = season.css("option::attr(value)").extract_first()
        #     yield response.follow(url, callback=self.parse_season)

        # This is to go for method 2
        js_script: str = response.xpath(
            '//*[@id="layout-content-wrapper"]/div[2]/script[4]'
        ).extract_first()

        ctx = MiniRacer()
        team: List
        for team in ctx.eval(P_TEAM.search(js_script).group("history")):
            team_: Team = Team(*team)
            yield response.follow(
                url="https://www.whoscored.com/Teams/{id}/Fixtures/".format(
                    id=team_.id),
                callback=self.parse_team,
                meta={"waitForSelector": "layout-content-wrapper"},
            )

            break