Example #1
    def next_page(self, response: scrapy.http.Response) -> scrapy.Request:
        """
        Goes to next page.

        :param response: response object
        :return: request for next page
        """
        # go to next page
        next_url = response.xpath("//a[@title='下一页']/@href").extract_first()
        if next_url is not None:
            self.log('Next page {}'.format(next_url), level=logging.INFO)
            time.sleep(random.random())
            return response.follow(
                url=next_url,
                callback=self.parse,
                # reuse the current proxy
                meta={'proxy': response.request.meta['proxy']},
                errback=self.handle_failure)
        else:
            # try to build the page by ourself
            arguments = self.decode_url(response.request.url)
            arguments['page'] += 1
            url = self.format_url(arguments)
            self.log('Next page (manually) {}'.format(url), level=logging.INFO)
            return response.follow(
                url=url,
                callback=self.parse,
                # reuse the current proxy
                meta={'proxy': response.request.meta['proxy']},
                errback=self.handle_failure)
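Example #1 relies on `decode_url`, `format_url` and `handle_failure`, which are defined elsewhere in that spider. A minimal sketch of what such helpers might look like, assuming the listing URL carries the page number as a `page` query parameter (the names and logic are assumptions, not taken from the original source):

    # Hypothetical helpers for Example #1; methods of the same spider class.
    from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode

    def decode_url(self, url: str) -> dict:
        # Turn the query string into a dict, e.g. {'page': 3, 'q': '...'}.
        query = parse_qs(urlsplit(url).query)
        arguments = {key: values[0] for key, values in query.items()}
        arguments['page'] = int(arguments.get('page', 1))
        return arguments

    def format_url(self, arguments: dict) -> str:
        # Rebuild the listing URL with the updated page number.
        parts = list(urlsplit(self.base_url))  # base_url is an assumed attribute
        parts[3] = urlencode(arguments)
        return urlunsplit(parts)

    def handle_failure(self, failure):
        # Errback: log the failed request; a real spider might retry with a fresh proxy here.
        self.logger.warning('Request failed: %s', failure.request.url)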
Example #2
 def parse_docs(self, response: scrapy.http.Response):
     pdfs: List[str] = []
     for url in response.css('a::attr(href)'):
         full = response.urljoin(url.extract())
         if full.endswith('.pdf'):
             pdfs.append(full)
     yield {'from': response.url, 'file_urls': pdfs}
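The `file_urls` key in the yielded item is the field that Scrapy's built-in `FilesPipeline` consumes. For the PDFs to actually be downloaded, the pipeline has to be enabled in the project settings, roughly like this (the storage path is only a placeholder):

    # settings.py
    ITEM_PIPELINES = {
        'scrapy.pipelines.files.FilesPipeline': 1,
    }
    FILES_STORE = '/path/to/downloaded/pdfs'  # placeholder path

The pipeline then records the download results in a `files` field on the item. The snippet also assumes `from typing import List` at module level.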
Example #3
    def parseCityAttractionsListPage(self, response: scrapy.http.Response):
        # example page:  https://www.viator.com/Mumbai/d953

        print(
            'PARSING ATTRACTION LIST ####################################################################################'
        )
        print(response.url)

        self.incrementRequestCount()
        hrefs = response.css('div.ptm *> h2 > a')
        for href in hrefs:
            pointURL = href.css('::attr(href)').extract_first().strip()
            pointName = href.css('::text').extract_first().strip()
            yield response.follow(pointURL,
                                  callback=self.parseAttractionsPage,
                                  meta={
                                      'countryName':
                                      response.meta['countryName'],
                                      'cityName': response.meta['cityName'],
                                      'pointName': pointName
                                  })

        nextPageLink = response.css(
            'div.ptm > div:nth-child(1) > div:nth-child(2) > p > a:last-child::attr(href)'
        ).extract_first()
        if nextPageLink:
            yield response.follow(nextPageLink,
                                  callback=self.parseCityAttractionsListPage,
                                  meta=response.meta)
Example #4
    def parse(self, response: scrapy.http.Response):
        """
        Parses content from a html page response.
        """
        listings = response.xpath('//li[@class="result-row"]')

        for listing in listings:
            # Relative matching
            date = listing.xpath(
                './/*[@class="result-date"]/@datetime').extract_first()
            url = listing.xpath(
                './/a[@class="result-title hdrlnk"]/@href').extract_first()
            title = listing.xpath(
                './/a[@class="result-title hdrlnk"]/text()').extract_first()

            yield scrapy.Request(url,
                                 callback=self.parse_listing,
                                 meta=dict(date=date, url=url, title=title))

        # Move to the next page of data.
        next_page_url = response.xpath(
            '//*[@class="button next"]/@href').extract_first()
        if next_page_url:
            # url must be absolute.
            abs_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=abs_next_page_url, callback=self.parse)
Example #5
    def parse(self, response: scrapy.http.Response, **kwargs):
        form_data = {
            '__VIEWSTATE':
            response.xpath('//input[@id="__VIEWSTATE"]/@value').get(),
            '__VIEWSTATEGENERATOR':
            response.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value').get(),
            '__EVENTVALIDATION':
            response.xpath('//input[@id="__EVENTVALIDATION"]/@value').get(),
        }

        next_page_btn = response.xpath(
            '//a[contains(@href, "Page$Next")]').get()

        if next_page_btn is not None:
            data = form_data.copy()
            data['__EVENTTARGET'] = 'grdSQL'
            data['__EVENTARGUMENT'] = 'Page$Next'

            yield scrapy.FormRequest(f'{self.base_url}/Events.aspx',
                                     formdata=data)

        entries = response.xpath(
            '//table[@id="grdSQL"]//tr[@onmouseover]').getall()

        for i, entry in enumerate(entries):
            data = form_data.copy()
            data['__EVENTTARGET'] = 'grdSQL'
            data['__EVENTARGUMENT'] = f'SysRowSelector${i}'

            yield scrapy.FormRequest(f'{self.base_url}/Events.aspx',
                                     formdata=data,
                                     callback=self.parse_entry,
                                     meta={'row_index': i})
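Example #5 copies the ASP.NET WebForms state fields (`__VIEWSTATE`, `__VIEWSTATEGENERATOR`, `__EVENTVALIDATION`) by hand. An alternative sketch of the same postback using `FormRequest.from_response`, which collects the hidden form inputs from the page automatically (treating this as an equivalent request for the target site is an assumption):

    # Alternative sketch: let Scrapy pick up the hidden WebForms fields itself.
    yield scrapy.FormRequest.from_response(
        response,
        formdata={
            '__EVENTTARGET': 'grdSQL',
            '__EVENTARGUMENT': 'Page$Next',
        },
        callback=self.parse,
    )

Here `from_response` fills in `__VIEWSTATE` and the other hidden inputs from the page's form, so they do not have to be extracted one by one.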
Example #6
    def parse(self, response: scrapy.http.Response):
        findform = response.xpath("//form[@name='form1']")
        form = self.build_form(findform)

        if "kirjaamo" not in form:
            raise ValueError("kirjaamo not found")

        if not isinstance(form["kirjaamo"], list):
            raise ValueError("kirjaamo is not list")

        method = findform.xpath("./@method").get()
        action = response.urljoin(findform.xpath("./@action").get())

        alist = form["kirjaamo"]
        del form["kirjaamo"]

        for param in alist:
            val = param["value"]
            if val == "":
                continue

            fdata = form
            fdata["kirjaamo"] = val

            yield scrapy.FormRequest(
                action,
                method=method,
                formdata=fdata,
                meta={
                    "name": param["name"],
                    "dont_cache": True,
                },
                callback=self.parse_search_result,
            )
Example #7
 def parse_subtopic_triangle(self, response: scrapy.http.Response):
     # Gathers all subtopics from https://www.walter-fendt.de/html5/mde/tl/tl_start_de.htm
     triangle_subtopics = response.xpath(
         '/html/body/ul/li/a/@href').getall()
     for subtopic_url in triangle_subtopics:
         subtopic_url = response.urljoin(subtopic_url)
         yield scrapy.Request(url=subtopic_url, callback=self.parse)
Example #8
    def parse(self, response: scrapy.http.Response):

        # Next page link
        next_page = response.xpath(
            "//div[@id='pager_one']/div[@class='subcontainer']/div[contains(@class, 'pageside') and contains(@class, 'pright')]/a/@href"
        ).get()

        if next_page is not None:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse,
            )

        for row in response.xpath("//table[@id='remixtable']/tr[@class]"):
            # Get tunes
            addeddate = datetime.strptime(
                row.xpath("td[1]/text()").get(), "%Y-%m-%d")
            link = response.urljoin(row.xpath("td[2]/a/@href").get())
            title = row.xpath("td[2]/a/text()").get()
            arranger = row.xpath("td[3]/a/text()").get()
            composer = row.xpath("td[4]/text()").get()

            yield scrapy.Request(link,
                                 callback=self.dl_tune,
                                 meta={
                                     "tune":
                                     Tune(
                                         title=title,
                                         arranger=arranger,
                                         added=addeddate,
                                         composer=composer,
                                         data=None,
                                     ),
                                 })
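The `Tune` object packed into `meta` is defined elsewhere in that project. A plausible shape, sketched as a dataclass together with the `dl_tune` callback it is handed to (both are assumptions about the original code, not taken from it):

    from dataclasses import dataclass
    from datetime import datetime
    from typing import Optional

    @dataclass
    class Tune:
        title: str
        arranger: str
        added: datetime
        composer: str
        data: Optional[bytes]

    def dl_tune(self, response: scrapy.http.Response):
        # Attach the downloaded bytes to the Tune carried in meta and yield it as the item.
        tune = response.meta["tune"]
        tune.data = response.body
        yield tune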
Example #9
    def parseCountryPage(self, response: scrapy.http.Response):
        # example page:  https://www.viator.com/India/d723-ttd

        self.incrementRequestCount()

        breadcrumbs = response.css('div.crumbler *> span::text').extract()
        countryName = breadcrumbs[1].strip()

        countryListing = CountryListing(crawler=self.name,
                                        sourceURL=response.url,
                                        crawlTimestamp=getCurrentTime(),
                                        countryName=countryName)
        yield countryListing.jsonify()

        if skipNonRequired:
            if processName(countryName) not in processedRequiredCountries:
                # do not process this country's cities
                print('Skipping country: ', countryName)
                return
        countryId = response.url.split('/')[-1].split('-')[0][1:]
        cityListingURL = 'https://www.viator.com/pascities.jspa?country={}'.format(
            countryId)
        yield response.follow(cityListingURL,
                              callback=self.parseCountryCities,
                              meta={'countryName': countryName})
Example #10
    def parse(self, response: scrapy.http.Response):
        print("Parsing URL: " + response.url)

        # Call Splash only once per page (that contains multiple XML elements).
        data = self.getUrlData(response.url)
        response.meta["rendered_data"] = data

        # We would use .fromstring(response.text) if the response did not include the XML declaration:
        # <?xml version="1.0" encoding="utf-8"?>
        root = etree.XML(response.body)
        tree = etree.ElementTree(root)

        # If results are returned.
        elements = tree.xpath("/root/items/*")
        if len(elements) > 0:
            for element in elements:
                copyResponse = response.copy()
                element_xml_str = etree.tostring(element,
                                                 pretty_print=True,
                                                 encoding="unicode")
                element_dict = xmltodict.parse(element_xml_str)

                # Temporary solution for public-only content.
                # TODO: remove this when licensed content are enabled!
                if not self.is_public(element_dict["data"]):
                    continue

                # TODO: It's probably a pointless attribute.
                # del element_dict["data"]["score"]

                # Passing the dictionary for easier access to attributes.
                copyResponse.meta["item"] = element_dict["data"]

                # In case JSON string representation is preferred:
                # copyResponse._set_body(json.dumps(copyResponse.meta['item'], indent=1, ensure_ascii=False))
                copyResponse._set_body(element_xml_str)

                if self.hasChanged(copyResponse):
                    yield self.handleEntry(copyResponse)

                # LomBase.parse() has to be called for every individual instance that needs to be saved to the database.
                LomBase.parse(self, copyResponse)

        # TODO: To not stress the Rest APIs.
        # time.sleep(0.1)

        # If the number of returned results is equal to the imposed limit, it means that there are more to be returned.
        if len(elements) == self.limit:
            self.page += 1
            url = self.apiUrl.replace("%start",
                                      str(self.page * self.limit)).replace(
                                          "%anzahl", str(self.limit))
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                headers={
                    "Accept": "application/xml",
                    "Content-Type": "application/xml",
                },
            )
Example #11
    def post(
        self, response: scrapy.http.Response
    ) -> Union[Iterator[items.ArticleItem], Iterator[scrapy.Request]]:
        """Get medium posts.

        Args:
            response (scrapy.http.Response): scrapy response

        Yields:
            items.ArticleItem: ArticleItem object
            scrapy.Request: scrapy request object
        """
        data = response.text.replace('])}while(1);</x>', '', 1)
        obj = json.loads(data)['payload']
        post_record = self.parse_post_item(post=obj)
        yield post_record

        if post_record['comment_count'] > 0:
            post_id = obj['value']['id']
            response.meta['post_id'] = post_id
            response.meta['post_record'] = post_record
            url = (f'https://medium.com/_/api/posts/{post_id}/responsesStream')
            yield scrapy.Request(url=url,
                                 meta=response.meta,
                                 callback=self.comment)
Example #12
    def parse(self, response: scrapy.http.Response):
        """
        Get list of tunes
        """

        u: SplitResult = urlsplit(response.url)
        q: dict = dict(queryparse(u.query))

        for tune in response.xpath(
                "//div[@id='result']/table/tr/th[@colspan='6']/../../tr[@class]"
        ):
            artist = "".join(tune.xpath("./td[2]//text()").getall()).strip()
            title = "".join(tune.xpath("./td[1]//text()").getall()).strip()
            link = tune.xpath("./td[1]/a/@href").get().strip()
            fileformat = "".join(
                tune.xpath("./td[3]//text()").getall()).strip().lower()

            # Download tune
            yield scrapy.Request(
                response.urljoin(link),
                callback=self.download_mod,
                meta={
                    "tune": {
                        "id": q['view'],
                        "artist": artist,
                        "title": title,
                        "format": fileformat,
                    }
                },
            )
Example #13
 def parse_apollonian_subtopic(self, response: scrapy.http.Response):
     # Gathers variant-URLs to crawl from https://www.walter-fendt.de/html5/mde/apolloniosproblem_de.htm
     apollonios_subtopics = response.xpath(
         '//table/tbody/tr/td/a/@href').getall()
     for apollo_url in apollonios_subtopics:
         apollo_url = response.urljoin(apollo_url)
         yield scrapy.Request(url=apollo_url, callback=self.parse)
Example #14
 def parse(self, response: scrapy.http.Response):
     # example page:  https://www.viator.com/Amsterdam/d525-ttd
     countryMenuBox = response.css(
         '#countryMenuBox > div.menu-dropdown-box.small > div > div:nth-child(1)'
     )
     hrefs = countryMenuBox.css('a::attr(durl)').extract()
     for href in hrefs:
         yield response.follow(href, callback=self.parseCountryPage)
Example #15
    def parse_category_overview_for_topics_and_subpages(
            self, response: scrapy.http.Response):
        """
        Crawls an overview page of a "type"-category (e.g. "Hintergrund", "Bilderserie" etc.) for subpages and topics.
        If the overview has subpages, it will recursively yield additional scrapy.Requests to the overview-subpages.
        Afterwards it yields the (10) individual topic_urls (per overview page) to the parse()-method.

        Scrapy Contracts:
        @url https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons
        @returns requests 10
        """
        topic_urls_raw: list = response.xpath(
            '//a[@class="internal-link readmore"]/@href').getall()

        for url_ending in topic_urls_raw:
            self.topic_urls.add(response.urljoin(url_ending))

        # if there's a "Letzte"-Button in the overview, there's more topic_urls to be gathered than the initially
        # displayed 10 elements
        last_page_button_url: str = response.xpath(
            '//li[@class="tx-pagebrowse-last last"]/a/@href').get()
        # the string last_page_button_url typically looks like this:
        # "/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images&tx_solr%5Bpage%5D=8"
        page_number_regex = re.compile(
            r'(?P<url_with_parameters>.*&tx_solr%5Bpage%5D=)(?P<nr>\d+)')

        overview_urls_parsed: set = set(
        )  # temporary set used for checking off already visited URLs
        if last_page_button_url is not None:
            page_number_dict: dict = page_number_regex.search(
                last_page_button_url).groupdict()
            url_without_page_parameter: str = response.urljoin(
                page_number_dict.get('url_with_parameters'))
            last_page_number = int(page_number_dict.get('nr'))
            for i in range(2, last_page_number + 1):
                # the initial url from start_urls already counts as page 1, therefore we're iterating
                # from page 2 to the last page
                next_overview_subpage_to_crawl = str(
                    url_without_page_parameter + str(i))
                if next_overview_subpage_to_crawl not in self.overview_urls_already_parsed:
                    yield scrapy.Request(
                        url=next_overview_subpage_to_crawl,
                        callback=self.
                        parse_category_overview_for_topics_and_subpages)
                    overview_urls_parsed.add(next_overview_subpage_to_crawl)
            self.overview_urls_already_parsed.update(
                overview_urls_parsed
            )  # checking off the (10) URLs that we yielded

        parsed_urls: set = set(
        )  # temporary set used for checking off already visited topics
        for url in self.topic_urls:
            if url not in self.topic_urls_parsed:
                # making sure that we don't accidentally crawl individual pages more than once
                yield scrapy.Request(url=url, callback=self.parse)
                parsed_urls.add(url)
        self.topic_urls_parsed.update(parsed_urls)
Example #16
 def parse_landing_page(self, response: scrapy.http.Response):
     # On a landing page, we can extract all the documents, or infer the JSON link and use that.
     #    yield {'title': pub.css('h1 ::text').extract_first().strip()}
     for pub in response.css('.publication'):
         # This is a publication, so let's infer the API link:
         lp_url = list(urlsplit(response.url))
         lp_url[2] = "/api/content%s" % lp_url[2]
         api_json_url = urlunsplit(lp_url)
         yield response.follow(api_json_url, self.parse_content_api_json)
Example #17
    def parse(self, response: scrapy.http.Response):
        for relative_url in response.xpath('//h3/a/@href').extract():
            absolute_url = response.urljoin(relative_url)
            yield scrapy.Request(absolute_url, callback=self.parse_book)

        next_page_url = response.xpath(
            '//*[@class="next"]/a/@href').extract_first()
        # Guard against the last page, where there is no "next" link.
        if next_page_url is not None:
            absolute_url = response.urljoin(next_page_url)
            yield scrapy.Request(absolute_url)
Example #18
    def parse_posts_list(self, response: scrapy.http.Response):
        # Fetch the posts
        for href in response.css("#posts a::attr(href)"):
            if href.get().startswith("/p"):
                yield response.follow(href, self.parse_thread)

        # Fetch all pages
        for href in response.css(".pagination a::attr(href)"):
            yield response.follow(href, self.parse_posts_list)
Example #19
    def parse(self, response: scrapy.http.Response):

        # Extract every link to a landing page:
        for title in response.css('.document-row > h3 > a'):
            yield response.follow(title, self.parse_landing_page)

        # Extract the link to the next page of results:
        for next_page in response.css('.next > a'):
            yield response.follow(next_page, self.parse)
Example #20
 def parse_listing(self, response: scrapy.http.Response):
     i = {}
     i['url'] = response.url
     i['expire'] = response.xpath(
         '//a[@class="expire"]/span/text()').extract_first()
     i['job-title'] = response.css('span#main-job-title  *::text').extract()
     i['main'] = response.css('div#main-lang-block *::text').extract()
     i['job-details'] = response.css('div.jobdetails *::text').extract()
     return i
Example #21
 def parse_page(self, response: scrapy.http.Response):
     image_url = response.css(
         'div#all div.text-center img.img-fluid::attr(src)').get()
     image_url = response.urljoin(image_url)
     image = ImageItem()
     image['comic_id'] = response.meta['comic_id']
     image['vol_id'] = response.meta['vol_id']
     image['page'] = response.meta['page']
     image['url'] = image_url
     yield image
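`ImageItem` is imported from the project's items module. A plausible definition, assuming a plain `scrapy.Item` with one field per key used above:

    import scrapy

    class ImageItem(scrapy.Item):
        comic_id = scrapy.Field()
        vol_id = scrapy.Field()
        page = scrapy.Field()
        url = scrapy.Field()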
Example #22
 def parse_content(self, response: scrapy.http.Response):
     item = PttItem()
     item['content'] = response.xpath(
         "//div[@id='main-content']/text()").get().replace('\n', '')
     meta = response.xpath("//span[@class='article-meta-value']")
     item['author'] = meta[0].xpath('text()').get()
     item['title'] = meta[2].xpath('text()').get()
     item['date'] = meta[3].xpath('text()').get()
     item['url'] = response.url
     yield item
Example #23
 def get_next_vimeo_overview_page(self, response: scrapy.http.Response):
     """
     If there is a "next" button at the bottom of the Vimeo user's overview page,
     grab its URL and yield a follow-up request.
     """
     # next_vimeo_overview_page = response.xpath('//*[@id="pagination"]/ol/li[9]').get()
     next_vimeo_overview_page = response.css(
         '#pagination > ol > li.pagination_next a::attr(href)').get()
     if next_vimeo_overview_page is not None:
         yield response.follow(next_vimeo_overview_page, self.parse)
Example #24
 def parse_ad_page(self, resp: scrapy.http.Response):
     '''
     Yields Ad objects if search phrase is found in response.
     '''
     ensure_response_200(resp)
     title = resp.xpath('//div[@id = "adTitle"]//text()').extract_first()
     description = ' '.join(
         resp.xpath('//div[@id = "adDescription"]//text()').extract())
     if title and description:
         if self._search_phrase in title or self._search_phrase in description:
             yield Ad(url=resp.url, search_phrase=self._search_phrase)
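`ensure_response_200` is a project helper that is not shown here. A minimal sketch of what it might do, assuming it simply rejects non-200 responses (the exception type is an assumption):

    def ensure_response_200(resp: scrapy.http.Response) -> None:
        # Fail loudly on anything other than a plain 200 OK.
        if resp.status != 200:
            raise RuntimeError(f'Unexpected HTTP status {resp.status} for {resp.url}')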
Example #25
    def parseCountryCities(self, response: scrapy.http.Response):
        # example page: https://www.viator.com/pascities.jspa?country=723

        self.incrementRequestCount()

        hrefs = response.css(
            'div.unit.size-pas-cities *> a::attr(durl)').extract()
        for href in hrefs:
            yield response.follow(href,
                                  callback=self.parseCityPage,
                                  meta=response.meta)
Example #26
 def parse_section_overview(self, response: scrapy.http.Response):
     # Each section (e.g. "Mathematik Teilgebiete") holds a list of individual topic-categories (e.g. "Kreislehre")
     section_urls = response.xpath(
         '/html/body/table/tr/td/a/@href').getall()
     section_urls.sort()
     # print(section_urls)
     # print("Section URLs: ", len(section_urls))
     for url in section_urls:
         current_url = response.urljoin(url)
         yield scrapy.Request(url=current_url,
                              callback=self.parse_topic_overview)
Example #27
    def parse(self, response: scrapy.http.Response, **kwargs):
        next_page_url = response.xpath('//a[@rel="next"]/@href').get()
        if next_page_url is not None:
            yield scrapy.Request(next_page_url)

        entry_urls = response.xpath(
            f'//div[{util.xpath_class(["type-tribe_events"])}]//*[{util.xpath_class(["tribe-events-list-event-title"])}]/a/@href'
        ).getall()

        for url in entry_urls:
            yield scrapy.Request(url, callback=self.parse_entry)
Example #28
    def parse(self, response: scrapy.http.Response, **kwargs):
        next_page_url = response.xpath(
            f'//li[{xpath_class(["next"])}]/a/@href').get()
        if next_page_url is not None:
            yield scrapy.Request(f'{self.base_url}{next_page_url}')

        entries = response.xpath(
            f'//article[{xpath_class(["event"])}]').getall()

        for entry in entries:
            yield ResponseItem({'body': entry, 'meta': response.meta})
Example #29
 def parse(
     self, response: scrapy.http.Response
 ) -> typing.Generator[scrapy.Request, None, None]:
     """Find all the cases."""
     for case_url in response.xpath('//table[@class="cases"]/tbody/tr/td/a/@href'):
         url = response.urljoin(case_url.extract())
         yield scrapy.Request(
             url=url,
             callback=self.parse_case,
             dont_filter=True,
         )
Example #30
    def parse_search_result(self, response: scrapy.http.Response):
        tbl = response.xpath(
            "//table[@class='table table-striped table-hover table-bordered']")
        for rowidx, row in enumerate(tbl.xpath("./tr")):
            if rowidx == 0:
                continue

            obj = {}

            for idx, col in enumerate(row.xpath("./td")):
                if idx == 0:
                    rawdate = "".join(col.xpath("./text()").getall()).strip()
                    rawdate = ' '.join(rawdate.split())
                    rawdate = rawdate.strip()

                    rem = re.split(r"^(\d+)\s+/(\d+) (\d+)\.(\d+)\.(\d+)$",
                                   rawdate)[1:]
                    rem.pop()

                    vhnum, vhyear, pday, pmonth, pyear = rem
                    obj["date"] = f"{vhyear}-{vhnum.zfill(3)}__{pyear}-{pmonth.zfill(2)}-{pday.zfill(2)}"
                elif idx == 1:
                    for link in col.xpath("./a"):
                        txt = link.xpath("./text()").get().strip()
                        url = response.urljoin(link.xpath("./@href").get())
                        if txt == '0 kpl':
                            continue

                        if 'title' not in obj:
                            obj["title"] = txt
                            obj["link"] = url
                        else:
                            obj["attach"] = url

            dirpath = os.path.join(self.name, )

            if "attach" in obj:
                yield scrapy.Request(
                    obj["attach"],
                    meta={
                        "name": response.meta["name"],
                        "id": obj["date"],
                    },
                    callback=self.parse_attachments,
                )

            yield scrapy.Request(
                obj["link"],
                meta={
                    "name": response.meta["name"],
                    "id": obj["date"],
                },
                callback=self.dl_doc,
            )