def parse_subtopic_triangle(self, response: scrapy.http.Response):
     # Gathers all subtopics from https://www.walter-fendt.de/html5/mde/tl/tl_start_de.htm
     triangle_subtopics = response.xpath(
         '/html/body/ul/li/a/@href').getall()
     for subtopic_url in triangle_subtopics:
         subtopic_url = response.urljoin(subtopic_url)
         yield scrapy.Request(url=subtopic_url, callback=self.parse)
Example #2
    def parse(self, response: scrapy.http.Response):
        """
        Parses content from an HTML page response.
        """
        listings = response.xpath('//li[@class="result-row"]')

        for listing in listings:
            # Relative matching
            date = listing.xpath(
                './/*[@class="result-date"]/@datetime').extract_first()
            url = listing.xpath(
                './/a[@class="result-title hdrlnk"]/@href').extract_first()
            title = listing.xpath(
                './/a[@class="result-title hdrlnk"]/text()').extract_first()

            yield scrapy.Request(url,
                                 callback=self.parse_listing,
                                 meta=dict(date=date, url=url, title=title))

        # Move to the next page of data.
        next_page_url = response.xpath(
            '//*[@class="button next"]/@href').extract_first()
        if next_page_url:
            # url must be absolute.
            abs_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=abs_next_page_url, callback=self.parse)
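Scrapy's response.follow() resolves relative hrefs against response.url on its own, so the urljoin step in the pagination tail above can be folded away; a minimal sketch of just that tail, using the same selector:

    def parse(self, response: scrapy.http.Response):
        # ... listing extraction as in the example above ...
        # response.follow() joins the relative href itself, so no explicit urljoin is needed.
        next_page_url = response.xpath('//*[@class="button next"]/@href').get()
        if next_page_url:
            yield response.follow(next_page_url, callback=self.parse)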
Example #3
    def parse(self, response: scrapy.http.Response):
        findform = response.xpath("//form[@name='form1']")
        form = self.build_form(findform)

        if "kirjaamo" not in form:
            raise ValueError("kirjaamo not found")

        if not isinstance(form["kirjaamo"], list):
            raise ValueError("kirjaamo is not list")

        method = findform.xpath("./@method").get()
        action = response.urljoin(findform.xpath("./@action").get())

        alist = form["kirjaamo"]
        del form["kirjaamo"]

        for param in alist:
            val = param["value"]
            if val == "":
                continue

            # Copy the base form data so each request carries its own "kirjaamo" value
            fdata = dict(form)
            fdata["kirjaamo"] = val

            yield scrapy.FormRequest(
                action,
                method=method,
                formdata=fdata,
                meta={
                    "name": param["name"],
                    "dont_cache": True,
                },
                callback=self.parse_search_result,
            )
Example #4
 def parse_apollonian_subtopic(self, response: scrapy.http.Response):
     # Gathers variant-URLs to crawl from https://www.walter-fendt.de/html5/mde/apolloniosproblem_de.htm
     apollonios_subtopics = response.xpath(
         '//table/tbody/tr/td/a/@href').getall()
     for apollo_url in apollonios_subtopics:
         apollo_url = response.urljoin(apollo_url)
         yield scrapy.Request(url=apollo_url, callback=self.parse)
Example #5
 def parse_docs(self, response: scrapy.http.Response):
     pdfs: List[str] = []
     for url in response.css('a::attr(href)'):
         full = response.urljoin(url.extract())
         if full.endswith('.pdf'):
             pdfs.append(full)
     yield {'from': response.url, 'file_urls': pdfs}
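The file_urls key follows the convention of Scrapy's built-in FilesPipeline; a minimal settings sketch (the storage directory is a hypothetical placeholder) that would download the collected PDFs:

# settings.py sketch: enable the stock FilesPipeline so the yielded 'file_urls' get fetched.
ITEM_PIPELINES = {
    "scrapy.pipelines.files.FilesPipeline": 1,
}
FILES_STORE = "downloads"  # hypothetical local directory for the downloaded PDFs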
Example #6
    def parse(self, response: scrapy.http.Response):
        """
        Get list of tunes
        """

        u: SplitResult = urlsplit(response.url)
        q: dict = dict(queryparse(u.query))

        for tune in response.xpath(
                "//div[@id='result']/table/tr/th[@colspan='6']/../../tr[@class]"
        ):
            artist = "".join(tune.xpath("./td[2]//text()").getall()).strip()
            title = "".join(tune.xpath("./td[1]//text()").getall()).strip()
            link = tune.xpath("./td[1]/a/@href").get().strip()
            fileformat = "".join(
                tune.xpath("./td[3]//text()").getall()).strip().lower()

            # Download tune
            yield scrapy.Request(
                response.urljoin(link),
                callback=self.download_mod,
                meta={
                    "tune": {
                        "id": q['view'],
                        "artist": artist,
                        "title": title,
                        "format": fileformat,
                    }
                },
            )
Example #7
    def crawl_next_url_level(self, diff_set, response: scrapy.http.Response,
                             url_depth: int):
        # To fetch all sub-pages of the website, we must grab all the unique urls from the navigation sidebar first
        # The sidebar has several levels, e.g.:
        # Welt                      1st level links (= url_depth 1)
        #   Afrika                  2nd level links (= url_depth 2)
        #       Afrika ( - D)       3rd level links (= url_depth 3)
        #           Ägypten         4th level links (= url_depth 4)
        # Each of these layers has its own .html page and only shows its child links when you navigate through the parent node.
        # For example, to see "landkarten_aegypten.html", we need to be on the "Afrika ( - D)" level of the sidebar.

        if not diff_set.issubset(self.navigation_urls):
            self.navigation_urls.update(diff_set)
            if len(diff_set) > 0:
                # print("Found", (len(diff_set)), "new URLs to crawl on url_depth =", url_depth)
                for diff_item in diff_set:
                    # print(diff_item)
                    temp_url = response.urljoin(diff_item)
                    if url_depth == 1:
                        yield scrapy.Request(
                            url=temp_url,
                            callback=self.get_navigation_urls_second_level)
                    if url_depth == 2:
                        yield scrapy.Request(
                            url=temp_url,
                            callback=self.get_navigation_urls_third_level)
                    if url_depth == 3:
                        yield scrapy.Request(
                            url=temp_url,
                            callback=self.get_navigation_urls_fourth_level)
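Since the three url_depth branches above differ only in their callback, the dispatch can also be expressed as a mapping; a sketch that assumes the same level-specific callbacks exist on the spider:

    def crawl_next_url_level(self, diff_set, response: scrapy.http.Response,
                             url_depth: int):
        # Map each sidebar depth to its follow-up parser (callbacks taken from the example above).
        callbacks = {
            1: self.get_navigation_urls_second_level,
            2: self.get_navigation_urls_third_level,
            3: self.get_navigation_urls_fourth_level,
        }
        if not diff_set.issubset(self.navigation_urls):
            self.navigation_urls.update(diff_set)
            callback = callbacks.get(url_depth)
            if callback is not None:
                for diff_item in diff_set:
                    yield scrapy.Request(url=response.urljoin(diff_item),
                                         callback=callback)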
Example #8
    def parse_page(self, response: scrapy.http.Response) -> Iterator[Issue]:
        pattern = re.compile(r".*\-b\.pdf")
        links = response.xpath("//a[@href]")
        for link in links:
            if re.match(pattern, link.attrib["href"]):
                item = Issue(
                    cover=response.urljoin(
                        link.xpath("./../../..").xpath(
                            ".//img").attrib["src"]),
                    file=response.urljoin(link.attrib["href"]),
                )

                text = response.css("h2::text").get()
                if text:
                    item["text"] = text

                yield item
Example #9
    def parse_motherboard(self, response: scrapy.http.Response):
        query = dict(queryparse(urlsplit(response.url).query))
        current_page = int(query['page'])

        data = json.loads(response.body)
        for memmodule in data['results']:

            if 'url' in memmodule:
                memmodule['url'] = response.urljoin(memmodule['url'])

            remove_keys = [
                'stock',
                'priceRange',
                'availableForPickup',
            ]

            for k, v in memmodule.items():
                if v is None:
                    remove_keys.append(k)

            for k in remove_keys:
                if k in memmodule:
                    del memmodule[k]

            yield Memory({
                '_manufacturer': self.manufacturer,
                '_model': response.meta['model'],
                'memory': memmodule,
            })

        if current_page == 0 and data['pagination']['numberOfPages'] > 1:
            for pnum in range(1, data['pagination']['numberOfPages']):
                query['page'] = str(pnum)

                # Call the same page with increased page number
                yield scrapy.Request(
                    response.urljoin("?" + urlencode(query)),
                    callback=self.parse_motherboard,
                    meta={
                        'model': response.meta['model'],
                    },
                )
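The None-filtering and key-dropping above can also be collapsed into a single dict comprehension with the same drop rules; a sketch of the equivalent cleanup step:

            # One-pass equivalent: drop the fixed keys plus any None-valued fields.
            drop_keys = {'stock', 'priceRange', 'availableForPickup'}
            memmodule = {k: v for k, v in memmodule.items()
                         if k not in drop_keys and v is not None}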
Example #10
 def parse_page(self, response: scrapy.http.Response):
     image_url = response.css(
         'div#all div.text-center img.img-fluid::attr(src)').get()
     image_url = response.urljoin(image_url)
     image = ImageItem()
     image['comic_id'] = response.meta['comic_id']
     image['vol_id'] = response.meta['vol_id']
     image['page'] = response.meta['page']
     image['url'] = image_url
     yield image
Example #11
    def parse_topic_overview(self, response: scrapy.http.Response):
        # Each topic (e.g. "Bruchzahlen / Bruchrechnen") holds a list of sub-topics that are either individual
        #   .htm-pages with explanations about a specific topic
        #   eLearning-exercises or
        #   "Aufgabengeneratoren" inside a .xls file
        topic_urls = response.xpath('/html/body/table/tr/td/a/@href').getall()
        # print("Topic URLs:", topic_urls)
        # print("Number of topic_urls in this section:", len(topic_urls))

        url_set = set()
        # xls_set = set()
        for url in topic_urls:
            if url.endswith('.htm') or url.endswith('.html'):
                # topics that consist of illustrations or explanations are found inside individual .htm-documents
                current_url = response.urljoin(url)
                url_set.add(current_url)
            # if url.endswith('.xls'):
            #     # there are currently 3 links to .xls files, which are "Aufgabengeneratoren"
            #     # e.g. on this topic overview: http://www.zum.de/dwu/umamgl.htm
            #     # If we really wanted to handle the 3 .xls links, we need an additional xls-specific parse method
            #     xls_set.add(url)
            #     self.debug_xls_set.add(url)
            elif url.startswith("javascript"):
                # in some sections there are topics that lead to a javascript href, e.g.
                # "javascript:infowin('infodep/i-lingleich.htm');"
                # we'll have to extract the .htm-link from that string to parse it: the starting ' is our delimiter
                js_regex = re.compile(r"([^']*.htm)")
                js_url = js_regex.search(url)
                js_url = js_url.group()
                # url_set.add(js_url)
                current_url = response.urljoin(js_url)
                url_set.add(current_url)

        # print("debug XLS set length:", len(self.debug_xls_set))
        # print(self.debug_xls_set)

        for url in url_set:
            # only yield a scrapy Request if the url hasn't been parsed yet, this should help with duplicate links
            # that are found across different topics
            if url not in self.parsed_urls:
                yield scrapy.Request(url=url, callback=self.parse)
                self.parsed_urls.add(url)
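For reference, the javascript-href extraction described in the comments can be exercised on the sample string quoted there:

import re

# Sample href taken from the comment in parse_topic_overview above.
js_regex = re.compile(r"([^']*.htm)")  # escaping the dot (r"([^']*\.htm)") would pin the literal extension
sample = "javascript:infowin('infodep/i-lingleich.htm');"
match = js_regex.search(sample)
if match:
    print(match.group())  # -> infodep/i-lingleich.htm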
Example #12
    def parse_search_result(self, response: scrapy.http.Response):
        tbl = response.xpath(
            "//table[@class='table table-striped table-hover table-bordered']")
        for rowidx, row in enumerate(tbl.xpath("./tr")):
            if rowidx == 0:
                continue

            obj = {}

            for idx, col in enumerate(row.xpath("./td")):
                if idx == 0:
                    rawdate = "".join(col.xpath("./text()").getall()).strip()
                    rawdate = ' '.join(rawdate.split())
                    rawdate = rawdate.strip()

                    rem = re.split(r"^(\d+)\s+/(\d+) (\d+)\.(\d+)\.(\d+)$",
                                   rawdate)[1:]
                    rem.pop()

                    vhnum, vhyear, pday, pmonth, pyear = rem
                    obj["date"] = f"{vhyear}-{vhnum.zfill(3)}__{pyear}-{pmonth.zfill(2)}-{pday.zfill(2)}"
                elif idx == 1:
                    for link in col.xpath("./a"):
                        txt = link.xpath("./text()").get().strip()
                        url = response.urljoin(link.xpath("./@href").get())
                        if txt == '0 kpl':
                            continue

                        if 'title' not in obj:
                            obj["title"] = txt
                            obj["link"] = url
                        else:
                            obj["attach"] = url

            if "attach" in obj:
                yield scrapy.Request(
                    obj["attach"],
                    meta={
                        "name": response.meta["name"],
                        "id": obj["date"],
                    },
                    callback=self.parse_attachments,
                )

            yield scrapy.Request(
                obj["link"],
                meta={
                    "name": response.meta["name"],
                    "id": obj["date"],
                },
                callback=self.dl_doc,
            )
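The date-cell handling above is easier to follow on a concrete value; a worked example with a hypothetical raw cell of the "<number> /<year> <day>.<month>.<year>" shape the regex expects:

import re

rawdate = "12 /2021 3.11.2021"  # hypothetical sample cell
rem = re.split(r"^(\d+)\s+/(\d+) (\d+)\.(\d+)\.(\d+)$", rawdate)[1:]
rem.pop()  # drop the trailing empty string left by re.split
vhnum, vhyear, pday, pmonth, pyear = rem
print(f"{vhyear}-{vhnum.zfill(3)}__{pyear}-{pmonth.zfill(2)}-{pday.zfill(2)}")
# -> 2021-012__2021-11-03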
Example #13
 def parse(
     self, response: scrapy.http.Response
 ) -> typing.Generator[scrapy.Request, None, None]:
     """Find all the cases."""
     for case_url in response.xpath('//table[@class="cases"]/tbody/tr/td/a/@href'):
         url = response.urljoin(case_url.extract())
         yield scrapy.Request(
             url=url,
             callback=self.parse_case,
             dont_filter=True,
         )
Example #14
    def parse(self, response: scrapy.http.Response, **kwargs):
        section: list = response.css("ul.bread-crumbs").css(
            "span::text").getall()

        for product in response.css('div.dtList.i-dtList.j-card-item'):
            product_ref = product.css(
                "a.ref_goods_n_p.j-open-full-product-card::attr(href)").get()
            prod_card_url = response.urljoin(product_ref)
            yield common_request(url=prod_card_url,
                                 callback=self.parse_product_card,
                                 meta={'section': section})
            if self.small_sample:
                break

        next_page_ref = response.css("a.pagination-next::attr(href)").get()
        if next_page_ref is not None:
            next_page_url = response.urljoin(next_page_ref)
            yield common_request(url=next_page_url,
                                 callback=self.parse,
                                 meta={'section': section})
Example #15
 def parse_section_overview(self, response: scrapy.http.Response):
     # Each section (e.g. "Mathematik Teilgebiete") holds a list of individual topic-categories (e.g. "Kreislehre")
     section_urls = response.xpath(
         '/html/body/table/tr/td/a/@href').getall()
     section_urls.sort()
     # print(section_urls)
     # print("Section URLs: ", len(section_urls))
     for url in section_urls:
         current_url = response.urljoin(url)
         yield scrapy.Request(url=current_url,
                              callback=self.parse_topic_overview)
Example #16
 def parse(self, response: scrapy.http.Response):
     hrefs = response.css('div.tours > a::attr(href)').extract()
     attractionNumber = 1
     for href in hrefs:
         href = response.urljoin(href)
         self.log("visiting: " + href)
         meta = urlToCityAndCountryMapping[response.url]
         meta['rank'] = attractionNumber
         yield response.follow(href,
                               callback=self.parseAttractionsPage,
                               meta=meta)
         attractionNumber += 1
Example #17
 def parse_main(
     self, response: scrapy.http.Response
 ) -> Union[Iterator[Issue], scrapy.http.Request]:
     links = (
         response.css("font.hdr b")[-1].xpath("../../../../../../*")[-1].
         xpath('.//td[@valign="top"]').xpath(".//a[not(@hidden)][@href]"))
     for link in links:
         href = link.attrib["href"]
         if href.endswith(".pdf") or href.endswith(".djvu"):
             yield Issue(file=response.urljoin(href),
                         text=link.css("::text").get())
         else:
             yield response.follow(url=href, callback=self.parse_page)
Example #18
    def getReviews(self, response: scrapy.http.Response):
        self.log("review method called")

        reviewCount = 0
        reviewsUrl = response.css('div.quote.isNew > a::attr(href)').extract()
        for url in reviewsUrl:
            url = response.urljoin(url)
            self.log("review url: " + url)
            yield scrapy.Request(url,
                                 callback=self.parseReviewsPage,
                                 meta=response.meta)
            reviewCount += 1

        nextPageLink = response.css(
            'div.collapsedReviewsList > div > div > a::attr(href)').extract()

        if len(nextPageLink) == 2:
            newPageUrl = nextPageLink[1]
            newPageUrl = response.urljoin(newPageUrl)
            if reviewCount < 25:
                yield scrapy.Request(url=newPageUrl,
                                     callback=self.getReviews,
                                     meta=response.meta)
Example #19
    def parse_vol(self, response: scrapy.http.Response):
        vol = VolItem()
        vol['vol_id'] = response.meta['id']
        vol['comic_id'] = response.meta['comic_id']
        vol['images'] = response.css('select#page-selector')[0].css(
            'option::attr(value)').getall()
        vol['images'] = [response.urljoin(url) for url in vol['images']]
        yield vol

        for i, url in enumerate(vol['images'], start=1):
            request = scrapy.Request(url, callback=self.parse_page)
            request.meta['comic_id'] = vol['comic_id']
            request.meta['vol_id'] = vol['vol_id']
            request.meta['page'] = i
            yield request
Example #20
 def parse_vacancies_links(self, response: scrapy.http.Response) -> \
         scrapy.http.Request:
     """
     This method gets the links of the vacancies listed in the response,
     requests each of them and calls the ``parse_vacancies_contents``
     method to parse its data.
     :param response: Scraped response of the listing page
     :return: Request for parsing the contents of each listed vacancy
     """
     # self.logger.info('Processing listing page: %s', response.url)
     for href in response.xpath(
             "//section[@class='c-jobsearchpage__content']"
             "//div[@class='c-jobcarousel__slider--title']"
             "//a/@href").getall():
         yield scrapy.Request(response.urljoin(href),
                              self.parse_vacancies_contents)
Example #21
    def parse_attachments(self, response: scrapy.http.Response):
        for link in response.xpath("//a"):
            href = link.xpath("./@href").get()
            if not href:
                continue
            q = dict(queryparse(urlsplit(href).query))
            if not q:
                continue

            if ('doctype' in q) or ('docid' in q):
                yield scrapy.Request(
                    response.urljoin(href),
                    meta={
                        "name": response.meta["name"],
                        "id": response.meta["id"],
                    },
                    callback=self.dl_doc,
                )
Example #22
    def parse_search_result(self, response: scrapy.http.Response):
        for link in response.xpath("//a"):
            href = link.xpath("./@href").get()
            if not href:
                continue
            q = dict(queryparse(urlsplit(href).query))
            if not q:
                continue

            if 'bid' in q:
                yield scrapy.Request(
                    response.urljoin(href),
                    meta={
                        "name": response.meta["name"],
                        "dont_cache": True,
                    },
                    callback=self.parse_bid,
                )
Example #23
    def parse_series(self, response: scrapy.http.Response):
        """
        Series-specific CPU list such as Atom CPUs
        :param response:
        :return:
        """

        # Find Products Home > Product Specifications > Processors breadcrumb
        crumb = response.xpath("//a[contains(@class, 'hidden-crumb-xs')]/text()").get()
        if crumb is None or crumb.strip() != "Processors":
            raise scrapy.exceptions.CloseSpider("Processors not found in crumb")

        for link in response.xpath("//tr/td/a/@href"):
            if link.root.find("/products/") == -1:
                self.logger.error("product not found from link, skipping")
                continue
            yield scrapy.Request(response.urljoin(link.root), callback=self.parse_specs)
Example #24
    def parse_topic_overview(self, response: scrapy.http.Response):
        """
        Looks for individual topics within the overview and yields the URL to the main parse()-method.

        :param response: the current 'url' from start_urls
        :return: scrapy.Request

        Scrapy Contracts:
        @url https://www.walter-fendt.de/html5/phde/
        @returns requests 50
        """
        # the different topics are within tables: response.xpath('//table[@class="Gebiet"]')
        topic_urls = response.xpath('//td[@class="App"]/a/@href').getall()
        for topic_url in topic_urls:
            topic_url = response.urljoin(topic_url)
            yield scrapy.Request(url=topic_url, callback=self.parse)
Example #25
 def parse_database(
     self, response: scrapy.http.Response
 ) -> typing.Generator[scrapy.Request, None, None]:
     """Find all the years in each database."""
     for year_url in response.xpath(
             '//div[@class="year-specific-options year-options"]/ul/li/h5'):
         url = response.urljoin(year_url.xpath("./a/@href").extract()[0])
         text = year_url.xpath("./a/text()").extract()[0]
         yield scrapy.Request(
             url=url,
             callback=self.parse_years,
             meta={
                 DATABASE_KEY: response.meta[DATABASE_KEY],
                 YEAR_KEY: text,
             },
             dont_filter=True,
         )
Example #26
    def parse_index_page(self, resp: scrapy.http.Response):
        ensure_response_200(resp)
        names = resp.xpath('//ul[@class = "name-list"]/li/a/@href').extract()
        for name_url in names:
            yield NameUrl(url=resp.urljoin(name_url))

        page_count = extract_page_count(resp)
        curr_page = resp.meta['curr_page']
        if curr_page < page_count:
            curr_letter = resp.meta['letter']
            yield Request(
                dir_url_for_letter(curr_letter, curr_page + 1),
                callback=self.parse_index_page,
                meta={
                    'letter': curr_letter,
                    'curr_page': curr_page + 1
                },
            )
Example #27
 def parse(
     self, response: scrapy.http.Response
 ) -> typing.Generator[scrapy.Request, None, None]:
     """Find all the databases."""
     for db_url in response.xpath('//div[@class="card"]/ul/li'):
         relative_url = db_url.xpath("./a/@href").extract()
         if not relative_url:
             continue
         url = response.urljoin(relative_url[0])
         text = db_url.xpath("./a/text()").extract()[0]
         yield scrapy.Request(
             url=url,
             callback=self.parse_database,
             meta={
                 DATABASE_KEY: text,
             },
             dont_filter=True,
         )
Example #28
 def parse_comments(
         self,
         response: scrapy.http.Response) -> Generator[dict, None, None]:
     """Takes in a response from a comment thread page
     (e.g. https://beta4v.mydramalist.com/v1/threads?&c=title&t=9025&page=1)
     and yields. Also yields a request to the next comment
     page, if more comments do exists.
     """
     data = json.loads(response.body)
     show_id = parse_qs(response.url)['t'][0]
     data['show_id'] = show_id
     data['url'] = response.url
     yield data
     if data['has_more']:
         parts = response.url.split('=')
         parts[-1] = str(int(parts[-1]) + 1)
         next_url = '='.join(parts)
         yield scrapy.Request(response.urljoin(next_url),
                              callback=self.parse_comments)
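Incrementing the trailing '='-separated part works for the URL shape quoted in the docstring; a more general sketch (assuming the page number lives in the page query parameter) rebuilds the query string explicitly:

from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit


def next_page_url(url: str) -> str:
    # Bump the 'page' query parameter by one and leave the rest of the URL untouched.
    parts = urlsplit(url)
    query = parse_qs(parts.query)
    query['page'] = [str(int(query.get('page', ['1'])[0]) + 1)]
    return urlunsplit(parts._replace(query=urlencode(query, doseq=True)))

# next_page_url("https://beta4v.mydramalist.com/v1/threads?&c=title&t=9025&page=1")
# -> "https://beta4v.mydramalist.com/v1/threads?c=title&t=9025&page=2"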
Example #29
    def parse(self, response: scrapy.http.Response):
        boarddata_tpl = {}

        for header in response.xpath("//table[@class='display']/thead/tr/th"):
            name = header.xpath("text()").get().strip()
            boarddata_tpl[name] = None

        for board in response.xpath("//table[@class='display']/tbody/tr"):
            tmp = {}

            for idx, key in enumerate(boarddata_tpl):
                info = "".join(
                    board.xpath(f"./td[{idx + 1}]//text()").getall()).strip()
                tmp[key] = info

            yield scrapy.Request(
                response.urljoin(
                    f"/en/products/motherboard/{tmp['Motherboard']}"),
                callback=self.parse_motherboard,
            )
Example #30
 def parse_case(
     self, response: scrapy.http.Response
 ) -> typing.Generator[scrapy.Request, None, None]:
     """Find all the documents in the case."""
     item_full_text_div = response.xpath('//div[@class="itemFullText"]')[0]
     case_name = item_full_text_div.xpath('./h2/text()')[0].extract().strip()
     case_number = response.xpath('//h1[@class="itemTitle"]/text()')[0].extract().strip()
     for paragraph in item_full_text_div.xpath('./p'):
         paragraph_text = paragraph.xpath('string(.)').extract()[0]
         link = paragraph.xpath('./a')
         if not link:
             continue
         document_url = link[-1].xpath('./@href')[0]
         url = response.urljoin(document_url.extract())
         date = paragraph_text.split()[0]
         try:
             parse(date)
         except Exception:
             continue
         entity_name = ''
         entity_class = ''
         braces = re.findall(r"\(.*?\)", paragraph_text)
         if braces:
             brace_text = braces[0].replace("(", "").replace(")", "")
             braces_split = [x.strip() for x in brace_text.split("-")]
             entity_name = braces_split[-1]
             if len(braces_split) > 1:
                 entity_class = braces_split[0]
         document_name = link[-1].xpath('./text()').extract()
         yield scrapy.Request(
             url=url,
             callback=self.parse_document,
             meta={
                 CASE_NAME: case_name,
                 CASE_NUMBER: case_number,
                 ENTITY_NAME: entity_name,
                 ENTITY_CLASS: entity_class,
                 DOCUMENT_NAME: document_name,
                 DATE: date,
             },
         )
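The entity extraction in parse_case is easiest to see on a concrete line; a worked example on a hypothetical paragraph text of the expected "<date> ... (<class> - <name>)" shape:

import re

paragraph_text = "12.05.2020 Decision (credit institution - Example Bank Plc)"  # hypothetical
date = paragraph_text.split()[0]                      # "12.05.2020"
braces = re.findall(r"\(.*?\)", paragraph_text)       # ["(credit institution - Example Bank Plc)"]
brace_text = braces[0].replace("(", "").replace(")", "")
braces_split = [x.strip() for x in brace_text.split("-")]
entity_name = braces_split[-1]                        # "Example Bank Plc"
entity_class = braces_split[0] if len(braces_split) > 1 else ""
print(date, entity_class, entity_name)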