Example #1
 def parse_hotel(
     self, response: scrapy.http.response.html.HtmlResponse, name, text_eng, text_hr
 ):
     # First pass (Croatian page): capture the text, then request the
     # English version of the same hotel page.
     if text_hr is None:
         text_hr = "".join(
             response.xpath("//dd[@class='hotel-description']//text()").extract()
         )
         # response.url is the public accessor (the original used the
         # private _get_url()); swap the Croatian path for the English one.
         new_link = response.url.replace("esky.hr/hoteli", "esky.com/hotels")
         return scrapy.Request(
             new_link,
             callback=self.parse_hotel,
             cb_kwargs={"name": name, "text_eng": None, "text_hr": text_hr},
         )
     else:
         # Second pass (English page): emit an item only if the texts differ.
         text_eng = "".join(
             response.xpath("//dd[@class='hotel-description']//text()").extract()
         )
         if text_hr != text_eng:
             return {
                 "name": name,
                 "text_eng": text_eng.strip(),
                 "text_hr": text_hr.strip(),
             }
         else:
             return None
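parse_hotel threads state between the Croatian and the English page through cb_kwargs. A minimal sketch of the spider this could live in; the class name, start URL, and link selector are assumptions, only the cb_kwargs wiring is the point:

    import scrapy

    class HotelSpider(scrapy.Spider):
        # Hypothetical skeleton; selectors and URLs are guesses.
        name = "hotels"
        start_urls = ["https://www.esky.hr/hoteli"]

        def parse(self, response):
            for link in response.css("a.hotel-link::attr(href)").getall():
                yield response.follow(
                    link,
                    callback=self.parse_hotel,
                    cb_kwargs={"name": link, "text_eng": None, "text_hr": None})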
Example #2
 def parse_topic(self, response: scrapy.http.response.html.HtmlResponse):
     messages = []
     topic_id = self.last_part(response.url)
     for i, message in enumerate(response.css("tr")):
         topic_url = message.css(
             "td[class=subject] > a::attr(href)").extract_first()
         if topic_url is None:
             continue
         message_id = self.last_part(topic_url)
         messages.append({
             "id": message_id,
             "author": message.css("td[class=author] ::text").extract_first(),
             "date": message.css("td[class=lastPostDate] ::text").extract_first(),
             "file": self.locate_email_file(topic_id, i, message_id, False)
         })
         file_name = self.locate_email_file(topic_id, i, message_id, True)
         if os.path.exists(file_name):
             self.log("Skipped %s/%s - already fetched" %
                      (topic_id, message_id))
             continue
         yield response.follow(
             "%s/%s/message/raw?msg=%s/%s/%s" %
             (self.root, self.prefix, self.name, topic_id, message_id),
             functools.partial(self.save_email, file_name=file_name))
     yield {
         "topic": response.css("h2 ::text").extract_first(),
         "id": topic_id,
         "messages": messages
     }
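The functools.partial wrapper above is how per-request arguments were passed before Scrapy 1.7; a sketch of the same request using the native cb_kwargs mechanism instead:

        yield response.follow(
            "%s/%s/message/raw?msg=%s/%s/%s" %
            (self.root, self.prefix, self.name, topic_id, message_id),
            callback=self.save_email,
            cb_kwargs={"file_name": file_name})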
Example #3
 def parse(self, response: scrapy.http.response.html.HtmlResponse):
     text_hr = "".join(
         response.xpath(
             "//dd[@class='hotel-description']//text()").extract())
     if text_hr == "":
         return None
     # response.url is the public accessor (the original used the private
     # _get_url()); swap the Croatian path for the English one.
     new_link = response.url.replace("esky.hr/hoteli", "esky.com/hotels")
     return scrapy.Request(new_link,
                           callback=self.parse_eng,
                           cb_kwargs={"text_hr": text_hr})
Example #4
    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        for topic in response.css("tr a::attr(href)"):
            topic_url = "%s/forum/?_escaped_fragment_=topic/%s/%s" % (
                self.root, self.name, self.last_part(topic.extract()))
            yield response.follow(topic_url, self.parse_topic)

        for next_page in response.css("body > a"):
            self.log("Page: %s -> %s" % (
                self.last_part(response.url),
                self.last_part(next_page.css("::attr(href)").extract_first())))
            yield response.follow(next_page, self.parse)
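Since response.follow accepts <a> selectors directly, Scrapy 2.0's response.follow_all can condense the pagination loop above; a sketch without the per-page logging:

        yield from response.follow_all(response.css("body > a"), callback=self.parse)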
Example #5
    def parse(self, response: scrapy.http.response.html.HtmlResponse):

        selectors = response.xpath('//div[@class="controls"]/select/option')

        # current movies:
        base_kinodetail_url = 'https://www.berlin.de/kino/_bin/kinodetail.php/'

        hrefs = [
            base_kinodetail_url + sel.attrib['value'] for sel in selectors
            if is_positiveinteger(sel.attrib['value'])
        ]

        for href in hrefs:
            self.logger.info(f'Scraping: {href}')
            yield response.follow(href, self.parse_cinema)
Example #6
    def parse_cinema(
            self, response: scrapy.http.response.html.HtmlResponse
    ) -> typing.Iterator[dict]:

        titles = self.get_titles(response)
        movies_times = list()
        for movie in response.xpath(
                '//div[@class="table-responsive-wrapper"]'):
            times = list()
            for showtime in movie.xpath('.//tr'):
                times += showtime.xpath('./td/text()').getall()
            movies_times.append(times)

        cinema = Cinema(
            name=self.get_name(response),
            description=self.get_description(response),
            address=Address(street=self.get_street(response),
                            postal_code=self.get_postal_code(response),
                            district=self.get_district(response),
                            city='Berlin',
                            country='Germany'),
            contact=Contact(telephone=self.get_telephone(response)),
            prices=self.get_prices(response),
            shows=self.create_shows(titles, movies_times))

        self.logger.info(f'Scraped cinema: {cinema.name}')

        yield cinema.to_dict()
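Cinema, Address, and Contact are not defined in the snippet; a plausible reconstruction as dataclasses with the to_dict helper the code relies on (field names are inferred from the calls above, everything else is an assumption):

    from dataclasses import dataclass, asdict
    from typing import List

    @dataclass
    class Address:
        street: str
        postal_code: str
        district: str
        city: str
        country: str

    @dataclass
    class Contact:
        telephone: str

    @dataclass
    class Cinema:
        name: str
        description: str
        address: Address
        contact: Contact
        prices: List[str]
        shows: list  # whatever create_shows() returns

        def to_dict(self) -> dict:
            # asdict() recurses into the nested dataclasses.
            return asdict(self)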
Example #7
    def parse(self, response: scrapy.http.response.html.HtmlResponse) -> typing.Iterator[dict]:
        # FIXME: when xpath has <1 match, .get() cheerfully returns None.
        # FIXME: when xpath has >1 match, .get() cheerfully returns the first.
        # How do I get exceptions for both cases?
        for quote_etree in response.xpath('//*[@itemscope]'):
            yield {
                'author': quote_etree.xpath('.//*[@itemprop="author"]/text()').get(),
                'text': quote_etree.xpath('.//*[@itemprop="text"]/text()').get(),
                'tags': quote_etree.xpath('.//*[@class="tag"]/text()').getall()}

        # Follow the "next page" link, recursively re-entering this method.
        for next_url in response.xpath('//li[@class="next"]/a/@href').getall():
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse)
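On the FIXME above: .get() never raises for zero or for multiple matches, but a small helper can enforce exactly-one semantics. The helper name and the choice of ValueError are mine:

    def xpath_one(selector, xpath: str) -> str:
        """Return the single match for xpath; raise on 0 or >1 matches."""
        matches = selector.xpath(xpath).getall()
        if len(matches) != 1:
            raise ValueError(
                "expected exactly one match for %r, got %d" % (xpath, len(matches)))
        return matches[0]

    # Usage inside the loop above:
    #     'author': xpath_one(quote_etree, './/*[@itemprop="author"]/text()'),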
Example #8
    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        # Popular heroes parser:
        results = response.css('div.heroes-overview').css('div.r-row')
        for i, result in enumerate(results):
            hero = result.css('div.r-icon-text').css('div.r-body').css(
                'div.r-none-mobile').css('a::text').get()
            temp = result.css('div.r-10')
            matches = temp[0].css('div.r-body::text').get()
            winrate = temp[1].css('div.r-body::text').get()
            kda = temp[2].css('div.r-body::text').get()
            temp = result.css('div.r-175')
            try:
                role = temp[0].css('div.r-body').css('div.group').css(
                    'span::text').get()
            except IndexError:
                role = 'Undefined'

            try:
                line = temp[1].css('div.r-body').css('div.group').css(
                    'span::text').get()
            except IndexError:
                line = 'Undefined'

            yield {
                i: {
                    'hero': hero,
                    'matches': matches,
                    'winrate': winrate,
                    'KDA': kda,
                    'role': role,
                    'line': line
                }
            }
Example #9
    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        quote: scrapy.selector.unified.Selector
        for quote in response.selector.xpath("//div[@class='quote']"):
            loader = ItemLoader(item=QuoteItem(),
                                selector=quote,
                                response=response)
            loader.add_xpath('text', ".//div[@class='quoteText']/text()")
            loader.add_xpath('author', ".//span[@class='authorOrTitle']")
            loader.add_xpath('tags',
                             ".//div[@class='greyText smallText left']/a")
            yield loader.load_item()
            # yield {
            #     'text':
            #     quote.xpath(".//div[@class='quoteText']/text()[1]"
            #                 ).extract_first().strip(),
            #     'author':
            #     quote.xpath(".//span[@class='authorOrTitle']/text()").
            #     extract_first().strip(),
            #     'tags':
            #     quote.xpath(".//div[@class='greyText smallText left']/a/text()"
            #                 ).extract()
            # }

        next_page = response.selector.xpath(
            '//a[@class="next_page"]/@href').extract_first()
        if next_page:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
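QuoteItem and its processors are not shown. A plausible definition that would make the loader above emit stripped, single-valued fields; the processor choices are assumptions (the author/tags XPaths select whole elements, so the raw values arrive as HTML and need remove_tags):

    import scrapy
    from itemloaders.processors import Identity, MapCompose, TakeFirst
    from w3lib.html import remove_tags

    class QuoteItem(scrapy.Item):
        text = scrapy.Field(input_processor=MapCompose(str.strip),
                            output_processor=TakeFirst())
        author = scrapy.Field(input_processor=MapCompose(remove_tags, str.strip),
                              output_processor=TakeFirst())
        tags = scrapy.Field(input_processor=MapCompose(remove_tags, str.strip),
                            output_processor=Identity())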
Example #10
    def parse_keyword(self, response: scrapy.http.response.html.HtmlResponse):
        self.logger.info('Found new keyword page: {}'.format(response.url))

        keyword = response.css('a.mw-selflink.selflink::text').get()

        if keyword:
            yield INCARKeywordItem(keyword=keyword)
Example #11
 def parse_keyword(self, response: scrapy.http.response.html.HtmlResponse):
     self.logger.info('Found new keyword page: {}'.format(response.url))
     
     keyword = response.css('div[id=mw-content-text] p strong::text').extract_first()
     
     if keyword:
         yield INCARKeywordItem(keyword=keyword)
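Both variants yield an INCARKeywordItem, which is not shown; a minimal definition consistent with the keyword= argument would be:

    import scrapy

    class INCARKeywordItem(scrapy.Item):
        keyword = scrapy.Field()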
Example #12
 def parse(self, response: scrapy.http.response.html.HtmlResponse):
     results = response.css('div.element')
     for i, result in enumerate(results):
         name = result.css('p.name').css('a::text').get()
         year = result.css('p.name').css('span.year::text').get()
         duration = result.css('div.info').css('span.gray')[0].css('::text').get()
         country = result.css('div.info').css('span.gray')[1].css(
             '::text').get().split('<')[0]
         author = result.css('div.info').css('span.gray')[1].css('a::text').get()
         yield {i: {'name': name, 'year': year, 'duration': duration,
                    'country': country, 'author': author}}
Example #13
 def parse_eng(self, response: scrapy.http.response.html.HtmlResponse,
               text_hr: str):
     text_eng = "".join(
         response.xpath(
             "//dd[@class='hotel-description']//text()").extract())
     if text_hr != text_eng:
         # Append the stripped parallel texts to the two corpus files.
         with open("c_output_hr.txt", "a", encoding="utf-8") as f:
             f.write(text_hr.replace("\t", "").replace("\n", ""))
         with open("c_output_en.txt", "a", encoding="utf-8") as f:
             f.write(text_eng.replace("\t", "").replace("\n", ""))
Example #14
    def parse_form(self, response: scrapy.http.response.html.HtmlResponse):
        try:
            input_element_list = response.css(
                'form input::attr(value)').extract()
        except KeyError:
            return None

        # Set up form with generative keys
        formdata = self._create_formdata(input_element_list)

        yield scrapy.FormRequest(url=self.start_urls[0],
                                 formdata=formdata,
                                 callback=self.parse_results)
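_create_formdata is not shown here. As a hedged alternative, FormRequest.from_response can lift the form's own <input> defaults from the page and override only selected fields (the field name below is an assumption):

        yield scrapy.FormRequest.from_response(
            response,
            formdata={"query": "example"},  # hypothetical field
            callback=self.parse_results)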
Example #15
    def get_crawl_list(
            self, response: scrapy.http.response.html.HtmlResponse) -> List:
        """
        Collect the URLs of the pages that carry company information from the DOM.

        Args:
            response (scrapy.http.response.html.HtmlResponse): the response object.

        Returns:
            List: a list containing the company information.
        """
        company_list = []

        company_list_box = response.css(".entryList01")
        company_list_tag = company_list_box.css("li")

        for company in company_list_tag:
            company_path = company.css("a::attr(href)").extract_first()
            company_url = response.urljoin(company_path)

            company_list.append({"url": company_url})

        return company_list
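get_crawl_list only collects the URLs; a plausible caller, assuming a parse_company callback that is not shown in the source:

    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        for company in self.get_crawl_list(response):
            yield scrapy.Request(company["url"], callback=self.parse_company)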
Example #16
 def parse(self, response: scrapy.http.response.html.HtmlResponse):
     results = response.css("li.expanded-shelf-content-item-wrapper").css(
         "div.yt-lockup-content")
     for i, result in enumerate(results):
         yield {
             i: [
                 result.css('a.yt-uix-tile-link::text').get(),
                 result.css('span.accessible-description::text').get()[3:],
                 result.css('div.yt-lockup-byline').css('a.spf-link::text').get(),
                 *result.css('ul.yt-lockup-meta-info').css('li::text').getall(),
                 'https://youtube.com' +
                 result.css('a.yt-uix-tile-link::attr(href)').get(),
             ]
         }
Example #17
    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        results = response.css("div.b-post_padbot_15")
        for result in results:
            in_script = result.css('script::text').getall()
            secure = 'Безопасная сделка' in in_script[0]
            # Magic offsets into the inline <script> text isolate the price.
            price = (in_script[0][497:-10] if secure else
                     in_script[0][183:-10]).replace('&nbsp', '').replace(';', '')
            text = in_script[1][142:-78]
            html_string = in_script[2][17:-3]
            if 'Проект' in html_string:
                type_ = 'Проект'
            elif 'Вакансия' in html_string:
                type_ = 'Вакансия'
            else:
                type_ = 'Конкурс'

            yield {
                bool(result.css('img.b-pic_margtop_1')): {
                    'title': result.css('a.b-post__link::text').get(),
                    'secure': secure,
                    'price': price,
                    'text': text,
                    'type': type_
                }
            }
Example #18
    def next_page_link(
            self, response: scrapy.http.response.html.HtmlResponse
    ) -> scrapy.Request:
        """
        Build the URL of the next crawl target and create a scrapy.Request.

        Args:
            response (scrapy.http.response.html.HtmlResponse): the response object.

        Returns:
            scrapy.Request: the request for the next page.
        """

        self.page_count += 1

        # Path of the next index page (the first page has no suffix)
        index_path = "index_" + str(self.page_count) + ".html"
        index_path = index_path if self.page_count != 1 else ""

        # Convert to an absolute URL in case the path is relative
        older_post_link = response.urljoin(index_path)

        # Issue the request for the next page
        return scrapy.Request(older_post_link, callback=self.parse)
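next_page_link unconditionally targets the next index_N.html page, so the caller needs a stopping condition. A hedged sketch of how parse might invoke it, with an assumed "older posts" selector as the guard:

        # Hypothetical caller; the guard selector is an assumption.
        if response.css("a.older-posts").get() is not None:
            yield self.next_page_link(response)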
Example #19
    def analyze_website(self, website_url: str,
                        response: scrapy.http.response.html.HtmlResponse):
        gallery_page_matched_by_url = False
        page_with_gallery_matched_by_url = False
        can_be_page_with_gallery = True

        number_of_imgs_matched_by_a_href = 0
        number_of_images_by_img_src = 0

        img_src_values = response.xpath("//img/@src").extract()
        a_href_values = response.xpath("//a/@href").extract()

        some_config_matched = False
        for config in self.configs:
            if not re.match(f'.*{config.domain}.*', website_url):
                continue
            some_config_matched = True
            if config.pageWithGalleryUrlMatchesRegexp:
                rexp_match = re.match(config.pageWithGalleryUrlMatchesRegexp,
                                      website_url)
                page_with_gallery_matched_by_url = page_with_gallery_matched_by_url or rexp_match is not None

            if config.galleryUrlMatchesRegexp:
                rexp_match = re.match(config.galleryUrlMatchesRegexp,
                                      website_url)
                gallery_page_matched_by_url = gallery_page_matched_by_url or rexp_match is not None

            if config.pageWithGalleryUrlHasToMatchRegexp:
                rexp_match = re.match(
                    config.pageWithGalleryUrlHasToMatchRegexp, website_url)
                can_be_page_with_gallery = rexp_match is not None

            if config.pageWithGalleryContainsImgSrcRegexp:
                for img_src_val in img_src_values:
                    if re.match(config.pageWithGalleryContainsImgSrcRegexp,
                                img_src_val):
                        number_of_images_by_img_src += 1

            if config.pageWithGalleryContainsAnchorHrefRegexp:
                for a_href_val in a_href_values:
                    if re.match(config.pageWithGalleryContainsAnchorHrefRegexp,
                                a_href_val):
                        number_of_imgs_matched_by_a_href += 1

        has_match_for_page_with_gallery_by_imgs = (
            (number_of_imgs_matched_by_a_href >= 1
             or number_of_images_by_img_src >= 1)
            and can_be_page_with_gallery)

        has_match = (page_with_gallery_matched_by_url
                     or gallery_page_matched_by_url
                     or has_match_for_page_with_gallery_by_imgs)

        page_analysis_results = PageAnalysisResults()
        page_analysis_results['number_of_images_by_a_href'] = number_of_imgs_matched_by_a_href
        page_analysis_results['number_of_images_by_img_src'] = number_of_images_by_img_src
        page_analysis_results['has_match'] = has_match
        page_analysis_results['url_matched_for_gallery_page'] = gallery_page_matched_by_url
        page_analysis_results['url_matched_for_page_with_gallery'] = page_with_gallery_matched_by_url

        return page_analysis_results
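A side note on the f'.*{config.domain}.*' pattern above: re.match anchors at the start of the string, so the leading .* is what lets the domain appear anywhere; re.search would make the wrapper unnecessary. A minimal, self-contained demonstration:

    import re

    url = "https://www.example.com/gallery/1"

    assert re.match(".*example.com.*", url)  # leading .* lets match() hit mid-string
    assert not re.match("example.com", url)  # match() is anchored at position 0
    assert re.search("example.com", url)     # search() scans the whole string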
Example #20
 def get_telephone(self,
                   response: scrapy.http.response.html.HtmlResponse) -> str:
     return response.xpath(
         '//span[contains(text(), "Telefon")]/following-sibling::span/text()'
     ).get()
 def get_district(self,
                  response: scrapy.http.response.html.HtmlResponse) -> str:
     return response.xpath('//span[@class="locality"]/text()').get()
 def get_postal_code(
         self, response: scrapy.http.response.html.HtmlResponse) -> str:
     return response.xpath('//span[@class="postal-code"]/text()').get()
 def get_street(self,
                response: scrapy.http.response.html.HtmlResponse) -> str:
     return response.xpath('//span[@class="street-address"]/text()').get()
 def get_description(
         self, response: scrapy.http.response.html.HtmlResponse) -> str:
     return response.xpath('//div[@class="kinodetail echo"]/p/text()').get()
 def get_name(self,
              response: scrapy.http.response.html.HtmlResponse) -> str:
     return response.css('h1.top::text').get()
 def get_titles(
         self,
         response: scrapy.http.response.html.HtmlResponse) -> List[str]:
     return response.css('button.accordion-trigger::text').getall()
 def get_prices(
         self,
         response: scrapy.http.response.html.HtmlResponse) -> List[str]:
     return response.xpath(
         '//section[@class="infoblock oeffnungszeiten"]/div/*/text()'
     ).getall()
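A minimal, self-contained demo of the following-sibling pattern that get_telephone uses (the HTML snippet is invented for illustration):

    from scrapy.selector import Selector

    html = '<div><span>Telefon</span><span>030 1234567</span></div>'
    sel = Selector(text=html)
    print(sel.xpath(
        '//span[contains(text(), "Telefon")]/following-sibling::span/text()').get())
    # -> '030 1234567'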