def parse_hotel(
    self,
    response: scrapy.http.response.html.HtmlResponse,
    name,
    text_eng,
    text_hr,
):
    """Collect the Croatian and English hotel descriptions and pair them.

    First pass (``text_hr is None``): extract the Croatian description from
    the ``esky.hr`` page and re-request the same hotel on ``esky.com`` with
    this method as the callback.  Second pass: extract the English text and
    return the pair, or ``None`` when both languages carry identical text
    (i.e. the page was never translated).

    :param response: the hotel detail page (Croatian or English variant).
    :param name: hotel name, threaded through both passes via ``cb_kwargs``.
    :param text_eng: unused on the first pass; filled on the second pass.
    :param text_hr: ``None`` on the first pass, the Croatian text afterwards.
    :return: a ``scrapy.Request`` (first pass), a result dict, or ``None``.
    """
    if text_hr is None:
        # First pass: we are on the Croatian page.
        text_hr = "".join(
            response.xpath("//dd[@class='hotel-description']//text()").extract()
        )
        # Fix: use the public `response.url` attribute instead of the
        # private `response._get_url()` accessor.
        new_link = response.url.replace("esky.hr/hoteli", "esky.com/hotels")
        return scrapy.Request(
            new_link,
            callback=self.parse_hotel,
            cb_kwargs={"name": name, "text_eng": None, "text_hr": text_hr},
        )
    # Second pass: we are on the English page.
    text_eng = "".join(
        response.xpath("//dd[@class='hotel-description']//text()").extract()
    )
    if text_hr != text_eng:
        return {
            "name": name,
            "text_eng": text_eng.strip(),
            "text_hr": text_hr.strip(),
        }
    # Identical texts mean no real translation exists -- skip this hotel.
    return None
def parse_cinema(self, response: scrapy.http.response.html.HtmlResponse) -> dict:
    """Scrape one cinema detail page and yield its data as a dict."""
    titles = self.get_titles(response)
    # One flat list of showtime strings per schedule table on the page.
    movies_times = [
        [
            cell
            for row in table.xpath('.//tr')
            for cell in row.xpath('./td/text()').getall()
        ]
        for table in response.xpath('//div[@class="table-responsive-wrapper"]')
    ]
    address = Address(street=self.get_street(response),
                      postal_code=self.get_postal_code(response),
                      district=self.get_district(response),
                      city='Berlin',
                      country='Germany')
    cinema = Cinema(name=self.get_name(response),
                    description=self.get_description(response),
                    address=address,
                    contact=Contact(telephone=self.get_telephone(response)),
                    prices=self.get_prices(response),
                    shows=self.create_shows(titles, movies_times))
    self.logger.info(f'Scraped cinema: {cinema.name}')
    yield cinema.to_dict()
def parse(self, response: scrapy.http.response.html.HtmlResponse) -> typing.Iterator[dict]:
    """Yield one dict per quote on the page, then follow pagination.

    FIXME: when the xpath has <1 match, .get() cheerfully returns None;
    when it has >1 match, .get() cheerfully returns the first.
    How do I get exceptions for both cases?
    """
    for item in response.xpath('//*[@itemscope]'):
        author = item.xpath('.//*[@itemprop="author"]/text()').get()
        text = item.xpath('.//*[@itemprop="text"]/text()').get()
        tags = item.xpath('.//*[@class="tag"]/text()').getall()
        yield {'author': author, 'text': text, 'tags': tags}
    # Recursively descend: schedule a request for each "next page" link.
    for href in response.xpath('//li[@class="next"]/a/@href').getall():
        yield scrapy.Request(response.urljoin(href), callback=self.parse)
def parse_eng(self, response: scrapy.http.response.html.HtmlResponse, text_hr: str): text_eng = "".join( response.xpath( "//dd[@class='hotel-description']//text()").extract()) if text_hr != text_eng: with open("c_output_hr.txt", "a", encoding="utf-8") as f: f.write(text_hr.replace("\t", "").replace("\n", "")) with open("c_output_en.txt", "a", encoding="utf-8") as f: f.write(text_eng.replace("\t", "").replace("\n", ""))
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Extract the Croatian hotel description and request the English page.

    :param response: an ``esky.hr`` hotel detail page.
    :return: a ``scrapy.Request`` for the ``esky.com`` variant carrying the
        Croatian text in ``cb_kwargs``, or ``None`` when the page has no
        description at all.
    """
    text_hr = "".join(
        response.xpath(
            "//dd[@class='hotel-description']//text()").extract())
    # No description on the page -- nothing to translate, skip the hotel.
    if text_hr == "":
        return None
    # Fix: use the public `response.url` attribute instead of the private
    # `response._get_url()` accessor.
    new_link = response.url.replace("esky.hr/hoteli", "esky.com/hotels")
    return scrapy.Request(new_link,
                          callback=self.parse_eng,
                          cb_kwargs={"text_hr": text_hr})
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Collect current cinema-detail links from the page's dropdown and
    schedule each one for :meth:`parse_cinema`."""
    base_kinodetail_url = 'https://www.berlin.de/kino/_bin/kinodetail.php/'
    options = response.xpath('//div[@class="controls"]/select/option')
    for option in options:
        value = option.attrib['value']
        # Only numeric option values identify current movies/cinemas.
        if not is_positiveinteger(value):
            continue
        href = base_kinodetail_url + value
        self.logger.info(f'Scraping: {href}')
        yield response.follow(href, self.parse_cinema)
def get_telephone(self, response: scrapy.http.response.html.HtmlResponse) -> str: return response.xpath( '//span[contains(text(), "Telefon")]/following-sibling::span/text()' ).get()
def get_district(self, response: scrapy.http.response.html.HtmlResponse) -> str: return response.xpath('//span[@class="locality"]/text()').get()
def get_postal_code( self, response: scrapy.http.response.html.HtmlResponse) -> str: return response.xpath('//span[@class="postal-code"]/text()').get()
def get_street(self, response: scrapy.http.response.html.HtmlResponse) -> str: return response.xpath('//span[@class="street-address"]/text()').get()
def get_description( self, response: scrapy.http.response.html.HtmlResponse) -> str: return response.xpath('//div[@class="kinodetail echo"]/p/text()').get()
def get_prices( self, response: scrapy.http.response.html.HtmlResponse) -> List[str]: return response.xpath( '//section[@class="infoblock oeffnungszeiten"]/div/*/text()' ).getall()
def analyze_website(self, website_url: str, response: scrapy.http.response.html.HtmlResponse):
    """Classify a page against the spider's gallery-detection configs.

    Every config in ``self.configs`` whose ``domain`` occurs in
    *website_url* contributes to the result: URL-regexp flags are
    OR-accumulated across configs, while image/anchor regexps count how
    many ``img[src]`` / ``a[href]`` values on the page match.  Returns a
    ``PageAnalysisResults`` mapping with the match counters and flags.

    NOTE(review): ``config.domain`` is interpolated unescaped into a
    regexp -- a domain containing regex metacharacters (e.g. a dot) can
    over-match; presumably intentional here, verify against the configs.
    """
    gallery_page_matched_by_url = False
    page_with_gallery_matched_by_url = False
    # Starts True; only configs with pageWithGalleryUrlHasToMatchRegexp
    # can veto it (see below).
    can_be_page_with_gallery = True
    number_of_imgs_matched_by_a_href = 0
    number_of_images_by_img_src = 0
    # All image sources and anchor targets found on the page.
    img_src_values = response.xpath("//img/@src").extract()
    a_href_values = response.xpath("//a/@href").extract()
    # NOTE(review): set but never read afterwards -- dead flag.
    some_config_matched = False
    for config in self.configs:
        # Skip configs whose domain does not occur in the URL.
        if not re.match(f'.*{config.domain}.*', website_url):
            continue
        some_config_matched = True
        if config.pageWithGalleryUrlMatchesRegexp:
            rexp_match = re.match(config.pageWithGalleryUrlMatchesRegexp,
                                  website_url)
            # OR-accumulate: any matching config turns the flag on.
            page_with_gallery_matched_by_url = page_with_gallery_matched_by_url or rexp_match is not None
        if config.galleryUrlMatchesRegexp:
            rexp_match = re.match(config.galleryUrlMatchesRegexp, website_url)
            # OR-accumulate: any matching config turns the flag on.
            gallery_page_matched_by_url = gallery_page_matched_by_url or rexp_match is not None
        if config.pageWithGalleryUrlHasToMatchRegexp:
            rexp_match = re.match(
                config.pageWithGalleryUrlHasToMatchRegexp, website_url)
            # Overwritten (not accumulated) per config: the LAST matching
            # config with this regexp decides the veto.
            can_be_page_with_gallery = rexp_match is not None
        if config.pageWithGalleryContainsImgSrcRegexp:
            # Count page images whose src matches this config's pattern.
            for img_src_val in img_src_values:
                if re.match(config.pageWithGalleryContainsImgSrcRegexp,
                            img_src_val):
                    number_of_images_by_img_src += 1
        if config.pageWithGalleryContainsAnchorHrefRegexp:
            # Count anchors whose href matches this config's pattern.
            for a_href_val in a_href_values:
                if re.match(config.pageWithGalleryContainsAnchorHrefRegexp,
                            a_href_val):
                    number_of_imgs_matched_by_a_href += 1
    # At least one matching image/anchor counts as image evidence ...
    has_match_for_page_with_gallery_by_imgs = number_of_imgs_matched_by_a_href >= 1 \
        or number_of_images_by_img_src >= 1
    # ... unless the mandatory-URL regexp vetoed this page.
    has_match_for_page_with_gallery_by_imgs = has_match_for_page_with_gallery_by_imgs \
        and can_be_page_with_gallery
    # Overall verdict: any of the three signals suffices.
    has_match = page_with_gallery_matched_by_url \
        or gallery_page_matched_by_url \
        or has_match_for_page_with_gallery_by_imgs
    page_analysis_results = PageAnalysisResults()
    page_analysis_results[
        'number_of_images_by_a_href'] = number_of_imgs_matched_by_a_href
    page_analysis_results[
        'number_of_images_by_img_src'] = number_of_images_by_img_src
    page_analysis_results['has_match'] = has_match
    page_analysis_results[
        'url_matched_for_gallery_page'] = gallery_page_matched_by_url
    page_analysis_results[
        'url_matched_for_page_with_gallery'] = page_with_gallery_matched_by_url
    return page_analysis_results