def parse_hotel(self, response: scrapy.http.response.html.HtmlResponse, name, text_eng, text_hr):
    """Collect a Croatian/English hotel-description pair across two requests.

    First pass (``text_hr is None``): read the Croatian description from the
    esky.hr page and schedule a request for the matching esky.com (English)
    page, calling back into this method with the Croatian text in cb_kwargs.
    Second pass: read the English description and return the pair, or None
    when the two texts are identical (useless as a translation pair).
    """
    if text_hr is None:
        # First pass: grab the Croatian description text.
        text_hr = "".join(
            response.xpath("//dd[@class='hotel-description']//text()").extract())
        # FIX: use the public `response.url` attribute instead of the private
        # Response._get_url() accessor, which is not part of scrapy's API.
        new_link = response.url.replace("esky.hr/hoteli", "esky.com/hotels")
        return scrapy.Request(
            new_link,
            callback=self.parse_hotel,
            cb_kwargs={"name": name, "text_eng": None, "text_hr": text_hr},
        )
    # Second pass: grab the English description from the esky.com page.
    text_eng = "".join(
        response.xpath("//dd[@class='hotel-description']//text()").extract())
    if text_hr != text_eng:
        return {
            "name": name,
            "text_eng": text_eng.strip(),
            "text_hr": text_hr.strip(),
        }
    return None
def parse_topic(self, response: scrapy.http.response.html.HtmlResponse):
    """Parse one forum topic page: collect per-message metadata, request the
    raw email body for any message not yet saved to disk, and finally yield
    a summary dict for the whole topic.
    """
    messages = []
    # Topic id is the last path component of the topic URL.
    topic_id = self.last_part(response.url)
    for i, message in enumerate(response.css("tr")):
        topic_url = message.css(
            "td[class=subject] > a::attr(href)").extract_first()
        if topic_url is None:
            # Row without a subject link (e.g. table header) - not a message.
            continue
        message_id = self.last_part(topic_url)
        messages.append({
            "id": message_id,
            "author": message.css("td[class=author] ::text").extract_first(),
            "date": message.css("td[class=lastPostDate] ::text").extract_first(),
            "file": self.locate_email_file(topic_id, i, message_id, False)
        })
        file_name = self.locate_email_file(topic_id, i, message_id, True)
        if os.path.exists(file_name):
            # Already fetched on a previous run - skip the download.
            self.log("Skipped %s/%s - already fetched" % (topic_id, message_id))
            continue
        # Request the raw message; save_email writes it to file_name.
        yield response.follow(
            "%s/%s/message/raw?msg=%s/%s/%s" % (self.root, self.prefix,
                                                self.name, topic_id,
                                                message_id),
            functools.partial(self.save_email, file_name=file_name))
    # One summary record for the topic itself, listing all messages seen.
    yield {
        "topic": response.css("h2 ::text").extract_first(),
        "id": topic_id,
        "messages": messages
    }
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Read the Croatian hotel description and schedule a request for the
    English (esky.com) version of the same page, forwarding the Croatian
    text via cb_kwargs to parse_eng.

    Returns None when the page carries no description.
    """
    text_hr = "".join(
        response.xpath(
            "//dd[@class='hotel-description']//text()").extract())
    if text_hr == "":
        return None
    # FIX: use the public `response.url` attribute instead of the private
    # Response._get_url() accessor, which is not part of scrapy's API.
    new_link = response.url.replace("esky.hr/hoteli", "esky.com/hotels")
    return scrapy.Request(new_link, callback=self.parse_eng,
                          cb_kwargs={"text_hr": text_hr})
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Queue every topic linked from the index page, then recurse into the
    pagination links found at the bottom of the page body."""
    for href_sel in response.css("tr a::attr(href)"):
        topic_id = self.last_part(href_sel.extract())
        topic_url = "%s/forum/?_escaped_fragment_=topic/%s/%s" % (
            self.root, self.name, topic_id)
        yield response.follow(topic_url, self.parse_topic)
    for pager_link in response.css("body > a"):
        target = pager_link.css("::attr(href)").extract_first()
        self.log("Page: %s -> %s" % (self.last_part(response.url),
                                     self.last_part(target)))
        yield response.follow(pager_link, self.parse)
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Walk the cinema <select> options and follow every numeric option
    value to its kinodetail page."""
    base_kinodetail_url = 'https://www.berlin.de/kino/_bin/kinodetail.php/'
    for option in response.xpath('//div[@class="controls"]/select/option'):
        cinema_id = option.attrib['value']
        # Skip non-numeric option values (placeholders etc.).
        if not is_positiveinteger(cinema_id):
            continue
        href = base_kinodetail_url + cinema_id
        self.logger.info(f'Scraping: {href}')
        yield response.follow(href, self.parse_cinema)
def parse_cinema(self, response: scrapy.http.response.html.HtmlResponse) -> dict:
    """Assemble a Cinema record from one kinodetail page and yield it as a
    plain dict."""
    titles = self.get_titles(response)
    # One flat list of showtime cell strings per programme table.
    movies_times = [
        [cell
         for row in table.xpath('.//tr')
         for cell in row.xpath('./td/text()').getall()]
        for table in response.xpath('//div[@class="table-responsive-wrapper"]')
    ]
    cinema = Cinema(
        name=self.get_name(response),
        description=self.get_description(response),
        address=Address(street=self.get_street(response),
                        postal_code=self.get_postal_code(response),
                        district=self.get_district(response),
                        city='Berlin',
                        country='Germany'),
        contact=Contact(telephone=self.get_telephone(response)),
        prices=self.get_prices(response),
        shows=self.create_shows(titles, movies_times))
    self.logger.info(f'Scraped cinema: {cinema.name}')
    yield cinema.to_dict()
def parse(self, response: scrapy.http.response.html.HtmlResponse) -> typing.Iterator[dict]:
    """Yield one dict per quote element on the page, then follow pagination.

    NOTE(review): .get() silently yields None on zero matches and the first
    node on multiple matches; there is no strict single-match accessor here.
    """
    for node in response.xpath('//*[@itemscope]'):
        author = node.xpath('.//*[@itemprop="author"]/text()').get()
        text = node.xpath('.//*[@itemprop="text"]/text()').get()
        tags = node.xpath('.//*[@class="tag"]/text()').getall()
        yield {'author': author, 'text': text, 'tags': tags}
    # Follow each "next page" href (normally zero or one per page).
    for href in response.xpath('//li[@class="next"]/a/@href').getall():
        yield scrapy.Request(
            response.urljoin(href),
            callback=self.parse)
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Yield one {index: stats} dict per hero row of the popular-heroes
    overview table."""
    rows = response.css('div.heroes-overview').css('div.r-row')
    for index, row in enumerate(rows):
        hero = (row.css('div.r-icon-text').css('div.r-body')
                   .css('div.r-none-mobile').css('a::text').get())
        stat_cells = row.css('div.r-10')
        matches = stat_cells[0].css('div.r-body::text').get()
        winrate = stat_cells[1].css('div.r-body::text').get()
        kda = stat_cells[2].css('div.r-body::text').get()
        wide_cells = row.css('div.r-175')
        # A missing cell means the role/lane is not listed for this hero.
        role = (wide_cells[0].css('div.r-body').css('div.group')
                .css('span::text').get()) if len(wide_cells) > 0 else 'Undefined'
        line = (wide_cells[1].css('div.r-body').css('div.group')
                .css('span::text').get()) if len(wide_cells) > 1 else 'Undefined'
        yield {
            index: {
                'hero': hero,
                'matches': matches,
                'winrate': winrate,
                'KDA': kda,
                'role': role,
                'line': line
            }
        }
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Load one QuoteItem per quote block via ItemLoader, then follow the
    'next_page' link if present."""
    for quote_sel in response.selector.xpath("//div[@class='quote']"):
        item_loader = ItemLoader(item=QuoteItem(), selector=quote_sel,
                                 response=response)
        item_loader.add_xpath('text', ".//div[@class='quoteText']/text()")
        item_loader.add_xpath('author', ".//span[@class='authorOrTitle']")
        item_loader.add_xpath('tags', ".//div[@class='greyText smallText left']/a")
        yield item_loader.load_item()
    next_page = response.selector.xpath(
        '//a[@class="next_page"]/@href').extract_first()
    if next_page:
        yield scrapy.Request(url=response.urljoin(next_page),
                             callback=self.parse)
def parse_keyword(self, response: scrapy.http.response.html.HtmlResponse):
    """Emit an INCARKeywordItem when the page contains a self-link (the
    wiki convention for the page's own keyword)."""
    self.logger.info('Found new keyword page: {}'.format(response.url))
    found = response.css('a.mw-selflink.selflink::text').get()
    if not found:
        return
    yield INCARKeywordItem(keyword=found)
def parse_keyword(self, response: scrapy.http.response.html.HtmlResponse):
    """Emit an INCARKeywordItem taken from the first bold term in the main
    content paragraph."""
    self.logger.info('Found new keyword page: {}'.format(response.url))
    # .get() is the modern alias of .extract_first(); same behavior.
    candidate = response.css('div[id=mw-content-text] p strong::text').get()
    if candidate:
        yield INCARKeywordItem(keyword=candidate)
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Yield one {index: fields} dict per movie card on the listing page."""
    for index, card in enumerate(response.css('div.element')):
        name_block = card.css('p.name')
        gray_spans = card.css('div.info').css('span.gray')
        title = name_block.css('a::text').get()
        year = name_block.css('span.year::text').get()
        duration = gray_spans[0].css('::text').get()
        # The second gray span mixes country text with the author link.
        country = gray_spans[1].css('::text').get().split('<')[0]
        author = gray_spans[1].css('a::text').get()
        yield {index: {'name': title,
                       'year': year,
                       'duration': duration,
                       'country': country,
                       'author': author}}
def parse_eng(self, response: scrapy.http.response.html.HtmlResponse, text_hr: str):
    """Append the Croatian/English description pair to the two output files,
    unless the English page served the identical text."""
    text_eng = "".join(
        response.xpath(
            "//dd[@class='hotel-description']//text()").extract())
    if text_hr == text_eng:
        # Same text in both languages - not a usable translation pair.
        return
    cleaned_hr = text_hr.replace("\t", "").replace("\n", "")
    cleaned_en = text_eng.replace("\t", "").replace("\n", "")
    with open("c_output_hr.txt", "a", encoding="utf-8") as hr_file:
        hr_file.write(cleaned_hr)
    with open("c_output_en.txt", "a", encoding="utf-8") as en_file:
        en_file.write(cleaned_en)
def parse_form(self, response: scrapy.http.response.html.HtmlResponse):
    """Extract the form's input values, build form data from them, and
    submit the form back to the first start URL.

    Yields a FormRequest whose results are handled by parse_results.
    """
    # FIX: the previous try/except KeyError around this call was dead code -
    # SelectorList.extract() never raises KeyError, so the handler could not
    # fire and only obscured the flow.
    input_element_list = response.css('form input::attr(value)').extract()
    # Set up form with generative keys.
    formdata = self._create_formdata(input_element_list)
    yield scrapy.FormRequest(url=self.start_urls[0],
                             formdata=formdata,
                             callback=self.parse_results)
def get_crawl_list(
        self, response: scrapy.http.response.html.HtmlResponse) -> List:
    """Collect the company-page URLs listed in the DOM.

    Args:
        response (scrapy.http.response.html.HtmlResponse): page response.

    Returns:
        List: one {"url": absolute_url} dict per company entry.
    """
    entries = response.css(".entryList01").css("li")
    # urljoin resolves relative hrefs against the current page URL.
    return [
        {"url": response.urljoin(entry.css("a::attr(href)").extract_first())}
        for entry in entries
    ]
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Yield one {index: fields} dict per video entry on a YouTube shelf."""
    entries = response.css("li.expanded-shelf-content-item-wrapper").css(
        "div.yt-lockup-content")
    for index, entry in enumerate(entries):
        title = entry.css('a.yt-uix-tile-link::text').get()
        # Drop the leading 3 characters of the accessibility text.
        description = entry.css('span.accessible-description::text').get()[3:]
        channel = entry.css('div.yt-lockup-byline').css(
            'a.spf-link::text').get()
        meta = entry.css('ul.yt-lockup-meta-info').css('li::text').getall()
        video_url = ('https://youtube.com' +
                     entry.css('a.yt-uix-tile-link::attr(href)').get())
        yield {index: [title, description, channel, *meta, video_url]}
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Yield one record per listing post, keyed by whether the post carries
    the highlighted-image marker."""
    for post in response.css("div.b-post_padbot_15"):
        scripts = post.css('script::text').getall()
        secure = 'Безопасная сделка' in scripts[0]
        # The price sits at a fixed offset inside the inline script; the
        # offset differs for secure-deal posts.
        raw_price = scripts[0][497:-10] if secure else scripts[0][183:-10]
        price = raw_price.replace(' ', '').replace(';', '')
        text = scripts[1][142:-78]
        html_string = scripts[2][17:-3]
        if 'Проект' in html_string:
            type_ = 'Проект'
        elif 'Вакансия' in html_string:
            type_ = 'Вакансия'
        else:
            type_ = 'Конкурс'
        yield {
            bool(post.css('img.b-pic_margtop_1')): {
                'title': post.css('a.b-post__link::text').get(),
                'secure': secure,
                'price': price,
                'text': text,
                'type': type_
            }
        }
def next_page_link(
        self, response: scrapy.http.response.html.HtmlResponse
) -> scrapy.Request:
    """Build and return the request for the next index page.

    Args:
        response (scrapy.http.response.html.HtmlResponse): current page.

    Returns:
        scrapy.Request: request for the following index page, handled by
        self.parse.
    """
    self.page_count += 1
    # The first page lives at the site root; later pages are index_N.html.
    if self.page_count == 1:
        index_path = ""
    else:
        index_path = "index_" + str(self.page_count) + ".html"
    # urljoin turns the relative path into an absolute URL.
    older_post_link = response.urljoin(index_path)
    return scrapy.Request(older_post_link, callback=self.parse)
def analyze_website(self, website_url: str, response: scrapy.http.response.html.HtmlResponse):
    """Match the page URL and its <img src>/<a href> values against every
    config whose domain appears in the URL, and return a
    PageAnalysisResults summarizing which gallery heuristics fired.
    """
    gallery_page_matched_by_url = False
    page_with_gallery_matched_by_url = False
    # Stays True unless a config imposes a mandatory URL pattern that fails.
    can_be_page_with_gallery = True
    number_of_imgs_matched_by_a_href = 0
    number_of_images_by_img_src = 0
    img_src_values = response.xpath("//img/@src").extract()
    a_href_values = response.xpath("//a/@href").extract()
    # NOTE(review): some_config_matched is set but never read below.
    some_config_matched = False
    for config in self.configs:
        # Only configs whose domain occurs in the URL apply to this page.
        if not re.match(f'.*{config.domain}.*', website_url):
            continue
        some_config_matched = True
        if config.pageWithGalleryUrlMatchesRegexp:
            rexp_match = re.match(config.pageWithGalleryUrlMatchesRegexp,
                                  website_url)
            # Sticky OR: one matching config is enough.
            page_with_gallery_matched_by_url = page_with_gallery_matched_by_url or rexp_match is not None
        if config.galleryUrlMatchesRegexp:
            rexp_match = re.match(config.galleryUrlMatchesRegexp, website_url)
            gallery_page_matched_by_url = gallery_page_matched_by_url or rexp_match is not None
        if config.pageWithGalleryUrlHasToMatchRegexp:
            # Hard requirement: the last applicable config wins here.
            rexp_match = re.match(
                config.pageWithGalleryUrlHasToMatchRegexp, website_url)
            can_be_page_with_gallery = rexp_match is not None
        if config.pageWithGalleryContainsImgSrcRegexp:
            for img_src_val in img_src_values:
                if re.match(config.pageWithGalleryContainsImgSrcRegexp,
                            img_src_val):
                    number_of_images_by_img_src += 1
        if config.pageWithGalleryContainsAnchorHrefRegexp:
            for a_href_val in a_href_values:
                if re.match(config.pageWithGalleryContainsAnchorHrefRegexp,
                            a_href_val):
                    number_of_imgs_matched_by_a_href += 1
    # Image evidence counts only if the mandatory URL pattern (if any) held.
    has_match_for_page_with_gallery_by_imgs = number_of_imgs_matched_by_a_href >= 1 \
        or number_of_images_by_img_src >= 1
    has_match_for_page_with_gallery_by_imgs = has_match_for_page_with_gallery_by_imgs \
        and can_be_page_with_gallery
    has_match = page_with_gallery_matched_by_url \
        or gallery_page_matched_by_url \
        or has_match_for_page_with_gallery_by_imgs
    page_analysis_results = PageAnalysisResults()
    page_analysis_results[
        'number_of_images_by_a_href'] = number_of_imgs_matched_by_a_href
    page_analysis_results[
        'number_of_images_by_img_src'] = number_of_images_by_img_src
    page_analysis_results['has_match'] = has_match
    page_analysis_results[
        'url_matched_for_gallery_page'] = gallery_page_matched_by_url
    page_analysis_results[
        'url_matched_for_page_with_gallery'] = page_with_gallery_matched_by_url
    return page_analysis_results
def get_telephone(self, response: scrapy.http.response.html.HtmlResponse) -> str:
    """Return the phone number: the <span> following the "Telefon" label."""
    phone = response.xpath(
        '//span[contains(text(), "Telefon")]/following-sibling::span/text()'
    )
    return phone.get()
def get_district(self, response: scrapy.http.response.html.HtmlResponse) -> str:
    """Return the district taken from the vCard "locality" span."""
    locality = response.xpath('//span[@class="locality"]/text()')
    return locality.get()
def get_postal_code(
        self, response: scrapy.http.response.html.HtmlResponse) -> str:
    """Return the postal code from the vCard "postal-code" span."""
    postal = response.xpath('//span[@class="postal-code"]/text()')
    return postal.get()
def get_street(self, response: scrapy.http.response.html.HtmlResponse) -> str:
    """Return the street from the vCard "street-address" span."""
    street = response.xpath('//span[@class="street-address"]/text()')
    return street.get()
def get_description(
        self, response: scrapy.http.response.html.HtmlResponse) -> str:
    """Return the cinema description paragraph from the kinodetail block."""
    paragraph = response.xpath('//div[@class="kinodetail echo"]/p/text()')
    return paragraph.get()
def get_name(self, response: scrapy.http.response.html.HtmlResponse) -> str:
    """Return the cinema name from the page's top-level heading."""
    heading = response.css('h1.top::text')
    return heading.get()
def get_titles(
        self, response: scrapy.http.response.html.HtmlResponse) -> List[str]:
    """Return all movie titles (one per accordion trigger button)."""
    buttons = response.css('button.accordion-trigger::text')
    return buttons.getall()
def get_prices(
        self, response: scrapy.http.response.html.HtmlResponse) -> List[str]:
    """Return raw price/opening-hours text fragments from the infoblock."""
    fragments = response.xpath(
        '//section[@class="infoblock oeffnungszeiten"]/div/*/text()'
    )
    return fragments.getall()