def next_page(self, response: scrapy.http.Response) -> scrapy.Request:
    """
    Goes to the next page.

    :param response: response object
    :return: request for the next page
    """
    # go to the next page
    next_url = response.xpath("//a[@title='下一页']/@href").extract_first()
    if next_url is not None:
        self.log('Next page {}'.format(next_url), level=logging.INFO)
        time.sleep(random.random())
        return response.follow(
            url=next_url,
            callback=self.parse,
            # reuse the current proxy
            meta={'proxy': response.request.meta['proxy']},
            errback=self.handle_failure)
    else:
        # try to build the next-page URL ourselves
        arguments = self.decode_url(response.request.url)
        arguments['page'] += 1
        url = self.format_url(arguments)
        self.log('Next page (manually) {}'.format(url), level=logging.INFO)
        return response.follow(
            url=url,
            callback=self.parse,
            # reuse the current proxy
            meta={'proxy': response.request.meta['proxy']},
            errback=self.handle_failure)
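# The decode_url()/format_url() helpers used above are not shown in this snippet. A minimal
# sketch of what they might look like, assuming the page number lives in a "page" query
# parameter (the helper names come from the snippet; the URL layout and "_base" key are
# hypothetical):
from urllib.parse import urlencode, urlsplit, parse_qs, urlunsplit


def decode_url(self, url: str) -> dict:
    # Split the query string into a flat dict and coerce the page number to int.
    parts = urlsplit(url)
    params = {k: v[0] for k, v in parse_qs(parts.query).items()}
    params['page'] = int(params.get('page', 1))
    # Keep the query-less base URL so format_url() can rebuild the full address.
    params['_base'] = urlunsplit(parts._replace(query=''))
    return params


def format_url(self, arguments: dict) -> str:
    # Rebuild the URL with the (incremented) page number.
    args = dict(arguments)
    base = args.pop('_base')
    return '{}?{}'.format(base, urlencode(args))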
def parse_docs(self, response: scrapy.http.Response):
    pdfs: List[str] = []
    for url in response.css('a::attr(href)'):
        full = response.urljoin(url.extract())
        if full.endswith('.pdf'):
            pdfs.append(full)
    yield {'from': response.url, 'file_urls': pdfs}
def parseCityAttractionsListPage(self, response: scrapy.http.Response):
    # example page: https://www.viator.com/Mumbai/d953
    print(
        'PARSING ATTRACTION LIST ####################################################################################'
    )
    print(response.url)
    self.incrementRequestCount()

    hrefs = response.css('div.ptm *> h2 > a')
    for href in hrefs:
        pointURL = href.css('::attr(href)').extract_first().strip()
        pointName = href.css('::text').extract_first().strip()
        yield response.follow(pointURL,
                              callback=self.parseAttractionsPage,
                              meta={
                                  'countryName': response.meta['countryName'],
                                  'cityName': response.meta['cityName'],
                                  'pointName': pointName
                              })

    nextPageLink = response.css(
        'div.ptm > div:nth-child(1) > div:nth-child(2) > p > a:last-child::attr(href)'
    ).extract_first()
    if nextPageLink:
        yield response.follow(nextPageLink,
                              callback=self.parseCityAttractionsListPage,
                              meta=response.meta)
def parse(self, response: scrapy.http.Response): """ Parses content from a html page response. """ listings = response.xpath('//li[@class="result-row"]') for listing in listings: # Relative matching date = listing.xpath( './/*[@class="result-date"]/@datetime').extract_first() url = listing.xpath( './/a[@class="result-title hdrlnk"]/@href').extract_first() title = listing.xpath( './/a[@class="result-title hdrlnk"]/text()').extract_first() yield scrapy.Request(url, callback=self.parse_listing, meta=dict(date=date, url=url, title=title)) # Move to the next page of data. next_page_url = response.xpath( '//*[@class="button next"]/@href').extract_first() if next_page_url: # url must be absolute. abs_next_page_url = response.urljoin(next_page_url) yield scrapy.Request(url=abs_next_page_url, callback=self.parse)
def parse(self, response: scrapy.http.Response, **kwargs):
    form_data = {
        '__VIEWSTATE': response.xpath('//input[@id="__VIEWSTATE"]/@value').get(),
        '__VIEWSTATEGENERATOR': response.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value').get(),
        '__EVENTVALIDATION': response.xpath('//input[@id="__EVENTVALIDATION"]/@value').get(),
    }

    next_page_btn = response.xpath('//a[contains(@href, "Page$Next")]').get()
    if next_page_btn is not None:
        data = form_data.copy()
        data['__EVENTTARGET'] = 'grdSQL'
        data['__EVENTARGUMENT'] = 'Page$Next'
        yield scrapy.FormRequest(f'{self.base_url}/Events.aspx', formdata=data)

    entries = response.xpath('//table[@id="grdSQL"]//tr[@onmouseover]').getall()
    for i, entry in enumerate(entries):
        data = form_data.copy()
        data['__EVENTTARGET'] = 'grdSQL'
        data['__EVENTARGUMENT'] = f'SysRowSelector${i}'
        yield scrapy.FormRequest(f'{self.base_url}/Events.aspx',
                                 formdata=data,
                                 callback=self.parse_entry,
                                 meta={'row_index': i})
def parse(self, response: scrapy.http.Response):
    findform = response.xpath("//form[@name='form1']")
    form = self.build_form(findform)
    if "kirjaamo" not in form:
        raise ValueError("kirjaamo not found")
    if not isinstance(form["kirjaamo"], list):
        raise ValueError("kirjaamo is not a list")

    method = findform.xpath("./@method").get()
    action = response.urljoin(findform.xpath("./@action").get())

    alist = form["kirjaamo"]
    del form["kirjaamo"]
    for param in alist:
        val = param["value"]
        if val == "":
            continue
        # Copy the base form data so each request gets its own mapping.
        fdata = dict(form)
        fdata["kirjaamo"] = val
        yield scrapy.FormRequest(
            action,
            method=method,
            formdata=fdata,
            meta={
                "name": param["name"],
                "dont_cache": True,
            },
            callback=self.parse_search_result,
        )
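# build_form() is not shown in this snippet. A rough sketch of the assumed behaviour:
# collect the values of <input> fields and, for each <select>, a list of its <option>
# elements, so that form["kirjaamo"] ends up as a list of {"name": ..., "value": ...}
# dicts as the loop above expects (hypothetical helper):
def build_form(self, form_sel) -> dict:
    form = {}
    # Plain inputs map name -> value.
    for inp in form_sel.xpath(".//input[@name]"):
        form[inp.xpath("./@name").get()] = inp.xpath("./@value").get() or ""
    # Selects map name -> list of option dicts, keeping the visible text as "name".
    for sel in form_sel.xpath(".//select[@name]"):
        options = []
        for opt in sel.xpath("./option"):
            options.append({
                "name": (opt.xpath("./text()").get() or "").strip(),
                "value": opt.xpath("./@value").get() or "",
            })
        form[sel.xpath("./@name").get()] = options
    return form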
def parse_subtopic_triangle(self, response: scrapy.http.Response):
    # Gathers all subtopics from https://www.walter-fendt.de/html5/mde/tl/tl_start_de.htm
    triangle_subtopics = response.xpath('/html/body/ul/li/a/@href').getall()
    for subtopic_url in triangle_subtopics:
        subtopic_url = response.urljoin(subtopic_url)
        yield scrapy.Request(url=subtopic_url, callback=self.parse)
def parse(self, response: scrapy.http.Response):
    # Next page link
    next_page = response.xpath(
        "//div[@id='pager_one']/div[@class='subcontainer']/div[contains(@class, 'pageside') and contains(@class, 'pright')]/a/@href"
    ).get()
    if next_page is not None:
        yield scrapy.Request(
            response.urljoin(next_page),
            callback=self.parse,
        )

    for row in response.xpath("//table[@id='remixtable']/tr[@class]"):
        # Get tunes
        addeddate = datetime.strptime(row.xpath("td[1]/text()").get(), "%Y-%m-%d")
        link = response.urljoin(row.xpath("td[2]/a/@href").get())
        title = row.xpath("td[2]/a/text()").get()
        arranger = row.xpath("td[3]/a/text()").get()
        composer = row.xpath("td[4]/text()").get()
        yield scrapy.Request(
            link,
            callback=self.dl_tune,
            meta={
                "tune": Tune(
                    title=title,
                    arranger=arranger,
                    added=addeddate,
                    composer=composer,
                    data=None,
                ),
            },
        )
def parseCountryPage(self, response: scrapy.http.Response):
    # example page: https://www.viator.com/India/d723-ttd
    self.incrementRequestCount()

    breadcrumbs = response.css('div.crumbler *> span::text').extract()
    countryName = breadcrumbs[1].strip()
    countryListing = CountryListing(crawler=self.name,
                                    sourceURL=response.url,
                                    crawlTimestamp=getCurrentTime(),
                                    countryName=countryName)
    yield countryListing.jsonify()

    if skipNonRequired:
        if processName(countryName) not in processedRequiredCountries:
            # do not process this country's cities
            print('Skipping country: ', countryName)
            return

    countryId = response.url.split('/')[-1].split('-')[0][1:]
    cityListingURL = 'https://www.viator.com/pascities.jspa?country={}'.format(countryId)
    yield response.follow(cityListingURL,
                          callback=self.parseCountryCities,
                          meta={'countryName': countryName})
def parse(self, response: scrapy.http.Response): print("Parsing URL: " + response.url) # Call Splash only once per page (that contains multiple XML elements). data = self.getUrlData(response.url) response.meta["rendered_data"] = data # We would use .fromstring(response.text) if the response did not include the XML declaration: # <?xml version="1.0" encoding="utf-8"?> root = etree.XML(response.body) tree = etree.ElementTree(root) # If results are returned. elements = tree.xpath("/root/items/*") if len(elements) > 0: for element in elements: copyResponse = response.copy() element_xml_str = etree.tostring(element, pretty_print=True, encoding="unicode") element_dict = xmltodict.parse(element_xml_str) # Temporary solution for public-only content. # TODO: remove this when licensed content are enabled! if not self.is_public(element_dict["data"]): continue # TODO: It's probably a pointless attribute. # del element_dict["data"]["score"] # Passing the dictionary for easier access to attributes. copyResponse.meta["item"] = element_dict["data"] # In case JSON string representation is preferred: # copyResponse._set_body(json.dumps(copyResponse.meta['item'], indent=1, ensure_ascii=False)) copyResponse._set_body(element_xml_str) if self.hasChanged(copyResponse): yield self.handleEntry(copyResponse) # LomBase.parse() has to be called for every individual instance that needs to be saved to the database. LomBase.parse(self, copyResponse) # TODO: To not stress the Rest APIs. # time.sleep(0.1) # If the number of returned results is equal to the imposed limit, it means that there are more to be returned. if len(elements) == self.limit: self.page += 1 url = self.apiUrl.replace("%start", str(self.page * self.limit)).replace( "%anzahl", str(self.limit)) yield scrapy.Request( url=url, callback=self.parse, headers={ "Accept": "application/xml", "Content-Type": "application/xml", }, )
def post(
    self, response: scrapy.http.Response
) -> Union[Iterator[items.ArticleItem], Iterator[scrapy.Request]]:
    """Get Medium posts.

    Args:
        response (scrapy.http.Response): scrapy response

    Yields:
        items.ArticleItem: ArticleItem object
        scrapy.Request: scrapy request object
    """
    data = response.text.replace('])}while(1);</x>', '', 1)
    obj = json.loads(data)['payload']
    post_record = self.parse_post_item(post=obj)
    yield post_record

    if post_record['comment_count'] > 0:
        post_id = obj['value']['id']
        response.meta['post_id'] = post_id
        response.meta['post_record'] = post_record
        url = f'https://medium.com/_/api/posts/{post_id}/responsesStream'
        yield scrapy.Request(url=url, meta=response.meta, callback=self.comment)
def parse(self, response: scrapy.http.Response): """ Get list of tunes """ u: SplitResult = urlsplit(response.url) q: dict = dict(queryparse(u.query)) for tune in response.xpath( "//div[@id='result']/table/tr/th[@colspan='6']/../../tr[@class]" ): artist = "".join(tune.xpath("./td[2]//text()").getall()).strip() title = "".join(tune.xpath("./td[1]//text()").getall()).strip() link = tune.xpath("./td[1]/a/@href").get().strip() fileformat = "".join( tune.xpath("./td[3]//text()").getall()).strip().lower() # Download tune yield scrapy.Request( response.urljoin(link), callback=self.download_mod, meta={ "tune": { "id": q['view'], "artist": artist, "title": title, "format": fileformat, } }, )
def parse_apollonian_subtopic(self, response: scrapy.http.Response):
    # Gathers variant-URLs to crawl from https://www.walter-fendt.de/html5/mde/apolloniosproblem_de.htm
    apollonios_subtopics = response.xpath('//table/tbody/tr/td/a/@href').getall()
    for apollo_url in apollonios_subtopics:
        apollo_url = response.urljoin(apollo_url)
        yield scrapy.Request(url=apollo_url, callback=self.parse)
def parse(self, response: scrapy.http.Response):
    # example page: https://www.viator.com/Amsterdam/d525-ttd
    countryMenuBox = response.css(
        '#countryMenuBox > div.menu-dropdown-box.small > div > div:nth-child(1)')
    hrefs = countryMenuBox.css('a::attr(durl)').extract()
    for href in hrefs:
        yield response.follow(href, callback=self.parseCountryPage)
def parse_category_overview_for_topics_and_subpages(
        self, response: scrapy.http.Response):
    """
    Crawls an overview page of a "type"-category (e.g. "Hintergrund", "Bilderserie" etc.) for subpages and topics.
    If the overview has subpages, it will recursively yield additional scrapy.Requests to the overview-subpages.
    Afterwards it yields the (10) individual topic_urls (per overview page) to the parse()-method.

    Scrapy Contracts:
    @url https://www.umwelt-im-unterricht.de/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Alessons
    @returns requests 10
    """
    topic_urls_raw: list = response.xpath(
        '//a[@class="internal-link readmore"]/@href').getall()
    for url_ending in topic_urls_raw:
        self.topic_urls.add(response.urljoin(url_ending))

    # If there's a "Letzte"-button in the overview, there are more topic_urls to be gathered than the
    # initially displayed 10 elements.
    last_page_button_url: str = response.xpath(
        '//li[@class="tx-pagebrowse-last last"]/a/@href').get()
    # The string last_page_button_url typically looks like this:
    # "/suche/?tx_solr%5Bfilter%5D%5B0%5D=type%3Amaterials_images&tx_solr%5Bpage%5D=8"
    page_number_regex = re.compile(
        r'(?P<url_with_parameters>.*&tx_solr%5Bpage%5D=)(?P<nr>\d+)')

    # Temporary set used for checking off already visited overview URLs.
    overview_urls_parsed: set = set()
    if last_page_button_url is not None:
        page_number_dict: dict = page_number_regex.search(
            last_page_button_url).groupdict()
        url_without_page_parameter: str = response.urljoin(
            page_number_dict.get('url_with_parameters'))
        last_page_number = int(page_number_dict.get('nr'))
        # The initial url from start_urls already counts as page 1, therefore we're iterating
        # from page 2 to the last page.
        for i in range(2, last_page_number + 1):
            next_overview_subpage_to_crawl = str(url_without_page_parameter + str(i))
            if next_overview_subpage_to_crawl not in self.overview_urls_already_parsed:
                yield scrapy.Request(
                    url=next_overview_subpage_to_crawl,
                    callback=self.parse_category_overview_for_topics_and_subpages)
                overview_urls_parsed.add(next_overview_subpage_to_crawl)
    # Checking off the overview URLs that we just yielded.
    self.overview_urls_already_parsed.update(overview_urls_parsed)

    # Temporary set used for checking off already visited topics.
    parsed_urls: set = set()
    for url in self.topic_urls:
        if url not in self.topic_urls_parsed:
            # Making sure that we don't accidentally crawl individual pages more than once.
            yield scrapy.Request(url=url, callback=self.parse)
            parsed_urls.add(url)
    self.topic_urls_parsed.update(parsed_urls)
def parse_landing_page(self, response: scrapy.http.Response):
    # On a landing page, we can extract all the documents, or infer the JSON link and use that.
    # yield {'title': pub.css('h1 ::text').extract_first().strip()}
    for pub in response.css('.publication'):
        # This is a publication, so let's infer the API link:
        lp_url = list(urlsplit(response.url))
        lp_url[2] = "/api/content%s" % lp_url[2]
        api_json_url = urlunsplit(lp_url)
        yield response.follow(api_json_url, self.parse_content_api_json)
def parse(self, response: scrapy.http.Response):
    for relative_url in response.xpath('//h3/a/@href').extract():
        absolute_url = response.urljoin(relative_url)
        yield scrapy.Request(absolute_url, callback=self.parse_book)

    next_page_url = response.xpath('//*[@class="next"]/a/@href').extract_first()
    # Only follow the pagination link if one exists on this page.
    if next_page_url:
        absolute_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_url)
def parse_posts_list(self, response: scrapy.http.Response):
    # Fetch the posts
    for href in response.css("#posts a::attr(href)"):
        if href.get().startswith("/p"):
            yield response.follow(href, self.parse_thread)

    # Fetch all pages
    for href in response.css(".pagination a::attr(href)"):
        yield response.follow(href, self.parse_posts_list)
def parse(self, response: scrapy.http.Response):
    # Extract every link to a landing page:
    for title in response.css('.document-row > h3 > a'):
        yield response.follow(title, self.parse_landing_page)

    # Extract the link to the next page of results:
    for next_page in response.css('.next > a'):
        yield response.follow(next_page, self.parse)
def parse_listing(self, response: scrapy.http.Response):
    i = {}
    i['url'] = response.url
    i['expire'] = response.xpath('//a[@class="expire"]/span/text()').extract_first()
    i['job-title'] = response.css('span#main-job-title *::text').extract()
    i['main'] = response.css('div#main-lang-block *::text').extract()
    i['job-details'] = response.css('div.jobdetails *::text').extract()
    return i
def parse_page(self, response: scrapy.http.Response):
    image_url = response.css('div#all div.text-center img.img-fluid::attr(src)').get()
    image_url = response.urljoin(image_url)

    image = ImageItem()
    image['comic_id'] = response.meta['comic_id']
    image['vol_id'] = response.meta['vol_id']
    image['page'] = response.meta['page']
    image['url'] = image_url
    yield image
def parse_content(self, response: scrapy.http.Response):
    item = PttItem()
    item['content'] = response.xpath(
        "//div[@id='main-content']/text()").get().replace('\n', '')

    meta = response.xpath("//span[@class='article-meta-value']")
    item['author'] = meta[0].xpath('text()').get()
    item['title'] = meta[2].xpath('text()').get()
    item['date'] = meta[3].xpath('text()').get()
    item['url'] = response.url
    yield item
def get_next_vimeo_overview_page(self, response: scrapy.http.Response):
    """
    If there is a "next"-button at the bottom of the vimeo-user's overview page:
    grabs the url from it and yields it.
    """
    # next_vimeo_overview_page = response.xpath('//*[@id="pagination"]/ol/li[9]').get()
    next_vimeo_overview_page = response.css(
        '#pagination > ol > li.pagination_next a::attr(href)').get()
    if next_vimeo_overview_page is not None:
        yield response.follow(next_vimeo_overview_page, self.parse)
def parse_ad_page(self, resp: scrapy.http.Response):
    '''
    Yields Ad objects if the search phrase is found in the response.
    '''
    ensure_response_200(resp)
    title = resp.xpath('//div[@id = "adTitle"]//text()').extract_first()
    description = ' '.join(
        resp.xpath('//div[@id = "adDescription"]//text()').extract())
    if title and description:
        if self._search_phrase in title or self._search_phrase in description:
            yield Ad(url=resp.url, search_phrase=self._search_phrase)
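# ensure_response_200() is assumed to be a small guard that fails loudly when a page did not
# load correctly; a minimal sketch under that assumption (hypothetical helper, not part of Scrapy):
def ensure_response_200(resp: scrapy.http.Response) -> None:
    # Raise instead of silently yielding items from an error page.
    if resp.status != 200:
        raise ValueError(f'Expected HTTP 200, got {resp.status} for {resp.url}')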
def parseCountryCities(self, response: scrapy.http.Response):
    # example page: https://www.viator.com/pascities.jspa?country=723
    self.incrementRequestCount()

    hrefs = response.css('div.unit.size-pas-cities *> a::attr(durl)').extract()
    for href in hrefs:
        yield response.follow(href,
                              callback=self.parseCityPage,
                              meta=response.meta)
def parse_section_overview(self, response: scrapy.http.Response):
    # Each section (e.g. "Mathematik Teilgebiete") holds a list of individual topic-categories (e.g. "Kreislehre")
    section_urls = response.xpath('/html/body/table/tr/td/a/@href').getall()
    section_urls.sort()
    # print(section_urls)
    # print("Section URLs: ", len(section_urls))
    for url in section_urls:
        current_url = response.urljoin(url)
        yield scrapy.Request(url=current_url, callback=self.parse_topic_overview)
def parse(self, response: scrapy.http.Response, **kwargs):
    next_page_url = response.xpath('//a[@rel="next"]/@href').get()
    if next_page_url is not None:
        yield scrapy.Request(next_page_url)

    entry_urls = response.xpath(
        f'//div[{util.xpath_class(["type-tribe_events"])}]//*[{util.xpath_class(["tribe-events-list-event-title"])}]/a/@href'
    ).getall()
    for url in entry_urls:
        yield scrapy.Request(url, callback=self.parse_entry)
def parse(self, response: scrapy.http.Response, **kwargs):
    next_page_url = response.xpath(f'//li[{xpath_class(["next"])}]/a/@href').get()
    if next_page_url is not None:
        yield scrapy.Request(f'{self.base_url}{next_page_url}')

    entries = response.xpath(f'//article[{xpath_class(["event"])}]').getall()
    for entry in entries:
        yield ResponseItem({'body': entry, 'meta': response.meta})
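# xpath_class() / util.xpath_class() used in the two snippets above is assumed to build an
# XPath predicate matching elements that carry all of the given CSS classes; a sketch of that
# common idiom (hypothetical implementation):
def xpath_class(class_names: list) -> str:
    # contains(concat(' ', normalize-space(@class), ' '), ' foo ') avoids false positives
    # such as @class="foobar" when looking for "foo".
    conditions = [
        f"contains(concat(' ', normalize-space(@class), ' '), ' {name} ')"
        for name in class_names
    ]
    return ' and '.join(conditions)

# Example: f'//li[{xpath_class(["next"])}]/a/@href' expands to
# //li[contains(concat(' ', normalize-space(@class), ' '), ' next ')]/a/@href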
def parse(
    self, response: scrapy.http.Response
) -> typing.Generator[scrapy.Request, None, None]:
    """Find all the cases."""
    for case_url in response.xpath('//table[@class="cases"]/tbody/tr/td/a/@href'):
        url = response.urljoin(case_url.extract())
        yield scrapy.Request(
            url=url,
            callback=self.parse_case,
            dont_filter=True,
        )
def parse_search_result(self, response: scrapy.http.Response):
    tbl = response.xpath(
        "//table[@class='table table-striped table-hover table-bordered']")
    for rowidx, row in enumerate(tbl.xpath("./tr")):
        if rowidx == 0:
            continue
        obj = {}
        for idx, col in enumerate(row.xpath("./td")):
            if idx == 0:
                rawdate = "".join(col.xpath("./text()").getall()).strip()
                rawdate = ' '.join(rawdate.split())
                rem = re.split(r"^(\d+)\s+/(\d+) (\d+)\.(\d+)\.(\d+)$", rawdate)[1:]
                rem.pop()
                vhnum, vhyear, pday, pmonth, pyear = rem
                obj["date"] = f"{vhyear}-{vhnum.zfill(3)}__{pyear}-{pmonth.zfill(2)}-{pday.zfill(2)}"
            elif idx == 1:
                for link in col.xpath("./a"):
                    txt = link.xpath("./text()").get().strip()
                    url = response.urljoin(link.xpath("./@href").get())
                    if txt == '0 kpl':
                        continue
                    if 'title' not in obj:
                        obj["title"] = txt
                        obj["link"] = url
                    else:
                        obj["attach"] = url

        dirpath = os.path.join(self.name)
        if "attach" in obj:
            yield scrapy.Request(
                obj["attach"],
                meta={
                    "name": response.meta["name"],
                    "id": obj["date"],
                },
                callback=self.parse_attachments,
            )
        yield scrapy.Request(
            obj["link"],
            meta={
                "name": response.meta["name"],
                "id": obj["date"],
            },
            callback=self.dl_doc,
        )