def parse_list(self, response: Response):
    # wait for page to load
    # wait for the redirect to finish.
    patent_links = []
    for link in response.xpath("//a[@class='lead-in']/@href").getall():
        patent_links.append(link)
    # for next page
    next_page = response.xpath("//div[@class='nav-previous']/a/@href").get()
    if next_page is not None:
        self.log('process page {}'.format(next_page), level=logging.INFO)
        yield response.follow(
            url=next_page,
            callback=self.parse_list,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
    for p in patent_links:
        name = p.split('/')[-2]
        if os.path.exists(os.path.join(self.work_directory, name + '.json')):
            self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
            continue
        yield response.follow(
            url=p,
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)

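# Note: POOL, self.with_proxy, self.work_directory and self.handle_failure are
# defined elsewhere in this spider's project and are not shown in these
# examples. Below is only a minimal sketch, assuming POOL is a round-robin
# proxy pool and handle_failure simply logs the failed request; the names and
# bodies are illustrative, not the original implementation.
import itertools


class ProxyPool:
    """Hypothetical round-robin pool returning one proxy URL per call."""

    def __init__(self, proxies):
        self._proxies = itertools.cycle(proxies)

    def get(self):
        return next(self._proxies)


POOL = ProxyPool(['http://proxy-1:8080', 'http://proxy-2:8080'])


def handle_failure(self, failure):
    """Hypothetical errback: log the failure so the crawl keeps going."""
    self.logger.error('request failed: %s', repr(failure))
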
def follow_pages(self, response: Response) -> Iterable[Request]:
    yield from (
        response.follow(letter.get(), callback=self.parse)  # type: ignore
        for letter in response.css(".az-keyboard ul li a::attr(href)"))
    yield from (
        response.follow(number.get(), callback=self.parse)  # type: ignore
        for number in response.css("ul.pagination__list li a::attr(href)"))

def parse_list(self, response: Response): # wait for page to load # wait for the redirect to finish. patent_links = [] for link in response.xpath("//h4[@class='result-title']/a"): text = link.xpath("text()").get() url = link.xpath("@href").get() self.log("find technology {}/{}".format(text, url), level=logging.INFO) patent_links.append(url) # for next page current_page, total_page = self.statictics(response) if current_page < total_page: self.log('process page {}'.format(self.page), level=logging.INFO) yield response.follow( url='https://otd.harvard.edu/explore-innovation/technologies/results/P{}/'.format(current_page * 10), callback=self.parse_list, dont_filter=True, meta={'proxy': POOL.get()} if self.with_proxy else {}, errback=self.handle_failure) for p in patent_links: name = self.parse_name_from_url(p) if os.path.exists(os.path.join(self.work_directory, name + '.json')): self.log('{} already parsed and will skip'.format(p), level=logging.INFO) continue yield response.follow( url=p, callback=self.parse, dont_filter=True, meta={'proxy': POOL.get()} if self.with_proxy else {}, errback=self.handle_failure)
def parse_forum(self, response: Response):
    forum_url_query = urllib.parse.urlparse(response.url).query
    forum_id = int(urllib.parse.parse_qs(forum_url_query)['id'][0])
    forum_title = response.xpath('//div[@id="pun-main"]/h1/span/text()').get()
    section_id = self.forums[forum_id]
    yield ForumItem(id=forum_id, title=forum_title, section_id=section_id)
    for topic in response.xpath(
            '//div[@class="forum"]/div[@class="container"]//tbody//tr/td[@class="tcl"]/div[@class="tclcon"]'):
        topic_url = topic.xpath('a/@href').get()
        topic_url_query = urllib.parse.urlparse(topic_url).query
        topic_id = int(urllib.parse.parse_qs(topic_url_query)['id'][0])
        self.topics[topic_id] = forum_id
        yield response.follow(topic_url, self.parse_topic)
    next_page_url = response.xpath('//div[@class="pagelink"]/a[@class="next"]/@href').get()
    if next_page_url:
        yield response.follow(next_page_url, callback=self.parse_forum)

def parse_list(self, response: Response): # wait for page to load # wait for the redirect to finish. patent_links = [] for link in response.xpath( '//*[@id="formTechPub1"]/div/table[2]/tr/td/a'): text = link.xpath("text()").get() url = link.xpath("@href").get() self.log("find technology {}/{}".format(text, url), level=logging.INFO) patent_links.append(url) # for next page total_result = self.statictics(response) self.page += 1 if self.page * self.item_per_page < total_result: self.log('process page {}'.format(self.page), level=logging.INFO) yield response.follow( url=self.next_page_template.format(self.page), callback=self.parse_list, dont_filter=True, meta={'proxy': POOL.get()} if self.with_proxy else {}, errback=self.handle_failure) for p in patent_links: name = self.parse_name_from_url(p) if os.path.exists(os.path.join(self.work_directory, name + '.json')): self.log('{} already parsed and will skip'.format(p), level=logging.INFO) continue yield response.follow( url=p, callback=self.parse, dont_filter=True, meta={'proxy': POOL.get()} if self.with_proxy else {}, errback=self.handle_failure)
def parse_list(self, response: Response): # wait for page to load # wait for the redirect to finish. patent_links = [] for link in response.xpath( "//div[@class='view-content']/div[contains(@class,'views-row')]/div/h3/a" ): text = link.xpath("text()").get() url = link.xpath("@href").get() self.log("find technology {}/{}".format(text, url), level=logging.INFO) patent_links.append(url) # for next page next_page = response.xpath("//li[@class='pager-next']/a/@href").get() if next_page is not None: self.log('process page {}'.format(next_page), level=logging.INFO) yield response.follow( url=next_page, callback=self.parse_list, dont_filter=True, meta={'proxy': POOL.get()} if self.with_proxy else {}, errback=self.handle_failure) for p in patent_links: name = self.parse_name_from_url(p) if os.path.exists(os.path.join(self.work_directory, name + '.json')): self.log('{} already parsed and will skip'.format(p), level=logging.INFO) continue yield response.follow( url=p, callback=self.parse, dont_filter=True, meta={'proxy': POOL.get()} if self.with_proxy else {}, errback=self.handle_failure)
def parse(self, response: Response): for row in response.xpath( '//div[@class="usertable"]//div[@class="container"]//tbody/tr' ): profile_url = row.xpath('td[@class="tcl"]//a/@href').get() yield response.follow(profile_url, callback=self.parse_profile) next_page_url = response.xpath( '//div[@class="pagelink"]/a[@class="next"]/@href').get() if next_page_url: yield response.follow(next_page_url, callback=self.parse)
def parse(self, response: Response, **kwargs):
    if self.url_to_crawl:
        yield response.follow(url=self.url_to_crawl, callback=self.parse_residences)
    else:
        residences = response.xpath(
            "//a[contains(@class,'detalii-proprietate')][contains(.,'Vezi detalii')]/@href").getall()
        residences = list(set(residences))
        yield from response.follow_all(urls=residences, callback=self.parse_residences)
        next_page = response.xpath("//a[@class='inainte butonpaginare']/@href").get()
        if next_page:
            yield response.follow(url=next_page, callback=self.parse)

def parse(self, response: Response): """ Parse symbol directory pages such as https://www.set.or.th/set/commonslookup.do?language=en&country=TH&prefix=A """ symbol_rows = response.css("tr[valign=top]") for row in symbol_rows: comp_profile_page = row.css("td a::attr(href)").get() comp_holders_page = comp_profile_page \ .replace("companyprofile", "companyholder") yield response.follow(comp_profile_page, callback=self.parse_comp_profile) yield response.follow(comp_holders_page, callback=self.parse_comp_holders)
def parse(self, response: Response) -> Generator[Request, None, None]: """ Parse whoscored statistic page to get all tournaments url :param response: :type response: Response :return: :rtype: Generator[Request, None, None] # Scrapy check - because of settings missing, use Premier League # (England) only for test purpose @url https://www.whoscored.com/Statistics @returns requests 1 """ js_script: str = response.css( "#layout-wrapper > script::text").extract_first() tournaments: str = P_TOUR.search(js_script).group("tournaments") ctx = MiniRacer() for region in ctx.eval(tournaments): for tournament in filter(lambda x: x["name"], region["tournaments"]): if (region["id"], tournament["id"]) in self.settings.get( "REGIONS", {(252, 2)} # pylint: disable=bad-continuation ): # England, Premier League (as default) yield response.follow( tournament["url"], callback=self.parse_tournaments, meta={"waitForSelector": "#layout-content-wrapper"}, )
def parse(self, response: Response) -> Iterable[Union[Request, Mapping]]:
    yield from self.follow_pages(response)
    for recipe_url in response.css("a.promo::attr(href)"):
        yield response.follow(recipe_url.get(), callback=self.parse)
    recipe = response.css("div.recipe-main-info")
    if recipe:
        ingredients = [
            self._get_ingredient(response, ingredient)
            for ingredient in recipe.css("li.recipe-ingredients__list-item")
        ]
        if all(ingredient["url"] is not None for ingredient in ingredients):
            chef_name_parts = recipe.css(".chef__name *::text").getall()
            chef_name = chef_name_parts[-1] if len(chef_name_parts) > 0 else None
            image_urls = recipe.css(".recipe-media__image img::attr(src)").getall()
            yield {
                "title": recipe.css("h1::text").get(),
                "url": response.url,
                "chef_name": chef_name,
                "ingredients": ingredients,
                "image_urls": image_urls,
            }

def parse_list(self, response: Response):
    # wait for page to load
    # wait for the redirect to finish.
    patent_links = []
    if os.path.exists(os.path.join(self.work_directory, 'links.json')):
        patent_links = json.load(open(os.path.join(self.work_directory, 'links.json'), 'r'))
    else:
        # the id of product is provided in the <script></script>
        for code in response.xpath("//script").getall():
            if 'id_list' in code:
                ids = re.findall(r'[0-9]+', re.findall(r'\[[0-9,]+\]', code)[0])
                patent_links = [response.url + '/public/project/{}'.format(patentId) for patentId in ids]
        with open(os.path.join(self.work_directory, 'links.json'), 'w') as fo:
            json.dump(patent_links, fo)
    for p in patent_links:
        name = p.split('/')[-1]
        if os.path.exists(os.path.join(self.work_directory, name + '.json')):
            self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
            continue
        yield response.follow(
            url=p,
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)

def parse_team(self, response: Response) -> Generator[Request, None, None]:
    """
    :param response:
    :type response: Response
    :return:
    :rtype: Generator[Request, None, None]

    @url https://www.whoscored.com/Teams/167/Fixtures/England-Manchester-City
    @returns requests 0
    """
    js_script: str = response.xpath(
        '//*[@id="layout-content-wrapper"]/div[2]/script[3]').extract_first()
    ctx = MiniRacer()
    fixture: List
    for fixture in ctx.eval(P_TEAM_FIXTURES.search(js_script).group("fixtures")):
        fixture_: Match = Match(*fixture)
        yield response.follow(
            url="https://www.whoscored.com/Matches/{id}/".format(id=fixture_.id),
            callback=self.parse_match,
        )

def parse_list(self, response: Response): # wait for page to load # wait for the redirect to finish. patent_links = [] for link in response.xpath("//ul[@id='tech-licensing']/li/a"): text = link.xpath("text()").get() url = link.xpath("@href").get() if url is None: continue self.log("find technology {}/{}".format(text, url), level=logging.INFO) patent_links.append(url) for p in patent_links: name = self.parse_name_from_url(p) if os.path.exists(os.path.join(self.work_directory, name + '.json')): self.log('{} already parsed and will skip'.format(p), level=logging.INFO) continue yield response.follow( url=p, callback=self.parse, dont_filter=True, meta={'proxy': POOL.get()} if self.with_proxy else {}, errback=self.handle_failure)
def parse(self, response: Response):
    # Parse politics pages
    if self.is_politics_page(response.url):
        yield self.parse_politics_page(response)
    for href in response.css('a::attr(href)'):
        if self.is_in_domain(href.get(), response.url):
            yield response.follow(href, self.parse)

def parse(self, response: Response) -> Generator[Generator, None, None]:
    links_declassified = response.xpath(
        '//a[starts-with(@href,"collection") and (parent::h3 | parent::h2)]/@href').getall()
    for link in links_declassified:
        yield response.follow(link, callback=self._document,
                              cb_kwargs={'url': response.urljoin(link)})

def parse(self, response: Response):
    if self.is_mobile(response.url):
        # Parse mobile politics page
        yield self.parse_politics_page(response)
    else:
        # Go to mobile page instead
        if self.is_politics_page(response.url):
            yield response.follow(response.url.replace('www', 'm', 1), self.parse)
    for href in response.css('a::attr(href)'):
        absolute_url = urljoin(response.url, href.get())  # Make relative links absolute
        if self.is_in_domain(href.get(), response.url):
            # Remove parameters
            o: SplitResult = urlsplit(absolute_url)
            base_href = urlunsplit((o.scheme, o.netloc, o.path, '', ''))
            yield response.follow(base_href, self.parse)

def parse(self, response: Response):
    yield from response.follow_all(
        xpath='//*[starts-with(@id, "item_")]/div[1]/a/@href',
        callback=self.parse_item,
    )
    link = response.xpath(
        '//*[@id="navigation-bar-bottom"]/div[2]/ul/'
        'li[contains(@class, "next-page")]/a/@href').get()
    yield response.follow(link, callback=self.parse)

def parse_list(self, response: Response): # wait for page to load # wait for the redirect to finish. patent_links = [] for row in response.xpath( "//div[@id='nouvant-portfolio-content']/div[@class='technology']" ): title = row.xpath("h2/a/text()").get() link = row.xpath("h2/a/@href").get() abstract = row.xpath("p/span/text()").get() self.log('found patent {}'.format(title), level=logging.INFO) patent_links.append({ 'title': title, 'link': link, 'abstract': abstract }) statistics = self.statistics(response) self.log('found {}/{} patents'.format(statistics['end'], statistics['total']), level=logging.INFO) if statistics['end'] < statistics['total']: yield response.follow( url='/technologies?limit=50&offset={}&query='.format( statistics['end']), callback=self.parse_list, dont_filter=True, meta={'proxy': POOL.get()} if self.with_proxy else {}, errback=self.handle_failure) for p in patent_links: name = p['link'].split('/')[-1] if os.path.exists(os.path.join(self.work_directory, name + '.json')): self.log('{} already parsed and will skip'.format(p['link']), level=logging.INFO) continue yield response.follow( url=p['link'], callback=self.parse, dont_filter=True, meta={'proxy': POOL.get()} if self.with_proxy else {}, errback=self.handle_failure)
def parse(self, response: Response):
    XPATH_URL = "//body//div[@class='childinfo']//div//div[*]//a[1]/@href"
    urls = response.xpath(XPATH_URL).getall()
    for i in range(len(urls)):
        yield response.follow(
            urls[i],
            callback=self.parse_detail,
            meta={"n": (response.meta["page"] - 1) * self.VIDEOS_PER_PAGE + i + 1})

def parse_playlist_items(self, response: Response):
    body = json.loads(response.body)
    assert body["kind"] == "youtube#playlistItemListResponse"
    ids = [item["snippet"]["resourceId"]["videoId"] for item in body["items"]]
    yield self.request_videos(ids, response.meta)
    if "nextPageToken" in body:
        request_url = YoutubeSpider.update_url_query(
            response.url, {"pageToken": body["nextPageToken"]})
        yield response.follow(
            request_url, meta=response.meta, callback=self.parse_playlist_items)

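# YoutubeSpider.update_url_query and request_videos above are helpers of that
# spider and are not shown here. A minimal sketch of update_url_query, assuming
# it simply merges extra parameters into the URL's query string (illustrative,
# not the original implementation; inside the spider it would be a @staticmethod):
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit


def update_url_query(url: str, params: dict) -> str:
    """Return url with the given query parameters added or replaced."""
    parts = urlsplit(url)
    query = dict(parse_qsl(parts.query))
    query.update(params)
    return urlunsplit(
        (parts.scheme, parts.netloc, parts.path, urlencode(query), parts.fragment))


# e.g. update_url_query("https://example.com/api?page=1", {"pageToken": "abc"})
# -> "https://example.com/api?page=1&pageToken=abc"
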
def parse_category(self, response: Response):
    # with javascript it would be //div[@class='split-taxonomy-4']/ul/li/a/@href
    for row in response.xpath(
            "//section[@id='block-taxonomy-menu-block-1']/ul/li/a/@href").getall():
        self.log('find category {}'.format(row), level=logging.INFO)
        yield response.follow(
            url=row,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            callback=self.parse_list,
            errback=self.handle_failure)

def parse_list(self, response: Response): # wait for page to load # wait for the redirect to finish. patent_links = [] for row in response.xpath( "//section[@id='block-system-main']/div[@class='node node-technology node-teaser clearfix']/h2/a" ): name = row.xpath("text()").get() link = row.xpath("@href").get() patent_links.append({'name': name, 'link': link}) self.log('found patents {}'.format(name), level=logging.INFO) if response.xpath("//li[@class='pager-last']/a/@href").get() is not None and\ response.url != response.xpath("//li[@class='pager-last']/a/@href").get(): # have next page if '?page=' in response.url: elements = response.url.split("=") page = (int(elements[-1]) + 1) self.log('go to page {}'.format(page), level=logging.INFO) yield response.follow( url='='.join(elements[:-1]) + '={}'.format(page), dont_filter=True, meta={'proxy': POOL.get()} if self.with_proxy else {}, callback=self.parse_list, errback=self.handle_failure) else: self.log('go to page 2', level=logging.INFO) yield response.follow( url=response.url + '?page=1', dont_filter=True, meta={'proxy': POOL.get()} if self.with_proxy else {}, callback=self.parse_list, errback=self.handle_failure) for p in patent_links: yield response.follow( url=p['link'], dont_filter=True, meta={'proxy': POOL.get()} if self.with_proxy else {}, callback=self.parse, errback=self.handle_failure)
def parse(self, response: Response):
    for section in response.xpath('//div[@id="pun-main"]/div[@class="category"]'):
        section_id = int(section.xpath('@id').get()[len('pun-category'):])
        section_title = section.xpath('h2/span/text()').get()
        yield SectionItem(id=section_id, title=section_title)
        for forum in section.xpath('div[@class="container"]//div[@class="intd"]//h3'):
            forum_url = forum.xpath('a/@href').get()
            forum_url_query = urllib.parse.urlparse(forum_url).query
            forum_id = int(urllib.parse.parse_qs(forum_url_query)['id'][0])
            self.forums[forum_id] = section_id
            yield response.follow(forum_url, self.parse_forum)

def parse_book_contents(self, response: Response):
    # get book name
    book = response.css('div.book-title h1').extract_first()
    # get version titles
    ver_titles = []
    for ver_title in response.css('div.chapter h4'):
        ver_titles.append(ver_title.css('::text').get())
    # get version chapter lists
    branchs = response.css('div.chapter div.chapter-list')
    for ver_title, ver_chapters in zip(ver_titles, branchs):
        chapter_no = 0
        for sub_chapter_list in ver_chapters.css('ul'):
            # the latest chapter appears first, so reverse to get
            # the chapter list in increasing order
            for chapter in reversed(sub_chapter_list.css('li a')):
                # <a href="/comic/xxxx/xxxxxx.html" title="第xx回" ...>
                #   <span>第xx回<i>xxp</i></span>
                # </a>
                chapter_no += 1
                chapter_url = chapter.css('::attr(href)').extract_first()
                title, pages = chapter.css('::text').extract()
                pages = int(pages[:-1])
                chapter_item = MangaChapterItem({
                    'book': book,
                    'version': ver_title,
                    'chapter_no': chapter_no,
                    'title': title,
                    'pages': pages,
                    'chapter_url': chapter_url,
                    'store_dir': self.out_dir
                })
                yield chapter_item
                # request for each page of chapter
                for page in range(1, pages + 1):
                    page_item = {'chapter': dict(chapter_item), 'page_no': page}
                    page_url = '%s#%d' % (chapter_url, page)
                    chapter_request = response.follow(
                        page_url, partial(self.parse_book_page, page_item))
                    yield chapter_request

def parse(self, response: Response):
    visited_links = set()
    for link_element in response.css("li > a"):
        link = link_element.xpath("@href").extract_first()
        if link in visited_links:
            continue
        visited_links.add(link)
        if "prologue" in link:
            self.book_count += 1
            os.makedirs(
                os.path.dirname(f"chapters/Book {self.book_count}/blah"),
                exist_ok=True)
            os.makedirs(
                os.path.dirname(f"sources/Book {self.book_count}/blah"),
                exist_ok=True)
        if link.startswith("https://practicalguidetoevil.wordpress.com/20"):  # 2015, 2016...
            request = response.follow(link_element, scrape_chapter)
            request.meta['link'] = link
            request.meta['book_num'] = self.book_count
            yield request

def parse_topic(self, response: Response):
    topic_url_query = urllib.parse.urlparse(response.url).query
    topic_id = int(urllib.parse.parse_qs(topic_url_query)['id'][0])
    forum_id = self.topics[topic_id]
    topic_title = response.xpath('//div[@id="pun-main"]/h1/span/text()').get()
    yield TopicItem(id=topic_id, title=topic_title, forum_id=forum_id)
    for post in response.xpath('//div[@class="topic"]/div[contains(@class, "post")]'):
        post_id = int(post.xpath('@id').get()[1:])
        post_date = int(post.xpath('@data-posted').get())
        post_number = int(post.xpath('h3/span/strong/text()').get())
        post_author_link = post.xpath(
            'div[@class="container"]/div[@class="post-author"]/ul/li[@class="pa-author"]/a/@href').get()
        post_author_link_query = urllib.parse.urlparse(post_author_link).query
        post_author = int(urllib.parse.parse_qs(post_author_link_query)['id'][0])
        post_text_elements = post.xpath(
            'div[@class="container"]/div[@class="post-body"]/div[@class="post-box"]/div[@class="post-content"]/*'
        ).getall()
        post_text = ''.join(post_text_elements)
        yield PostItem(id=post_id, topic=topic_id, number=post_number,
                       date=post_date, author=post_author, text=post_text)
    next_page_url = response.xpath('//div[@class="pagelink"]/a[@class="next"]/@href').get()
    if next_page_url:
        yield response.follow(next_page_url, callback=self.parse_topic)

def parse(self, response: Response):
    data = {
        name: response.css(selector).extract_first()
        for name, selector in self.state.extract_fields.items()
    }
    # Follow next links
    if self.state.follow_next:
        rel_next_url = response.css(
            'link[rel="next"]::attr(href), a[rel="next"]::attr(href)').extract_first()
        if rel_next_url is not None:
            data['rel_next_url'] = rel_next_url
            yield response.follow(rel_next_url, callback=self.parse)
    # Strip cacheserver from the url if possible
    url = response.url[len(self.state.cacheserver_url):].lstrip('/')
    url = urllib.parse.urlparse(url)
    url = urllib.parse.urlunparse(('', '', url.path, url.params, url.query, ''))
    # Build page entity for dashboard
    cached = bytes_to_str(response.headers.get('Rendertron-Cached', None))
    cached_at = bytes_to_str(response.headers.get('Rendertron-Cached-At', None))
    yield {
        'address': url,
        'content_type': bytes_to_str(response.headers.get('Content-Type', None)),
        'status_code': response.status,
        'cache_status': 'cached' if cached == '1' or response.status == 200 else 'not-cached',
        'cached_at': cached_at,
        'extract_fields': data
    }

def parse(self, response: Response, **cb_kwargs):
    selection = response.css('.vipped-apartments~ .items_list > .items-i')
    # selection = response.css('.items-i >.item_link')
    for bina_listing in selection:
        yield ListingItem({
            'name': bina_listing.css('.items-i .card_params .location::text').get(),
            'url': bina_listing.css('.item_link').attrib['href'],
        })
    next_url_selector = response.css(".next a")
    if len(next_url_selector) <= 0:
        return
    next_url_href = next_url_selector.attrib["href"]
    next_page = int(next_url_href.split('=')[1])
    if not self.crawling_limit_reached(next_page):
        self.logger.info("Next URL: %s", next_url_href)
        yield response.follow(next_url_href, callback=self.parse)

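# crawling_limit_reached above is a helper of this spider and is not shown; a
# plausible sketch (assumed, not the original implementation), comparing the
# next page number against a hypothetical self.max_pages ceiling:
def crawling_limit_reached(self, next_page: int) -> bool:
    """Return True once the next page number exceeds the configured limit."""
    return self.max_pages is not None and next_page > self.max_pages
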
def parse_tournaments(self, response: Response) -> Generator[Request, None, None]:
    """
    TODO: there are two methods:
    1. go to seasons, and fetch all fixtures in one season
    2. go to team statistics, and fetch all games for each team

    :param response:
    :type response: Response
    :return:
    :rtype: Generator[Request, None, None]

    @url https://www.whoscored.com/Regions/252/Tournaments/2/England-Premier-League
    @returns requests 0
    """
    # This is to go for method 1
    # for season in response.css("#seasons option"):
    #     url = season.css("option::attr(value)").extract_first()
    #     yield response.follow(url, callback=self.parse_season)

    # This is to go for method 2
    js_script: str = response.xpath(
        '//*[@id="layout-content-wrapper"]/div[2]/script[4]').extract_first()
    ctx = MiniRacer()
    team: List
    for team in ctx.eval(P_TEAM.search(js_script).group("history")):
        team_: Team = Team(*team)
        yield response.follow(
            url="https://www.whoscored.com/Teams/{id}/Fixtures/".format(id=team_.id),
            callback=self.parse_team,
            meta={"waitForSelector": "layout-content-wrapper"},
        )
        break