def next_page(self, response: scrapy.http.Response) -> scrapy.Request:
    """
    Goes to next page.
    :param response: response object
    :return: request for next page
    """
    # go to next page
    next_url = response.xpath("//a[@title='下一页']/@href").extract_first()
    if next_url is not None:
        self.log('Next page {}'.format(next_url), level=logging.INFO)
        time.sleep(random.random())
        return response.follow(
            url=next_url,
            callback=self.parse,
            # reuse the current proxy
            meta={'proxy': response.request.meta['proxy']},
            errback=self.handle_failure)
    else:
        # try to build the page by ourself
        arguments = self.decode_url(response.request.url)
        arguments['page'] += 1
        url = self.format_url(arguments)
        self.log('Next page (manually) {}'.format(url), level=logging.INFO)
        return response.follow(
            url=url,
            callback=self.parse,
            # reuse the current proxy
            meta={'proxy': response.request.meta['proxy']},
            errback=self.handle_failure)

def parseCity(self, response: scrapy.http.Response):
    # example: https://www.tripadvisor.in/Attractions-g186338-Activities-London_England.html#FILTERED_LIST
    attractionBoxs = response.css(
        'div.attraction_list.attraction_list_short > div.attraction_element > div > div > div > div > div.listing_title'
    )
    tourSetRegex = ".+([0-9]+).*"
    tourSetRegChecker = re.compile(tourSetRegex)
    for attraction in attractionBoxs:
        pointName = attraction.css('a::text').extract_first()
        if not tourSetRegChecker.match(pointName):
            attractionUrl = response.urljoin(
                attraction.css('a::attr(href)').extract_first())
            response.meta['rank'] += 1
            yield response.follow(url=attractionUrl,
                                  callback=self.parseAttractionsPage,
                                  meta=response.meta)
    nextPageLink = response.css(
        'div.al_border.deckTools.btm > div > div.unified.pagination > a.nav.next.rndBtn.ui_button.primary.taLnk::attr(href)'
    )
    if nextPageLink:
        nextPageLink = response.urljoin(nextPageLink.extract_first())
        self.log("nextpage: " + nextPageLink)
        if response.meta['rank'] < 100:
            yield response.follow(nextPageLink,
                                  callback=self.parseCity,
                                  meta=response.meta)

def parseCityAttractionsListPage(self, response: scrapy.http.Response):
    # example page: https://www.viator.com/Mumbai/d953
    print(
        'PARSING ATTRACTION LIST ####################################################################################'
    )
    print(response.url)
    self.incrementRequestCount()
    hrefs = response.css('div.ptm *> h2 > a')
    for href in hrefs:
        pointURL = href.css('::attr(href)').extract_first().strip()
        pointName = href.css('::text').extract_first().strip()
        yield response.follow(pointURL,
                              callback=self.parseAttractionsPage,
                              meta={
                                  'countryName': response.meta['countryName'],
                                  'cityName': response.meta['cityName'],
                                  'pointName': pointName
                              })
    nextPageLink = response.css(
        'div.ptm > div:nth-child(1) > div:nth-child(2) > p > a:last-child::attr(href)'
    ).extract_first()
    if nextPageLink:
        yield response.follow(nextPageLink,
                              callback=self.parseCityAttractionsListPage,
                              meta=response.meta)

def parse(self, response: scrapy.http.Response):
    # Extract every link to a landing page:
    for title in response.css('.document-row > h3 > a'):
        yield response.follow(title, self.parse_landing_page)

    # Extract the link to the next page of results:
    for next_page in response.css('.next > a'):
        yield response.follow(next_page, self.parse)

def parse_posts_list(self, response: scrapy.http.Response):
    # Fetch the posts
    for href in response.css("#posts a::attr(href)"):
        if href.get().startswith("/p"):
            yield response.follow(href, self.parse_thread)

    # Fetch all pages
    for href in response.css(".pagination a::attr(href)"):
        yield response.follow(href, self.parse_posts_list)

def parseCountryAttractionsListPage(self, response: scrapy.http.Response):
    # example page: https://www.viator.com/Netherlands/d60
    self.incrementRequestCount()
    hrefs = response.css('div.ptm *> h2 > a::attr(href)').extract()
    for href in hrefs:
        yield response.follow(href, callback=self.parseAttractionsPage)
    nextPageLink = response.css(
        'div.ptm > div:nth-child(1) > div:nth-child(2) > p > a:last-child::attr(href)'
    ).extract_first()
    if nextPageLink:
        yield response.follow(
            nextPageLink, callback=self.parseCountryAttractionsListPage)

def parse(self, response: scrapy.http.Response, **kwargs):
    titles = response.xpath("//div[@class='r-ent']")
    for title in titles:
        try:
            url = title.xpath("div[@class='title']/a/@href").get()
            yield response.follow(url, callback=self.parse_content)
        except Exception:
            pass
    next_page = response.xpath(
        "//div[@class='btn-group btn-group-paging']/a[@class='btn wide'][2]/@href"
    ).get()
    if next_page and self.i < self.max_pages:
        self.logger.info(f'follow {next_page}')
        self.i += 1
        yield response.follow(next_page, callback=self.parse)

def parseCountryPage(self, response: scrapy.http.Response):
    # example page: https://www.viator.com/India/d723-ttd
    self.incrementRequestCount()
    breadcrumbs = response.css('div.crumbler *> span::text').extract()
    countryName = breadcrumbs[1].strip()
    countryListing = CountryListing(crawler=self.name,
                                    sourceURL=response.url,
                                    crawlTimestamp=getCurrentTime(),
                                    countryName=countryName)
    yield countryListing.jsonify()

    if skipNonRequired:
        if processName(countryName) not in processedRequiredCountries:
            # do not process this country's cities
            print('Skipping country: ', countryName)
            return

    countryId = response.url.split('/')[-1].split('-')[0][1:]
    cityListingURL = 'https://www.viator.com/pascities.jspa?country={}'.format(
        countryId)
    yield response.follow(cityListingURL,
                          callback=self.parseCountryCities,
                          meta={'countryName': countryName})

def parse(self, response: scrapy.http.Response):
    # example page: https://www.viator.com/Amsterdam/d525-ttd
    countryMenuBox = response.css(
        '#countryMenuBox > div.menu-dropdown-box.small > div > div:nth-child(1)'
    )
    hrefs = countryMenuBox.css('a::attr(durl)').extract()
    for href in hrefs:
        yield response.follow(href, callback=self.parseCountryPage)

def parse_landing_page(self, response: scrapy.http.Response):
    # On a landing page, we can extract all the documents, or infer the JSON link and use that.
    # yield {'title': pub.css('h1 ::text').extract_first().strip()}
    for pub in response.css('.publication'):
        # This is a publication, so let's infer the API link:
        lp_url = list(urlsplit(response.url))
        lp_url[2] = "/api/content%s" % lp_url[2]
        api_json_url = urlunsplit(lp_url)
        yield response.follow(api_json_url, self.parse_content_api_json)

def get_next_vimeo_overview_page(self, response: scrapy.http.Response):
    """
    If there is a "next" button at the bottom of the Vimeo user's overview page,
    grab its url and yield a request for it.
    """
    # next_vimeo_overview_page = response.xpath('//*[@id="pagination"]/ol/li[9]').get()
    next_vimeo_overview_page = response.css(
        '#pagination > ol > li.pagination_next a::attr(href)').get()
    if next_vimeo_overview_page is not None:
        yield response.follow(next_vimeo_overview_page, self.parse)

def parseX(self, response: scrapy.http.Response):
    # https://www.trip.skyscanner.com/bangkok/things-to-do
    hrefs = response.css('div.items_list *> h2 > a::attr(href)').extract()
    for href in hrefs:
        self.log("visiting: " + href)
        response.meta['rank'] += 1
        yield response.follow(href,
                              callback=self.parseAttractionsPage,
                              meta=response.meta)
    nextPageLink = response.css(
        'div.items_list > div:nth-child(2) > ul > li.next.next_page > a::attr(href)'
    ).extract_first()
    if nextPageLink:
        self.log("nextpage: " + nextPageLink)
        if response.meta['rank'] < 100:
            yield response.follow(nextPageLink,
                                  callback=self.parseX,
                                  meta=response.meta)

def parseCountryCities(self, response: scrapy.http.Response):
    # example page: https://www.viator.com/pascities.jspa?country=723
    self.incrementRequestCount()
    hrefs = response.css(
        'div.unit.size-pas-cities *> a::attr(durl)').extract()
    for href in hrefs:
        yield response.follow(href,
                              callback=self.parseCityPage,
                              meta=response.meta)

def parse(self, response: scrapy.http.Response):
    hrefs = response.css('div.tours > a::attr(href)').extract()
    attractionNumber = 1
    for href in hrefs:
        href = response.urljoin(href)
        self.log("visiting: " + href)
        meta = urlToCityAndCountryMapping[response.url]
        meta['rank'] = attractionNumber
        yield response.follow(href,
                              callback=self.parseAttractionsPage,
                              meta=meta)
        attractionNumber += 1

def parse_delivery_method(self, response: scrapy.http.Response):
    data = json.loads(response.body)
    data = data['result']['forceGet']['shipping_methods']['data']
    method_names = {elem['delivery_block_label'] for elem in data}
    il = response.meta.get('item loader')
    if len(method_names) > 1 and Names.PAGE_PICK_UP_LABEL.value in method_names:
        il.add_value('delivery_method', Names.DELIVER_ALL.value)
    elif Names.PAGE_PICK_UP_LABEL.value in method_names:
        il.add_value('delivery_method', Names.DELIVER_TO_STORE.value)
    else:
        il.add_value('delivery_method', Names.DELIVER_TO_HOME.value)
    yield response.follow(url=response.meta[Names.ACTIONS_URL_KEY],
                          callback=self.parse_actions,
                          meta=response.meta)

def parse_main(
    self, response: scrapy.http.Response
) -> Union[Iterator[Issue], scrapy.http.Request]:
    links = (
        response.css("font.hdr b")[-1].xpath("../../../../../../*")[-1]
        .xpath('.//td[@valign="top"]').xpath(".//a[not(@hidden)][@href]"))
    for link in links:
        href = link.attrib["href"]
        if href.endswith(".pdf") or href.endswith(".djvu"):
            yield Issue(file=response.urljoin(href),
                        text=link.css("::text").get())
        else:
            yield response.follow(url=href, callback=self.parse_page)

def parse(self, response: scrapy.http.Response):
    # must always be fired
    venuesQueryURL = 'https://api.tripexpert.com/v1/venues?destination_id={}&api_key={}&limit={}'
    for city in availableCities:
        if processName(city['name']) not in processedRequiredCities:
            if skipNonRequired:
                print('Skipping', city['name'])
                continue
        queryURL = venuesQueryURL.format(city['id'], apiKey, limit)
        yield response.follow(queryURL,
                              callback=self.parseCityVenues,
                              meta={'city_id': city['id']})

def parse_thread(self, response: scrapy.http.Response):
    page = response.url.split("/")[3]  # http://blbla.com/< 3
    folder = os.path.join(os.getcwd(), "downloaded_data", self.userid,
                          "posts")
    os.makedirs(folder, exist_ok=True)
    filename = f"{folder}/{page}.html"
    with open(filename, "wb") as f:
        f.write(response.body)
    self.log("Saved file %s" % filename)

    # Fetch other pages of the same thread
    if self.fetch_full_thread:
        for href in response.css(".pagination a::attr(href)"):
            yield response.follow(href, self.parse_thread)

def parse_m3u8(self, response: scrapy.http.Response):
    # URL of the m3u8 file; used to name the output directory and to build follow-up request URLs
    current_url = response.url
    # If the playlist points to another m3u8 file, follow that one instead
    another_m3u8 = re.search(r'\S+\.m3u8', response.text)
    if another_m3u8:
        yield response.follow(another_m3u8.group(0), callback=self.parse_m3u8)
        return
    # Check whether the stream is encrypted
    match = re.search(r'#EXT-X-KEY:METHOD=AES-128(\S+)', response.text)
    if match:
        # Fetch the AES key
        info = match.group(1)
        key = urljoin(current_url, re.search(r'URI="([^"]+)"', info).group(1))
        self.key = requests.get(url=key, headers={
            'USER_AGENT': USER_AGENT
        }).content
        # Check whether an IV is provided
        match = re.search(r'IV=0x([0-9A-Fa-f]{32})', info)
        self.iv = bytes.fromhex(match.group(1)) if match else None
    else:
        self.key = None
    # Names of the ts segments listed in the m3u8
    self.file_names = re.findall(r'\S+\.ts\S*', response.text)
    # Directory where the ts segments are stored
    self.directory = hashlib.md5(current_url.encode('utf-8')).hexdigest()
    if not os.path.exists(self.directory):
        os.mkdir(self.directory)
    for i, file_name in enumerate(self.file_names):
        # Storage path, named by segment index
        file_path = os.path.join(self.directory, f'{i}.ts')
        # Skip segments that have already been downloaded
        if os.path.exists(file_path):
            self.logger.info(f'{i}.ts already crawled')
            continue
        yield response.follow(file_name,
                              callback=self.parse_ts,
                              meta={'file_path': file_path})

def parse_main(self, response: scrapy.http.Response):
    def fake_request_to_set_default_sorting():
        return response.follow(
            url='https://allo.ua/ru/products/mobile/dir-asc/klass-kommunikator_smartfon/order-price/0',
            callback=self.fake_parser)

    yield fake_request_to_set_default_sorting()

    categories = response.xpath('//a[@class="level-top"]')
    assert len(categories) in range(14, 17)
    for category in categories:
        cat_name = category.xpath('./span/text()').extract_first()
        if cat_name in self.cat_1_exceptions:
            continue
        url = category.xpath('./@href').extract_first().replace(
            'allo.ua/ua/', 'allo.ua/ru/')
        yield response.follow(url, meta={Names.CAT_TREE_KEY: [cat_name]})

def apply_filter(self, response: scrapy.http.Response) -> scrapy.Request:
    """
    Applies the filter to the request.
    :param response: response object
    """
    url = self.format_url(response.request.meta['extra'])
    self.log('Process page {}'.format(url), level=logging.INFO)
    yield response.follow(
        url=url,
        dont_filter=True,
        callback=self.parse,
        meta={'proxy': response.request.meta['proxy']},
        errback=self.handle_failure)

def parseCityPage(self, response: scrapy.http.Response):
    # example page: https://www.viator.com/Lucknow/d23770-ttd
    self.incrementRequestCount()
    breadcrumbs = response.css('div.crumbler *> span::text').extract()
    countryName = breadcrumbs[1]
    if countryName != response.meta['countryName']:
        if countryName is None:
            countryName = response.meta['countryName'].strip()
        else:
            self.log(
                'Country name mismatch.\nExpected: {}\nFound: {}'.format(
                    response.meta['countryName'], countryName))
    if len(breadcrumbs) == 4:
        regionName, cityName = breadcrumbs[2:4]
        cityName = cityName.strip()
        regionName = regionName.strip()
    else:
        # example page: https://www.viator.com/Mumbai/d953-ttd
        regionName, cityName = None, breadcrumbs[2]
        cityName = cityName.strip()
    countryName = countryName.strip()

    cityListing = CityListing(crawler=self.name,
                              sourceURL=response.url,
                              crawlTimestamp=getCurrentTime(),
                              countryName=countryName,
                              cityName=cityName,
                              regionName=regionName)
    yield cityListing.jsonify()

    if skipNonRequired:
        if processName(cityName) not in processedRequiredCities:
            # do not process this city's attractions
            print('Skipping city: ', countryName, cityName)
            return

    attractionsPageURL = response.url[:-4]
    yield response.follow(attractionsPageURL,
                          callback=self.parseCityAttractionsListPage,
                          meta={
                              'countryName': countryName,
                              'cityName': cityName,
                          })

def parse(self, response: scrapy.http.Response):
    news_divs = response.xpath("//div[contains(@class, 'news-info')]")
    if not news_divs:
        # no more news on this page
        return
    has_new_news = False
    for news_div in news_divs:
        item = self._parse_news_info(news_div)
        uri = item.get('url').replace(self.base_url, '')
        if not self.state.get(uri):
            self.state.setdefault(uri, True)
            has_new_news = True
            yield item
    # go to next page
    if has_new_news:
        cur_page_no = int(response.url.split('/')[-1])
        next_url = f"{self.page_url}{cur_page_no+1}"
        yield response.follow(url=next_url,
                              dont_filter=False,
                              callback=self.parse)

def parse(self, response: scrapy.http.Response):
    rows = response.xpath('//table[@class="brd_list_n"]/tbody/tr')
    for row in rows:
        result = {
            '산학연계여부': row.xpath('td[1]/text()').get(),
            '지방청': row.xpath('td[2]/text()').get(),
            '채용유무': row.xpath('td[3]/text()').get(),
        }
        link = row.xpath('th/a/@href').get()
        yield response.follow(link, self.parse_content, meta=result)

    page_info = response.xpath('//div[@class="topics"]').get()
    current_page = int(page_info.split('(')[1].split('/')[0])
    total_page = int(page_info.split('/')[1].split(' ')[0])
    if current_page < total_page:
        yield self.scrap_page(current_page + 1)

def parse(self, response: scrapy.http.Response):
    # Regex used to recognise the ban-notice format:
    # group(1) = user ID, group(2) = ban duration in days, group(3) = reason
    postmatch = re.compile(
        r'\s*ID[::\s]+(.+)\s+帖.*\s+楼.*\s+天[^::]*[::\s]+(.+)\s+原[^::]*[::\s]+(.*)')
    # Find all posts
    posts = response.xpath('//div[@class="dfsj_post mbm"]')
    for post in posts:
        # Post author (the moderator issuing the ban)
        bannerusername = post.xpath(
            'descendant::a[@class="xw1"]/text()').extract_first()
        # Post content
        posttext = ''.join(
            post.xpath(
                'descendant::td[@class="t_f"]/descendant-or-self::text()'
            ).extract())
        postmatched = postmatch.match(posttext)  # regex match
        if postmatched:
            userid = postmatched.group(1).strip('\r')
            duration = postmatched.group(2).strip('\r')
            reason = postmatched.group(3).strip('\r')
            extraction = {
                'bannerusername': bannerusername,
                'userid': userid,
                'duration': duration,
                'reason': reason
            }
            yield extraction
    next_page = response.xpath('//a[@class="bm_h"]/@href').extract_first()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)

def parseCityVenues(self, response: scrapy.http.Response):
    # example page: https://api.tripexpert.com/v1/venues?destination_id=3&api_key=6cb54d22babb25cc64ae730f17455338&limit=100
    self.incrementRequestCount()
    venues = json.loads(response.text)['response']['venues']
    venueIdURL = 'https://api.tripexpert.com/v1/venues/{}?api_key={}'
    for index, venue in enumerate(venues):
        venueType = int(venue['venue_type_id'])
        if 'rank_in_destination' not in venue:
            venueRank = 1 + (index // 3)
        else:
            venueRank = int(venue['rank_in_destination'])
        if venueRank > numForType[venueType]:
            # This venue is ranked too low to be of interest
            continue
        queryURL = venueIdURL.format(venue['id'], apiKey)
        yield response.follow(queryURL,
                              callback=self.parseVenueDetails,
                              meta={
                                  'city_id': response.meta['city_id'],
                                  'venue_id': venue['id']
                              })

def parseAttractionsPage(self, response: scrapy.http.Response):
    # example page: https://www.viator.com/Amsterdam-attractions/Albert-Cuyp-Market/d525-a8126
    print(
        'PARSING ATTRACTION PAGE @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'
    )
    print(response.url)
    self.incrementRequestCount()

    breadcrumbs = response.css('div.crumbler *> span::text').extract()
    if breadcrumbs:
        countryName = breadcrumbs[1].strip()
        cityName = breadcrumbs[-3].strip()  # -2 is the word 'attractions'
        pointName = breadcrumbs[-1].strip()
        # we don't really care about the region once we have the city?
    else:
        countryName = response.meta['countryName']
        cityName = response.meta['cityName']
        pointName = response.meta['pointName']

    data = response.css('div.cms-content')
    description, notes = None, None
    if len(data) > 0:
        description = data[0]
        description = '\n'.join(
            description.css('div::text').extract()).strip()
    if len(data) > 1:
        notes = data[1].css('::text').extract_first()
        if notes:
            notes = notes.strip()

    sideBox = response.css(
        'body > div.page.mtl > div.body > div.main-wide.unitRight > div.page-bg.line.light-border-b > div.unitRight.aside > div > div.mtmm.mhmm > div.line > div'
    )
    address = sideBox.css(
        'meta[itemprop="streetAddress"]::attr(content)').extract_first()
    if address:
        address = address.strip()

    ratingBox = sideBox.css('p[itemprop="aggregateRating"]')
    avgRating, ratingCount = None, None
    if ratingBox:
        bestRating = int(
            ratingBox.css('meta[itemprop="bestRating"]::attr(content)').
            extract_first())
        worstRating = int(
            ratingBox.css('meta[itemprop="worstRating"]::attr(content)').
            extract_first())
        givenRating = float(
            ratingBox.css('meta[itemprop="ratingValue"]::attr(content)').
            extract_first())
        ratingCount = int(
            ratingBox.css(
                'span[itemprop="reviewCount"]::text').extract_first())
        avgRating = scaleRating(givenRating=givenRating,
                                worstRating=worstRating,
                                bestRating=bestRating)

    pointListing = PointListing(crawler=self.name,
                                sourceURL=response.url,
                                crawlTimestamp=getCurrentTime(),
                                countryName=countryName,
                                cityName=cityName,
                                pointName=pointName,
                                description=description,
                                notes=notes,
                                address=address,
                                avgRating=avgRating,
                                ratingCount=ratingCount)
    yield pointListing.jsonify()

    pointImage = response.css(
        'div.img-product > img::attr(src)').extract_first()
    if pointImage:
        pointImage = pointImage.strip()
        yield ImageResource(crawler=self.name,
                            sourceURL=response.url,
                            crawlTimestamp=getCurrentTime(),
                            countryName=countryName,
                            cityName=cityName,
                            pointName=pointName,
                            imageURL=pointImage).jsonify()

    yield response.follow('?subPageType=reviews',
                          callback=self.parseReviewsPage,
                          meta={
                              'countryName': countryName,
                              'cityName': cityName,
                              'pointName': pointName
                          })

def parse(self, response: scrapy.http.Response):
    # We get our soup.
    soup = BeautifulSoup(response.text, 'html.parser')

    # We create an empty dictionary to store all data about this page.
    data = {}
    data['title'] = soup.h1.string
    data['uri'] = response.url
    log.info(f"Processing {data['uri']}")

    try:
        href = soup.select_one(
            "#ctl00_placeHolderMain_linkEmailArticle")["href"]
        data['id'] = re.search(r"id=([0-9]+)", href).group(1)
    except Exception as e:
        log.info(f"Error ({data['uri']}): {e}")
        error.info(f"Error ({data['uri']}): {e}")
        return None

    # Mark as visited
    self.visited.add(data['uri'][data['uri'].find("article.aspx"):])
    self.visited.add(f"article.aspx?id={data['id']}")

    # Basic stuff
    data['abstract'] = soup.select_one(".articleblockconteiner p").text

    # Images
    data['images'] = []
    for img in soup.select("img.mbimg"):
        image = {}
        image["thumbUrl"] = f"http://www.yivoencyclopedia.org{img['src']}"
        href = img.parent["href"]
        image["viewerUrl"] = re.search(r"(http.*)&article", href).group(0)
        caption = img.parent.find_next_sibling("div")
        if caption:
            image["imgDesc"] = caption.text.replace(
                "SEE MEDIA RELATED TO THIS ARTICLE", "").strip()
        data['images'].append(image)

    # Links
    data['links'] = []
    for a in soup.select(
            "#ctl00_placeHolderMain_panelArticleText a[href^='article.aspx/']"):
        link = {}
        link["href"] = f"http://www.yivoencyclopedia.org/{a['href']}"
        link["text"] = a.text.strip()
        if len(link["text"]) > 0:  # Strangely, there are sometimes empty links
            data['links'].append(link)
        # With yield, we can either return a new URL to be crawled
        # or the final data.
        if self.check_queue(link['href']):
            yield response.follow(link["href"])

    # Glossary terms
    data['glossary'] = []
    for span in soup.select(".term"):
        term = span.text.strip()
        if len(term) > 0:  # Strangely, there are sometimes empty spans
            data['glossary'].append(term)

    # Subrecords, i.e., multi-page articles (like Poland)
    data['subrecords'] = []
    isMain = True
    for index, a in enumerate(
            soup.select("#ctl00_placeHolderMain_panelPager a")):
        sr = {}
        sr["href"] = "http://www.yivoencyclopedia.org" + a["href"]
        sr["page"] = a.text.strip()
        if index == 0 and sr["href"] != data['uri']:
            isMain = False
        if not isMain and index == 0:
            data['parent'] = sr["href"]
            if self.check_queue(sr['href']):
                yield response.follow(sr["href"])
        if isMain and index != 0:
            data['subrecords'].append(sr)
            if self.check_queue(sr['href']):
                yield response.follow(sr["href"])

    # Subconcepts, i.e., H2 headings on the same page (not really a concept, but maybe useful)
    data['subconcepts'] = []
    for index, h2 in enumerate(soup.select("h2.entry")):
        sc = h2.text.strip()
        if index == 0 and not isMain:
            data['title'] = f"{sc} ({data['title']})"
            break
        # The following H2 headings are NOT stored as concepts:
        stops = [
            "About this Article", "Suggested Reading",
            "YIVO Archival Resources", "Author", "Translation"
        ]
        if sc in stops:
            break
        data['subconcepts'].append(sc)

    # Next record in alphabet
    next_article = soup.select_one(
        '#ctl00_placeHolderMain_linkNextArticle')
    if next_article:
        data['next_article'] = next_article['href']

    # Here we yield the data of this page.
    yield data

    if next_article and self.check_queue(data['next_article']):
        yield response.follow(data['next_article'])

def parse(self, response: scrapy.http.Response):
    selector_xpath = "//div[@class='mainbody']/div[@class='centent']/ul[position()>1]/li/a/@href"
    for i in response.xpath(selector_xpath).extract():
        yield response.follow(i, callback=self.parse_page)

def parse(self, response: scrapy.http.Response):
    if "Служебная:Вход" in unquote(response.url):
        log.info("Login page, skipping")
        return None

    log.info(f"Processing {response.url}")

    # We get our soup.
    soup = BeautifulSoup(response.text, 'html.parser')

    # We create an empty dictionary to store all data about this page.
    data = {}

    if "AllPages" in response.url or "Все_страницы" in unquote(response.url):
        log.info("Processing index page")
        for a in soup.select("div.mw-allpages-nav a"):
            h = a["href"]
            if self.check_queue(h):
                log.info(f"New index page: {a['href']}")
                yield response.follow(h)
        for a in soup.select(".mw-allpages-chunk li a"):
            h = self.lower_case(a["href"])
            if self.check_queue(h):
                log.info(f"New page: {a['href']}")
                yield response.follow(h)
        return None

    data['uri'] = self.lower_case(response.url)
    log.info(f"Processing {data['uri']}")

    if response.status == 404:
        log.info(f"Page not found: {response.url}")
        return data

    try:
        # "wgArticleId":9907
        data['id'] = re.search(r'"wgArticleId":([0-9]+),',
                               soup.head.get_text()).group(1)
    except Exception as e:
        log.info(f"Error getting ID ({data['uri']}): {e}")
        error.info(f"Error getting ID ({data['uri']}): {e}")
        # return {}

    # Basic stuff
    data['title'] = soup.select_one("h1.firstHeading").text.strip()
    abstract = soup.select_one("#mw-content-text p")
    if abstract:
        data['abstract'] = abstract.get_text().replace("\n", "").strip()
    else:
        error.info(f"No abstract: {response.url}")

    # Links
    data["links"] = []
    for a in soup.select("#mw-content-text p a"):
        link = {}
        h = self.lower_case(a['href'].replace('&action=edit&redlink=1',
                                              '').replace('?title=', '/'))
        link["href"] = h
        link["text"] = a.text.strip()
        if len(link["text"]) > 0:  # Strangely, there are sometimes empty links
            data["links"].append(link)
        if self.check_queue(h):
            yield response.follow(h)

    # Category
    data["categories"] = []
    for a in soup.select("#mw-normal-catlinks ul li a"):
        data["categories"].append(a.text.strip())

    # Here we yield the data of this page.
    yield data