def parse_school(self, response):
    """Scrape one school's detail page, then chain to the personnel page.

    Builds a ``collection`` dict from the contact list and mission
    statement and forwards it via ``request.meta`` to
    ``parse_personal_resources``.
    """
    collection = {'phone_numbers': {}}
    collection['title'] = response.css("#content h2::text").extract_first().strip()
    for entry in response.css(".kontaktliste li"):
        # The label carries a trailing ':'; slice off that final character.
        raw_key = entry.css("b::text").extract_first(default="kein Eintrag:")
        key = raw_key.strip()[:-1]
        values = entry.css("::text").extract()[1:]
        # Some schools list additional phone numbers whose header names
        # the owner instead of saying "Telefon" or similar. The phone
        # icon identifies them, so such entries are gathered under
        # 'phone_numbers' rather than as a top-level field.
        icon = entry.css("img::attr(src)").extract_first()
        if icon == "img/telefon.gif":
            collection['phone_numbers'][key] = ' '.join(values)
        else:
            collection[key] = ' '.join(values).replace('zur Karte', '')
    collection["Leitbild"] = cleanjoin(
        response.css("#quickbar > div:nth-child(3) ::text").extract(), "\n")
    # Continue with the personnel-resources page, carrying the session
    # cookie jar and the partially built record along in the meta dict.
    request = scrapy.Request(
        'https://schuldatenbank.sachsen.de/index.php?id=440',
        meta={'cookiejar': response.meta['cookiejar']},
        callback=self.parse_personal_resources,
        dont_filter=True)
    request.meta['collection'] = collection
    yield request
def parse_detail(self, response):
    """Extract a school's contact data and statistics from its article page.

    Address, name, phone/fax, website, school number, type/status and
    head counts are pulled via CSS selectors and regexes over the
    article text, then yielded as one flat dict.
    """
    # inspect_response(response, self)
    text = response.css("article ::text")
    # The first paragraph holds exactly two text nodes: street and city.
    street, city = response.css("article > p")[0].css("::text").extract()
    collection = {
        'street': street,
        'city': city,
        'name': cleanjoin(response.css('article h1::text').extract(), ""),
        'phone': get_first_or_none(text.re("Telefon: ([0-9 /]+)")),
        'fax': get_first_or_none(text.re("Fax: ([0-9 /]+)")),
        'web': response.css("article a::attr(href)").extract_first(),
        'number': get_first_or_none(text.re("Schulnummer: ([0-9]+)")),
        'school_type': get_first_or_none(text.re("Schulart: (.+)")),
        'type': get_first_or_none(text.re("Rechtlicher Status: (.+)")),
        'teachers': get_first_or_none(text.re("Hauptamtliche Lehrkräfte: ([0-9]+)")),
        'students': get_first_or_none(text.re("Schüler: ([0-9]+)")),
        'url': response.url,
    }
    yield collection
def parse_partners_detail(self, response):
    """Parse one partners detail table, then resume the stashed requests.

    When the stash of pending detail requests is empty, moves on to the
    competitions overview page instead, forwarding the collection.
    """
    meta = response.meta
    stash = meta.get('stash')
    if "5130" in response.url:
        data = []
        for row in response.css("table.ssdb_02 tr"):
            tds = row.css("td")
            # First cell is the label; the rest form the value text.
            label = tds[0].css("::text").extract_first().strip()
            data.append({label: cleanjoin(tds[1:].css("::text").extract())})
        meta['collection']['partners'].append(data)
    else:
        # TODO: This is an Eltern/SV page, parse it differently
        pass
    if len(stash) > 0:
        # Work through the stashed detail requests one at a time,
        # passing the shrinking stash along with each request.
        next_request = stash.pop()
        next_request.meta['stash'] = stash
        yield next_request
    else:
        request = scrapy.Request(
            "https://schuldatenbank.sachsen.de/index.php?id=470",
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.parse_competitions_overview,
            dont_filter=True)
        request.meta['collection'] = meta['collection']
        yield request
def parse_competition_detail(self, response):
    """Collect one competition's result table into the collection.

    Afterwards either resumes the next stashed request or, once the
    stash is exhausted, yields the completed collection.
    """
    meta = response.meta
    stash = meta.get('stash')
    table = response.css("table.ssdb_02")
    # Header cells of the first row become the keys for every data row.
    headers = [t.strip() for t in table.css(" tr:first-child td::text").extract()]
    data = {
        'name': response.css("#content > div:nth-child(3) > b::text").extract_first(),
        'results': [
            {
                headers[col]: cleanjoin(td.css("::text").extract())
                for col, td in enumerate(row.css("td"))
            }
            for row in table.css("tr")[1:]  # slice skips the header row
        ],
    }
    meta['collection']['competitions'].append(data)
    if len(stash) > 0:
        next_request = stash.pop()
        next_request.meta['stash'] = stash
        yield next_request
    else:
        yield meta['collection']
def parse_school(self, response):
    """Build a school record from the contact list and mission statement.

    All extracted strings pass through ``self.fix_data``; the finished
    dict is yielded directly.
    """
    collection = {'phone_numbers': {}}
    title = response.css("#content h2::text").extract_first().strip()
    collection['title'] = self.fix_data(title)
    collection['data_url'] = response.url
    for entry in response.css(".kontaktliste li"):
        # Labels end with ':'; drop that final character after cleanup.
        raw_key = entry.css("b::text").extract_first(default="kein Eintrag:")
        key = self.fix_data(raw_key.strip()[:-1])
        values = [self.fix_data(v) for v in entry.css("::text").extract()[1:]]
        # Additional phone numbers are labelled with the owner's name
        # instead of "Telefon"; the phone icon marks them, so those
        # entries are filed under 'phone_numbers'.
        icon = entry.css("img::attr(src)").extract_first()
        if icon == "img/telefon.gif":
            collection['phone_numbers'][key] = ' '.join(values)
        else:
            collection[key] = ' '.join(values).replace('zur Karte', '')
    collection["Leitbild"] = cleanjoin(
        response.css("#quickbar > div:nth-child(3) ::text").extract(), "\n")
    yield collection
def parse_school(self, response):
    """Parse a school page; follow the student-statistics link if present.

    Without such a link the record is yielded as-is; otherwise a request
    for the student page is yielded with the record in its meta.
    """
    collection = {'phone_numbers': {}}
    collection['title'] = self.fix_data(
        response.css("#content h2::text").extract_first().strip())
    for entry in response.css(".kontaktliste li"):
        # Drop the trailing ':' from the label.
        key = self.fix_data(
            entry.css("b::text").extract_first(default="kein Eintrag:").strip()[:-1])
        values = [self.fix_data(v) for v in entry.css("::text").extract()[1:]]
        # A phone icon marks extra numbers whose header names the owner
        # rather than "Telefon"; collect those under 'phone_numbers'.
        icon = entry.css("img::attr(src)").extract_first()
        if icon == "img/telefon.gif":
            collection['phone_numbers'][key] = ' '.join(values)
        else:
            collection[key] = ' '.join(values).replace('zur Karte', '')
    collection["Leitbild"] = cleanjoin(
        response.css("#quickbar > div:nth-child(3) ::text").extract(), "\n")
    students_url = response.xpath(
        '//*[@id="navi"]/div[2]/ul/li[2]/ul/li[1]/a/@href').get()
    if students_url:
        request = scrapy.Request(
            self.base_url + students_url,
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.parse_students,
            dont_filter=True)
        request.meta['collection'] = collection
        yield request
    else:
        yield collection
def parse_detail(self, response):
    """Read key/value rows from the detail table into a flat dict."""
    collection = {}
    # The first row is a disclaimer header; the slice skips it.
    for row in response.css('tr')[1:]:
        tds = row.css('td')
        # The label cell ends with ':'; strip that last character.
        row_key = cleanjoin(tds[0].css('::text').extract())[:-1]
        row_value = cleanjoin(tds[1].css('::text').extract(), "\n")
        if row_key == 'Schulname':
            # School names may wrap; collapse the newlines to spaces.
            row_value = row_value.replace('\n', ' ')
        collection[row_key] = row_value
    collection['data_url'] = response.url
    yield collection
def parse_details(self, response):
    """Map table rows to key/value pairs; grab the website link too.

    The link is captured from the value cell of the "Name" row and
    stored under 'website'.
    """
    collection = {}
    for tr in response.css("table tr"):
        texts = tr.css("td ::text").extract()
        if len(texts) < 2:
            # Sometimes a row has a key but no value; skip it.
            continue
        key = texts[0][:-1]  # drop the trailing ':'
        collection[key] = cleanjoin(texts[1:], "\n")
        if key == "Name":
            collection['website'] = tr.css(
                "td:nth-child(2) a::attr(href)").extract_first()
    yield collection
def parse_schoollist(self, response):
    """Walk the school-list table and request each school's overview page.

    For every table row, the school type and name are extracted into a
    partial record, and a request for the row's linked detail page is
    yielded with that record attached as ``request.meta['collection']``.
    """
    # Fix: removed dead debugging residue — the useless attribute
    # assignment `response.collection = collection` (the response object
    # is never read again) and the commented-out inspect_response call.
    for tr in response.css('table tr'):
        collection = {
            # The type cell's second text node holds the school type.
            'Schulform': tr.css('td:nth-child(3)::text').extract()[1].strip(),
            'Schulname': cleanjoin(
                tr.css('td:nth-child(2)::text').extract(), join_on=" "),
        }
        url = tr.css('td')[3].css('a::attr(href)').extract_first().strip()
        request = scrapy.Request(self.base_url + url,
                                 callback=self.parse_overview)
        request.meta['collection'] = collection
        yield request