def parse_school(self, response):
    """Scrape one school's detail page, then chain to the personnel page.

    Builds a ``collection`` dict from the contact list and mission
    statement and forwards it via ``request.meta`` to
    ``parse_personal_resources``.
    """
    collection = {'phone_numbers': {}}
    collection['title'] = response.css("#content h2::text").extract_first().strip()
    for entry in response.css(".kontaktliste li"):
        # The label carries a trailing ':'; slice off that final character.
        raw_key = entry.css("b::text").extract_first(default="kein Eintrag:")
        key = raw_key.strip()[:-1]
        values = entry.css("::text").extract()[1:]
        # Some schools list additional phone numbers whose header names
        # the owner instead of saying "Telefon" or similar. The phone
        # icon identifies them, so such entries are gathered under
        # 'phone_numbers' rather than as a top-level field.
        icon = entry.css("img::attr(src)").extract_first()
        if icon == "img/telefon.gif":
            collection['phone_numbers'][key] = ' '.join(values)
        else:
            collection[key] = ' '.join(values).replace('zur Karte', '')
    collection["Leitbild"] = cleanjoin(
        response.css("#quickbar > div:nth-child(3) ::text").extract(), "\n")
    # Continue with the personnel-resources page, carrying the session
    # cookie jar and the partially built record along in the meta dict.
    request = scrapy.Request(
        'https://schuldatenbank.sachsen.de/index.php?id=440',
        meta={'cookiejar': response.meta['cookiejar']},
        callback=self.parse_personal_resources,
        dont_filter=True)
    request.meta['collection'] = collection
    yield request
def parse_detail(self, response):
    """Extract a school's contact data and statistics from its article page.

    Address, name, phone/fax, website, school number, type/status and
    head counts are pulled via CSS selectors and regexes over the
    article text, then yielded as one flat dict.
    """
    # inspect_response(response, self)
    text = response.css("article ::text")
    # The first paragraph holds exactly two text nodes: street and city.
    street, city = response.css("article > p")[0].css("::text").extract()
    collection = {
        'street': street,
        'city': city,
        'name': cleanjoin(response.css('article h1::text').extract(), ""),
        'phone': get_first_or_none(text.re("Telefon: ([0-9 /]+)")),
        'fax': get_first_or_none(text.re("Fax: ([0-9 /]+)")),
        'web': response.css("article a::attr(href)").extract_first(),
        'number': get_first_or_none(text.re("Schulnummer: ([0-9]+)")),
        'school_type': get_first_or_none(text.re("Schulart: (.+)")),
        'type': get_first_or_none(text.re("Rechtlicher Status: (.+)")),
        'teachers': get_first_or_none(text.re("Hauptamtliche Lehrkräfte: ([0-9]+)")),
        'students': get_first_or_none(text.re("Schüler: ([0-9]+)")),
        'url': response.url,
    }
    yield collection
def parse_partners_detail(self, response):
    """Parse one partners detail table, then resume the stashed requests.

    When the stash of pending detail requests is empty, moves on to the
    competitions overview page instead, forwarding the collection.
    """
    meta = response.meta
    stash = meta.get('stash')
    if "5130" in response.url:
        data = []
        for row in response.css("table.ssdb_02 tr"):
            tds = row.css("td")
            # First cell is the label; the rest form the value text.
            label = tds[0].css("::text").extract_first().strip()
            data.append({label: cleanjoin(tds[1:].css("::text").extract())})
        meta['collection']['partners'].append(data)
    else:
        # TODO: This is an Eltern/SV page, parse it differently
        pass
    if len(stash) > 0:
        # Work through the stashed detail requests one at a time,
        # passing the shrinking stash along with each request.
        next_request = stash.pop()
        next_request.meta['stash'] = stash
        yield next_request
    else:
        request = scrapy.Request(
            "https://schuldatenbank.sachsen.de/index.php?id=470",
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.parse_competitions_overview,
            dont_filter=True)
        request.meta['collection'] = meta['collection']
        yield request
def parse_competition_detail(self, response):
    """Collect one competition's result table into the collection.

    Afterwards either resumes the next stashed request or, once the
    stash is exhausted, yields the completed collection.
    """
    meta = response.meta
    stash = meta.get('stash')
    table = response.css("table.ssdb_02")
    # Header cells of the first row become the keys for every data row.
    headers = [t.strip() for t in table.css(" tr:first-child td::text").extract()]
    data = {
        'name': response.css("#content > div:nth-child(3) > b::text").extract_first(),
        'results': [
            {
                headers[col]: cleanjoin(td.css("::text").extract())
                for col, td in enumerate(row.css("td"))
            }
            for row in table.css("tr")[1:]  # slice skips the header row
        ],
    }
    meta['collection']['competitions'].append(data)
    if len(stash) > 0:
        next_request = stash.pop()
        next_request.meta['stash'] = stash
        yield next_request
    else:
        yield meta['collection']
def parse_school(self, response):
    """Build a school record from the contact list and mission statement.

    All extracted strings pass through ``self.fix_data``; the finished
    dict is yielded directly.
    """
    collection = {'phone_numbers': {}}
    title = response.css("#content h2::text").extract_first().strip()
    collection['title'] = self.fix_data(title)
    collection['data_url'] = response.url
    for entry in response.css(".kontaktliste li"):
        # Labels end with ':'; drop that final character after cleanup.
        raw_key = entry.css("b::text").extract_first(default="kein Eintrag:")
        key = self.fix_data(raw_key.strip()[:-1])
        values = [self.fix_data(v) for v in entry.css("::text").extract()[1:]]
        # Additional phone numbers are labelled with the owner's name
        # instead of "Telefon"; the phone icon marks them, so those
        # entries are filed under 'phone_numbers'.
        icon = entry.css("img::attr(src)").extract_first()
        if icon == "img/telefon.gif":
            collection['phone_numbers'][key] = ' '.join(values)
        else:
            collection[key] = ' '.join(values).replace('zur Karte', '')
    collection["Leitbild"] = cleanjoin(
        response.css("#quickbar > div:nth-child(3) ::text").extract(), "\n")
    yield collection
def parse_school(self, response):
    """Parse a school page; follow the student-statistics link if present.

    Without such a link the record is yielded as-is; otherwise a request
    for the student page is yielded with the record in its meta.
    """
    collection = {'phone_numbers': {}}
    collection['title'] = self.fix_data(
        response.css("#content h2::text").extract_first().strip())
    for entry in response.css(".kontaktliste li"):
        # Drop the trailing ':' from the label.
        key = self.fix_data(
            entry.css("b::text").extract_first(default="kein Eintrag:").strip()[:-1])
        values = [self.fix_data(v) for v in entry.css("::text").extract()[1:]]
        # A phone icon marks extra numbers whose header names the owner
        # rather than "Telefon"; collect those under 'phone_numbers'.
        icon = entry.css("img::attr(src)").extract_first()
        if icon == "img/telefon.gif":
            collection['phone_numbers'][key] = ' '.join(values)
        else:
            collection[key] = ' '.join(values).replace('zur Karte', '')
    collection["Leitbild"] = cleanjoin(
        response.css("#quickbar > div:nth-child(3) ::text").extract(), "\n")
    students_url = response.xpath(
        '//*[@id="navi"]/div[2]/ul/li[2]/ul/li[1]/a/@href').get()
    if students_url:
        request = scrapy.Request(
            self.base_url + students_url,
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.parse_students,
            dont_filter=True)
        request.meta['collection'] = collection
        yield request
    else:
        yield collection
def parse_detail(self, response):
    """Read key/value rows from the detail table into a flat dict."""
    collection = {}
    # The first row is a disclaimer header; the slice skips it.
    for row in response.css('tr')[1:]:
        tds = row.css('td')
        # The label cell ends with ':'; strip that last character.
        row_key = cleanjoin(tds[0].css('::text').extract())[:-1]
        row_value = cleanjoin(tds[1].css('::text').extract(), "\n")
        if row_key == 'Schulname':
            # School names may wrap; collapse the newlines to spaces.
            row_value = row_value.replace('\n', ' ')
        collection[row_key] = row_value
    collection['data_url'] = response.url
    yield collection
def parse_details(self, response):
    """Map table rows to key/value pairs; grab the website link too.

    The link is captured from the value cell of the "Name" row and
    stored under 'website'.
    """
    collection = {}
    for tr in response.css("table tr"):
        texts = tr.css("td ::text").extract()
        if len(texts) < 2:
            # Sometimes a row has a key but no value; skip it.
            continue
        key = texts[0][:-1]  # drop the trailing ':'
        collection[key] = cleanjoin(texts[1:], "\n")
        if key == "Name":
            collection['website'] = tr.css(
                "td:nth-child(2) a::attr(href)").extract_first()
    yield collection
def parse_schoollist(self, response):
    """Walk the school-list table and request each school's overview page.

    For every table row, the school type and name are extracted into a
    partial record, and a request for the row's linked detail page is
    yielded with that record attached as ``request.meta['collection']``.
    """
    # Fix: removed dead debugging residue — the useless attribute
    # assignment `response.collection = collection` (the response object
    # is never read again) and the commented-out inspect_response call.
    for tr in response.css('table tr'):
        collection = {
            # The type cell's second text node holds the school type.
            'Schulform': tr.css('td:nth-child(3)::text').extract()[1].strip(),
            'Schulname': cleanjoin(
                tr.css('td:nth-child(2)::text').extract(), join_on=" "),
        }
        url = tr.css('td')[3].css('a::attr(href)').extract_first().strip()
        request = scrapy.Request(self.base_url + url,
                                 callback=self.parse_overview)
        request.meta['collection'] = collection
        yield request