def normalize(item: Item) -> School:
    """Convert a raw Nordrhein-Westfalen record into a ``School``.

    The name is assembled from the three ``Schulbezeichnung_*`` fields, coded
    values are resolved through ``NordRheinWestfalenHelper`` and the UTM
    coordinates are reprojected to EPSG:4326.

    Args:
        item: Raw record exposing a dict-style ``get``.

    Returns:
        The populated ``School``. ``phone`` and ``fax`` are ``None`` when
        neither the dialling prefix nor the number is present.
    """
    name_fields = (
        "Schulbezeichnung_1",
        "Schulbezeichnung_2",
        "Schulbezeichnung_3",
    )
    # Skip empty/missing parts so the joined name has no doubled spaces.
    name = " ".join(
        part.strip()
        for part in map(item.get, name_fields)
        if part and part.strip()
    )

    def joined_number(prefix_key, number_key):
        # Formatting a missing value with an f-string yields the text "None",
        # so only the parts that are actually present are concatenated.
        parts = (item.get(prefix_key), item.get(number_key))
        digits = "".join(
            str(part) for part in parts if part is not None and part != ""
        )
        return digits or None

    helper = NordRheinWestfalenHelper()

    right, high = item.get('UTMRechtswert'), item.get('UTMHochwert')
    this_projection = Proj(item.get('EPSG'))
    target_projection = Proj('epsg:4326')
    # NOTE(review): assumes transform() yields (lon, lat) in this order for
    # the installed pyproj version -- confirm against its axis-order rules.
    lon, lat = transform(this_projection, target_projection, right, high)

    return School(
        name=name,
        id='NW-{}'.format(item.get('Schulnummer')),
        address=item.get('Strasse'),
        zip=item.get("PLZ"),
        city=item.get('Ort'),
        website=item.get('Homepage'),
        email=item.get('E-Mail'),
        legal_status=helper.resolve('rechtsform', item.get('Rechtsform')),
        school_type=helper.resolve('schulform', item.get('Schulform')),
        provider=helper.resolve('provider', item.get('Traegernummer')),
        fax=joined_number('Faxvorwahl', 'Fax'),
        phone=joined_number('Telefonvorwahl', 'Telefon'),
        latitude=lat,
        longitude=lon,
    )
def normalize(item: Item) -> School:
    """Convert a raw record with an ``SN-`` id into a ``School``.

    ``Postanschrift`` is split around its first five-digit run into street,
    ZIP and city. When it contains no five-digit run, the whole text is kept
    as the city and street/ZIP are empty strings.

    Args:
        item: Raw record exposing a dict-style ``get``.

    Returns:
        The populated ``School``; ``phone`` is the first value of the
        ``phone_numbers`` mapping, or ``None`` if there is none.
    """
    # A missing mapping is treated like an empty one instead of crashing.
    phone_numbers = item.get('phone_numbers') or {}
    phone = next(iter(phone_numbers.values()), None)

    postal = (item.get('Postanschrift') or '').strip()
    # re.split always returns at least one element, so only two cases exist.
    pieces = re.split(r'\d{5}', postal)
    if len(pieces) == 1:
        address, zip_code, city = '', '', pieces[0].strip()
    else:
        address = pieces[0].strip()
        zip_code = re.search(r'\d{5}', postal).group()
        city = pieces[1].strip()

    return School(
        name=item.get('title'),
        id='SN-{}'.format(item.get('Dienststellenschlüssel')),
        address=address,
        zip=zip_code,
        city=city,
        website=item.get('Homepage'),
        email=item.get('E-Mail'),
        school_type=item.get('Einrichtungsart'),
        legal_status=item.get('Rechtsstellung'),
        provider=item.get('Schulträger'),
        fax=item.get('Telefax'),
        phone=phone,
        director=item.get('Schulleiter') or item.get('Schulleiter/in'),
    )
def normalize(item: Item) -> School:
    """Convert a raw record with a ``BY-`` id into a ``School``.

    The ``city`` field is whitespace-split: the first token becomes the ZIP
    code and the remaining tokens the city name.

    Args:
        item: Raw record exposing a dict-style ``get``.

    Returns:
        The populated ``School``; ``zip`` and ``city`` are ``None`` when the
        ``city`` field is missing or blank.
    """
    # Unpacking an empty split would raise ValueError, so guard it.
    tokens = (item.get('city') or '').split()
    zip_code = tokens[0] if tokens else None
    city = ' '.join(tokens[1:]) if tokens else None

    return School(
        name=item.get('name'),
        phone=item.get('phone'),
        website=item.get('web'),
        address=item.get('street'),
        city=city,
        zip=zip_code,
        id='BY-{}'.format(item.get('number')),
    )
def update_or_create(item: SchoolPipelineItem) -> School:
    """Return the stored ``School`` for ``item``, updated, or a new one.

    Looks the school up by ``item.info['id']``. If a row exists, its columns
    are overwritten with ``item.info`` plus ``raw=item.item`` through a query
    update and the loaded instance is returned. Otherwise a fresh, not yet
    added ``School`` is constructed from the same values.
    """
    existing = session.query(School).get(item.info['id'])
    if not existing:
        return School(**item.info, raw=item.item)

    changes = {**item.info, 'raw': item.item}
    session.query(School).filter_by(id=item.info['id']).update(changes)
    return existing
def process_item(self, item, spider):
    """Persist ``item`` as a ``School`` row and return the model instance.

    Database errors are logged as warnings and the session is rolled back;
    the instance is returned either way.
    """
    record = School.update_or_create(item)
    try:
        session.add(record)
        session.commit()
    except SQLAlchemyError as error:
        for message in ('Error when putting to DB', error):
            logging.warning(message)
        session.rollback()
    return record
def normalize(item: Item) -> School:
    """Convert a raw record with an ``HE-`` id into a ``School``.

    Every target field is copied from one source key; the id is the source
    ``id`` prefixed with ``HE-``.
    """
    # target School field -> key in the raw record (order is preserved)
    source_keys = {
        'name': 'name',
        'phone': 'telefon',
        'fax': 'fax',
        'website': 'homepage',
        'address': 'straße',
        'city': 'ort',
        'zip': 'plz',
        'school_type': 'schultyp',
    }
    fields = {
        target: item.get(source) for target, source in source_keys.items()
    }
    fields['id'] = 'HE-{}'.format(item.get('id'))
    return School(**fields)
def normalize(self, item: Item) -> School:
    """Convert a raw record with an ``RP-`` id into a ``School``.

    All values are copied unchanged from the raw record; ``Ort`` is stored
    as the city as-is.
    """
    fields = {
        'name': item.get('name'),
        'id': 'RP-{}'.format(item.get('id')),
        'address': item.get('Adresse'),
        'city': item.get('Ort'),
        'website': item.get('Internet'),
        'email': item.get('E-Mail'),
        'school_type': item.get('Schulform'),
        'fax': item.get('Fax'),
        'phone': item.get('Telefon'),
    }
    return School(**fields)
def normalize(item: Item) -> School:
    """Convert a raw record with an ``SH-`` id into a ``School``.

    The id is ``Dienststellennummer`` prefixed with ``SH-``; all other
    fields are copied unchanged.
    """
    school_id = 'SH-{}'.format(item.get('Dienststellennummer'))
    return School(
        name=item.get('name'),
        id=school_id,
        address=item.get('Strasse'),
        zip=item.get("Postleitzahl"),
        city=item.get("Ort"),
        email=item.get('E-Mail'),
        school_type=item.get('Schularten'),
        fax=item.get('Fax'),
        phone=item.get('Telefon'),
        director=item.get('Schulleitung'),
    )
def normalize(item: Item) -> School:
    """Convert a raw record with an ``SA-`` id into a ``School``.

    ``Adresse`` is split around its first five-digit run into street, ZIP
    and city.

    Args:
        item: Raw record exposing a dict-style ``get``.

    Returns:
        The populated ``School``. If ``Adresse`` has no five-digit run, the
        stripped text (or ``None`` when empty/missing) is kept as the
        address and ``zip``/``city`` are ``None``.
    """
    raw_address = (item.get('Adresse') or '').strip()
    pieces = re.split(r'\d{5}', raw_address)
    if len(pieces) > 1:
        address = pieces[0].strip()
        zip_code = re.search(r'\d{5}', raw_address).group()
        city = pieces[1].strip()
    else:
        # No ZIP found: indexing the split result would raise IndexError.
        address, zip_code, city = raw_address or None, None, None

    return School(
        name=item.get('Name'),
        id='SA-{}'.format(item.get('ID')),
        address=address,
        zip=zip_code,
        city=city,
        website=item.get('Homepage'),
        email=item.get('E-Mail'),
        fax=item.get('Telefax'),
        phone=item.get('Telefon'),
    )
def normalize(item: Item) -> School:
    """Convert a raw record with an ``RP-`` id into a ``School``.

    ``Ort`` is expected to contain a five-digit ZIP followed by the city
    name; the ZIP is the first five-digit run and the city is the text
    after it.

    Args:
        item: Raw record exposing a dict-style ``get``.

    Returns:
        The populated ``School``. If ``Ort`` has no five-digit run, the
        stripped text (or ``None`` when empty/missing) is used as the city
        and ``zip`` is ``None``.
    """
    place = (item.get('Ort') or '').strip()
    pieces = re.split(r'\d{5}', place)
    if len(pieces) > 1:
        city = pieces[1].strip()
        zip_code = re.search(r'\d{5}', place).group()
    else:
        # No ZIP found: indexing the split result would raise IndexError.
        city, zip_code = place or None, None

    return School(
        name=item.get('name'),
        id='RP-{}'.format(item.get('id')),
        address=item.get('Adresse'),
        city=city,
        zip=zip_code,
        website=item.get('Internet'),
        email=item.get('E-Mail'),
        school_type=item.get('Schulform'),
        fax=item.get('Fax'),
        phone=item.get('Telefon'),
    )
def normalize(item: Item) -> School:
    """Convert a raw record with an ``SN-`` id into a ``School``.

    Values are copied unchanged; ``Postanschrift`` is stored whole as the
    address and ``phone_numbers`` is passed through as the phone value.
    """
    fields = {
        'name': item.get('title'),
        'id': 'SN-{}'.format(item.get('Dienststellenschlüssel')),
    }
    # Remaining target fields in their original order -> raw record key.
    copied = (
        ('address', 'Postanschrift'),
        ('website', 'Homepage'),
        ('email', 'E-Mail'),
        ('school_type', 'Einrichtungsart'),
        ('legal_status', 'Rechtsstellung'),
        ('provider', 'Schulträger'),
        ('fax', 'Telefax'),
        ('phone', 'phone_numbers'),
        ('director', 'Schulleiter'),
    )
    for target, source in copied:
        fields[target] = item.get(source)
    return School(**fields)
def normalize(item: Item) -> School:
    """Convert a raw record with an ``NI-`` id into a ``School``.

    ``Ort`` is whitespace-split: the first token becomes the ZIP code and
    the remaining tokens the city name.

    Args:
        item: Raw record exposing a dict-style ``get``.

    Returns:
        The populated ``School``; ``zip`` and ``city`` are ``None`` when
        ``Ort`` is missing or blank. ``fax`` is always ``None``.
    """
    # Guard against a missing/blank value, which used to raise on [0].
    tokens = (item.get('Ort') or '').split()
    zip_code = tokens[0] if tokens else None
    city = ' '.join(tokens[1:]) if tokens else None

    return School(
        name=item.get('Schule'),
        phone=item.get('Tel'),
        fax=None,
        email=item.get('E-Mail'),
        website=item.get('Homepage'),
        address=item.get('Straße'),
        zip=zip_code,
        city=city,
        school_type=item.get("Schul-gliederung(en)"),
        id='NI-{}'.format(item.get('Schulnummer')),
    )
def normalize(item: Item) -> School:
    """Convert a raw record with an ``SL-`` id into a ``School``.

    The id is derived from the phone number: ``SL-`` followed by ``telefon``
    with spaces replaced by hyphens. A record without ``telefon`` therefore
    raises ``AttributeError``.
    """
    phone = item.get('telefon')
    school_id = 'SL-{}'.format(phone.replace(" ", "-"))
    return School(
        name=item.get('name'),
        phone=phone,
        fax=item.get('telefax'),
        website=item.get('homepage'),
        email=item.get('e-mail'),
        address=item.get('straße'),
        city=item.get('ort'),
        zip=item.get('plz'),
        school_type=item.get('schultyp'),
        director=item.get('schulleitung'),
        id=school_id,
    )
def normalize(item: Item) -> School:
    """Convert a raw record with a ``BW-`` id into a ``School``.

    Values are copied unchanged from the raw record; ``school_type`` is
    always the empty string.
    """
    fields = {
        'name': item.get('name'),
        'id': 'BW-{}'.format(item.get('id')),
    }
    # Remaining target fields in their original order -> raw record key.
    copied = (
        ('address', 'Strasse'),
        ('zip', 'PLZ'),
        ('city', 'Ort'),
        ('website', 'Internet'),
        ('email', 'E-Mail'),
        ('fax', 'Fax'),
        ('phone', 'Telefon'),
        ('provider', 'Schulamt'),
        ('director', 'Schulleitung'),
    )
    for target, source in copied:
        fields[target] = item.get(source)
    fields['school_type'] = ''
    return School(**fields)
def normalize(self, item: Item) -> School:
    """Convert a raw record with an ``RP-`` id into a ``School``.

    ``Ort`` is split at its first space: the text before it becomes the ZIP
    code and the text after it the city name.

    Args:
        item: Raw record exposing a dict-style ``get``.

    Returns:
        The populated ``School``; ``zip`` and/or ``city`` are ``None`` when
        the corresponding part of ``Ort`` is absent.
    """
    # partition never raises, unlike indexing the result of split(' ', 1)
    # when the value contains no space.
    zip_code, _, city = (item.get('Ort') or '').partition(' ')

    return School(
        name=item.get('name'),
        id='RP-{}'.format(item.get('id')),
        address=item.get('Adresse'),
        zip=zip_code or None,
        city=city or None,
        website=item.get('Internet'),
        email=item.get('E-Mail'),
        school_type=item.get('Schulform'),
        fax=item.get('Fax'),
        phone=item.get('Telefon'),
    )
def normalize(self, item: Item) -> School:
    """Convert a raw record with a ``BE-`` id into a ``School``.

    The id is the raw ``id`` prefixed with ``BE-``; every other field is
    copied unchanged from its source key.
    """
    fields = {
        'name': item.get('name'),
        'id': 'BE-{}'.format(item.get('id')),
    }
    # Remaining target fields in their original order -> raw record key.
    copied = (
        ('address', 'address'),
        ('zip', 'zip'),
        ('city', 'city'),
        ('website', 'web'),
        ('email', 'mail'),
        ('school_type', 'schooltype'),
        ('fax', 'fax'),
        ('phone', 'telephone'),
        ('director', 'headmaster'),
        ('legal_status', 'legal_status'),
    )
    for target, source in copied:
        fields[target] = item.get(source)
    return School(**fields)
def normalize(item: Item) -> School:
    """Convert a raw record with a ``BY-`` id into a ``School``.

    The ``city`` field is whitespace-split: the first token becomes the ZIP
    code and the remaining tokens the city name. Coordinates are passed
    through unchanged.

    Args:
        item: Raw record exposing a dict-style ``get``.

    Returns:
        The populated ``School``; ``zip`` and ``city`` are ``None`` when the
        ``city`` field is missing or blank.
    """
    # Unpacking an empty split would raise ValueError, so guard it.
    tokens = (item.get('city') or '').split()
    zip_code = tokens[0] if tokens else None
    city = ' '.join(tokens[1:]) if tokens else None

    return School(
        name=item.get('name'),
        phone=item.get('phone'),
        fax=item.get('fax'),
        website=item.get('web'),
        address=item.get('street'),
        city=city,
        zip=zip_code,
        school_type=item.get('school_type'),
        legal_status=item.get('type'),
        id='BY-{}'.format(item.get('number')),
        latitude=item.get('latitude'),
        longitude=item.get('longitude'),
    )
def normalize(item: Item) -> School:
    """Convert a raw record with a ``TH-`` id into a ``School``.

    ``Ort`` is whitespace-split: the first token becomes the ZIP code and
    the remaining tokens the city name. The e-mail value is passed through
    ``ThueringenSpider._deobfuscate_email``.

    Args:
        item: Raw record exposing a dict-style ``get``.

    Returns:
        The populated ``School``; ``zip`` and ``city`` are ``None`` when
        ``Ort`` is missing or blank.
    """
    # Guard against a missing/blank value, which used to raise on [0].
    tokens = (item.get('Ort') or '').split()
    zip_code = tokens[0] if tokens else None
    city = ' '.join(tokens[1:]) if tokens else None

    return School(
        name=item.get('Schulname'),
        id='TH-{}'.format(item.get('Schulnummer')),
        address=item.get('Straße'),
        zip=zip_code,
        city=city,
        website=item.get('Internet'),
        email=ThueringenSpider._deobfuscate_email(item.get('E-Mail')),
        school_type=item.get('Schulart'),
        provider=item.get('Schulträger'),
        fax=item.get('Telefax'),
        phone=item.get('Telefon'),
    )
def normalize(item: Item) -> School:
    """Convert a raw record with an ``SN-`` id into a ``School``.

    ``Postanschrift`` is split around its first five-digit run into street,
    ZIP and city. When it contains no five-digit run, the whole text is kept
    as the city and street/ZIP are empty strings.

    Args:
        item: Raw record exposing a dict-style ``get``.

    Returns:
        The populated ``School``; ``phone`` is the first value of the
        ``phone_numbers`` mapping, or ``None`` if there is none.
    """
    postal = (item.get('Postanschrift') or '').strip()
    pieces = re.split(r'\d{5}', postal)
    if len(pieces) > 1:
        address = pieces[0].strip()
        zip_code = re.search(r'\d{5}', postal).group()
        city = pieces[1].strip()
    else:
        # No ZIP found: indexing the split result would raise IndexError.
        address, zip_code, city = '', '', pieces[0].strip()

    # An absent or empty mapping yields None rather than an exception.
    phone_numbers = item.get('phone_numbers') or {}
    phone = next(iter(phone_numbers.values()), None)

    return School(
        name=item.get('title'),
        id='SN-{}'.format(item.get('Dienststellenschlüssel')),
        address=address,
        zip=zip_code,
        city=city,
        website=item.get('Homepage'),
        email=item.get('E-Mail'),
        school_type=item.get('Einrichtungsart'),
        legal_status=item.get('Rechtsstellung'),
        provider=item.get('Schulträger'),
        fax=item.get('Telefax'),
        phone=phone,
        director=item.get('Schulleiter') or item.get('Schulleiter/in'),
    )
def test_import_new(self):
    """Store a school through ``update_or_create`` and expect one row."""
    # Arrange
    raw_item = dict(name='Test Schule', nr=1)
    info = School(name='Test Schule', id='NDS-1')
    pipeline_item = SchoolPipelineItem(info=info, item=raw_item)
    session.add(DBSchool.update_or_create(pipeline_item))
    session.commit()

    # Act
    stored_rows = session.query(DBSchool).count()

    # Assert
    self.assertEqual(stored_rows, 1)
def normalize(item):
    """Convert a raw record into a ``School``.

    ``Stadt/Gemeinde`` must have the form ``"<zip>, <city>"`` (exactly one
    ``", "`` separator, otherwise unpacking raises ``ValueError``). Only the
    first line of a multi-line ``Telefon`` value is kept.
    """
    zip_code, city = item['Stadt/Gemeinde'].split(', ')

    raw_phone = item.get('Telefon')
    if raw_phone:
        phone = raw_phone.split('\n')[0]
    else:
        phone = None

    return School(
        id=item['id'],
        name=item.get('name'),
        phone=phone,
        director=item.get('Schulleiter/in'),
        website=item.get('Homepage'),
        fax=item.get('Telefax'),
        email=item.get('E-Mail'),
        address=item.get('Straße'),
        zip=zip_code,
        city=city,
    )
def normalize(item: Item) -> School:
    """Convert a raw record with a ``BB-`` id into a ``School``.

    ``Adresse`` is a sequence whose last two entries are the street and the
    ``"<zip> <city>"`` line; everything before them forms the name. The
    remaining fields are reduced with ``first_or_none``.
    """
    *name_lines, street, place = item.get('Adresse')
    zip_code, *city_tokens = place.split(" ")

    # target School field -> raw record key, each reduced via first_or_none
    single_valued = {
        'website': 'Internet',
        'email': 'E-Mail',
        'school_type': 'Schulform',
        'provider': 'Schulamt',
        'fax': 'Fax',
        'phone': 'Telefon',
        'director': 'Schulleiter/in',
    }
    return School(
        name=' '.join(name_lines),
        id='BB-{}'.format(item.get('id')),
        address=street,
        zip=zip_code,
        city=' '.join(city_tokens),
        **{
            target: first_or_none(item.get(source))
            for target, source in single_valued.items()
        },
    )
def normalize(item: Item) -> School:
    """Convert a raw record with an ``HH-`` id into a ``School``.

    ``adresse_ort`` is whitespace-split: the first token becomes the ZIP
    code and the remaining tokens the city name. ``address2`` is always the
    empty string.

    Args:
        item: Raw record exposing a dict-style ``get``.

    Returns:
        The populated ``School``; ``zip`` and ``city`` are ``None`` when
        ``adresse_ort`` is missing or blank.
    """
    # Guard against a missing/blank value, which used to raise on [0].
    tokens = (item.get('adresse_ort') or '').split()
    zip_code = tokens[0] if tokens else None
    city = ' '.join(tokens[1:]) if tokens else None

    return School(
        name=item.get('schulname'),
        id='HH-{}'.format(item.get('schul_id')),
        address=item.get('adresse_strasse_hausnr'),
        address2='',
        zip=zip_code,
        city=city,
        website=item.get('schul_homepage'),
        email=item.get('schul_email'),
        school_type=item.get('schulform'),
        fax=item.get('fax'),
        phone=item.get('schul_telefonnr'),
        director=item.get('name_schulleiter'),
    )
def normalize(item: Item) -> School:
    """Convert a raw record with an ``MV-`` id into a ``School``.

    ``Dst-Nr.:`` and ``Plz`` are rendered as strings with a trailing ``.0``
    removed (presumably because the source delivers them as floats --
    confirm against the spider).

    Args:
        item: Raw record exposing a dict-style ``get``.

    Returns:
        The populated ``School``; ``zip`` is ``None`` when ``Plz`` is
        missing rather than the text ``"None"``.
    """
    def as_code(value):
        # Strip only a *trailing* ".0"; a blanket replace would also eat a
        # ".0" in the middle of the value.
        if value is None:
            return None
        text = str(value)
        return text[:-2] if text.endswith('.0') else text

    dst = as_code(item.get('Dst-Nr.:'))
    plz = as_code(item.get('Plz'))

    return School(
        name=item.get('Schulname'),
        id='MV-{}'.format(dst),
        address=item.get('Straße, Haus-Nr.'),
        address2='',
        zip=plz,
        city=item.get('Ort'),
        website=item.get('Homepage'),
        email=item.get('E-Mail'),
        school_type=item.get('Schulart/ Org.form'),
        fax=item.get('Telefax'),
        phone=item.get('Telefon'),
        provider=item.get('Schul-behörde'),
        director=item.get('Schulleitung'),
    )
def normalize(item: Item) -> School:
    """Convert a raw Nordrhein-Westfalen record into a ``School``.

    The name is assembled from the three ``Schulbezeichnung_*`` fields and
    coded values are resolved through ``NordRheinWestfalenHelper``.

    Args:
        item: Raw record exposing a dict-style ``get``.

    Returns:
        The populated ``School``. ``phone`` and ``fax`` are ``None`` when
        neither the dialling prefix nor the number is present.
    """
    name_fields = (
        "Schulbezeichnung_1",
        "Schulbezeichnung_2",
        "Schulbezeichnung_3",
    )
    # Skip empty/missing parts so the joined name has no doubled spaces.
    name = " ".join(
        part.strip()
        for part in map(item.get, name_fields)
        if part and part.strip()
    )

    def joined_number(prefix_key, number_key):
        # Formatting a missing value with an f-string yields the text "None",
        # so only the parts that are actually present are concatenated.
        parts = (item.get(prefix_key), item.get(number_key))
        digits = "".join(
            str(part) for part in parts if part is not None and part != ""
        )
        return digits or None

    helper = NordRheinWestfalenHelper()

    return School(
        name=name,
        id='NW-{}'.format(item.get('Schulnummer')),
        address=item.get('Strasse'),
        zip=item.get("PLZ"),
        city=item.get('Ort'),
        website=item.get('Homepage'),
        email=item.get('E-Mail'),
        legal_status=helper.resolve('rechtsform', item.get('Rechtsform')),
        school_type=helper.resolve('schulform', item.get('Schulform')),
        provider=helper.resolve('provider', item.get('Traegernummer')),
        fax=joined_number('Faxvorwahl', 'Fax'),
        phone=joined_number('Telefonvorwahl', 'Telefon'),
    )
def test_import_existing(self):
    """Re-import school ``NDS-1`` with a new name and expect an update.

    This test requires the previous one to have run already so that the
    item exists in the database.
    """
    # Arrange
    raw_item = dict(name='Test Schule', nr=1)
    info = School(name='Test Schule (updated)', id='NDS-1')
    pipeline_item = SchoolPipelineItem(info=info, item=raw_item)
    session.add(DBSchool.update_or_create(pipeline_item))
    session.commit()

    # Act
    stored_rows = session.query(DBSchool).count()
    first_school = session.query(DBSchool).first()

    # Assert
    self.assertEqual(stored_rows, 1)
    self.assertEqual(first_school.name, "Test Schule (updated)")
def normalize(item: Item) -> School:
    """Convert a raw record with an ``HB-`` id into a ``School``.

    Side effect: ``item`` gains ``Schulleitung`` and ``Vertretung`` entries
    parsed from ``Ansprechperson``. ``Anschrift:`` is split around its first
    five-digit run into street, ZIP and city; phone and fax go through
    ``BremenSpider.fix_number``.

    Args:
        item: Raw record; must contain ``Ansprechperson``.

    Returns:
        The populated ``School``. If ``Anschrift:`` has no five-digit run,
        the stripped text (or ``None`` when empty/missing) is kept as the
        address and ``zip``/``city`` are ``None``.

    Raises:
        KeyError: If ``Ansprechperson`` is absent from ``item``.
    """
    contacts = item['Ansprechperson'].replace(
        'Schulleitung:', '').replace('Vertretung:', ',').split(',')
    item['Schulleitung'] = contacts[0]
    # A record listing only one person has no second entry to index.
    item['Vertretung'] = contacts[1] if len(contacts) > 1 else None

    postal = (item.get('Anschrift:') or '').strip()
    pieces = re.split(r'\d{5}', postal)
    if len(pieces) > 1:
        address = pieces[0].strip()
        zip_code = re.search(r'\d{5}', postal).group()
        city = pieces[1].strip()
    else:
        # No ZIP found: indexing the split result would raise IndexError.
        address, zip_code, city = postal or None, None, None

    email = item.get('E-Mail-Adresse')
    if email:
        email = email.strip()

    return School(
        name=item.get('name'),
        id='HB-{}'.format(item.get('id')),
        address=address,
        zip=zip_code,
        city=city,
        website=item.get('Internet'),
        email=email,
        fax=BremenSpider.fix_number(item.get('Telefax')),
        phone=BremenSpider.fix_number(item.get('Telefon')),
    )
def normalize(item: Item) -> School:
    """Convert a raw record with an ``NI-`` id into a ``School``.

    The name joins ``schulname`` and ``namenszuatz``; street, ZIP and city
    come from the first entry of ``sdb_adressen`` and its nested ``sdb_ort``;
    school type and provider are read via ``NiedersachsenSpider._get``.
    """
    # NOTE(review): 'namenszuatz' may be a typo for 'namenszusatz' -- verify
    # against the source data before changing the key.
    name_parts = (item.get('schulname', ''), item.get('namenszuatz', ''))
    full_name = " ".join(name_parts).strip()

    first_address = item.get('sdb_adressen', [{}])[0]
    place = first_address.get('sdb_ort', {})
    kind = NiedersachsenSpider._get(item, 'sdb_art', {}).get('art')
    carrier = NiedersachsenSpider._get(item, 'sdb_traeger', {}).get('name')

    return School(
        name=full_name,
        phone=item.get('telefon'),
        fax=item.get('fax'),
        email=item.get('email'),
        website=item.get('homepage'),
        address=first_address.get('strasse'),
        zip=place.get('plz'),
        city=place.get('ort'),
        school_type=kind,
        provider=carrier,
        legal_status=item.get("sdb_traegerschaft", {}).get('bezeichnung'),
        id='NI-{}'.format(item.get('schulnr')),
    )
def process_item(self, item, spider):
    """Normalize a scraped ``item`` into a ``School`` based on the spider.

    Args:
        item: Raw scraped record (dict-style access).
        spider: The spider that produced ``item``; its ``name`` selects the
            field mapping.

    Returns:
        ``{'info': school, 'item': item}`` for a known spider, or ``item``
        unchanged when the spider name is not recognized.
    """
    if spider.name == 'saarland':
        address = u"{} {}".format(item.get('street', ""), item.get('zip', ""))
        if item.get('email'):
            email = item['email'].replace('mailto:', '').replace('%40', '@')
        else:
            email = None
        school = School(
            name=item.get('name'),
            phone=item.get('telephone'),
            # NOTE(review): director is filled from 'telephone'; this looks
            # like a copy/paste slip -- confirm the intended source key.
            director=item.get('telephone'),
            website=item.get('website'),
            fax=item.get('fax'),
            email=email,
            address=address,
        )
    elif spider.name == 'niedersachsen':
        address = u"{} {}".format(item.get('Straße', ""), item.get('Ort', ""))
        school = School(
            name=item.get('Schule'),
            phone=item.get('Tel'),
            email=item.get('E-Mail'),
            website=item.get('Homepage'),
            address=address,
            id='NDS-{}'.format(item.get('Schulnummer')),
        )
    elif spider.name == 'bayern':
        school = School(
            name=item.get('Name'),
            phone=item.get('Telefon'),
            website=item.get('website'),
            address=item.get('Anschrift'),
            id='BAY-{}'.format(item.get('Schulnummer')),
        )
    elif spider.name == 'thueringen':
        school = School(
            name=item.get('Schulname'),
            id='TH-{}'.format(item.get('Schulnummer')),
            address=u"{} {}".format(item.get('Straße'), item.get('Ort')),
            website=item.get('Internet'),
            email=item.get('E-Mail'),
            school_type=item.get('Schulart'),
            provider=item.get('Schulträger'),
            fax=item.get('Telefax'),
            phone=item.get('Telefon'),
        )
    elif spider.name == 'schleswig-holstein':
        school = School(
            name=item.get('Name'),
            id='SH-{}'.format(item.get('Dienststellen Nr.')),
            address=u"{} {} {}".format(
                item.get('Straße'), item.get("PLZ"), item.get("Ort")),
            email=item.get('EMail'),
            school_type=item.get('Organisationsform'),
            legal_status=item.get('Rechtsstatus'),
            provider=item.get('Träger'),
            fax=item.get('Fax'),
            phone=item.get('Telefon'),
            director=item.get('Schulleiter(-in)'),
        )
    elif spider.name == 'bremen':
        ansprechpersonen = item['Ansprechperson'].replace(
            'Schulleitung:', '').replace('Vertretung:', ',').split(',')
        item['Schulleitung'] = ansprechpersonen[0]
        # A record listing only one person has no second entry to index.
        item['Vertretung'] = (
            ansprechpersonen[1] if len(ansprechpersonen) > 1 else None
        )
        school = School(
            name=item.get('name'),
            address=item.get('Anschrift:'),
            website=item.get('Internet'),
            email=item.get('E-Mail-Adresse'),
            fax=item.get('Telefax'),
            phone=item.get('Telefon'),
        )
    elif spider.name == 'sachsen':
        school = School(
            name=item.get('title'),
            id='SN-{}'.format(item.get('Dienststellenschlüssel')),
            address=item.get('Postanschrift'),
            website=item.get('Homepage'),
            email=item.get('E-Mail'),
            school_type=item.get('Einrichtungsart'),
            legal_status=item.get('Rechtsstellung'),
            provider=item.get('Schulträger'),
            fax=item.get('Telefax'),
            phone=item.get('phone_numbers'),
            director=item.get('Schulleiter'),
        )
    elif spider.name == 'sachsen-anhalt':
        school = School(
            name=item.get('Name'),
            # NOTE(review): key is spelled 'Addresse' here -- verify it
            # matches what the spider actually emits.
            address=item.get('Addresse'),
            website=item.get('Homepage'),
            email=item.get('E-Mail'),
            fax=item.get('Fax'),
            phone=item.get('Telefon'),
        )
    elif spider.name == 'brandenburg':
        school = School(
            name=item.get('name'),
            id=item.get('nummer'),
            address=item.get('Adresse'),
            website=item.get('Internet'),
            email=item.get('E-Mail'),
            school_type=item.get('Schulform'),
            provider=item.get('Schulamt'),
            fax=item.get('Fax'),
            phone=item.get('Telefon'),
            director=item.get('Schulleiter/in'),
        )
    else:
        # Unknown spider: hand the item on untouched. The DropItem raise
        # that used to follow this return could never execute.
        return item

    return {'info': school, 'item': item}