def save_item(item: Item, spider):
    """Store a scraped ``MessageItem`` as a ``Message`` row if it passes two checks.

    Items that are not ``MessageItem`` instances are ignored. A message is
    added to the session only when (1) none of the most recent stored
    messages has the same ``text`` and ``image``, and (2) its text, with
    tags removed by ``remove_tags``, is short enough.

    :param item: the scraped item.
    :param spider: not used by this function.
    :return: ``None`` in every case.
    """
    if not isinstance(item, MessageItem):
        return

    def message_is_unique(message_model: Message, limit=20) -> bool:
        # Restrict the duplicate search to the ``limit`` newest rows
        # (newest first by ``created_at``, ties broken by ``id``).
        newest = (
            session.query(Message)
            .order_by(Message.created_at.desc(), Message.id.desc())
            .limit(limit)
            .subquery()
        )
        recent = aliased(Message, newest)
        same_content = session.query(recent).filter(
            recent.text == message_model.text,
            recent.image == message_model.image,
        ).exists()
        return not session.query(same_content).scalar()

    def message_fit_the_length(message_model: Message) -> bool:
        # Messages with an image get the tighter limit.
        # NOTE(review): 1024/4096 look like limits of the target messaging
        # platform — confirm where they come from.
        max_length = 1024 if message_model.image else 4096
        return len(remove_tags(message_model.text)) <= max_length

    message = Message(
        text=item.get('text'),
        image=item.get('image'),
        url=item.get('url'),
    )
    # The uniqueness query runs first; the length check only runs for
    # messages that are not duplicates.
    if not message_is_unique(message):
        return
    if not message_fit_the_length(message):
        return
    with session.begin():
        session.add(message)
def normalize(item: Item) -> School:
    """Build a ``School`` with an ``SH-`` prefixed id from a scraped item.

    Every attribute is read through ``item.get``; assuming it behaves like
    ``dict.get``, an absent key yields ``None`` for that attribute.
    """
    # School attribute -> key of the scraped item it is copied from.
    source_keys = {
        'name': 'name',
        'address': 'Strasse',
        'zip': "Postleitzahl",
        'city': "Ort",
        'email': 'E-Mail',
        'school_type': 'Schularten',
        'fax': 'Fax',
        'phone': 'Telefon',
        'director': 'Schulleitung',
    }
    attributes = {field: item.get(key) for field, key in source_keys.items()}
    attributes['id'] = 'SH-{}'.format(item.get('Dienststellennummer'))
    return School(**attributes)
def normalize(item: Item) -> School:
    """Build a ``School`` with an ``RP-`` prefixed id from a scraped item.

    The ``Ort`` value is split around its first run of five digits: that
    run becomes ``zip`` and the text following it (up to a second
    five-digit run, if there is one) becomes ``city``.

    When ``Ort`` contains no five-digit run, ``zip`` is ``None`` and the
    whole stripped value is used as ``city``. When ``Ort`` is missing or
    empty, both are ``None``. Previously both cases raised.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    zip_pattern = r'\d{5}'
    place = (item.get('Ort') or '').strip()

    zip_match = re.search(zip_pattern, place)
    if zip_match:
        zip_code = zip_match.group()
        # A match guarantees at least two segments.
        city = re.split(zip_pattern, place)[1].strip()
    else:
        zip_code = None
        city = place or None

    return School(name=item.get('name'),
                  id='RP-{}'.format(item.get('id')),
                  address=item.get('Adresse'),
                  city=city,
                  zip=zip_code,
                  website=item.get('Internet'),
                  email=item.get('E-Mail'),
                  school_type=item.get('Schulform'),
                  fax=item.get('Fax'),
                  phone=item.get('Telefon'))
def on_duplicate_sql(*args, item: scrapy.Item):
    """Build an ``ON DUPLICATE KEY UPDATE`` clause for the given columns.

    :param args: column names; each is rendered as ``column="value"`` with
        the value taken from ``item.get(column)``.
    :param item: the scraped item supplying the values (keyword-only).
    :return: the clause, starting with a leading space, or ``''`` when no
        columns are given or ``item`` is not a ``scrapy.Item``.

    Values come from scraped pages, so backslashes and double quotes are
    escaped before being embedded in the double-quoted literal; the
    unescaped form let a value terminate the literal early.

    NOTE(review): the backslash escaping assumes the server treats ``\\`` as
    an escape character inside string literals (MySQL's default SQL mode) —
    confirm. Binding the values as query parameters at the call site would
    be the more robust fix. Column names are not escaped and must come from
    trusted code. A missing value is still rendered as the text ``None``.
    """
    if not (args and isinstance(item, scrapy.Item)):
        return ''

    def escaped(value) -> str:
        # Backslash first, so the backslashes added for quotes are not doubled.
        text = '{}'.format(value)
        return text.replace('\\', '\\\\').replace('"', '\\"')

    assignments = ', '.join(
        '{}="{}"'.format(key, escaped(item.get(key))) for key in args
    )
    return ' ON DUPLICATE KEY UPDATE ' + assignments
def normalize(item: Item) -> School:
    """Build a ``School`` with an ``SL-`` prefixed id from a scraped item.

    The id is derived from the phone number: ``SL-`` followed by the
    ``telefon`` value with every space replaced by a hyphen.
    """
    phone = item.get('telefon')
    # NOTE(review): this raises AttributeError when 'telefon' is absent
    # (``phone`` is None) — confirm that dropping such items is intended.
    school_id = 'SL-{}'.format(phone.replace(" ", "-"))
    details = dict(
        name=item.get('name'),
        phone=phone,
        fax=item.get('telefax'),
        website=item.get('homepage'),
        email=item.get('e-mail'),
        address=item.get('straße'),
        city=item.get('ort'),
        zip=item.get('plz'),
        school_type=item.get('schultyp'),
        director=item.get('schulleitung'),
    )
    return School(id=school_id, **details)
def normalize(item: Item) -> School:
    """Build a ``School`` with an ``HE-`` prefixed id from a scraped item."""
    # School attribute -> key of the scraped item it is copied from.
    source_keys = (
        ('name', 'name'),
        ('phone', 'telefon'),
        ('fax', 'fax'),
        ('website', 'homepage'),
        ('address', 'straße'),
        ('city', 'ort'),
        ('zip', 'plz'),
        ('school_type', 'schultyp'),
    )
    attributes = {field: item.get(key) for field, key in source_keys}
    return School(id='HE-{}'.format(item.get('id')), **attributes)
def normalize(self, item: Item) -> School:
    """Build a ``School`` with an ``RP-`` prefixed id from a scraped item.

    ``Ort`` is passed through unchanged as ``city``; no ``zip`` is set.
    ``self`` is not used.
    """
    school_id = 'RP-{}'.format(item.get('id'))
    attributes = dict(
        name=item.get('name'),
        address=item.get('Adresse'),
        city=item.get('Ort'),
        website=item.get('Internet'),
        email=item.get('E-Mail'),
        school_type=item.get('Schulform'),
        fax=item.get('Fax'),
        phone=item.get('Telefon'),
    )
    return School(id=school_id, **attributes)
def normalize(item: Item) -> School:
    """Build a ``School`` with a ``TH-`` prefixed id from a scraped item.

    ``Ort`` is split on whitespace: the first token becomes ``zip`` and the
    remaining tokens, joined by single spaces, become ``city``. A missing or
    blank ``Ort`` yields ``zip=None`` and ``city=''`` (previously it raised).
    The e-mail is passed through ``ThueringenSpider._deobfuscate_email``.
    """
    ort_tokens = (item.get('Ort') or '').split()
    # ``zip_code`` rather than ``zip`` so the builtin is not shadowed.
    zip_code = ort_tokens[0] if ort_tokens else None
    city = ' '.join(ort_tokens[1:])
    return School(name=item.get('Schulname'),
                  id='TH-{}'.format(item.get('Schulnummer')),
                  address=item.get('Straße'),
                  zip=zip_code,
                  city=city,
                  website=item.get('Internet'),
                  email=ThueringenSpider._deobfuscate_email(item.get('E-Mail')),
                  school_type=item.get('Schulart'),
                  provider=item.get('Schulträger'),
                  fax=item.get('Telefax'),
                  phone=item.get('Telefon'))
def normalize(item: Item) -> School:
    """Build a ``School`` with an ``HH-`` prefixed id from a scraped item.

    ``adresse_ort`` is split on whitespace: the first token becomes ``zip``
    and the remaining tokens, joined by single spaces, become ``city``. A
    missing or blank ``adresse_ort`` yields ``zip=None`` and ``city=''``
    (previously it raised). ``address2`` is always the empty string.
    """
    # ``or [None]`` keeps the unpacking valid when there are no tokens.
    zip_code, *city_parts = (item.get('adresse_ort') or '').split() or [None]
    return School(name=item.get('schulname'),
                  id='HH-{}'.format(item.get('schul_id')),
                  address=item.get('adresse_strasse_hausnr'),
                  address2='',
                  zip=zip_code,
                  city=' '.join(city_parts),
                  website=item.get('schul_homepage'),
                  email=item.get('schul_email'),
                  school_type=item.get('schulform'),
                  fax=item.get('fax'),
                  phone=item.get('schul_telefonnr'),
                  director=item.get('name_schulleiter'))
def normalize(item: Item) -> School:
    """Build a ``School`` with a ``BY-`` prefixed id from a scraped item.

    The scraped ``city`` value is split on whitespace: the first token
    becomes ``zip`` and the remaining tokens, joined by single spaces,
    become ``city``. A missing or blank value yields ``zip=None`` and
    ``city=''`` (previously it raised).
    """
    place_tokens = (item.get('city') or '').split()
    if place_tokens:
        zip_code, *city_parts = place_tokens
    else:
        zip_code, city_parts = None, []
    return School(name=item.get('name'),
                  phone=item.get('phone'),
                  fax=item.get('fax'),
                  website=item.get('web'),
                  address=item.get('street'),
                  city=' '.join(city_parts),
                  zip=zip_code,
                  school_type=item.get('school_type'),
                  legal_status=item.get('type'),
                  id='BY-{}'.format(item.get('number')))
def normalize(item: Item) -> School:
    """Build a ``School`` with an ``SA-`` prefixed id from a scraped item.

    ``Adresse`` is split around its runs of five digits: the text before the
    first run becomes ``address``, the first run becomes ``zip`` and the
    text after it (up to a second run, if any) becomes ``city``.

    When ``Adresse`` holds no five-digit run, the whole stripped value is
    kept as ``address`` and ``zip``/``city`` are ``None``. A missing
    ``Adresse`` is treated as an empty string. Previously both cases raised.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    zip_pattern = r'\d{5}'
    raw_address = (item.get('Adresse') or '').strip()
    # Parse once instead of re-running the regex for every attribute.
    segments = re.split(zip_pattern, raw_address)
    zip_match = re.search(zip_pattern, raw_address)
    return School(name=item.get('Name'),
                  id='SA-{}'.format(item.get('ID')),
                  address=segments[0].strip(),
                  zip=zip_match.group() if zip_match else None,
                  city=segments[1].strip() if len(segments) > 1 else None,
                  website=item.get('Homepage'),
                  email=item.get('E-Mail'),
                  fax=item.get('Telefax'),
                  phone=item.get('Telefon'),
                  )
def normalize(self, item: Item) -> School:
    """Build a ``School`` with an ``RP-`` prefixed id from a scraped item.

    ``Ort`` is cut at its first space: the part before it becomes ``zip``
    and everything after it becomes ``city``. When ``Ort`` contains no
    space, the whole value is kept as ``zip`` and ``city`` is ``None``; a
    missing or empty ``Ort`` yields ``None`` for both. Previously those
    cases raised. ``self`` is not used.
    """
    # ``partition`` gives the same two parts as ``split(' ', 1)`` when a
    # space is present, and does not fail when it is absent.
    zip_code, separator, city = (item.get('Ort') or '').partition(' ')
    if not separator:
        zip_code, city = zip_code or None, None
    return School(name=item.get('name'),
                  id='RP-{}'.format(item.get('id')),
                  address=item.get('Adresse'),
                  zip=zip_code,
                  city=city,
                  website=item.get('Internet'),
                  email=item.get('E-Mail'),
                  school_type=item.get('Schulform'),
                  fax=item.get('Fax'),
                  phone=item.get('Telefon'))
def normalize(item: Item) -> School:
    """Build a ``School`` with a ``BB-`` prefixed id from a scraped item.

    ``Adresse`` is unpacked as a sequence: its last element is the place
    (zip code and city separated by single spaces), the one before it is
    the street, and all earlier elements joined by spaces form the name.
    The remaining attributes are reduced with ``first_or_none``.

    NOTE(review): unpacking raises when ``Adresse`` is missing or has fewer
    than two elements — confirm upstream always supplies them.
    """
    address_lines = item.get('Adresse')
    *name_lines, street, place = address_lines
    zip_code, *city_words = place.split(" ")

    def single(key):
        # Reduce the scraped value for ``key`` with the file's helper.
        return first_or_none(item.get(key))

    return School(
        name=' '.join(name_lines),
        id='BB-{}'.format(item.get('id')),
        address=street,
        zip=zip_code,
        city=' '.join(city_words),
        website=single('Internet'),
        email=single('E-Mail'),
        school_type=single('Schulform'),
        provider=single('Schulamt'),
        fax=single('Fax'),
        phone=single('Telefon'),
        director=single('Schulleiter/in'),
    )
def normalize(item: Item) -> School:
    """Build a ``School`` with an ``NW-`` prefixed id from a scraped item.

    The name is the three ``Schulbezeichnung_*`` parts joined by spaces and
    stripped. The parts used to be concatenated with no separator, which
    ran words together. ``legal_status`` and ``school_type`` are looked up
    through ``NordRheinWestfalenHelper.resolve``.

    ``phone`` and ``fax`` are the dialling prefix followed by the number. A
    part that is absent is skipped, and the result is ``None`` when both
    are absent (the f-string form produced the text ``NoneNone``).
    """
    name = " ".join([
        item.get("Schulbezeichnung_1", ""),
        item.get("Schulbezeichnung_2", ""),
        item.get("Schulbezeichnung_3", ""),
    ]).strip()

    def joined_number(prefix_key, number_key):
        # Concatenate the parts that are present; None if there are none.
        present = [str(part)
                   for part in (item.get(prefix_key), item.get(number_key))
                   if part is not None]
        return ''.join(present) if present else None

    helper = NordRheinWestfalenHelper()
    return School(
        name=name,
        id='NW-{}'.format(item.get('Schulnummer')),
        address=item.get('Strasse'),
        zip=item.get("PLZ"),
        city=item.get('Ort'),
        website=item.get('Homepage'),
        email=item.get('E-Mail'),
        legal_status=helper.resolve('rechtsform', item.get('Rechtsform')),
        school_type=helper.resolve('schulform', item.get('Schulform')),
        fax=joined_number('Faxvorwahl', 'Fax'),
        phone=joined_number('Telefonvorwahl', 'Telefon'))
def normalize(item: Item) -> School:
    """Build a ``School`` with an ``NI-`` prefixed id from a scraped item.

    ``Ort`` is split on whitespace: the first token becomes ``zip`` and the
    remaining tokens, joined by single spaces, become ``city``. A missing or
    blank ``Ort`` yields ``zip=None`` and ``city=''`` (previously it raised).
    ``fax`` is always ``None``.
    """
    tokens = (item.get('Ort') or '').split()
    # ``zip_code`` rather than ``zip`` so the builtin is not shadowed.
    zip_code = tokens[0] if tokens else None
    city = ' '.join(tokens[1:])
    return School(name=item.get('Schule'),
                  phone=item.get('Tel'),
                  fax=None,
                  email=item.get('E-Mail'),
                  website=item.get('Homepage'),
                  address=item.get('Straße'),
                  zip=zip_code,
                  city=city,
                  school_type=item.get("Schul-gliederung(en)"),
                  id='NI-{}'.format(item.get('Schulnummer')))
def normalize(item: Item) -> School:
    """Build a ``School`` with an ``HB-`` prefixed id from a scraped item.

    Side effect: ``item['Schulleitung']`` and ``item['Vertretung']`` are set
    from ``item['Ansprechperson']``, after removing the ``Schulleitung:``
    label, turning ``Vertretung:`` into a comma and splitting on commas.
    ``Vertretung`` is ``None`` when only one person is listed (previously
    that raised IndexError).

    ``Anschrift:`` is split around its runs of five digits: the text before
    the first run becomes ``address``, the first run becomes ``zip`` and the
    text after it becomes ``city``. Without a five-digit run the whole
    stripped value is kept as ``address`` and ``zip``/``city`` are ``None``.
    A missing e-mail stays ``None`` instead of raising. Phone and fax go
    through ``BremenSpider.fix_number``.

    NOTE(review): the parsed head teacher is stored on the item but not
    passed to ``School`` — confirm whether a ``director`` is intended.
    """
    contacts = item['Ansprechperson'].replace(
        'Schulleitung:', '').replace('Vertretung:', ',').split(',')
    item['Schulleitung'] = contacts[0]
    item['Vertretung'] = contacts[1] if len(contacts) > 1 else None

    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    zip_pattern = r'\d{5}'
    raw_address = (item.get('Anschrift:') or '').strip()
    # Parse once instead of re-running the regex for every attribute.
    segments = re.split(zip_pattern, raw_address)
    zip_match = re.search(zip_pattern, raw_address)

    email = item.get('E-Mail-Adresse')
    return School(name=item.get('name'),
                  id='HB-{}'.format(item.get('id')),
                  address=segments[0].strip(),
                  zip=zip_match.group() if zip_match else None,
                  city=segments[1].strip() if len(segments) > 1 else None,
                  website=item.get('Internet'),
                  email=email.strip() if email else email,
                  fax=BremenSpider.fix_number(item.get('Telefax')),
                  phone=BremenSpider.fix_number(item.get('Telefon')))
def normalize(item: Item) -> School:
    """Build a ``School`` with an ``NI-`` prefixed id from a nested item.

    The name is ``schulname`` and ``namenszuatz`` joined by a space and
    stripped. Street, zip and city come from the first entry of
    ``sdb_adressen`` and its ``sdb_ort`` mapping. School type and provider
    are read through ``NiedersachsenSpider._get``.

    NOTE(review): ``sdb_adressen`` being an empty list, or
    ``sdb_traegerschaft`` being present with a ``None`` value, would raise
    here — confirm the upstream data never has that shape.
    """
    name_parts = [item.get('schulname', ''), item.get('namenszuatz', '')]
    name = " ".join(name_parts).strip()

    first_address = item.get('sdb_adressen', [{}])[0]
    place = first_address.get('sdb_ort', {})

    kind = NiedersachsenSpider._get(item, 'sdb_art', {}).get('art')
    carrier = NiedersachsenSpider._get(item, 'sdb_traeger', {}).get('name')

    return School(
        name=name,
        phone=item.get('telefon'),
        fax=item.get('fax'),
        email=item.get('email'),
        website=item.get('homepage'),
        address=first_address.get('strasse'),
        zip=place.get('plz'),
        city=place.get('ort'),
        school_type=kind,
        provider=carrier,
        legal_status=item.get("sdb_traegerschaft", {}).get('bezeichnung'),
        id='NI-{}'.format(item.get('schulnr')),
    )
def process_item(self, item: Item, spider):
    """Drop items whose ``url`` was already seen; pass new ones through.

    :param item: the scraped item; must support ``item['url']``.
    :param spider: not used by this method.
    :return: ``item`` unchanged, after recording its url in ``self.ids_seen``.
    :raises DropItem: when the url is already in ``self.ids_seen``.
    """
    url = item['url']
    if url not in self.ids_seen:
        self.ids_seen.add(url)
        return item
    raise DropItem("Duplicate item found: %s" % item.get('url', item))
def normalize(item: Item) -> School:
    """Build a ``School`` with an ``SN-`` prefixed id from a scraped item.

    ``Postanschrift`` is passed through unchanged as ``address`` and
    ``phone_numbers`` unchanged as ``phone``; no zip or city is set.
    """
    # School attribute -> key of the scraped item it is copied from.
    source_keys = {
        'name': 'title',
        'address': 'Postanschrift',
        'website': 'Homepage',
        'email': 'E-Mail',
        'school_type': 'Einrichtungsart',
        'legal_status': 'Rechtsstellung',
        'provider': 'Schulträger',
        'fax': 'Telefax',
        'phone': 'phone_numbers',
        'director': 'Schulleiter',
    }
    attributes = {field: item.get(key) for field, key in source_keys.items()}
    school_id = 'SN-{}'.format(item.get('Dienststellenschlüssel'))
    return School(id=school_id, **attributes)
def normalize(self, item: Item) -> School:
    """Build a ``School`` with a ``BE-`` prefixed id from a scraped item.

    All attributes are copied one-to-one from the item. ``self`` is not used.
    """
    school_id = 'BE-{}'.format(item.get('id'))
    attributes = dict(
        name=item.get('name'),
        address=item.get('address'),
        zip=item.get('zip'),
        city=item.get('city'),
        website=item.get('web'),
        email=item.get('mail'),
        school_type=item.get('schooltype'),
        fax=item.get('fax'),
        phone=item.get('telephone'),
        director=item.get('headmaster'),
        legal_status=item.get('legal_status'),
    )
    return School(id=school_id, **attributes)
def insert_sql_values(self, table: str, item: scrapy.Item) -> Tuple[str, Tuple]:
    """Return a parameterized ``INSERT OR IGNORE`` statement and its values.

    :param table: table name; interpolated directly into the SQL text.
    :param item: the scraped item. The movie id defaults to ``0`` and every
        other field to ``""`` when absent.
    :return: ``(sql, values)`` with ``sql`` holding one ``?`` placeholder
        per value, in the order the values are listed.

    The statement is also logged at INFO level with the values filled in.
    """
    # Fields that default to "", in the order they appear in the statement.
    text_fields = (
        Item.TITLE_NAME,
        Item.DIRECTOR_NAME,
        Item.AUTHOR_NAME,
        Item.ACTOR_NAME,
        Item.REGION_NAME,
        Item.LANG_NAME,
        Item.GENRE_NAME,
        Item.RELEASE_NAME,
        Item.EPISODE_NAME,
        Item.DURATION_NAME,
        Item.RUNTIME_NAME,
        Item.AVERAGE_NAME,
        Item.VOTES_NAME,
    )
    values = (item.get(Item.MOVIE_ID_NAME, 0),) + tuple(
        item.get(field, "") for field in text_fields
    )

    logging.info(("INSERT OR IGNORE INTO %s VALUES("
                  "%d, '%s', '%s', '%s', '%s', '%s', '%s', "
                  "'%s', '%s', '%s', '%s', '%s', '%s', '%s')"),
                 table, *values)

    placeholders = ", ".join("?" for _ in values)
    return f"INSERT OR IGNORE INTO {table} VALUES({placeholders})", values
def normalize(item: Item) -> School:
    """Build a ``School`` with an ``SN-`` prefixed id from a scraped item.

    ``phone`` is the first value of the ``phone_numbers`` mapping, or
    ``None`` when the mapping is empty or missing.

    ``Postanschrift`` is split around its runs of five digits: the text
    before the first run becomes ``address``, the first run becomes ``zip``
    and the text after it becomes ``city``. Without a five-digit run,
    ``address`` and ``zip`` are empty strings and the whole stripped value
    becomes ``city``. A missing ``Postanschrift`` is treated as empty.

    ``director`` falls back to ``Schulleiter/in`` when ``Schulleiter`` is
    empty or absent.
    """
    phone_book = item.get('phone_numbers') or {}
    phone = next(iter(phone_book.values()), None)

    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    zip_pattern = r'\d{5}'
    postal = (item.get('Postanschrift') or '').strip()
    zip_match = re.search(zip_pattern, postal)
    if zip_match:
        # ``re.split`` always returns at least one element, and a match
        # guarantees two, so no separate empty-result case is needed.
        segments = re.split(zip_pattern, postal)
        address = segments[0].strip()
        zip_code = zip_match.group()
        city = segments[1].strip()
    else:
        address = ''
        zip_code = ''
        city = postal

    return School(name=item.get('title'),
                  id='SN-{}'.format(item.get('Dienststellenschlüssel')),
                  address=address,
                  zip=zip_code,
                  city=city,
                  website=item.get('Homepage'),
                  email=item.get('E-Mail'),
                  school_type=item.get('Einrichtungsart'),
                  legal_status=item.get('Rechtsstellung'),
                  provider=item.get('Schulträger'),
                  fax=item.get('Telefax'),
                  phone=phone,
                  director=item.get('Schulleiter') or item.get('Schulleiter/in'))
def normalize(item: Item) -> School:
    """Build a ``School`` with a ``BW-`` prefixed id from a scraped item.

    ``school_type`` is always the empty string.
    """
    # School attribute -> key of the scraped item it is copied from.
    source_keys = (
        ('name', 'name'),
        ('address', 'Strasse'),
        ('zip', 'PLZ'),
        ('city', 'Ort'),
        ('website', 'Internet'),
        ('email', 'E-Mail'),
        ('fax', 'Fax'),
        ('phone', 'Telefon'),
        ('provider', 'Schulamt'),
        ('director', 'Schulleitung'),
    )
    attributes = {field: item.get(key) for field, key in source_keys}
    return School(id='BW-{}'.format(item.get('id')),
                  school_type='',
                  **attributes)
def normalize(item: Item) -> School:
    """Build a ``School`` with an ``MV-`` prefixed id from a scraped item.

    ``Dst-Nr.:`` and ``Plz`` are converted to text and a trailing ``.0`` is
    removed (presumably the values arrive as floats such as ``17033.0`` —
    confirm against the data source). Only a trailing ``.0`` is stripped;
    the earlier ``replace`` removed ``.0`` wherever it occurred.

    A missing ``Plz`` now yields ``zip=None`` instead of the text
    ``'None'``. A missing ``Dst-Nr.:`` still yields the id ``MV-None``.
    ``address2`` is always the empty string.
    """
    def as_clean_text(value):
        # Keep None as None so an absent value is not stored as 'None'.
        if value is None:
            return None
        text = str(value)
        return text[:-2] if text.endswith('.0') else text

    dst = as_clean_text(item.get('Dst-Nr.:'))
    plz = as_clean_text(item.get('Plz'))
    return School(name=item.get('Schulname'),
                  id='MV-{}'.format(dst),
                  address=item.get('Straße, Haus-Nr.'),
                  address2='',
                  zip=plz,
                  city=item.get('Ort'),
                  website=item.get('Homepage'),
                  email=item.get('E-Mail'),
                  school_type=item.get('Schulart/ Org.form'),
                  fax=item.get('Telefax'),
                  phone=item.get('Telefon'),
                  provider=item.get('Schul-behörde'),
                  director=item.get('Schulleitung'))
def normalize(item: Item) -> School:
    """Build a ``School`` with an ``SN-`` prefixed id from a scraped item.

    ``Postanschrift`` is split around its runs of five digits: the text
    before the first run becomes ``address``, the first run becomes ``zip``
    and the text after it becomes ``city``. Without a five-digit run,
    ``address`` and ``zip`` are empty strings and the whole stripped value
    becomes ``city`` (previously this raised IndexError). A missing
    ``Postanschrift`` is treated as empty.

    ``phone`` is the first value of the ``phone_numbers`` mapping, or
    ``None`` when the mapping is empty or missing (previously this raised).
    ``director`` falls back to ``Schulleiter/in`` when ``Schulleiter`` is
    empty or absent.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    zip_pattern = r'\d{5}'
    postal = (item.get('Postanschrift') or '').strip()
    zip_match = re.search(zip_pattern, postal)
    if zip_match:
        # A match guarantees at least two segments.
        segments = re.split(zip_pattern, postal)
        address, zip_code, city = (
            segments[0].strip(), zip_match.group(), segments[1].strip())
    else:
        address, zip_code, city = '', '', postal

    phone_book = item.get('phone_numbers') or {}
    first_phone = next(iter(phone_book.values()), None)

    return School(name=item.get('title'),
                  id='SN-{}'.format(item.get('Dienststellenschlüssel')),
                  address=address,
                  zip=zip_code,
                  city=city,
                  website=item.get('Homepage'),
                  email=item.get('E-Mail'),
                  school_type=item.get('Einrichtungsart'),
                  legal_status=item.get('Rechtsstellung'),
                  provider=item.get('Schulträger'),
                  fax=item.get('Telefax'),
                  phone=first_phone,
                  director=item.get('Schulleiter') or item.get('Schulleiter/in'))
def normalize(item: Item) -> School:
    """Build a geocoded ``School`` with an ``NW-`` prefixed id.

    The name is the three ``Schulbezeichnung_*`` parts joined by spaces and
    stripped. ``legal_status``, ``school_type`` and ``provider`` are looked
    up through ``NordRheinWestfalenHelper.resolve``. The coordinates in
    ``UTMRechtswert``/``UTMHochwert`` are transformed from the projection
    named in ``EPSG`` to ``epsg:4326`` and stored as longitude/latitude.

    ``phone`` and ``fax`` are the dialling prefix followed by the number. A
    part that is absent is skipped, and the result is ``None`` when both
    are absent (the f-string form produced the text ``NoneNone``).
    """
    name = " ".join([
        item.get("Schulbezeichnung_1", ""),
        item.get("Schulbezeichnung_2", ""),
        item.get("Schulbezeichnung_3", "")
    ]).strip()

    def joined_number(prefix_key, number_key):
        # Concatenate the parts that are present; None if there are none.
        present = [str(part)
                   for part in (item.get(prefix_key), item.get(number_key))
                   if part is not None]
        return ''.join(present) if present else None

    helper = NordRheinWestfalenHelper()

    right, high = item.get('UTMRechtswert'), item.get('UTMHochwert')
    this_projection = Proj(item.get('EPSG'))
    target_projection = Proj('epsg:4326')
    # NOTE(review): depending on the pyproj version, 'epsg:4326' may follow
    # the authority axis order (lat, lon) unless always_xy is requested —
    # confirm lon/lat are not swapped here.
    lon, lat = transform(this_projection, target_projection, right, high)

    return School(
        name=name,
        id='NW-{}'.format(item.get('Schulnummer')),
        address=item.get('Strasse'),
        zip=item.get("PLZ"),
        city=item.get('Ort'),
        website=item.get('Homepage'),
        email=item.get('E-Mail'),
        legal_status=helper.resolve('rechtsform', item.get('Rechtsform')),
        school_type=helper.resolve('schulform', item.get('Schulform')),
        provider=helper.resolve('provider', item.get('Traegernummer')),
        fax=joined_number('Faxvorwahl', 'Fax'),
        phone=joined_number('Telefonvorwahl', 'Telefon'),
        latitude=lat,
        longitude=lon,
    )