def save_items(table_name, items, id_lv=None):
    """Save items into DB. Items are saved separately, because we want to
    store the id for further processing."""
    primary_key = get_item_primary_key_name(table_name)
    for item in items:
        is_change = not item.get('zadna_zmena', False)
        if is_change:
            # exclude owners from the saved item, we add them back after the save
            owners = item.pop('vlastnici', [])
            if 'cislo_zaznamu' not in item:
                item['cislo_zaznamu'] = 1
            if id_lv:
                item['id_lv'] = id_lv
            db = db_handler.get_dataset()
            try:
                item[primary_key] = db[table_name].insert(item)
            except Exception as e:
                pprint(e)
            item['vlastnici'] = owners
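# Usage sketch (not from the original code; field names other than
# 'vlastnici' are made up for illustration):
#
#   item = {'some_field': 'value', 'vlastnici': [{'jmeno': '...'}]}
#   save_items('pozemek', [item], id_lv=42)
#
# After the call the item carries the generated primary key and its
# 'vlastnici' list again, so the owners can be stored separately under
# that key.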
def prepare_batch():
    settings = get_project_settings()
    max_items = settings['MAX_STAVEBNI_OBJEKTY_IN_BATCH']
    max_processes = settings['MAX_STAVEBNI_OBJEKTY_PROCESSES']
    # check the currently running scrapyd processes
    active_processes = utils.sum_active_jobs(LandRegisterCrawler.project_name,
                                             LandRegisterCrawler.spider_name)
    max_processes -= active_processes
    if max_processes < 1:
        return []
    items_to_process = max_items * max_processes
    db = db_handler.get_dataset()
    results = db['stavebni_objekt_ref'].find(stav_scrapingu='W',
                                             _limit=items_to_process,
                                             order_by='id')
    batch = []
    single_list = []
    for r in results:
        if len(single_list) == max_items:
            batch.append(single_list.copy())
            single_list = []
        single_list.append(r['id'])
    if single_list:
        batch.append(single_list)
    return batch
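# Chunking sketch (illustrative numbers, not taken from the settings): with
# MAX_STAVEBNI_OBJEKTY_IN_BATCH = 2, two free scrapyd slots and waiting rows
# with ids 1..5, prepare_batch() returns [[1, 2], [3, 4]] -- at most max_items
# ids per chunk and at most max_items * max_processes ids in total.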
def filter_lv_change(new_item):
    """Check for item change and filter (remove) unchanged fields."""
    primary_key = get_item_primary_key_name('lv')
    search_key = get_item_search_key('lv', new_item)
    db = db_handler.get_dataset()
    all_items = db['lv'].find(**search_key, order_by='cislo_zaznamu')
    current_item = {}
    for item in all_items:
        current_item.update((k, v) for k, v in item.items() if v is not None)
    # item is new => no changes have been made
    if not current_item:
        return
    get_changed_fields(current_item, new_item)
    new_item[primary_key] = current_item[primary_key]
    if is_changed(new_item, ignore_fields=['vlastnici', primary_key]):
        new_item['cislo_zaznamu'] = current_item['cislo_zaznamu'] + 1
    else:
        new_item['zadna_zmena'] = True
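# Versioning note (an interpretation of the code above, not an authoritative
# description): an LV is stored as a chain of records ordered by cislo_zaznamu
# and the merged non-null values form its current state; get_changed_fields()
# is expected to drop the fields of new_item that did not change, a remaining
# change bumps cislo_zaznamu, and an unchanged item is only flagged with
# 'zadna_zmena'.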
def process_seznam_nemovitosti(seznam_nemovitosti, id_rizeni):
    db = db_handler.get_dataset()
    for s in seznam_nemovitosti:
        s['id_rizeni'] = id_rizeni
        db['seznam_nemovitosti'].upsert(s, ['id_rizeni', 'poradove_cislo'])
def process_predmety_rizeni(predmety_rizeni, id_rizeni):
    db = db_handler.get_dataset()
    for p in predmety_rizeni:
        p['id_rizeni'] = id_rizeni
        db['predmety_rizeni'].upsert(p, ['id_rizeni', 'poradove_cislo'])
def create_scraping(name=None):
    """Creates a new scraping process."""
    db = db_handler.get_dataset()
    fields = dict(nazev=name, datum_zacatku=datetime.now())
    scraping_id = db['log_scraping'].insert(fields)
    return scraping_id
def process_ucastnici_rizeni(ucastnici, id_rizeni):
    db = db_handler.get_dataset()
    for u in ucastnici:
        u['id_rizeni'] = id_rizeni
        db['ucastnici_rizeni'].upsert(u, ['id_rizeni', 'poradove_cislo'])
def is_operation_updated(operation_number, wp, state):
    """Checks if operation has been updated already."""
    db = db_handler.get_dataset()
    result = db['rizeni'].find_one(cislo_pracoviste=wp,
                                   cislo_rizeni=operation_number,
                                   stav_rizeni=state)
    return bool(result)
def load_items(id_lv, table_name, additional_fields=None):
    """Loads all items (including historical changes) for a single LV."""
    additional_fields = additional_fields or {}
    db = db_handler.get_dataset()
    return db[table_name].find(
        id_lv=id_lv,
        **additional_fields,
        order_by='cislo_zaznamu'
    )
def get_ku_jobs(scraping_id, status, limit=None):
    """Loads the list of KU jobs by status
    (R - Running, F - Finished, E - Error, W - Waiting)."""
    db = db_handler.get_dataset()
    results = db['log_uloha'].find(
        id_scrapingu=scraping_id,
        stav=status,
        _limit=limit
    )
    return list(results)
def process_provedene_operace(provedene_operace, id_rizeni):
    db = db_handler.get_dataset()
    for p in provedene_operace:
        p['id_rizeni'] = id_rizeni
        datum = p.get('datum')
        p['datum'] = common.get_date_from_string(datum) if datum else datum
        db['provedene_operace'].upsert(p, ['id_rizeni', 'poradove_cislo'])
def generate_scraping_objects(ids):
    """Yields scraping objects (stavebni_objekt_ref rows) for the given ids."""
    db = db_handler.get_dataset()
    table = db['stavebni_objekt_ref'].table
    statement = table.select(table.c.id.in_(tuple(ids)))
    results = db.query(statement)
    for obj in results:
        yield obj
def get_id_lv(item):
    """Get the LV's ID; if it does not exist, the item will be saved."""
    db = db_handler.get_dataset()
    return db['lv'].upsert(
        dict(cislo_ku=item.get('cislo_ku'),
             cislo_lv=item.get('cislo_lv'),
             cislo_zaznamu=1),
        ['cislo_ku', 'cislo_lv'])
def get_id_rizeni(item):
    """Get the ID of the rizeni; if it does not exist, the item will be saved."""
    db = db_handler.get_dataset()
    return db['rizeni'].upsert(
        dict(cislo_pracoviste=item.get('cislo_pracoviste'),
             cislo_rizeni=item.get('cislo_rizeni')),
        ['cislo_pracoviste', 'cislo_rizeni'])
def get_scraping_id():
    """Loads the ID of the latest scraping process."""
    db = db_handler.get_dataset()
    result = db.query("""
        SELECT MAX(id) AS id
        FROM log_scraping""")
    for r in result:
        return r['id'] if r['id'] else None
def update_job_log(job_id, job_hash, status):
    """Updates the job's hash and status."""
    db = db_handler.get_dataset()
    fields = dict(id=job_id, hash_ulohy=job_hash, stav=status, datum_konce=None)
    # if the status is R (running), set the start date
    if status == 'R':
        fields['datum_zacatku'] = datetime.now()
    db['log_uloha'].update(fields, ['id'])
def save_ku_csv_into_db(file):
    """Saves all KU items into DB."""
    with open(file, mode='r', encoding='utf8') as csv_file:
        csv_reader = csv.DictReader(csv_file, delimiter=';')
        items_to_save = []
        for row in csv_reader:
            item = prepare_ku_to_save(row)
            items_to_save.append(item)
    db = db_handler.get_dataset()
    db['ku'].insert_many(items_to_save)
def process_ref_rizeni(item, number, type):
    lv_item = item.get('lv_item')
    id_lv = get_id_lv(lv_item)
    id_rizeni = get_id_rizeni(item)
    if not id_rizeni:
        return
    db = db_handler.get_dataset()
    db['seznam_nemovitosti'].update(
        dict(id_lv=id_lv, id_rizeni=id_rizeni, typ=type, cislo=number),
        ['id_rizeni', 'typ', 'cislo'])
def get_last_processed_lv(job_id):
    """Loads the last processed LV for a job ID (the job ID is connected
    to a KU, and with this connection we can distinguish data from
    different scrapings)."""
    db = db_handler.get_dataset()
    result = db.query("""
        SELECT MAX(cislo_lv) AS max_lv
        FROM log_lv
        WHERE id_ulohy = {}""".format(job_id))
    for r in result:
        return r['max_lv'] if r['max_lv'] else 0
def delete_whole_lv_item(lv, ku):
    """Delete all objects of an LV in the specified KU."""
    db = db_handler.get_dataset()
    result = db['lv'].find_one(cislo_lv=lv, cislo_ku=ku)
    id_lv = result['id'] if result else None
    if not id_lv:
        return
    # delete object tables
    for t in ['pozemek', 'stavebni_objekt', 'stavba', 'jednotka', 'vlastnici']:
        db[t].delete(id_lv=id_lv)
    # delete main table
    db['lv'].delete(id=id_lv)
def process_rizeni(item):
    data = item.get('data')
    datum = data.get('datum_prijeti')
    data['datum_prijeti'] = common.get_datetime_from_string(
        datum) if datum else datum
    db = db_handler.get_dataset()
    id_rizeni = db['rizeni'].upsert(data, ['cislo_pracoviste', 'cislo_rizeni'])
    if item.get('ucastnici'):
        process_ucastnici_rizeni(item['ucastnici'], id_rizeni)
    if item.get('provedene_operace'):
        process_provedene_operace(item['provedene_operace'], id_rizeni)
    if item.get('predmety_rizeni'):
        process_predmety_rizeni(item['predmety_rizeni'], id_rizeni)
    if item.get('seznam_nemovitosti'):
        process_seznam_nemovitosti(item['seznam_nemovitosti'], id_rizeni)
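# Expected shape of the scraped item (a sketch inferred from the keys read
# above; the contents of the nested lists are defined elsewhere):
#
#   item = {
#       'data': {'cislo_pracoviste': '...', 'cislo_rizeni': '...',
#                'datum_prijeti': '...'},
#       'ucastnici': [...],
#       'provedene_operace': [...],
#       'predmety_rizeni': [...],
#       'seznam_nemovitosti': [...],
#   }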
def load_all_ku():
    """Loads the codes of all KU valid today."""
    db = db_handler.get_dataset()
    results = db['ku'].all()
    today = date.today()
    ku_list = []
    for r in results:
        if r['plati_od'] and r['plati_od'] > today:
            continue
        if r['plati_do'] and r['plati_do'] < today:
            continue
        ku_list.append(r['cislo_ku'])
    return ku_list
def init_new_scraping():
    """Inits a new scraping of the cadastre. Prepares all KU codes and
    sets the waiting status for them."""
    scraping_id = create_scraping()
    ku_list = load_all_ku()
    ku_jobs = []
    for ku in ku_list:
        ku_jobs.append({
            'id_scrapingu': scraping_id,
            'cislo_ku': ku,
            'stav': 'W'
        })
    db = db_handler.get_dataset()
    db['log_uloha'].insert_many(ku_jobs)
    return scraping_id
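# Lifecycle sketch composed from functions in this module (the crawler run in
# the middle is only indicated, not implemented here):
#
#   scraping_id = init_new_scraping()
#   for job in get_ku_jobs(scraping_id, 'W', limit=10):
#       update_job_log(job['id'], job_hash=None, status='R')
#       ...  # run the crawler for job['cislo_ku']
#       update_job_status(job['id'], 'F')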
def check_completness(ku_code):
    scraping_id = get_scraping_id()
    db = db_handler.get_dataset()
    max_valid_lvs = db.query("""
        SELECT MAX(log_lv.cislo_lv), log_lv.cislo_ku
        FROM log_lv
        INNER JOIN log_uloha ON log_lv.id_ulohy = log_uloha.id
        WHERE log_uloha.id_scrapingu = {}
        GROUP BY log_lv.cislo_ku, log_lv.existuje
        ORDER BY log_lv.cislo_ku ASC""".format(scraping_id))
    # Alternative query for a single KU (unused):
    #   SELECT MAX(cislo_lv), cislo_ku, existuje
    #   FROM log_lv
    #   WHERE cislo_ku = '600016'
    #   GROUP BY cislo_ku, existuje
    #   ORDER BY cislo_ku ASC
    max_invalid_lvs = db.query("""
        SELECT MAX(cislo_lv) AS max_invalid_lv, cislo_ku
        FROM log_lv
        WHERE existuje = 0""")
def update_rizeni_log(self, status):
    db = db_handler.get_dataset()
    db['log_rizeni'].update(
        dict(id=self.job_id, stav=status, datum_konce=datetime.now()),
        ['id'])
def save_buildings_refs(id_lv, refs):
    refs = delete_duplicates(refs)
    db = db_handler.get_dataset()
    for r in refs:
        r['id_lv'] = id_lv
    db['stavebni_objekt_ref'].insert_many(refs)
def update_scraping_object(scraping_object, status):
    db = db_handler.get_dataset()
    db['stavebni_objekt_ref'].update(
        dict(id=scraping_object['id'], stav_scrapingu=status),
        ['id'])
def update_item_status(id, status):
    db = db_handler.get_dataset()
    db['stavebni_objekt_ref'].update(dict(id=id, stav_scrapingu=status), ['id'])
def delete_all_ku():
    """Deletes all ku data."""
    db = db_handler.get_dataset()
    db['ku'].delete()
def update_job_status(job_id, status):
    db = db_handler.get_dataset()
    fields = dict(id=job_id, stav=status, datum_konce=datetime.now())
    db['log_uloha'].update(fields, ['id'])