def entry_generator(self):
    """Yield edges from 'nodes_vybor' ids to 'parsed_zakon' entries.

    For every zakon entry that carries ``const.ZAKON_ROZHODNUTIE_VYBORY``,
    each vybor id that occurs in that value or in
    ``entry[const.ZAKON_ROZHODNUTIE_GESTORSKY]`` produces one dict holding
    the Neo4j beginning/ending ids, the value returned by
    ``self.get_lehota`` and the edge type.  When a vybor matches both
    fields, the gestorsky type wins.

    Entries whose vybory value is the empty string are skipped.  The
    previous implementation used ``break`` at that point, which stopped the
    whole iteration and silently dropped every entry after the first empty
    one.

    Raises:
        KeyError: if an entry has the vybory field but lacks
            ``const.ZAKON_ROZHODNUTIE_GESTORSKY``.
    """
    vybory = [
        vybor_entry[const.MONGO_ID]
        for vybor_entry in storage.MongoCollection(
            self.db, 'nodes_vybor').iterate_all()
    ]
    source_collection = storage.MongoCollection(self.db, 'parsed_zakon')

    for entry in source_collection.iterate_all():
        if const.ZAKON_ROZHODNUTIE_VYBORY not in entry:
            continue
        sprava = entry[const.ZAKON_ROZHODNUTIE_VYBORY]
        if sprava == '':
            # Nothing to match for this entry; keep processing the rest.
            continue
        lehota = self.get_lehota(sprava)
        for vybor in vybory:
            # The gestorsky check comes first because it overrides a plain
            # mention in the vybory text.
            if vybor in entry[const.ZAKON_ROZHODNUTIE_GESTORSKY]:
                typ = const.NAVRHNUTY_GESTORSKY
            elif vybor in sprava:
                typ = const.NAVRHNUTY_DOPLNUJUCI
            else:
                continue
            yield {
                const.NEO4J_BEGINNING_ID: vybor,
                const.NEO4J_ENDING_ID: entry[const.MONGO_ID],
                const.NAVRHNUTY_LEHOTA: lehota,
                const.NAVRHNUTY_TYP: typ,
            }
def entry_generator(self):
    """Yield edges from second-reading hlasovania to the zmeny they match.

    For each 'parsed_zakon' entry the matching 'parsed_hlasovanietlace'
    document is fetched by id.  Hlasovania whose title contains
    'druhé čítanie' are kept, reduced to the text after the last
    'Hlasovanie'.  Each zmena (in sorted id order) is then matched to the
    titles that contain its first predkladatel name without the final
    character.  When the same name occurs for several zmeny, the title must
    also contain '<k>. návrh', where k is the 1-based occurrence of that
    name.  Titles mentioning 'dopracovanie' or 'preložiť' are ignored.
    """
    zakony = storage.MongoCollection(self.db, 'parsed_zakon')
    tlace = storage.MongoCollection(self.db, 'parsed_hlasovanietlace')

    for zakon in zakony.iterate_all():
        tlac = tlace.get({const.MONGO_ID: zakon[const.MONGO_ID]})
        hlasovania = tlac.get(const.HLASOVANIETLAC_LIST, {})
        zmeny = zakon.get(const.ZAKON_ZMENY, {})
        zmena_ids = sorted(zmeny.keys())
        mena = [
            zmeny[zmena_id][const.ZAKON_ZMENY_PREDKLADATEL].split(',')[0]
            for zmena_id in zmena_ids
        ]

        # Second-reading vote titles keyed by hlasovanie id.
        titulky = {}
        for hlas_id, hlasovanie in hlasovania.items():
            if 'druhé čítanie' in hlasovanie[const.HLASOVANIE_NAZOV]:
                titulky[hlas_id] = hlasovanie[
                    const.HLASOVANIE_NAZOV].split('Hlasovanie')[-1]
        hlas_text = pd.Series(titulky)
        if len(hlas_text) == 0:
            continue

        # 0 for a unique name, otherwise the 1-based occurrence index.
        poradie = [
            mena[:pos + 1].count(meno) if mena.count(meno) > 1 else 0
            for pos, meno in enumerate(mena)
        ]

        for zmena_id, meno, kolko in zip(zmena_ids, mena, poradie):
            kandidati = hlas_text[hlas_text.str.contains(meno[:-1])]
            if kolko > 0:
                kandidati = kandidati[kandidati.str.contains(
                    f'{kolko}. návrh')]
            for hlas_id, text in kandidati.items():
                if 'dopracovanie' in text or 'preložiť' in text:
                    continue
                yield {
                    const.NEO4J_BEGINNING_ID: int(hlas_id),
                    const.NEO4J_ENDING_ID: int(zmena_id)
                }
def entry_generator(self):
    """Yield poslanec -> klub membership edges for selected poslanci.

    The selection is the key set of ``const.HLASOVANIE_INDIVIDUALNE`` in
    the 'parsed_hlasovanie' document returned for a ``-1`` sort on
    ``const.MONGO_ID`` (presumably the most recent vote -- confirm against
    ``MongoCollection.get``).  For each selected 'parsed_poslanec' entry,
    the first membership found in ``const.KLUB_DICT`` sets the klub and
    membership type.  Entries with no such membership default to
    ``const.KLUB_NEZARADENI`` / ``const.CLEN_CLEN``.

    The ids are held in a set so the per-entry membership test no longer
    scans a list.
    """
    hlasovania = storage.MongoCollection(self.db, 'parsed_hlasovanie')
    last_entry = hlasovania.get(
        {},
        projection=[const.HLASOVANIE_INDIVIDUALNE],
        sort=[(const.MONGO_ID, -1)])
    aktivni_ids = {
        int(poslanec_id)
        for poslanec_id in last_entry[const.HLASOVANIE_INDIVIDUALNE]
    }

    poslanci = storage.MongoCollection(self.db, 'parsed_poslanec')
    for entry in poslanci.iterate_all():
        if entry[const.MONGO_ID] not in aktivni_ids:
            continue
        klub = const.KLUB_NEZARADENI
        typ = const.CLEN_CLEN
        for org, rola in entry[const.POSLANEC_CLENSTVO].items():
            if org in const.KLUB_DICT:
                # Only the first klub membership is used.
                klub = const.KLUB_DICT[org]
                typ = const.CLEN_TYP_DICT[rola]
                break
        yield {
            const.NEO4J_BEGINNING_ID: int(entry[const.MONGO_ID]),
            const.NEO4J_ENDING_ID: klub,
            const.CLEN_TYP: typ
        }
def entry_generator(self):
    """Yield spektrum -> zakon edges describing who proposed each zakon.

    Entries without ``const.ZAKON_NAVRHOVATEL`` are skipped.  If the
    navrhovatel text contains ``const.NAVRHOL_VLADA`` (case-insensitive),
    the edge starts at ``const.SPEKTRUM_KOALICIA``.  If it contains
    ``const.NAVRHOL_POSLANCI``, the proposing poslanci are resolved to
    their klub and then to a spektrum.  The edge starts at koalicia only
    when koalicia strictly outnumbers opozicia, otherwise at opozicia.
    Zakony with no recorded proposing poslanci yield nothing.
    """
    zakony = storage.MongoCollection(self.db, 'parsed_zakon')
    # TODO: fix collection naming
    navrhy = storage.MongoCollection(self.db, 'edges_poslanec_zakon_navrhol')
    clenstva = storage.MongoCollection(self.db,
                                       'edges_poslanec_klub_bol_clenom')
    spektra = storage.MongoCollection(self.db, 'edges_klub_spektrum_clen')

    for zakon in zakony.iterate_all():
        if const.ZAKON_NAVRHOVATEL not in zakon:
            continue
        navrhovatel = zakon[const.ZAKON_NAVRHOVATEL]
        zakon_id = zakon[const.MONGO_ID]

        if const.NAVRHOL_VLADA.lower() in navrhovatel.lower():
            yield {
                const.NEO4J_BEGINNING_ID: const.SPEKTRUM_KOALICIA,
                const.NEO4J_ENDING_ID: zakon_id,
                const.NAVRHOL_NAVRHOVATEL: navrhovatel
            }
            continue
        if const.NAVRHOL_POSLANCI.lower() not in navrhovatel.lower():
            continue

        poslanec_ids = [
            hrana[const.NEO4J_BEGINNING_ID]
            for hrana in navrhy.get_all(
                {const.NEO4J_ENDING_ID: zakon[const.MONGO_ID]})
        ]
        if not poslanec_ids:
            continue

        # Resolve all kluby first, then all spektra, in that order.
        klub_ids = [
            clenstva.get({const.NEO4J_BEGINNING_ID: pid})[
                const.NEO4J_ENDING_ID]
            for pid in poslanec_ids
        ]
        strany = [
            spektra.get({const.NEO4J_BEGINNING_ID: kid})[
                const.NEO4J_ENDING_ID]
            for kid in klub_ids
        ]

        koalicia = strany.count(const.SPEKTRUM_KOALICIA)
        opozicia = strany.count(const.SPEKTRUM_OPOZICIA)
        # A tie goes to opozicia.
        vitaz = (const.SPEKTRUM_KOALICIA
                 if koalicia > opozicia else const.SPEKTRUM_OPOZICIA)
        yield {
            const.NEO4J_ENDING_ID: zakon_id,
            const.NAVRHOL_NAVRHOVATEL: const.NAVRHOL_POSLANCI,
            const.NEO4J_BEGINNING_ID: vitaz
        }
def entry_generator(self):
    """Yield every 'parsed_hlasovanie' entry without two of its fields.

    ``const.MONGO_TIMESTAMP`` and ``const.HLASOVANIE_INDIVIDUALNE`` are
    deleted from each entry in place.  A missing key raises ``KeyError``.
    """
    hlasovania = storage.MongoCollection(self.db, 'parsed_hlasovanie')
    for hlasovanie in hlasovania.iterate_all():
        for key in (const.MONGO_TIMESTAMP, const.HLASOVANIE_INDIVIDUALNE):
            del hlasovanie[key]
        yield hlasovanie
def entry_generator(self):
    """Yield every 'parsed_zmena' entry without its person-related fields.

    ``const.ZMENA_PODPISANI`` and ``const.ZMENA_DALSI`` are removed if
    present.  ``const.ZMENA_PREDKLADATEL`` must exist, and a missing key
    raises ``KeyError``.  Entries are modified in place.
    """
    zmeny = storage.MongoCollection(self.db, 'parsed_zmena')
    for zmena in zmeny.iterate_all():
        for optional_key in (const.ZMENA_PODPISANI, const.ZMENA_DALSI):
            zmena.pop(optional_key, None)
        del zmena[const.ZMENA_PREDKLADATEL]
        yield zmena
def create_id_generator(self):
    """Return all zmena ids referenced by 'parsed_zakon' entries as ints.

    The ids are the items obtained by iterating each value returned by
    ``get_all_attribute(const.ZAKON_ZMENY)``, in encounter order.
    """
    zakony = storage.MongoCollection(self.db, 'parsed_zakon')
    zmena_ids = []
    for zmeny in zakony.get_all_attribute(const.ZAKON_ZMENY):
        zmena_ids.extend(int(zmena_id) for zmena_id in zmeny)
    return zmena_ids
def __init__(self, db, conf, base_url):
    """Store the handles and open the class's raw collection.

    The collection is named ``'raw_' + <lower-cased class name>``.  The
    class name is read from ``__class__.__name__`` instead of being parsed
    out of the class repr.  For ordinarily declared classes this gives the
    same string.

    Args:
        db: database handle passed to ``storage.MongoCollection``.
        conf: configuration object, stored as ``self.conf``.
        base_url: stored as ``self.base_url``.
    """
    super().__init__()
    name = self.__class__.__name__.lower()
    self.db = db
    self.conf = conf
    self.collection = storage.MongoCollection(db, 'raw_' + name)
    self.base_url = base_url
def entry_generator(self):
    """Yield gestorsky -> zakon edges from 'parsed_zakon'.

    Only entries that contain ``const.ZAKON_GESTORSKY`` produce an edge.
    The edge runs from that field's value to the entry's id.
    """
    zakony = storage.MongoCollection(self.db, 'parsed_zakon')
    for zakon in zakony.iterate_all():
        if const.ZAKON_GESTORSKY not in zakon:
            continue
        edge = {
            const.NEO4J_BEGINNING_ID: zakon[const.ZAKON_GESTORSKY],
            const.NEO4J_ENDING_ID: zakon[const.MONGO_ID]
        }
        yield edge
def entry_generator(self):
    """Yield zmena -> zakon edges from 'parsed_zakon'.

    One edge is produced per key of ``entry[const.ZAKON_ZMENY]`` (treated
    as empty when absent), with the key converted to ``int``.
    """
    zakony = storage.MongoCollection(self.db, 'parsed_zakon')
    for zakon in zakony.iterate_all():
        yield from (
            {
                const.NEO4J_BEGINNING_ID: int(zmena_id),
                const.NEO4J_ENDING_ID: zakon[const.MONGO_ID]
            }
            for zmena_id in zakon.get(const.ZAKON_ZMENY, {})
        )
def get_poslanec_id(db, name):
    """Look up a poslanec id in 'nodes_poslanec' by name.

    Args:
        db: database handle passed to ``storage.MongoCollection``.
        name: string of the form ``"priezvisko, meno"`` (surname, first
            name).  It must split into exactly two comma-separated parts,
            otherwise unpacking raises ``ValueError``.

    Returns:
        The ``'id'`` value of the matching document.
    """
    priezvisko, meno = (part.strip() for part in name.split(','))
    query = {
        const.POSLANEC_PRIEZVISKO: priezvisko,
        const.POSLANEC_MENO: meno
    }
    match = storage.MongoCollection(db, 'nodes_poslanec').get(query)
    return match['id']
def __init__(self, db, conf):
    """Set up the target collection and logger for this class.

    ``self.name`` is the dotted ``<module>.<qualified class name>`` and
    ``self.target_name`` is the snake_case form of the bare class name.
    Both are read from the class attributes instead of being parsed out of
    the class repr.  For ordinarily declared (non-builtin) classes this
    gives the same strings.

    Args:
        db: database handle, stored and passed to
            ``storage.MongoCollection``.
        conf: configuration object, stored as ``self.conf``.
    """
    cls = self.__class__
    self.db = db
    self.conf = conf
    self.name = f'{cls.__module__}.{cls.__qualname__}'
    self.target_name = utils.camel2snake(cls.__name__)
    self.target_collection = storage.MongoCollection(self.db,
                                                     self.target_name)
    self.log = logging.getLogger(self.name)
    self.batch_process = True
def entry_generator(self):
    """Yield edges from 'parsed_legislativnainiciativa' entries to zakony.

    One edge is produced per item of ``entry[const.PREDLOZILZAKON_LIST]``
    (treated as empty when absent), with the zakon id converted to ``int``.
    """
    iniciativy = storage.MongoCollection(self.db,
                                         'parsed_legislativnainiciativa')
    for iniciativa in iniciativy.iterate_all():
        yield from (
            {
                const.NEO4J_BEGINNING_ID: iniciativa[const.MONGO_ID],
                const.NEO4J_ENDING_ID: int(zakon_id)
            }
            for zakon_id in iniciativa.get(const.PREDLOZILZAKON_LIST, {})
        )
def entry_generator(self):
    """Yield one node per distinct delegacia organisation.

    An organisation qualifies when its name (a key of
    ``entry[const.POSLANEC_CLENSTVO]``) contains
    ``const.POSLANEC_DELEGACIA``, compared case-insensitively.  All
    'parsed_poslanec' entries are scanned before anything is yielded.
    """
    poslanci = storage.MongoCollection(self.db, 'parsed_poslanec')
    delegacie = {
        org
        for poslanec in poslanci.iterate_all()
        for org in poslanec[const.POSLANEC_CLENSTVO]
        if const.POSLANEC_DELEGACIA.lower() in org.lower()
    }
    yield from ({const.MONGO_ID: org} for org in delegacie)
def entry_generator(self):
    """Yield hlasovanie -> tlac edges from 'parsed_hlasovanietlace'.

    One edge is produced per key of ``entry[const.HLASOVANIETLAC_LIST]``
    (treated as empty when absent), with the hlasovanie id converted to
    ``int``.
    """
    tlace = storage.MongoCollection(self.db, 'parsed_hlasovanietlace')
    for tlac in tlace.iterate_all():
        yield from (
            {
                const.NEO4J_BEGINNING_ID: int(hlasovanie_id),
                const.NEO4J_ENDING_ID: tlac[const.MONGO_ID]
            }
            for hlasovanie_id in tlac.get(const.HLASOVANIETLAC_LIST, {})
        )
def entry_generator(self):
    """Yield vystupenie -> tlac edges from 'parsed_rozprava'.

    Only vystupenia that contain ``const.ROZPRAVA_TLAC`` produce an edge.
    """
    rozpravy = storage.MongoCollection(self.db, 'parsed_rozprava')
    for rozprava in rozpravy.iterate_all():
        for vystupenie in rozprava[const.ROZPRAVA_VYSTUPENIA]:
            if const.ROZPRAVA_TLAC not in vystupenie:
                continue
            edge = {
                const.NEO4J_BEGINNING_ID: vystupenie[const.MONGO_ID],
                const.NEO4J_ENDING_ID: vystupenie[const.ROZPRAVA_TLAC]
            }
            yield edge
def get_collection(obj, conf, stage, db):
    """Return the ``'<prefix>_<suffix>'`` Mongo collection for a stage.

    Args:
        obj: either the suffix itself (a ``str``) or any object, in which
            case the suffix is its lower-cased class name.
        conf: configuration mapping.  The prefix is read from
            ``conf[CONF_MONGO][CONF_MONGO_DATABASE][CONF_MONGO_COLLECTION][stage]``.
        stage: key selecting the prefix.
        db: database handle passed to ``storage.MongoCollection``.

    The class name is read from ``__class__.__name__`` instead of being
    parsed out of the class repr.  For ordinarily declared classes this
    gives the same string.
    """
    conf_collections = conf[const.CONF_MONGO][const.CONF_MONGO_DATABASE][
        const.CONF_MONGO_COLLECTION]
    prefix = conf_collections[stage]
    suffix = obj if isinstance(obj, str) else obj.__class__.__name__.lower()
    return storage.MongoCollection(db, f'{prefix}_{suffix}')
def entry_generator(self):
    """Yield poslanec -> zmena edges for everyone listed as podpisani.

    Each name in ``entry[const.ZMENA_PODPISANI]`` (treated as empty when
    absent) is resolved to an id through ``utils.get_poslanec_id``.
    """
    zmeny = storage.MongoCollection(self.db, 'parsed_zmena')
    for zmena in zmeny.iterate_all():
        for meno in zmena.get(const.ZMENA_PODPISANI, []):
            poslanec_id = utils.get_poslanec_id(self.db, meno)
            yield {
                const.NEO4J_BEGINNING_ID: poslanec_id,
                const.NEO4J_ENDING_ID: zmena[const.MONGO_ID]
            }
def entry_generator(self):
    """Yield poslanec -> delegacia membership edges.

    A membership qualifies when the organisation name contains
    ``const.POSLANEC_DELEGACIA`` (case-insensitive).  The membership type
    is mapped through ``const.CLEN_TYP_DICT``.
    """
    poslanci = storage.MongoCollection(self.db, 'parsed_poslanec')
    for poslanec in poslanci.iterate_all():
        for org, typ in poslanec[const.POSLANEC_CLENSTVO].items():
            if const.POSLANEC_DELEGACIA.lower() not in org.lower():
                continue
            yield {
                const.NEO4J_BEGINNING_ID: poslanec[const.MONGO_ID],
                const.NEO4J_ENDING_ID: org,
                const.CLEN_TYP: const.CLEN_TYP_DICT[typ]
            }
def store_raw_html(self, entry_id):
    """Fetch and store the paginated result pages for ``entry_id``.

    Starting at ``self.base_url.format(entry_id)``, each page that shows
    the result grid is saved to ``self.collection`` (upserted on 'url' and
    'page').  Paging stops when the grid is missing, when the first speech
    id on a page is already stored in 'edges_poslanec_rozprava_vystupil',
    or after saving a page whose last speech id is already stored.

    If pages were stored earlier for this id, new pages are numbered
    ``min(stored pages) - page`` instead of ``page``.  The method sleeps
    ``self.conf['scrape']['delay']`` after every stored page.
    """
    edges = storage.MongoCollection(self.db,
                                    'edges_poslanec_rozprava_vystupil')
    known_ids = [
        edge[const.NEO4J_ENDING_ID]
        for edge in edges.get_all({const.NEO4J_BEGINNING_ID: entry_id},
                                  projections={const.NEO4J_ENDING_ID})
    ]
    known_pages = [
        doc['page']
        for doc in self.collection.get_all({'id': entry_id},
                                           projections={'page'})
    ]
    lowest_page = min(known_pages) if known_pages else None

    url = self.base_url.format(entry_id)
    browser = robobrowser.RoboBrowser(parser='html.parser', history=False)
    browser.open(url)

    page = 1
    while browser.parsed.select('#_sectionLayoutContainer_ctl01__resultGrid'):
        speech_ids = []
        for span in browser.parsed(
                'span', attrs={'class': 'daily_info_speech_header_right'}):
            link = span.find('a')
            if link:
                speech_ids.append(int(link['href'].split('=')[-1]))

        # The whole page is already known: nothing new from here on.
        if speech_ids[0] in known_ids:
            break
        # Overlap at the end of the page: store it, then stop.
        reached_known = speech_ids[-1] in known_ids

        record = {
            'url': url,
            'html': str(browser.parsed),
            'page': lowest_page - page if known_pages else page,
            'id': entry_id
        }
        self.collection.update(record, ['url', 'page'])
        sleep(self.conf['scrape']['delay'])
        if reached_known:
            break

        # Request the next page through the form postback.
        page += 1
        form = browser.get_form(id='_f')
        form.add_field(rbfields.Input(
            f'<input name="__EVENTARGUMENT" value="Page${page}" />'))
        form.add_field(rbfields.Input(
            '<input name="__EVENTTARGET" value="_sectionLayoutContainer$ctl01$_resultGrid" />'))
        form.fields.pop('_sectionLayoutContainer$ctl01$_searchButton')
        browser.submit_form(form)
def entry_generator(self):
    """Yield zakon nodes restricted to a fixed list of fields.

    Every yielded dict has exactly the listed keys.  A field missing from
    the source entry is filled with ``const.NEO4J_NULLVALUE``.
    """
    wanted = (
        const.MONGO_ID,
        const.MONGO_TIMESTAMP,
        const.ZAKON_STAV,
        const.ZAKON_VYSLEDOK,
        const.ZAKON_DRUH,
        const.ZAKON_NAZOV,
        const.MONGO_URL,
        const.MONGO_UNIQUE_ID,
        const.ZAKON_DATUM_DORUCENIA,
    )
    zakony = storage.MongoCollection(self.db, 'parsed_zakon')
    for zakon in zakony.iterate_all():
        node = {}
        for key in wanted:
            node[key] = zakon.get(key, const.NEO4J_NULLVALUE)
        yield node
def entry_generator(self):
    """Yield one klub node with its member count.

    The counts come from the ``const.HLASOVANIE_KLUB`` values of the
    individual votes in the 'parsed_hlasovanie' document returned for a
    ``-1`` sort on ``const.MONGO_ID``.  Klub names pass through
    ``utils.parse_klub`` before being used as the node id.
    """
    hlasovania = storage.MongoCollection(self.db, 'parsed_hlasovanie')
    posledne = hlasovania.get(
        {},
        projection=[const.HLASOVANIE_INDIVIDUALNE],
        sort=[(const.MONGO_ID, -1)])
    kluby = [
        hlas[const.HLASOVANIE_KLUB]
        for hlas in posledne[const.HLASOVANIE_INDIVIDUALNE].values()
    ]
    nazvy, pocty = np.unique(kluby, return_counts=True)
    for nazov, pocet in zip(nazvy, pocty):
        yield {
            const.MONGO_ID: utils.parse_klub(nazov),
            const.KLUB_POCET: int(pocet)
        }
def entry_generator(self):
    """Yield rozprava-entry -> vystupenie edges with speaker metadata.

    The speaker's klub is looked up in ``const.KLUB_DICT`` under
    ``'Klub ' + <klub>`` and falls back to ``const.NEO4J_NULLVALUE``.
    The speaker type is copied unchanged.
    """
    rozpravy = storage.MongoCollection(self.db, 'parsed_rozprava')
    for rozprava in rozpravy.iterate_all():
        for vystupenie in rozprava[const.ROZPRAVA_VYSTUPENIA]:
            klub_key = 'Klub ' + vystupenie[const.ROZPRAVA_POSLANEC_KLUB]
            klub = const.KLUB_DICT.get(klub_key, const.NEO4J_NULLVALUE)
            edge = {
                const.NEO4J_BEGINNING_ID: rozprava[const.MONGO_ID],
                const.NEO4J_ENDING_ID: vystupenie[const.MONGO_ID],
                const.ROZPRAVA_POSLANEC_KLUB: klub,
                const.ROZPRAVA_POSLANEC_TYP:
                    vystupenie[const.ROZPRAVA_POSLANEC_TYP]
            }
            yield edge
def entry_generator(self):
    """Yield one poslanec -> hlasovanie edge per individual vote.

    The vote value is mapped through ``const.HLASOVAL_HLAS_DICT`` and the
    klub through ``utils.parse_klub``.
    """
    hlasovania = storage.MongoCollection(self.db, 'parsed_hlasovanie')
    for hlasovanie in hlasovania.iterate_all():
        jednotlive = hlasovanie[const.HLASOVANIE_INDIVIDUALNE]
        for poslanec_id, zaznam in jednotlive.items():
            yield {
                const.NEO4J_BEGINNING_ID: int(poslanec_id),
                const.NEO4J_ENDING_ID: hlasovanie[const.MONGO_ID],
                const.HLASOVAL_HLAS:
                    const.HLASOVAL_HLAS_DICT[zaznam[const.HLASOVANIE_HLAS]],
                const.HLASOVAL_KLUB:
                    utils.parse_klub(zaznam[const.HLASOVANIE_KLUB])
            }
def entry_generator(self):
    """Yield each vystupenie of every 'parsed_rozprava' entry as a node.

    The vystupenie dict is modified in place.  Tlac and speaker fields are
    removed if present, the parent's timestamp and URL are copied in, and
    ``const.ROZPRAVA_DLZKA`` is set from ``self.compute_dlzka_vystupenia``
    applied to the start and end times.
    """
    dropped = (
        const.ROZPRAVA_TLAC,
        const.ROZPRAVA_POSLANEC_ID,
        const.ROZPRAVA_POSLANEC_PRIEZVISKO,
        const.ROZPRAVA_POSLANEC_MENO,
        const.ROZPRAVA_POSLANEC_KLUB,
        const.ROZPRAVA_POSLANEC_TYP,
    )
    inherited = (const.MONGO_TIMESTAMP, const.MONGO_URL)

    rozpravy = storage.MongoCollection(self.db, 'parsed_rozprava')
    for rozprava in rozpravy.iterate_all():
        for vystupenie in rozprava[const.ROZPRAVA_VYSTUPENIA]:
            for key in dropped:
                vystupenie.pop(key, None)
            for key in inherited:
                vystupenie[key] = rozprava[key]
            dlzka = self.compute_dlzka_vystupenia(
                vystupenie[const.ROZPRAVA_CAS_ZACIATOK],
                vystupenie[const.ROZPRAVA_CAS_KONIEC])
            vystupenie[const.ROZPRAVA_DLZKA] = dlzka
            yield vystupenie
def entry_generator(self):
    """Yield poslanec -> zmena "navrhol" edges.

    Each 'parsed_zmena' entry first yields one edge for its predkladatel,
    tagged ``const.NAVRHOL_HLAVNY``.  It then yields one edge per name in
    ``const.ZMENA_DALSI`` (treated as empty when absent), tagged
    ``const.NAVRHOL_DALSI``.  Names are resolved through
    ``utils.get_poslanec_id``.
    """
    zmeny = storage.MongoCollection(self.db, 'parsed_zmena')
    for zmena in zmeny.iterate_all():
        hlavny_id = utils.get_poslanec_id(
            self.db, zmena[const.ZMENA_PREDKLADATEL])
        yield {
            const.NEO4J_BEGINNING_ID: hlavny_id,
            const.NEO4J_ENDING_ID: zmena[const.MONGO_ID],
            const.NAVRHOL_NAVRHOVATEL: const.NAVRHOL_HLAVNY
        }
        for meno in zmena.get(const.ZMENA_DALSI, []):
            dalsi_id = utils.get_poslanec_id(self.db, meno)
            yield {
                const.NEO4J_BEGINNING_ID: dalsi_id,
                const.NEO4J_ENDING_ID: zmena[const.MONGO_ID],
                const.NAVRHOL_NAVRHOVATEL: const.NAVRHOL_DALSI
            }
def entry_generator(self):
    """Yield, per poslanec, the klub recorded at their latest hlasovanie.

    All 'parsed_hlasovanie' entries are scanned.  A poslanec's record is
    replaced unless the stored ``const.CLEN_NAPOSLEDY`` is strictly greater
    than the current entry's ``const.HLASOVANIE_CAS``.  Results are yielded
    only after the full scan.
    """
    hlasovania = storage.MongoCollection(self.db, 'parsed_hlasovanie')
    najnovsie = {}
    for hlasovanie in hlasovania.iterate_all():
        for poslanec_id, zaznam in hlasovanie[
                const.HLASOVANIE_INDIVIDUALNE].items():
            doterajsi = najnovsie.get(poslanec_id)
            if (doterajsi is not None
                    and doterajsi[const.CLEN_NAPOSLEDY] >
                    hlasovanie[const.HLASOVANIE_CAS]):
                # A newer membership is already recorded.
                continue
            najnovsie[poslanec_id] = {
                const.NEO4J_BEGINNING_ID: int(poslanec_id),
                const.NEO4J_ENDING_ID:
                    utils.parse_klub(zaznam[const.HLASOVANIE_KLUB]),
                const.CLEN_NAPOSLEDY: hlasovanie[const.HLASOVANIE_CAS]
            }
    yield from najnovsie.values()
def create_id_generator(self):
    """Return the ``'id'`` attribute of all 'parsed_poslanec' entries.

    The result is whatever ``MongoCollection.get_all_attribute('id')``
    returns.
    """
    return storage.MongoCollection(
        self.db, 'parsed_poslanec').get_all_attribute('id')
def __init__(self, db, conf):
    """Open the raw source and parsed target collections for this class.

    The collections are named ``'raw_<name>'`` and ``'parsed_<name>'``,
    where ``<name>`` is the lower-cased class name.  The name is read from
    ``__class__.__name__`` instead of being parsed out of the class repr.
    For ordinarily declared classes this gives the same string.

    Args:
        db: database handle passed to ``storage.MongoCollection``.
        conf: accepted for interface compatibility; not used here.
    """
    super().__init__()
    name = self.__class__.__name__.lower()
    self.source_collection = storage.MongoCollection(db, 'raw_' + name)
    self.target_collection = storage.MongoCollection(db, 'parsed_' + name)
    self.unique_ids = ['id']
def entry_generator(self):
    """Yield every 'parsed_poslanec' entry without its membership field.

    ``const.POSLANEC_CLENSTVO`` is removed in place.  A missing key raises
    ``KeyError``.
    """
    poslanci = storage.MongoCollection(self.db, 'parsed_poslanec')
    for poslanec in poslanci.iterate_all():
        poslanec.pop(const.POSLANEC_CLENSTVO)
        yield poslanec