def load_debatten(engine, indexer, sitzung):
    WebTV_Speech = sl.get_table(engine, 'webtv_speech')
    zitate = list(sl.find(engine, WebTV_Speech,
                          wp=str(sitzung.wahlperiode),
                          session=str(sitzung.nummer)))
    debatten = dict([(z['item_id'], z) for z in zitate])
    speeches = list(sl.find(engine, sl.get_table(engine, 'speech'),
                            wahlperiode=int(sitzung.wahlperiode),
                            sitzung=int(sitzung.nummer)))
    for i, data in debatten.items():
        log.info("Loading -> Debatte: %s..." % data.get('item_label'))
        debatte = Debatte.query.filter_by(
            sitzung=sitzung,
            nummer=data.get('item_id')).first()
        if debatte is None:
            debatte = Debatte()
            debatte.sitzung = sitzung
            debatte.nummer = data.get('item_id')
        debatte.tops = data.get('item_key')
        debatte.titel = data.get('item_label')
        debatte.text = data.get('item_description')
        db.session.add(debatte)
        db.session.flush()
        indexer.add(debatte)
        dzitate = filter(lambda z: z['item_id'] == data['item_id'], zitate)
        reden = load_reden(engine, indexer, debatte, dzitate)
        load_zitate(engine, indexer, debatte, dzitate, speeches, reden)
        db.session.commit()
        indexer.add_many(reden.values())

def load_debatten(engine, sitzung):
    WebTV_Speech = sl.get_table(engine, 'webtv_speech')
    zitate = list(sl.find(engine, WebTV_Speech,
                          wp=str(sitzung.wahlperiode),
                          session=str(sitzung.nummer)))
    debatten = dict([(z['item_id'], z) for z in zitate])
    speeches = list(sl.find(engine, sl.get_table(engine, 'speech'),
                            wahlperiode=int(sitzung.wahlperiode),
                            sitzung=int(sitzung.nummer)))
    for i, data in debatten.items():
        log.info("Loading -> Debatte: %s..." % data.get('item_label'))
        debatte = Debatte.query.filter_by(
            sitzung=sitzung,
            nummer=data.get('item_id')).first()
        if debatte is None:
            debatte = Debatte()
            debatte.sitzung = sitzung
            debatte.nummer = data.get('item_id')
        debatte.tops = data.get('item_key')
        debatte.titel = data.get('item_label')
        debatte.text = data.get('item_description')
        db.session.add(debatte)
        db.session.flush()
        dzitate = filter(lambda z: z['item_id'] == data['item_id'], zitate)
        load_zitate(engine, debatte, dzitate, speeches)
        db.session.commit()

def load_ablauf(engine, indexer, data):
    ablauf = Ablauf.query.filter_by(source_url=data.get('source_url')).first()
    if ablauf is None:
        ablauf = Ablauf()
    ablauf.key = data.get('key')
    ablauf.source_url = data.get('source_url')
    ablauf.wahlperiode = data.get('wahlperiode')
    ablauf.typ = data.get('typ')
    ablauf.klasse = data.get('class')
    ablauf.titel = data.get('titel')
    if not len(ablauf.titel):
        log.error("No titel!")
        return
    ablauf.initiative = data.get('initiative')
    ablauf.stand = data.get('stand')
    ablauf.signatur = data.get('signatur')
    ablauf.gesta_id = data.get('gesta_id')
    ablauf.eu_dok_nr = data.get('eu_dok_nr')
    ablauf.eur_lex_url = data.get('eur_lex_url')
    ablauf.eur_lex_pdf = data.get('eur_lex_pdf')
    ablauf.consilium_url = data.get('consilium_url')
    ablauf.abstrakt = data.get('abstrakt')
    ablauf.zustimmungsbeduerftig = data.get('zustimmungsbeduerftig')
    ablauf.sachgebiet = data.get('sachgebiet')
    ablauf.abgeschlossen = str(data.get('abgeschlossen')) == 'True'
    db.session.add(ablauf)
    db.session.flush()

    worte = []
    _Schlagwort = sl.get_table(engine, 'schlagwort')
    for sw in sl.find(engine, _Schlagwort, source_url=ablauf.source_url):
        wort = Schlagwort()
        wort.name = sw['wort']
        db.session.add(wort)
        worte.append(wort)
    ablauf.schlagworte = worte

    _Referenz = sl.get_table(engine, 'referenz')
    for ddata in sl.find(engine, _Referenz, source_url=ablauf.source_url):
        dokument = load_dokument(engine, indexer, ddata)
        referenz = Referenz.query.filter_by(
            dokument=dokument,
            seiten=ddata.get('seiten')
        ).filter(Referenz.ablaeufe.any(id=ablauf.id)).first()
        if referenz is None:
            referenz = Referenz()
            referenz.ablaeufe.append(ablauf)
            referenz.dokument = dokument
        referenz.seiten = ddata.get('seiten')
        referenz.text = ddata.get('text')

    _Position = sl.get_table(engine, 'position')
    for position in sl.find(engine, _Position, source_url=ablauf.source_url):
        load_position(engine, indexer, ablauf, position)

    db.session.commit()
    indexer.add(ablauf)

def speechmatcher(wp, session):
    engine = etl_engine()
    speech_table = sl.get_table(engine, 'speech')
    speeches = sl.find(engine, speech_table, order_by='sequence',
                       wahlperiode=wp, sitzung=session, matched=True)
    webtv_table = sl.get_table(engine, 'webtv')
    agenda = sl.find(engine, webtv_table, wp=wp, session=session)
    agenda = list(agenda)
    return render_template('backend/speechmatcher.html',
                           speeches=speeches, agenda=agenda,
                           wp=wp, session=session)

def load_abstimmung(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    stimmen = list(sl.find(engine, table, source_url=source_url,
                           matched=True))
    if not len(stimmen):
        log.error("No reconciled votes, signals deeper trouble?")
        return
    thema = stimmen[0].get('subject')
    abst = Abstimmung.query.filter_by(thema=thema).first()
    if abst is None:
        abst = Abstimmung()
        abst.thema = thema
        abst.datum = to_date(stimmen[0].get('date'))
    db.session.add(abst)
    db.session.flush()
    for stimme_ in stimmen:
        person = Person.query.filter_by(
            fingerprint=stimme_.get('fingerprint')).first()
        if person is None:
            continue
        stimme = Stimme.query.filter_by(abstimmung=abst) \
            .filter_by(person=person).first()
        if stimme is not None:
            continue
        stimme = Stimme()
        stimme.entscheidung = stimme_['vote']
        stimme.person = person
        stimme.abstimmung = abst
        db.session.add(stimme)
    db.session.commit()

def extend_positions(engine):
    log.info("Amending positions ...")
    Position = sl.get_table(engine, 'position')
    for i, data in enumerate(sl.find(engine, Position)):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        dt, rest = data['fundstelle'].split("-", 1)
        data['date'] = datetime.strptime(dt.strip(), "%d.%m.%Y").isoformat()
        if ',' in data['urheber']:
            typ, quelle = data['urheber'].split(',', 1)
            data['quelle'] = re.sub("^.*Urheber.*:", "", quelle).strip()
            data['typ'] = typ.strip()
        else:
            data['typ'] = data['urheber']
        br = 'Bundesregierung, '
        if data['urheber'].startswith(br):
            data['urheber'] = data['urheber'][len(br):]
        data['fundstelle_doc'] = None
        if data['fundstelle_url'] and 'btp' in data['fundstelle_url']:
            data['fundstelle_doc'] = data['fundstelle_url'].rsplit('#', 1)[0]
        hash = sha1(data['fundstelle'].encode('utf-8')
                    + data['urheber'].encode('utf-8')
                    + data['ablauf_id'].encode('utf-8')).hexdigest()
        data['hash'] = hash[:10]
        sl.upsert(engine, Position, data, unique=UNIQUE)

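# A minimal sketch (illustrative only, not part of the pipeline) of the
# read-transform-upsert idiom that extend_positions() and most transform
# steps in this file rely on: sl.find() yields rows as dicts, derived
# columns are added in place, and sl.upsert() writes the row back keyed on
# the `unique` columns. The 'example' table and its columns are hypothetical.
def example_extend(engine):
    table = sl.get_table(engine, 'example')
    for data in sl.find(engine, table):
        # derive a new column from an existing one
        data['name_clean'] = data['name'].strip().lower()
        # write the row back, matching on the unique key column(s)
        sl.upsert(engine, table, data, unique=['id'])
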
def combine(force=False, filter=None):
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        combine_resource(engine, source_table, row, force, stats)
    log.info('Combine summary: \n%s' % stats.report())

def generate_person_long_names(engine):
    log.info("Generating person fingerprints and slugs...")
    from offenesparlament.transform.namematch import match_speaker
    nkp = nk_persons()
    Person = sl.get_table(engine, 'person')
    for person in sl.find(engine, Person):
        long_name = make_long_name(person)
        try:
            long_name = match_speaker(long_name)
        except NKNoMatch:
            pass
        log.info(" -> %s" % long_name.strip())
        slug = url_slug(long_name)
        sl.upsert(engine, Person, {
            'fingerprint': long_name,
            'slug': slug,
            'id': person['id']
        }, unique=['id'])
        tries = 0
        while True:
            try:
                nkp.ensure_value(long_name, data=person)
            except ValueError, E:
                log.warn('Exception: %s' % str(E))
                tries = tries + 1
                if tries > 5:
                    raise
            else:
                break

def load_rollen(engine, person, data):
    _RolleSource = sl.get_table(engine, 'rolle')
    for rdata in sl.find(engine, _RolleSource,
                         fingerprint=data['fingerprint']):
        rolle = Rolle.query.filter_by(
            person=person,
            funktion=rdata.get('funktion'),
            ressort=rdata.get('ressort'),
            fraktion=rdata.get('fraktion'),
            land=rdata.get('land')).first()
        if rolle is None:
            rolle = Rolle()
            rolle.person = person
        rolle.mdb_id = rdata.get('mdb_id')
        rolle.status = rdata.get('status')
        rolle.funktion = rdata.get('funktion')
        rolle.fraktion = rdata.get('fraktion')
        rolle.gewaehlt = rdata.get('gewaehlt')
        rolle.ressort = rdata.get('ressort')
        rolle.land = rdata.get('land')
        rolle.austritt = to_date(rdata.get('austritt'))
        if rdata.get('mdb_id'):
            rolle.wahlkreis = load_wahlkreis(engine, rolle, data)
        db.session.add(rolle)

def get_agenda(engine, wp, session):
    return list(sl.find(engine, sl.get_table(engine, 'webtv'),
                        wp=wp, session=session, order_by='speech_id'))

def merge_speeches(engine):
    # desired result: (position_id, debatte_id)
    referenzen = referenzen_index(engine)
    items = item_index(engine)
    log.info("Finding best matches.... ")
    matches = {}
    for (ablauf_id, rwp, rsession), rdrs in referenzen.items():
        for (iwp, isession, item_id), idrs in items.items():
            if iwp != rwp or rsession != isession:
                continue
            ints = len(idrs.intersection(rdrs))
            if ints == 0:
                continue
            k = (ablauf_id, rwp, rsession)
            if k in matches and matches[k][1] > ints:
                continue
            matches[k] = (item_id, ints)
    log.info("Saving position associations....")
    pos_tbl = sl.get_table(engine, 'position')
    for (ablauf_id, wp, session), (item_id, n) in matches.items():
        for pos in sl.find(engine, pos_tbl,
                           ablauf_id="%s/%s" % (wp, ablauf_id)):
            if not pos['fundstelle_url']:
                continue
            if 'btp/%s/%s%03d.pdf' % (wp, wp, int(session)) \
                    in pos['fundstelle_url']:
                d = {'ablauf_id': pos['ablauf_id'],
                     'hash': pos['hash'],
                     'debatte_wp': wp,
                     'debatte_session': session,
                     'debatte_item_id': item_id}
                sl.upsert(engine, pos_tbl, d, unique=['ablauf_id', 'hash'])

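# Shape note (inferred from the intersection logic in merge_speeches() above,
# not from the helpers' own definitions): both index helpers are expected to
# map composite keys to sets of Drucksache references, roughly:
#   referenzen_index(engine) -> {(ablauf_id, wp, session): set of DRS ids}
#   item_index(engine)       -> {(wp, session, item_id):   set of DRS ids}
# so the best match for an Ablauf is the agenda item sharing the most documents.
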
def load_abstimmungen(engine):
    _Abstimmung = sl.get_table(engine, 'abstimmung')
    i = 0
    for row in sl.distinct(engine, _Abstimmung, 'subject', 'date'):
        thema = row.get('subject')
        abst = Abstimmung.query.filter_by(thema=thema).first()
        if abst is None:
            abst = Abstimmung()
            abst.thema = thema
            abst.datum = date(row.get('date'))
            db.session.add(abst)
        for stimme_ in sl.find(engine, _Abstimmung, subject=thema,
                               matched=True):
            if i % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            i += 1
            person = Person.query.filter_by(
                fingerprint=stimme_.get('fingerprint')).first()
            if person is None:
                continue
            stimme = Stimme.query.filter_by(abstimmung=abst) \
                .filter_by(person=person).first()
            if stimme is not None:
                continue
            stimme = Stimme()
            stimme.entscheidung = stimme_['vote']
            stimme.person = person
            stimme.abstimmung = abst
            db.session.add(stimme)
    db.session.commit()

def extend_speeches(engine, wahlperiode=17):
    log.info("Amending speeches with DRS ...")
    drs_match = re.compile(DRS_MATCH % (wahlperiode, wahlperiode))
    Speech = sl.get_table(engine, 'speech')
    SpeechDocument = sl.get_table(engine, 'speech_document')
    for i, data in enumerate(sl.find(engine, Speech)):
        if data.get('type') != 'chair':
            continue
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        m = drs_match.search(data.get('text'))
        if m is None:
            continue
        # NB: use a separate index for the match groups; the original
        # re-used `i` here, clobbering the progress counter above.
        for j, grp in enumerate(m.groups()):
            if grp and '/' in grp:
                wp, nummer = grp.split('/', 1)
                sl.upsert(engine, SpeechDocument, {
                    'group': j,
                    'sequence': data['sequence'],
                    'sitzung': data['sitzung'],
                    'wahlperiode': wahlperiode,
                    'dok_nummer': nummer
                }, unique=['sequence', 'sitzung', 'wahlperiode', 'group'])

def load_rollen(engine, person, data):
    _RolleSource = sl.get_table(engine, 'rolle')
    mdb_rolle = None
    for rdata in sl.find(engine, _RolleSource,
                         fingerprint=data['fingerprint']):
        rolle = Rolle.query.filter_by(
            person=person,
            funktion=rdata.get('funktion'),
            ressort=rdata.get('ressort'),
            fraktion=rdata.get('fraktion'),
            land=rdata.get('land')).first()
        if rolle is None:
            rolle = Rolle()
            rolle.person = person
        rolle.mdb_id = rdata.get('mdb_id')
        rolle.status = rdata.get('status')
        rolle.funktion = rdata.get('funktion')
        rolle.fraktion = rdata.get('fraktion')
        rolle.gewaehlt = rdata.get('gewaehlt')
        rolle.ressort = rdata.get('ressort')
        rolle.land = rdata.get('land')
        rolle.austritt = date(rdata.get('austritt'))
        if rdata.get('mdb_id'):
            rolle.wahlkreis = load_wahlkreis(engine, rolle, data)
            mdb_rolle = rolle
        db.session.add(rolle)
    return mdb_rolle

def validate_sheet(engine, row, sheet_id, data_row_filter, stats_spending):
    spending_table = sl.get_table(engine, 'spending')
    data = list(sl.find(engine, spending_table,
                        resource_id=row['resource_id'],
                        sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    issue_noted_for_this_resource = False  # record first failure only
    error_message = None
    try:
        records = 0
        for row_ in data:
            if data_row_filter and data_row_filter != row_['row_id']:
                continue
            result = {'id': row_['id'], 'valid': True}
            result['signature'] = generate_signature(row_)
            if row_['DateFormatted'] is None:
                stats_spending['date'].add_spending('Date invalid', row_)
                result['valid'] = False
                if not issue_noted_for_this_resource:
                    issue(engine, row['resource_id'], row['retrieve_hash'],
                          STAGE,
                          'Date invalid (blank, inconsistent or unrecognised format)',
                          {'row_id': row_.get('row_id'),
                           'row_number': row_.get('row_number'),
                           'Date': row_.get('Date')})
                    error_message = 'Date invalid'
                    issue_noted_for_this_resource = True
            else:
                stats_spending['date'].add_spending('Date ok', row_)
            if row_['AmountFormatted'] is None:
                stats_spending['amount'].add_spending('Amount invalid', row_)
                result['valid'] = False
                if not issue_noted_for_this_resource:
                    issue(engine, row['resource_id'], row['retrieve_hash'],
                          STAGE, 'Amount invalid',
                          {'row_id': row_.get('row_id'),
                           'row_number': row_.get('row_number'),
                           'Amount': row_.get('Amount')})
                    error_message = 'Amount invalid'
                    issue_noted_for_this_resource = True
            else:
                stats_spending['amount'].add_spending('Amount ok', row_)
            if result['valid']:
                records += 1
            sl.update(connection, spending_table, {'id': result['id']}, result)
        trans.commit()
        return records > 0, error_message
    finally:
        connection.close()

def cleanup_sheet(engine, row, sheet_id):
    spending_table = sl.get_table(engine, 'spending')
    data = list(sl.find(engine, spending_table,
                        resource_id=row['resource_id'],
                        sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        if None in date_formats.values():
            log.warn("Couldn't detect date formats: %r", date_formats)
            issue(engine, row['resource_id'], row['retrieve_hash'],
                  "Couldn't detect date formats", repr(date_formats))
            return False
        sl.delete(connection, spending_table,
                  resource_id=row['resource_id'], sheet_id=sheet_id)
        # renamed from `row` to avoid shadowing the resource row parameter
        for row_ in data:
            row_ = cleanup_dates.apply(row_, date_formats)
            row_ = cleanup_numbers.apply(row_)
            row_ = cleanup_gov.apply(row_)
            #row_ = cleanup_supplier.apply(row_, engine)
            del row_['id']
            sl.add_row(connection, spending_table, row_)
        trans.commit()
        return True
    finally:
        connection.close()

def load(engine, grano):
    for rep in sl.find(engine, sl.get_table(engine, 'representative')):
        del rep['id']
        rep_ent = canonical_actor(grano, engine, rep['originalName'])
        if 'id' in rep_ent:
            rep_ent = grano.getEntity(rep_ent['id'], deep=True)
        #if not SETTINGS.FULL and rep_ent['etlId'] == rep['etlId']:
        #    continue
        rep_ent.update(rep)
        rep_ent['actsAsRepresentative'] = True
        rep_ent['staffMembers'] = int(float(rep['members']))
        rep_ent['incoming'] = rep_ent.get('incoming', [])
        rep_ent['outgoing'] = rep_ent.get('outgoing', [])
        rep_ent['contactCountry'] = rep_ent['contactCountryNorm']
        rep_ent = load_clients(grano, engine, rep_ent)
        rep_ent = load_organisations(grano, engine, rep_ent)
        rep_ent = load_networking(grano, engine, rep_ent)
        rep_ent = load_persons(grano, engine, rep_ent)
        rep_ent = load_interests(grano, engine, rep_ent)
        rep_ent = load_action_fields(grano, engine, rep_ent)
        rep_ent = get_financial_data(engine, rep_ent)
        # TODO: other financial sources
        #from pprint import pprint
        #pprint(rep_ent)
        grano.updateEntity(rep_ent)

def extract_some(force=False, filter=None):
    # filter kwargs: resource_id=x, package_name=y, publisher_title=z
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        extract_resource(engine, source_table, row, force, stats)
    log.info('Extract summary: \n%s' % stats.report())

def retrieve_some(force=False, filter=None):
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        retrieve(row, engine, source_table, force, stats)
    print 'Retrieve summary:'
    print stats.report()

def load_ablaeufe(engine):
    _Ablauf = sl.get_table(engine, 'ablauf')
    for i, data in enumerate(sl.find(engine, _Ablauf, wahlperiode=str(17))):
        log.info("Loading Ablauf: %s..." % data['titel'])
        load_ablauf(engine, data)
        if i % 500 == 0:
            db.session.commit()
    db.session.commit()

def cleanup(force=False, resource_filter=None, data_row_filter=None):
    stats = OpenSpendingStats()
    stats_spending = defaultdict(OpenSpendingStats)
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    # NB: the original had `**(filter or {})`, which passed the `filter`
    # builtin; the parameter here is named resource_filter.
    for row in sl.find(engine, source_table, **(resource_filter or {})):
        cleanup_resource(engine, source_table, row, force, data_row_filter,
                         stats, stats_spending)
    log.info('Cleanup summary: \n%s' % stats.report())
    for key in stats_spending:
        log.info('Cleanup %s: \n%s' % (key, stats_spending[key].report()))

def load_ap(ap, engine):
    orgs = list(sl.find(engine, sl.get_table(engine, 'representative'),
                        identificationCode=ap['orgIdentificationCode']))
    if len(orgs):
        org = max(orgs, key=lambda o: o['lastUpdateDate'])
        childBase = {'representativeEtlId': org['etlId'],
                     'representativeUpdateDate': org['lastUpdateDate']}
        load_person(ap, 'accredited', childBase, engine)
    else:
        print ap

def get_transcript(engine, wp, session):
    speeches = []
    for speech in sl.find(engine, sl.get_table(engine, 'speech'),
                          order_by='sequence', wahlperiode=wp,
                          sitzung=session, matched=True):
        if speech['type'] == 'poi':
            continue
        seg = (speech['sequence'], speech['fingerprint'])
        speeches.append(seg)
    return speeches

def retrieve_some(force=False, **filters):
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    result_counts = defaultdict(int)
    for row in sl.find(engine, source_table, **filters):
        result = retrieve(row, engine, source_table, force)
        result_counts['total'] += 1
        result_counts[result] += 1
    # default of 0 avoids a KeyError when no rows matched the filters
    log.info('Total %i URLs', result_counts.pop('total', 0))
    for result, count in result_counts.items():
        log.info(' %i %s', count, result)

def build_index(publisher_name=None):
    '''Searches CKAN for spending resources and writes their metadata to
    the database.'''
    engine, table = connect()
    client = ckan_client()
    log.info('CKAN: %s', client.base_location)
    tags = ['+tags:"%s"' % t for t in TAGS]
    q = " OR ".join(tags)
    publisher_dict_filter = {}
    if publisher_name:
        publisher_solr_filter = 'publisher:"%s"' % publisher_name
        q = '(%s) AND (%s)' % (q, publisher_solr_filter)
        publisher_dict_filter = {'publisher_name': publisher_name}
    log.info('SOLR Search q: %r', q)
    existing_packages = set(
        [res['package_name'] for res in
         sl.distinct(engine, table, 'package_name', **publisher_dict_filter)])
    log.info('Existing datasets: %i', len(existing_packages))
    processed_packages = set()
    log.info('Doing package search for: "%s"', q)
    res = client.package_search(q, search_options={'limit': 2000})
    log.info('Search returned %i dataset results', res['count'])
    stats = OpenSpendingStats()
    stats_resources = OpenSpendingStats()
    for package_name in res['results']:
        processed_packages.add(package_name)
        num_resources = fetch_package(client, package_name, engine, table,
                                      stats_resources)
        if num_resources == 0:
            stats.add('Dataset has no resources', package_name)
        else:
            stats.add('Dataset has resources', package_name)
    # Remove rows about deleted packages
    obsolete_packages = existing_packages - processed_packages
    log.info('Obsolete datasets: %s from %s', len(obsolete_packages),
             len(existing_packages))
    for package_name in obsolete_packages:
        sl.delete(engine, table, package_name=package_name)
        sl.delete(engine, 'issue', package_name=package_name)
        stats.add('Removed obsolete dataset', package_name)
    # Remove stray rows without a package_name
    stray_rows = list(sl.find(engine, table, package_name=None))
    if stray_rows:
        log.info('Stray rows without package_name: %i', len(stray_rows))
        sl.delete(engine, table, package_name=None)
        sl.delete(engine, 'issue', package_name=None)
        for row in stray_rows:
            stats.add('Stray row removed', row['resource_id'])
    print 'Datasets build_index summary:'
    print stats.report()
    print 'Resources build_index summary:'
    print stats_resources.report()

def validate(force=False, filter=None, data_row_filter=None):
    stats = OpenSpendingStats()
    stats_spending = {'date': OpenSpendingStats(),
                      'amount': OpenSpendingStats()}
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        validate_resource(engine, source_table, row, force, data_row_filter,
                          stats, stats_spending)
    log.info('Validate summary: \n%s' % stats.report())
    for stat_type in stats_spending:
        log.info('Validate %s: \n%s' % (stat_type,
                                        stats_spending[stat_type].report()))

def resolve_stimmen(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    for data in sl.find(engine, table, source_url=source_url):
        try:
            fp = resolve_person(data['person'])
        except BadReference:
            fp = None
            log.info("No match for: %s", data['person'])
        sl.upsert(engine, table, {
            'person': data.get('person'),
            'matched': fp is not None,
            'fingerprint': fp
        }, unique=['person'])

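# Usage sketch (illustrative only): resolve_stimmen() is keyed on a single
# poll's source_url, so a driver can fan out over the distinct URLs in the
# table. sl.distinct() is used the same way in load_abstimmungen() above.
def resolve_all_stimmen(engine):
    table = sl.get_table(engine, 'abstimmung')
    for row in sl.distinct(engine, table, 'source_url'):
        resolve_stimmen(engine, row['source_url'])
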
def articles(engine):
    a_table = sl.get_table(engine, 'article')
    for data in sl.find(engine, a_table):
        up = {'number': data['number']}
        slug_parts = data['canonical_url'].split('/')[3:]
        if len(slug_parts) > 3:
            print slug_parts
        if len(slug_parts) == 3:
            up['ressort'], up['subressort'], _ = slug_parts
        elif len(slug_parts) == 2:
            up['ressort'], _ = slug_parts
        up['date'] = parse_date(data['date_text'])
        sl.upsert(engine, a_table, up, ['number'])

def merge_speech(engine, wp, session):
    log.info("Merging media + transcript: %s/%s" % (wp, session))
    WebTV = sl.get_table(engine, 'webtv')
    WebTV_Speeches = sl.get_table(engine, 'webtv_speech')
    changes, recordings = [], []
    for recd in sl.find(engine, WebTV, wp=wp, session=session,
                        order_by='speech_id'):
        recordings.append(recd)
        # compare fingerprints; the original compared the dict itself to
        # the fingerprint string, which is always unequal
        if not len(changes) or changes[-1]['fingerprint'] != recd['fingerprint']:
            changes.append(recd)
    #speakers = []
    changes_index = 0

    def emit(speech):
        data = changes[changes_index].copy()
        del data['id']
        data['sequence'] = speech['sequence']
        sl.upsert(engine, WebTV_Speeches, data,
                  unique=['wp', 'session', 'sequence'])

    Speech = sl.get_table(engine, 'speech')
    for speech in sl.find(engine, Speech, order_by='sequence',
                          wahlperiode=wp, sitzung=session, matched=True):
        if speech['type'] == 'poi':
            emit(speech)
            continue
        if speech['type'] == 'chair':
            match_chair(speech, changes[changes_index])
        transition = changes[changes_index]
        if len(changes) > changes_index + 1:
            transition = changes[changes_index + 1]
        if speech['fingerprint'] == transition['fingerprint']:
            changes_index += 1
        recd = changes[changes_index]
        #print [speech['fingerprint'], recd['fingerprint'], recd['item_label']]
        emit(speech)

def load_networking(grano, engine, rep):
    for org in sl.find(engine, sl.get_table(engine, 'network_entity'),
                       representativeEtlId=rep['etlId']):
        ent = canonical_actor(grano, engine, org['etlFingerPrint'])
        ent = ensure_actor(grano, ent)
        rel = find_relation(rep['outgoing'], 'target', ent,
                            {'type': ASSOCIATED['name']})
        rel['type'] = ASSOCIATED['name']
        rel['source'] = rep.get('id')
        rel['target'] = ent
        rep['outgoing'] = replace_relation(rep['outgoing'], 'target', rel)
    return rep

def get_financial_data(engine, rep):
    fds = list(sl.find(engine, sl.get_table(engine, 'financialData'),
                       representativeEtlId=rep['etlId']))
    if not fds:  # guard: max() would raise on an empty sequence
        return rep
    fd = max(fds, key=lambda f: f.get('endDate'))
    for key, value in fd.items():
        if key in [u'totalBudget', u'turnoverMin', u'costAbsolute',
                   u'publicFinancingNational', u'otherSourcesDonation',
                   u'eurSourcesProcurement', u'costMax', u'eurSourcesGrants',
                   u'otherSourcesContributions', u'publicFinancingTotal',
                   u'turnoverAbsolute', u'turnoverMax', u'costMin',
                   u'directRepCostsMin', u'directRepCostsMax',
                   u'publicFinancingInfranational', u'otherSourcesTotal']:
            if value is not None:
                value = int(float(value))
        key = 'fd' + key[0].upper() + key[1:]
        rep[key] = value
    return rep

def load_organisations(grano, engine, rep):
    for org in sl.find(engine, sl.get_table(engine, 'organisation'),
                       representativeEtlId=rep['etlId']):
        ent = canonical_actor(grano, engine, org['name'])
        ent['orgMembers'] = int(float(org['numberOfMembers'] or 0))
        ent['actsAsOrganisation'] = True
        ent = ensure_actor(grano, ent)
        rel = find_relation(rep['outgoing'], 'target', ent,
                            {'type': MEMBERSHIP['name']})
        rel['type'] = MEMBERSHIP['name']
        rel['source'] = rep.get('id')
        rel['target'] = ent
        rep['outgoing'] = replace_relation(rep['outgoing'], 'target', rel)
    return rep

def extend_beschluesse(engine):
    log.info("Re-connecting beschluesse ...")
    abstimmungen = cache_abstimmungen(engine)
    #pprint(abstimmungen)
    Beschluss = sl.get_table(engine, 'beschluss')
    for data in sl.find(engine, Beschluss):
        date = data['fundstelle'].split(' ')[0]
        data['date'] = datetime.strptime(date, '%d.%m.%Y').isoformat()
        if not data['dokument_text']:
            continue
        if data['date'] in abstimmungen:
            abst = abstimmungen[data['date']]
            doks = set(data['dokument_text'].split(', '))
            for subject, adoks in abst.items():
                if len(doks & adoks):
                    print "MATCH", data['date'], doks, adoks

def extend_beschluesse(engine, master):
    log.info("Re-connecting beschluesse ...")
    abstimmungen = cache_abstimmungen(engine)
    #pprint(abstimmungen)
    Beschluss = sl.get_table(engine, 'beschluss')
    for data in sl.find(engine, Beschluss):
        date = data['fundstelle'].split(' ')[0]
        data['date'] = datetime.strptime(date, '%d.%m.%Y').isoformat()
        if not data['dokument_text']:
            continue
        if data['date'] in abstimmungen:
            abst = abstimmungen[data['date']]
            doks = set(data['dokument_text'].split(', '))
            for subject, adoks in abst.items():
                if len(doks & adoks):
                    print "MATCH", data['date'], doks, adoks

def validate_sheet(engine, row, sheet_id):
    spending_table = sl.get_table(engine, 'spending')
    data = list(sl.find(engine, spending_table,
                        resource_id=row['resource_id'],
                        sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    issue_noted_for_this_resource = False  # record first failure only
    try:
        records = 0
        for row_ in data:
            result = {'id': row_['id'], 'valid': True}
            result['signature'] = generate_signature(row_)
            if row_['DateFormatted'] is None:
                result['valid'] = False
                if not issue_noted_for_this_resource:
                    issue(engine, row['resource_id'], row['retrieve_hash'],
                          'Date invalid (or possibly the date format is inconsistent)',
                          {'row_id': row_.get('row_id'),
                           'Date': row_.get('Date')})
                    issue_noted_for_this_resource = True
            if row_['AmountFormatted'] is None:
                result['valid'] = False
                if not issue_noted_for_this_resource:
                    issue(engine, row['resource_id'], row['retrieve_hash'],
                          'Amount invalid',
                          {'row_id': row_.get('row_id'),
                           'Amount': row_.get('Amount')})
                    issue_noted_for_this_resource = True
            if result['valid']:
                records += 1
            sl.update(connection, spending_table, {'id': result['id']}, result)
        trans.commit()
        return records > 0
    finally:
        connection.close()

def generate_all():
    engine = db_connect()
    spending = sl.get_table(engine, 'spending')
    sources = sources_metadata(engine)
    signatures = set()
    for row in sl.find(engine, spending, valid=True):
        if row['signature'] in signatures:
            continue
        signatures.add(row['signature'])
        if row['resource_id'] not in sources:
            continue
        row.update(sources[row['resource_id']])
        row.pop('valid', True)
        row.pop('row_id', True)
        row.pop('resource_id', True)
        row.pop('resource_hash', True)
        row['RecordETLID'] = row.pop('id', None)
        row['RecordSignature'] = row.pop('signature', None)
        row['SourceSheetID'] = row.pop('sheet_id', None)
        yield row

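# Usage sketch (illustrative only): generate_all() yields de-duplicated,
# source-annotated spending records; a consumer might stream them to CSV.
# The output path is hypothetical, and this assumes every record shares the
# first record's keys (DictWriter raises on unexpected fields); non-ASCII
# values would also need explicit encoding under Python 2's csv module.
def dump_spending(path='spending.csv'):
    import csv
    with open(path, 'wb') as fh:
        writer = None
        for record in generate_all():
            if writer is None:
                writer = csv.DictWriter(fh, fieldnames=sorted(record.keys()))
                writer.writeheader()
            writer.writerow(record)
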
def get_alignment(engine, wp, session):
    agenda_speeches = get_agenda(engine, wp, session)
    transcript_speeches = get_transcript(engine, wp, session)
    try:
        cuts = list(sl.find(engine, sl.get_table(engine, 'alignments'),
                            wp=str(wp), session=str(session),
                            order_by='sequence'))
    except KeyError:  # presumably: no alignments table yet
        cuts = []
    alignment = []
    tr_offset = 0
    ag_offset = 0
    for cut in cuts:
        tr_speeches = transcript_seek(transcript_speeches, cut, tr_offset)
        tr_current = len(tr_speeches) + 1
        tr_offset = tr_offset + tr_current
        ag_speeches = agenda_seek(agenda_speeches, cut, ag_offset)
        ag_offset = ag_offset + len(ag_speeches) - 1
        section = align_section(tr_speeches, ag_speeches)
        alignment.extend(section)
        data = {'item_id': cut.get('item_id'),
                'speech_id': cut.get('speech_id'),
                'sequence': cut.get('sequence'),
                'agenda_fp': ag_speeches[-1].get('fingerprint'),
                'transcript_fp': transcript_speeches[tr_current][1]}
        alignment.append(data)
    section = align_section(transcript_speeches[tr_offset:],
                            agenda_speeches[ag_offset:])
    alignment.extend(section)
    return score_alignment(alignment), alignment

def load_gremium_mitglieder(engine, person):
    _GremiumMitglieder = sl.get_table(engine, 'gremium_mitglieder')
    for gmdata in sl.find(engine, _GremiumMitglieder,
                          person_source_url=person.source_url):
        gremium = Gremium.query.filter_by(key=gmdata['gremium_key']).first()
        if gremium is None:
            gremium = lazyload_gremium(engine, gmdata['gremium_key'])
        if gremium is None:
            log.error("Gremium not found: %s" % gmdata['gremium_key'])
            continue  # gremium is still None; skip this membership
        role = gmdata['role']
        if role == 'obleute':
            gremium.obleute.append(person)
        elif role == 'vorsitz':
            gremium.vorsitz = person
        elif role == 'stellv_vorsitz':
            gremium.stellv_vorsitz = person
        elif role == 'mitglied':
            gremium.mitglieder.append(person)
        elif role == 'stellv_mitglied':
            gremium.stellvertreter.append(person)

def merge_speech(engine, wp, session):
    log.info("Merging media + transcript: %s/%s" % (wp, session))
    score, alignment = get_alignment(engine, wp, session)
    log.info("Matching score: %s", score)
    agenda = get_agenda(engine, wp, session)
    agenda = dict([(a['item_id'], a) for a in agenda])
    alignment = dict([(a['sequence'], a) for a in alignment])
    item = None
    table = sl.get_table(engine, 'webtv_speech')
    for speech in sl.find(engine, sl.get_table(engine, 'speech'),
                          order_by='sequence', wahlperiode=wp,
                          sitzung=session, matched=True):
        sequence = speech['sequence']
        item = alignment.get(sequence, item)
        data = agenda.get(item['item_id']).copy()
        del data['id']
        data['sequence'] = sequence
        sl.upsert(engine, table, data, unique=['wp', 'session', 'sequence'])

def cleanup_sheet(engine, row, sheet_id, data_row_filter, stats_spending):
    spending_table = sl.get_table(engine, 'spending')
    data = list(sl.find(engine, spending_table,
                        resource_id=row['resource_id'],
                        sheet_id=sheet_id))
    if not data:
        log.info('Sheet has no rows')
        return False, None
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        for date_format in date_formats.values():
            if isinstance(date_format, basestring):
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                      "Couldn't detect date formats because: %s" % date_format,
                      repr(date_formats))
                return True, date_format
        if not data_row_filter:
            sl.delete(connection, spending_table,
                      resource_id=row['resource_id'], sheet_id=sheet_id)
        # renamed from `row` to avoid shadowing the resource row parameter
        for row_ in data:
            if data_row_filter and data_row_filter != row_['row_id']:
                continue
            row_ = cleanup_dates.apply(row_, date_formats, stats_spending)
            row_ = cleanup_numbers.apply(row_, stats_spending)
            row_ = cleanup_gov.apply(row_, stats_spending)
            #row_ = cleanup_supplier.apply(row_, engine)
            del row_['id']
            sl.add_row(connection, spending_table, row_)
        trans.commit()
        return True, None
    finally:
        connection.close()

def combine(force=False, filter=None):
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        combine_resource(engine, source_table, row, force)

def combine_resource_id(resource_id, force=False):
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, resource_id=resource_id):
        combine_resource(engine, source_table, row, force)

def cleanup_all(force=False):
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table):
        cleanup_resource(engine, source_table, row, force)