def supplier_names_from_table(engine, resource_id, table_id):
    global tables_count, rows_count
    tables_count += 1
    print "# checking table %d" % tables_count
    table_suffix = "%s_table%s" % (resource_id, table_id)
    table = sl.get_table(engine, "spending_%s" % table_suffix)
    supplier_table = sl.get_table(engine, "suppliers")
    for row in sl.all(engine, table):
        rows_count += 1
        if "SupplierName" not in row:
            # One of the junk tables that contain no real data, usually
            # Excel noise.
            continue
        supplier = row["SupplierName"]
        if supplier is None or supplier == "":
            continue
        if supplier in suppliers_visited:
            continue
        suppliers_visited[supplier] = True
        if sl.find_one(engine, supplier_table, original=supplier) is not None:
            continue
        yield supplier

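# Usage sketch (assumption, not part of the original module): the generator
# above yields supplier names not yet present in the "suppliers" table, so a
# caller would typically drain it and insert each new name. The 'original'
# column mirrors the lookup key used inside the generator.
def collect_new_suppliers(engine, resource_id, table_id):
    supplier_table = sl.get_table(engine, "suppliers")
    for name in supplier_names_from_table(engine, resource_id, table_id):
        sl.upsert(engine, supplier_table, {'original': name},
                  unique=['original'])
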
def load_debatten(engine, indexer, sitzung):
    WebTV_Speech = sl.get_table(engine, 'webtv_speech')
    zitate = list(sl.find(engine, WebTV_Speech,
                          wp=str(sitzung.wahlperiode),
                          session=str(sitzung.nummer)))
    debatten = dict([(z['item_id'], z) for z in zitate])
    speeches = list(sl.find(engine, sl.get_table(engine, 'speech'),
                            wahlperiode=int(sitzung.wahlperiode),
                            sitzung=int(sitzung.nummer)))
    for i, data in debatten.items():
        log.info("Loading -> Debatte: %s..." % data.get('item_label'))
        debatte = Debatte.query.filter_by(
            sitzung=sitzung,
            nummer=data.get('item_id')).first()
        if debatte is None:
            debatte = Debatte()
            debatte.sitzung = sitzung
            debatte.nummer = data.get('item_id')
        debatte.tops = data.get('item_key')
        debatte.titel = data.get('item_label')
        debatte.text = data.get('item_description')
        db.session.add(debatte)
        db.session.flush()
        indexer.add(debatte)
        dzitate = filter(lambda z: z['item_id'] == data['item_id'], zitate)
        reden = load_reden(engine, indexer, debatte, dzitate)
        load_zitate(engine, indexer, debatte, dzitate, speeches, reden)
        db.session.commit()
        indexer.add_many(reden.values())

def scrape_activity(engine, url, elem):
    urheber = elem.findtext("URHEBER")
    fundstelle = elem.findtext("FUNDSTELLE")
    Position = sl.get_table(engine, 'position')
    p = {'source_url': url,
         'urheber': urheber,
         'fundstelle': fundstelle}
    pos_keys = p.copy()
    p['zuordnung'] = elem.findtext("ZUORDNUNG")
    p['abstrakt'] = elem.findtext("VP_ABSTRAKT")
    p['fundstelle_url'] = elem.findtext("FUNDSTELLE_LINK")

    Zuweisung = sl.get_table(engine, 'zuweisung')
    for zelem in elem.findall("ZUWEISUNG"):
        z = pos_keys.copy()
        z['text'] = zelem.findtext("AUSSCHUSS_KLARTEXT")
        z['federfuehrung'] = zelem.find("FEDERFUEHRUNG") is not None
        z['gremium_key'] = DIP_GREMIUM_TO_KEY.get(z['text'])
        sl.add_row(engine, Zuweisung, z)

    Beschluss = sl.get_table(engine, 'beschluss')
    for belem in elem.findall("BESCHLUSS"):
        b = pos_keys.copy()
        b['seite'] = belem.findtext("BESCHLUSSSEITE")
        b['dokument_text'] = belem.findtext("BEZUGSDOKUMENT")
        b['tenor'] = belem.findtext("BESCHLUSSTENOR")
        b['grundlage'] = belem.findtext("GRUNDLAGE")
        sl.add_row(engine, Beschluss, b)

    Referenz = sl.get_table(engine, 'referenz')
    try:
        dokument = dokument_by_url(p['fundstelle_url']) or \
            dokument_by_name(p['fundstelle'])
        dokument.update(pos_keys)
        sl.add_row(engine, Referenz, dokument)
    except Exception, e:
        log.exception(e)

def load_debatten(engine, sitzung):
    WebTV_Speech = sl.get_table(engine, 'webtv_speech')
    zitate = list(sl.find(engine, WebTV_Speech,
                          wp=str(sitzung.wahlperiode),
                          session=str(sitzung.nummer)))
    debatten = dict([(z['item_id'], z) for z in zitate])
    speeches = list(sl.find(engine, sl.get_table(engine, 'speech'),
                            wahlperiode=int(sitzung.wahlperiode),
                            sitzung=int(sitzung.nummer)))
    for i, data in debatten.items():
        log.info("Loading -> Debatte: %s..." % data.get('item_label'))
        debatte = Debatte.query.filter_by(
            sitzung=sitzung,
            nummer=data.get('item_id')).first()
        if debatte is None:
            debatte = Debatte()
            debatte.sitzung = sitzung
            debatte.nummer = data.get('item_id')
        debatte.tops = data.get('item_key')
        debatte.titel = data.get('item_label')
        debatte.text = data.get('item_description')
        db.session.add(debatte)
        db.session.flush()
        dzitate = filter(lambda z: z['item_id'] == data['item_id'], zitate)
        load_zitate(engine, debatte, dzitate, speeches)
        db.session.commit()

def load_ablauf(engine, indexer, data):
    ablauf = Ablauf.query.filter_by(
        source_url=data.get('source_url')).first()
    if ablauf is None:
        ablauf = Ablauf()
    ablauf.key = data.get('key')
    ablauf.source_url = data.get('source_url')
    ablauf.wahlperiode = data.get('wahlperiode')
    ablauf.typ = data.get('typ')
    ablauf.klasse = data.get('class')
    ablauf.titel = data.get('titel')
    if not len(ablauf.titel):
        log.error("No titel!")
        return
    ablauf.initiative = data.get('initiative')
    ablauf.stand = data.get('stand')
    ablauf.signatur = data.get('signatur')
    ablauf.gesta_id = data.get('gesta_id')
    ablauf.eu_dok_nr = data.get('eu_dok_nr')
    ablauf.eur_lex_url = data.get('eur_lex_url')
    ablauf.eur_lex_pdf = data.get('eur_lex_pdf')
    ablauf.consilium_url = data.get('consilium_url')
    ablauf.abstrakt = data.get('abstrakt')
    ablauf.zustimmungsbeduerftig = data.get('zustimmungsbeduerftig')
    ablauf.sachgebiet = data.get('sachgebiet')
    ablauf.abgeschlossen = str(data.get('abgeschlossen')) == 'True'
    db.session.add(ablauf)
    db.session.flush()

    worte = []
    _Schlagwort = sl.get_table(engine, 'schlagwort')
    for sw in sl.find(engine, _Schlagwort, source_url=ablauf.source_url):
        wort = Schlagwort()
        wort.name = sw['wort']
        db.session.add(wort)
        worte.append(wort)
    ablauf.schlagworte = worte

    _Referenz = sl.get_table(engine, 'referenz')
    for ddata in sl.find(engine, _Referenz, source_url=ablauf.source_url):
        dokument = load_dokument(engine, indexer, ddata)
        referenz = Referenz.query.filter_by(
            dokument=dokument,
            seiten=ddata.get('seiten'),
        ).filter(Referenz.ablaeufe.any(id=ablauf.id)).first()
        if referenz is None:
            referenz = Referenz()
            referenz.ablaeufe.append(ablauf)
            referenz.dokument = dokument
        referenz.seiten = ddata.get('seiten')
        referenz.text = ddata.get('text')

    _Position = sl.get_table(engine, 'position')
    for position in sl.find(engine, _Position, source_url=ablauf.source_url):
        load_position(engine, indexer, ablauf, position)

    db.session.commit()
    indexer.add(ablauf)

def load_finances(financialData, childBase, engine):
    etlId = '%s//%s' % (financialData['startDate'].isoformat(),
                        financialData['endDate'].isoformat())
    financial_sources = \
        [(s, 'other') for s in financialData.pop("otherCustomized")] + \
        [(s, 'public') for s in financialData.pop("publicCustomized")]
    for financial_source, type_ in financial_sources:
        financial_source['type'] = type_
        financial_source['financialDataEtlId'] = etlId
        financial_source.update(childBase)
        sl.upsert(engine, sl.get_table(engine, 'financialDataCustomSource'),
                  financial_source,
                  ['representativeEtlId', 'financialDataEtlId', 'type', 'name'])
    for turnover in financialData.pop("turnoverBreakdown"):
        turnover['financialDataEtlId'] = etlId
        turnover['etlFingerPrint'] = turnover['name'].strip()
        turnover.update(childBase)
        sl.upsert(engine, sl.get_table(engine, 'financialDataTurnover'),
                  turnover,
                  ['representativeEtlId', 'financialDataEtlId',
                   'etlFingerPrint'])
    financialData['etlId'] = etlId
    financialData.update(childBase)
    sl.upsert(engine, sl.get_table(engine, 'financialData'),
              financialData, ['representativeEtlId', 'etlId'])

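# Worked example (illustration only, not part of the original module): the
# composite etlId above is just the reporting period's two ISO dates joined
# by '//'. Assuming startDate/endDate are datetime.date values, as the
# isoformat() calls imply:
from datetime import date as _date

_example_etl_id = '%s//%s' % (_date(2011, 1, 1).isoformat(),
                              _date(2011, 12, 31).isoformat())
assert _example_etl_id == '2011-01-01//2011-12-31'
# This string keys every upserted child row back to its financialData period.
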
def update_network_entities(engine, file_name):
    log.info("Updating network entities reference sheet: %s", file_name)
    network_entities = set()
    table = sl.get_table(engine, 'network_entity')
    if os.path.exists(file_name):
        fh = open(file_name, 'rb')
        reader = csv.DictReader(fh)
        for d in reader:
            e = dict([(k, v.decode('utf-8')) for (k, v) in d.items()])
            network_entities.add((e['representativeEtlId'],
                                  e['etlFingerPrint']))
            sl.upsert(engine, table, e,
                      ['representativeEtlId', 'etlFingerPrint'])
        fh.close()
        reps = set([ne[0] for ne in network_entities])
        rep_table = sl.get_table(engine, 'representative')
        for rep in reps:
            sl.update(engine, rep_table, {'etlId': rep},
                      {'network_extracted': True})
    for row in sl.all(engine, table):
        network_entities.add((row['representativeEtlId'],
                              row['etlFingerPrint']))
    fh = open(file_name, 'wb')
    writer = None
    table = sl.get_table(engine, 'network_entity')
    for ic, fp in network_entities:
        row = {'representativeEtlId': ic, 'etlFingerPrint': fp}
        if writer is None:
            writer = csv.DictWriter(fh, row.keys())
            writer.writerow(dict(zip(row.keys(), row.keys())))
        r = [(k, unicode(v).encode('utf-8')) for k, v in row.items()]
        writer.writerow(dict(r))
    fh.close()

def extend_speeches(engine, wahlperiode=17):
    log.info("Amending speeches with DRS ...")
    drs_match = re.compile(DRS_MATCH % (wahlperiode, wahlperiode))
    Speech = sl.get_table(engine, 'speech')
    SpeechDocument = sl.get_table(engine, 'speech_document')
    for i, data in enumerate(sl.find(engine, Speech)):
        if data.get('type') != 'chair':
            continue
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        m = drs_match.search(data.get('text'))
        if m is None:
            continue
        # j renamed from i to avoid shadowing the outer loop counter
        for j, grp in enumerate(m.groups()):
            if grp and '/' in grp:
                wp, nummer = grp.split('/', 1)
                sl.upsert(engine, SpeechDocument, {
                    'group': j,
                    'sequence': data['sequence'],
                    'sitzung': data['sitzung'],
                    'wahlperiode': wahlperiode,
                    'dok_nummer': nummer
                }, unique=['sequence', 'sitzung', 'wahlperiode', 'group'])

def make_fingerprint(engine, person):
    try:
        long_name = make_long_name(person)
        try:
            long_name = resolve_person(long_name)
            log.info(" -> %s" % long_name.strip())
        except:
            log.error("Resolve did not work")
        Person = sl.get_table(engine, 'person')
        sl.upsert(engine, Person, {
            'fingerprint': long_name,
            'slug': url_slug(long_name),
            'mdb_id': person['mdb_id']
        }, unique=['mdb_id'])
        Rolle = sl.get_table(engine, 'rolle')
        sl.upsert(engine, Rolle, {
            'mdb_id': person['mdb_id'],
            'fingerprint': long_name
        }, unique=['mdb_id'])
        person['fingerprint'] = long_name
    except BadReference:
        log.error("Bad Reference %s", person)

def condense(engine, resource_id, table_id, force):
    table_suffix = '%s_table%s' % (resource_id, table_id)
    if not engine.has_table('raw_%s' % table_suffix):
        return
    condensed_table = sl.get_table(engine, 'condensed')

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, condensed_table,
                                 resource_id=resource_id,
                                 table_id=table_id) is not None:
        return

    connection = engine.connect()
    trans = connection.begin()
    start = time.time()
    try:
        raw_table = sl.get_table(connection, 'raw_%s' % table_suffix)
        sl.drop_table(connection, 'spending_%s' % table_suffix)
        spending_table = sl.get_table(connection, 'spending_%s' % table_suffix)
        columns_table = sl.get_table(connection, 'column_sets')

        normalise_map = normalised_columns_map(raw_table)
        normalised_headers = ','.join(sorted(normalise_map.values()))
        mapping_row = sl.find_one(connection, columns_table,
                                  normalised=normalised_headers)
        if mapping_row is None or not mapping_row.get('valid'):
            # This table is unmapped, cannot be condensed
            return

        column_mapping = json.loads(mapping_row['column_map'])

        # Build the final mapping from input column to output column
        mapping = {}
        for k, n in normalise_map.iteritems():
            if n in column_mapping and column_mapping[n] is not None \
                    and len(column_mapping[n]) > 0:
                mapping[k] = column_mapping[n]

        for row in sl.all(connection, raw_table):
            spending_row = {}
            for key, value in row.items():
                if key not in mapping:
                    continue
                if not value or not len(value.strip()):
                    continue
                if mapping[key] in spending_row:
                    continue
                spending_row[mapping[key]] = value.strip()
            #print spending_row
            sl.add_row(connection, spending_table, spending_row)
        sl.upsert(connection, condensed_table, {
            'resource_id': resource_id,
            'table_id': table_id,
            'condense_time': time.time() - start,
        }, ['resource_id', 'table_id'])
        trans.commit()
    finally:
        connection.close()

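# Usage sketch (hypothetical ids, not part of the original module): condense a
# single raw table; db_connect() is the engine helper the other entry points
# in this codebase use. Re-running with force=False skips resources already
# recorded in the 'condensed' table.
def condense_example():
    engine = db_connect()
    condense(engine, 'example-resource-id', '0', force=False)
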
def speechmatcher(wp, session):
    engine = etl_engine()
    speech_table = sl.get_table(engine, 'speech')
    speeches = sl.find(engine, speech_table, order_by='sequence',
                       wahlperiode=wp, sitzung=session, matched=True)
    webtv_table = sl.get_table(engine, 'webtv')
    agenda = sl.find(engine, webtv_table, wp=wp, session=session)
    agenda = list(agenda)
    return render_template('backend/speechmatcher.html',
                           speeches=speeches, agenda=agenda,
                           wp=wp, session=session)

def create_entities(engine):
    log.info("De-normalizing global entities collection...")
    table = sl.get_table(engine, 'entity')
    for tbl in ['representative', 'person', 'financialDataTurnover',
                'organisation', 'network_entity']:
        for row in sl.all(engine, sl.get_table(engine, tbl)):
            entity = {'etlFingerPrint': row.get('etlFingerPrint')}
            entity['legalStatus'] = row.get('legalStatus', '')
            entity['countryCode'] = row.get('contactCountryCode', '')
            entity['etlTable'] = tbl
            sl.upsert(engine, table, entity, ['etlFingerPrint', 'etlTable'])

def add_to_gremium(node, url, role, engine):
    key = node.get('id')
    table = sl.get_table(engine, 'gremium')
    g = sl.find_one(engine, table, key=key)
    if g is None:
        g = {'key': key, 'type': 'sonstiges'}
        g['name'] = node.findtext('gremiumName')
        g['url'] = node.findtext('gremiumURL')
        sl.upsert(engine, table, g, unique=['key'])
    table = sl.get_table(engine, 'gremium_mitglieder')
    sl.upsert(engine, table, {
        'gremium_key': g['key'],
        'person_source_url': url,
        'role': role
    }, unique=['person_source_url', 'gremium_key', 'role'])

def retrieve(row, engine, force):
    ret_table = sl.get_table(engine, 'retrieval_log')
    #print row.get('package_name'), row['url'].encode('utf-8')
    try:
        import os
        if not force and os.path.exists(source_path(row)):
            return
        url = fix_url(row['url'])
        print "Fetching %s" % url
        res = urllib2.urlopen(url)
        fh = open(source_path(row), 'wb')
        fh.write(res.read())
        sl.add_row(engine, ret_table, {
            'resource_id': row['resource_id'],
            'status': '200',
            'message': "",
            'content-type': res.headers.get('content-type', ''),
            'timestamp': datetime.now()
        })
    except Exception, ioe:
        print traceback.format_exc()
        status = 0
        if hasattr(ioe, 'code'):
            status = ioe.code
        sl.add_row(engine, ret_table, {
            'resource_id': row['resource_id'],
            'status': status,
            'message': unicode(ioe),
            'timestamp': datetime.now()
        })
        assert False, unicode(ioe).encode('utf-8')

def load_rollen(engine, person, data):
    _RolleSource = sl.get_table(engine, 'rolle')
    for rdata in sl.find(engine, _RolleSource,
                         fingerprint=data['fingerprint']):
        rolle = Rolle.query.filter_by(
            person=person,
            funktion=rdata.get('funktion'),
            ressort=rdata.get('ressort'),
            fraktion=rdata.get('fraktion'),
            land=rdata.get('land')).first()
        if rolle is None:
            rolle = Rolle()
            rolle.person = person
        rolle.mdb_id = rdata.get('mdb_id')
        rolle.status = rdata.get('status')
        rolle.funktion = rdata.get('funktion')
        rolle.fraktion = rdata.get('fraktion')
        rolle.gewaehlt = rdata.get('gewaehlt')
        rolle.ressort = rdata.get('ressort')
        rolle.land = rdata.get('land')
        rolle.austritt = to_date(rdata.get('austritt'))
        if rdata.get('mdb_id'):
            rolle.wahlkreis = load_wahlkreis(engine, rolle, data)
        db.session.add(rolle)

def clear_issues(engine, resource_id, stage):
    import sqlaload as sl  # this import is slow, so it is done inside this func
    table = sl.get_table(engine, 'issue')
    sl.delete(engine, table,
              resource_id=resource_id,
              stage=stage)

def combine(force=False, filter=None):
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        combine_resource(engine, source_table, row, force, stats)
    log.info('Combine summary: \n%s' % stats.report())

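# Usage sketch (hypothetical publisher value): the filter dict keys are
# 'source' table columns (resource_id, package_name, publisher_title, per the
# kwargs note in extract_some below), so combining one publisher's resources
# might look like:
def combine_example():
    combine(force=False, filter={'publisher_title': 'Example Department'})
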
def combine_resource_core(engine, row, stats):
    '''Given a resource (source row) it opens the table with its contents
    and puts its rows into the combined table, using the column mappings.
    Returns whether it succeeds or not.
    '''
    error = None
    for sheet_id in range(0, row['sheets']):
        table = sl.get_table(engine, 'raw_%s_sheet%s' % (
            row['resource_id'], sheet_id))
        if not engine.has_table(table.name):
            error = 'Sheet table does not exist'
            log.warn('Sheet table does not exist: %s', table)
            continue
        columns = [c.name for c in table.columns]
        mapping = column_mapping(engine, row, columns)
        if mapping is None:
            error = 'Column mappings not complete'
            log.warn('Column mappings not complete: %s', columns)
            continue
        log.debug('Column mapping: %r', mapping)
        if not combine_sheet(engine, row, sheet_id, table, mapping):
            error = 'Could not combine sheet'
    if error:
        stats.add_source(error, row)
    else:
        stats.add_source('Combined ok', row)
    return not error

def combine_sheet(engine, resource, sheet_id, table, mapping):
    begin = time.time()
    base = {
        'resource_id': resource['resource_id'],
        'resource_hash': resource['extract_hash'],
        'sheet_id': sheet_id,
    }
    spending_table = sl.get_table(engine, 'spending')
    connection = engine.connect()
    trans = connection.begin()
    try:
        rows = 0
        sl.delete(connection, spending_table,
                  resource_id=resource['resource_id'],
                  sheet_id=sheet_id)
        for row in sl.all(connection, table):
            data = dict(base)
            for col, value in row.items():
                if col == 'id':
                    data['row_id'] = value
                    continue
                mapped = mapping.get(col)
                if mapped is not None:
                    data[mapped] = value
            sl.add_row(connection, spending_table, data)
            rows += 1
        trans.commit()
        log.info("Loaded %s rows in %s ms", rows,
                 int((time.time() - begin) * 1000))
        return rows > 0
    finally:
        connection.close()

def load_documents(engine):
    refs = sl.get_table(engine, 'referenz')
    for ref in sl.distinct(engine, refs, 'link'):
        link = ref.get('link')
        if link is None:
            continue
        load_document(link)

def load_abstimmung(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    stimmen = list(sl.find(engine, table, source_url=source_url,
                           matched=True))
    if not len(stimmen):
        log.error("No reconciled votes, signals deeper trouble?")
        return
    thema = stimmen[0].get('subject')
    abst = Abstimmung.query.filter_by(thema=thema).first()
    if abst is None:
        abst = Abstimmung()
        abst.thema = thema
        abst.datum = to_date(stimmen[0].get('date'))
        db.session.add(abst)
        db.session.flush()
    for stimme_ in stimmen:
        person = Person.query.filter_by(
            fingerprint=stimme_.get('fingerprint')).first()
        if person is None:
            continue
        stimme = Stimme.query.filter_by(abstimmung=abst).filter_by(
            person=person).first()
        if stimme is not None:
            continue
        stimme = Stimme()
        stimme.entscheidung = stimme_['vote']
        stimme.person = person
        stimme.abstimmung = abst
        db.session.add(stimme)
    db.session.commit()

def scrape_transcript(engine, url, force=False):
    wp, session = url_metadata(url)
    table = sl.get_table(engine, 'speech')
    sio = find_local(url)
    sample = {'source_etag': 'local'}
    if sio is None:
        sample = sl.find_one(engine, table, source_url=url, matched=True)
        response, sio = fetch_stream(url)
        sample = check_tags(sample or {}, response, force)
    base_data = {'source_url': url,
                 'sitzung': session,
                 'wahlperiode': wp,
                 'matched': False,
                 'loaded': False,
                 'source_etag': sample['source_etag']}
    log.info("Loading transcript: %s/%s, from %s", wp, session, url)
    seq = 0
    parser = SpeechParser(sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib.update(base_data)
        contrib['sequence'] = seq
        sl.upsert(engine, table, contrib,
                  unique=['source_url', 'sequence'])
        seq += 1
    if not parser.missing_recon:
        sl.upsert(engine, table, {
            'matched': True,
            'source_url': url,
        }, unique=['source_url'])
    else:
        raise InvalidReference()
    return base_data

def generate_person_long_names(engine):
    log.info("Generating person fingerprints and slugs...")
    from offenesparlament.transform.namematch import match_speaker
    nkp = nk_persons()
    Person = sl.get_table(engine, 'person')
    for person in sl.find(engine, Person):
        long_name = make_long_name(person)
        try:
            long_name = match_speaker(long_name)
        except NKNoMatch:
            pass
        log.info(" -> %s" % long_name.strip())
        slug = url_slug(long_name)
        sl.upsert(engine, Person, {
            'fingerprint': long_name,
            'slug': slug,
            'id': person['id']
        }, unique=['id'])
        tries = 0
        while True:
            try:
                nkp.ensure_value(long_name, data=person)
            except ValueError, E:
                log.warn('Exception: %s' % str(E))
                tries += 1
                if tries > 5:
                    raise
            else:
                break

def load(engine, grano):
    for rep in sl.find(engine, sl.get_table(engine, 'representative')):
        del rep['id']
        rep_ent = canonical_actor(grano, engine, rep['originalName'])
        if 'id' in rep_ent:
            rep_ent = grano.getEntity(rep_ent['id'], deep=True)
            #if not SETTINGS.FULL and rep_ent['etlId'] == rep['etlId']:
            #    continue
        rep_ent.update(rep)
        rep_ent['actsAsRepresentative'] = True
        rep_ent['staffMembers'] = int(float(rep['members']))
        rep_ent['incoming'] = rep_ent.get('incoming', [])
        rep_ent['outgoing'] = rep_ent.get('outgoing', [])
        rep_ent['contactCountry'] = rep_ent['contactCountryNorm']
        rep_ent = load_clients(grano, engine, rep_ent)
        rep_ent = load_organisations(grano, engine, rep_ent)
        rep_ent = load_networking(grano, engine, rep_ent)
        rep_ent = load_persons(grano, engine, rep_ent)
        rep_ent = load_interests(grano, engine, rep_ent)
        rep_ent = load_action_fields(grano, engine, rep_ent)
        rep_ent = get_financial_data(engine, rep_ent)
        # TODO: other financial sources
        #from pprint import pprint
        #pprint(rep_ent)
        grano.updateEntity(rep_ent)

def load_abstimmungen(engine):
    _Abstimmung = sl.get_table(engine, 'abstimmung')
    i = 0
    for row in sl.distinct(engine, _Abstimmung, 'subject', 'date'):
        thema = row.get('subject')
        abst = Abstimmung.query.filter_by(thema=thema).first()
        if abst is None:
            abst = Abstimmung()
            abst.thema = thema
            abst.datum = date(row.get('date'))
            db.session.add(abst)
        for stimme_ in sl.find(engine, _Abstimmung, subject=thema,
                               matched=True):
            if i % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            i += 1
            person = Person.query.filter_by(
                fingerprint=stimme_.get('fingerprint')).first()
            if person is None:
                continue
            stimme = Stimme.query.filter_by(abstimmung=abst).filter_by(
                person=person).first()
            if stimme is not None:
                continue
            stimme = Stimme()
            stimme.entscheidung = stimme_['vote']
            stimme.person = person
            stimme.abstimmung = abst
            db.session.add(stimme)
    db.session.commit()

def extend_positions(engine):
    log.info("Amending positions ...")
    Position = sl.get_table(engine, 'position')
    for i, data in enumerate(sl.find(engine, Position)):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        dt, rest = data['fundstelle'].split("-", 1)
        data['date'] = datetime.strptime(dt.strip(), "%d.%m.%Y").isoformat()
        if ',' in data['urheber']:
            typ, quelle = data['urheber'].split(',', 1)
            data['quelle'] = re.sub("^.*Urheber.*:", "", quelle).strip()
            data['typ'] = typ.strip()
        else:
            data['typ'] = data['urheber']
        br = 'Bundesregierung, '
        if data['urheber'].startswith(br):
            data['urheber'] = data['urheber'][len(br):]
        data['fundstelle_doc'] = None
        if data['fundstelle_url'] and 'btp' in data['fundstelle_url']:
            data['fundstelle_doc'] = data['fundstelle_url'].rsplit('#', 1)[0]
        hash = sha1(data['fundstelle'].encode('utf-8')
                    + data['urheber'].encode('utf-8')
                    + data['ablauf_id'].encode('utf-8')).hexdigest()
        data['hash'] = hash[:10]
        sl.upsert(engine, Position, data, unique=UNIQUE)

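# Worked example (illustration only; assumes sha1 is hashlib.sha1, which the
# hexdigest() call above implies, and uses made-up field values). The position
# hash is the first ten hex characters of SHA-1 over
# fundstelle + urheber + ablauf_id, giving a short, stable id for upserts:
from hashlib import sha1 as _sha1

_digest = _sha1(u'01.01.2011 - BT-Drucksache 17/1'.encode('utf-8')
                + u'Bundesregierung'.encode('utf-8')
                + u'17/12345'.encode('utf-8')).hexdigest()
_example_hash = _digest[:10]
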
def load_rollen(engine, person, data):
    _RolleSource = sl.get_table(engine, 'rolle')
    mdb_rolle = None
    for rdata in sl.find(engine, _RolleSource,
                         fingerprint=data['fingerprint']):
        rolle = Rolle.query.filter_by(
            person=person,
            funktion=rdata.get('funktion'),
            ressort=rdata.get('ressort'),
            fraktion=rdata.get('fraktion'),
            land=rdata.get('land')).first()
        if rolle is None:
            rolle = Rolle()
            rolle.person = person
        rolle.mdb_id = rdata.get('mdb_id')
        rolle.status = rdata.get('status')
        rolle.funktion = rdata.get('funktion')
        rolle.fraktion = rdata.get('fraktion')
        rolle.gewaehlt = rdata.get('gewaehlt')
        rolle.ressort = rdata.get('ressort')
        rolle.land = rdata.get('land')
        rolle.austritt = date(rdata.get('austritt'))
        if rdata.get('mdb_id'):
            rolle.wahlkreis = load_wahlkreis(engine, rolle, data)
            mdb_rolle = rolle
        db.session.add(rolle)
    return mdb_rolle

def validate_sheet(engine, row, sheet_id, data_row_filter, stats_spending):
    spending_table = sl.get_table(engine, 'spending')
    data = list(sl.find(engine, spending_table,
                        resource_id=row['resource_id'],
                        sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    issue_noted_for_this_resource = False  # record first failure only
    error_message = None
    try:
        records = 0
        for row_ in data:
            if data_row_filter and data_row_filter != row_['row_id']:
                continue
            result = {'id': row_['id'], 'valid': True}
            result['signature'] = generate_signature(row_)

            if row_['DateFormatted'] is None:
                stats_spending['date'].add_spending('Date invalid', row_)
                result['valid'] = False
                if not issue_noted_for_this_resource:
                    issue(engine, row['resource_id'], row['retrieve_hash'],
                          STAGE,
                          'Date invalid (blank, inconsistent or unrecognised format)',
                          {'row_id': row_.get('row_id'),
                           'row_number': row_.get('row_number'),
                           'Date': row_.get('Date')})
                    error_message = 'Date invalid'
                    issue_noted_for_this_resource = True
            else:
                stats_spending['date'].add_spending('Date ok', row_)

            if row_['AmountFormatted'] is None:
                stats_spending['amount'].add_spending('Amount invalid', row_)
                result['valid'] = False
                if not issue_noted_for_this_resource:
                    issue(engine, row['resource_id'], row['retrieve_hash'],
                          STAGE, 'Amount invalid',
                          {'row_id': row_.get('row_id'),
                           'row_number': row_.get('row_number'),
                           'Amount': row_.get('Amount')})
                    error_message = 'Amount invalid'
                    issue_noted_for_this_resource = True
            else:
                stats_spending['amount'].add_spending('Amount ok', row_)

            if result['valid']:
                records += 1
            sl.update(connection, spending_table, {'id': result['id']}, result)
        trans.commit()
        return records > 0, error_message
    finally:
        connection.close()

def load_transcript(engine, wp, session, incremental=True):
    url = URL % (wp, session)
    Speech = sl.get_table(engine, 'speech')
    if incremental and sl.find_one(engine, Speech,
                                   source_url=url, matched=True):
        return True
    if '404 Seite nicht gefunden' in fetch(url):
        return False
    sio = fetch_stream(url)
    if sio is None:
        return False
    log.info("Loading transcript: %s/%s" % (wp, session))
    seq = 0
    parser = SpeechParser(engine, sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib['sitzung'] = session
        contrib['sequence'] = seq
        contrib['wahlperiode'] = wp
        contrib['source_url'] = url
        contrib['matched'] = True
        sl.upsert(engine, Speech, contrib,
                  unique=['sequence', 'sitzung', 'wahlperiode'])
        seq += 1
    if parser.missing_recon:
        sl.upsert(engine, Speech, {
            'matched': False,
            'sitzung': session,
            'wahlperiode': wp
        }, unique=['sitzung', 'wahlperiode'])
    return True

def cleanup_sheet(engine, row, sheet_id):
    spending_table = sl.get_table(engine, 'spending')
    data = list(sl.find(engine, spending_table,
                        resource_id=row['resource_id'],
                        sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        if None in date_formats.values():
            log.warn("Couldn't detect date formats: %r", date_formats)
            issue(engine, row['resource_id'], row['retrieve_hash'],
                  "Couldn't detect date formats", repr(date_formats))
            return False
        sl.delete(connection, spending_table,
                  resource_id=row['resource_id'],
                  sheet_id=sheet_id)
        # row_ rather than row, to avoid shadowing the source row parameter
        for row_ in data:
            row_ = cleanup_dates.apply(row_, date_formats)
            row_ = cleanup_numbers.apply(row_)
            row_ = cleanup_gov.apply(row_)
            #row_ = cleanup_supplier.apply(row_, engine)
            del row_['id']
            sl.add_row(connection, spending_table, row_)
        trans.commit()
        return True
    finally:
        connection.close()

def merge_speeches(engine):
    # desired result: (position_id, debatte_id)
    referenzen = referenzen_index(engine)
    items = item_index(engine)

    log.info("Finding best matches....")
    matches = {}
    for (ablauf_id, rwp, rsession), rdrs in referenzen.items():
        for (iwp, isession, item_id), idrs in items.items():
            if iwp != rwp or rsession != isession:
                continue
            ints = len(idrs.intersection(rdrs))
            if ints == 0:
                continue
            k = (ablauf_id, rwp, rsession)
            if k in matches and matches[k][1] > ints:
                continue
            matches[k] = (item_id, ints)

    log.info("Saving position associations....")
    pos_tbl = sl.get_table(engine, 'position')
    for (ablauf_id, wp, session), (item_id, n) in matches.items():
        for pos in sl.find(engine, pos_tbl,
                           ablauf_id="%s/%s" % (wp, ablauf_id)):
            if not pos['fundstelle_url']:
                continue
            if 'btp/%s/%s%03d.pdf' % (wp, wp, int(session)) \
                    in pos['fundstelle_url']:
                d = {'ablauf_id': pos['ablauf_id'],
                     'hash': pos['hash'],
                     'debatte_wp': wp,
                     'debatte_session': session,
                     'debatte_item_id': item_id}
                sl.upsert(engine, pos_tbl, d, unique=['ablauf_id', 'hash'])

def get_agenda(engine, wp, session):
    return list(sl.find(engine, sl.get_table(engine, 'webtv'),
                        wp=wp, session=session, order_by='speech_id'))

def load_news(engine):
    log.info("Loading news into production DB...")
    _NewsSource = sl.get_table(engine, 'news')
    for data in sl.all(engine, _NewsSource):
        news = NewsItem.query.filter_by(
            source_url=data.get('source_url')).first()
        if news is None:
            news = NewsItem()
            news.key = data.get('key')
            news.source_url = data.get('source_url')
        news.title = data.get('title')
        news.text = data.get('text')
        news.date = date(data.get('date'))
        news.image_url = data.get('image_url')
        news.image_copyright = data.get('image_copyright')
        news.image_changed_at = date(data.get('image_changed_at'))
        if data.get('gremium'):
            gremium = Gremium.query.filter_by(
                key=data.get('gremium')).first()
            if gremium is None:
                log.error("Gremium %s not found!" % data.get('gremium'))
            else:
                news.gremium = gremium
        db.session.add(news)
    db.session.commit()

def scrape_transcript(engine, url, force=False):
    wp, session = url_metadata(url)
    table = sl.get_table(engine, 'speech')
    sample = sl.find_one(engine, table, source_url=url, matched=True)
    response, sio = fetch_stream(url)
    sample = check_tags(sample or {}, response, force)
    base_data = {'source_url': url,
                 'sitzung': session,
                 'wahlperiode': wp,
                 'matched': False,
                 'loaded': False,
                 'source_etag': sample['source_etag']}
    log.info("Loading transcript: %s/%s, from %s", wp, session, url)
    seq = 0
    parser = SpeechParser(sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib.update(base_data)
        contrib['sequence'] = seq
        sl.upsert(engine, table, contrib,
                  unique=['source_url', 'sequence'])
        seq += 1
    if not parser.missing_recon:
        sl.upsert(engine, table, {
            'matched': True,
            'source_url': url,
        }, unique=['source_url'])
    else:
        raise InvalidReference()
    return base_data

def scrape_activity(ablauf, elem, engine):
    urheber = elem.findtext("URHEBER")
    fundstelle = elem.findtext("FUNDSTELLE")
    Position = sl.get_table(engine, 'position')
    p = sl.find_one(engine, Position,
                    urheber=urheber,
                    fundstelle=fundstelle,
                    ablauf_id=ablauf['ablauf_id'])
    if p is not None:
        return
    p = {'ablauf_id': ablauf['ablauf_id'],
         'urheber': urheber,
         'fundstelle': fundstelle}
    pos_keys = p.copy()
    p['zuordnung'] = elem.findtext("ZUORDNUNG")
    p['abstrakt'] = elem.findtext("VP_ABSTRAKT")
    p['fundstelle_url'] = elem.findtext("FUNDSTELLE_LINK")

    Zuweisung = sl.get_table(engine, 'zuweisung')
    for zelem in elem.findall("ZUWEISUNG"):
        z = pos_keys.copy()
        z['text'] = zelem.findtext("AUSSCHUSS_KLARTEXT")
        z['federfuehrung'] = zelem.find("FEDERFUEHRUNG") is not None
        z['gremium_key'] = DIP_GREMIUM_TO_KEY.get(z['text'])
        sl.upsert(engine, Zuweisung, z, unique=[])

    Beschluss = sl.get_table(engine, 'beschluss')
    for belem in elem.findall("BESCHLUSS"):
        b = pos_keys.copy()
        b['seite'] = belem.findtext("BESCHLUSSSEITE")
        b['dokument_text'] = belem.findtext("BEZUGSDOKUMENT")
        b['tenor'] = belem.findtext("BESCHLUSSTENOR")
        b['grundlage'] = belem.findtext("GRUNDLAGE")
        sl.upsert(engine, Beschluss, b, unique=[])

    Referenz = sl.get_table(engine, 'referenz')
    try:
        dokument = dokument_by_url(p['fundstelle_url']) or \
            dokument_by_name(p['fundstelle'])
        dokument.update(pos_keys)
        dokument['ablauf_key'] = ablauf['key']
        dokument['wahlperiode'] = ablauf['wahlperiode']
        sl.upsert(engine, Referenz, dokument, unique=[
            'link', 'wahlperiode', 'ablauf_key', 'seiten'])
    except Exception, e:
        log.exception(e)

def extract_some(force=False, filter=None):
    # kwargs = resource_id=x, package_name=y, publisher_title=z
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        extract_resource(engine, source_table, row, force, stats)
    log.info('Extract summary: \n%s' % stats.report())

def index():
    engine = etl_engine()
    webtv_table = sl.get_table(engine, 'webtv')
    sessions = sl.distinct(engine, webtv_table,
                           'wp', 'session', 'session_name')
    sessions = sorted(sessions, reverse=True)
    return render_template('backend/index.html', sessions=sessions)

def retrieve_some(force=False, filter=None):
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        retrieve(row, engine, source_table, force, stats)
    print 'Retrieve summary:'
    print stats.report()

def mark_done(engine, url):
    table = sl.get_table(engine, 'speech')
    sl.upsert(engine, table, {
        'loaded': True,
        'source_url': url,
    }, unique=['source_url'])

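# Usage sketch (assumption: transcript URLs come from a listing step not shown
# here). A driver would scrape each transcript and only mark it done once it
# loaded without raising InvalidReference:
def process_transcript(engine, url):
    try:
        scrape_transcript(engine, url)
        mark_done(engine, url)
    except InvalidReference:
        log.warn("Unreconciled speakers, not marking as loaded: %s", url)
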
def scrape_abstimmung(engine, url, force=False):
    abstimmung = sl.get_table(engine, 'abstimmung')
    sample = sl.find_one(engine, abstimmung, source_url=url)
    response = fetch(url)
    sample = check_tags(sample or {}, response, force)
    base_data = {'source_url': url,
                 'source_etag': sample['source_etag']}
    handle_xml(engine, base_data, response.content)
    return base_data

def match_beitraege(engine, url):
    table = sl.get_table(engine, 'beitrag')
    for beitrag in sl.distinct(engine, table, *KEYS, source_url=url):
        match = match_beitrag(engine, beitrag, url)
        beitrag['fingerprint'] = match
        beitrag['matched'] = match is not None
        if match:
            ensure_rolle(beitrag, match, engine)
        sl.upsert(engine, table, beitrag, unique=KEYS)

def speechmatcher_alignment_post(wp, session):
    engine = etl_engine()
    table = sl.get_table(engine, 'alignments')
    data = dict(request.form.items())
    data['sequence'] = int(data['sequence'])
    data['wp'] = wp
    data['session'] = session
    sl.upsert(engine, table, data, ['wp', 'session', 'sequence'])
    return speechmatcher_alignment_get(wp, session)

def ensure_rolle(beitrag, fp, engine):
    rolle = {
        'fingerprint': fp,
        'ressort': beitrag.get('ressort'),
        'fraktion': beitrag.get('fraktion'),
        'funktion': beitrag.get('funktion')
    }
    Rolle = sl.get_table(engine, 'rolle')
    sl.upsert(engine, Rolle, rolle, unique=['fingerprint', 'funktion'])

def retrieve_some(force=False, **filters):
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    result_counts = defaultdict(int)
    for row in sl.find(engine, source_table, **filters):
        result = retrieve(row, engine, source_table, force)
        result_counts['total'] += 1
        result_counts[result] += 1
    log.info('Total %i URLs', result_counts.pop('total'))
    for result, count in result_counts.items():
        log.info('  %i %s', count, result)

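# Usage sketch (hypothetical filter value): in this variant the filter kwargs
# name 'source' table columns directly, per the kwargs note in extract_some
# above, e.g.:
if __name__ == '__main__':
    retrieve_some(force=False, publisher_title='Example Department')
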
def cleanup(force=False, resource_filter=None, data_row_filter=None):
    stats = OpenSpendingStats()
    stats_spending = defaultdict(OpenSpendingStats)
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(resource_filter or {})):
        cleanup_resource(engine, source_table, row, force,
                         data_row_filter, stats, stats_spending)
    log.info('Cleanup summary: \n%s' % stats.report())
    for key in stats_spending:
        log.info('Cleanup %s: \n%s' % (key, stats_spending[key].report()))

def clean_ablauf(engine, data):
    try:
        table = sl.get_table(engine, 'ablauf')
        data['class'] = resolve_type(data.get('typ'))
        data['stage'] = resolve_stage(data.get('stand'))
        d = {'class': data['class'],
             'stage': data['stage'],
             'source_url': data['source_url']}
        sl.upsert(engine, table, d, unique=['source_url'])
    except BadReference:
        pass

def merge_speech(engine, wp, session):
    log.info("Merging media + transcript: %s/%s" % (wp, session))
    score, alignment = get_alignment(engine, wp, session)
    log.info("Matching score: %s", score)
    agenda = get_agenda(engine, wp, session)
    agenda = dict([(a['item_id'], a) for a in agenda])
    alignment = dict([(a['sequence'], a) for a in alignment])
    item = None
    table = sl.get_table(engine, 'webtv_speech')
    for speech in sl.find(engine, sl.get_table(engine, 'speech'),
                          order_by='sequence', wahlperiode=wp,
                          sitzung=session, matched=True):
        sequence = speech['sequence']
        item = alignment.get(sequence, item)
        data = agenda.get(item['item_id']).copy()
        del data['id']
        data['sequence'] = sequence
        sl.upsert(engine, table, data, unique=['wp', 'session', 'sequence'])

def match_beitrag(engine, beitrag, url):
    beitrag_print = make_long_name(beitrag)
    log.info("Matching: %s", beitrag_print.encode('ascii', 'replace'))
    try:
        value = resolve_person(beitrag_print)
        if sl.find_one(engine, sl.get_table(engine, 'person'),
                       fingerprint=value) is None:
            make_person(engine, beitrag, value, url)
        return value
    except BadReference:
        log.info("Beitrag person is unknown: %s",
                 beitrag_print.encode('ascii', 'replace'))