def load_abstimmungen(engine):
    _Abstimmung = sl.get_table(engine, 'abstimmung')
    i = 0
    for row in sl.distinct(engine, _Abstimmung, 'subject', 'date'):
        thema = row.get('subject')
        abst = Abstimmung.query.filter_by(thema=thema).first()
        if abst is None:
            abst = Abstimmung()
        abst.thema = thema
        abst.datum = date(row.get('date'))
        db.session.add(abst)
        # Attach one Stimme per matched voter record for this ballot.
        for stimme_ in sl.find(engine, _Abstimmung, subject=thema,
                               matched=True):
            if i % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            i += 1
            person = Person.query.filter_by(
                fingerprint=stimme_.get('fingerprint')).first()
            if person is None:
                continue
            stimme = Stimme.query.filter_by(
                abstimmung=abst).filter_by(
                person=person).first()
            if stimme is not None:
                continue
            stimme = Stimme()
            stimme.entscheidung = stimme_['vote']
            stimme.person = person
            stimme.abstimmung = abst
            db.session.add(stimme)
        db.session.commit()

def load_documents(engine):
    refs = sl.get_table(engine, 'referenz')
    for ref in sl.distinct(engine, refs, 'link'):
        link = ref.get('link')
        if link is None:
            continue
        load_document(link)

def index():
    engine = etl_engine()
    webtv_table = sl.get_table(engine, 'webtv')
    sessions = sl.distinct(engine, webtv_table, 'wp', 'session',
                           'session_name')
    sessions = sorted(sessions, reverse=True)
    return render_template('backend/index.html', sessions=sessions)

def merge():
    engine = util.make_engine()
    table = sl.get_table(engine, 'fts')
    for row in sl.distinct(engine, table, 'beneficiary', 'country_code'):
        canonical, uri, score = lookup(row.get('beneficiary'),
                                       row.get('country_code'), engine)
        row['beneficiary_canonical'] = canonical
        row['beneficiary_uri'] = uri
        row['beneficiary_score'] = score
        # Upsert on the same keys used for the distinct() selection above.
        sl.upsert(engine, table, row, ['beneficiary', 'country_code'])

def match_beitraege(engine, url):
    table = sl.get_table(engine, 'beitrag')
    for beitrag in sl.distinct(engine, table, *KEYS, source_url=url):
        match = match_beitrag(engine, beitrag, url)
        beitrag['fingerprint'] = match
        beitrag['matched'] = match is not None
        if match:
            ensure_rolle(beitrag, match, engine)
        sl.upsert(engine, table, beitrag, unique=KEYS)

def extend_ablaeufe(engine, master):
    log.info("Amending ablaeufe ...")
    Ablauf = sl.get_table(engine, 'ablauf')
    typen = [(t.get('typ'), t.get('class')) for t in master['ablauf_typ']]
    typen = dict(typen)
    for data in sl.distinct(engine, Ablauf, 'typ'):
        klass = typen.get(data.get('typ'))
        sl.upsert(engine, Ablauf, {'typ': data.get('typ'),
                                   'class': klass}, unique=['typ'])

def merge():
    read_countries()
    engine = util.make_engine()
    table = sl.get_table(engine, 'fts')
    for row in sl.distinct(engine, table, 'country'):
        country = row.get('country')
        data = match(country)
        row['country_code'] = data.get('iso_3166-1_2')
        row['country_common'] = data.get('common')
        sl.upsert(engine, table, row, ['country'])

def build_index(publisher_name=None):
    '''Searches CKAN for spending resources and writes their metadata to
    the database.'''
    engine, table = connect()
    client = ckan_client()
    log.info('CKAN: %s', client.base_location)
    tags = ['+tags:"%s"' % t for t in TAGS]
    q = " OR ".join(tags)
    publisher_dict_filter = {}
    if publisher_name:
        publisher_solr_filter = 'publisher:"%s"' % publisher_name
        q = '(%s) AND (%s)' % (q, publisher_solr_filter)
        publisher_dict_filter = {'publisher_name': publisher_name}
    log.info('SOLR Search q: %r', q)
    existing_packages = set(
        [res['package_name'] for res in
         sl.distinct(engine, table, 'package_name', **publisher_dict_filter)])
    log.info('Existing datasets: %i', len(existing_packages))
    processed_packages = set()
    log.info('Doing package search for: "%s"', q)
    res = client.package_search(q, search_options={'limit': 2000})
    log.info('Search returned %i dataset results', res['count'])
    stats = OpenSpendingStats()
    stats_resources = OpenSpendingStats()
    for package_name in res['results']:
        processed_packages.add(package_name)
        num_resources = fetch_package(client, package_name, engine, table,
                                      stats_resources)
        if num_resources == 0:
            stats.add('Dataset has no resources', package_name)
        else:
            stats.add('Dataset has resources', package_name)
    # Remove rows about deleted packages
    obsolete_packages = existing_packages - processed_packages
    log.info('Obsolete datasets: %s from %s', len(obsolete_packages),
             len(existing_packages))
    for package_name in obsolete_packages:
        sl.delete(engine, table, package_name=package_name)
        sl.delete(engine, 'issue', package_name=package_name)
        stats.add('Removed obsolete dataset', package_name)
    # Remove stray rows that have no package_name
    stray_rows = list(sl.find(engine, table, package_name=None))
    if stray_rows:
        log.info('Stray rows without package_name: %i', len(stray_rows))
        sl.delete(engine, table, package_name=None)
        sl.delete(engine, 'issue', package_name=None)
        for row in stray_rows:
            stats.add('Stray row removed', row['resource_id'])
    print 'Datasets build_index summary:'
    print stats.report()
    print 'Resources build_index summary:'
    print stats_resources.report()

def extend_abstimmungen(engine):
    log.info("Amending votes ...")
    Abstimmung = sl.get_table(engine, 'abstimmung')
    for data in sl.distinct(engine, Abstimmung, 'person'):
        try:
            fp = match_speaker(data['person'])
        except NKInvalid, inv:
            log.exception(inv)
            continue
        except NKNoMatch, nm:
            fp = None
            log.info("No match for: %s", data['person'])

def integrate_recon(engine, table, qfunc, src_col, dst_name_col, dst_uri_col,
                    min_score=None, limit=200, memory_name=None):
    if memory_name is None:
        memory_name = "recon_%s_%s" % (table.name, src_col)
    memory = SQLALoadMemory(engine, table=memory_name)
    for row in sl.distinct(engine, table, src_col):
        res = interactive(qfunc, row[src_col], min_score=min_score,
                          memory=memory, limit=limit)
        if res is not None:
            #print row.get(src_col), " -> ", res.name.encode('utf-8'), res.score
            sl.upsert(engine, table, {src_col: row[src_col],
                                      dst_name_col: res.name,
                                      dst_uri_col: res.uri}, [src_col])

def match_beitraege(engine):
    Beitrag = sl.get_table(engine, 'beitrag')
    for i, beitrag in enumerate(sl.distinct(engine, Beitrag,
            'vorname', 'nachname', 'funktion', 'land', 'fraktion',
            'ressort', 'ort')):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        match = match_beitrag(engine, beitrag)
        ensure_rolle(beitrag, match, engine)
        beitrag['fingerprint'] = match
        beitrag['matched'] = match is not None
        sl.upsert(engine, Beitrag, beitrag,
                  unique=['vorname', 'nachname', 'funktion', 'land',
                          'fraktion', 'ressort', 'ort'])

def match_speakers_webtv(engine):
    WebTV = sl.get_table(engine, 'webtv')
    for i, speech in enumerate(sl.distinct(engine, WebTV, 'speaker')):
        if speech['speaker'] is None:
            continue
        speaker = speaker_name_transform(speech['speaker'])
        matched = True
        try:
            fp = match_speaker(speaker)
        except NKInvalid, inv:
            fp = None
        except NKNoMatch, nm:
            fp = None
            matched = False

def item_index(engine):
    webtv_tbl = sl.get_table(engine, 'webtv')
    q = "SELECT s.text FROM speech s LEFT JOIN webtv_speech ws ON ws.wp::int = s.wahlperiode AND " \
        + "ws.session::int = s.sitzung AND ws.sequence = s.sequence " \
        + "WHERE ws.wp = '%(wp)s' AND ws.session = '%(session)s' AND ws.item_id = '%(item_id)s' AND s.type = 'chair' "
    items = {}
    log.info("Building index of drs mentions in speeches...")
    for item in sl.distinct(engine, webtv_tbl, 'wp', 'session', 'item_id'):
        _drs = set()
        for text in list(sl.query(engine, q % item)):
            _drs = _drs.union(drucksachen(text['text'],
                                          wahlperiode=item['wp']))
        if len(_drs):
            items[(item['wp'], item['session'], item['item_id'])] = _drs
    #pprint(items)
    return items

def merge():
    geocoder = shapegeocode.geocoder(
        'nuts2-shapefile/data/NUTS_RG_10M_2006.shp',
        filter=lambda r: r['STAT_LEVL_'] == 3)
    regions = load_region_hierarchy()
    engine = util.make_engine()
    table = sl.get_table(engine, 'fts')
    for row in sl.distinct(engine, table, *KEYS):
        loc = geocode(row)
        if loc is None:
            continue
        row.update(loc)
        reg = find_region(geocoder, regions, row)
        row.update(reg)
        log.info("Geocoded: %s/%s - %s", row['lat'], row['lon'],
                 row.get('nuts3_label'))
        sl.upsert(engine, table, row, KEYS)

def merge(codes):
    engine = util.make_engine()
    table = sl.get_table(engine, 'fts')
    for level in LEVELS:
        src_col = 'budget_code' if level == 'item' else level
        for data in sl.distinct(engine, table, src_col):
            value = data[src_col]
            if level == 'item' and len(value) < 11:
                continue
            if value not in codes:
                print value
                continue
            code_data = codes.get(value)
            data['%s_name' % level] = value
            data['%s_label' % level] = code_data['label']
            data['%s_description' % level] = code_data['description']
            data['%s_legal_basis' % level] = code_data['legal_basis']
            sl.upsert(engine, table, data, [src_col])

def speakers_webtv(engine, wp, session):
    table = sl.get_table(engine, 'webtv')
    for speech in sl.distinct(engine, table, 'speaker', wp=wp,
                              session=session):
        if speech['speaker'] is None:
            continue
        speaker = speaker_name_transform(speech['speaker'])
        matched = True
        try:
            fp = resolve_person(speaker)
        except InvalidReference:
            fp = None
        except BadReference:
            fp = None
            matched = False
        sl.upsert(engine, table, {'fingerprint': fp,
                                  'matched': matched,
                                  'speaker': speech['speaker']},
                  unique=['speaker'])

def update_reference(engine, data, table_name, col):
    table = sl.get_table(engine, table_name)
    for row in sl.distinct(engine, table, col):
        print row
        matched = False
        for ref in data:
            country = ref['country'].decode('utf-8')
            if ref['euname'] == row[col] or \
                    country.upper() == row[col].upper():
                if not len(ref['euname']):
                    ref['euname'] = row[col]
                matched = True
                sl.update_row(engine, table, {
                    col: row[col],
                    col + 'Norm': country,
                    col + 'Code': ref['iso2']}, [col])
        if not matched:
            print row
    return data

def cache_abstimmungen(engine):
    Abstimmung = sl.get_table(engine, 'abstimmung')
    data = defaultdict(dict)
    for e in sl.distinct(engine, Abstimmung, 'subject', 'date'):
        data[e['date']][e['subject']] = set(drucksachen(e['subject']))
    return dict(data.items())

def load_sitzungen(engine):
    WebTV = sl.get_table(engine, 'webtv_speech')
    for session in sl.distinct(engine, WebTV, 'wp', 'session',
                               'session_name', 'session_date'):
        load_sitzung(engine, session)

def merge_speeches(engine):
    Speech = sl.get_table(engine, 'speech')
    for combo in sl.distinct(engine, Speech, 'wahlperiode', 'sitzung'):
        merge_speech(engine, str(combo['wahlperiode']),
                     str(combo['sitzung']))