Exemplo n.º 1
0
def load_abstimmungen(engine):
    """Copy votes from the ``abstimmung`` staging table into the ORM models.

    For each (subject, date) row returned by ``sl.distinct`` an
    ``Abstimmung`` is looked up by its subject and created when missing.
    Every matched staging row for that subject then becomes a ``Stimme``,
    unless no ``Person`` has the row's fingerprint or a ``Stimme`` for that
    person and ``Abstimmung`` already exists. The session is committed once
    per subject.
    """
    source = sl.get_table(engine, 'abstimmung')
    seen = 0
    for entry in sl.distinct(engine, source, 'subject', 'date'):
        subject = entry.get('subject')
        vote = Abstimmung.query.filter_by(thema=subject).first()
        if vote is None:
            vote = Abstimmung()
            vote.thema = subject
            vote.datum = date(entry.get('date'))
        db.session.add(vote)

        ballots = sl.find(engine, source, subject=subject, matched=True)
        for ballot in ballots:
            # Progress marker: one dot for every 1000 staging rows visited,
            # counted across all subjects (skipped rows included).
            if seen % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            seen += 1

            voter = Person.query.filter_by(
                fingerprint=ballot.get('fingerprint')).first()
            if voter is None:
                continue

            existing = Stimme.query.filter_by(
                abstimmung=vote).filter_by(
                person=voter).first()
            if existing is not None:
                continue

            record = Stimme()
            record.entscheidung = ballot['vote']
            record.person = voter
            record.abstimmung = vote
            db.session.add(record)
        db.session.commit()
Exemplo n.º 2
0
def load_documents(engine):
    """Call ``load_document`` for every non-null ``link`` found in ``referenz``."""
    table = sl.get_table(engine, 'referenz')
    for entry in sl.distinct(engine, table, 'link'):
        url = entry.get('link')
        if url is not None:
            load_document(url)
Exemplo n.º 3
0
def load_documents(engine):
    """Load each document referenced by a ``link`` value in ``referenz``.

    Rows whose ``link`` is ``None`` are skipped; every other link is passed
    to ``load_document``.
    """
    referenz = sl.get_table(engine, 'referenz')
    for record in sl.distinct(engine, referenz, 'link'):
        target = record.get('link')
        if target is None:
            # Nothing to load for rows without a link.
            continue
        load_document(target)
Exemplo n.º 4
0
def index():
    """Render the backend index page with the webtv sessions.

    The rows returned by ``sl.distinct`` for (wp, session, session_name) are
    sorted in descending order and handed to the template as ``sessions``.
    """
    engine = etl_engine()
    table = sl.get_table(engine, 'webtv')
    rows = sl.distinct(engine, table, 'wp', 'session', 'session_name')
    ordered = sorted(rows, reverse=True)
    return render_template('backend/index.html', sessions=ordered)
Exemplo n.º 5
0
def merge():
    """Annotate ``fts`` rows with canonical beneficiary information.

    For every (beneficiary, country_code) combination returned by
    ``sl.distinct``, ``lookup`` supplies a ``(canonical, uri, score)``
    triple, which is written back to the matching rows as
    ``beneficiary_canonical``, ``beneficiary_uri`` and ``beneficiary_score``.
    """
    engine = util.make_engine()
    table = sl.get_table(engine, 'fts')
    # The upsert key must be exactly the columns selected by ``distinct``:
    # ``row`` carries only those, so keying on any other column (such as
    # ``country``) would compare against a value that is not in ``row``.
    keys = ['beneficiary', 'country_code']
    for row in sl.distinct(engine, table, *keys):
        canonical, uri, score = lookup(row.get('beneficiary'),
                                       row.get('country_code'), engine)
        row['beneficiary_canonical'] = canonical
        row['beneficiary_uri'] = uri
        row['beneficiary_score'] = score
        sl.upsert(engine, table, row, keys)
Exemplo n.º 6
0
def match_beitraege(engine, url):
    """Match the ``beitrag`` rows of one source URL and store the result.

    Each distinct ``KEYS`` combination with ``source_url == url`` is passed
    to ``match_beitrag``. The result is stored as ``fingerprint`` together
    with a ``matched`` flag; for a truthy match ``ensure_rolle`` is called
    before the row is upserted on ``KEYS``.
    """
    table = sl.get_table(engine, 'beitrag')
    contributions = sl.distinct(engine, table, *KEYS, source_url=url)
    for entry in contributions:
        fingerprint = match_beitrag(engine, entry, url)
        entry['fingerprint'] = fingerprint
        entry['matched'] = fingerprint is not None
        if fingerprint:
            ensure_rolle(entry, fingerprint, engine)
        sl.upsert(engine, table, entry, unique=KEYS)
Exemplo n.º 7
0
def extend_ablaeufe(engine, master):
    """Set the ``class`` column of ``ablauf`` rows from the master data.

    ``master['ablauf_typ']`` provides entries with ``typ`` and ``class``;
    for each distinct ``typ`` in the table the mapped class (``None`` when
    the type is not in the master data) is upserted, keyed on ``typ``.
    """
    log.info("Amending ablaeufe ...")
    table = sl.get_table(engine, 'ablauf')
    class_by_typ = dict((entry.get('typ'), entry.get('class'))
                        for entry in master['ablauf_typ'])
    for row in sl.distinct(engine, table, 'typ'):
        typ = row.get('typ')
        record = {'typ': typ, 'class': class_by_typ.get(typ)}
        sl.upsert(engine, table, record, unique=['typ'])
Exemplo n.º 8
0
def merge():
    """Add ``country_code`` and ``country_common`` to ``fts`` rows.

    After ``read_countries()`` has run, every distinct ``country`` value is
    passed to ``match``; the ``iso_3166-1_2`` and ``common`` entries of its
    result are written back to the rows with that country.
    """
    read_countries()
    engine = util.make_engine()
    fts = sl.get_table(engine, 'fts')
    for record in sl.distinct(engine, fts, 'country'):
        info = match(record.get('country'))
        record['country_code'] = info.get('iso_3166-1_2')
        record['country_common'] = info.get('common')
        sl.upsert(engine, fts, record, ['country'])
Exemplo n.º 9
0
def build_index(publisher_name=None):
    '''Search CKAN for spending datasets and sync their metadata into the
    database.

    Datasets tagged with any of ``TAGS`` are searched (narrowed to
    ``publisher_name`` when given) and each result is handed to
    ``fetch_package``. Rows for packages that were already in the table but
    are absent from the search result, and rows without a ``package_name``,
    are deleted from the table and from ``issue``. Two summaries (datasets
    and resources) are printed at the end.
    '''
    engine, table = connect()
    client = ckan_client()
    log.info('CKAN: %s', client.base_location)

    # Build the search query and the matching filter for the local table.
    q = " OR ".join('+tags:"%s"' % tag for tag in TAGS)
    publisher_dict_filter = {}
    if publisher_name:
        q = '(%s) AND (%s)' % (q, 'publisher:"%s"' % publisher_name)
        publisher_dict_filter['publisher_name'] = publisher_name
    log.info('SOLR Search q: %r', q)

    known_rows = sl.distinct(engine, table, 'package_name',
                             **publisher_dict_filter)
    existing_packages = set(row['package_name'] for row in known_rows)
    log.info('Existing datasets: %i', len(existing_packages))

    log.info('Doing package search for: "%s"', q)
    res = client.package_search(q, search_options={'limit': 2000})
    log.info('Search returned %i dataset results', res['count'])

    stats = OpenSpendingStats()
    stats_resources = OpenSpendingStats()
    processed_packages = set()
    for package_name in res['results']:
        processed_packages.add(package_name)
        num_resources = fetch_package(client, package_name, engine, table,
                                      stats_resources)
        if num_resources == 0:
            outcome = 'Dataset has no resources'
        else:
            outcome = 'Dataset has resources'
        stats.add(outcome, package_name)

    # Remove rows about packages that no longer show up in the search.
    obsolete_packages = existing_packages.difference(processed_packages)
    log.info('Obsolete datasets: %s from %s', len(obsolete_packages),
             len(existing_packages))
    for package_name in obsolete_packages:
        for target in (table, 'issue'):
            sl.delete(engine, target, package_name=package_name)
        stats.add('Removed obsolete dataset', package_name)

    # Remove stray rows that carry no package_name at all.
    stray_rows = list(sl.find(engine, table, package_name=None))
    if stray_rows:
        log.info('Stray rows without package_name: %i', len(stray_rows))
        for target in (table, 'issue'):
            sl.delete(engine, target, package_name=None)
        for row in stray_rows:
            stats.add('Stray row removed', row['resource_id'])

    print('Datasets build_index summary:')
    print(stats.report())
    print('Resources build_index summary:')
    print(stats_resources.report())
Exemplo n.º 10
0
def build_index(publisher_name=None):
    '''Synchronise the local resource table with a CKAN package search.

    The search query ORs together one ``+tags:"..."`` term per entry of
    ``TAGS`` and, when ``publisher_name`` is given, ANDs a publisher term
    onto it. Every package in the result goes through ``fetch_package``;
    packages previously in the table but not in the result are deleted,
    as are rows whose ``package_name`` is ``None``. Dataset and resource
    summaries are printed when done.
    '''
    engine, table = connect()
    client = ckan_client()
    log.info('CKAN: %s', client.base_location)

    tag_terms = ['+tags:"%s"' % tag for tag in TAGS]
    q = " OR ".join(tag_terms)
    if publisher_name:
        publisher_term = 'publisher:"%s"' % publisher_name
        q = '(%s) AND (%s)' % (q, publisher_term)
        publisher_dict_filter = {'publisher_name': publisher_name}
    else:
        publisher_dict_filter = {}
    log.info('SOLR Search q: %r', q)

    existing_packages = set()
    for known in sl.distinct(engine, table, 'package_name',
                             **publisher_dict_filter):
        existing_packages.add(known['package_name'])
    log.info('Existing datasets: %i', len(existing_packages))

    processed_packages = set()
    log.info('Doing package search for: "%s"', q)
    res = client.package_search(q, search_options={'limit': 2000})
    log.info('Search returned %i dataset results', res['count'])

    stats = OpenSpendingStats()
    stats_resources = OpenSpendingStats()
    for package_name in res['results']:
        processed_packages.add(package_name)
        num_resources = fetch_package(client, package_name, engine, table,
                                      stats_resources)
        has_resources = num_resources != 0
        stats.add('Dataset has resources' if has_resources
                  else 'Dataset has no resources', package_name)

    # Drop rows for packages that were known before but not returned now.
    obsolete_packages = existing_packages - processed_packages
    log.info('Obsolete datasets: %s from %s',
             len(obsolete_packages), len(existing_packages))
    for package_name in obsolete_packages:
        sl.delete(engine, table, package_name=package_name)
        sl.delete(engine, 'issue', package_name=package_name)
        stats.add('Removed obsolete dataset', package_name)

    # Drop stray rows that have no package_name.
    stray_rows = list(sl.find(engine, table, package_name=None))
    if len(stray_rows) > 0:
        log.info('Stray rows without package_name: %i', len(stray_rows))
        sl.delete(engine, table, package_name=None)
        sl.delete(engine, 'issue', package_name=None)
        for stray in stray_rows:
            stats.add('Stray row removed', stray['resource_id'])

    print('Datasets build_index summary:')
    print(stats.report())
    print('Resources build_index summary:')
    print(stats_resources.report())
Exemplo n.º 11
0
def extend_abstimmungen(engine):
    """Try to match every distinct ``person`` of the ``abstimmung`` table.

    ``match_speaker`` is called for each person. An ``NKInvalid`` result is
    logged with its traceback and the person is skipped; ``NKNoMatch`` is
    logged at info level and leaves the fingerprint as ``None``.

    NOTE(review): the visible code computes ``fp`` but never stores it;
    confirm whether a write-back step is missing here.
    """
    log.info("Amending votes ...")
    Abstimmung = sl.get_table(engine, 'abstimmung')
    for data in sl.distinct(engine, Abstimmung, 'person'):
        try:
            fp = match_speaker(data['person'])
        except NKInvalid as inv:
            # Log the exception that was actually caught; referring to any
            # other name here would raise NameError inside the handler.
            log.exception(inv)
            continue
        except NKNoMatch:
            fp = None
            log.info("No match for: %s", data['person'])
Exemplo n.º 12
0
def integrate_recon(engine, table, qfunc, src_col, dst_name_col, dst_uri_col,
        min_score=None, limit=200, memory_name=None):
    """Reconcile the distinct values of ``src_col`` and store the matches.

    Each distinct source value is passed to ``interactive`` together with
    ``qfunc``, ``min_score``, ``limit`` and a ``SQLALoadMemory`` backed by
    the table ``memory_name`` (default: ``recon_<table.name>_<src_col>``).
    When a result comes back, its ``name`` and ``uri`` are upserted into
    ``dst_name_col`` and ``dst_uri_col``, keyed on ``src_col``.
    """
    if memory_name is None:
        memory_name = "recon_%s_%s" % (table.name, src_col)
    memory = SQLALoadMemory(engine, table=memory_name)

    for row in sl.distinct(engine, table, src_col):
        source_value = row[src_col]
        match = interactive(qfunc, source_value, min_score=min_score,
                            memory=memory, limit=limit)
        if match is None:
            continue
        update = {
            src_col: source_value,
            dst_name_col: match.name,
            dst_uri_col: match.uri,
        }
        sl.upsert(engine, table, update, [src_col])
Exemplo n.º 13
0
def match_beitraege(engine):
    """Match every distinct speaker description in ``beitrag``.

    For each distinct combination of the identifying columns,
    ``match_beitrag`` is called, ``ensure_rolle`` is invoked with the
    result, and the row is upserted with ``fingerprint`` and ``matched``
    set. A dot is written to stdout every 1000 rows as a progress marker.
    """
    table = sl.get_table(engine, 'beitrag')
    # Columns used both to select the distinct rows and as the upsert key.
    keys = ['vorname', 'nachname', 'funktion', 'land', 'fraktion',
            'ressort', 'ort']
    for count, entry in enumerate(sl.distinct(engine, table, *keys)):
        if count % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        fingerprint = match_beitrag(engine, entry)
        ensure_rolle(entry, fingerprint, engine)
        entry['fingerprint'] = fingerprint
        entry['matched'] = fingerprint is not None
        sl.upsert(engine, table, entry, unique=keys)
Exemplo n.º 14
0
def match_speakers_webtv(engine):
    """Try to match every distinct non-null ``speaker`` of the ``webtv`` table.

    The speaker name is normalised with ``speaker_name_transform`` and passed
    to ``match_speaker``. ``NKInvalid`` yields no fingerprint but still
    counts as matched; ``NKNoMatch`` yields no fingerprint and is flagged as
    unmatched.

    NOTE(review): the visible code computes ``fp`` and ``matched`` without
    storing them; confirm whether a write-back step is missing here.
    """
    table = sl.get_table(engine, 'webtv')
    for speech in sl.distinct(engine, table, 'speaker'):
        raw_name = speech['speaker']
        if raw_name is None:
            continue
        speaker = speaker_name_transform(raw_name)
        matched = True
        try:
            fp = match_speaker(speaker)
        except NKInvalid:
            fp = None
        except NKNoMatch:
            fp = None
            matched = False
Exemplo n.º 15
0
def item_index(engine):
    """Index the ``drucksachen`` mentioned by the chair for each agenda item.

    For every distinct (wp, session, item_id) in ``webtv`` the chair
    speeches joined through ``webtv_speech`` are queried and their text is
    run through ``drucksachen``. Returns a dict mapping
    ``(wp, session, item_id)`` to the set of mentions; items without any
    mention are left out.
    """
    webtv_tbl = sl.get_table(engine, 'webtv')
    # NOTE: the item values are interpolated directly into the SQL text.
    q = ("SELECT s.text FROM speech s LEFT JOIN webtv_speech ws ON ws.wp::int = s.wahlperiode AND "
         "ws.session::int = s.sitzung AND ws.sequence = s.sequence "
         "WHERE ws.wp = '%(wp)s' AND ws.session = '%(session)s' AND ws.item_id = '%(item_id)s' AND s.type = 'chair' ")
    index = {}
    log.info("Building index of drs mentions in speeches...")
    for item in sl.distinct(engine, webtv_tbl, 'wp', 'session', 'item_id'):
        mentions = set()
        rows = list(sl.query(engine, q % item))
        for row in rows:
            mentions.update(drucksachen(row['text'], wahlperiode=item['wp']))
        if mentions:
            index[(item['wp'], item['session'], item['item_id'])] = mentions
    return index
Exemplo n.º 16
0
def item_index(engine):
    """Build a lookup of document mentions per webtv agenda item.

    Runs one SQL query per distinct (wp, session, item_id) of ``webtv`` to
    fetch the text of speeches of type ``chair``, collects what
    ``drucksachen`` extracts from each text, and returns a dict keyed by
    ``(wp, session, item_id)``. Items with an empty result are omitted.
    """
    webtv_tbl = sl.get_table(engine, 'webtv')
    # NOTE: the query is filled in with ``%`` formatting from the item row.
    q = "".join([
        "SELECT s.text FROM speech s LEFT JOIN webtv_speech ws ON ws.wp::int = s.wahlperiode AND ",
        "ws.session::int = s.sitzung AND ws.sequence = s.sequence ",
        "WHERE ws.wp = '%(wp)s' AND ws.session = '%(session)s' AND ws.item_id = '%(item_id)s' AND s.type = 'chair' ",
    ])
    items = {}
    log.info("Building index of drs mentions in speeches...")
    for item in sl.distinct(engine, webtv_tbl, 'wp', 'session', 'item_id'):
        found = set()
        for speech in list(sl.query(engine, q % item)):
            found |= set(drucksachen(speech['text'],
                                     wahlperiode=item['wp']))
        if not found:
            continue
        key = (item['wp'], item['session'], item['item_id'])
        items[key] = found
    return items
Exemplo n.º 17
0
def merge():
    """Geocode ``fts`` rows and attach region information.

    For each distinct ``KEYS`` combination, ``geocode`` is called; rows
    without a location are skipped. Otherwise the location and the result of
    ``find_region`` are merged into the row, which is logged and upserted on
    ``KEYS``. The shapefile geocoder only considers records whose
    ``STAT_LEVL_`` equals 3.
    """
    geocoder = shapegeocode.geocoder(
        'nuts2-shapefile/data/NUTS_RG_10M_2006.shp',
        filter=lambda record: record['STAT_LEVL_'] == 3)
    regions = load_region_hierarchy()
    engine = util.make_engine()
    fts = sl.get_table(engine, 'fts')
    for entry in sl.distinct(engine, fts, *KEYS):
        location = geocode(entry)
        if location is None:
            continue
        entry.update(location)
        # The region lookup sees the row with the location already merged in.
        entry.update(find_region(geocoder, regions, entry))
        log.info("Geocoded: %s/%s - %s",
                 entry['lat'], entry['lon'], entry.get('nuts3_label'))
        sl.upsert(engine, fts, entry, KEYS)
Exemplo n.º 18
0
def merge(codes):
    """Attach budget code metadata to ``fts`` rows for every level.

    For each level in ``LEVELS`` the distinct values of its source column
    (``budget_code`` for the ``item`` level, otherwise the level name) are
    looked up in ``codes``. Values missing from ``codes`` are printed and
    skipped, as are ``item`` values shorter than 11 characters (skipped
    silently). Found entries supply ``<level>_name``, ``_label``,
    ``_description`` and ``_legal_basis``.
    """
    engine = util.make_engine()
    table = sl.get_table(engine, 'fts')
    for level in LEVELS:
        is_item = level == 'item'
        if is_item:
            column = 'budget_code'
        else:
            column = level
        for data in sl.distinct(engine, table, column):
            code = data[column]
            if is_item and len(code) < 11:
                continue
            if code not in codes:
                print(code)
                continue
            details = codes.get(code)
            data['%s_name' % level] = code
            for field in ('label', 'description', 'legal_basis'):
                data['%s_%s' % (level, field)] = details[field]
            sl.upsert(engine, table, data, [column])
Exemplo n.º 19
0
def speakers_webtv(engine, wp, session):
    """Resolve the speakers of one webtv session and store the result.

    Every distinct non-null ``speaker`` for the given ``wp`` and ``session``
    is normalised with ``speaker_name_transform`` and resolved through
    ``resolve_person``. ``InvalidReference`` stores no fingerprint but keeps
    ``matched`` true; ``BadReference`` stores no fingerprint and sets
    ``matched`` to false. The row is upserted keyed on ``speaker``.
    """
    table = sl.get_table(engine, 'webtv')
    speeches = sl.distinct(engine, table, 'speaker', wp=wp, session=session)
    for speech in speeches:
        raw_name = speech['speaker']
        if raw_name is None:
            continue
        speaker = speaker_name_transform(raw_name)
        matched = True
        try:
            fp = resolve_person(speaker)
        except InvalidReference:
            fp = None
        except BadReference:
            fp, matched = None, False
        record = {'fingerprint': fp,
                  'matched': matched,
                  'speaker': raw_name}
        sl.upsert(engine, table, record, unique=['speaker'])
Exemplo n.º 20
0
def update_reference(engine, data, table_name, col):
    """Normalise the country values in ``col`` against the reference ``data``.

    Each distinct value of ``col`` is compared with every reference entry:
    it matches when it equals the entry's ``euname`` or, ignoring case, its
    decoded ``country``. On a match an empty ``euname`` is filled in with
    the value (mutating ``data``) and the row gets ``<col>Norm`` and
    ``<col>Code``. Every row is printed, and printed again when nothing
    matched. Returns ``data``.
    """
    table = sl.get_table(engine, table_name)
    for row in sl.distinct(engine, table, col):
        print(row)
        found = False
        for ref in data:
            country = ref['country'].decode('utf-8')
            same_euname = ref['euname'] == row[col]
            if not (same_euname or country.upper() == row[col].upper()):
                continue
            if not len(ref['euname']):
                # Remember the spelling seen in the table for later rows.
                ref['euname'] = row[col]
            found = True
            update = {
                col: row[col],
                col + 'Norm': country,
                col + 'Code': ref['iso2']}
            sl.update_row(engine, table, update, [col])
        if not found:
            print(row)
    return data
Exemplo n.º 21
0
def cache_abstimmungen(engine):
    """Return ``{date: {subject: set(drucksachen(subject))}}`` for all votes.

    One entry is built per distinct (subject, date) row of ``abstimmung``.
    """
    table = sl.get_table(engine, 'abstimmung')
    by_date = defaultdict(dict)
    for entry in sl.distinct(engine, table, 'subject', 'date'):
        subject = entry['subject']
        by_date[entry['date']][subject] = set(drucksachen(subject))
    # Hand back a plain dict rather than the defaultdict.
    return dict(by_date)
Exemplo n.º 22
0
def cache_abstimmungen(engine):
    """Group the document references of all vote subjects by date.

    Returns a dict keyed by ``date`` whose values map each ``subject`` to
    the set produced from ``drucksachen(subject)``.
    """
    votes = sl.get_table(engine, "abstimmung")
    cache = {}
    for row in sl.distinct(engine, votes, "subject", "date"):
        mentions = set(drucksachen(row["subject"]))
        cache.setdefault(row["date"], {})[row["subject"]] = mentions
    return cache
Exemplo n.º 23
0
def load_sitzungen(engine):
    """Call ``load_sitzung`` for every distinct session in ``webtv_speech``."""
    table = sl.get_table(engine, 'webtv_speech')
    columns = ('wp', 'session', 'session_name', 'session_date')
    for session in sl.distinct(engine, table, *columns):
        load_sitzung(engine, session)
Exemplo n.º 24
0
def cache_abstimmungen(engine):
    """Build a per-date cache of the documents referenced by vote subjects.

    The result maps each ``date`` to a dict of ``subject`` ->
    ``set(drucksachen(subject))``, covering every distinct (subject, date)
    row of the ``abstimmung`` table.
    """
    source = sl.get_table(engine, 'abstimmung')
    grouped = defaultdict(dict)
    for vote in sl.distinct(engine, source, 'subject', 'date'):
        references = set(drucksachen(vote['subject']))
        grouped[vote['date']][vote['subject']] = references
    return dict(grouped)
Exemplo n.º 25
0
def merge_speeches(engine):
    """Run ``merge_speech`` for each distinct (wahlperiode, sitzung) in ``speech``.

    Both values are passed on as strings.
    """
    table = sl.get_table(engine, 'speech')
    for combo in sl.distinct(engine, table, 'wahlperiode', 'sitzung'):
        wahlperiode = str(combo['wahlperiode'])
        sitzung = str(combo['sitzung'])
        merge_speech(engine, wahlperiode, sitzung)