Example #1
def supplier_names_from_table(engine, resource_id, table_id):
    global tables_count, rows_count
    tables_count = tables_count + 1
    print "# checking table %d" % tables_count
    table_suffix = "%s_table%s" % (resource_id, table_id)

    table = sl.get_table(engine, "spending_%s" % table_suffix)
    supplier_table = sl.get_table(engine, "suppliers")

    for row in sl.all(engine, table):
        rows_count = rows_count + 1
        if not row.has_key("SupplierName"):
            # One of the junk tables that contain no real data, usually excel noise
            continue
        supplier = row["SupplierName"]

        if supplier is None or supplier == "":
            continue

        if supplier in suppliers_visited:
            continue

        suppliers_visited[supplier] = True

        if sl.find_one(engine, supplier_table, original=supplier) is not None:
            continue

        yield supplier
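
The generator above deduplicates suppliers across many tables through the
module-level suppliers_visited dict. A minimal stdlib-only sketch of the same
seen-set pattern, independent of the sl (sqlaload) API and with invented data:

def unique_non_empty(values, seen):
    # yield each non-empty value once; `seen` is shared across calls,
    # mirroring the global suppliers_visited dict above
    for value in values:
        if not value or value in seen:
            continue
        seen.add(value)
        yield value

seen = set()
print(list(unique_non_empty(["ACME", "", "ACME", None, "Initech"], seen)))
# -> ['ACME', 'Initech']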
Example #2
def load_debatten(engine, indexer, sitzung):
    WebTV_Speech = sl.get_table(engine, 'webtv_speech')
    zitate = list(sl.find(engine, WebTV_Speech, wp=str(sitzung.wahlperiode),
        session=str(sitzung.nummer)))
    debatten = dict([(z['item_id'], z) for z in zitate])
    speeches = list(sl.find(engine, sl.get_table(engine, 'speech'),
        wahlperiode=int(sitzung.wahlperiode), sitzung=int(sitzung.nummer)))
    for item_id, data in debatten.items():
        log.info("Loading -> Debatte: %s..." % data.get('item_label'))
        debatte = Debatte.query.filter_by(
                sitzung=sitzung,
                nummer=data.get('item_id')
                ).first()
        if debatte is None:
            debatte = Debatte()
        debatte.sitzung = sitzung
        debatte.nummer = data.get('item_id')
        debatte.tops = data.get('item_key')
        debatte.titel = data.get('item_label')
        debatte.text = data.get('item_description')

        db.session.add(debatte)
        db.session.flush()
        indexer.add(debatte)
        
        dzitate = filter(lambda z: z['item_id'] == data['item_id'], zitate)
        reden = load_reden(engine, indexer, debatte, dzitate)
        load_zitate(engine, indexer, debatte, dzitate, speeches, reden)
        db.session.commit()
        indexer.add_many(reden.values())
Example #3
def scrape_activity(engine, url, elem):
    urheber = elem.findtext("URHEBER")
    fundstelle = elem.findtext("FUNDSTELLE")
    Position = sl.get_table(engine, 'position')
    p = {'source_url': url, 'urheber': urheber, 'fundstelle': fundstelle}
    pos_keys = p.copy()
    p['zuordnung'] = elem.findtext("ZUORDNUNG")
    p['abstrakt'] = elem.findtext("VP_ABSTRAKT")
    p['fundstelle_url'] = elem.findtext("FUNDSTELLE_LINK")

    Zuweisung = sl.get_table(engine, 'zuweisung')
    for zelem in elem.findall("ZUWEISUNG"):
        z = pos_keys.copy()
        z['text'] = zelem.findtext("AUSSCHUSS_KLARTEXT")
        z['federfuehrung'] = zelem.find("FEDERFUEHRUNG") is not None
        z['gremium_key'] = DIP_GREMIUM_TO_KEY.get(z['text'])
        sl.add_row(engine, Zuweisung, z)

    Beschluss = sl.get_table(engine, 'beschluss')
    for belem in elem.findall("BESCHLUSS"):
        b = pos_keys.copy()
        b['seite'] = belem.findtext("BESCHLUSSSEITE")
        b['dokument_text'] = belem.findtext("BEZUGSDOKUMENT")
        b['tenor'] = belem.findtext("BESCHLUSSTENOR")
        b['grundlage'] = belem.findtext("GRUNDLAGE")
        sl.add_row(engine, Beschluss, b)

    Referenz = sl.get_table(engine, 'referenz')
    try:
        dokument = dokument_by_url(p['fundstelle_url']) or \
            dokument_by_name(p['fundstelle'])
        dokument.update(pos_keys)
        sl.add_row(engine, Referenz, dokument)
    except Exception, e:
        log.exception(e)
Example #4
def load_debatten(engine, sitzung):
    WebTV_Speech = sl.get_table(engine, 'webtv_speech')
    zitate = list(sl.find(engine, WebTV_Speech, wp=str(sitzung.wahlperiode),
        session=str(sitzung.nummer)))
    debatten = dict([(z['item_id'], z) for z in zitate])
    speeches = list(sl.find(engine, sl.get_table(engine, 'speech'),
        wahlperiode=int(sitzung.wahlperiode), sitzung=int(sitzung.nummer)))
    for item_id, data in debatten.items():
        log.info("Loading -> Debatte: %s..." % data.get('item_label'))
        debatte = Debatte.query.filter_by(
                sitzung=sitzung,
                nummer=data.get('item_id')
                ).first()
        if debatte is None:
            debatte = Debatte()
        debatte.sitzung = sitzung
        debatte.nummer = data.get('item_id')
        debatte.tops = data.get('item_key')
        debatte.titel = data.get('item_label')
        debatte.text = data.get('item_description')

        db.session.add(debatte)
        db.session.flush()

        dzitate = filter(lambda z: z['item_id'] == data['item_id'], zitate)
        load_zitate(engine, debatte, dzitate, speeches)
        db.session.commit()
Example #5
def load_ablauf(engine, indexer, data):
    ablauf = Ablauf.query.filter_by(source_url=data.get('source_url')).first()
    if ablauf is None:
        ablauf = Ablauf()

    ablauf.key = data.get('key')
    ablauf.source_url = data.get('source_url')
    ablauf.wahlperiode = data.get('wahlperiode')
    ablauf.typ = data.get('typ')
    ablauf.klasse = data.get('class')
    ablauf.titel = data.get('titel')
    if not ablauf.titel:
        log.error("No titel!")
        return

    ablauf.initiative = data.get('initiative')
    ablauf.stand = data.get('stand')
    ablauf.signatur = data.get('signatur')
    ablauf.gesta_id = data.get('gesta_id')
    ablauf.eu_dok_nr = data.get('eu_dok_nr')
    ablauf.eur_lex_url = data.get('eur_lex_url')
    ablauf.eur_lex_pdf = data.get('eur_lex_pdf')
    ablauf.consilium_url = data.get('consilium_url')
    ablauf.abstrakt = data.get('abstrakt')
    ablauf.zustimmungsbeduerftig = data.get('zustimmungsbeduerftig')
    ablauf.sachgebiet = data.get('sachgebiet')
    ablauf.abgeschlossen = str(data.get('abgeschlossen')) == 'True'
    db.session.add(ablauf)
    db.session.flush()

    worte = []
    _Schlagwort = sl.get_table(engine, 'schlagwort')
    for sw in sl.find(engine, _Schlagwort, source_url=ablauf.source_url):
        wort = Schlagwort()
        wort.name = sw['wort']
        db.session.add(wort)
        worte.append(wort)
    ablauf.schlagworte = worte

    _Referenz = sl.get_table(engine, 'referenz')
    for ddata in sl.find(engine, _Referenz, source_url=ablauf.source_url):
        dokument = load_dokument(engine, indexer, ddata)
        referenz = Referenz.query.filter_by(
            dokument=dokument,
            seiten=ddata.get('seiten'),
        ).filter(Referenz.ablaeufe.any(id=ablauf.id)).first()
        if referenz is None:
            referenz = Referenz()
            referenz.ablaeufe.append(ablauf)
            referenz.dokument = dokument
        referenz.seiten = ddata.get('seiten')
        referenz.text = ddata.get('text')

    _Position = sl.get_table(engine, 'position')
    for position in sl.find(engine, _Position, source_url=ablauf.source_url):
        load_position(engine, indexer, ablauf, position)

    db.session.commit()
    indexer.add(ablauf)
Example #6
def load_ablauf(engine, indexer, data):
    ablauf = Ablauf.query.filter_by(source_url=data.get('source_url')).first()
    if ablauf is None:
        ablauf = Ablauf()

    ablauf.key = data.get('key')
    ablauf.source_url = data.get('source_url')
    ablauf.wahlperiode = data.get('wahlperiode')
    ablauf.typ = data.get('typ')
    ablauf.klasse = data.get('class')
    ablauf.titel = data.get('titel')
    if not ablauf.titel:
        log.error("No titel!")
        return

    ablauf.initiative = data.get('initiative')
    ablauf.stand = data.get('stand')
    ablauf.signatur = data.get('signatur')
    ablauf.gesta_id = data.get('gesta_id')
    ablauf.eu_dok_nr = data.get('eu_dok_nr')
    ablauf.eur_lex_url = data.get('eur_lex_url')
    ablauf.eur_lex_pdf = data.get('eur_lex_pdf')
    ablauf.consilium_url = data.get('consilium_url')
    ablauf.abstrakt = data.get('abstrakt')
    ablauf.zustimmungsbeduerftig = data.get('zustimmungsbeduerftig')
    ablauf.sachgebiet = data.get('sachgebiet')
    ablauf.abgeschlossen = str(data.get('abgeschlossen')) == 'True'
    db.session.add(ablauf)
    db.session.flush()

    worte = []
    _Schlagwort = sl.get_table(engine, 'schlagwort')
    for sw in sl.find(engine, _Schlagwort, source_url=ablauf.source_url):
        wort = Schlagwort()
        wort.name = sw['wort']
        db.session.add(wort)
        worte.append(wort)
    ablauf.schlagworte = worte

    _Referenz = sl.get_table(engine, 'referenz')
    for ddata in sl.find(engine, _Referenz, source_url=ablauf.source_url):
        dokument = load_dokument(engine, indexer, ddata)
        referenz = Referenz.query.filter_by(
                dokument=dokument,
                seiten=ddata.get('seiten'),
                ).filter(Referenz.ablaeufe.any(id=ablauf.id)).first()
        if referenz is None:
            referenz = Referenz()
            referenz.ablaeufe.append(ablauf)
            referenz.dokument = dokument
        referenz.seiten = ddata.get('seiten')
        referenz.text = ddata.get('text')

    _Position = sl.get_table(engine, 'position')
    for position in sl.find(engine, _Position, source_url=ablauf.source_url):
        load_position(engine, indexer, ablauf, position)

    db.session.commit()
    indexer.add(ablauf)
Example #7
def load_finances(financialData, childBase, engine):
    etlId = '%s//%s' % (financialData['startDate'].isoformat(),
                        financialData['endDate'].isoformat())

    financial_sources = \
        [(s, 'other') for s in financialData.pop("otherCustomized")] + \
        [(s, 'public') for s in financialData.pop("publicCustomized")]
    for financial_source, type_ in financial_sources:
        financial_source['type'] = type_
        financial_source['financialDataEtlId'] = etlId
        financial_source.update(childBase)
        sl.upsert(engine, sl.get_table(engine, 'financialDataCustomSource'),
                  financial_source, ['representativeEtlId',
                      'financialDataEtlId', 'type', 'name'])

    for turnover in financialData.pop("turnoverBreakdown"):
        turnover['financialDataEtlId'] = etlId
        turnover['etlFingerPrint'] = turnover['name'].strip()
        turnover.update(childBase)
        sl.upsert(engine, sl.get_table(engine, 'financialDataTurnover'),
                  turnover, ['representativeEtlId', 'financialDataEtlId',
                             'etlFingerPrint'])

    financialData['etlId'] = etlId
    financialData.update(childBase)
    sl.upsert(engine, sl.get_table(engine, 'financialData'),
              financialData, ['representativeEtlId', 'etlId'])
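
load_finances keys every upserted record on an etlId built from the reporting
period's two ISO dates. The construction in isolation, with an invented period:

from datetime import date

start, end = date(2011, 1, 1), date(2011, 12, 31)  # invented reporting period
etl_id = '%s//%s' % (start.isoformat(), end.isoformat())
print(etl_id)  # 2011-01-01//2011-12-31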
Example #8
def update_network_entities(engine, file_name):
    log.info("Updating network entities reference sheet: %s", file_name)
    network_entities = set()
    table = sl.get_table(engine, 'network_entity')
    if os.path.exists(file_name):
        fh = open(file_name, 'rb')
        reader = csv.DictReader(fh)
        for d in reader:
            e = [(k, v.decode('utf-8')) for (k, v) in d.items()]
            e = dict(e)
            network_entities.add((e['representativeEtlId'], e['etlFingerPrint']))
            sl.upsert(engine, table, e, ['representativeEtlId', 'etlFingerPrint'])
        fh.close()
        reps = set([ne[0] for ne in network_entities])
        rep_table = sl.get_table(engine, 'representative')
        for rep in reps:
            sl.update(engine, rep_table, {'etlId': rep}, {'network_extracted': True})

    for row in sl.all(engine, table):
        network_entities.add((row['representativeEtlId'], row['etlFingerPrint']))

    fh = open(file_name, 'wb')
    writer = None
    table = sl.get_table(engine, 'network_entity')
    for ic, fp in network_entities:
        row = {
            'representativeEtlId': ic,
            'etlFingerPrint': fp
        }
        if writer is None:
            writer = csv.DictWriter(fh, row.keys())
            writer.writerow(dict(zip(row.keys(), row.keys())))
        r = [(k, unicode(v).encode('utf-8')) for k, v in row.items()]
        writer.writerow(dict(r))
    fh.close()
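
update_network_entities round-trips a two-column CSV through a set of tuples:
read existing pairs, merge in pairs found elsewhere, rewrite the file. The same
read-merge-rewrite cycle on plain files, without the database half (file name
and pairs invented):

import csv
import os

def roundtrip_pairs(file_name, extra_pairs):
    pairs = set(extra_pairs)
    if os.path.exists(file_name):
        with open(file_name) as fh:
            for d in csv.DictReader(fh):
                pairs.add((d['representativeEtlId'], d['etlFingerPrint']))
    with open(file_name, 'w') as fh:
        writer = csv.DictWriter(fh, ['representativeEtlId', 'etlFingerPrint'])
        writer.writeheader()
        for rep_id, fp in sorted(pairs):
            writer.writerow({'representativeEtlId': rep_id,
                             'etlFingerPrint': fp})

roundtrip_pairs('network_entities.csv', [('rep-1', 'ACME Lobbying')])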
Example #9
def extend_speeches(engine, wahlperiode=17):
    log.info("Amending speeches with DRS ...")
    drs_match = re.compile(DRS_MATCH % (wahlperiode, wahlperiode))
    Speech = sl.get_table(engine, 'speech')
    SpeechDocument = sl.get_table(engine, 'speech_document')
    for i, data in enumerate(sl.find(engine, Speech)):
        if data.get('type') != 'chair':
            continue
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        m = drs_match.search(data.get('text'))
        if m is None:
            continue
        # use a separate index so the outer progress counter `i` is not shadowed
        for g, grp in enumerate(m.groups()):
            if grp and '/' in grp:
                wp, nummer = grp.split('/', 1)
                sl.upsert(
                    engine,
                    SpeechDocument, {
                        'group': g,
                        'sequence': data['sequence'],
                        'sitzung': data['sitzung'],
                        'wahlperiode': wahlperiode,
                        'dok_nummer': nummer
                    },
                    unique=['sequence', 'sitzung', 'wahlperiode', 'group'])
Example #10
def make_fingerprint(engine, person):
    try:
        long_name = make_long_name(person)
        try:
            long_name = resolve_person(long_name)
            log.info(" -> %s" % long_name.strip())
        except Exception:
            log.error("Resolve did not work")

        Person = sl.get_table(engine, 'person')
        sl.upsert(engine, Person, {
            'fingerprint': long_name,
            'slug': url_slug(long_name),
            'mdb_id': person['mdb_id']
            }, unique=['mdb_id'])
        Rolle = sl.get_table(engine, 'rolle')
        sl.upsert(engine, Rolle, {
            'mdb_id': person['mdb_id'],
            'fingerprint': long_name
            }, unique=['mdb_id'])
        person['fingerprint'] = long_name
    except BadReference:
        log.error("Bad Reference %s", person)
Example #11
def condense(engine, resource_id, table_id, force):
    table_suffix = '%s_table%s' % (resource_id, table_id)

    if not engine.has_table('raw_%s' % table_suffix):
        return

    condensed_table = sl.get_table(engine, 'condensed')

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, condensed_table, resource_id=resource_id, table_id=table_id) is not None:
        return

    connection = engine.connect()
    trans = connection.begin()

    start = time.time()

    try:
        raw_table = sl.get_table(connection, 'raw_%s' % table_suffix)
        sl.drop_table(connection, 'spending_%s' % table_suffix)
        spending_table = sl.get_table(connection, 'spending_%s' % table_suffix)
        columns_table = sl.get_table(connection, 'column_sets')

        normalise_map = normalised_columns_map(raw_table)
        normalised_headers = ','.join(sorted(normalise_map.values()))
        mapping_row = sl.find_one(connection, columns_table, normalised=normalised_headers)

        if mapping_row is None or not mapping_row.get('valid'):
            # This table is unmapped, cannot be condensed
            return

        column_mapping = json.loads(mapping_row['column_map'])

        # Build the final mapping from input column to output column
        mapping = {}
        for k, n in normalise_map.iteritems():
            if column_mapping.get(n):
                mapping[k] = column_mapping[n]
        
        for row in sl.all(connection, raw_table):
            spending_row = {}
            for key, value in row.items():
                if key not in mapping:
                    continue
                if not value or not len(value.strip()):
                    continue
                if mapping[key] in spending_row:
                    continue
                spending_row[mapping[key]] = value.strip()
            #print spending_row
            sl.add_row(connection, spending_table, spending_row)
        sl.upsert(connection, condensed_table, {'resource_id': resource_id,
                                                'table_id': table_id,
                                                'condense_time': time.time() - start,
                                                }, ['resource_id', 'table_id'])
        trans.commit()
    finally:
        connection.close()
Example #12
def speechmatcher(wp, session):
    engine = etl_engine()
    speech_table = sl.get_table(engine, 'speech')
    speeches = sl.find(engine, speech_table, order_by='sequence', 
        wahlperiode=wp, sitzung=session, matched=True)
    webtv_table = sl.get_table(engine, 'webtv')
    agenda = sl.find(engine, webtv_table, wp=wp, session=session)
    agenda = list(agenda)
    return render_template('backend/speechmatcher.html',
            speeches=speeches, agenda=agenda, wp=wp, session=session)
Example #13
def create_entities(engine):
    log.info("De-normalizing global entities collection...")
    table = sl.get_table(engine, 'entity')
    for tbl in ['representative', 'person', 'financialDataTurnover',
        'organisation', 'network_entity']:
        for row in sl.all(engine, sl.get_table(engine, tbl)):
            entity = {'etlFingerPrint': row.get('etlFingerPrint')}
            entity['legalStatus'] = row.get('legalStatus', '')
            entity['countryCode'] = row.get('contactCountryCode', '')
            entity['etlTable'] = tbl
            sl.upsert(engine, table, entity, ['etlFingerPrint', 'etlTable'])
Example #14
def add_to_gremium(node, url, role, engine):
    key = node.get('id')
    table = sl.get_table(engine, 'gremium')
    g = sl.find_one(engine, table, key=key)
    if g is None:
        g = {'key': key, 'type': 'sonstiges'}
        g['name'] = node.findtext('gremiumName')
        g['url'] = node.findtext('gremiumURL')
        sl.upsert(engine, table, g, unique=['key'])
    table = sl.get_table(engine, 'gremium_mitglieder')
    sl.upsert(engine, table, {
        'gremium_key': g['key'],
        'person_source_url': url,
        'role': role
        }, unique=['person_source_url', 'gremium_key', 'role'])
Example #15
def retrieve(row, engine, force):
    ret_table = sl.get_table(engine, 'retrieval_log')
    #print row.get('package_name'), row['url'].encode('utf-8')
    try:
        import os
        if not force and os.path.exists(source_path(row)):
            return
        url = fix_url(row['url'])
        print "Fetching %s" % url
        res = urllib2.urlopen(url)

        fh = open(source_path(row), 'wb')
        fh.write(res.read())
        fh.close()

        sl.add_row(engine, ret_table, {
            'resource_id': row['resource_id'],
            'status': '200',
            'message': "",
            'content-type': res.headers.get('content-type', ''),
            'timestamp': datetime.now()
            })
    except Exception, ioe:
        print traceback.format_exc()
        status = 0
        if hasattr(ioe, 'code'):
            status = ioe.code
        sl.add_row(engine, ret_table, {
            'resource_id': row['resource_id'],
            'status': status,
            'message': unicode(ioe),
            'timestamp': datetime.now()
            })
        assert False, unicode(ioe).encode('utf-8')
Example #16
def add_to_gremium(node, url, role, engine):
    key = node.get("id")
    table = sl.get_table(engine, "gremium")
    g = sl.find_one(engine, table, key=key)
    if g is None:
        g = {"key": key, "type": "sonstiges"}
        g["name"] = node.findtext("gremiumName")
        g["url"] = node.findtext("gremiumURL")
        sl.upsert(engine, table, g, unique=["key"])
    table = sl.get_table(engine, "gremium_mitglieder")
    sl.upsert(
        engine,
        table,
        {"gremium_key": g["key"], "person_source_url": url, "role": role},
        unique=["person_source_url", "gremium_key", "role"],
    )
Example #17
def load_rollen(engine, person, data):
    _RolleSource = sl.get_table(engine, 'rolle')
    for rdata in sl.find(engine, _RolleSource,
                         fingerprint=data['fingerprint']):
        rolle = Rolle.query.filter_by(person=person,
                                      funktion=rdata.get('funktion'),
                                      ressort=rdata.get('ressort'),
                                      fraktion=rdata.get('fraktion'),
                                      land=rdata.get('land')).first()
        if rolle is None:
            rolle = Rolle()

        rolle.person = person
        rolle.mdb_id = rdata.get('mdb_id')
        rolle.status = rdata.get('status')
        rolle.funktion = rdata.get('funktion')
        rolle.fraktion = rdata.get('fraktion')
        rolle.gewaehlt = rdata.get('gewaehlt')
        rolle.ressort = rdata.get('ressort')
        rolle.land = rdata.get('land')
        rolle.austritt = to_date(rdata.get('austritt'))

        if rdata.get('mdb_id'):
            rolle.wahlkreis = load_wahlkreis(engine, rolle, data)
        db.session.add(rolle)
Example #18
def clear_issues(engine, resource_id, stage):
    import sqlaload as sl # this import is slow, so it is done inside this func
    table = sl.get_table(engine, 'issue')
    sl.delete(engine, table,
              resource_id=resource_id,
              stage=stage,
    )
Example #19
def combine(force=False, filter=None):
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        combine_resource(engine, source_table, row, force, stats)
    log.info('Combine summary: \n%s' % stats.report())
Example #20
def combine_resource_core(engine, row, stats):
    '''Given a resource (source row) it opens the table with its contents
    and puts its rows into the combined table, using the column mappings.

    Returns whether it succeeds or not.
    '''
    error = None
    for sheet_id in range(0, row['sheets']):
        table = sl.get_table(engine, 'raw_%s_sheet%s' % (
            row['resource_id'], sheet_id))
        if not engine.has_table(table.name):
            error = 'Sheet table does not exist'
            log.warn('Sheet table does not exist: %s', table)
            continue
        columns = [c.name for c in table.columns]
        mapping = column_mapping(engine, row, columns)
        if mapping is None:
            error = 'Column mappings not complete'
            log.warn('Column mappings not complete: %s', columns)
            continue
        log.debug('Column mapping: %r', mapping)
        if not combine_sheet(engine, row, sheet_id, table, mapping):
            error = 'Could not combine sheet'
    if error:
        stats.add_source(error, row)
    else:
        stats.add_source('Combined ok', row)
    return (not error)
Example #21
def combine_sheet(engine, resource, sheet_id, table, mapping):
    begin = time.time()
    base = {
            'resource_id': resource['resource_id'],
            'resource_hash': resource['extract_hash'],
            'sheet_id': sheet_id,
        }
    spending_table = sl.get_table(engine, 'spending')
    connection = engine.connect()
    trans = connection.begin()
    try:
        rows = 0
        sl.delete(connection, spending_table,
                resource_id=resource['resource_id'],
                sheet_id=sheet_id)
        for row in sl.all(connection, table):
            data = dict(base)
            for col, value in row.items():
                if col == 'id':
                    data['row_id'] = value
                    continue
                mapped = mapping.get(col)
                if mapped is not None:
                    data[mapped] = value
            sl.add_row(connection, spending_table, data)
            rows += 1
        trans.commit()
        log.info("Loaded %s rows in %s ms", rows,
                int((time.time()-begin)*1000))
        return rows > 0
    finally:
        connection.close()
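
The inner loop of combine_sheet renames raw columns via the mapping and routes
the raw table's id into row_id, dropping anything unmapped. The column-routing
step on a plain dict, with an invented mapping:

mapping = {'Supplier Name': 'SupplierName', 'Amount (GBP)': 'Amount'}  # invented
row = {'id': 77, 'Supplier Name': 'ACME', 'Amount (GBP)': '12.50', 'Notes': 'x'}
data = {}
for col, value in row.items():
    if col == 'id':
        data['row_id'] = value
        continue
    mapped = mapping.get(col)
    if mapped is not None:
        data[mapped] = value
print(data)  # {'row_id': 77, 'SupplierName': 'ACME', 'Amount': '12.50'}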
Example #22
def load_documents(engine):
    refs = sl.get_table(engine, 'referenz')
    for ref in sl.distinct(engine, refs, 'link'):
        link = ref.get('link')
        if link is None:
            continue
        load_document(link)
Example #23
def load_abstimmung(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    stimmen = list(sl.find(engine, table, source_url=source_url,
        matched=True))
    if not len(stimmen):
        log.error("No reconciled votes, signals deeper trouble?")
        return
    thema = stimmen[0].get('subject')
    abst = Abstimmung.query.filter_by(thema=thema).first()
    if abst is None:
        abst = Abstimmung()
        abst.thema = thema
        abst.datum = to_date(stimmen[0].get('date'))
    db.session.add(abst)
    db.session.flush()
    for stimme_ in stimmen:
        person = Person.query.filter_by(
            fingerprint=stimme_.get('fingerprint')).first()
        if person is None:
            continue
        stimme = Stimme.query.filter_by(
            abstimmung=abst).filter_by(
            person=person).first()
        if stimme is not None:
            continue
        stimme = Stimme()
        stimme.entscheidung = stimme_['vote']
        stimme.person = person
        stimme.abstimmung = abst
        db.session.add(stimme)
    db.session.commit()
Example #24
def load_abstimmung(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    stimmen = list(sl.find(engine, table, source_url=source_url, matched=True))
    if not len(stimmen):
        log.error("No reconciled votes, signals deeper trouble?")
        return
    thema = stimmen[0].get('subject')
    abst = Abstimmung.query.filter_by(thema=thema).first()
    if abst is None:
        abst = Abstimmung()
        abst.thema = thema
        abst.datum = to_date(stimmen[0].get('date'))
    db.session.add(abst)
    db.session.flush()
    for stimme_ in stimmen:
        person = Person.query.filter_by(
            fingerprint=stimme_.get('fingerprint')).first()
        if person is None:
            continue
        stimme = Stimme.query.filter_by(abstimmung=abst).filter_by(
            person=person).first()
        if stimme is not None:
            continue
        stimme = Stimme()
        stimme.entscheidung = stimme_['vote']
        stimme.person = person
        stimme.abstimmung = abst
        db.session.add(stimme)
    db.session.commit()
Example #25
def scrape_transcript(engine, url, force=False):
    wp, session = url_metadata(url)
    table = sl.get_table(engine, 'speech')
    sio = find_local(url)
    sample = {'source_etag': 'local'}
    if sio is None:
        sample = sl.find_one(engine, table, source_url=url, matched=True)
        response, sio = fetch_stream(url)
        sample = check_tags(sample or {}, response, force)
    base_data = {'source_url': url,
                 'sitzung': session,
                 'wahlperiode': wp,
                 'matched': False,
                 'loaded': False,
                 'source_etag': sample['source_etag']}
    log.info("Loading transcript: %s/%s, from %s" , wp, session, url)
    seq = 0
    parser = SpeechParser(sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib.update(base_data)
        contrib['sequence'] = seq
        sl.upsert(engine, table, contrib, 
                  unique=['source_url', 'sequence'])
        seq += 1
    if not parser.missing_recon:
        sl.upsert(engine, table, {
                    'matched': True,
                    'source_url': url,
            }, unique=['source_url'])
    else:
        raise InvalidReference()
    return base_data
Example #26
def generate_person_long_names(engine):
    log.info("Generating person fingerprints and slugs...")
    from offenesparlament.transform.namematch import match_speaker
    nkp = nk_persons()
    Person = sl.get_table(engine, 'person')
    for person in sl.find(engine, Person):
        long_name = make_long_name(person)
        try:
            long_name = match_speaker(long_name)
        except NKNoMatch:
            pass
        log.info(" -> %s" % long_name.strip())
        slug = url_slug(long_name)
        sl.upsert(engine, Person, {
                         'fingerprint': long_name,
                         'slug': slug,
                         'id': person['id']},
                         unique=['id'])
        tries = 0
        while True:
            try:
                nkp.ensure_value(long_name, data=person)
            except ValueError, E:
                log.warn('Exception: %s' % str(E))
                tries = tries + 1
                if tries > 5:
                    raise
            else:
                break
Example #27
def load_documents(engine):
    refs = sl.get_table(engine, 'referenz')
    for ref in sl.distinct(engine, refs, 'link'):
        link = ref.get('link')
        if link is None:
            continue
        load_document(link)
Example #28
def combine_resource_core(engine, row, stats):
    '''Given a resource (source row) it opens the table with its contents
    and puts its rows into the combined table, using the column mappings.

    Returns whether it succeeds or not.
    '''
    error = None
    for sheet_id in range(0, row['sheets']):
        table = sl.get_table(engine,
                             'raw_%s_sheet%s' % (row['resource_id'], sheet_id))
        if not engine.has_table(table.name):
            error = 'Sheet table does not exist'
            log.warn('Sheet table does not exist: %s', table)
            continue
        columns = [c.name for c in table.columns]
        mapping = column_mapping(engine, row, columns)
        if mapping is None:
            error = 'Column mappings not complete'
            log.warn('Column mappings not complete: %s', columns)
            continue
        log.debug('Column mapping: %r', mapping)
        if not combine_sheet(engine, row, sheet_id, table, mapping):
            error = 'Could not combine sheet'
    if error:
        stats.add_source(error, row)
    else:
        stats.add_source('Combined ok', row)
    return (not error)
Example #29
def combine_sheet(engine, resource, sheet_id, table, mapping):
    begin = time.time()
    base = {
        'resource_id': resource['resource_id'],
        'resource_hash': resource['extract_hash'],
        'sheet_id': sheet_id,
    }
    spending_table = sl.get_table(engine, 'spending')
    connection = engine.connect()
    trans = connection.begin()
    try:
        rows = 0
        sl.delete(connection,
                  spending_table,
                  resource_id=resource['resource_id'],
                  sheet_id=sheet_id)
        for row in sl.all(connection, table):
            data = dict(base)
            for col, value in row.items():
                if col == 'id':
                    data['row_id'] = value
                    continue
                mapped = mapping.get(col)
                if mapped is not None:
                    data[mapped] = value
            sl.add_row(connection, spending_table, data)
            rows += 1
        trans.commit()
        log.info("Loaded %s rows in %s ms", rows,
                 int((time.time() - begin) * 1000))
        return rows > 0
    finally:
        connection.close()
Example #30
def load(engine, grano):
    for rep in sl.find(engine, sl.get_table(engine, 'representative')):
        del rep['id']
        rep_ent = canonical_actor(grano, engine, rep['originalName'])
        if 'id' in rep_ent:
            rep_ent = grano.getEntity(rep_ent['id'], deep=True)
        #if not SETTINGS.FULL and rep_ent['etlId'] == rep['etlId']:
        #    continue
        rep_ent.update(rep)
        rep_ent['actsAsRepresentative'] = True
        rep_ent['staffMembers'] = int(float(rep['members']))
        rep_ent['incoming'] = rep_ent.get('incoming', [])
        rep_ent['outgoing'] = rep_ent.get('outgoing', [])
        rep_ent['contactCountry'] = rep_ent['contactCountryNorm']
        rep_ent = load_clients(grano, engine, rep_ent)
        rep_ent = load_organisations(grano, engine, rep_ent)
        rep_ent = load_networking(grano, engine, rep_ent)
        rep_ent = load_persons(grano, engine, rep_ent)
        rep_ent = load_interests(grano, engine, rep_ent)
        rep_ent = load_action_fields(grano, engine, rep_ent)
        rep_ent = get_financial_data(engine, rep_ent)
        # TODO: other financial sources
        #from pprint import pprint
        #pprint(rep_ent)
        grano.updateEntity(rep_ent)
Example #31
def load_abstimmungen(engine):
    _Abstimmung = sl.get_table(engine, 'abstimmung')
    i = 0
    for row in sl.distinct(engine, _Abstimmung, 'subject', 'date'):
        thema = row.get('subject')
        abst = Abstimmung.query.filter_by(thema=thema).first()
        if abst is None:
            abst = Abstimmung()
            abst.thema = thema
            abst.datum = date(row.get('date'))
        db.session.add(abst)
        for stimme_ in sl.find(engine, _Abstimmung, subject=thema,
            matched=True):
            if i % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            i += 1
            person = Person.query.filter_by(
                fingerprint=stimme_.get('fingerprint')).first()
            if person is None:
                continue
            stimme = Stimme.query.filter_by(
                abstimmung=abst).filter_by(
                person=person).first()
            if stimme is not None:
                continue
            stimme = Stimme()
            stimme.entscheidung = stimme_['vote']
            stimme.person = person
            stimme.abstimmung = abst
            db.session.add(stimme)
        db.session.commit()
Example #32
def extend_positions(engine):
    log.info("Amending positions ...")
    Position = sl.get_table(engine, 'position')
    for i, data in enumerate(sl.find(engine, Position)):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        dt, rest = data['fundstelle'].split("-", 1)
        data['date'] = datetime.strptime(dt.strip(), "%d.%m.%Y").isoformat()
        if ',' in data['urheber']:
            typ, quelle = data['urheber'].split(',', 1)
            data['quelle'] = re.sub("^.*Urheber.*:", "", quelle).strip()
            data['typ'] = typ.strip()
        else:
            data['typ'] = data['urheber']

        br = 'Bundesregierung, '
        if data['urheber'].startswith(br):
            data['urheber'] = data['urheber'][len(br):]

        data['fundstelle_doc'] = None
        if data['fundstelle_url'] and \
                'btp' in data['fundstelle_url']:
            data['fundstelle_doc'] = data['fundstelle_url']\
                    .rsplit('#',1)[0]

        hash = sha1(data['fundstelle'].encode('utf-8') \
                + data['urheber'].encode('utf-8') + \
                data['ablauf_id'].encode('utf-8')).hexdigest()
        data['hash'] = hash[:10]
        sl.upsert(engine, Position, data, unique=UNIQUE)
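
extend_positions derives a date from the first half of the fundstelle field and
builds a 10-character identity hash over three text fields. Those two steps in
isolation, with invented sample values:

from datetime import datetime
from hashlib import sha1

fundstelle = u"17.06.2011 - BT-Plenarprotokoll 17/115"  # invented sample
urheber = u"Bundesregierung"
ablauf_id = u"17/1234"
dt = fundstelle.split("-", 1)[0]
print(datetime.strptime(dt.strip(), "%d.%m.%Y").isoformat())  # 2011-06-17T00:00:00
digest = sha1((fundstelle + urheber + ablauf_id).encode('utf-8')).hexdigest()
print(digest[:10])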
Example #33
def load_rollen(engine, person, data):
    _RolleSource = sl.get_table(engine, 'rolle')
    mdb_rolle = None
    for rdata in sl.find(engine, _RolleSource, fingerprint=data['fingerprint']):
        rolle = Rolle.query.filter_by(
                person=person,
                funktion=rdata.get('funktion'),
                ressort=rdata.get('ressort'),
                fraktion=rdata.get('fraktion'),
                land=rdata.get('land')).first()
        if rolle is None:
            rolle = Rolle()

        rolle.person = person
        rolle.mdb_id = rdata.get('mdb_id')
        rolle.status = rdata.get('status')
        rolle.funktion = rdata.get('funktion')
        rolle.fraktion = rdata.get('fraktion')
        rolle.gewaehlt = rdata.get('gewaehlt')
        rolle.ressort = rdata.get('ressort')
        rolle.land = rdata.get('land')
        rolle.austritt = date(rdata.get('austritt'))

        if rdata.get('mdb_id'):
            rolle.wahlkreis = load_wahlkreis(engine, rolle, data)
            mdb_rolle = rolle
        db.session.add(rolle)
    return mdb_rolle
Example #34
def validate_sheet(engine, row, sheet_id, data_row_filter, stats_spending):
    spending_table = sl.get_table(engine, 'spending')
    data = list(
        sl.find(engine,
                spending_table,
                resource_id=row['resource_id'],
                sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    issue_noted_for_this_resource = False  # record first failure only
    error_message = None
    try:
        records = 0
        for row_ in data:
            if data_row_filter and data_row_filter != row_['row_id']:
                continue
            result = {'id': row_['id'], 'valid': True}
            result['signature'] = generate_signature(row_)

            if row_['DateFormatted'] is None:
                stats_spending['date'].add_spending('Date invalid', row_)
                result['valid'] = False
                if not issue_noted_for_this_resource:
                    issue(
                        engine, row['resource_id'], row['retrieve_hash'],
                        STAGE,
                        'Date invalid (blank, inconsistent or unrecognised format)',
                        {
                            'row_id': row_.get('row_id'),
                            'row_number': row_.get('row_number'),
                            'Date': row_.get('Date')
                        })
                    error_message = 'Date invalid'
                    issue_noted_for_this_resource = True
            else:
                stats_spending['date'].add_spending('Date ok', row_)

            if row_['AmountFormatted'] is None:
                stats_spending['amount'].add_spending('Amount invalid', row_)
                result['valid'] = False
                if not issue_noted_for_this_resource:
                    issue(
                        engine, row['resource_id'], row['retrieve_hash'],
                        STAGE, 'Amount invalid', {
                            'row_id': row_.get('row_id'),
                            'row_number': row_.get('row_number'),
                            'Amount': row_.get('Amount')
                        })
                    error_message = 'Amount invalid'
                    issue_noted_for_this_resource = True
            else:
                stats_spending['amount'].add_spending('Amount ok', row_)

            if result['valid']:
                records += 1
            sl.update(connection, spending_table, {'id': result['id']}, result)
        trans.commit()
        return records > 0, error_message
    finally:
        connection.close()
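
validate_sheet marks each spending row valid or invalid and raises an issue
only for the first failing row per resource. The per-row skeleton on plain
dicts, with the database and issue tracker stubbed out by prints (rows
invented):

rows = [{'id': 1, 'DateFormatted': '2012-01-01', 'AmountFormatted': '10.0'},
        {'id': 2, 'DateFormatted': None, 'AmountFormatted': '3.5'}]
issue_noted = False  # record first failure only
for row_ in rows:
    result = {'id': row_['id'], 'valid': True}
    if row_['DateFormatted'] is None:
        result['valid'] = False
        if not issue_noted:
            print("issue: Date invalid in row %s" % row_['id'])
            issue_noted = True
    if row_['AmountFormatted'] is None:
        result['valid'] = False
    print(result)  # would be written back via sl.update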
Example #35
def combine(force=False, filter=None):
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        combine_resource(engine, source_table, row, force, stats)
    log.info('Combine summary: \n%s' % stats.report())
Example #36
def load_rollen(engine, person, data):
    _RolleSource = sl.get_table(engine, "rolle")
    for rdata in sl.find(engine, _RolleSource, fingerprint=data["fingerprint"]):
        rolle = Rolle.query.filter_by(
            person=person,
            funktion=rdata.get("funktion"),
            ressort=rdata.get("ressort"),
            fraktion=rdata.get("fraktion"),
            land=rdata.get("land"),
        ).first()
        if rolle is None:
            rolle = Rolle()

        rolle.person = person
        rolle.mdb_id = rdata.get("mdb_id")
        rolle.status = rdata.get("status")
        rolle.funktion = rdata.get("funktion")
        rolle.fraktion = rdata.get("fraktion")
        rolle.gewaehlt = rdata.get("gewaehlt")
        rolle.ressort = rdata.get("ressort")
        rolle.land = rdata.get("land")
        rolle.austritt = to_date(rdata.get("austritt"))

        if rdata.get("mdb_id"):
            rolle.wahlkreis = load_wahlkreis(engine, rolle, data)
        db.session.add(rolle)
Example #37
def load_transcript(engine, wp, session, incremental=True):
    url = URL % (wp, session)
    Speech = sl.get_table(engine, 'speech')
    if incremental and sl.find_one(engine, Speech,
        source_url=url, matched=True):
        return True
    if '404 Seite nicht gefunden' in fetch(url):
        return False
    sio = fetch_stream(url)
    if sio is None:
        return False
    log.info("Loading transcript: %s/%s" % (wp, session))
    seq = 0
    parser = SpeechParser(engine, sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib['sitzung'] = session
        contrib['sequence'] = seq
        contrib['wahlperiode'] = wp
        contrib['source_url'] = url
        contrib['matched'] = True
        sl.upsert(engine, Speech, contrib, 
                  unique=['sequence', 'sitzung', 'wahlperiode'])
        seq += 1
    if parser.missing_recon:
        sl.upsert(engine, Speech, {
                    'matched': False,
                    'sitzung': session,
                    'wahlperiode': wp
            }, unique=['sitzung', 'wahlperiode'])

    return True
Example #38
def cleanup_sheet(engine, row, sheet_id):
    spending_table = sl.get_table(engine, 'spending')
    data = list(
        sl.find(engine,
                spending_table,
                resource_id=row['resource_id'],
                sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        if None in date_formats.values():
            log.warn("Couldn't detect date formats: %r", date_formats)
            issue(engine, row['resource_id'], row['retrieve_hash'],
                  "Couldn't detect date formats", repr(date_formats))
            return False

        sl.delete(connection,
                  spending_table,
                  resource_id=row['resource_id'],
                  sheet_id=sheet_id)
        for data_row in data:  # renamed so the `row` argument is not shadowed
            data_row = cleanup_dates.apply(data_row, date_formats)
            data_row = cleanup_numbers.apply(data_row)
            data_row = cleanup_gov.apply(data_row)
            #data_row = cleanup_supplier.apply(data_row, engine)
            del data_row['id']
            sl.add_row(connection, spending_table, data_row)
        trans.commit()
        return True
    finally:
        connection.close()
Example #39
def merge_speeches(engine):
    # desired result: (position_id, debatte_id)
    referenzen = referenzen_index(engine)
    items = item_index(engine)
    
    log.info("Finding best matches.... ")
    matches = {}
    for (ablauf_id, rwp, rsession), rdrs in referenzen.items():
        for (iwp, isession, item_id), idrs in items.items():
            if iwp != rwp or rsession != isession:
                continue
            ints = len(idrs.intersection(rdrs))
            if ints == 0:
                continue
            k = (ablauf_id, rwp, rsession)
            if k in matches and matches[k][1] > ints:
                continue
            matches[k] = (item_id, ints)

    log.info("Saving position associations....")
    pos_tbl = sl.get_table(engine, 'position')
    for (ablauf_id, wp, session), (item_id, n) in matches.items():
        for pos in sl.find(engine, pos_tbl, ablauf_id="%s/%s" % (wp, ablauf_id)):
            if not pos['fundstelle_url']:
                continue
            if 'btp/%s/%s%03d.pdf' % (wp, wp, int(session)) in pos['fundstelle_url']:
                d = {'ablauf_id': pos['ablauf_id'], 
                     'hash': pos['hash'],
                     'debatte_wp': wp,
                     'debatte_session': session,
                     'debatte_item_id': item_id}
                sl.upsert(engine, pos_tbl, d, unique=['ablauf_id', 'hash'])
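
The matching loop above keeps, per (ablauf_id, wp, session) key, the agenda
item whose document set overlaps most with the referenced documents. The same
best-overlap selection on invented toy data:

referenzen = {('42', '17', '100'): {'17/1', '17/2'}}
items = {('17', '100', 7): {'17/2', '17/9'},
         ('17', '100', 8): {'17/1', '17/2'}}
matches = {}
for (ablauf_id, rwp, rsession), rdrs in referenzen.items():
    for (iwp, isession, item_id), idrs in items.items():
        if (iwp, isession) != (rwp, rsession):
            continue
        ints = len(idrs.intersection(rdrs))
        k = (ablauf_id, rwp, rsession)
        if ints and (k not in matches or matches[k][1] < ints):
            matches[k] = (item_id, ints)
print(matches)  # {('42', '17', '100'): (8, 2)}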
Example #40
def get_agenda(engine, wp, session):
    return list(
        sl.find(engine,
                sl.get_table(engine, 'webtv'),
                wp=wp,
                session=session,
                order_by='speech_id'))
Example #41
def load_news(engine):
    log.info("Loading news into production DB...")
    _NewsSource = sl.get_table(engine, 'news')
    for data in sl.all(engine, _NewsSource):
        news = NewsItem.query.filter_by(
                source_url=data.get('source_url')).first()
        if news is None:
            news = NewsItem()
        news.key = data.get('key')
        news.source_url = data.get('source_url')
        news.title = data.get('title')
        news.text = data.get('text')
        news.date = date(data.get('date'))
        news.image_url = data.get('image_url')
        news.image_copyright = data.get('image_copyright')
        news.image_changed_at = date(data.get('image_changed_at'))

        if data.get('gremium'):
            gremium = Gremium.query.filter_by(
                    key=data.get('gremium')).first()
            if gremium is None:
                log.error("Gremium %s not found!" % data.get('gremium'))
            else:
                news.gremium = gremium
        db.session.add(news)
    db.session.commit()
Example #42
def scrape_transcript(engine, url, force=False):
    wp, session = url_metadata(url)
    table = sl.get_table(engine, 'speech')
    sample = sl.find_one(engine, table, source_url=url, matched=True)
    response, sio = fetch_stream(url)
    sample = check_tags(sample or {}, response, force)
    base_data = {'source_url': url, 
                 'sitzung': session,
                 'wahlperiode': wp,
                 'matched': False,
                 'loaded': False,
                 'source_etag': sample['source_etag']}
    log.info("Loading transcript: %s/%s, from %s" , wp, session, url)
    seq = 0
    parser = SpeechParser(sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib.update(base_data)
        contrib['sequence'] = seq
        sl.upsert(engine, table, contrib, 
                  unique=['source_url', 'sequence'])
        seq += 1
    if not parser.missing_recon:
        sl.upsert(engine, table, {
                    'matched': True,
                    'source_url': url,
            }, unique=['source_url'])
    else:
        raise InvalidReference()
    return base_data
Example #43
def scrape_activity(ablauf, elem, engine):
    urheber = elem.findtext("URHEBER")
    fundstelle = elem.findtext("FUNDSTELLE")
    Position = sl.get_table(engine, 'position')
    p = sl.find_one(engine, Position, 
                    urheber=urheber, 
                    fundstelle=fundstelle, 
                    ablauf_id=ablauf['ablauf_id'])
    if p is not None:
        return 
    p = {'ablauf_id': ablauf['ablauf_id'], 
         'urheber': urheber,
         'fundstelle': fundstelle}
    pos_keys = p.copy()
    p['zuordnung'] = elem.findtext("ZUORDNUNG")
    p['abstrakt'] = elem.findtext("VP_ABSTRAKT")
    p['fundstelle_url'] = elem.findtext("FUNDSTELLE_LINK")
    
    Zuweisung = sl.get_table(engine, 'zuweisung')
    for zelem in elem.findall("ZUWEISUNG"):
        z = pos_keys.copy()
        z['text'] = zelem.findtext("AUSSCHUSS_KLARTEXT")
        z['federfuehrung'] = zelem.find("FEDERFUEHRUNG") is not None
        z['gremium_key'] = DIP_GREMIUM_TO_KEY.get(z['text'])
        sl.upsert(engine, Zuweisung, z, unique=[])
        
    Beschluss = sl.get_table(engine, 'beschluss')
    for belem in elem.findall("BESCHLUSS"):
        b = pos_keys.copy()
        b['seite'] = belem.findtext("BESCHLUSSSEITE")
        b['dokument_text'] = belem.findtext("BEZUGSDOKUMENT")
        b['tenor'] = belem.findtext("BESCHLUSSTENOR")
        b['grundlage'] = belem.findtext("GRUNDLAGE")
        sl.upsert(engine, Beschluss, b, unique=[])

    Referenz = sl.get_table(engine, 'referenz')
    try:
        dokument = dokument_by_url(p['fundstelle_url']) or \
            dokument_by_name(p['fundstelle'])
        dokument.update(pos_keys)
        dokument['ablauf_key'] = ablauf['key']
        dokument['wahlperiode'] = ablauf['wahlperiode']
        sl.upsert(engine, Referenz, dokument, unique=[
                'link', 'wahlperiode', 'ablauf_key', 'seiten'
                ])
    except Exception, e:
        log.exception(e)
Example #44
def extract_some(force=False, filter=None):
    # kwargs = resource_id=x, package_name=y, publisher_title=z
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        extract_resource(engine, source_table, row, force, stats)
    log.info('Extract summary: \n%s' % stats.report())
Example #45
def index():
    engine = etl_engine()
    webtv_table = sl.get_table(engine, 'webtv')
    sessions = sl.distinct(engine, webtv_table,
        'wp', 'session', 'session_name')
    sessions = sorted(sessions, reverse=True)
    return render_template('backend/index.html',
        sessions=sessions)
Example #46
def retrieve_some(force=False, filter=None):
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        retrieve(row, engine, source_table, force, stats)
    print 'Retrieve summary:'
    print stats.report()
Example #47
def extract_some(force=False, filter=None):
    # kwargs = resource_id=x, package_name=y, publisher_title=z
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        extract_resource(engine, source_table, row, force, stats)
    log.info('Extract summary: \n%s' % stats.report())
Example #48
def add_to_gremium(node, url, role, engine):
    key = node.get('id')
    table = sl.get_table(engine, 'gremium')
    g = sl.find_one(engine, table, key=key)
    if g is None:
        g = {'key': key, 'type': 'sonstiges'}
        g['name'] = node.findtext('gremiumName')
        g['url'] = node.findtext('gremiumURL')
        sl.upsert(engine, table, g, unique=['key'])
    table = sl.get_table(engine, 'gremium_mitglieder')
    sl.upsert(engine,
              table, {
                  'gremium_key': g['key'],
                  'person_source_url': url,
                  'role': role
              },
              unique=['person_source_url', 'gremium_key', 'role'])
Example #49
def mark_done(engine, url):
    table = sl.get_table(engine, 'speech')
    sl.upsert(engine,
              table, {
                  'loaded': True,
                  'source_url': url,
              },
              unique=['source_url'])
Example #50
def clear_issues(engine, resource_id, stage):
    import sqlaload as sl  # this import is slow, so it is done inside this func
    table = sl.get_table(engine, 'issue')
    sl.delete(
        engine,
        table,
        resource_id=resource_id,
        stage=stage,
    )
Example #51
def scrape_abstimmung(engine, url, force=False):
    abstimmung = sl.get_table(engine, 'abstimmung')
    sample = sl.find_one(engine, abstimmung, source_url=url)
    response = fetch(url)
    sample = check_tags(sample or {}, response, force)

    base_data = {'source_url': url, 'source_etag': sample['source_etag']}
    handle_xml(engine, base_data, response.content)
    return base_data
Example #52
def match_beitraege(engine, url):
    table = sl.get_table(engine, 'beitrag')
    for beitrag in sl.distinct(engine, table, *KEYS, source_url=url):
        match = match_beitrag(engine, beitrag, url)
        beitrag['fingerprint'] = match
        beitrag['matched'] = match is not None
        if match:
            ensure_rolle(beitrag, match, engine)
        sl.upsert(engine, table, beitrag, unique=KEYS)
Example #53
def speechmatcher_alignment_post(wp, session):
    engine = etl_engine()
    table = sl.get_table(engine, 'alignments')
    data = dict(request.form.items())
    data['sequence'] = int(data['sequence'])
    data['wp'] = wp
    data['session'] = session
    sl.upsert(engine, table, data, ['wp', 'session', 'sequence'])
    return speechmatcher_alignment_get(wp, session)
Example #54
def ensure_rolle(beitrag, fp, engine):
    rolle = {
        'fingerprint': fp,
        'ressort': beitrag.get('ressort'),
        'fraktion': beitrag.get('fraktion'),
        'funktion': beitrag.get('funktion')
        }
    Rolle = sl.get_table(engine, 'rolle')
    sl.upsert(engine, Rolle, rolle,
            unique=['fingerprint', 'funktion'])
Example #55
def retrieve_some(force=False, **filters):
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    result_counts = defaultdict(int)
    for row in sl.find(engine, source_table, **filters):
        result = retrieve(row, engine, source_table, force)
        result_counts['total'] += 1
        result_counts[result] += 1
    log.info('Total %i URLs', result_counts.pop('total'))
    for result, count in result_counts.items():
        log.info('  %i %s', count, result)
Example #56
def cleanup(force=False, resource_filter=None, data_row_filter=None):
    stats = OpenSpendingStats()
    stats_spending = defaultdict(OpenSpendingStats)
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(resource_filter or {})):
        cleanup_resource(engine, source_table, row, force, data_row_filter,
                         stats, stats_spending)
    log.info('Cleanup summary: \n%s' % stats.report())
    for key in stats_spending:
        log.info('Cleanup %s: \n%s' % (key, stats_spending[key].report()))
Example #57
def clean_ablauf(engine, data):
    try:
        table = sl.get_table(engine, 'ablauf')
        data['class'] = resolve_type(data.get('typ'))
        data['stage'] = resolve_stage(data.get('stand'))
        d = {'class': data['class'],
             'stage': data['stage'],
             'source_url': data['source_url']}
        sl.upsert(engine, table, d, unique=['source_url'])
    except BadReference:
        pass
Example #58
def merge_speech(engine, wp, session):
    log.info("Merging media + transcript: %s/%s" % (wp, session))
    score, alignment = get_alignment(engine, wp, session)
    log.info("Matching score: %s", score)
    agenda = get_agenda(engine, wp, session)
    agenda = dict([(a['item_id'], a) for a in agenda])
    alignment = dict([(a['sequence'], a) for a in alignment])
    item = None
    table = sl.get_table(engine, 'webtv_speech')
    for speech in sl.find(engine,
                          sl.get_table(engine, 'speech'),
                          order_by='sequence',
                          wahlperiode=wp,
                          sitzung=session,
                          matched=True):
        sequence = speech['sequence']
        item = alignment.get(sequence, item)
        data = agenda.get(item['item_id']).copy()
        del data['id']
        data['sequence'] = sequence
        sl.upsert(engine, table, data, unique=['wp', 'session', 'sequence'])
Example #59
def match_beitrag(engine, beitrag, url):
    beitrag_print = make_long_name(beitrag)
    log.info("Matching: %s", beitrag_print.encode('ascii', 'replace'))
    try:
        value = resolve_person(beitrag_print)
        if sl.find_one(engine, sl.get_table(engine, 'person'),
                fingerprint=value) is None:
            make_person(engine, beitrag, value, url)
        return value
    except BadReference:
        log.info("Beitrag person is unknown: %s",
                beitrag_print.encode('ascii', 'replace'))