Example #1
def load_news(engine):
    log.info("Loading news into production DB...")
    _NewsSource = sl.get_table(engine, 'news')
    for data in sl.all(engine, _NewsSource):
        news = NewsItem.query.filter_by(
                source_url=data.get('source_url')).first()
        if news is None:
            news = NewsItem()
        news.key = data.get('key')
        news.source_url = data.get('source_url')
        news.title = data.get('title')
        news.text = data.get('text')
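        # 'date' here is assumed to be a project-local helper that parses the
        # stored string into a datetime.date; it is not the bare datetime.date
        # constructor, which takes (year, month, day)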
        news.date = date(data.get('date'))
        news.image_url = data.get('image_url')
        news.image_copyright = data.get('image_copyright')
        news.image_changed_at = date(data.get('image_changed_at'))

        if data.get('gremium'):
            gremium = Gremium.query.filter_by(
                    key=data.get('gremium')).first()
            if gremium is None:
                log.error("Gremium %s not found!" % data.get('gremium'))
            else:
                news.gremium = gremium
        db.session.add(news)
    db.session.commit()
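The loop body is a manual upsert against the ORM: look the record up by a natural key, create it if missing, overwrite every field, then add and commit. A stripped-down sketch of the pattern, with a hypothetical Item model keyed on source_url:

def upsert_item(data):
    # hypothetical model; mirrors the look-up/create/overwrite pattern above
    item = Item.query.filter_by(source_url=data.get('source_url')).first()
    if item is None:
        item = Item()
    item.source_url = data.get('source_url')
    item.title = data.get('title')
    db.session.add(item)
    db.session.commit()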
Example #2
def combine_sheet(engine, resource, sheet_id, table, mapping):
    begin = time.time()
    base = {
        'resource_id': resource['resource_id'],
        'resource_hash': resource['extract_hash'],
        'sheet_id': sheet_id,
    }
    spending_table = sl.get_table(engine, 'spending')
    connection = engine.connect()
    trans = connection.begin()
    try:
        rows = 0
        sl.delete(connection,
                  spending_table,
                  resource_id=resource['resource_id'],
                  sheet_id=sheet_id)
        for row in sl.all(connection, table):
            data = dict(base)
            for col, value in row.items():
                if col == 'id':
                    data['row_id'] = value
                    continue
                mapped = mapping.get(col)
                if mapped is not None:
                    data[mapped] = value
            sl.add_row(connection, spending_table, data)
            rows += 1
        trans.commit()
        log.info("Loaded %s rows in %s ms", rows,
                 int((time.time() - begin) * 1000))
        return rows > 0
    finally:
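        # closing the connection also releases the transaction; if
        # trans.commit() was never reached, the pending work is rolled back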
        connection.close()
Example #3
def update_network_entities(engine, file_name):
    log.info("Updating network entities reference sheet: %s", file_name)
    network_entities = set()
    table = sl.get_table(engine, 'network_entity')
    if os.path.exists(file_name):
        fh = open(file_name, 'rb')
        reader = csv.DictReader(fh)
        for d in reader:
            e = [(k, v.decode('utf-8')) for (k, v) in d.items()]
            e = dict(e)
            network_entities.add((e['representativeEtlId'], e['etlFingerPrint']))
            sl.upsert(engine, table, e, ['representativeEtlId', 'etlFingerPrint'])
        fh.close()
        reps = set([ne[0] for ne in network_entities])
        rep_table = sl.get_table(engine, 'representative')
        for rep in reps:
            sl.update(engine, rep_table, {'etlId': rep}, {'network_extracted': True})

    for row in sl.all(engine, table):
        network_entities.add((row['representativeEtlId'], row['etlFingerPrint']))

    fh = open(file_name, 'wb')
    writer = None
    table = sl.get_table(engine, 'network_entity')
    for ic, fp in network_entities:
        row = {
            'representativeEtlId': ic,
            'etlFingerPrint': fp
        }
        if writer is None:
            writer = csv.DictWriter(fh, row.keys())
            writer.writerow(dict(zip(row.keys(), row.keys())))
        r = [(k, unicode(v).encode('utf-8')) for k, v in row.items()]
        writer.writerow(dict(r))
    fh.close()
Example #5
def supplier_names_from_table(engine, resource_id, table_id):
    global tables_count, rows_count
    tables_count = tables_count + 1
    print "# checking table %d" % tables_count
    table_suffix = "%s_table%s" % (resource_id, table_id)

    table = sl.get_table(engine, "spending_%s" % table_suffix)
    supplier_table = sl.get_table(engine, "suppliers")

    for row in sl.all(engine, table):
        rows_count = rows_count + 1
        if "SupplierName" not in row:
            # One of the junk tables that contain no real data, usually excel noise
            continue
        supplier = row["SupplierName"]

        if supplier is None or supplier == "":
            continue

        if supplier in suppliers_visited:
            continue

        suppliers_visited[supplier] = True

        if sl.find_one(engine, supplier_table, original=supplier) is not None:
            continue

        yield supplier
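The generator relies on module-level state that is assumed to be initialised elsewhere in the script; a minimal sketch of that setup:

tables_count = 0
rows_count = 0
suppliers_visited = {}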
Example #6
def test_format_all():
    engine, table = connect()
    for row in sl.all(engine, table):
        format_ = partial(format, engine, row['resource_id'], row['table_id'], False)
        format_.description = \
            'format: %(resource_id)s/%(table_id)s' % row
        yield format_
Example #7
def test_condense_all():
    engine = db_connect()
    table = sl.get_table(engine, 'extracted')
    for row in sl.all(engine, table):
        for table_id in xrange(0, row['max_table_id'] + 1):
            condense_ = partial(condense, engine, row['resource_id'], table_id, False)
            condense_.description = describe(row['resource_id'], table_id)
            yield condense_
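nose collects each yielded callable as a separate test case; describe is a project helper that is assumed to build the test label, along the lines of the format_ description in Example #6:

def describe(resource_id, table_id):
    # hypothetical sketch of the project helper
    return 'condense: %s/%s' % (resource_id, table_id)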
Example #8
def process(engine, table, key_column, color_column, subkey_column, subcolor_column):
    children = defaultdict(set)
    key_colors = {}
    for row in sl.all(engine, table):
        key = row.get(key_column)
        key_colors[key] = row.get(color_column)
        children[key].add(row.get(subkey_column))
    sub_colors = dict()
    for key in children.keys():
        gen = _color_range(key_colors.get(key), len(children[key]))
        for sub, col in zip(children[key], gen):
            sub_colors[(key, sub)] = col
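    # second pass: write the derived sub-colours back, keyed on the row id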
    for row in sl.all(engine, table):
        ks = (row.get(key_column), row.get(subkey_column))
        row[subcolor_column] = sub_colors[ks]
        #pprint(row)
        sl.upsert(engine, table, row, ['id'])
Example #9
def process_rows(handlefunc, engine=None):
    if engine is None:
        engine = make_engine()
    table = sl.get_table(engine, 'fts')
    for row in sl.all(engine, table):
        out = handlefunc(row)
        sl.upsert(engine, table, out, ['id'])
    return table
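A sketch of driving process_rows with a handler; the column name is hypothetical, the contract is simply to take the row dict and return the updated dict, which is then upserted by 'id':

def strip_total(row):
    # hypothetical handler: tidy up one column
    if row.get('total'):
        row['total'] = row['total'].strip()
    return row

process_rows(strip_total)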
Example #10
def condense(engine, resource_id, table_id, force):
    table_suffix = '%s_table%s' % (resource_id, table_id)

    if not engine.has_table('raw_%s' % table_suffix):
        return

    condensed_table = sl.get_table(engine, 'condensed')

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, condensed_table, resource_id=resource_id, table_id=table_id) is not None:
        return

    connection = engine.connect()
    trans = connection.begin()

    start = time.time()

    try:
        raw_table = sl.get_table(connection, 'raw_%s' % table_suffix)
        sl.drop_table(connection, 'spending_%s' % table_suffix)
        spending_table = sl.get_table(connection, 'spending_%s' % table_suffix)
        columns_table = sl.get_table(connection, 'column_sets')

        normalise_map = normalised_columns_map(raw_table)
        normalised_headers = ','.join(sorted(normalise_map.values()))
        mapping_row = sl.find_one(connection, columns_table, normalised=normalised_headers)

        if mapping_row is None or not mapping_row.get('valid'):
            # This table is unmapped, cannot be condensed
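            # returning here leaves trans uncommitted; the finally block
            # closes the connection, which rolls the empty transaction back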
            return

        column_mapping = json.loads(mapping_row['column_map'])

        # Build the final mapping from input column to output column
        mapping = {}
        for k, n in normalise_map.iteritems():
            if column_mapping.get(n):
                mapping[k] = column_mapping[n]
        
        for row in sl.all(connection, raw_table):
            spending_row = {}
            for key, value in row.items():
                if key not in mapping:
                    continue
                if not value or not value.strip():
                    continue
                if mapping[key] in spending_row:
                    continue
                spending_row[mapping[key]] = value.strip()
            #print spending_row
            sl.add_row(connection, spending_table, spending_row)
        sl.upsert(connection, condensed_table, {'resource_id': resource_id,
                                                'table_id': table_id,
                                                'condense_time': time.time() - start,
                                                }, ['resource_id', 'table_id'])
        trans.commit()
    finally:
        connection.close()
Example #11
def merge_votes(engine):
    _referenzen = defaultdict(set)
    referenz_tbl = sl.get_table(engine, "referenz")
    for ref in sl.all(engine, referenz_tbl):
        if ref["typ"] == "drs":
            # print ref
            _referenzen[ref["nummer"]].add(ref["ablauf_key"])
    mref = dict([(c, d) for c, d in _referenzen.items() if len(d) > 1])
    print len(_referenzen), len(mref)
Example #12
def do_format(engine, resource_id, table_id):
    table_suffix = '%s_table%s' % (resource_id, table_id)

    table = sl.get_table(engine, 'spending_%s' % table_suffix)

    date_field_values = defaultdict(list)
    for row in sl.all(engine, table):
        for date_field in DATE_FIELDS:
            if date_field in row and row[date_field]:
                date_field_values[date_field].append(row[date_field])

    date_field_formats = {}
    for date_field, values in date_field_values.items():
        date_field_formats[date_field] = detect_date_format(values)

    for row in sl.all(engine, table):

        for numeric_field in NUMERIC_FIELDS:
            try:
                val = row.get(numeric_field)
                if val is None:
                    continue
                val = "".join([v for v in val if v in "-.0123456789"])
                row[numeric_field + 'AsFloat'] = float(val)
            except Exception as e:
                print e

        for date_field, format_ in date_field_formats.items():
            if format_ is None:
                continue
            try:
                if row[date_field] is None:
                    continue
                if format_ == 'excel':
                    # Deciphers excel dates that have been mangled into integers by formatting errors
                    parsed = datetime(*xldate_as_tuple(float(row[date_field].strip()), 0))
                else:
                    parsed = datetime.strptime(row[date_field].strip(), format_)
                row[date_field + 'AsDate'] = parsed.strftime("%Y-%m-%d")
            except Exception as e:
                print e

        sl.upsert(engine, table, row, ['id'])
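NUMERIC_FIELDS and DATE_FIELDS are assumed to be module-level constants naming the spending columns to convert; given the AmountAsFloat and DateAsDate columns used in Example #22, presumably something like:

NUMERIC_FIELDS = ['Amount']  # hypothetical: further entries unknown
DATE_FIELDS = ['Date']       # hypothetical: further entries unknown

detect_date_format is likewise a project helper that picks one strptime pattern (or 'excel') for the sampled values.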
Example #13
def create_entities(engine):
    log.info("De-normalizing global entities collection...")
    table = sl.get_table(engine, 'entity')
    for tbl in ['representative', 'person', 'financialDataTurnover',
                'organisation', 'network_entity']:
        for row in sl.all(engine, sl.get_table(engine, tbl)):
            entity = {'etlFingerPrint': row.get('etlFingerPrint')}
            entity['legalStatus'] = row.get('legalStatus', '')
            entity['countryCode'] = row.get('contactCountryCode', '')
            entity['etlTable'] = tbl
            sl.upsert(engine, table, entity, ['etlFingerPrint', 'etlTable'])
Example #14
def load_persons(engine):
    log.info("Loading persons into production DB...")
    db.session.commit()
    db.session.execute(obleute.delete())
    db.session.execute(mitglieder.delete())
    db.session.execute(stellvertreter.delete())

    _PersonSource = sl.get_table(engine, 'person')
    for data in sl.all(engine, _PersonSource):
        person = Person.query.filter_by(
                fingerprint=data.get('fingerprint')).first()
        if person is None:
            person = Person()

        person.slug = data.get('slug')
        person.fingerprint = data.get('fingerprint')
        log.info("      -> %s" % person.fingerprint)
        person.source_url = data.get('source_url')
        person.mdb_id = data.get('mdb_id')
        person.vorname = data.get('vorname')
        person.nachname = data.get('nachname')
        person.adelstitel = data.get('adelstitel')
        person.titel = data.get('titel')
        person.ort = data.get('ort')
        person.geburtsdatum = data.get('geburtsdatum')
        person.religion = data.get('religion')
        person.hochschule = data.get('hochschule')
        person.beruf = data.get('beruf')
        person.berufsfeld = data.get('berufsfeld')
        person.geschlecht = data.get('geschlecht')
        person.familienstand = data.get('familienstand')
        person.kinder = data.get('kinder')
        person.partei = data.get('partei')
        person.land = data.get('land')
        person.bio_url = data.get('bio_url')
        person.bio = data.get('bio')
        person.wissenswertes = data.get('wissenswertes')
        person.homepage_url = data.get('homepage_url')
        person.telefon = data.get('telefon')
        person.angaben = data.get('angaben')
        person.foto_url = data.get('foto_url')
        person.foto_copyright = data.get('foto_copyright')
        person.reden_plenum_url = data.get('reden_plenum_url')
        person.reden_plenum_rss_url = data.get('reden_plenum_rss_url')
        person.twitter_url = data.get('twitter_url')
        person.facebook_url = data.get('facebook_url')
        person.awatch_url = data.get('awatch_url')
        db.session.add(person)
        db.session.flush()
        mdb_rolle = load_rollen(engine, person, data)
        load_gremium_mitglieder(engine, person, mdb_rolle)

        db.session.commit()
Example #15
def sources_metadata(engine):
    sources = {}
    log.info("Building sources index...")
    source_table = sl.get_table(engine, 'source')
    for source in sl.all(engine, source_table):
        data = {
            'SourceDatasetName': source.get('package_name'),
            'SourceDatasetID': source.get('package_id'),
            'SourceDatasetTitle': source.get('package_title'),
            'SourcePublisherName': source.get('publisher_name'),
            'SourcePublisherTitle': source.get('publisher_title'),
            'SourceID': source.get('resource_id'),
            'SourceURL': source.get('url'),
            'SourceFormat': source.get('format'),
        }
        sources[source['resource_id']] = data
    return sources
Example #17
def referenzen_index(engine):
    _referenzen = defaultdict(set)
    referenz_tbl = sl.get_table(engine, 'referenz')
    log.info("Building index of ablauf document references per session...")
    for ref in sl.all(engine, referenz_tbl):
        _referenzen[ref['ablauf_key']].add((ref['typ'], ref['nummer']))
    referenzen = defaultdict(set)
    for ablauf, refs in _referenzen.items():
        for typ, num in refs:
            if '/' not in num:
                continue
            if typ == 'plpr':
                wp, session = num.split('/')
                drs = [n for t, n in refs if t != 'plpr']
                if drs:
                    referenzen[(ablauf, wp, session)] = drs
    #pprint(dict(referenzen.items()))
    return referenzen
Example #18
def load_gremien(engine):
    log.info("Loading gremien into production DB...")
    _GremiumSource = sl.get_table(engine, 'gremium')
    for data in sl.all(engine, _GremiumSource):
        gremium = Gremium.query.filter_by(key=data.get('key')).first()
        if gremium is None:
            gremium = Gremium()
        gremium.key = data.get('key')
        gremium.source_url = data.get('source_url')
        gremium.name = data.get('name')
        gremium.typ = data.get('type')
        gremium.url = data.get('url')
        gremium.aufgabe = data.get('aufgabe')
        gremium.rss_url = data.get('rss_url')
        gremium.image_url = data.get('image_url')
        gremium.image_copyright = data.get('image_copyright')
        db.session.add(gremium)
    db.session.commit()
Example #21
def update_entities(engine, file_name):
    log.info("Updating entities reference sheet: %s", file_name)
    data = {}
    if os.path.exists(file_name):
        fh = open(file_name, 'rb')
        reader = csv.DictReader(fh)
        data = {}
        for d in reader:
            e = [(k, v.decode('utf-8')) for (k, v) in d.items()]
            e = dict(e)
            data[e['etlFingerPrint']] = e
        fh.close()
        print len(data)

    fh = open(file_name, 'wb')
    writer = None
    table = sl.get_table(engine, 'entity')
    for row in sl.all(engine, table):
        fp = row['etlFingerPrint']
        if fp is None:
            continue
        if not row.get('canonicalName'):
            row['canonicalName'] = row['etlFingerPrint']
        row['canonicalName'] = cleanCanonical(row['canonicalName'])
        entity = data.get(fp)
        if entity and entity.get('canonicalName') and \
                fp != entity.get('canonicalName'):
            #print entity.get('canonicalName').encode('utf-8')
            row['canonicalName'] = entity.get('canonicalName')
            out = row.copy()
            del out['id']
            sl.upsert(engine, table, out, ['etlFingerPrint'])
        cn = row['canonicalName']
        row['normalizedForm'] = normalize_text(cn)
        row['reverseForm'] = reverse_normalize(cn)
        if writer is None:
            writer = csv.DictWriter(fh, row.keys())
            writer.writerow(dict(zip(row.keys(), row.keys())))
        r = [(k, unicode(v).encode('utf-8')) for k, v in row.items()]
        writer.writerow(dict(r))
    fh.close()
Example #22
import sqlaload as sl
import csv
import sys
import logging
from common import *

if __name__ == "__main__":
    logging.basicConfig()
    logging.getLogger(__name__).setLevel(logging.DEBUG)
    logging.getLogger("sqlalchemy.engine").setLevel(logging.WARN)

    supplier_map = {}

    engine = db_connect()
    supplier_table = sl.get_table(engine, "suppliers")
    for row in sl.all(engine, supplier_table):
        supplier_map[row["original"]] = row

    writer = csv.DictWriter(
        open(sys.argv[1], "wb"),
        [
            "SupplierName",
            "DepartmentFamily",
            "Entity",
            "ExpenseType",
            "ExpenseArea",
            "VATNumber",
            "AmountAsFloat",
            "DateAsDate",
            "SupplierGuess",
            "SupplierGuessURI",
Example #23
def process(engine, table):
    gpl = load_csv_keys('gruppierungsplan.csv', 'ID')
    fpl = load_csv_keys('funktionenplan.csv', 'ID')
    for row in sl.all(engine, table):
        sl.upsert(engine, table, for_row(row, fpl, gpl), ['id'])
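load_csv_keys is assumed to read a CSV file into a dict keyed on the given column; a plausible sketch:

def load_csv_keys(file_name, key):
    # hypothetical helper: one dict per row, indexed by the key column
    with open(file_name, 'rb') as fh:
        return dict((row[key], row) for row in csv.DictReader(fh))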
Example #24
def dump_table(engine, table):
    file_name = '%s-%s.csv' % (table.name,
            datetime.utcnow().strftime("%Y-%m-%d"))
    fh = open(file_name, 'wb')
    sl.dump_csv(sl.all(engine, table), fh)
    fh.close()
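A sketch of driving the dump, assuming the db_connect helper used in the other examples:

engine = db_connect()
dump_table(engine, sl.get_table(engine, 'spending'))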
Example #25
def test_retrieve_all():
    engine, src_table = connect()
    for row in sl.all(engine, src_table):
        f = partial(retrieve, row, engine, False)
        f.description = describe(row)
        yield f,
Example #26
def cluster(engine):
    for row in sl.all(engine, sl.get_table(engine, 'entity')):
        print normalize_text(row['etlFingerPrint']).encode('utf-8')
Example #27
def supplier_tables(engine, table):
    for row in sl.all(engine, table):
        yield supplier_names_from_table(engine, row["resource_id"], row["table_id"])
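Note that supplier_tables yields one generator per spending table (see Example #5), so callers have to flatten the result:

for names in supplier_tables(engine, table):
    for name in names:
        print name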