Example #1
def combine_sheet(engine, resource, sheet_id, table, mapping):
    begin = time.time()
    base = {
        'resource_id': resource['resource_id'],
        'resource_hash': resource['extract_hash'],
        'sheet_id': sheet_id,
    }
    spending_table = sl.get_table(engine, 'spending')
    connection = engine.connect()
    trans = connection.begin()
    try:
        rows = 0
        sl.delete(connection,
                  spending_table,
                  resource_id=resource['resource_id'],
                  sheet_id=sheet_id)
        for row in sl.all(connection, table):
            data = dict(base)
            for col, value in row.items():
                if col == 'id':
                    data['row_id'] = value
                    continue
                mapped = mapping.get(col)
                if mapped is not None:
                    data[mapped] = value
            sl.add_row(connection, spending_table, data)
            rows += 1
        trans.commit()
        log.info("Loaded %s rows in %s ms", rows,
                 int((time.time() - begin) * 1000))
        return rows > 0
    finally:
        connection.close()
Example #2
def combine_sheet(engine, resource, sheet_id, table, mapping):
    begin = time.time()
    base = {
            'resource_id': resource['resource_id'],
            'resource_hash': resource['extract_hash'],
            'sheet_id': sheet_id,
        }
    spending_table = sl.get_table(engine, 'spending')
    connection = engine.connect()
    trans = connection.begin()
    try:
        rows = 0
        sl.delete(connection, spending_table,
                resource_id=resource['resource_id'],
                sheet_id=sheet_id)
        for row in sl.all(connection, table):
            data = dict(base)
            for col, value in row.items():
                if col == 'id':
                    data['row_id'] = value
                    continue
                mapped = mapping.get(col)
                if mapped is not None:
                    data[mapped] = value
            sl.add_row(connection, spending_table, data)
            rows += 1
        trans.commit()
        log.info("Loaded %s rows in %s ms", rows,
                int((time.time()-begin)*1000))
        return rows > 0
    finally:
        connection.close()
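
Examples #1 and #2 are the same loader, formatted differently. The pattern they show is delete-then-reload inside a single transaction: the previously combined rows for one resource/sheet are removed, the remapped rows are re-inserted, and only then is the transaction committed. Below is a minimal sketch of that pattern, assuming the same sqlaload calls used above (sl.get_table, sl.delete, sl.add_row) and an engine like the one these projects pass around; reload_sheet and transform are hypothetical names, not part of the original code.

import sqlaload as sl

def reload_sheet(engine, table_name, resource_id, sheet_id, rows, transform):
    # Hypothetical helper, not from the original project: re-populates one
    # resource/sheet inside a single transaction.
    table = sl.get_table(engine, table_name)
    connection = engine.connect()
    trans = connection.begin()
    try:
        # Drop whatever was loaded for this resource/sheet earlier...
        sl.delete(connection, table,
                  resource_id=resource_id,
                  sheet_id=sheet_id)
        # ...then insert the freshly transformed rows.
        for row in rows:
            sl.add_row(connection, table, transform(row))
        trans.commit()
    finally:
        connection.close()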
Example #3
def cleanup_sheet(engine, row, sheet_id):
    spending_table = sl.get_table(engine, 'spending')
    data = list(
        sl.find(engine,
                spending_table,
                resource_id=row['resource_id'],
                sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        if None in date_formats.values():
            log.warn("Couldn't detect date formats: %r", date_formats)
            issue(engine, row['resource_id'], row['retrieve_hash'],
                  "Couldn't detect date formats", repr(date_formats))
            return False

        sl.delete(connection,
                  spending_table,
                  resource_id=row['resource_id'],
                  sheet_id=sheet_id)
        for row in data:
            row = cleanup_dates.apply(row, date_formats)
            row = cleanup_numbers.apply(row)
            row = cleanup_gov.apply(row)
            #row = cleanup_supplier.apply(row, engine)
            del row['id']
            sl.add_row(connection, spending_table, row)
        trans.commit()
        return True
    finally:
        connection.close()
Example #4
def clear_issues(engine, resource_id, stage):
    import sqlaload as sl # this import is slow, so it is done inside this func
    table = sl.get_table(engine, 'issue')
    sl.delete(engine, table,
              resource_id=resource_id,
              stage=stage,
    )
Example #5
def clear_issues(engine, resource_id, stage):
    import sqlaload as sl  # this import is slow, so it is done inside this func
    table = sl.get_table(engine, 'issue')
    sl.delete(
        engine,
        table,
        resource_id=resource_id,
        stage=stage,
    )
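
Examples #4 and #5 show the call shape at its simplest: sl.delete takes an engine (or an open connection, as in the examples above), the table handle from sl.get_table, and equality filters as keyword arguments; only rows matching every filter are removed. A hedged usage sketch follows, where db_connect() is the project helper used by connect() in the later examples and both string arguments are placeholders.

engine = db_connect()   # project helper from these codebases; returns the engine
clear_issues(engine, 'some-resource-id', 'some-stage')
# Only issue rows with this exact resource_id *and* stage are deleted;
# issues recorded for other stages of the same resource are left alone.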
Example #6
def build_index(publisher_name=None):
    '''Searches CKAN for spending resources and writes their metadata to
    the database.'''
    engine, table = connect()
    client = ckan_client()
    log.info('CKAN: %s', client.base_location)
    tags = ['+tags:"%s"' % t for t in TAGS]
    q = " OR ".join(tags)
    publisher_dict_filter = {}
    if publisher_name:
        publisher_solr_filter = 'publisher:"%s"' % publisher_name
        q = '(%s) AND (%s)' % (q, publisher_solr_filter)
        publisher_dict_filter = {'publisher_name': publisher_name}
    log.info('SOLR Search q: %r', q)

    existing_packages = set(
            [res['package_name']
             for res in sl.distinct(engine, table, 'package_name', **publisher_dict_filter)])
    log.info('Existing datasets: %i', len(existing_packages))
    processed_packages = set()
    log.info('Doing package search for: "%s"', q)
    res = client.package_search(q,
            search_options={'limit': 2000})
    log.info('Search returned %i dataset results', res['count'])
    stats = OpenSpendingStats()
    stats_resources = OpenSpendingStats()
    for package_name in res['results']:
        processed_packages.add(package_name)
        num_resources = fetch_package(client, package_name, engine, table, stats_resources)
        if num_resources == 0:
            stats.add('Dataset has no resources', package_name)
        else:
            stats.add('Dataset has resources', package_name)
    # Remove rows about deleted packages
    obsolete_packages = existing_packages - processed_packages
    log.info('Obsolete datasets: %s from %s',
             len(obsolete_packages), len(existing_packages))
    for package_name in obsolete_packages:
        sl.delete(engine, table, package_name=package_name)
        sl.delete(engine, 'issue', package_name=package_name)
        stats.add('Removed obsolete dataset', package_name)
    # Remove stray rows without package_name
    stray_rows = list(sl.find(engine, table, package_name=None))
    if stray_rows:
        log.info('Stray rows without package_name: %i',
                 len(stray_rows))
        sl.delete(engine, table, package_name=None)
        sl.delete(engine, 'issue', package_name=None)
        for row in stray_rows:
            stats.add('Stray row removed', row['resource_id'])
    print 'Datasets build_index summary:'
    print stats.report()
    print 'Resources build_index summary:'
    print stats_resources.report()
Example #7
def build_index(publisher_name=None):
    '''Searches CKAN for spending resources and writes their metadata to
    the database.'''
    engine, table = connect()
    client = ckan_client()
    log.info('CKAN: %s', client.base_location)
    tags = ['+tags:"%s"' % t for t in TAGS]
    q = " OR ".join(tags)
    publisher_dict_filter = {}
    if publisher_name:
        publisher_solr_filter = 'publisher:"%s"' % publisher_name
        q = '(%s) AND (%s)' % (q, publisher_solr_filter)
        publisher_dict_filter = {'publisher_name': publisher_name}
    log.info('SOLR Search q: %r', q)

    existing_packages = set([
        res['package_name'] for res in sl.distinct(
            engine, table, 'package_name', **publisher_dict_filter)
    ])
    log.info('Existing datasets: %i', len(existing_packages))
    processed_packages = set()
    log.info('Doing package search for: "%s"', q)
    res = client.package_search(q, search_options={'limit': 2000})
    log.info('Search returned %i dataset results', res['count'])
    stats = OpenSpendingStats()
    stats_resources = OpenSpendingStats()
    for package_name in res['results']:
        processed_packages.add(package_name)
        num_resources = fetch_package(client, package_name, engine, table,
                                      stats_resources)
        if num_resources == 0:
            stats.add('Dataset has no resources', package_name)
        else:
            stats.add('Dataset has resources', package_name)
    # Remove rows about deleted packages
    obsolete_packages = existing_packages - processed_packages
    log.info('Obsolete datasets: %s from %s', len(obsolete_packages),
             len(existing_packages))
    for package_name in obsolete_packages:
        sl.delete(engine, table, package_name=package_name)
        sl.delete(engine, 'issue', package_name=package_name)
        stats.add('Removed obsolete dataset', package_name)
    # Remove stray rows without package_name
    stray_rows = list(sl.find(engine, table, package_name=None))
    if stray_rows:
        log.info('Stray rows without package_name: %i', len(stray_rows))
        sl.delete(engine, table, package_name=None)
        sl.delete(engine, 'issue', package_name=None)
        for row in stray_rows:
            stats.add('Stray row removed', row['resource_id'])
    print 'Datasets build_index summary:'
    print stats.report()
    print 'Resources build_index summary:'
    print stats_resources.report()
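
The build_index versions (#6 and #7) add two details: the table argument to sl.delete can apparently also be a plain table name (the 'issue' deletions), and a filter value of None is used to target rows where that column is empty (the stray-row cleanup). A small sketch of the obsolete-dataset step under those assumptions; drop_obsolete_packages is a made-up name.

import sqlaload as sl

def drop_obsolete_packages(engine, table, existing_packages, processed_packages):
    # Sketch of the cleanup step in build_index: any dataset indexed on a
    # previous run but not returned by this run's search is removed from the
    # source table and from the 'issue' table (addressed by name, as above).
    for package_name in existing_packages - processed_packages:
        sl.delete(engine, table, package_name=package_name)
        sl.delete(engine, 'issue', package_name=package_name)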
Example #8
def cleanup_sheet(engine, row, sheet_id, data_row_filter, stats_spending):
    spending_table = sl.get_table(engine, 'spending')
    data = list(
        sl.find(engine,
                spending_table,
                resource_id=row['resource_id'],
                sheet_id=sheet_id))
    if not data:
        log.info('Sheet has no rows')
        return False, None
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        for date_format in date_formats.values():
            if isinstance(date_format, basestring):
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                      "Couldn't detect date formats because: %s" % date_format,
                      repr(date_formats))
                return True, date_format

        if not data_row_filter:
            sl.delete(connection,
                      spending_table,
                      resource_id=row['resource_id'],
                      sheet_id=sheet_id)
        for row in data:
            if data_row_filter and data_row_filter != row['row_id']:
                continue
            row = cleanup_dates.apply(row, date_formats, stats_spending)
            row = cleanup_numbers.apply(row, stats_spending)
            row = cleanup_gov.apply(row, stats_spending)
            #row = cleanup_supplier.apply(row, engine)
            del row['id']
            sl.add_row(connection, spending_table, row)
        trans.commit()
        return True, None
    finally:
        connection.close()
Example #9
def cleanup_sheet(engine, row, sheet_id):
    spending_table = sl.get_table(engine, "spending")
    data = list(sl.find(engine, spending_table, resource_id=row["resource_id"], sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        if None in date_formats.values():
            log.warn("Couldn't detect date formats: %r", date_formats)
            issue(engine, row["resource_id"], row["retrieve_hash"], "Couldn't detect date formats", repr(date_formats))
            return False

        sl.delete(connection, spending_table, resource_id=row["resource_id"], sheet_id=sheet_id)
        for row in data:
            row = cleanup_dates.apply(row, date_formats)
            row = cleanup_numbers.apply(row)
            row = cleanup_gov.apply(row)
            # row = cleanup_supplier.apply(row, engine)
            del row["id"]
            sl.add_row(connection, spending_table, row)
        trans.commit()
        return True
    finally:
        connection.close()
Example #10
def cleanup_sheet(engine, row, sheet_id, data_row_filter, stats_spending):
    spending_table = sl.get_table(engine, 'spending')
    data = list(sl.find(engine, spending_table,
            resource_id=row['resource_id'],
            sheet_id=sheet_id))
    if not data:
        log.info('Sheet has no rows')
        return False, None
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        for date_format in date_formats.values():
            if isinstance(date_format, basestring):
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                        "Couldn't detect date formats because: %s" % date_format,
                        repr(date_formats))
                return True, date_format

        if not data_row_filter:
            sl.delete(connection, spending_table,
                      resource_id=row['resource_id'],
                      sheet_id=sheet_id)
        for row in data:
            if data_row_filter and data_row_filter != row['row_id']:
                continue
            row = cleanup_dates.apply(row, date_formats, stats_spending)
            row = cleanup_numbers.apply(row, stats_spending)
            row = cleanup_gov.apply(row, stats_spending)
            #row = cleanup_supplier.apply(row, engine)
            del row['id']
            sl.add_row(connection, spending_table, row)
        trans.commit()
        return True, None
    finally:
        connection.close()
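
The later cleanup_sheet variants (#8 and #10) guard the delete: a full re-clean wipes the whole sheet first, while a run restricted to one row (data_row_filter) leaves the existing rows in place and only re-adds the matching row. A hedged sketch of that guard follows, with the individual cleanup passes collapsed into a hypothetical clean_row callable.

import sqlaload as sl

def reclean_rows(connection, spending_table, data, resource_id, sheet_id,
                 clean_row, data_row_filter=None):
    # Sketch only: mirrors the delete/re-add loop in cleanup_sheet.
    if not data_row_filter:
        # Full run: clear the sheet before re-inserting every cleaned row.
        sl.delete(connection, spending_table,
                  resource_id=resource_id,
                  sheet_id=sheet_id)
    for row in data:
        if data_row_filter and data_row_filter != row['row_id']:
            continue
        row = clean_row(row)
        del row['id']  # drop the old primary key so a new row is created
        sl.add_row(connection, spending_table, row)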
Example #11
def scrape_ablauf(engine, url, force=False):
    Ablauf = sl.get_table(engine, 'ablauf')

    key = int(url.rsplit('/', 1)[-1].split('.')[0])
    a = sl.find_one(engine, Ablauf, source_url=url)
    if a is not None and a['abgeschlossen'] and not force:
        raise Unmodified()
    response = fetch(url)
    a = check_tags(a or {}, response, force)
    a.update({'key': key, 'source_url': url})
    doc = inline_xml_from_page(response.content, url)
    if doc is None:
        raise NoContentException()

    a['wahlperiode'] = int(doc.findtext("WAHLPERIODE"))
    a['typ'] = doc.findtext("VORGANGSTYP")
    a['titel'] = doc.findtext("TITEL")

    if not a['titel'] or not len(a['titel'].strip()):
        raise NoContentException()

    if '\n' in a['titel']:
        t, k = a['titel'].rsplit('\n', 1)
        k = k.strip()
        if k.startswith('KOM') or k.startswith('SEK'):
            a['titel'] = t

    a['initiative'] = doc.findtext("INITIATIVE")
    a['stand'] = doc.findtext("AKTUELLER_STAND")
    a['signatur'] = doc.findtext("SIGNATUR")
    a['gesta_id'] = doc.findtext("GESTA_ORDNUNGSNUMMER")
    a['eu_dok_nr'] = doc.findtext("EU_DOK_NR")
    a['abstrakt'] = doc.findtext("ABSTRAKT")
    a['sachgebiet'] = doc.findtext("SACHGEBIET")
    a['zustimmungsbeduerftig'] = doc.findtext("ZUSTIMMUNGSBEDUERFTIGKEIT")
    #a.schlagworte = []
    Schlagwort = sl.get_table(engine, 'schlagwort')
    for sw in doc.findall("SCHLAGWORT"):
        wort = {'wort': sw.text, 'source_url': url}
        sl.upsert(engine, Schlagwort, wort, unique=wort.keys())
    log.info("Ablauf %s: %s", url, a['titel'].encode('ascii', 'replace'))
    a['titel'] = a['titel'].strip().lstrip('.').strip()
    a = expand_dok_nr(a)
    a['abgeschlossen'] = DIP_ABLAUF_STATES_FINISHED.get(a['stand'], False)

    if a['wahlperiode'] != max(app.config.get('WAHLPERIODEN')):
        a['abgeschlossen'] = True

    if 'Originaltext der Frage(n):' in a['abstrakt']:
        _, a['abstrakt'] = a['abstrakt'].split('Originaltext der Frage(n):', 1)

    sl.delete(engine, sl.get_table(engine, 'position'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'beitrag'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'zuweisung'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'beschluss'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'referenz'), source_url=url)

    for elem in doc.findall(".//VORGANGSPOSITION"):
        scrape_activity(engine, url, elem)

    Referenz = sl.get_table(engine, 'referenz')
    for elem in doc.findall("WICHTIGE_DRUCKSACHE"):
        link = elem.findtext("DRS_LINK")
        hash = None
        if link is not None and '#' in link:
            link, hash = link.rsplit('#', 1)
        dokument = dokument_by_id(elem.findtext("DRS_HERAUSGEBER"),
                                  'drs',
                                  elem.findtext("DRS_NUMMER"),
                                  link=link)
        dokument['text'] = elem.findtext("DRS_TYP")
        dokument['seiten'] = hash
        dokument['source_url'] = url
        sl.upsert(engine,
                  Referenz,
                  dokument,
                  unique=['link', 'source_url', 'seiten'])

    for elem in doc.findall("PLENUM"):
        link = elem.findtext("PLPR_LINK")
        if link is not None and '#' in link:
            link, hash = link.rsplit('#', 1)
        dokument = dokument_by_id(elem.findtext("PLPR_HERAUSGEBER"),
                                  'plpr',
                                  elem.findtext("PLPR_NUMMER"),
                                  link=link)
        dokument['text'] = elem.findtext("PLPR_KLARTEXT")
        dokument['seiten'] = elem.findtext("PLPR_SEITEN")
        dokument['source_url'] = url
        sl.upsert(engine,
                  Referenz,
                  dokument,
                  unique=['link', 'source_url', 'seiten'])

    sl.upsert(engine, Ablauf, a, unique=['source_url'])
    return a
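
scrape_ablauf clears every dependent table for a source_url before re-inserting rows with sl.upsert, so a re-scrape cannot leave stale child records behind (Example #13 below is another version of the same function and does the same). A minimal sketch of just that step; the table names come from the example, while clear_ablauf_children itself is hypothetical.

import sqlaload as sl

CHILD_TABLES = ('position', 'beitrag', 'zuweisung', 'beschluss', 'referenz')

def clear_ablauf_children(engine, url):
    # Purge all rows tied to this Ablauf URL from each child table; the
    # scraper then re-creates them via sl.upsert.
    for name in CHILD_TABLES:
        sl.delete(engine, sl.get_table(engine, name), source_url=url)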
Example #12
        processed_resource_ids.add(res['id'])
        if row and row['url'] != data['url']:
            # url has changed, so force retrieval next time
            data['retrieve_status'] = False
            stats_resources.add_source('URL changed', data)
        elif row:
            stats_resources.add_source('URL unchanged', data)
        else:
            stats_resources.add_source('New resource', data)
        sl.upsert(engine, table, data, ['resource_id'])

    # Remove references to any deleted resources for this dataset
    obsolete_rows = [row for row in existing_rows
                     if row['resource_id'] not in processed_resource_ids]
    for row in obsolete_rows:
        sl.delete(engine, table, resource_id=row['resource_id'])
        sl.delete(engine, 'issue', resource_id=row['resource_id'])
        stats_resources.add_source('Deleted obsolete row', row)
    return len(resources)

def connect():
    engine = db_connect()
    src_table = sl.get_table(engine, 'source')
    return engine, src_table

def build_index(publisher_name=None):
    '''Searches CKAN for spending resources and writes their metadata to
    the database.'''
    engine, table = connect()
    client = ckan_client()
    log.info('CKAN: %s', client.base_location)
Example #13
def scrape_ablauf(engine, url, force=False):
    Ablauf = sl.get_table(engine, 'ablauf')

    key = int(url.rsplit('/', 1)[-1].split('.')[0])
    a = sl.find_one(engine, Ablauf, source_url=url)
    if a is not None and a['abgeschlossen'] and not force:
        raise Unmodified()
    response = fetch(url)
    a = check_tags(a or {}, response, force)
    a.update({'key': key, 
              'source_url': url})
    doc = inline_xml_from_page(response.content, url)
    if doc is None: 
        raise NoContentException()
    
    a['wahlperiode'] = int(doc.findtext("WAHLPERIODE"))
    a['typ'] = doc.findtext("VORGANGSTYP")
    a['titel'] = doc.findtext("TITEL")

    if not a['titel'] or not len(a['titel'].strip()):
        raise NoContentException()

    if '\n' in a['titel']:
        t, k = a['titel'].rsplit('\n', 1)
        k = k.strip()
        if k.startswith('KOM') or k.startswith('SEK'):
            a['titel'] = t

    a['initiative'] = doc.findtext("INITIATIVE")
    a['stand'] = doc.findtext("AKTUELLER_STAND")
    a['signatur'] = doc.findtext("SIGNATUR")
    a['gesta_id'] = doc.findtext("GESTA_ORDNUNGSNUMMER")
    a['eu_dok_nr'] = doc.findtext("EU_DOK_NR")
    a['abstrakt'] = doc.findtext("ABSTRAKT")
    a['sachgebiet'] = doc.findtext("SACHGEBIET")
    a['zustimmungsbeduerftig'] = doc.findtext("ZUSTIMMUNGSBEDUERFTIGKEIT")
    #a.schlagworte = []
    Schlagwort = sl.get_table(engine, 'schlagwort')
    for sw in doc.findall("SCHLAGWORT"):
        wort = {'wort': sw.text, 'source_url': url}
        sl.upsert(engine, Schlagwort, wort, unique=wort.keys())
    log.info("Ablauf %s: %s", url, a['titel'].encode('ascii', 'replace'))
    a['titel'] = a['titel'].strip().lstrip('.').strip()
    a = expand_dok_nr(a)
    a['abgeschlossen'] = DIP_ABLAUF_STATES_FINISHED.get(a['stand'], False)
    if 'Originaltext der Frage(n):' in a['abstrakt']:
        _, a['abstrakt'] = a['abstrakt'].split('Originaltext der Frage(n):', 1)


    sl.delete(engine, sl.get_table(engine, 'position'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'beitrag'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'zuweisung'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'beschluss'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'referenz'), source_url=url)

    for elem in doc.findall(".//VORGANGSPOSITION"):
        scrape_activity(engine, url, elem)

    Referenz = sl.get_table(engine, 'referenz')
    for elem in doc.findall("WICHTIGE_DRUCKSACHE"):
        link = elem.findtext("DRS_LINK")
        hash = None
        if link is not None and '#' in link:
            link, hash = link.rsplit('#', 1)
        dokument = dokument_by_id(elem.findtext("DRS_HERAUSGEBER"), 
                'drs', elem.findtext("DRS_NUMMER"), link=link)
        dokument['text'] = elem.findtext("DRS_TYP")
        dokument['seiten'] = hash
        dokument['source_url'] = url
        sl.upsert(engine, Referenz, dokument, unique=[
            'link', 'source_url', 'seiten'
            ])

    for elem in doc.findall("PLENUM"):
        link = elem.findtext("PLPR_LINK")
        if link is not None and '#' in link:
            link, hash = link.rsplit('#', 1)
        dokument = dokument_by_id(elem.findtext("PLPR_HERAUSGEBER"), 
                'plpr', elem.findtext("PLPR_NUMMER"), link=link)
        dokument['text'] = elem.findtext("PLPR_KLARTEXT")
        dokument['seiten'] = elem.findtext("PLPR_SEITEN")
        dokument['source_url'] = url
        sl.upsert(engine, Referenz, dokument, unique=[
            'link', 'source_url', 'seiten'
            ])

    sl.upsert(engine, Ablauf, a, unique=['source_url'])
    return a
Example #14
            # url has changed, so force retrieval next time
            data['retrieve_status'] = False
            stats_resources.add_source('URL changed', data)
        elif row:
            stats_resources.add_source('URL unchanged', data)
        else:
            stats_resources.add_source('New resource', data)
        sl.upsert(engine, table, data, ['resource_id'])

    # Remove references to any deleted resources for this dataset
    obsolete_rows = [
        row for row in existing_rows
        if row['resource_id'] not in processed_resource_ids
    ]
    for row in obsolete_rows:
        sl.delete(engine, table, resource_id=row['resource_id'])
        sl.delete(engine, 'issue', resource_id=row['resource_id'])
        stats_resources.add_source('Deleted obsolete row', row)
    return len(resources)


def connect():
    engine = db_connect()
    src_table = sl.get_table(engine, 'source')
    return engine, src_table


def build_index(publisher_name=None):
    '''Searches CKAN for spending resources and writes their metadata to
    the database.'''
    engine, table = connect()