Example #1
def combine_sheet(engine, resource, sheet_id, table, mapping):
    begin = time.time()
    base = {
            'resource_id': resource['resource_id'],
            'resource_hash': resource['extract_hash'],
            'sheet_id': sheet_id,
        }
    spending_table = sl.get_table(engine, 'spending')
    connection = engine.connect()
    trans = connection.begin()
    try:
        rows = 0
        sl.delete(connection, spending_table,
                resource_id=resource['resource_id'],
                sheet_id=sheet_id)
        for row in sl.all(connection, table):
            data = dict(base)
            for col, value in row.items():
                if col == 'id':
                    data['row_id'] = value
                    continue
                mapped = mapping.get(col)
                if mapped is not None:
                    data[mapped] = value
            sl.add_row(connection, spending_table, data)
            rows += 1
        trans.commit()
        log.info("Loaded %s rows in %s ms", rows,
                int((time.time()-begin)*1000))
        return rows > 0
    finally:
        connection.close()
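For orientation, a minimal driver sketch for the combine_sheet example above. The connection URL, resource values, raw table name and column mapping are hypothetical placeholders, and sl.connect is assumed to be sqlaload's engine helper; adapt the names to the real pipeline.

import sqlaload as sl

engine = sl.connect('sqlite:///spending.db')  # assumed sqlaload helper; placeholder URL
resource = {'resource_id': 'abc-123', 'extract_hash': 'deadbeef'}  # placeholder values
raw_table = sl.get_table(engine, 'raw_abc-123_sheet0')  # hypothetical raw sheet table
mapping = {'date': 'date', 'supplier_name': 'supplier', 'total': 'amount'}  # hypothetical column map
loaded = combine_sheet(engine, resource, sheet_id=0, table=raw_table, mapping=mapping)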
Example #2
def cleanup_sheet(engine, row, sheet_id):
    spending_table = sl.get_table(engine, 'spending')
    data = list(
        sl.find(engine,
                spending_table,
                resource_id=row['resource_id'],
                sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        if None in date_formats.values():
            log.warn("Couldn't detect date formats: %r", date_formats)
            issue(engine, row['resource_id'], row['retrieve_hash'],
                  "Couldn't detect date formats", repr(date_formats))
            return False

        sl.delete(connection,
                  spending_table,
                  resource_id=row['resource_id'],
                  sheet_id=sheet_id)
        for row in data:
            row = cleanup_dates.apply(row, date_formats)
            row = cleanup_numbers.apply(row)
            row = cleanup_gov.apply(row)
            #row = cleanup_supplier.apply(row, engine)
            del row['id']
            sl.add_row(connection, spending_table, row)
        trans.commit()
        return True
    finally:
        connection.close()
Example #3
def retrieve(row, engine, force):
    ret_table = sl.get_table(engine, 'retrieval_log')
    #print row.get('package_name'), row['url'].encode('utf-8')
    try:
        import os
        if not force and os.path.exists(source_path(row)):
            return
        url = fix_url(row['url'])
        print "Fetching %s" % url
        res = urllib2.urlopen(url)

        fh = open(source_path(row), 'wb')
        fh.write(res.read())
        fh.close()

        sl.add_row(engine, ret_table, {
            'resource_id': row['resource_id'],
            'status': '200',
            'message': "",
            'content-type': res.headers.get('content-type', ''),
            'timestamp': datetime.now()
            })
    except Exception, ioe:
        print traceback.format_exc()
        status = 0
        if hasattr(ioe, 'code'):
            status = ioe.code
        sl.add_row(engine, ret_table, {
            'resource_id': row['resource_id'],
            'status': status,
            'message': unicode(ioe),
            'timestamp': datetime.now()
            })
        assert False, unicode(ioe).encode('utf-8')
Example #4
def combine_sheet(engine, resource, sheet_id, table, mapping):
    begin = time.time()
    base = {
        'resource_id': resource['resource_id'],
        'resource_hash': resource['extract_hash'],
        'sheet_id': sheet_id,
    }
    spending_table = sl.get_table(engine, 'spending')
    connection = engine.connect()
    trans = connection.begin()
    try:
        rows = 0
        sl.delete(connection,
                  spending_table,
                  resource_id=resource['resource_id'],
                  sheet_id=sheet_id)
        for row in sl.all(connection, table):
            data = dict(base)
            for col, value in row.items():
                if col == 'id':
                    data['row_id'] = value
                    continue
                mapped = mapping.get(col)
                if mapped is not None:
                    data[mapped] = value
            sl.add_row(connection, spending_table, data)
            rows += 1
        trans.commit()
        log.info("Loaded %s rows in %s ms", rows,
                 int((time.time() - begin) * 1000))
        return rows > 0
    finally:
        connection.close()
Example #5
def condense(engine, resource_id, table_id, force):
    table_suffix = '%s_table%s' % (resource_id, table_id)

    if not engine.has_table('raw_%s' % table_suffix):
        return

    condensed_table = sl.get_table(engine, 'condensed')

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, condensed_table, resource_id=resource_id, table_id=table_id) is not None:
        return

    connection = engine.connect()
    trans = connection.begin()

    start = time.time()

    try:
        raw_table = sl.get_table(connection, 'raw_%s' % table_suffix)
        sl.drop_table(connection, 'spending_%s' % table_suffix)
        spending_table = sl.get_table(connection, 'spending_%s' % table_suffix)
        columns_table = sl.get_table(connection, 'column_sets')

        normalise_map = normalised_columns_map(raw_table)
        normalised_headers = ','.join(sorted(normalise_map.values()))
        mapping_row = sl.find_one(connection, columns_table, normalised=normalised_headers)

        if mapping_row is None or not mapping_row.get('valid'):
            # This table is unmapped, cannot be condensed
            return

        column_mapping = json.loads(mapping_row['column_map'])

        # Build the final mapping from input column to output column
        mapping = {}
        for k, n in normalise_map.iteritems():
            if n in column_mapping and column_mapping[n] is not None and len(column_mapping[n]) > 0:
                mapping[k] = column_mapping[n]
        
        for row in sl.all(connection, raw_table):
            spending_row = {}
            for key, value in row.items():
                if key not in mapping:
                    continue
                if not value or not len(value.strip()):
                    continue
                if mapping[key] in spending_row:
                    continue
                spending_row[mapping[key]] = value.strip()
            #print spending_row
            sl.add_row(connection, spending_table, spending_row)
        sl.upsert(connection, condensed_table, {'resource_id': resource_id,
                                                'table_id': table_id,
                                                'condense_time': time.time() - start,
                                                }, ['resource_id', 'table_id'])
        trans.commit()
    finally:
        connection.close()
Example #6
def issue(engine, resource_id, resource_hash, stage, message, data={}):
    table = sl.get_table(engine, 'issue')
    log = logging.getLogger('issue')
    log.debug("R[%s]: %s", resource_id, message)
    sl.add_row(
        engine, table, {
            'resource_id': resource_id,
            'resource_hash': resource_hash,
            'timestamp': datetime.datetime.utcnow(),
            'stage': stage,
            'message': message,
            'data': json.dumps(data)
        })
Example #7
File: common.py Project: pudo/dpkg-uk25k
def issue(engine, resource_id, resource_hash, stage, message,
          data={}):
    table = sl.get_table(engine, 'issue')
    log = logging.getLogger('issue')
    log.debug("R[%s]: %s", resource_id, message)
    sl.add_row(engine, table, {
        'resource_id': resource_id,
        'resource_hash': resource_hash,
        'timestamp': datetime.datetime.utcnow(),
        'stage': stage,
        'message': message,
        'data': json.dumps(data)
        })
Example #8
def issue(engine, resource_id, resource_hash, stage, message,
          data={}):
    import sqlaload as sl # this import is slow, so it is done inside this func
    table = sl.get_table(engine, 'issue')
    log = logging.getLogger('issue')
    log.debug("R[%s]: %s", resource_id, message)
    sl.add_row(engine, table, {
        'resource_id': resource_id,
        'resource_hash': resource_hash,
        'timestamp': datetime.datetime.utcnow(),
        'stage': stage,
        'message': message,
        'data': json.dumps(data)
        })
Example #9
def issue(engine, resource_id, resource_hash, stage, message, data={}):
    import sqlaload as sl  # this import is slow, so it is done inside this func
    table = sl.get_table(engine, 'issue')
    log = logging.getLogger('issue')
    log.debug("R[%s]: %s", resource_id, message)
    sl.add_row(
        engine, table, {
            'resource_id': resource_id,
            'resource_hash': resource_hash,
            'timestamp': datetime.datetime.utcnow(),
            'stage': stage,
            'message': message,
            'data': json.dumps(data)
        })
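A minimal call sketch for the issue() helper above, assuming a sqlaload engine; the resource id, hash, stage name and data payload are placeholder values.

import sqlaload as sl

engine = sl.connect('sqlite:///uk25k.db')  # assumed sqlaload helper; placeholder URL
issue(engine, 'abc-123', 'deadbeef', 'extract',
      "Empty file", data={'path': 'data/abc-123'})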
Example #10
def scrape_activity(engine, url, elem):
    urheber = elem.findtext("URHEBER")
    fundstelle = elem.findtext("FUNDSTELLE")
    Position = sl.get_table(engine, 'position')
    p = {'source_url': url, 'urheber': urheber, 'fundstelle': fundstelle}
    pos_keys = p.copy()
    p['zuordnung'] = elem.findtext("ZUORDNUNG")
    p['abstrakt'] = elem.findtext("VP_ABSTRAKT")
    p['fundstelle_url'] = elem.findtext("FUNDSTELLE_LINK")

    Zuweisung = sl.get_table(engine, 'zuweisung')
    for zelem in elem.findall("ZUWEISUNG"):
        z = pos_keys.copy()
        z['text'] = zelem.findtext("AUSSCHUSS_KLARTEXT")
        z['federfuehrung'] = zelem.find("FEDERFUEHRUNG") is not None
        z['gremium_key'] = DIP_GREMIUM_TO_KEY.get(z['text'])
        sl.add_row(engine, Zuweisung, z)

    Beschluss = sl.get_table(engine, 'beschluss')
    for belem in elem.findall("BESCHLUSS"):
        b = pos_keys.copy()
        b['seite'] = belem.findtext("BESCHLUSSSEITE")
        b['dokument_text'] = belem.findtext("BEZUGSDOKUMENT")
        b['tenor'] = belem.findtext("BESCHLUSSTENOR")
        b['grundlage'] = belem.findtext("GRUNDLAGE")
        sl.add_row(engine, Beschluss, b)

    Referenz = sl.get_table(engine, 'referenz')
    try:
        dokument = dokument_by_url(p['fundstelle_url']) or \
            dokument_by_name(p['fundstelle'])
        dokument.update(pos_keys)
        sl.add_row(engine, Referenz, dokument)
    except Exception, e:
        log.exception(e)
Example #11
def cleanup_sheet(engine, row, sheet_id, data_row_filter, stats_spending):
    spending_table = sl.get_table(engine, 'spending')
    data = list(
        sl.find(engine,
                spending_table,
                resource_id=row['resource_id'],
                sheet_id=sheet_id))
    if not data:
        log.info('Sheet has no rows')
        return False, None
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        for date_format in date_formats.values():
            if isinstance(date_format, basestring):
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                      "Couldn't detect date formats because: %s" % date_format,
                      repr(date_formats))
                return True, date_format

        if not data_row_filter:
            sl.delete(connection,
                      spending_table,
                      resource_id=row['resource_id'],
                      sheet_id=sheet_id)
        for row in data:
            if data_row_filter and data_row_filter != row['row_id']:
                continue
            row = cleanup_dates.apply(row, date_formats, stats_spending)
            row = cleanup_numbers.apply(row, stats_spending)
            row = cleanup_gov.apply(row, stats_spending)
            #row = cleanup_supplier.apply(row, engine)
            del row['id']
            sl.add_row(connection, spending_table, row)
        trans.commit()
        return True, None
    finally:
        connection.close()
Example #12
def emit(data):
    # Note: `year`, `FIGURE_FIELDS`, `engine` and `table` are defined outside
    # this excerpt (enclosing scope or module level).
    if 'amount_n_total' not in data and 'amount_n_pay' not in data:
        return
    for colyear, d in [('amount_nm2_', 2), ('amount_nm1_', 1), ('amount_n_', 0)]:
        rec = {'year': year - d}
        for k, v in data.items():
            if k.startswith(colyear):
                k = k.replace(colyear, 'amount_')
                rec[k] = v
            elif not k.startswith('amount_'):
                rec[k] = v
        for figure_type, fields in FIGURE_FIELDS.items():
            r = rec.copy()
            r.update({
                'amount': rec.get(fields[0]),
                'amount_reserve_total': rec.get(fields[1]),
                'amount_reserve_figure': rec.get(fields[2]),
                'figure_type': figure_type
                })
            for fields in FIGURE_FIELDS.values():
                for f in fields:
                    r.pop(f, None)
            sl.add_row(engine, table, r)
Example #13
def cleanup_sheet(engine, row, sheet_id, data_row_filter, stats_spending):
    spending_table = sl.get_table(engine, 'spending')
    data = list(sl.find(engine, spending_table,
            resource_id=row['resource_id'],
            sheet_id=sheet_id))
    if not data:
        log.info('Sheet has no rows')
        return False, None
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        for date_format in date_formats.values():
            if isinstance(date_format, basestring):
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                        "Couldn't detect date formats because: %s" % date_format,
                        repr(date_formats))
                return True, date_format

        if not data_row_filter:
            sl.delete(connection, spending_table,
                      resource_id=row['resource_id'],
                      sheet_id=sheet_id)
        for row in data:
            if data_row_filter and data_row_filter != row['row_id']:
                continue
            row = cleanup_dates.apply(row, date_formats, stats_spending)
            row = cleanup_numbers.apply(row, stats_spending)
            row = cleanup_gov.apply(row, stats_spending)
            #row = cleanup_supplier.apply(row, engine)
            del row['id']
            sl.add_row(connection, spending_table, row)
        trans.commit()
        return True, None
    finally:
        connection.close()
Example #14
def cleanup_sheet(engine, row, sheet_id):
    spending_table = sl.get_table(engine, "spending")
    data = list(sl.find(engine, spending_table, resource_id=row["resource_id"], sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        if None in date_formats.values():
            log.warn("Couldn't detect date formats: %r", date_formats)
            issue(engine, row["resource_id"], row["retrieve_hash"], "Couldn't detect date formats", repr(date_formats))
            return False

        sl.delete(connection, spending_table, resource_id=row["resource_id"], sheet_id=sheet_id)
        for row in data:
            row = cleanup_dates.apply(row, date_formats)
            row = cleanup_numbers.apply(row)
            row = cleanup_gov.apply(row)
            # row = cleanup_supplier.apply(row, engine)
            del row["id"]
            sl.add_row(connection, spending_table, row)
        trans.commit()
        return True
    finally:
        connection.close()
Example #15
def scrape_activity(engine, url, elem):
    urheber = elem.findtext("URHEBER")
    fundstelle = elem.findtext("FUNDSTELLE")
    Position = sl.get_table(engine, 'position')
    p = {'source_url': url, 
         'urheber': urheber,
         'fundstelle': fundstelle}
    pos_keys = p.copy()
    p['zuordnung'] = elem.findtext("ZUORDNUNG")
    p['abstrakt'] = elem.findtext("VP_ABSTRAKT")
    p['fundstelle_url'] = elem.findtext("FUNDSTELLE_LINK")
    
    Zuweisung = sl.get_table(engine, 'zuweisung')
    for zelem in elem.findall("ZUWEISUNG"):
        z = pos_keys.copy()
        z['text'] = zelem.findtext("AUSSCHUSS_KLARTEXT")
        z['federfuehrung'] = zelem.find("FEDERFUEHRUNG") is not None
        z['gremium_key'] = DIP_GREMIUM_TO_KEY.get(z['text'])
        sl.add_row(engine, Zuweisung, z)

    Beschluss = sl.get_table(engine, 'beschluss')
    for belem in elem.findall("BESCHLUSS"):
        b = pos_keys.copy()
        b['seite'] = belem.findtext("BESCHLUSSSEITE")
        b['dokument_text'] = belem.findtext("BEZUGSDOKUMENT")
        b['tenor'] = belem.findtext("BESCHLUSSTENOR")
        b['grundlage'] = belem.findtext("GRUNDLAGE")
        sl.add_row(engine, Beschluss, b)

    Referenz = sl.get_table(engine, 'referenz')
    try:
        dokument = dokument_by_url(p['fundstelle_url']) or \
            dokument_by_name(p['fundstelle'])
        dokument.update(pos_keys)
        sl.add_row(engine, Referenz, dokument)
    except Exception, e:
        log.exception(e)
Example #16
def extract_resource_core(engine, row, stats):
    connection = engine.connect()
    fh = open(source_path(row), 'rb')
    source_data = fh.read()

    if not len(source_data):
        issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
              "Empty file")
        stats.add_source('Empty file', row)
        return False, 0
    if html_re.search(source_data[0:1024]) is not None:
        issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
              "HTML file detected, not a transaction report")
        stats.add_source('HTML file', row)
        return False, 0
    if source_data.startswith('%PDF'):
        issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
              "PDF file detected, not a transaction report")
        stats.add_source('PDF file', row)
        return False, 0

    trans = connection.begin()
    start = time.time()
    try:
        if source_data.startswith(COMPDOC_SIGNATURE):
            fh.seek(0)
            table_set = XLSTableSet(fh)
        elif source_data.startswith('PK'):
            table_set = XLSXTableSet(filename=source_path(row))
        else:
            #fh.seek(0)
            from StringIO import StringIO
            sio = StringIO(source_data)

            encoding = None
            detected = chardet.detect(source_data[:200])
            log.debug('Encoding detected as: %s', detected.get('encoding'))
            if detected.get(
                    'encoding') == 'ISO-8859-2' and '\xa3' in source_data:
                # Detected as Latin2 but probably isn't - that is for Eastern
                # European languages.  Probably because the presence of a GBP
                # pound sign has foxed chardet. It is pretty certain that it is
                # a single-byte ASCII-variant, and my money is on Windows-1252
                encoding = 'windows-1252'
                log.debug(
                    'Probably not ISO-8859-2 because it has GBP symbol, so assuming it is Windows-1252'
                )

            table_set = CSVTableSet(sio, encoding=encoding)

        sheets = 0
        for sheet_id, row_set in enumerate(table_set.tables):
            offset, headers = headers_guess(row_set.sample)
            headers = map(convert_, headers)
            log.debug("Headers: %r", headers)
            if len(headers) <= 1:
                continue
            sheets += 1

            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))

            values = defaultdict(lambda: defaultdict(int))

            raw_table_name = 'raw_%s_sheet%s' % (row['resource_id'], sheet_id)
            sl.drop_table(connection, raw_table_name)
            raw_table = sl.get_table(connection, raw_table_name)

            # with one header row, offset=0 and we want row_number=1 so that
            # the first data row is row_number=2, matching the row number as
            # seen in Excel
            row_number = offset + 1
            for row_ in row_set:
                cells = dict([(keyify(c.column), convert_(c.value)) for c in row_ if \
                    len(c.column.strip())])
                row_number += 1
                if is_row_blank(cells):
                    continue
                for cell, value in cells.items():
                    values[cell][value] += 1
                cells['row_number'] = row_number
                sl.add_row(connection, raw_table, cells)

        trans.commit()
        log.debug(stats.add_source('Extracted ok', row))
        return sheets > 0, sheets
    except Exception, ex:
        log.exception(ex)
        issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
              unicode(ex))
        stats.add_source('Exception: %s' % ex.__class__.__name__, row)
        return False, 0
Example #17
def extract_resource_core(engine, row, stats):
    connection = engine.connect()
    fh = open(source_path(row), 'rb')
    source_data = fh.read()

    if not len(source_data):
        issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
              "Empty file")
        stats.add_source('Empty file', row)
        return False, 0
    if html_re.search(source_data[0:1024]) is not None:
        issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
              "HTML file detected, not a transaction report")
        stats.add_source('HTML file', row)
        return False, 0
    if source_data.startswith('%PDF'):
        issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
              "PDF file detected, not a transaction report")
        stats.add_source('PDF file', row)
        return False, 0

    trans = connection.begin()
    start = time.time()
    try:
        if source_data.startswith(COMPDOC_SIGNATURE):
            fh.seek(0)
            table_set = XLSTableSet(fh)
        elif source_data.startswith('PK'):
            table_set = XLSXTableSet(filename=source_path(row))
        else:
            #fh.seek(0)
            from StringIO import StringIO
            sio = StringIO(source_data)

            encoding = None
            detected = chardet.detect(source_data[:200])
            log.debug('Encoding detected as: %s', detected.get('encoding'))
            if detected.get('encoding') == 'ISO-8859-2' and '\xa3' in source_data:
                # Detected as Latin2 but probably isn't - that is for Eastern
                # European languages.  Probably because the presence of a GBP
                # pound sign has foxed chardet. It is pretty certain that it is
                # a single-byte ASCII-variant, and my money is on Windows-1252
                encoding = 'windows-1252'
                log.debug('Probably not ISO-8859-2 because it has GBP symbol, so assuming it is Windows-1252')

            table_set = CSVTableSet(sio, encoding=encoding)

        sheets = 0
        for sheet_id, row_set in enumerate(table_set.tables):
            offset, headers = headers_guess(row_set.sample)
            headers = map(convert_, headers)
            log.debug("Headers: %r", headers)
            if len(headers) <= 1:
                continue
            sheets += 1

            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))

            values = defaultdict(lambda: defaultdict(int))

            raw_table_name = 'raw_%s_sheet%s' % (row['resource_id'], sheet_id)
            sl.drop_table(connection, raw_table_name)
            raw_table = sl.get_table(connection, raw_table_name)

            # with one header row, offset=0 and we want row_number=1 so that
            # the first data row is row_number=2, matching the row number as
            # seen in Excel
            row_number = offset + 1
            for row_ in row_set:
                cells = dict([(keyify(c.column), convert_(c.value)) for c in row_ if \
                    len(c.column.strip())])
                row_number += 1
                if is_row_blank(cells):
                    continue
                for cell, value in cells.items():
                    values[cell][value] += 1
                cells['row_number'] = row_number
                sl.add_row(connection, raw_table, cells)

        trans.commit()
        log.debug(stats.add_source('Extracted ok', row))
        return sheets > 0, sheets
    except Exception, ex:
        log.exception(ex)
        issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
              unicode(ex))
        stats.add_source('Exception: %s' % ex.__class__.__name__, row)
        return False, 0
Example #18
def extract_table(engine, table, row, resource_id, force):
    # For now, interpret lack of data as not-failure at this stage, on
    # the basis that it was already reported as failure at the
    # retrieve stage and will just clutter up this list.
    if not os.path.exists(source_path(row)):
        return
    # assert os.path.exists(source_path(row)), "No source file exists."

    connection = engine.connect()
    extracted_table = sl.get_table(connection, "extracted")

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, extracted_table, resource_id=resource_id) is not None:
        return

    fh = open(source_path(row), "rb")
    source_data = fh.read()

    assert len(source_data) > 0, "Empty file"
    assert html_re.search(source_data[0:1024]) is None, "Looks like HTML"
    assert not source_data.startswith("%PDF"), "Looks like PDF"

    trans = connection.begin()
    start = time.time()
    try:
        if source_data.startswith(COMPDOC_SIGNATURE):
            fh.seek(0)
            table_set = XLSTableSet.from_fileobj(fh)
        elif source_data.startswith("PK"):
            table_set = XLSXTableSet(source_path(row))
        else:
            cd = chardet.detect(source_data)
            fh.close()
            fh = codecs.open(source_path(row), "r", cd["encoding"])

            table_set = CSVTableSet.from_fileobj(fh)

        for table_id, row_set in enumerate(table_set.tables):
            # types = type_guess(row_set.sample)
            # row_set.register_processor(types_processor(types))
            offset, headers = headers_guess(row_set.sample)
            headers = map(convert_, headers)
            assert (
                len(headers) > 1 or len(table_set.tables) > 1
            ), "Only one column was detected; assuming this is not valid data."
            # print headers

            # We might have multiple table sets where one is blank or ranty text or something. Skip those.
            if len(headers) <= 1:
                continue

            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))

            values = defaultdict(lambda: defaultdict(int))

            raw_table_name = "raw_%s_table%s" % (resource_id, table_id)
            sl.drop_table(connection, raw_table_name)
            raw_table = sl.get_table(connection, raw_table_name)

            for row_ in row_set:
                cells = dict([(keyify(c.column), convert_(c.value)) for c in row_ if len(c.column.strip())])
                for cell, value in cells.items():
                    values[cell][value] += 1
                sl.add_row(connection, raw_table, cells)

        sl.upsert(
            connection,
            extracted_table,
            {"resource_id": resource_id, "max_table_id": table_id, "extraction_time": time.time() - start},
            ["resource_id"],
        )

        trans.commit()
    # except Exception:
    #    traceback.print_exc()
    #    #log.exception(ex)
    #    assert False, traceback.format_exc()
    finally:
        connection.close()
        fh.close()
Example #19
        # Excerpt from scrape_activity, continuing inside the BESCHLUSS loop.
        b['seite'] = belem.findtext("BESCHLUSSSEITE")
        b['dokument_text'] = belem.findtext("BEZUGSDOKUMENT")
        b['tenor'] = belem.findtext("BESCHLUSSTENOR")
        b['grundlage'] = belem.findtext("GRUNDLAGE")
        sl.add_row(engine, Beschluss, b)

    Referenz = sl.get_table(engine, 'referenz')
    try:
        dokument = dokument_by_url(p['fundstelle_url']) or \
            dokument_by_name(p['fundstelle'])
        dokument.update(pos_keys)
        sl.add_row(engine, Referenz, dokument)
    except Exception, e:
        log.exception(e)

    sl.add_row(engine, Position, p)
    Person = sl.get_table(engine, 'person')
    Beitrag = sl.get_table(engine, 'beitrag')
    for belem in elem.findall("PERSOENLICHER_URHEBER"):
        b = pos_keys.copy()
        b['vorname'] = belem.findtext("VORNAME")
        b['nachname'] = belem.findtext("NACHNAME")
        b['funktion'] = belem.findtext("FUNKTION")
        b['ort'] = belem.findtext('WAHLKREISZUSATZ')
        p = sl.find_one(engine,
                        Person,
                        vorname=b['vorname'],
                        nachname=b['nachname'],
                        ort=b['ort'])
        if p is not None:
            b['person_source_url'] = p['source_url']
Example #20
        # Excerpt from scrape_activity, continuing inside the BESCHLUSS loop.
        b['seite'] = belem.findtext("BESCHLUSSSEITE")
        b['dokument_text'] = belem.findtext("BEZUGSDOKUMENT")
        b['tenor'] = belem.findtext("BESCHLUSSTENOR")
        b['grundlage'] = belem.findtext("GRUNDLAGE")
        sl.add_row(engine, Beschluss, b)

    Referenz = sl.get_table(engine, 'referenz')
    try:
        dokument = dokument_by_url(p['fundstelle_url']) or \
            dokument_by_name(p['fundstelle'])
        dokument.update(pos_keys)
        sl.add_row(engine, Referenz, dokument)
    except Exception, e:
        log.exception(e)

    sl.add_row(engine, Position, p)
    Person = sl.get_table(engine, 'person')
    Beitrag = sl.get_table(engine, 'beitrag')
    for belem in elem.findall("PERSOENLICHER_URHEBER"):
        b = pos_keys.copy()
        b['vorname'] = belem.findtext("VORNAME")
        b['nachname'] = belem.findtext("NACHNAME")
        b['funktion'] = belem.findtext("FUNKTION")
        b['ort'] = belem.findtext('WAHLKREISZUSATZ')
        p = sl.find_one(engine, Person, 
                vorname=b['vorname'],
                nachname=b['nachname'],
                ort=b['ort'])
        if p is not None:
            b['person_source_url'] = p['source_url']
        b['ressort'] = belem.findtext("RESSORT")
Example #21
def extract_resource_core(engine, row):
    connection = engine.connect()
    fh = open(source_path(row), 'rb')
    source_data = fh.read()

    if not len(source_data):
        issue(engine, row['resource_id'], row['retrieve_hash'],
              "Empty file")
        return False, 0
    if html_re.search(source_data[0:1024]) is not None:
        issue(engine, row['resource_id'], row['retrieve_hash'],
              "HTML file detected, not a transaction report")
        return False, 0
    if source_data.startswith('%PDF'):
        issue(engine, row['resource_id'], row['retrieve_hash'],
              "PDF file detected, not a transaction report")
        return False, 0

    trans = connection.begin()
    start = time.time()
    try:
        if source_data.startswith(COMPDOC_SIGNATURE):
            fh.seek(0)
            table_set = XLSTableSet(fh)
        elif source_data.startswith('PK'):
            table_set = XLSXTableSet(source_path(row))
        else:
            #fh.seek(0)
            from StringIO import StringIO
            sio = StringIO(source_data)
            #cd = chardet.detect(source_data)
            #fh.close()
            #fh = codecs.open(source_path(row), 'r', cd['encoding'] or 'utf-8')
            table_set = CSVTableSet(sio)

        sheets = 0
        for sheet_id, row_set in enumerate(table_set.tables):
            offset, headers = headers_guess(row_set.sample)
            headers = map(convert_, headers)
            log.debug("Headers: %r", headers)
            if len(headers) <= 1:
                continue
            sheets += 1

            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))

            values = defaultdict(lambda: defaultdict(int))

            raw_table_name = 'raw_%s_sheet%s' % (row['resource_id'], sheet_id)
            sl.drop_table(connection, raw_table_name)
            raw_table = sl.get_table(connection, raw_table_name)

            for row_ in row_set:
                cells = dict([(keyify(c.column), convert_(c.value)) for c in row_ if \
                    len(c.column.strip())])
                for cell, value in cells.items():
                    values[cell][value] += 1
                sl.add_row(connection, raw_table, cells)

        trans.commit()
        return sheets > 0, sheets
    except Exception, ex:
        log.exception(ex)
        issue(engine, row['resource_id'], row['retrieve_hash'],
              unicode(ex))
        return False, 0