def combine_sheet(engine, resource, sheet_id, table, mapping):
    begin = time.time()
    base = {
        'resource_id': resource['resource_id'],
        'resource_hash': resource['extract_hash'],
        'sheet_id': sheet_id,
        }
    spending_table = sl.get_table(engine, 'spending')
    connection = engine.connect()
    trans = connection.begin()
    try:
        rows = 0
        sl.delete(connection, spending_table,
                  resource_id=resource['resource_id'],
                  sheet_id=sheet_id)
        for row in sl.all(connection, table):
            data = dict(base)
            for col, value in row.items():
                if col == 'id':
                    data['row_id'] = value
                    continue
                mapped = mapping.get(col)
                if mapped is not None:
                    data[mapped] = value
            sl.add_row(connection, spending_table, data)
            rows += 1
        trans.commit()
        log.info("Loaded %s rows in %s ms", rows,
                 int((time.time() - begin) * 1000))
        return rows > 0
    finally:
        connection.close()

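# Illustrative call-site sketch (not part of the original source): driving
# combine_sheet for a single extracted sheet. The raw table name follows the
# 'raw_<resource_id>_sheet<n>' convention used by extract_resource_core later
# in this collection; the column mapping dict here is purely hypothetical.
def _example_combine_sheet(engine, resource):
    sheet_id = 0
    raw_table = sl.get_table(engine,
                             'raw_%s_sheet%s' % (resource['resource_id'], sheet_id))
    mapping = {'transaction_date': 'date', 'amount': 'amount'}  # hypothetical raw -> spending columns
    return combine_sheet(engine, resource, sheet_id, raw_table, mapping)
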
def cleanup_sheet(engine, row, sheet_id):
    spending_table = sl.get_table(engine, 'spending')
    data = list(sl.find(engine, spending_table,
                        resource_id=row['resource_id'],
                        sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        if None in date_formats.values():
            log.warn("Couldn't detect date formats: %r", date_formats)
            issue(engine, row['resource_id'], row['retrieve_hash'],
                  "Couldn't detect date formats", repr(date_formats))
            return False

        sl.delete(connection, spending_table,
                  resource_id=row['resource_id'],
                  sheet_id=sheet_id)
        for row in data:
            row = cleanup_dates.apply(row, date_formats)
            row = cleanup_numbers.apply(row)
            row = cleanup_gov.apply(row)
            #row = cleanup_supplier.apply(row, engine)
            del row['id']
            sl.add_row(connection, spending_table, row)
        trans.commit()
        return True
    finally:
        connection.close()

def retrieve(row, engine, force):
    ret_table = sl.get_table(engine, 'retrieval_log')
    #print row.get('package_name'), row['url'].encode('utf-8')
    try:
        import os
        if not force and os.path.exists(source_path(row)):
            return
        url = fix_url(row['url'])
        print "Fetching %s" % url
        res = urllib2.urlopen(url)
        fh = open(source_path(row), 'wb')
        fh.write(res.read())
        sl.add_row(engine, ret_table, {
            'resource_id': row['resource_id'],
            'status': '200',
            'message': "",
            'content-type': res.headers.get('content-type', ''),
            'timestamp': datetime.now()
            })
    except Exception, ioe:
        print traceback.format_exc()
        status = 0
        if hasattr(ioe, 'code'):
            status = ioe.code
        sl.add_row(engine, ret_table, {
            'resource_id': row['resource_id'],
            'status': status,
            'message': unicode(ioe),
            'timestamp': datetime.now()
            })
        assert False, unicode(ioe).encode('utf-8')

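# Illustrative sketch (not part of the original source): retrieve() writes a
# retrieval_log row and raises AssertionError on failure, so a caller that
# wants to keep going past broken URLs has to catch it. `resource_rows` is an
# assumed iterable of rows carrying 'resource_id' and 'url' keys.
def _example_retrieve_all(engine, resource_rows, force=False):
    for row in resource_rows:
        try:
            retrieve(row, engine, force)
        except AssertionError, e:
            print "Retrieval failed for %s: %s" % (row['resource_id'], e)
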
def condense(engine, resource_id, table_id, force):
    table_suffix = '%s_table%s' % (resource_id, table_id)

    if not engine.has_table('raw_%s' % table_suffix):
        return

    condensed_table = sl.get_table(engine, 'condensed')

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, condensed_table,
                                 resource_id=resource_id,
                                 table_id=table_id) is not None:
        return

    connection = engine.connect()
    trans = connection.begin()

    start = time.time()
    try:
        raw_table = sl.get_table(connection, 'raw_%s' % table_suffix)
        sl.drop_table(connection, 'spending_%s' % table_suffix)
        spending_table = sl.get_table(connection, 'spending_%s' % table_suffix)
        columns_table = sl.get_table(connection, 'column_sets')

        normalise_map = normalised_columns_map(raw_table)
        normalised_headers = ','.join(sorted(normalise_map.values()))
        mapping_row = sl.find_one(connection, columns_table,
                                  normalised=normalised_headers)

        if mapping_row is None or not mapping_row.get('valid'):
            # This table is unmapped, cannot be condensed
            return

        column_mapping = json.loads(mapping_row['column_map'])

        # Build the final mapping from input column to output column
        mapping = {}
        for k, n in normalise_map.iteritems():
            if n in column_mapping and column_mapping[n] is not None \
                    and len(column_mapping[n]) > 0:
                mapping[k] = column_mapping[n]

        for row in sl.all(connection, raw_table):
            spending_row = {}
            for key, value in row.items():
                if key not in mapping:
                    continue
                if not value or not len(value.strip()):
                    continue
                if mapping[key] in spending_row:
                    continue
                spending_row[mapping[key]] = value.strip()
            #print spending_row
            sl.add_row(connection, spending_table, spending_row)

        sl.upsert(connection, condensed_table,
                  {'resource_id': resource_id,
                   'table_id': table_id,
                   'condense_time': time.time() - start,
                   },
                  ['resource_id', 'table_id'])
        trans.commit()
    finally:
        connection.close()

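# Illustrative sketch (not part of the original source): the rough shape of a
# 'column_sets' row that condense() expects. 'normalised' is the sorted,
# comma-joined list of normalised headers and 'column_map' maps each
# normalised header to an output column (None or '' means "drop it"). The
# values shown here are made up.
_EXAMPLE_COLUMN_SET_ROW = {
    'normalised': 'amount,date,supplier',
    'valid': True,
    'column_map': json.dumps({'amount': 'amount',
                              'date': 'date',
                              'supplier': None}),
}
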
def issue(engine, resource_id, resource_hash, stage, message, data={}):
    table = sl.get_table(engine, 'issue')
    log = logging.getLogger('issue')
    log.debug("R[%s]: %s", resource_id, message)
    sl.add_row(engine, table, {
        'resource_id': resource_id,
        'resource_hash': resource_hash,
        'timestamp': datetime.datetime.utcnow(),
        'stage': stage,
        'message': message,
        'data': json.dumps(data)
        })

def issue(engine, resource_id, resource_hash, stage, message, data={}):
    import sqlaload as sl  # this import is slow, so it is done inside this func
    table = sl.get_table(engine, 'issue')
    log = logging.getLogger('issue')
    log.debug("R[%s]: %s", resource_id, message)
    sl.add_row(engine, table, {
        'resource_id': resource_id,
        'resource_hash': resource_hash,
        'timestamp': datetime.datetime.utcnow(),
        'stage': stage,
        'message': message,
        'data': json.dumps(data)
        })

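# Illustrative call-site sketch (not part of the original source): a pipeline
# stage reporting a problem against a resource. The 'retrieve_hash' key and a
# STAGE constant are used this way by the cleanup/extract functions elsewhere
# in this collection; the message and data payload here are made up.
def _example_report_issue(engine, row):
    issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
          "Example issue message", data={'detail': 'hypothetical context'})
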
def scrape_activity(engine, url, elem):
    urheber = elem.findtext("URHEBER")
    fundstelle = elem.findtext("FUNDSTELLE")
    Position = sl.get_table(engine, 'position')
    p = {'source_url': url,
         'urheber': urheber,
         'fundstelle': fundstelle}
    pos_keys = p.copy()
    p['zuordnung'] = elem.findtext("ZUORDNUNG")
    p['abstrakt'] = elem.findtext("VP_ABSTRAKT")
    p['fundstelle_url'] = elem.findtext("FUNDSTELLE_LINK")

    Zuweisung = sl.get_table(engine, 'zuweisung')
    for zelem in elem.findall("ZUWEISUNG"):
        z = pos_keys.copy()
        z['text'] = zelem.findtext("AUSSCHUSS_KLARTEXT")
        z['federfuehrung'] = zelem.find("FEDERFUEHRUNG") is not None
        z['gremium_key'] = DIP_GREMIUM_TO_KEY.get(z['text'])
        sl.add_row(engine, Zuweisung, z)

    Beschluss = sl.get_table(engine, 'beschluss')
    for belem in elem.findall("BESCHLUSS"):
        b = pos_keys.copy()
        b['seite'] = belem.findtext("BESCHLUSSSEITE")
        b['dokument_text'] = belem.findtext("BEZUGSDOKUMENT")
        b['tenor'] = belem.findtext("BESCHLUSSTENOR")
        b['grundlage'] = belem.findtext("GRUNDLAGE")
        sl.add_row(engine, Beschluss, b)

    Referenz = sl.get_table(engine, 'referenz')
    try:
        dokument = dokument_by_url(p['fundstelle_url']) or \
            dokument_by_name(p['fundstelle'])
        dokument.update(pos_keys)
        sl.add_row(engine, Referenz, dokument)
    except Exception, e:
        log.exception(e)

def cleanup_sheet(engine, row, sheet_id, data_row_filter, stats_spending):
    spending_table = sl.get_table(engine, 'spending')
    data = list(sl.find(engine, spending_table,
                        resource_id=row['resource_id'],
                        sheet_id=sheet_id))
    if not data:
        log.info('Sheet has no rows')
        return False, None
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        for date_format in date_formats.values():
            if isinstance(date_format, basestring):
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                      "Couldn't detect date formats because: %s" % date_format,
                      repr(date_formats))
                return True, date_format

        if not data_row_filter:
            sl.delete(connection, spending_table,
                      resource_id=row['resource_id'],
                      sheet_id=sheet_id)
        for row in data:
            if data_row_filter and data_row_filter != row['row_id']:
                continue
            row = cleanup_dates.apply(row, date_formats, stats_spending)
            row = cleanup_numbers.apply(row, stats_spending)
            row = cleanup_gov.apply(row, stats_spending)
            #row = cleanup_supplier.apply(row, engine)
            del row['id']
            sl.add_row(connection, spending_table, row)
        trans.commit()
        return True, None
    finally:
        connection.close()

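# Illustrative sketch (not part of the original source): consuming the
# (success, error) pair returned by this version of cleanup_sheet. A string in
# the second slot means date-format detection failed for the sheet and an
# issue row has already been written.
def _example_cleanup(engine, row, sheet_id, stats_spending):
    ok, date_error = cleanup_sheet(engine, row, sheet_id,
                                   data_row_filter=None,
                                   stats_spending=stats_spending)
    if date_error:
        log.warn("Sheet %s of %s had undetectable dates: %s",
                 sheet_id, row['resource_id'], date_error)
    return ok
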
def emit(data):
    # Relies on `year`, `engine`, `table` and FIGURE_FIELDS being defined in
    # the enclosing (or module) scope.
    if not 'amount_n_total' in data and not 'amount_n_pay' in data:
        return
    for colyear, d in [('amount_nm2_', 2), ('amount_nm1_', 1), ('amount_n_', 0)]:
        rec = {'year': year - d}
        for k, v in data.items():
            if k.startswith(colyear):
                k = k.replace(colyear, 'amount_')
                rec[k] = v
            elif not k.startswith('amount_'):
                rec[k] = v
        for figure_type, fields in FIGURE_FIELDS.items():
            r = rec.copy()
            r.update({
                'amount': rec.get(fields[0]),
                'amount_reserve_total': rec.get(fields[1]),
                'amount_reserve_figure': rec.get(fields[2]),
                'figure_type': figure_type
                })
            # Strip all the raw per-figure columns before writing the row.
            for fields in FIGURE_FIELDS.values():
                for f in fields:
                    r.pop(f, None)
            sl.add_row(engine, table, r)

def extract_resource_core(engine, row, stats):
    connection = engine.connect()
    fh = open(source_path(row), 'rb')
    source_data = fh.read()
    if not len(source_data):
        issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
              "Empty file")
        stats.add_source('Empty file', row)
        return False, 0
    if html_re.search(source_data[0:1024]) is not None:
        issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
              "HTML file detected, not a transaction report")
        stats.add_source('HTML file', row)
        return False, 0
    if source_data.startswith('%PDF'):
        issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
              "PDF file detected, not a transaction report")
        stats.add_source('PDF file', row)
        return False, 0

    trans = connection.begin()
    start = time.time()
    try:
        if source_data.startswith(COMPDOC_SIGNATURE):
            fh.seek(0)
            table_set = XLSTableSet(fh)
        elif source_data.startswith('PK'):
            table_set = XLSXTableSet(filename=source_path(row))
        else:
            #fh.seek(0)
            from StringIO import StringIO
            sio = StringIO(source_data)
            encoding = None
            detected = chardet.detect(source_data[:200])
            log.debug('Encoding detected as: %s', detected.get('encoding'))
            if detected.get('encoding') == 'ISO-8859-2' and '\xa3' in source_data:
                # Detected as Latin2 but probably isn't - that is for Eastern
                # European languages. Probably because the presence of a GBP
                # pound sign has foxed chardet. It is pretty certain that it is
                # a single-byte ASCII-variant, and my money is on Windows-1252
                encoding = 'windows-1252'
                log.debug('Probably not ISO-8859-2 because it has GBP symbol, '
                          'so assuming it is Windows-1252')
            table_set = CSVTableSet(sio, encoding=encoding)

        sheets = 0
        for sheet_id, row_set in enumerate(table_set.tables):
            offset, headers = headers_guess(row_set.sample)
            headers = map(convert_, headers)
            log.debug("Headers: %r", headers)
            if len(headers) <= 1:
                continue
            sheets += 1

            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))

            values = defaultdict(lambda: defaultdict(int))
            raw_table_name = 'raw_%s_sheet%s' % (row['resource_id'], sheet_id)
            sl.drop_table(connection, raw_table_name)
            raw_table = sl.get_table(connection, raw_table_name)

            # with one header row, offset=0 and we want row_number=1 so that
            # the first data row is row_number=2, matching the row number as
            # seen in Excel
            row_number = offset + 1
            for row_ in row_set:
                cells = dict([(keyify(c.column), convert_(c.value))
                              for c in row_ if len(c.column.strip())])
                row_number += 1
                if is_row_blank(cells):
                    continue
                for cell, value in cells.items():
                    values[cell][value] += 1
                cells['row_number'] = row_number
                sl.add_row(connection, raw_table, cells)
        trans.commit()
        log.debug(stats.add_source('Extracted ok', row))
        return sheets > 0, sheets
    except Exception, ex:
        log.exception(ex)
        issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
              unicode(ex))
        stats.add_source('Exception: %s' % ex.__class__.__name__, row)
        return False, 0

def extract_table(engine, table, row, resource_id, force):
    # For now, interpret lack of data as not-failure at this stage, on
    # the basis that it was already reported as failure at the
    # retrieve stage and will just clutter up this list.
    if not os.path.exists(source_path(row)):
        return
    # assert os.path.exists(source_path(row)), "No source file exists."

    connection = engine.connect()
    extracted_table = sl.get_table(connection, "extracted")

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, extracted_table,
                                 resource_id=resource_id) is not None:
        return

    fh = open(source_path(row), "rb")
    source_data = fh.read()
    assert len(source_data) > 0, "Empty file"
    assert html_re.search(source_data[0:1024]) is None, "Looks like HTML"
    assert not source_data.startswith("%PDF"), "Looks like PDF"

    trans = connection.begin()
    start = time.time()
    try:
        if source_data.startswith(COMPDOC_SIGNATURE):
            fh.seek(0)
            table_set = XLSTableSet.from_fileobj(fh)
        elif source_data.startswith("PK"):
            table_set = XLSXTableSet(source_path(row))
        else:
            cd = chardet.detect(source_data)
            fh.close()
            fh = codecs.open(source_path(row), "r", cd["encoding"])
            table_set = CSVTableSet.from_fileobj(fh)

        for table_id, row_set in enumerate(table_set.tables):
            # types = type_guess(row_set.sample)
            # row_set.register_processor(types_processor(types))
            offset, headers = headers_guess(row_set.sample)
            headers = map(convert_, headers)
            assert len(headers) > 1 or len(table_set.tables) > 1, \
                "Only one column was detected; assuming this is not valid data."
            # print headers

            # We might have multiple table sets where one is blank or ranty
            # text or something. Skip those.
            if len(headers) <= 1:
                continue

            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))

            values = defaultdict(lambda: defaultdict(int))
            raw_table_name = "raw_%s_table%s" % (resource_id, table_id)
            sl.drop_table(connection, raw_table_name)
            raw_table = sl.get_table(connection, raw_table_name)

            for row_ in row_set:
                cells = dict([(keyify(c.column), convert_(c.value))
                              for c in row_ if len(c.column.strip())])
                for cell, value in cells.items():
                    values[cell][value] += 1
                sl.add_row(connection, raw_table, cells)

            sl.upsert(connection, extracted_table,
                      {"resource_id": resource_id,
                       "max_table_id": table_id,
                       "extraction_time": time.time() - start},
                      ["resource_id"])
        trans.commit()
    # except Exception:
    #     traceback.print_exc()
    #     #log.exception(ex)
    #     assert False, traceback.format_exc()
    finally:
        connection.close()
        fh.close()

        b['seite'] = belem.findtext("BESCHLUSSSEITE")
        b['dokument_text'] = belem.findtext("BEZUGSDOKUMENT")
        b['tenor'] = belem.findtext("BESCHLUSSTENOR")
        b['grundlage'] = belem.findtext("GRUNDLAGE")
        sl.add_row(engine, Beschluss, b)

    Referenz = sl.get_table(engine, 'referenz')
    try:
        dokument = dokument_by_url(p['fundstelle_url']) or \
            dokument_by_name(p['fundstelle'])
        dokument.update(pos_keys)
        sl.add_row(engine, Referenz, dokument)
    except Exception, e:
        log.exception(e)
    sl.add_row(engine, Position, p)

    Person = sl.get_table(engine, 'person')
    Beitrag = sl.get_table(engine, 'beitrag')
    for belem in elem.findall("PERSOENLICHER_URHEBER"):
        b = pos_keys.copy()
        b['vorname'] = belem.findtext("VORNAME")
        b['nachname'] = belem.findtext("NACHNAME")
        b['funktion'] = belem.findtext("FUNKTION")
        b['ort'] = belem.findtext('WAHLKREISZUSATZ')
        p = sl.find_one(engine, Person,
                        vorname=b['vorname'],
                        nachname=b['nachname'],
                        ort=b['ort'])
        if p is not None:
            b['person_source_url'] = p['source_url']
        b['ressort'] = belem.findtext("RESSORT")

def extract_resource_core(engine, row):
    connection = engine.connect()
    fh = open(source_path(row), 'rb')
    source_data = fh.read()
    if not len(source_data):
        issue(engine, row['resource_id'], row['retrieve_hash'],
              "Empty file")
        return False, 0
    if html_re.search(source_data[0:1024]) is not None:
        issue(engine, row['resource_id'], row['retrieve_hash'],
              "HTML file detected, not a transaction report")
        return False, 0
    if source_data.startswith('%PDF'):
        issue(engine, row['resource_id'], row['retrieve_hash'],
              "PDF file detected, not a transaction report")
        return False, 0

    trans = connection.begin()
    start = time.time()
    try:
        if source_data.startswith(COMPDOC_SIGNATURE):
            fh.seek(0)
            table_set = XLSTableSet(fh)
        elif source_data.startswith('PK'):
            table_set = XLSXTableSet(source_path(row))
        else:
            #fh.seek(0)
            from StringIO import StringIO
            sio = StringIO(source_data)
            #cd = chardet.detect(source_data)
            #fh.close()
            #fh = codecs.open(source_path(row), 'r', cd['encoding'] or 'utf-8')
            table_set = CSVTableSet(sio)

        sheets = 0
        for sheet_id, row_set in enumerate(table_set.tables):
            offset, headers = headers_guess(row_set.sample)
            headers = map(convert_, headers)
            log.debug("Headers: %r", headers)
            if len(headers) <= 1:
                continue
            sheets += 1

            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))

            values = defaultdict(lambda: defaultdict(int))
            raw_table_name = 'raw_%s_sheet%s' % (row['resource_id'], sheet_id)
            sl.drop_table(connection, raw_table_name)
            raw_table = sl.get_table(connection, raw_table_name)

            for row_ in row_set:
                cells = dict([(keyify(c.column), convert_(c.value))
                              for c in row_ if len(c.column.strip())])
                for cell, value in cells.items():
                    values[cell][value] += 1
                sl.add_row(connection, raw_table, cells)
        trans.commit()
        return sheets > 0, sheets
    except Exception, ex:
        log.exception(ex)
        issue(engine, row['resource_id'], row['retrieve_hash'], unicode(ex))
        return False, 0

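# Illustrative sketch (not part of the original source): extract_resource_core
# returns (any_sheets_loaded, sheet_count), which a hypothetical driver could
# use to decide whether the combine/cleanup stages above are worth running.
def _example_extract(engine, row):
    ok, sheets = extract_resource_core(engine, row)
    if not ok:
        log.warn("No usable sheets extracted for %s", row['resource_id'])
    return sheets
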