def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
    '''Loads an Excel file (or other tabular data recognized by messytables)
    into Datastore and creates indexes.

    Largely copied from datapusher - see below. Is slower than load_csv.
    '''
    # use messytables to determine the header row
    logger.info('Determining column names and types')
    ct = mimetype
    format = os.path.splitext(table_filepath)[1]  # filename extension
    with open(table_filepath, 'rb') as tmp:

        #
        # Copied from datapusher/jobs.py:push_to_datastore
        #

        try:
            table_set = messytables.any_tableset(tmp, mimetype=ct,
                                                 extension=ct)
        except messytables.ReadError as e:
            # try again with format
            tmp.seek(0)
            try:
                table_set = messytables.any_tableset(tmp, mimetype=format,
                                                     extension=format)
            except Exception as e:
                raise LoaderError(e)

        if not table_set.tables:
            raise LoaderError('Could not parse file as tabular data')
        row_set = table_set.tables.pop()
        offset, headers = messytables.headers_guess(row_set.sample)

        existing = datastore_resource_exists(resource_id)
        existing_info = None
        if existing:
            existing_info = dict((f['id'], f['info'])
                                 for f in existing.get('fields', [])
                                 if 'info' in f)

        # Some headers might have been converted from strings to floats and such.
        headers = encode_headers(headers)

        row_set.register_processor(messytables.headers_processor(headers))
        row_set.register_processor(messytables.offset_processor(offset + 1))
        TYPES, TYPE_MAPPING = get_types()
        types = messytables.type_guess(row_set.sample, types=TYPES,
                                       strict=True)

        # override with types user requested
        if existing_info:
            types = [{
                'text': messytables.StringType(),
                'numeric': messytables.DecimalType(),
                'timestamp': messytables.DateUtilType(),
            }.get(existing_info.get(h, {}).get('type_override'), t)
                for t, h in zip(types, headers)]

        row_set.register_processor(messytables.types_processor(types))

        headers = [
            header.strip()[:MAX_COLUMN_LENGTH]
            for header in headers
            if header.strip()
        ]
        headers_set = set(headers)

        def row_iterator():
            for row in row_set:
                data_row = {}
                for index, cell in enumerate(row):
                    column_name = cell.column.strip()
                    if column_name not in headers_set:
                        continue
                    data_row[column_name] = cell.value
                yield data_row
        result = row_iterator()

        '''
        Delete existing datastore resource before proceeding. Otherwise
        'datastore_create' will append to the existing datastore. And if
        the fields have significantly changed, it may also fail.
        '''
        if existing:
            logger.info('Deleting "{res_id}" from datastore.'.format(
                res_id=resource_id))
            delete_datastore_resource(resource_id)

        headers_dicts = [
            dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
            for field in zip(headers, types)
        ]

        # Maintain data dictionaries from matching column names
        if existing_info:
            for h in headers_dicts:
                if h['id'] in existing_info:
                    h['info'] = existing_info[h['id']]
                    # create columns with types user requested
                    type_override = existing_info[h['id']].get('type_override')
                    if type_override in _TYPE_MAPPING.values():
                        h['type'] = type_override

        logger.info('Determined headers and types: {headers}'.format(
            headers=headers_dicts))

        ### Commented - this is only for tests
        # if dry_run:
        #     return headers_dicts, result

        logger.info('Copying to database...')
        count = 0
        for i, records in enumerate(chunky(result, 250)):
            count += len(records)
            logger.info('Saving chunk {number}'.format(number=i))
            send_resource_to_datastore(resource_id, headers_dicts, records)
        logger.info('...copying done')

        if count:
            logger.info(
                'Successfully pushed {n} entries to "{res_id}".'.format(
                    n=count, res_id=resource_id))
        else:
            # no datastore table is created
            raise LoaderError('No entries found - nothing to load')
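

# --- Illustrative sketch (not part of the original module) ------------------
# load_table() above batches the row iterator with chunky(result, 250) and
# then measures each batch with len(records), which implies that chunky yields
# lists of at most `n` records. The helper below is a minimal, hedged sketch of
# such a batching function under that assumption; the name _chunky_sketch is
# hypothetical and intentionally distinct from the real chunky helper.
import itertools


def _chunky_sketch(iterable, n):
    """Yield successive lists of up to ``n`` items from ``iterable``."""
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk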
def push_to_datastore(task_id, input, dry_run=False):
    '''Download and parse a resource, then push its data into CKAN's
    DataStore.

    An asynchronous job that gets a resource from CKAN, downloads the
    resource's data file and, if the data file has changed since last time,
    parses the data and posts it into CKAN's DataStore.

    :param dry_run: Fetch and parse the data file but don't actually post the
        data to the DataStore, instead return the data headers and rows that
        would have been posted.
    :type dry_run: boolean
    '''
    handler = util.StoringHandler(task_id, input)
    logger = logging.getLogger(task_id)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']

    ckan_url = data['ckan_url']
    resource_id = data['resource_id']
    api_key = input.get('api_key')

    try:
        resource = get_resource(resource_id, ckan_url, api_key)
    except util.JobError as e:
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource = get_resource(resource_id, ckan_url, api_key)

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Dump files are managed with the Datastore API')
        return

    # check scheme
    url = resource.get('url')
    scheme = urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise util.JobError(
            'Only http, https, and ftp resources may be fetched.'
        )

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    headers = {}
    if resource.get('url_type') == 'upload':
        # If this is an uploaded file to CKAN, authenticate the request,
        # otherwise we won't get file from private resources
        headers['Authorization'] = api_key
    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            verify=SSL_VERIFY,
            stream=True,  # just gets the headers for now
        )
        response.raise_for_status()

        cl = response.headers.get('content-length')
        try:
            if cl and int(cl) > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to download: {cl} > max ({max_cl}).'
                    .format(cl=cl, max_cl=MAX_CONTENT_LENGTH))
        except ValueError:
            pass

        tmp = tempfile.TemporaryFile()
        length = 0
        m = hashlib.md5()
        for chunk in response.iter_content(CHUNK_SIZE):
            length += len(chunk)
            if length > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to process: {cl} > max ({max_cl}).'
                    .format(cl=length, max_cl=MAX_CONTENT_LENGTH))
            tmp.write(chunk)
            m.update(chunk)
        ct = response.headers.get('content-type', '').split(';', 1)[0]
    except requests.HTTPError as e:
        raise HTTPError(
            "DataPusher received a bad HTTP response when trying to download "
            "the data file",
            status_code=e.response.status_code,
            request_url=url, response=e.response.content)
    except requests.RequestException as e:
        raise HTTPError(
            message=str(e), status_code=None,
            request_url=url, response=None)

    file_hash = m.hexdigest()
    tmp.seek(0)

    if (resource.get('hash') == file_hash
            and not data.get('ignore_hash')):
        logger.info("The file hash hasn't changed: {hash}.".format(
            hash=file_hash))
        return

    resource['hash'] = file_hash

    try:
        table_set = messytables.any_tableset(tmp, mimetype=ct, extension=ct)
    except messytables.ReadError as e:
        # try again with format
        tmp.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(tmp, mimetype=format,
                                                 extension=format)
        except Exception:
            raise util.JobError(e)

    get_row_set = web.app.config.get('GET_ROW_SET',
                                     lambda table_set: table_set.tables.pop())
    row_set = get_row_set(table_set)
    offset, headers = messytables.headers_guess(row_set.sample)

    existing = datastore_resource_exists(resource_id, api_key, ckan_url)
    existing_info = None
    if existing:
        existing_info = dict((f['id'], f['info'])
                             for f in existing.get('fields', [])
                             if 'info' in f)

    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    # override with types user requested
    if existing_info:
        types = [{
            'text': messytables.StringType(),
            'numeric': messytables.DecimalType(),
            'timestamp': messytables.DateUtilType(),
        }.get(existing_info.get(h, {}).get('type_override'), t)
            for t, h in zip(types, headers)]

    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                if isinstance(cell.value, str):
                    try:
                        data_row[column_name] = cell.value.encode(
                            'latin-1').decode('utf-8')
                    except (UnicodeDecodeError, UnicodeEncodeError):
                        data_row[column_name] = cell.value
                else:
                    data_row[column_name] = cell.value
            yield data_row
    result = row_iterator()

    '''
    Delete existing datastore resource before proceeding. Otherwise
    'datastore_create' will append to the existing datastore. And if
    the fields have significantly changed, it may also fail.
    '''
    if existing:
        logger.info('Deleting "{res_id}" from datastore.'.format(
            res_id=resource_id))
        delete_datastore_resource(resource_id, api_key, ckan_url)

    headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
                     for field in zip(headers, types)]

    # Maintain data dictionaries from matching column names
    if existing_info:
        for h in headers_dicts:
            if h['id'] in existing_info:
                h['info'] = existing_info[h['id']]
                # create columns with types user requested
                type_override = existing_info[h['id']].get('type_override')
                if type_override in list(_TYPE_MAPPING.values()):
                    h['type'] = type_override

    logger.info('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    if dry_run:
        return headers_dicts, result

    count = 0
    for i, chunk in enumerate(chunky(result, 250)):
        records, is_it_the_last_chunk = chunk
        count += len(records)
        logger.info('Saving chunk {number} {is_last}'.format(
            number=i, is_last='(last)' if is_it_the_last_chunk else ''))
        send_resource_to_datastore(resource, headers_dicts, records,
                                   is_it_the_last_chunk, api_key, ckan_url)

    logger.info('Successfully pushed {n} entries to "{res_id}".'.format(
        n=count, res_id=resource_id))

    if data.get('set_url_type', False):
        update_resource(resource, api_key, ckan_url)
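

# --- Illustrative sketch (not part of the original module) ------------------
# In push_to_datastore() each chunk is unpacked as
# ``records, is_it_the_last_chunk = chunk``, so this chunky variant is assumed
# to yield ``(list_of_records, is_last)`` tuples; the flag lets the final
# send_resource_to_datastore call mark the load as complete. A hedged sketch
# under that assumption (hypothetical name, not the actual helper):
import itertools


def _chunky_with_last_flag(iterable, n):
    """Yield ``(chunk, is_last)`` pairs, each chunk holding up to ``n`` items."""
    it = iter(iterable)
    chunk = list(itertools.islice(it, n))
    while chunk:
        next_chunk = list(itertools.islice(it, n))
        yield chunk, not next_chunk
        chunk = next_chunk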
def _load_table_xlsx(table_filepath, resource_id, encoding, logger):
    from .excel import headers_guess, type_guess

    logger.info('Determining column names and types')
    wb = load_workbook(filename=table_filepath)
    ws = wb.worksheets[0]
    rows = list(ws.iter_rows())
    cols = ws.iter_cols()
    offset, headers = headers_guess(rows, tolerance=1)

    existing = datastore_resource_exists(resource_id)
    existing_info = None
    if existing:
        existing_info = dict((f['id'], f['info'])
                             for f in existing.get('fields', [])
                             if 'info' in f)

    types = type_guess(cols, strict=True, header_offset=offset)

    # override with types user requested
    if existing_info:
        types = [{
            'text': messytables.StringType(),
            'numeric': messytables.DecimalType(),
            'timestamp': messytables.DateUtilType(),
        }.get(existing_info.get(h, {}).get('type_override'), t)
            for t, h in zip(types, headers)]

    headers = [
        header.strip()[:MAX_COLUMN_LENGTH]
        for header in headers
        if header.strip()
    ]

    def data_row_iterator():
        for index, row in enumerate(rows):
            data_row = {}
            if index <= offset:
                continue
            for hi, header in enumerate(headers):
                data_row[header] = row[hi].value
            # for ci, cell in enumerate(row):
            #     column_name = headers[ci]
            #     data_row[column_name] = cell.value
            yield data_row
    data_rows = data_row_iterator()

    if existing:
        logger.info(
            'Deleting "{res_id}" from datastore.'.format(res_id=resource_id))
        delete_datastore_resource(resource_id)

    headers_dicts = [
        dict(id=field[0], type=_TYPE_MAPPING[str(field[1])])
        for field in zip(headers, types)
    ]

    # Maintain data dictionaries from matching column names
    if existing_info:
        for h in headers_dicts:
            if h['id'] in existing_info:
                h['info'] = existing_info[h['id']]
                # create columns with types user requested
                type_override = existing_info[h['id']].get('type_override')
                if type_override in list(_TYPE_MAPPING.values()):
                    h['type'] = type_override

    logger.info('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    logger.info('Copying to database...')
    count = 0
    for i, records in enumerate(chunky(data_rows, 250)):
        count += len(records)
        logger.info('Saving chunk {number}'.format(number=i))
        send_resource_to_datastore(resource_id, headers_dicts, records)
    logger.info('...copying done')

    if count:
        logger.info('Successfully pushed {n} entries to "{res_id}".'.format(
            n=count, res_id=resource_id))
    else:
        # no datastore table is created
        raise LoaderError('No entries found - nothing to load')
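

# --- Illustrative usage (not part of the original module) -------------------
# A minimal sketch of how _load_table_xlsx() might be invoked. It assumes a
# configured environment in which datastore_resource_exists() and
# send_resource_to_datastore() can reach CKAN; the file path and resource id
# below are hypothetical placeholders.
import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    _load_table_xlsx(
        table_filepath='/tmp/example.xlsx',   # hypothetical local file
        resource_id='my-resource-id',         # hypothetical CKAN resource id
        encoding=None,                        # not used by the function body above
        logger=logging.getLogger('xloader'),
    )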