Example #1
    if existing:
        existing_info = dict((f['id'], f['info'])
                             for f in existing.get('fields', [])
                             if 'info' in f)

    # Some headers might have been converted from strings to floats and such.
    headers = [unicode(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    # override with types user requested
    if existing_info:
        types = [{
            'text': messytables.StringType(),
            'numeric': messytables.DecimalType(),
            'timestamp': messytables.DateUtilType(),
        }.get(existing_info.get(h, {}).get('type_override'), t)
                 for t, h in zip(types, headers)]

    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
Example #2
def push_to_datastore(task_id, input, dry_run=False):
    '''Download and parse a resource, then push its data into CKAN's DataStore.

    An asynchronous job that gets a resource from CKAN, downloads the
    resource's data file and, if the data file has changed since last time,
    parses the data and posts it into CKAN's DataStore.

    :param dry_run: Fetch and parse the data file but don't actually post the
        data to the DataStore, instead return the data headers and rows that
        would have been posted.
    :type dry_run: boolean

    '''
    handler = util.StoringHandler(task_id, input)
    logger = logging.getLogger(task_id)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']

    ckan_url = data['ckan_url']
    resource_id = data['resource_id']
    api_key = input.get('api_key')

    try:
        resource = get_resource(resource_id, ckan_url, api_key)
    except util.JobError:
        # try again in 5 seconds just in case CKAN is slow at adding the resource
        time.sleep(5)
        resource = get_resource(resource_id, ckan_url, api_key)

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Dump files are managed with the Datastore API')
        return

    # check scheme
    url = resource.get('url')
    scheme = urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise util.JobError(
            'Only http, https, and ftp resources may be fetched.'
        )

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    headers = {}
    if resource.get('url_type') == 'upload':
        # If this is an uploaded file to CKAN, authenticate the request,
        # otherwise we won't get file from private resources
        headers['Authorization'] = api_key
    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            verify=SSL_VERIFY,
            stream=True,  # just gets the headers for now
        )
        response.raise_for_status()

        cl = response.headers.get('content-length')
        try:
            if cl and int(cl) > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to download: {cl} > max ({max_cl}).'
                    .format(cl=cl, max_cl=MAX_CONTENT_LENGTH))
        except ValueError:
            pass

        # Stream the body into a temporary file in CHUNK_SIZE pieces, enforcing
        # MAX_CONTENT_LENGTH as we go and building an MD5 hash of the content so
        # unchanged files can be skipped further down.
        tmp = tempfile.TemporaryFile()
        length = 0
        m = hashlib.md5()
        for chunk in response.iter_content(CHUNK_SIZE):
            length += len(chunk)
            if length > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to process: {cl} > max ({max_cl}).'
                    .format(cl=length, max_cl=MAX_CONTENT_LENGTH))
            tmp.write(chunk)
            m.update(chunk)

        ct = response.headers.get('content-type', '').split(';', 1)[0]

    except requests.HTTPError as e:
        raise HTTPError(
            "DataPusher received a bad HTTP response when trying to download "
            "the data file", status_code=e.response.status_code,
            request_url=url, response=e.response.content)
    except requests.RequestException as e:
        raise HTTPError(
            message=str(e), status_code=None,
            request_url=url, response=None)

    file_hash = m.hexdigest()
    tmp.seek(0)

    if (resource.get('hash') == file_hash
            and not data.get('ignore_hash')):
        logger.info("The file hash hasn't changed: {hash}.".format(
            hash=file_hash))
        return

    resource['hash'] = file_hash

    try:
        table_set = messytables.any_tableset(tmp, mimetype=ct, extension=ct)
    except messytables.ReadError as e:
        # try again with format
        tmp.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(
                tmp, mimetype=format, extension=format)
        except Exception:
            raise util.JobError(e)

    # GET_ROW_SET is an optional hook in the web app config for choosing which
    # table to load; by default use the last table messytables found.
    get_row_set = web.app.config.get('GET_ROW_SET',
                                     lambda table_set: table_set.tables.pop())
    row_set = get_row_set(table_set)
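    # headers_guess() returns the index of the detected header row and the
    # header names found on that row.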
    offset, headers = messytables.headers_guess(row_set.sample)

    existing = datastore_resource_exists(resource_id, api_key, ckan_url)
    existing_info = None
    if existing:
        existing_info = dict((f['id'], f['info'])
            for f in existing.get('fields', []) if 'info' in f)

    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    # override with types user requested
    if existing_info:
        types = [{
            'text': messytables.StringType(),
            'numeric': messytables.DecimalType(),
            'timestamp': messytables.DateUtilType(),
            }.get(existing_info.get(h, {}).get('type_override'), t)
            for t, h in zip(types, headers)]

    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
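                # Repair strings that were decoded as latin-1 upstream by
                # re-encoding them and decoding as UTF-8, falling back to the
                # original value if that round trip fails.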
                if isinstance(cell.value, str):
                    try:
                        data_row[column_name] = cell.value.encode('latin-1').decode('utf-8')
                    except (UnicodeDecodeError, UnicodeEncodeError):
                        data_row[column_name] = cell.value
                else:
                    data_row[column_name] = cell.value
            yield data_row
    result = row_iterator()

    '''
    Delete the existing datastore resource before proceeding. Otherwise
    'datastore_create' will append to the existing datastore, and if the
    fields have significantly changed, it may also fail.
    '''
    if existing:
        logger.info('Deleting "{res_id}" from datastore.'.format(
            res_id=resource_id))
        delete_datastore_resource(resource_id, api_key, ckan_url)

    headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
                     for field in zip(headers, types)]

    # Maintain data dictionaries from matching column names
    if existing_info:
        for h in headers_dicts:
            if h['id'] in existing_info:
                h['info'] = existing_info[h['id']]
                # create columns with types user requested
                type_override = existing_info[h['id']].get('type_override')
                if type_override in list(_TYPE_MAPPING.values()):
                    h['type'] = type_override

    logger.info('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    if dry_run:
        return headers_dicts, result

    count = 0
    for i, chunk in enumerate(chunky(result, 250)):
        records, is_it_the_last_chunk = chunk
        count += len(records)
        logger.info('Saving chunk {number} {is_last}'.format(
            number=i, is_last='(last)' if is_it_the_last_chunk else ''))
        send_resource_to_datastore(resource, headers_dicts, records,
                                   is_it_the_last_chunk, api_key, ckan_url)

    logger.info('Successfully pushed {n} entries to "{res_id}".'.format(
        n=count, res_id=resource_id))

    if data.get('set_url_type', False):
        update_resource(resource, api_key, ckan_url)
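
For reference, a minimal dry-run sketch for push_to_datastore() above. The task id, URL, ids and key are placeholders, and the 'metadata'/'api_key' keys simply mirror what the function reads from its input dict:

# Hypothetical direct call for local testing; normally this runs as an
# asynchronous job rather than being called by hand.
payload = {
    'metadata': {
        'ckan_url': 'http://localhost:5000',
        'resource_id': 'my-resource-id',
    },
    'api_key': 'my-api-key',
}
headers, rows = push_to_datastore('example-task', payload, dry_run=True)
print(headers)      # column definitions that would have been created
for row in rows:    # rows that would have been posted
    print(row)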
Example #3
def datetime_convert(row_set, row):
    # Serialise datetime cells to ISO 8601 strings (and retype them as text)
    # so the rows can later be encoded as JSON.
    for cell in row:
        if isinstance(cell.value, datetime.datetime):
            cell.value = cell.value.isoformat()
            cell.type = messytables.StringType()
    return row
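
datetime_convert() has the messytables row-processor signature (row_set, row) -> row, so it can be registered like the built-in processors. A small sketch, assuming a row_set and types prepared as in the examples above:

# Register after the types processor so datetime values produced by type
# casting are turned into ISO 8601 strings before rows are yielded.
row_set.register_processor(messytables.types_processor(types))
row_set.register_processor(datetime_convert)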
Example #4
def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
    '''Loads an Excel file (or other tabular data recognized by messytables)
    into Datastore and creates indexes.

    Largely copied from datapusher - see the comment below. It is slower than load_csv.
    '''

    # use messytables to determine the header row
    logger.info('Determining column names and types')
    ct = mimetype
    format = os.path.splitext(table_filepath)[1]  # filename extension
    with open(table_filepath, 'rb') as tmp:

        #
        # Copied from datapusher/jobs.py:push_to_datastore
        #

        try:
            table_set = messytables.any_tableset(tmp,
                                                 mimetype=ct,
                                                 extension=ct)
        except messytables.ReadError as e:
            # try again with format
            tmp.seek(0)
            try:
                table_set = messytables.any_tableset(tmp,
                                                     mimetype=format,
                                                     extension=format)
            except Exception as e:
                raise LoaderError(e)

        if not table_set.tables:
            raise LoaderError('Could not parse file as tabular data')
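        # use the last table/sheet messytables detected in the file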
        row_set = table_set.tables.pop()
        offset, headers = messytables.headers_guess(row_set.sample)

        existing = datastore_resource_exists(resource_id)
        existing_info = None
        if existing:
            existing_info = dict((f['id'], f['info'])
                                 for f in existing.get('fields', [])
                                 if 'info' in f)

        # Some headers might have been converted from strings to floats and such.
        headers = encode_headers(headers)

        row_set.register_processor(messytables.headers_processor(headers))
        row_set.register_processor(messytables.offset_processor(offset + 1))
        TYPES, TYPE_MAPPING = get_types()
        types = messytables.type_guess(row_set.sample,
                                       types=TYPES,
                                       strict=True)

        # override with types user requested
        if existing_info:
            types = [{
                'text': messytables.StringType(),
                'numeric': messytables.DecimalType(),
                'timestamp': messytables.DateUtilType(),
            }.get(existing_info.get(h, {}).get('type_override'), t)
                     for t, h in zip(types, headers)]

        row_set.register_processor(messytables.types_processor(types))

        headers = [
            header.strip()[:MAX_COLUMN_LENGTH] for header in headers
            if header.strip()
        ]
        headers_set = set(headers)

        def row_iterator():
            for row in row_set:
                data_row = {}
                for index, cell in enumerate(row):
                    column_name = cell.column.strip()
                    if column_name not in headers_set:
                        continue
                    data_row[column_name] = cell.value
                yield data_row

        result = row_iterator()
        '''
        Delete the existing datastore resource before proceeding. Otherwise
        'datastore_create' will append to the existing datastore, and if the
        fields have significantly changed, it may also fail.
        '''
        if existing:
            logger.info('Deleting "{res_id}" from datastore.'.format(
                res_id=resource_id))
            delete_datastore_resource(resource_id)

        headers_dicts = [
            dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
            for field in zip(headers, types)
        ]

        # Maintain data dictionaries from matching column names
        if existing_info:
            for h in headers_dicts:
                if h['id'] in existing_info:
                    h['info'] = existing_info[h['id']]
                    # create columns with types user requested
                    type_override = existing_info[h['id']].get('type_override')
                    if type_override in _TYPE_MAPPING.values():
                        h['type'] = type_override

        logger.info('Determined headers and types: {headers}'.format(
            headers=headers_dicts))

        # Commented out - this is only for tests
        # if dry_run:
        #     return headers_dicts, result

        logger.info('Copying to database...')
        count = 0
        for i, records in enumerate(chunky(result, 250)):
            count += len(records)
            logger.info('Saving chunk {number}'.format(number=i))
            send_resource_to_datastore(resource_id, headers_dicts, records)
        logger.info('...copying done')

        if count:
            logger.info(
                'Successfully pushed {n} entries to "{res_id}".'.format(
                    n=count, res_id=resource_id))
        else:
            # no datastore table is created
            raise LoaderError('No entries found - nothing to load')
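
# A hypothetical direct call to load_table() above, for illustration only; the
# path, resource id and mimetype are placeholders, and in practice the call is
# made by the surrounding extension rather than by hand.
import logging

load_table('/tmp/sample.xls', 'my-resource-id',
           mimetype='application/vnd.ms-excel',
           logger=logging.getLogger('xloader'))
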
def _load_table_xlsx(table_filepath, resource_id, encoding, logger):
    from .excel import headers_guess, type_guess
    logger.info('Determining column names and types')

    wb = load_workbook(filename=table_filepath)
    ws = wb.worksheets[0]

    rows = list(ws.iter_rows())
    cols = ws.iter_cols()
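    # iter_cols() returns a generator of columns; type_guess() below uses it
    # (together with header_offset) to infer a type for each column.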
    offset, headers = headers_guess(rows, tolerance=1)

    existing = datastore_resource_exists(resource_id)
    existing_info = None
    if existing:
        existing_info = dict((f['id'], f['info'])
                             for f in existing.get('fields', [])
                             if 'info' in f)

    types = type_guess(cols, strict=True, header_offset=offset)

    # override with types user requested
    if existing_info:
        types = [{
            'text': messytables.StringType(),
            'numeric': messytables.DecimalType(),
            'timestamp': messytables.DateUtilType(),
        }.get(existing_info.get(h, {}).get('type_override'), t)
                 for t, h in zip(types, headers)]

    headers = [
        header.strip()[:MAX_COLUMN_LENGTH] for header in headers
        if header.strip()
    ]

    def data_row_iterator():
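        # Skip every row up to and including the detected header row, then map
        # the remaining cells onto the cleaned headers by position.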
        for index, row in enumerate(rows):
            data_row = {}
            if index <= offset:
                continue

            for hi, header in enumerate(headers):
                data_row[header] = row[hi].value

            # for ci, cell in enumerate(row):
            #   column_name = headers[ci]
            #   data_row[column_name] = cell.value
            yield data_row

    data_rows = data_row_iterator()

    if existing:
        logger.info(
            'Deleting "{res_id}" from datastore.'.format(res_id=resource_id))
        delete_datastore_resource(resource_id)

    headers_dicts = [
        dict(id=field[0], type=_TYPE_MAPPING[str(field[1])])
        for field in zip(headers, types)
    ]

    # Maintain data dictionaries from matching column names
    if existing_info:
        for h in headers_dicts:
            if h['id'] in existing_info:
                h['info'] = existing_info[h['id']]
                # create columns with types user requested
                type_override = existing_info[h['id']].get('type_override')
                if type_override in list(_TYPE_MAPPING.values()):
                    h['type'] = type_override

    logger.info('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    logger.info('Copying to database...')
    count = 0
    for i, records in enumerate(chunky(data_rows, 250)):
        count += len(records)
        logger.info('Saving chunk {number}'.format(number=i))
        send_resource_to_datastore(resource_id, headers_dicts, records)
    logger.info('...copying done')

    if count:
        logger.info('Successfully pushed {n} entries to "{res_id}".'.format(
            n=count, res_id=resource_id))
    else:
        # no datastore table is created
        raise LoaderError('No entries found - nothing to load')
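
chunky() is used above but not shown. A minimal batching sketch consistent with how load_table() and _load_table_xlsx() consume it, where each chunk is a plain list of at most n records (the datapusher variant in Example #2 additionally yields an is-last flag):

import itertools

def chunky(iterable, n):
    # Yield successive lists of at most n items without materialising the
    # whole row iterator in memory.
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk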