Exemplo n.º 1
0
def update(endpoint, **kwargs):
    """ Updates the database

    Collects records for one package (or for every package of every
    organization the CKAN user can read), sends each record to `endpoint`
    with `patch_or_post`, and reports how many succeeded.

    Args:
        endpoint (str): The api resource url.
        kwargs (dict): passed to CKAN constructor.

    Kwargs:
        pid (str): Package id to process. If omitted, the packages of all
            organizations returned by `organization_list(permission='read')`
            are processed.
        chunk_size (int): Number of rows to process at a time (default: All).
        row_limit (int): Total number of rows to process (default: All).
            The limit is checked after each chunk, so the final count can
            exceed it by less than one chunk.
        err_limit (int): Number of errors to encounter before failing
            (default: Inf). Errors are keyed by dataset id, so repeated
            failures of the same dataset count once.
        mock_freq: Forwarded to `gen_data`. It is read, not removed, so it
            is also part of the kwargs given to the CKAN constructor.

    Returns:
        (dict): Update details with the keys `rows_added`, `errors` and
            `elapsed_time`.

    Raises:
        Exception: If `err_limit` is set and that many errors accumulated.
            The exception argument is the errors dict.
    """
    start = timer()
    pid = kwargs.pop('pid', None)
    chunk_size = kwargs.pop('chunk_size', 0)
    row_limit = kwargs.pop('row_limit', None)
    err_limit = kwargs.pop('err_limit', None)

    rows = 0
    ckan = CKAN(**kwargs)

    if pid:
        pids = [pid]
    else:
        # Generator expressions keep the organization lookups lazy and work
        # on both Python 2 and 3 (`itertools.imap` exists only on Python 2).
        orgs_basic = ckan.organization_list(permission='read')
        orgs = (
            ckan.organization_show(id=org['id'], include_datasets=True)
            for org in orgs_basic)
        pids = (package['id'] for org in orgs for package in org['packages'])

    data = gen_data(ckan, pids, kwargs.get('mock_freq'))
    errors = {}

    # A falsy value (0 / None) means "no limit" for either option. Taking the
    # smallest *set* limit avoids comparing an int with the string 'inf'
    # (a TypeError on Python 3) and makes `row_limit` bound the chunk even
    # when `chunk_size` is left at its default of 0.
    limits = [limit for limit in (row_limit, chunk_size) if limit]
    size = min(limits) if limits else 0

    for records in tup.chunk(data, size):
        for record in records:
            response = patch_or_post(endpoint, record)

            if response.ok:
                rows += 1
            else:
                errors[record['dataset_id']] = response.json()

        if row_limit and rows >= row_limit:
            break

        if err_limit and len(errors) >= err_limit:
            raise Exception(errors)

    # Join with ', ' (the original ' ,' put the space on the wrong side).
    elapsed_time = ', '.join(fmt_elapsed(timer() - start))
    return {'rows_added': rows, 'errors': errors, 'elapsed_time': elapsed_time}
Exemplo n.º 2
0
    def insert_records(self, resource_id, records, **kwargs):
        """Inserts records into a datastore table.

        The records are split with `tup.chunk` and each chunk is sent through
        `self.datastore_upsert`.

        Args:
            resource_id (str): The datastore resource id.
            records (List[dict]): The records to insert.
            **kwargs: Keyword arguments that are passed to datastore_upsert.

        Kwargs:
            method (str): Insert method. One of ['update', 'insert', 'upsert']
                (default: 'insert').
            force (bool): Create resource even if read-only
                (default: `self.force`).
            start (int): Row number to start from (zero indexed).
            stop (int): Row number to stop at (zero indexed).
            chunksize (int): Number of rows to write at a time (default: 0,
                handed to `tup.chunk` unchanged).

        Returns:
            int: Number of records inserted. 0 is returned if a chunk fails
                with a 'Broken pipe' connection error, even when earlier
                chunks were already written.

        Raises:
            NotFound: If unable to find the resource.
            requests.exceptions.ConnectionError: For connection errors other
                than 'Broken pipe'.

        Examples:
            >>> CKAN(quiet=True).insert_records('rid', [{'field': 'value'}])
            Traceback (most recent call last):
            NotFound: Resource `rid` was not found in filestore.
        """
        chunksize = kwargs.pop('chunksize', 0)
        start = kwargs.pop('start', 0)
        stop = kwargs.pop('stop', None)

        kwargs.setdefault('force', self.force)
        kwargs.setdefault('method', 'insert')
        kwargs['resource_id'] = resource_id

        # Count from zero so the return value is the number of records
        # written (the previous 1-based counter returned one too many).
        inserted = 0

        for chunk in tup.chunk(records, chunksize, start=start, stop=stop):
            length = len(chunk)

            if self.verbose:
                # The progress message stays 1-based for readability.
                print(
                    'Adding records %i - %i to resource %s...' % (
                        inserted + 1, inserted + length, resource_id))

            kwargs['records'] = chunk

            try:
                self.datastore_upsert(**kwargs)
            except requests.exceptions.ConnectionError as err:
                # `err.message` only exists on Python 2; the string form of
                # the exception carries the same text on every version.
                if 'Broken pipe' not in str(err):
                    # Bare raise keeps the original traceback.
                    raise

                print('Chunksize too large. Try using a smaller chunksize.')
                return 0
            except NotFound:
                # Keep exception message consistent with the others
                raise NotFound(
                    'Resource `%s` was not found in filestore.' % resource_id)

            inserted += length

        return inserted