Example #1
def action(ckan, arguments,
        stdin=None):
    """
    call an action with KEY=VALUE args, yield the result
    """
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        with open(arguments['--input'], 'rb') as in_f:
            action_args = json.loads(in_f.read().decode('utf-8'))
    else:
        action_args = {}
        for kv in arguments['KEY=VALUE']:
            key, p, value = kv.partition('=')
            action_args[key] = value
    result = ckan.call_action(arguments['ACTION_NAME'], action_args)

    if arguments['--output-jsonl']:
        if isinstance(result, list):
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        yield pretty_json(result) + b'\n'
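
The generator above expects a docopt-style arguments dict and any object with a call_action(name, data_dict) method; it also assumes import sys, import json and the compact_json / pretty_json helpers from its module. A minimal way to drive it, with a stub client and assumed helper implementations (everything in this sketch is illustrative, not the library's own code):

import json
import sys


def compact_json(obj):
    # one-line JSON as bytes, matching how the example treats this helper
    return json.dumps(obj, separators=(',', ':')).encode('utf-8')


def pretty_json(obj):
    # indented JSON as bytes
    return json.dumps(obj, indent=2, sort_keys=True).encode('utf-8')


class StubCKAN(object):
    # any object exposing call_action(name, data_dict) works here
    def call_action(self, name, data_dict):
        return {'action': name, 'received': data_dict}


arguments = {
    '--input-json': False,
    '--input': None,
    '--output-json': True,
    '--output-jsonl': False,
    'ACTION_NAME': 'package_show',
    'KEY=VALUE': ['id=my-dataset'],
}

for chunk in action(StubCKAN(), arguments):
    print(chunk.decode('utf-8'), end='')
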
Example #2
def create_datapackage(record, base_path, stderr):
    # TODO: how are we going to handle which resources to
    # leave alone? They're very inconsistent in some instances
    # And I can't imagine anyone wants to download a copy
    # of, for example, the API base endpoint
    resource_formats_to_ignore = ['API', 'api']
    dataset_name = record.get('name', '')

    datapackage_dir = os.path.join(base_path, dataset_name)
    os.makedirs(os.path.join(datapackage_dir, 'data'))

    resources = []
    ckan_resources = []
    for resource in record.get('resources', []):
        if resource['format'] in resource_formats_to_ignore:
            continue
        resources.append(create_resource(resource, datapackage_dir, stderr))
        ckan_resources.append(resource)

    json_path = os.path.join(datapackage_dir, 'datapackage.json')
    datapackage = dataset_to_datapackage(dict(record, resources=resources))

    # prefer resource names from datapackage metadata
    for cres, dres in zip(ckan_resources, datapackage.get('resources', [])):
        name = dres['name']
        ext = slugify.slugify(dres['format'])
        if name.endswith(ext):
            name = name[:-len(ext)]
        try:
            os.rename(os.path.join(datapackage_dir, 'data', cres['id']),
                      os.path.join(datapackage_dir, 'data', name + '.' + ext))
            # successful local download
            dres['path'] = 'data/' + name + '.' + ext
        except OSError:
            pass

        # convert datastore data dictionary to datapackage schema
        if 'schema' not in dres and 'datastore_fields' in cres:
            fields = []
            for f in cres['datastore_fields']:
                if f['id'] == '_id':
                    continue
                df = {'name': f['id']}
                dtyp = DATAPACKAGE_TYPES.get(f['type'])
                if dtyp:
                    df['type'] = dtyp
                dtit = f.get('info', {}).get('label', '')
                if dtit:
                    df['title'] = dtit
                ddesc = f.get('info', {}).get('notes', '')
                if ddesc:
                    df['description'] = ddesc
                fields.append(df)
            dres['schema'] = {'fields': fields}

    with open(json_path, 'wb') as out:
        out.write(pretty_json(datapackage))
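
The schema conversion above uses a DATAPACKAGE_TYPES lookup that is not shown. Purely as an assumption, a plausible mapping from CKAN datastore column types to Frictionless Table Schema types might look like this:

# Illustrative only: the real DATAPACKAGE_TYPES used above is not shown in these
# examples, so this mapping is a guess based on common datastore column types.
DATAPACKAGE_TYPES = {
    'text': 'string',
    'int': 'integer',
    'int4': 'integer',
    'int8': 'integer',
    'float8': 'number',
    'numeric': 'number',
    'bool': 'boolean',
    'date': 'date',
    'time': 'time',
    'timestamp': 'datetime',
    'json': 'object',
}
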
Example #3
def action(ckan, arguments, stdin=None):
    """
    call an action with KEY=STRING, KEY:JSON or JSON args, yield the result
    """
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    file_args = {}
    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        action_args = {}
        with open(expanduser(arguments['--input'])) as in_f:
            action_args = json.loads(
                in_f.read().decode('utf-8') if sys.version_info.major == 2 else in_f.read())
    else:
        action_args = {}
        for kv in arguments['KEY=STRING']:
            if hasattr(kv, 'decode'):
                kv = kv.decode('utf-8')
            skey, p, svalue = kv.partition('=')
            jkey, p, jvalue = kv.partition(':')
            fkey, p, fvalue = kv.partition('@')
            if len(jkey) > len(skey) < len(fkey):
                action_args[skey] = svalue
            elif len(skey) > len(jkey) < len(fkey):
                try:
                    value = json.loads(jvalue)
                except ValueError:
                    raise CLIError("KEY:JSON argument %r has invalid JSON "
                        "value %r" % (jkey, jvalue))
                action_args[jkey] = value
            elif len(jkey) > len(fkey) < len(skey):
                try:
                    f = open(expanduser(fvalue), 'rb')
                except IOError as e:
                    raise CLIError("Error opening %r: %s" %
                        (expanduser(fvalue), e.args[1]))
                file_args[fkey] = f
            else:
                raise CLIError("argument not in the form KEY=STRING, "
                    "KEY:JSON or KEY@FILE %r" % kv)

    result = ckan.call_action(arguments['ACTION_NAME'], action_args,
        files=file_args)

    if arguments['--output-jsonl']:
        if isinstance(result, list):
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        yield pretty_json(result) + b'\n'
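
The three-way dispatch above works by partitioning the argument on each separator and picking the branch whose key is strictly shorter than the other two, i.e. the separator that appears first in the string. The same comparison in isolation (classify_argument is a hypothetical helper name used only for this illustration):

def classify_argument(kv):
    # Return (separator, key, value) for KEY=STRING, KEY:JSON or KEY@FILE,
    # using the same length comparison as the example above.
    skey, _, svalue = kv.partition('=')
    jkey, _, jvalue = kv.partition(':')
    fkey, _, fvalue = kv.partition('@')
    if len(jkey) > len(skey) < len(fkey):   # '=' appears first
        return '=', skey, svalue
    if len(skey) > len(jkey) < len(fkey):   # ':' appears first
        return ':', jkey, jvalue
    if len(jkey) > len(fkey) < len(skey):   # '@' appears first
        return '@', fkey, fvalue
    raise ValueError('no separator found in %r' % kv)


print(classify_argument('title=My Dataset'))  # ('=', 'title', 'My Dataset')
print(classify_argument('extras:{"a": 1}'))   # (':', 'extras', '{"a": 1}')
print(classify_argument('upload@data.csv'))   # ('@', 'upload', 'data.csv')
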
Example #4
def create_datapackage(record, base_path, stderr):
    # TODO: how are we going to handle which resources to
    # leave alone? They're very inconsistent in some instances
    # And I can't imagine anyone wants to download a copy
    # of, for example, the API base endpoint
    resource_formats_to_ignore = ['API', 'api']
    dataset_name = record.get('name', '') if record else ''

    target_dir = '{base_path}/{name}/data'.format(base_path=base_path,
                                                  name=dataset_name)

    try:
        os.makedirs(target_dir)
    except Exception as e:
        stderr.write(str(e))

    for resource in record.get('resources', []):
        if resource.get('name') is not None:
            resource_id = resource['name']
        else:
            resource_id = resource['id']

        resource_filename = os.path.split(resource['url'])[1]

        output = os.path.join(target_dir, resource_filename)

        # Resources can have a free-form address and no internal info, so in those cases
        # we're going to merely save them using the UID. (If they even exist)
        if output.endswith('/'):
            output = os.path.join(output, resource_id)

        resource['path'] = 'data' + output[len(target_dir):]

        try:
            if resource['format'] not in resource_formats_to_ignore:
                r = requests.get(resource['url'], stream=True)
                with open(output, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=DL_CHUNK_SIZE):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)
                            f.flush()
        except requests.ConnectionError:
            stderr.write(
                'URL {url} refused connection. The resource will not be downloaded\n'
                .format(url=resource['url']))
        except requests.exceptions.RequestException as e:
            stderr.write(str(e))
            stderr.write('\n')

    json_output_name = '{base_path}/{dataset_name}/datapackage.json'.format(
        base_path=base_path, dataset_name=dataset_name)
    with open(json_output_name, 'wb') as out:
        out.write(pretty_json(dict(record, version=DATAPACKAGE_VERSION)))
Example #5
def action(ckan, arguments, stdin=None):
    """
    call an action with KEY=STRING, KEY:JSON or JSON args, yield the result
    """
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        with open(arguments['--input'], 'rb') as in_f:
            action_args = json.loads(in_f.read().decode('utf-8'))
    else:
        action_args = {}
        for kv in arguments['KEY=STRING']:
            skey, p, svalue = kv.partition('=')
            jkey, p, jvalue = kv.partition(':')
            if len(skey) < len(jkey):
                action_args[skey] = svalue
                continue
            if len(jkey) < len(skey):
                try:
                    value = json.loads(jvalue)
                except ValueError:
                    raise CLIError("KEY:JSON argument %r has invalid JSON "
                                   "value %r" % (jkey, jvalue))
                action_args[jkey] = value
                continue
            raise CLIError("argument not in the form KEY=STRING or KEY:JSON "
                           "%r" % kv)

    result = ckan.call_action(arguments['ACTION_NAME'], action_args)

    if arguments['--output-jsonl']:
        if isinstance(result, list):
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        yield pretty_json(result) + b'\n'
Example #6
def create_datapackage(record, base_path, stderr):
    # TODO: how are we going to handle which resources to
    # leave alone? They're very inconsistent in some instances
    # And I can't imagine anyone wants to download a copy
    # of, for example, the API base endpoint
    resource_formats_to_ignore = ['API', 'api']
    dataset_name = record.get('name', '')

    datapackage_dir = os.path.join(base_path, dataset_name)
    os.makedirs(os.path.join(datapackage_dir, 'data'))

    resources = [(resource if resource['format'] in resource_formats_to_ignore
                  else create_resource(resource, datapackage_dir, stderr))
                 for resource in record.get('resources', [])]

    json_path = os.path.join(datapackage_dir, 'datapackage.json')
    with open(json_path, 'wb') as out:
        out.write(
            pretty_json(
                dict(record, resources=resources,
                     version=DATAPACKAGE_VERSION)))
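
create_resource is not shown in these examples. A sketch of what it plausibly does, modelled on the inline download loop of Example #4 and the data/<id> files that Example #2 later renames (the signature comes from this example, the body is an assumption):

import os
import requests

DL_CHUNK_SIZE = 16 * 1024  # value is an assumption; only the name appears in Example #4


def create_resource(resource, datapackage_dir, stderr):
    # Assumed behaviour: stream resource['url'] into <datapackage_dir>/data/<resource id>
    # and return a copy of the resource dict; errors are reported on stderr.
    output = os.path.join(datapackage_dir, 'data', resource['id'])
    try:
        r = requests.get(resource['url'], stream=True)
        with open(output, 'wb') as f:
            for chunk in r.iter_content(chunk_size=DL_CHUNK_SIZE):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
    except requests.exceptions.RequestException as e:
        stderr.write('could not download {url}: {err}\n'.format(
            url=resource['url'], err=e))
    return dict(resource)
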
Example #7
def create_datapackage(record, base_path, stderr):
    # TODO: how are we going to handle which resources to
    # leave alone? They're very inconsistent in some instances
    # And I can't imagine anyone wants to download a copy
    # of, for example, the API base endpoint
    resource_formats_to_ignore = ['API', 'api']
    dataset_name = record.get('name', '')

    datapackage_dir = os.path.join(base_path, dataset_name)
    os.makedirs(os.path.join(datapackage_dir, 'data'))

    # filter out some resources
    ckan_resources = []
    for resource in record.get('resources', []):
        if resource['format'] in resource_formats_to_ignore:
            continue
        ckan_resources.append(resource)
    dataset = dict(record, resources=ckan_resources)

    # get the datapackage (metadata)
    datapackage = dataset_to_datapackage(dataset)

    for cres, dres in zip(ckan_resources, datapackage.get('resources', [])):
        filename = resource_filename(dres)

        # download the resource
        cres = create_resource(cres, filename, datapackage_dir, stderr)
        dres['path'] = 'data/' + filename

        populate_schema_from_datastore(cres, dres)

    json_path = os.path.join(datapackage_dir, 'datapackage.json')
    with open(json_path, 'wb') as out:
        out.write(pretty_json(datapackage))

    return datapackage_dir, datapackage, json_path
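
resource_filename and populate_schema_from_datastore are not shown either; sketches consistent with the inline logic of Example #2 (the names come from the example above, the bodies are assumptions):

import slugify  # python-slugify, as used in Example #2

DATAPACKAGE_TYPES = {}  # placeholder; see the illustrative mapping after Example #2


def resource_filename(dres):
    # Assumption: build "name.ext" the same way Example #2 does, from the
    # datapackage resource name and its slugified format.
    name = dres['name']
    ext = slugify.slugify(dres['format'])
    if name.endswith(ext):
        name = name[:-len(ext)]
    return name + '.' + ext


def populate_schema_from_datastore(cres, dres):
    # Assumption: the same datastore-fields-to-Table-Schema conversion that
    # Example #2 performs inline.
    if 'schema' in dres or 'datastore_fields' not in cres:
        return
    fields = []
    for f in cres['datastore_fields']:
        if f['id'] == '_id':
            continue
        df = {'name': f['id']}
        dtyp = DATAPACKAGE_TYPES.get(f['type'])
        if dtyp:
            df['type'] = dtyp
        title = f.get('info', {}).get('label', '')
        if title:
            df['title'] = title
        desc = f.get('info', {}).get('notes', '')
        if desc:
            df['description'] = desc
        fields.append(df)
    dres['schema'] = {'fields': fields}
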
Example #8
def dump_things(ckan, thing, arguments,
        worker_pool=None, stdout=None, stderr=None):
    """
    dump all datasets, groups or orgs accessible by the connected user

    The parent process creates a pool of worker processes and hands
    out ids to each worker. Status of last record completed and records
    being processed is displayed on stderr.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return dump_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'ab')

    jsonl_output = stdout
    if arguments['--dp-output']:  # TODO: do we want to just divert this to devnull?
        jsonl_output = open(os.devnull, 'wb')
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)
    if arguments['--all']:
        get_thing_list = {
            'datasets': 'package_list',
            'groups': 'group_list',
            'organizations': 'organization_list',
            }[thing]
        names = ckan.call_action(get_thing_list, {})
    else:
        names = arguments['ID_OR_NAME']

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes,
        enumerate(compact_json(n) + b'\n' for n in names))

    results = {}
    expecting_number = 0
    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            timestamp, error, record = json.loads(result.decode('utf-8'))
            results[finished] = record

            if not arguments['--quiet']:
                stderr.write('{0} {1} {2} {3} {4}\n'.format(
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    record.get('name', '') if record else '',
                    ).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    record.get('name', '') if record else None,
                    ]) + b'\n')

            if arguments['--dp-output']:
                # TODO: how are we going to handle which resources to leave alone? They're very inconsistent in some instances
                # And I can't imagine anyone wants to download a copy of, for example, the API base endpoint
                resource_formats_to_ignore = ['API', 'api']
                dataset_name = record.get('name', '') if record else ''

                try:
                    base_path = arguments['--dp-output']
                except KeyError:
                    base_path = './'

                target_dir = '{base_path}/{name}/data'.format(
                    base_path=base_path, name=dataset_name)

                try:
                    os.makedirs(target_dir)
                except Exception as e:
                    stderr.write(str(e).encode('utf-8'))

                for resource in record.get('resources', []):
                    if resource.get('name') is not None:
                        resource_id = resource['name']
                    else:
                        resource_id = resource['id']

                    resource_filename = os.path.split(resource['url'])[1]

                    output = os.path.join(target_dir, resource_filename)

                    # Resources can have a free-form address and no internal info, so in those cases
                    # we're going to merely save them using the UID. (If they even exist)
                    if output.endswith('/'):
                        output = os.path.join(output, resource_id)

                    resource['path'] = output  # datapackage.json format explicitly requires a path to the resource

                    try:
                        if resource['format'] not in resource_formats_to_ignore:
                            r = requests.get(resource['url'], stream=True)
                            with open(output, 'wb') as f:
                                for chunk in r.iter_content(chunk_size=1024):
                                    if chunk: # filter out keep-alive new chunks
                                        f.write(chunk)
                                        f.flush()
                    except requests.ConnectionError:
                        stderr.write(
                            'URL {url} refused connection. The resource will not be downloaded\n'
                            .format(url=resource['url']).encode('utf-8'))
                    except requests.exceptions.RequestException as e:
                        stderr.write(str(e).encode('utf-8'))
                        stderr.write(b'\n')


                json_output_name = '{base_path}/{dataset_name}/datapackage.json'.format(
                    base_path=base_path, dataset_name=dataset_name)

                record['version'] = '1.0-beta.10'

                with open(json_output_name, 'wb') as datapackagejson_output:
                    datapackagejson_output.write(pretty_json(record))

            # keep the output in the same order as names
            while expecting_number in results:
                record = results.pop(expecting_number)
                if record:
                    # sort keys so we can diff output
                    jsonl_output.write(compact_json(record,
                        sort_keys=True) + b'\n')
                expecting_number += 1
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
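
The tail of the loop above keeps output in input order even though workers finish out of order: completed records are parked in a dict keyed by input position and emitted once every earlier position has arrived. The same pattern in isolation (a standalone sketch, not library code):

def in_input_order(finished_results):
    # finished_results yields (input_index, record) pairs in completion order;
    # records are re-emitted strictly in input order.
    results = {}
    expecting_number = 0
    for finished, record in finished_results:
        results[finished] = record
        while expecting_number in results:
            yield results.pop(expecting_number)
            expecting_number += 1


# completion order 2, 0, 1 is emitted as 0, 1, 2
print(list(in_input_order([(2, 'c'), (0, 'a'), (1, 'b')])))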