def action(ckan, arguments, stdin=None):
    """
    call an action with KEY=VALUE args, yield the result

    :param ckan: object providing a ``call_action(name, args)`` method
    :param arguments: docopt-style dict of parsed command-line options
    :param stdin: byte stream used for ``--input-json`` (default: sys.stdin)
    """
    if stdin is None:
        # prefer the raw byte buffer on Python 3 so .decode() below works
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        # BUG FIX: this used to open in text mode and call .decode() on the
        # result, which crashes on Python 3 (str has no .decode), and it
        # leaked the file handle. Open in binary mode inside a context
        # manager so decoding works on both Python 2 and 3.
        with open(arguments['--input'], 'rb') as in_f:
            action_args = json.loads(in_f.read().decode('utf-8'))
    else:
        action_args = {}
        for kv in arguments['KEY=VALUE']:
            key, p, value = kv.partition('=')
            action_args[key] = value

    result = ckan.call_action(arguments['ACTION_NAME'], action_args)

    if arguments['--output-jsonl']:
        # a list result becomes one JSON line per element
        if isinstance(result, list):
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        yield pretty_json(result) + b'\n'
def create_datapackage(record, base_path, stderr):
    """
    Download a dataset's resources and write a datapackage.json that
    describes them, under ``base_path/<dataset name>/``.

    :param record: CKAN package dict; ``name`` and ``resources`` are read
    :param base_path: parent directory for the new datapackage directory
    :param stderr: stream handed to create_resource for error messages
    """
    # TODO: how are we going to handle which resources to
    # leave alone? They're very inconsistent in some instances
    # And I can't imagine anyone wants to download a copy
    # of, for example, the API base endpoint
    resource_formats_to_ignore = ['API', 'api']
    dataset_name = record.get('name', '')
    datapackage_dir = os.path.join(base_path, dataset_name)
    # NOTE(review): raises OSError if the directory already exists
    # (no exist_ok) -- confirm that is intended
    os.makedirs(os.path.join(datapackage_dir, 'data'))
    # download every non-ignored resource; keep the original CKAN resource
    # dicts in a parallel list so they can be matched positionally to the
    # datapackage metadata below
    resources = []
    ckan_resources = []
    for resource in record.get('resources', []):
        if resource['format'] in resource_formats_to_ignore:
            continue
        resources.append(create_resource(resource, datapackage_dir, stderr))
        ckan_resources.append(resource)
    json_path = os.path.join(datapackage_dir, 'datapackage.json')
    datapackage = dataset_to_datapackage(dict(record, resources=resources))
    # prefer resource names from datapackage metadata
    # NOTE(review): the zip pairing assumes dataset_to_datapackage preserves
    # resource order and count -- verify against its implementation
    for cres, dres in zip(ckan_resources, datapackage.get('resources', [])):
        name = dres['name']
        ext = slugify.slugify(dres['format'])
        # avoid doubling the extension when the metadata name already
        # carries it
        if name.endswith(ext):
            name = name[:-len(ext)]
        try:
            # create_resource presumably stored the download under the
            # resource id -- TODO confirm; rename it to the friendlier name
            os.rename(os.path.join(datapackage_dir, 'data', cres['id']),
                      os.path.join(datapackage_dir, 'data',
                                   name + '.' + ext))
            # successful local download
            dres['path'] = 'data/' + name + '.' + ext
        except OSError:
            # download missing or rename failed: leave dres['path'] as-is
            pass
        # convert datastore data dictionary to datapackage schema
        if 'schema' not in dres and 'datastore_fields' in cres:
            fields = []
            for f in cres['datastore_fields']:
                if f['id'] == '_id':
                    # skip the datastore's internal row-number column
                    continue
                df = {'name': f['id']}
                # map the datastore type to a datapackage type when known
                dtyp = DATAPACKAGE_TYPES.get(f['type'])
                if dtyp:
                    df['type'] = dtyp
                dtit = f.get('info', {}).get('label', '')
                if dtit:
                    df['title'] = dtit
                ddesc = f.get('info', {}).get('notes', '')
                if ddesc:
                    df['description'] = ddesc
                fields.append(df)
            dres['schema'] = {'fields': fields}
    with open(json_path, 'wb') as out:
        out.write(pretty_json(datapackage))
def action(ckan, arguments, stdin=None):
    """
    call an action with KEY=STRING, KEY:JSON or JSON args, yield the result

    :param ckan: object providing a ``call_action(name, args, files=...)``
        method
    :param arguments: docopt-style dict of parsed command-line options
    :param stdin: byte stream used for ``--input-json`` (default: sys.stdin)
    """
    if stdin is None:
        # prefer the raw byte buffer on Python 3 so .decode() below works
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    file_args = {}
    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        with open(expanduser(arguments['--input'])) as in_f:
            # Python 2 text mode returns bytes that must be decoded;
            # Python 3 text mode already returns str
            action_args = json.loads(
                in_f.read().decode('utf-8')
                if sys.version_info.major == 2 else in_f.read())
    else:
        action_args = {}
        for kv in arguments['KEY=STRING']:
            if hasattr(kv, 'decode'):
                kv = kv.decode('utf-8')
            # an argument may be KEY=STRING, KEY:JSON or KEY@FILE; the
            # separator that appears earliest (shortest key prefix) wins
            skey, p, svalue = kv.partition('=')
            jkey, p, jvalue = kv.partition(':')
            fkey, p, fvalue = kv.partition('@')
            if len(jkey) > len(skey) < len(fkey):
                action_args[skey] = svalue
            elif len(skey) > len(jkey) < len(fkey):
                try:
                    value = json.loads(jvalue)
                except ValueError:
                    raise CLIError("KEY:JSON argument %r has invalid JSON "
                                   "value %r" % (jkey, jvalue))
                action_args[jkey] = value
            elif len(jkey) > len(fkey) < len(skey):
                try:
                    f = open(expanduser(fvalue), 'rb')
                except IOError as e:
                    # BUG FIX: e.args[1] raises IndexError when the OS error
                    # carries no second argument; e.strerror is the safe
                    # accessor for the same message
                    raise CLIError("Error opening %r: %s" % (
                        expanduser(fvalue), e.strerror))
                file_args[fkey] = f
            else:
                raise CLIError("argument not in the form KEY=STRING, "
                               "KEY:JSON or KEY@FILE %r" % kv)

    # BUG FIX: the KEY@FILE handles were never closed; release them as soon
    # as the action call has completed (or failed)
    try:
        result = ckan.call_action(arguments['ACTION_NAME'], action_args,
                                  files=file_args)
    finally:
        for f in file_args.values():
            f.close()

    if arguments['--output-jsonl']:
        # a list result becomes one JSON line per element
        if isinstance(result, list):
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        yield pretty_json(result) + b'\n'
def create_datapackage(record, base_path, stderr):
    """
    Download a dataset's resources into base_path/<dataset name>/data/ and
    write a datapackage.json describing the dataset.

    :param record: CKAN package dict; ``name`` and ``resources`` are read,
        and each resource dict gets a relative ``path`` added
    :param base_path: parent directory for the new datapackage directory
    :param stderr: writable stream for error messages
    """
    # TODO: how are we going to handle which resources to
    # leave alone? They're very inconsistent in some instances
    # And I can't imagine anyone wants to download a copy
    # of, for example, the API base endpoint
    resource_formats_to_ignore = ['API', 'api']

    dataset_name = record.get('name', '') if record else ''
    target_dir = '{base_path}/{name}/data'.format(base_path=base_path,
                                                  name=dataset_name)
    try:
        os.makedirs(target_dir)
    except Exception as e:
        # BUG FIX: Exception.message was removed in Python 3 (PEP 352);
        # str(e) works on both major versions
        stderr.write(str(e))

    # BUG FIX: the resources default used to be '' (wrong type); an empty
    # list is the correct "no resources" value
    for resource in record.get('resources', []):
        if resource.get('name') is not None:
            resource_id = resource['name']
        else:
            resource_id = resource['id']
        resource_filename = os.path.split(resource['url'])[1]
        output = os.path.join(target_dir, resource_filename)
        # Resources can have a free-form address and no internal info, so in
        # those cases we're going to merely save them using the UID.
        # (If they even exist)
        if output.endswith('/'):
            output = os.path.join(output, resource_id)
        # record the path relative to the datapackage directory
        resource['path'] = 'data' + output[len(target_dir):]
        try:
            if resource['format'] not in resource_formats_to_ignore:
                r = requests.get(resource['url'], stream=True)
                try:
                    with open(output, 'wb') as f:
                        for chunk in r.iter_content(
                                chunk_size=DL_CHUNK_SIZE):
                            if chunk:  # filter out keep-alive new chunks
                                f.write(chunk)
                                f.flush()
                finally:
                    # BUG FIX: release the pooled HTTP connection held by
                    # the streaming response
                    r.close()
        except requests.ConnectionError:
            stderr.write(
                'URL {url} refused connection. The resource will not be downloaded\n'
                .format(url=resource['url']))
        except requests.exceptions.RequestException as e:
            # BUG FIX: e.message removed in Python 3; use str(e)
            stderr.write(str(e))
            stderr.write('\n')

    json_output_name = '{base_path}/{dataset_name}/datapackage.json'.format(
        base_path=base_path, dataset_name=dataset_name)
    with open(json_output_name, 'wb') as out:
        out.write(pretty_json(dict(record, version=DATAPACKAGE_VERSION)))
def create_datapackage(record, base_path, stderr):
    """
    Download a dataset's resources into base_path/<dataset name>/data/ and
    write a datapackage.json describing the dataset.

    :param record: CKAN package dict; ``name`` and ``resources`` are read,
        and each resource dict gets a relative ``path`` added
    :param base_path: parent directory for the new datapackage directory
    :param stderr: writable stream for error messages
    """
    # TODO: how are we going to handle which resources to
    # leave alone? They're very inconsistent in some instances
    # And I can't imagine anyone wants to download a copy
    # of, for example, the API base endpoint
    resource_formats_to_ignore = ['API', 'api']

    dataset_name = record.get('name', '') if record else ''
    target_dir = '{base_path}/{name}/data'.format(
        base_path=base_path, name=dataset_name)
    try:
        os.makedirs(target_dir)
    except Exception as e:
        # BUG FIX: Exception.message does not exist on Python 3 (PEP 352);
        # str(e) is portable
        stderr.write(str(e))

    # BUG FIX: default changed from '' to [] -- the correct empty value for
    # a list of resource dicts
    for resource in record.get('resources', []):
        if resource.get('name') is not None:
            resource_id = resource['name']
        else:
            resource_id = resource['id']
        resource_filename = os.path.split(resource['url'])[1]
        output = os.path.join(target_dir, resource_filename)
        # Resources can have a free-form address and no internal info, so in
        # those cases we're going to merely save them using the UID.
        # (If they even exist)
        if output.endswith('/'):
            output = os.path.join(output, resource_id)
        # path relative to the datapackage directory
        resource['path'] = 'data' + output[len(target_dir):]
        try:
            if resource['format'] not in resource_formats_to_ignore:
                r = requests.get(resource['url'], stream=True)
                try:
                    with open(output, 'wb') as f:
                        for chunk in r.iter_content(
                                chunk_size=DL_CHUNK_SIZE):
                            if chunk:  # filter out keep-alive new chunks
                                f.write(chunk)
                                f.flush()
                finally:
                    # BUG FIX: close the streaming response to return its
                    # connection to the pool
                    r.close()
        except requests.ConnectionError:
            stderr.write('URL {url} refused connection. The resource will not be downloaded\n'.format(url=resource['url']))
        except requests.exceptions.RequestException as e:
            # BUG FIX: e.message removed in Python 3; use str(e)
            stderr.write(str(e))
            stderr.write('\n')

    json_output_name = '{base_path}/{dataset_name}/datapackage.json'.format(
        base_path=base_path, dataset_name=dataset_name)
    with open(json_output_name, 'wb') as out:
        out.write(pretty_json(dict(record, version=DATAPACKAGE_VERSION)))
def action(ckan, arguments, stdin=None):
    """
    call an action with KEY=STRING, KEY:JSON or JSON args, yield the result

    :param ckan: object providing a ``call_action(name, args)`` method
    :param arguments: docopt-style dict of parsed command-line options
    :param stdin: byte stream used for ``--input-json`` (default: sys.stdin)
    """
    if stdin is None:
        # prefer the raw byte buffer on Python 3 so .decode() below works
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        # BUG FIX: previously opened in text mode and called .decode() on
        # the result -- str has no .decode on Python 3 -- and the handle was
        # never closed. Binary mode + context manager fixes both.
        with open(arguments['--input'], 'rb') as in_f:
            action_args = json.loads(in_f.read().decode('utf-8'))
    else:
        action_args = {}
        for kv in arguments['KEY=STRING']:
            # whichever separator ('=' or ':') occurs first decides how the
            # argument is interpreted
            skey, p, svalue = kv.partition('=')
            jkey, p, jvalue = kv.partition(':')
            if len(skey) < len(jkey):
                action_args[skey] = svalue
                continue
            if len(jkey) < len(skey):
                try:
                    value = json.loads(jvalue)
                except ValueError:
                    raise CLIError("KEY:JSON argument %r has invalid JSON "
                                   "value %r" % (jkey, jvalue))
                action_args[jkey] = value
                continue
            raise CLIError("argument not in the form KEY=STRING or KEY:JSON "
                           "%r" % kv)

    result = ckan.call_action(arguments['ACTION_NAME'], action_args)

    if arguments['--output-jsonl']:
        # a list result becomes one JSON line per element
        if isinstance(result, list):
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        yield pretty_json(result) + b'\n'
def action(ckan, arguments, stdin=None):
    """
    call an action with KEY=STRING, KEY:JSON or JSON args, yield the result

    :param ckan: object providing a ``call_action(name, args)`` method
    :param arguments: docopt-style dict of parsed command-line options
    :param stdin: byte stream used for ``--input-json`` (default: sys.stdin)
    """
    if stdin is None:
        # prefer the raw byte buffer on Python 3 so .decode() below works
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    if arguments['--input-json']:
        action_args = json.loads(stdin.read().decode('utf-8'))
    elif arguments['--input']:
        # BUG FIX: the file was opened in text mode and .decode() was called
        # on its contents (crashes on Python 3) and the handle leaked; open
        # binary inside a context manager instead
        with open(arguments['--input'], 'rb') as in_f:
            action_args = json.loads(in_f.read().decode('utf-8'))
    else:
        action_args = {}
        for kv in arguments['KEY=STRING']:
            # the earlier of '=' / ':' decides the argument kind
            skey, p, svalue = kv.partition('=')
            jkey, p, jvalue = kv.partition(':')
            if len(skey) < len(jkey):
                action_args[skey] = svalue
                continue
            if len(jkey) < len(skey):
                try:
                    value = json.loads(jvalue)
                except ValueError:
                    raise CLIError("KEY:JSON argument %r has invalid JSON "
                                   "value %r" % (jkey, jvalue))
                action_args[jkey] = value
                continue
            raise CLIError("argument not in the form KEY=STRING or KEY:JSON "
                           "%r" % kv)

    result = ckan.call_action(arguments['ACTION_NAME'], action_args)

    if arguments['--output-jsonl']:
        # one JSON line per element for list results
        if isinstance(result, list):
            for r in result:
                yield compact_json(r) + b'\n'
        else:
            yield compact_json(result) + b'\n'
    elif arguments['--output-json']:
        yield compact_json(result) + b'\n'
    else:
        yield pretty_json(result) + b'\n'
def create_datapackage(record, base_path, stderr):
    """
    Build a datapackage directory for one dataset: download its resources
    and write a datapackage.json under base_path/<dataset name>/.

    Resources whose format is in the ignore list are kept in the manifest
    unmodified instead of being downloaded.

    :param record: CKAN package dict; ``name`` and ``resources`` are read
    :param base_path: parent directory for the new datapackage directory
    :param stderr: stream handed to create_resource for error messages
    """
    # TODO: how are we going to handle which resources to
    # leave alone? They're very inconsistent in some instances
    # And I can't imagine anyone wants to download a copy
    # of, for example, the API base endpoint
    skip_formats = ['API', 'api']

    pkg_name = record.get('name', '')
    pkg_dir = os.path.join(base_path, pkg_name)
    os.makedirs(os.path.join(pkg_dir, 'data'))

    # download each resource unless its format is skipped, preserving order
    resources = []
    for res in record.get('resources', []):
        if res['format'] in skip_formats:
            resources.append(res)
        else:
            resources.append(create_resource(res, pkg_dir, stderr))

    manifest = dict(record, resources=resources, version=DATAPACKAGE_VERSION)
    manifest_path = os.path.join(pkg_dir, 'datapackage.json')
    with open(manifest_path, 'wb') as out:
        out.write(pretty_json(manifest))
def create_datapackage(record, base_path, stderr):
    """
    Download a dataset's resources and write a datapackage.json built from
    the dataset metadata, under base_path/<dataset name>/.

    :param record: CKAN package dict; ``name`` and ``resources`` are read
    :param base_path: parent directory for the new datapackage directory
    :param stderr: stream handed to create_resource for error messages
    :returns: (datapackage_dir, datapackage dict, path to datapackage.json)
    """
    # TODO: how are we going to handle which resources to
    # leave alone? They're very inconsistent in some instances
    # And I can't imagine anyone wants to download a copy
    # of, for example, the API base endpoint
    resource_formats_to_ignore = ['API', 'api']

    dataset_name = record.get('name', '')
    datapackage_dir = os.path.join(base_path, dataset_name)
    os.makedirs(os.path.join(datapackage_dir, 'data'))

    # filter out some resources
    ckan_resources = []
    for resource in record.get('resources', []):
        if resource['format'] in resource_formats_to_ignore:
            continue
        ckan_resources.append(resource)
    dataset = dict(record, resources=ckan_resources)

    # get the datapackage (metadata)
    datapackage = dataset_to_datapackage(dataset)

    for cres, dres in zip(ckan_resources, datapackage.get('resources', [])):
        filename = resource_filename(dres)
        # BUG FIX: this used to call create_resource(resource, ...) with the
        # stale `resource` variable left over from the filtering loop above,
        # so every iteration downloaded the LAST resource (or raised
        # NameError when there were none). Download the paired resource.
        cres = create_resource(cres, filename, datapackage_dir, stderr)
        dres['path'] = 'data/' + filename
        populate_schema_from_datastore(cres, dres)

    json_path = os.path.join(datapackage_dir, 'datapackage.json')
    with open(json_path, 'wb') as out:
        out.write(pretty_json(datapackage))

    return datapackage_dir, datapackage, json_path
def dump_things(ckan, thing, arguments,
                worker_pool=None, stdout=None, stderr=None):
    """
    dump all datasets, groups or orgs accessible by the connected user

    The parent process creates a pool of worker processes and hands
    out ids to each worker. Status of last record completed and records
    being processed is displayed on stderr.

    :param ckan: object providing call_action (and optionally a
        parallel_limit attribute)
    :param thing: 'datasets', 'groups' or 'organizations'
    :param arguments: docopt-style dict of parsed command-line options
    :param worker_pool: override for workers.worker_pool (tests)
    :param stdout: byte stream for JSONL output (default: sys.stdout)
    :param stderr: byte stream for progress output (default: sys.stderr)
    :returns: 1 on broken pipe, 2 on interrupt, None otherwise
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdout is None:
        # use the raw byte buffer when available: all writes below are bytes
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    # child mode: this process is one of the spawned workers
    if arguments['--worker']:
        return dump_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_output = stdout
    if arguments['--dp-output']:
        # TODO: do we want to just divert this to devnull?
        jsonl_output = open(os.devnull, 'w')
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)

    if arguments['--all']:
        # map the thing being dumped to the CKAN action listing its names
        get_thing_list = {
            'datasets': 'package_list',
            'groups': 'group_list',
            'organizations': 'organization_list',
        }[thing]
        names = ckan.call_action(get_thing_list, {})
    else:
        names = arguments['ID_OR_NAME']

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    # feed each worker one JSON-encoded name per line
    pool = worker_pool(cmd, processes,
                       enumerate(compact_json(n) + b'\n' for n in names))

    results = {}
    expecting_number = 0
    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            # each worker reply is a JSON [timestamp, error, record] triple
            timestamp, error, record = json.loads(result.decode('utf-8'))
            results[finished] = record

            if not arguments['--quiet']:
                stderr.write('{0} {1} {2} {3} {4}\n'.format(
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    record.get('name', '') if record else '',
                ).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    record.get('name', '') if record else None,
                ]) + b'\n')

            if arguments['--dp-output']:
                # TODO: how are we going to handle which resources to leave
                # alone? They're very inconsistent in some instances
                # And I can't imagine anyone wants to download a copy of,
                # for example, the API base endpoint
                resource_formats_to_ignore = ['API', 'api']
                dataset_name = record.get('name', '') if record else ''
                # NOTE(review): docopt dicts always contain the key, so this
                # KeyError fallback presumably never fires -- confirm
                try:
                    base_path = arguments['--dp-output']
                except KeyError:
                    base_path = './'
                target_dir = '{base_path}/{name}/data'.format(
                    base_path=base_path,
                    name=dataset_name)
                try:
                    os.makedirs(target_dir)
                except Exception as e:
                    # NOTE(review): e.message does not exist on Python 3,
                    # and stderr here is a byte stream -- this write likely
                    # fails; confirm and fix (str(e).encode())
                    stderr.write(e.message)
                # NOTE(review): default '' is the wrong type for a resource
                # list (harmless: iterating '' yields nothing)
                for resource in record.get('resources', ''):
                    # NOTE(review): direct indexing raises KeyError when a
                    # resource has no 'name' key; other variants use .get
                    if resource['name'] is not None:
                        resource_id = resource['name']
                    else:
                        resource_id = resource['id']
                    resource_filename = os.path.split(resource['url'])[1]
                    output = os.path.join(target_dir, resource_filename)
                    # Resources can have a free-form address and no internal
                    # info, so in those cases we're going to merely save
                    # them using the UID. (If they even exist)
                    if output.endswith('/'):
                        output = os.path.join(output, resource_id)
                    # datapackage.json format explicitly requires a path to
                    # the resource
                    resource['path'] = output
                    try:
                        if resource['format'] not in \
                                resource_formats_to_ignore:
                            r = requests.get(resource['url'], stream=True)
                            with open(output, 'wb') as f:
                                for chunk in r.iter_content(chunk_size=1024):
                                    if chunk:  # filter out keep-alive new chunks
                                        f.write(chunk)
                                        f.flush()
                    except requests.ConnectionError:
                        stderr.write('URL {url} refused connection. The resource will not be downloaded\n'.format(url=resource['url']))
                    except requests.exceptions.RequestException as e:
                        # NOTE(review): e.message removed in Python 3 --
                        # confirm and switch to str(e)
                        stderr.write(e.message)
                        stderr.write('\n')
                # NOTE(review): no '/' between base_path and dataset_name
                # here (target_dir above has one), and the handle is never
                # closed -- confirm both are intended
                datapackagejson_output = open(
                    '{base_path}{dataset_name}/datapackage.json'.format(
                        base_path=base_path,
                        dataset_name=dataset_name), 'w',)
                record['version'] = '1.0-beta.10'
                datapackagejson_output.write(pretty_json(record))

            # keep the output in the same order as names
            while expecting_number in results:
                record = results.pop(expecting_number)
                if record:
                    # sort keys so we can diff output
                    jsonl_output.write(compact_json(record,
                                                    sort_keys=True) + b'\n')
                expecting_number += 1

    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2