def dump_metadata(ckan, arguments, pagination=DEFAULT_PAGINATION, stdout=None,
        stderr=None):
    '''
    Dump all the JSON metadata records.

    This is often better than using dump_things with thing=datasets,
    as sites like catalog.data.gov do not support the package_list
    API. The package_search API is used with pagination instead.

    :param ckan: client object providing call_action
    :param arguments: docopt-style dict of parsed command-line options
    :param pagination: page size for package_search (must be >= 1)
    :param stdout: replacement binary output stream (for testing)
    :param stderr: replacement binary error stream (for testing)

    Returns 1 on broken pipe, 2 on interrupt, None on success.
    :raises ValueError: if pagination < 1
    '''
    if pagination < 1:
        raise ValueError("Pagination size must be greater or equal to 1")
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    output_file = None
    jsonl_output = stdout
    if arguments['--output']:
        output_file = open(arguments['--output'], 'wb')
        jsonl_output = output_file
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)
    try:
        with quiet_int_pipe() as errors:
            count = 0
            total_count = 0
            total_known = False
            # first iteration runs unconditionally; after that, keep
            # paging until we've written every record the site reports
            while not total_known or total_count > count:
                response = ckan.call_action(
                    "package_search",
                    dict(rows=pagination, start=count, sort="id asc"))
                total_count = response["count"]
                total_known = True
                for record in response["results"]:
                    jsonl_output.write(
                        compact_json(record, sort_keys=True) + b'\n')
                    count += 1
    finally:
        # BUG FIX: the GzipFile / --output file was never closed, so
        # gzip output was missing its trailer (a truncated archive) and
        # the file handle leaked.  Close what we opened here, but never
        # the caller's stdout.  GzipFile.close() flushes the trailer to
        # the underlying fileobj without closing it.
        if jsonl_output is not stdout:
            jsonl_output.close()
        if output_file is not None and output_file is not jsonl_output:
            output_file.close()

    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def load_things(ckan, thing, arguments,
        worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    Create and update datasets, groups, orgs and users from JSON lines.

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of the
    last record completed and records being processed is displayed on
    stderr.

    :param ckan: client object passed through to the worker path
    :param thing: the record type being loaded ('datasets', 'groups', ...)
    :param arguments: docopt-style dict of parsed command-line options
    :param worker_pool: replacement worker pool function (for testing)
    :param stdin: replacement binary input stream (for testing)
    :param stdout: replacement binary output stream (for testing)
    :param stderr: replacement binary status stream (for testing)

    Returns 1 on broken pipe or crashed worker, 2 on interrupt.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        # NOTE(review): uses sys.__stdout__ (the process's original
        # stdout), not sys.stdout -- presumably to bypass any
        # redirection of sys.stdout; confirm this is intentional
        stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        # child mode: process the records the parent hands to us
        return load_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    # input is binary JSON lines; --gzip layers decompression on top
    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def line_reader():
        """
        Yield (record_number, line) pairs, honouring the
        --start-record and --max-records options.
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)
        for num, line in enumerate(jsonl_input, 1):  # records start from 1
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, line

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes, line_reader())

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            # each worker reports [timestamp, action, error, response]
            timestamp, action, error, response = json.loads(
                result.decode('utf-8'))

            if not arguments['--quiet']:
                stderr.write(('%s %s %s %s %s %s\n' % (
                    finished,
                    job_ids,
                    next(stats),
                    action,
                    error,
                    compact_json(response).decode('utf-8') if response else ''
                    )).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    action,
                    error,
                    response,
                    ]) + b'\n')
                # flush per record so the log survives a crash
                log.flush()
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def load_things(ckan, thing, arguments,
        worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    Create and update datasets, groups, orgs and users from JSON lines.

    The parent process spawns a pool of worker subprocesses and feeds
    each one a JSON line as it completes its previous task.  The status
    of the most recently completed record is reported on stderr unless
    --quiet was given.

    Returns 1 on broken pipe or crashed worker, 2 on interrupt.
    """
    # fall back to module defaults / binary standard streams
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        # we are a child: consume records handed down by the parent
        return load_things_worker(ckan, thing, arguments)

    log = open(arguments['--log'], 'a') if arguments['--log'] else None

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def line_reader():
        """Yield (record_number, line), honouring --start-record/--max-records."""
        first = int(arguments['--start-record'])
        limit = arguments['--max-records']
        if limit is not None:
            limit = int(limit)
        # record numbers start from 1
        for record_number, line in enumerate(jsonl_input, 1):
            if record_number < first:
                continue
            if limit is not None and record_number >= first + limit:
                break
            yield record_number, line

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes, line_reader())

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, action, error, response = json.loads(
                result.decode('utf-8'))

            if not arguments['--quiet']:
                response_text = (
                    compact_json(response).decode('utf-8') if response else '')
                status_line = '%s %s %s %s %s %s\n' % (
                    finished, job_ids, next(stats), action, error,
                    response_text)
                stderr.write(status_line.encode('utf-8'))

            if log:
                log.write(compact_json(
                    [timestamp, finished, action, error, response]) + b'\n')
                log.flush()
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def delete_things(ckan, thing, arguments,
        worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    Delete datasets, groups, orgs, users etc.

    The parent process spawns a pool of worker subprocesses and hands
    out ids/names to each worker as it finishes its previous task.
    The status of the most recently completed record is reported on
    stderr unless --quiet was given.

    Returns 1 on broken pipe or crashed worker, 2 on interrupt.
    """
    # fall back to module defaults / binary standard streams
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        # we are a child: delete records handed down by the parent
        return delete_things_worker(ckan, thing, arguments)

    log = open(arguments['--log'], 'a') if arguments['--log'] else None

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def name_reader():
        """
        Yield (number, json-encoded-name), honouring --start-record and
        --max-records.  All ids/names are extracted from each input line
        (e.g. package_search, package_show or package_list output), so
        record numbers count extracted names, not input lines.
        """
        first = int(arguments['--start-record'])
        limit = arguments['--max-records']
        if limit is not None:
            limit = int(limit)
        extracted = chain.from_iterable(
            extract_ids_or_names(line) for line in jsonl_input)
        for number, name in enumerate(extracted, 1):
            if number < first:
                continue
            if limit is not None and number >= first + limit:
                break
            yield number, compact_json(name)

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    if arguments['ID_OR_NAME']:
        # names on the command line take precedence over stdin/--input
        command_line_names = (
            compact_json(n) + b'\n' for n in arguments['ID_OR_NAME'])
        pool = worker_pool(cmd, processes, enumerate(command_line_names, 1))
    else:
        pool = worker_pool(cmd, processes, name_reader())

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            timestamp, error, response = json.loads(result.decode('utf-8'))

            if not arguments['--quiet']:
                response_text = (
                    compact_json(response).decode('utf-8') if response else '')
                status_line = '%s %s %s %s %s\n' % (
                    finished, job_ids, next(stats), error, response_text)
                stderr.write(status_line.encode('utf-8'))

            if log:
                log.write(compact_json(
                    [timestamp, finished, error, response]) + b'\n')
                log.flush()
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def dump_things(ckan, thing, arguments,
        worker_pool=None, stdout=None, stderr=None):
    """
    dump all datasets, groups or orgs accessible by the connected user

    The parent process creates a pool of worker processes and hands
    out ids to each worker. Status of last record completed and records
    being processed is displayed on stderr.

    :param ckan: client object providing call_action
    :param thing: 'datasets', 'groups' or 'organizations'
    :param arguments: docopt-style dict of parsed command-line options
    :param worker_pool: replacement worker pool function (for testing)
    :param stdout: replacement binary output stream (for testing)
    :param stderr: replacement binary status stream (for testing)

    Returns 1 on broken pipe or crashed worker, 2 on interrupt.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        # child mode: dump the records the parent hands to us
        return dump_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_output = stdout
    if arguments['--datapackages']:
        # TODO: do we want to just divert this to devnull?
        jsonl_output = open(os.devnull, 'wb')
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)

    if arguments['--all']:
        get_thing_list = {
            'datasets': 'package_list',
            'groups': 'group_list',
            'organizations': 'organization_list',
            }[thing]
        names = ckan.call_action(get_thing_list, {})
    else:
        names = arguments['ID_OR_NAME']

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes,
        enumerate(compact_json(n) + b'\n' for n in names))

    # results are collected out of order, then re-emitted in input order
    results = {}
    expecting_number = 0
    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # BUG FIX: a worker that died with a traceback produces a
                # falsy result; json.loads would raise a decode error here
                # instead of the intended error exit.  Guard added to match
                # the other dump_things variant in this file.
                return 1
            timestamp, error, record = json.loads(result.decode('utf-8'))
            results[finished] = record

            if not arguments['--quiet']:
                stderr.write('{0} {1} {2} {3} {4}\n'.format(
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    record.get('name', '') if record else '',
                    ).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    record.get('name', '') if record else None,
                    ]) + b'\n')

            datapackages_path = arguments['--datapackages']
            if datapackages_path:
                create_datapackage(record, datapackages_path, stderr)

            # keep the output in the same order as names
            while expecting_number in results:
                record = results.pop(expecting_number)
                if record:
                    # sort keys so we can diff output
                    jsonl_output.write(compact_json(record,
                        sort_keys=True) + b'\n')
                expecting_number += 1
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def dump_things(ckan, thing, arguments,
        worker_pool=None, stdout=None, stderr=None):
    """
    Dump all datasets, groups, orgs or users accessible by the
    connected user.

    The parent process creates a pool of worker processes and hands
    out ids to each worker. Status of the last record completed and
    records being processed is displayed on stderr.

    :param ckan: client object providing call_action
    :param thing: 'datasets', 'groups', 'organizations', 'users' or
        'related'
    :param arguments: docopt-style dict of parsed command-line options
    :param worker_pool: replacement worker pool function (for testing)
    :param stdout: replacement binary output stream (for testing)
    :param stderr: replacement binary status stream (for testing)

    Returns 1 on broken pipe or crashed worker, 2 on interrupt.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdout is None:
        # NOTE(review): uses sys.__stdout__ (the process's original
        # stdout), not sys.stdout -- presumably to bypass any
        # redirection of sys.stdout; confirm this is intentional
        stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        # child mode: dump the records the parent hands to us
        return dump_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_output = stdout
    if arguments[
            '--datapackages']:  # TODO: do we want to just divert this to devnull?
        jsonl_output = open(os.devnull, 'wb')
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)

    if arguments['--all']:
        get_thing_list = {
            'datasets': 'package_list',
            'groups': 'group_list',
            'organizations': 'organization_list',
            'users': 'user_list',
            'related': 'related_list',
            }[thing]
        # NOTE(review): all_fields=False is only meaningful for
        # user_list; presumably the other list actions ignore the extra
        # parameter -- confirm against the CKAN API
        params = dict(
            all_fields=False,  # for user_list
        )
        names = ckan.call_action(get_thing_list, params)
    else:
        names = arguments['ID_OR_NAME']

    # some list actions return dicts rather than plain names; reduce
    # them to a name (falling back to id) so workers get simple strings
    if names and isinstance(names[0], dict):
        names = [rec.get('name', rec.get('id')) for rec in names]

    if arguments['--datapackages']:
        # datapackage output needs the datastore field metadata
        arguments['--datastore-fields'] = True

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes,
        enumerate(compact_json(n) + b'\n' for n in names))

    # results arrive out of order; buffer them and re-emit in input order
    results = {}
    expecting_number = 0
    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            # each worker reports [timestamp, error, record]
            timestamp, error, record = json.loads(result.decode('utf-8'))
            results[finished] = record

            if not arguments['--quiet']:
                stderr.write('{0} {1} {2} {3} {4}\n'.format(
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    record.get('name', '') if record else '',
                    ).encode('utf-8'))

            if log:
                # NOTE(review): unlike the load/delete variants, no
                # log.flush() here -- confirm whether that is deliberate
                log.write(
                    compact_json([
                        timestamp,
                        finished,
                        error,
                        record.get('name', '') if record else None,
                        ]) + b'\n')

            datapackages_path = arguments['--datapackages']
            if datapackages_path:
                create_datapackage(record, datapackages_path, stderr)

            # keep the output in the same order as names
            while expecting_number in results:
                record = results.pop(expecting_number)
                if record:
                    # sort keys so we can diff output
                    jsonl_output.write(
                        compact_json(record, sort_keys=True) + b'\n')
                expecting_number += 1
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def delete_things(ckan, thing, arguments,
        worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    Delete datasets, groups, orgs, users etc.

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of the
    last record completed and records being processed is displayed on
    stderr.

    :param ckan: client object passed through to the worker path
    :param thing: the record type being deleted ('datasets', ...)
    :param arguments: docopt-style dict of parsed command-line options
    :param worker_pool: replacement worker pool function (for testing)
    :param stdin: replacement binary input stream (for testing)
    :param stdout: replacement binary output stream (for testing)
    :param stderr: replacement binary status stream (for testing)

    Returns 1 on broken pipe or crashed worker, 2 on interrupt.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    if stdout is None:
        # NOTE(review): uses sys.__stdout__ (the process's original
        # stdout), not sys.stdout -- presumably to bypass any
        # redirection of sys.stdout; confirm this is intentional
        stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        # child mode: delete the records the parent hands to us
        return delete_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_input = stdin
    if arguments['--input']:
        jsonl_input = open(arguments['--input'], 'rb')
    if arguments['--gzip']:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def name_reader():
        """
        Yield (number, json-encoded-name) pairs, honouring the
        --start-record and --max-records options, extracting all ids or
        names from each line (e.g. package_search, package_show or
        package_list output).  Record numbers here correspond to
        names/ids extracted, not lines.
        """
        start_record = int(arguments['--start-record'])
        max_records = arguments['--max-records']
        if max_records is not None:
            max_records = int(max_records)
        for num, name in enumerate(
                chain.from_iterable(
                    extract_ids_or_names(line) for line in jsonl_input), 1):
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, compact_json(name)

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    # command-line ids/names take precedence over stdin/--input
    if not arguments['ID_OR_NAME']:
        pool = worker_pool(cmd, processes, name_reader())
    else:
        pool = worker_pool(
            cmd, processes,
            enumerate(
                (compact_json(n) + b'\n' for n in arguments['ID_OR_NAME']),
                1))

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # child exited with traceback
                return 1
            # each worker reports [timestamp, error, response]
            timestamp, error, response = json.loads(result.decode('utf-8'))

            if not arguments['--quiet']:
                stderr.write(('%s %s %s %s %s\n' % (
                    finished, job_ids, next(stats), error,
                    compact_json(response).decode('utf-8')
                    if response else '')).encode('utf-8'))

            if log:
                log.write(
                    compact_json([
                        timestamp,
                        finished,
                        error,
                        response,
                        ]) + b'\n')
                # flush per record so the log survives a crash
                log.flush()
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2
def load_things(ckan, thing, arguments,
        worker_pool=None, stdin=None, stdout=None, stderr=None):
    """
    create and update datasets, groups and orgs

    The parent process creates a pool of worker processes and hands
    out json lines to each worker as they finish a task. Status of
    last record completed and records being processed is displayed
    on stderr.

    :param ckan: client object passed through to the worker path
    :param thing: the record type being loaded ('datasets', ...)
    :param arguments: docopt-style dict of parsed command-line options
    :param worker_pool: replacement worker pool function (for testing)
    :param stdin/stdout/stderr: replacement binary streams (for testing)

    Returns 1 on broken pipe or crashed worker, 2 on interrupt.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdin is None:
        stdin = getattr(sys.stdin, "buffer", sys.stdin)
    if stdout is None:
        stdout = getattr(sys.stdout, "buffer", sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, "buffer", sys.stderr)

    if arguments["--worker"]:
        # child mode: process the records the parent hands to us
        return load_things_worker(ckan, thing, arguments)

    log = None
    if arguments["--log"]:
        log = open(arguments["--log"], "a")

    jsonl_input = stdin
    if arguments["--input"]:
        jsonl_input = open(arguments["--input"], "rb")
    if arguments["--gzip"]:
        jsonl_input = gzip.GzipFile(fileobj=jsonl_input)

    def line_reader():
        """
        handle start-record and max-records options
        """
        start_record = int(arguments["--start-record"])
        max_records = arguments["--max-records"]
        if max_records is not None:
            max_records = int(max_records)
        for num, line in enumerate(jsonl_input, 1):  # records start from 1
            if num < start_record:
                continue
            if max_records is not None and num >= start_record + max_records:
                break
            yield num, line

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments["--processes"])
    if hasattr(ckan, "parallel_limit"):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes, line_reader())

    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # BUG FIX: a worker that died with a traceback produces a
                # falsy result; json.loads would raise a decode error here
                # instead of the intended error exit.  Guard added to match
                # the other load_things variant in this file.
                return 1
            timestamp, action, error, response = json.loads(
                result.decode("utf-8"))

            if not arguments["--quiet"]:
                stderr.write(
                    (
                        "%s %s %s %s %s %s\n"
                        % (
                            finished,
                            job_ids,
                            next(stats),
                            action,
                            error,
                            compact_json(response).decode("utf-8")
                            if response
                            else "",
                        )
                    ).encode("utf-8")
                )

            if log:
                log.write(
                    compact_json(
                        [timestamp, finished, action, error, response])
                    + b"\n")
                log.flush()
    if "pipe" in errors:
        return 1
    if "interrupt" in errors:
        return 2
def dump_things(ckan, thing, arguments,
        worker_pool=None, stdout=None, stderr=None):
    """
    dump all datasets, groups or orgs accessible by the connected user

    The parent process creates a pool of worker processes and hands
    out ids to each worker. Status of last record completed and records
    being processed is displayed on stderr.  With --dp-output, each
    dataset's resources are also downloaded and a datapackage.json is
    written alongside them.

    Returns 1 on broken pipe or crashed worker, 2 on interrupt.
    """
    if worker_pool is None:
        worker_pool = workers.worker_pool
    if stdout is None:
        stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    if stderr is None:
        stderr = getattr(sys.stderr, 'buffer', sys.stderr)

    if arguments['--worker']:
        return dump_things_worker(ckan, thing, arguments)

    log = None
    if arguments['--log']:
        log = open(arguments['--log'], 'a')

    jsonl_output = stdout
    if arguments['--dp-output']:
        # TODO: do we want to just divert this to devnull?
        # BUG FIX: must be opened binary ('wb') -- compact_json output
        # is bytes and writing it to a text-mode file raises TypeError
        # (the other dump_things variant already uses 'wb')
        jsonl_output = open(os.devnull, 'wb')
    if arguments['--output']:
        jsonl_output = open(arguments['--output'], 'wb')
    if arguments['--gzip']:
        jsonl_output = gzip.GzipFile(fileobj=jsonl_output)

    if arguments['--all']:
        get_thing_list = {
            'datasets': 'package_list',
            'groups': 'group_list',
            'organizations': 'organization_list',
            }[thing]
        names = ckan.call_action(get_thing_list, {})
    else:
        names = arguments['ID_OR_NAME']

    cmd = _worker_command_line(thing, arguments)
    processes = int(arguments['--processes'])
    if hasattr(ckan, 'parallel_limit'):
        # add your sites to ckanapi.remoteckan.MY_SITES instead of removing
        processes = min(processes, ckan.parallel_limit)
    stats = completion_stats(processes)
    pool = worker_pool(cmd, processes,
        enumerate(compact_json(n) + b'\n' for n in names))

    # results arrive out of order; buffer them and re-emit in input order
    results = {}
    expecting_number = 0
    with quiet_int_pipe() as errors:
        for job_ids, finished, result in pool:
            if not result:
                # BUG FIX: a crashed worker produces a falsy result;
                # json.loads would raise here instead of exiting with 1
                return 1
            timestamp, error, record = json.loads(result.decode('utf-8'))
            results[finished] = record

            if not arguments['--quiet']:
                stderr.write('{0} {1} {2} {3} {4}\n'.format(
                    finished,
                    job_ids,
                    next(stats),
                    error,
                    record.get('name', '') if record else '',
                    ).encode('utf-8'))

            if log:
                log.write(compact_json([
                    timestamp,
                    finished,
                    error,
                    record.get('name', '') if record else None,
                    ]) + b'\n')

            # BUG FIX: the whole --dp-output branch dereferenced record
            # unguarded; a worker error yields record=None and crashed here
            if arguments['--dp-output'] and record:
                _write_datapackage(record, arguments, stderr)

            # keep the output in the same order as names
            while expecting_number in results:
                record = results.pop(expecting_number)
                if record:
                    # sort keys so we can diff output
                    jsonl_output.write(compact_json(record,
                        sort_keys=True) + b'\n')
                expecting_number += 1
    if 'pipe' in errors:
        return 1
    if 'interrupt' in errors:
        return 2


def _write_datapackage(record, arguments, stderr):
    """
    Download a dataset's resources under <--dp-output>/<name>/data and
    write a datapackage.json describing them.  Best-effort: download
    and filesystem errors are reported on stderr, not raised.
    """
    # TODO: how are we going to handle which resources to leave alone?
    # They're very inconsistent in some instances, and I can't imagine
    # anyone wants to download a copy of, for example, the API base
    # endpoint
    resource_formats_to_ignore = ['API', 'api']

    dataset_name = record.get('name', '')
    # docopt dicts normally contain the key; KeyError fallback kept from
    # the original as a defensive default
    try:
        base_path = arguments['--dp-output']
    except KeyError:
        base_path = './'

    target_dir = '{base_path}/{name}/data'.format(
        base_path=base_path, name=dataset_name)
    try:
        os.makedirs(target_dir)
    except Exception as e:
        # BUG FIX: was e.message (removed in Python 3) written as str to
        # a bytes stream; use str(e) and encode
        stderr.write((str(e) + '\n').encode('utf-8'))

    # BUG FIX: default was '' (iterating an empty string); use a list
    for resource in record.get('resources', []):
        if resource['name'] is not None:
            resource_id = resource['name']
        else:
            resource_id = resource['id']
        resource_filename = os.path.split(resource['url'])[1]
        output = os.path.join(target_dir, resource_filename)
        # Resources can have a free-form address and no internal info,
        # so in those cases we're going to merely save them using the
        # UID. (If they even exist)
        if output.endswith('/'):
            output = os.path.join(output, resource_id)
        # datapackage.json format explicitly requires a path to the
        # resource
        resource['path'] = output
        try:
            if resource['format'] not in resource_formats_to_ignore:
                r = requests.get(resource['url'], stream=True)
                with open(output, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)
                            f.flush()
        except requests.ConnectionError:
            stderr.write(
                'URL {url} refused connection. The resource will not be '
                'downloaded\n'.format(url=resource['url']).encode('utf-8'))
        except requests.exceptions.RequestException as e:
            # BUG FIX: e.message does not exist in Python 3; also encode
            # for the binary stderr stream
            stderr.write((str(e) + '\n').encode('utf-8'))

    # BUG FIX: path was '{base_path}{dataset_name}/...' -- missing the
    # '/' separator used everywhere else; also close the file and open
    # binary since pretty_json (like compact_json above) emits bytes --
    # NOTE(review): confirm pretty_json's return type
    record['version'] = '1.0-beta.10'
    with open('{base_path}/{dataset_name}/datapackage.json'.format(
            base_path=base_path, dataset_name=dataset_name), 'wb') as dp:
        dp.write(pretty_json(record))