def test_user(args):
    """Test a library user's S3 access by attempting read, write and delete."""
    from botocore.exceptions import ClientError
    import boto3

    # 'l' is assumed to be supplied by the enclosing module in the source.
    account = get_iam_account(l, args, args.user_name)

    if not account.access_key:
        err("Can't test user {}; library does not have a record for the account (by ARN)"
            .format(args.user_name))

    session = boto3.Session(aws_access_key_id=account.access_key,
                            aws_secret_access_key=account.secret)

    root_s3 = get_resource(args, 's3')
    s3 = session.resource('s3')

    bn, prefix = split_bucket_name(args.bucket, default=None)

    root_bucket = root_s3.Bucket(bn)
    bucket = s3.Bucket(bn)

    prefixes = [prefix] if prefix else TOP_LEVEL_DIRS

    for prefix in prefixes:
        k = prefix + '/test/' + args.user_name
        rk = k + '-root'

        # Write a marker object with root credentials, so there is something to read.
        ro = root_bucket.put_object(Key=rk, Body=args.user_name)

        try:
            o = bucket.Object(rk)
            c = o.get()
            read = True
        except ClientError:
            read = False

        try:
            o = bucket.put_object(Key=k, Body=args.user_name)
            write = True
        except ClientError:
            write = False

        try:
            o.delete()
            delete = True
        except ClientError:
            delete = False

        # ro.delete()

        prt("{:<35s} {:<5s} {:<5s} {:<6s} {}".format(
            k,
            'read' if read else '',
            'write' if write else '',
            'delete' if delete else '',
            'no access' if not any((read, write, delete)) else ''))
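# A minimal usage sketch for test_user() (not in the source): TOP_LEVEL_DIRS is
# assumed to be a module-level list of bucket prefixes, and args the argparse
# Namespace produced by the CLI, so an equivalent direct call might be:
#
#   from argparse import Namespace
#   test_user(Namespace(user_name='jsmith', bucket='library-bucket'))
#
# For each prefix the routine prints one line, for example:
#   public/test/jsmith    read  write delete
# with 'no access' in the last column when all three operations fail.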
def metatab_derived_handler(m, skip_if_exists=None):
    """Create local Zip, Excel and Filesystem packages

    :param m: CLI Arguments object
    :param skip_if_exists: If True, don't recreate a package that already exists
    :return: list of (format, url, created) tuples for the packages that were built
    """
    from metatab.package import PackageError

    create_list = []
    url = None

    doc = MetatabDoc(m.mt_file)

    env = get_lib_module_dict(doc)

    if (m.args.excel is not False or m.args.zip is not False or
            (hasattr(m.args, 'filesystem') and m.args.filesystem is not False)):
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

    if m.args.force:
        skip_if_exists = False

    try:
        # Always create a filesystem package before ZIP or Excel, so we can use it
        # as a source of data for the other packages. This means that Transform
        # processes and programs only need to be run once.
        if any([m.args.filesystem, m.args.excel, m.args.zip]):
            _, url, created = make_filesystem_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('fs', url, created))

            m.mt_file = url

            env = {}  # Not needed anymore, since no more programs will be run.

        if m.args.excel is not False:
            _, url, created = make_excel_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('xlsx', url, created))

        if m.args.zip is not False:
            _, url, created = make_zip_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('zip', url, created))

        if m.args.csv is not False:
            _, url, created = make_csv_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('csv', url, created))

    except PackageError as e:
        err("Failed to generate package: {}".format(e))

    return create_list
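# A short sketch (not in the source) of how a caller might report the
# (format, url, created) tuples returned by metatab_derived_handler();
# `m` is assumed to be the MetapackCliMemo object built in metapack() below.
def report_derived_packages(m):
    for fmt, url, created in metatab_derived_handler(m):
        prt("{:<5s} {:<9s} {}".format(fmt, 'created' if created else 'existing', url))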
def package_info(doc):
    client = dw.api_client()

    username = '******'

    title = doc.find_first_value("Root.Title")

    key = join(username, slugify(title))

    try:
        ds = client.get_dataset(key)
        prt(json.dumps(ds, indent=4))
    except RestApiError as e:
        err(e)
def metatab_query_handler(m):
    if m.args.resource or m.args.head:

        limit = 20 if m.args.head else None

        try:
            doc = MetatabDoc(m.mt_file, cache=m.cache)
        except OSError as e:
            err("Failed to open Metatab doc: {}".format(e))
            return  # Never reached; err() exits

        if m.resource:
            dump_resource(doc, m.resource, limit)
        else:
            dump_resources(doc)
def get_resource_urls(doc):
    resources = {}

    for dist in doc.find("Root.Distribution"):

        package_url, metadata_url = resolve_package_metadata_url(dist.value)

        u = Url(package_url)

        if u.resource_format == 'zip':
            prt("Skipping ZIP package ", package_url)

        elif u.resource_format == 'xlsx':
            resources[basename(package_url)] = package_url
            prt("Adding XLS package ", package_url)

        elif u.resource_format == 'csv':
            resources[basename(package_url)] = package_url
            prt("Adding CSV package {}".format(basename(package_url)))

            try:
                p = open_package(package_url)
            except (IOError, MetatabError) as e:
                err("Failed to open package '{}' from reference '{}': {}"
                    .format(package_url, dist.value, e))

            for r in p.resources():
                mimetype = mimetypes.guess_type(r.resolved_url)[0]

                try:
                    ext = mimetypes.guess_extension(mimetype)[1:]
                except (TypeError, AttributeError):  # mimetype or extension may be None
                    ext = None

                # Always use '.csv'; Data.world currently gets the format from
                # the name, not the URL.
                resources[r.name + '.csv'] = r.resolved_url
                prt("Adding CSV resource {}".format(r.name))

        else:
            prt('Skipping {}'.format(package_url))

    return resources
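# Illustrative use of get_resource_urls() (hypothetical helper, not in the
# source): open a package by URL and list the flattened name -> URL mapping
# that would be sent to Data.World.
def print_resource_urls(package_url):
    doc = open_package(package_url)
    for name, url in sorted(get_resource_urls(doc).items()):
        prt("{:<40s} {}".format(name, url))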
def metaworld():
    import argparse

    parser = argparse.ArgumentParser(
        prog='metakan',
        description='Publish packages to Data.World, version {}'.format(_meta.__version__))

    parser.add_argument('-i', '--info', default=False, action='store_true',
                        help="Show package information")

    parser.add_argument('metatabfile', nargs='?', default=DEFAULT_METATAB_FILE,
                        help='Path to a Metatab file')

    class MetapackCliMemo(object):
        def __init__(self, args):
            self.cwd = getcwd()
            self.args = args
            self.cache = get_cache('metapack')

            self.mtfile_arg = args.metatabfile if args.metatabfile else join(
                self.cwd, DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)
            self.resource = self.mtfile_url.parts.fragment

            self.package_url, self.mt_file = resolve_package_metadata_url(
                self.mtfile_url.rebuild_url(False, False))

    m = MetapackCliMemo(parser.parse_args(sys.argv[1:]))

    try:
        doc = MetatabDoc(m.mt_file, cache=m.cache)
    except (IOError, MetatabError) as e:
        err("Failed to open metatab '{}': {}".format(m.mt_file, e))

    if m.args.info:
        package_info(doc)
    else:
        send_to_dw(doc)

    exit(0)
def write(self, body, path, acl=None):
    from botocore.exceptions import ClientError
    import mimetypes

    acl = acl if acl is not None else self._acl

    # if isinstance(body, six.string_types):
    #     with open(body, 'rb') as f:
    #         body = f.read()

    key = join(self._prefix, path).strip('/')

    try:
        o = self._bucket.Object(key)
        if o.content_length == len(body):
            prt("File '{}' already in bucket; skipping".format(key))
            return self.access_url(path)
        else:
            prt("File '{}' already in bucket, but length is different; re-writing"
                .format(key))

    except ClientError as e:
        if int(e.response['Error']['Code']) in (403, 405):
            err("S3 Access failed for '{}:{}': {}\nNOTE: With Docker, this error is often "
                "the result of container clock drift. Check your container clock. "
                .format(self._bucket_name, key, e))
        elif int(e.response['Error']['Code']) != 404:
            err("S3 Access failed for '{}:{}': {}".format(self._bucket_name, key, e))

    ct = mimetypes.guess_type(key)[0]

    try:
        self._bucket.put_object(
            Key=key,
            Body=body,
            ACL=acl,
            ContentType=ct if ct else 'binary/octet-stream')
    except Exception as e:
        # The source called self.err() here; err() matches the other calls in this method.
        err("Failed to write '{}': {}".format(key, e))

    return self.access_url(path)
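# A hedged usage sketch for S3Bucket.write(); S3Bucket is assumed to be
# constructed with a bucket URL, ACL and optional profile, as in
# create_packages() below. The bucket and file names are illustrative.
#
#   s3 = S3Bucket('s3://example-bucket/packages', acl='public-read')
#   with open('example-1.0.0.zip', 'rb') as f:
#       url = s3.write(f.read(), 'example-1.0.0.zip')
#   prt("Package available at {}".format(url))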
def delete_user(args):
    from botocore.exceptions import ClientError

    client = get_client(args, 'iam')

    try:
        resource = get_resource(args, 'iam')
        user = resource.User(args.user_name)

        # Access keys and inline policies must be removed before the user can be deleted.
        for key in user.access_keys.all():
            prt("Deleting user key: {}".format(key))
            key.delete()

        for policy in user.policies.all():
            prt("Deleting user policy: {}".format(policy.name))
            policy.delete()

        response = client.delete_user(UserName=args.user_name)

        prt("Deleted user: {}".format(args.user_name))

    except ClientError as e:
        err("Could not delete user: {}".format(e))
def set_mt_arg(self, metatabfile):

    self.mtfile_arg = metatabfile if metatabfile else join(self.cwd, DEFAULT_METATAB_FILE)

    self.mtfile_url = Url(self.mtfile_arg)
    self.resource = self.mtfile_url.parts.fragment

    self.package_url, self.mt_file = resolve_package_metadata_url(
        self.mtfile_url.rebuild_url(False, False))

    self.api_key = self.args.api or getenv('METAKAN_API_KEY')

    self.ckan_url = self.args.ckan or getenv('METAKAN_CKAN_URL')

    if not self.ckan_url:
        err("Set the --ckan option or the METAKAN_CKAN_URL env var to set the URL of a CKAN instance")

    if not self.api_key:
        err("Set the --api option or the METAKAN_API_KEY env var with the API key to a CKAN instance")
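# The two options above can come from the environment instead of the command
# line; a typical shell setup (values are placeholders) would be:
#
#   export METAKAN_CKAN_URL=https://ckan.example.org
#   export METAKAN_API_KEY=...your-ckan-api-key...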
def metapack():
    import argparse

    parser = argparse.ArgumentParser(
        prog='metapack',
        description='Create and manipulate metatab data packages, version {}'.format(_meta.__version__))

    parser.add_argument('metatabfile', nargs='?',
                        help="Path or URL to a metatab file. If not provided, defaults to 'metadata.csv' ")

    parser.add_argument('-p', '--profile', help="Name of a BOTO or AWS credentials profile", required=False)

    parser.add_argument('--exceptions', default=False, action='store_true',
                        help='Show the full stack trace for some unhandled exceptions')

    parser.set_defaults(handler=None)

    ##
    ## Build Group

    build_group = parser.add_argument_group('Building Metatab Files',
                                            'Build and manage a metatab file for a package')

    build_group.add_argument('-c', '--create', action='store', nargs='?', default=False,
                             help="Create a new metatab file, from a named template. With no argument, uses the "
                                  "'metatab' template ")

    build_group.add_argument('-a', '--add', default=False,
                             help='Add a file or URL to the resources. With a directory, add all data files in the '
                                  'directory. If given a URL to a web page, will add all links that point to CSV, '
                                  'Excel files and data files in ZIP files. (Caution: it will download and cache '
                                  'all of these files. )')

    # build_group.add_argument('-S', '--scrape',
    #                          help='Similar to --add, but scrape a web page for links to data files, documentation '
    #                               'and web pages and add the links as resources ')

    # build_group.add_argument('-r', '--resources', default=False, action='store_true',
    #                          help='Rebuild the resources, intuiting rows and encodings from the URLs')

    build_group.add_argument('-s', '--schemas', default=False, action='store_true',
                             help='Rebuild the schemas for files referenced in the resource section')

    build_group.add_argument('-d', '--datapackage', action='store_true', default=False,
                             help="Write a datapackage.json file adjacent to the metatab file")

    build_group.add_argument('-u', '--update', action='store_true', default=False,
                             help="Update the Name from the Datasetname, Origin and Version terms")

    build_group.add_argument('-F', '--force', action='store_true', default=False,
                             help='Force some operations, like updating the name and building packages')

    ##
    ## Derived Package Group

    derived_group = parser.add_argument_group('Derived Packages', 'Generate other types of packages')

    derived_group.add_argument('-e', '--excel', action='store_true', default=False,
                               help='Create an Excel archive from a metatab file')

    derived_group.add_argument('-z', '--zip', action='store_true', default=False,
                               help='Create a ZIP archive from a metatab file')

    derived_group.add_argument('-f', '--filesystem', action='store_true', default=False,
                               help='Create a filesystem archive from a metatab file')

    derived_group.add_argument('-v', '--csv', action='store_true', default=False,
                               help='Create a CSV archive from a metatab file')

    ##
    ## Query Package Group

    query_group = parser.add_argument_group('Query', 'Return information and data from a package')

    query_group.add_argument('-r', '--resource', default=False, action='store_true',
                             help='If the URL has no fragment, dump the resources listed in the metatab file.'
                                  ' With a fragment, dump a resource as a CSV')

    query_group.add_argument('-H', '--head', default=False, action='store_true',
                             help="Dump the first 20 lines of a resource ")

    ##
    ## Administration Group

    admin_group = parser.add_argument_group('Administration', 'Information and administration')

    admin_group.add_argument('--clean-cache', default=False, action='store_true',
                             help="Clean the download cache")

    admin_group.add_argument('-C', '--clean', default=False, action='store_true',
                             help="For some operations, like updating schemas, clear the section of existing terms first")

    admin_group.add_argument('-i', '--info', default=False, action='store_true',
                             help="Show configuration information")

    admin_group.add_argument('-n', '--name', default=False, action='store_true',
                             help="Print the name of the package")

    admin_group.add_argument('-E', '--enumerate',
                             help='Enumerate the resources referenced from a URL. Does not alter the Metatab file')

    admin_group.add_argument('--html', default=False, action='store_true',
                             help='Generate HTML documentation')

    admin_group.add_argument('--markdown', default=False, action='store_true',
                             help='Generate Markdown documentation')

    # cmd = parser.add_subparsers(title='Plugin Commands', help='Additional commands supplied by plugins')
    # load_plugins(cmd)

    class MetapackCliMemo(object):
        def __init__(self, args):
            self.cwd = getcwd()
            self.args = args
            self.cache = get_cache('metapack')

            if args.metatabfile and args.metatabfile.startswith('#'):
                # It's just a fragment; attach it to the default metatab file.
                args.metatabfile = join(self.cwd, DEFAULT_METATAB_FILE) + args.metatabfile

            self.mtfile_arg = args.metatabfile if args.metatabfile else join(self.cwd, DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)
            self.resource = self.mtfile_url.parts.fragment

            self.package_url, self.mt_file = resolve_package_metadata_url(
                self.mtfile_url.rebuild_url(False, False))

    m = MetapackCliMemo(parser.parse_args(sys.argv[1:]))

    if m.args.info:
        metatab_info(m.cache)
        exit(0)

    if m.args.profile:
        from metatab.s3 import set_s3_profile
        set_s3_profile(m.args.profile)

    try:
        for handler in (metatab_build_handler, metatab_derived_handler,
                        metatab_query_handler, metatab_admin_handler):
            handler(m)
    except Exception as e:
        if m.args.exceptions:
            raise e
        else:
            err(e)

    clean_cache(m.cache)
def metatab_build_handler(m):
    if m.args.create is not False:

        template = m.args.create if m.args.create else 'metatab'

        if not exists(m.mt_file):
            doc = make_metatab_file(template)

            doc['Root']['Identifier'] = six.text_type(uuid4())
            doc['Root']['Created'] = datetime_now()

            write_doc(doc, m.mt_file)

            prt('Created', m.mt_file)
        else:
            err('File', m.mt_file, 'already exists')

    if m.args.add:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)
        add_resource(m.mt_file, m.args.add, cache=m.cache)

    if False:  # m.args.resources:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        doc = MetatabDoc(m.mt_file)

        try:
            doc['Schema'].clean()
        except KeyError:
            pass

        for t in list(doc['Resources']):  # w/o list(), will iterate over new terms

            if not t.term_is('root.datafile'):
                continue

            if t.as_dict().get('url'):
                add_resource(doc, t.as_dict()['url'], m.cache)
            else:
                warn("Entry '{}' on row {} is missing a url; skipping".format(t.join, t.row))

        write_doc(doc, m.mt_file)

    if m.args.schemas:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)
        process_schemas(m.mt_file, cache=m.cache, clean=m.args.clean)

    if m.args.datapackage:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        from metatab.datapackage import convert_to_datapackage

        doc = MetatabDoc(m.mt_file)

        u = Url(m.mt_file)

        if u.proto == 'file':
            dpj_file = join(dirname(abspath(u.parts.path)), 'datapackage.json')
        else:
            dpj_file = join(getcwd(), 'datapackage.json')

        try:
            with open(dpj_file, 'w') as f:
                f.write(json.dumps(convert_to_datapackage(doc), indent=4))
        except ConversionError as e:
            err(e)

    if m.mtfile_url.scheme == 'file' and m.args.update:
        update_name(m.mt_file, fail_on_missing=True, force=m.args.force)
def metatab():
    import argparse

    parser = argparse.ArgumentParser(
        prog='metatab',
        description='Metatab file parser, version {}'.format(_meta.__version__))

    parser.add_argument('-C', '--clean-cache', default=False, action='store_true',
                        help="Clean the download cache")

    g = parser.add_mutually_exclusive_group(required=True)

    g.add_argument('-i', '--info', default=False, action='store_true',
                   help="Show configuration information")

    g.add_argument('-c', '--create', action='store', nargs='?', default=False,
                   help="Create a new metatab file, from a named template. With no argument, uses the 'metatab' template ")

    g.add_argument('-t', '--terms', default=False, action='store_true',
                   help='Parse a file and print out the stream of terms, before interpretation')

    g.add_argument('-I', '--interp', default=False, action='store_true',
                   help='Parse a file and print out the stream of terms, after interpretation')

    g.add_argument('-j', '--json', default=False, action='store_true',
                   help='Parse a file and print out a JSON representation')

    g.add_argument('-y', '--yaml', default=False, action='store_true',
                   help='Parse a file and print out a YAML representation')

    g.add_argument('-R', '--resource', default=False, action='store_true',
                   help='If the URL has no fragment, dump the resources listed in the metatab file. '
                        'With a fragment, dump a resource as a CSV')

    g.add_argument('-H', '--head', default=False, action='store_true',
                   help="Dump the first 20 lines of a resource ")

    g.add_argument('-S', '--schema',
                   help='Dump the schema for one named resource')

    parser.add_argument('-d', '--show-declaration', default=False, action='store_true',
                        help='Parse a declaration file and print out the declaration dict. Use -j or -y for the format')

    parser.add_argument('-D', '--declare',
                        help='Parse and incorporate a declaration before parsing the file. '
                             '(Adds the declaration to the start of the file as the first term.)')

    parser.add_argument('file', nargs='?', default=DEFAULT_METATAB_FILE,
                        help='Path to a Metatab file')

    args = parser.parse_args(sys.argv[1:])

    # Specifying a fragment screws up setting the default metadata file name
    if args.file.startswith('#'):
        args.file = DEFAULT_METATAB_FILE + args.file

    cache = get_cache('metapack')

    if args.info:
        prt('Version  : {}'.format(_meta.__version__))
        prt('Cache dir: {}'.format(str(cache.getsyspath('/'))))
        exit(0)

    if args.clean_cache:
        clean_cache(cache)

    if args.create is not False:
        new_metatab_file(args.file, args.create)
        exit(0)

    if args.resource or args.head:

        limit = 20 if args.head else None

        u = Url(args.file)
        resource = u.parts.fragment
        metadata_url = u.rebuild_url(False, False)

        package_url, metadata_url = resolve_package_metadata_url(metadata_url)

        try:
            doc = MetatabDoc(metadata_url, cache=cache)
        except OSError as e:
            err("Failed to open Metatab doc: {}".format(e))
            return  # Never reached; err() exits

        if resource:
            dump_resource(doc, resource, limit)
        else:
            dump_resources(doc)

        exit(0)

    if args.show_declaration:

        doc = MetatabDoc()
        doc.load_declarations([args.file])

        print(json.dumps({
            'terms': doc.decl_terms,
            'sections': doc.decl_sections
        }, indent=4))

        exit(0)

    else:

        package_url, metadata_url = resolve_package_metadata_url(args.file)

        try:
            doc = MetatabDoc(metadata_url, cache=cache)
        except IOError as e:
            # The source had a stray 'raise' before this call; err() prints and exits.
            err("Failed to open '{}': {}".format(metadata_url, e))

    if args.terms:
        for t in doc._term_parser:
            print(t)

    elif args.json:
        print(json.dumps(doc.as_dict(), indent=4))

    elif args.yaml:
        import yaml
        print(yaml.safe_dump(doc.as_dict(), default_flow_style=False, indent=4))

    elif args.schema:
        dump_schema(doc, args.schema)

    exit(0)
def create_packages(m, second_stage_mtfile, distupdated=None):
    """ Create Excel, ZIP, FS and CSV packages for upload to S3

    :param m: CLI Arguments object
    :param second_stage_mtfile: Path to a Metatab file, which must have distribution entries
    :param distupdated: If True, the distributions were updated, so packages are rebuilt even if they exist
    :return: list of (format, url) tuples for the packages that were uploaded
    """

    create_list = []
    url = None

    doc = MetatabDoc(second_stage_mtfile)

    access_value = doc.find_first_value('Root.Access')

    if access_value == 'private':
        acl = 'private'
    else:
        acl = 'public-read'

    # Only the first Filesystem package needs an env; the others won't need to run
    # processing, since they are building from processed files.
    env = {}

    s3 = S3Bucket(m.args.s3, acl=acl, profile=m.args.profile)

    urls = []

    if (m.args.excel is not False or m.args.zip is not False or
            (hasattr(m.args, 'filesystem') and m.args.filesystem is not False)):
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

    if m.args.force or distupdated is True:
        skip_if_exists = False
    else:
        skip_if_exists = True

    try:

        # Always create a filesystem package before ZIP or Excel, so we can use it
        # as a source of data for the other packages. This means that Transform
        # processes and programs only need to be run once.

        _, third_stage_mtfile, created = make_filesystem_package(
            second_stage_mtfile, m.cache, get_lib_module_dict(doc), skip_if_exists)

        if m.args.excel is not False:
            _, ex_url, created = make_excel_package(third_stage_mtfile, m.cache, env, skip_if_exists)

            with open(ex_url, mode='rb') as f:
                urls.append(('excel', s3.write(f.read(), basename(ex_url), acl)))

        if m.args.zip is not False:
            _, zip_url, created = make_zip_package(third_stage_mtfile, m.cache, env, skip_if_exists)

            with open(zip_url, mode='rb') as f:
                urls.append(('zip', s3.write(f.read(), basename(zip_url), acl)))

        # Note! This is a FileSystem package on the remote S3 bucket, not locally
        if m.args.fs is not False:
            try:
                fs_p, fs_url, created = make_s3_package(
                    third_stage_mtfile, m.args.s3, m.cache, env, acl, skip_if_exists)
            except NoCredentialsError:
                print(getenv('AWS_SECRET_ACCESS_KEY'))
                err("Failed to find boto credentials for S3. "
                    "See http://boto3.readthedocs.io/en/latest/guide/configuration.html ")

            urls.append(('fs', fs_url))

        # Make the CSV package from the filesystem package on S3; this will ensure
        # that the package's resource URLs point to the S3 objects
        if m.args.csv is not False:

            # Using the signed URL in case the bucket is private
            p = CsvPackage(fs_p.access_url, cache=m.tmp_cache)

            csv_url = p.save(PACKAGE_PREFIX)

            with open(csv_url, mode='rb') as f:
                urls.append(('csv', s3.write(f.read(), basename(csv_url), acl)))

    except PackageError as e:
        err("Failed to generate package: {}".format(e))

    return urls
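# How the return value is consumed (sketch, not in the source): the metasync()
# entry point below passes the (format, url) pairs straight to tabulate():
#
#   urls = create_packages(m, second_stage_mtfile, distupdated=True)
#   prt(tabulate(urls))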
def metasync():
    import argparse

    parser = argparse.ArgumentParser(
        prog='metasync',
        description='Create packages and store them in S3 buckets, version {}'.format(_meta.__version__))

    parser.add_argument('-i', '--info', default=False, action='store_true',
                        help="Show configuration information")

    parser.add_argument('-v', '--verbose', default=False, action='store_true',
                        help="For some commands, be more verbose")

    parser.add_argument('-F', '--force', action='store_true', default=False,
                        help='Force building packages, even when they already exist')

    parser.add_argument('-p', '--profile', help="Name of a BOTO or AWS credentials profile", required=False)

    parser.add_argument('-s', '--s3', help="URL to S3 where packages will be stored", required=False)

    parser.add_argument('-S', '--all-s3', help="Synonym for `metasync -c -e -f -z -s <url>`", required=False)

    parser.add_argument('-e', '--excel', action='store_true', default=False,
                        help='Create an Excel package from a metatab file and copy it to S3. ')

    parser.add_argument('-z', '--zip', action='store_true', default=False,
                        help='Create a ZIP package from a metatab file and copy it to S3. ')

    parser.add_argument('-c', '--csv', action='store_true', default=False,
                        help='Create a CSV package from a metatab file and copy it to S3. '
                             'Requires building a filesystem package. ')

    parser.add_argument('-f', '--fs', action='store_true', default=False,
                        help='Create a filesystem package. Unlike -e and -z, only writes the package to S3.')

    parser.add_argument('-D', '--docker', help="Re-run the metasync command through docker",
                        action='store_true', default=False)

    parser.add_argument('-C', '--credentials',
                        help="Show S3 credentials and exit. "
                             "Eval this string to set up credentials in other shells.",
                        action='store_true', default=False)

    parser.add_argument('metatabfile', nargs='?', help='Path to a Metatab file')

    class MetapackCliMemo(object):
        def __init__(self, raw_args):
            self.cwd = getcwd()
            self.raw_args = raw_args
            self.args = parser.parse_args(self.raw_args[1:])

            self.cache = get_cache('metapack')

            # This one is for loading packages that have just been written to S3.
            self.tmp_cache = get_cache('temp')
            clean_cache(self.tmp_cache)

            if self.args.all_s3:
                self.args.s3 = self.args.all_s3
                self.args.excel = True
                self.args.zip = True
                self.args.csv = True
                self.args.fs = True

            self.mtfile_arg = self.args.metatabfile if self.args.metatabfile else join(
                self.cwd, DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)
            self.resource = self.mtfile_url.parts.fragment

            self.package_url, self.mt_file = resolve_package_metadata_url(
                self.mtfile_url.rebuild_url(False, False))

            # The CSV package is built from the filesystem package, so -c implies -f.
            self.args.fs = self.args.csv or self.args.fs

    m = MetapackCliMemo(sys.argv)

    if m.args.credentials:
        show_credentials(m.args.profile)
        exit(0)

    if m.args.docker:
        run_docker(m)

    if m.args.info:
        metatab_info(m.cache)
        exit(0)

    if not m.args.s3:
        doc = MetatabDoc(m.mt_file)
        m.args.s3 = doc['Root'].find_first_value('Root.S3')

    if not m.args.s3:
        err("Must specify either -S or -s")

    if m.args.excel is not False or m.args.zip is not False or m.args.fs is not False:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

    doc = MetatabDoc(m.mt_file)
    doc['Root'].get_or_new_term('Root.S3', m.args.s3)
    write_doc(doc, m.mt_file)

    second_stage_mtfile, distupdated = update_distributions(m)

    if second_stage_mtfile != m.mt_file:
        prt("Building packages from: ", second_stage_mtfile)

    created = create_packages(m, second_stage_mtfile, distupdated=distupdated)

    prt("Synchronized these Package Urls")
    prt(tabulate(created))

    exit(0)
from os.path import join, basename

from metatab import _meta, DEFAULT_METATAB_FILE, resolve_package_metadata_url, MetatabDoc, MetatabError, open_package
from metatab.cli.core import err
from rowgenerators import get_cache, Url

from .core import prt
from metatab.util import slugify

import json
import mimetypes

try:
    import datadotworld as dw
    from datadotworld.client.api import RestApiError
except ImportError:
    err("To run the Metaworld importer, you must first install the datadotworld package. "
        "See https://github.com/datadotworld/data.world-py")
def send_to_ckan(m):
    from ckanapi import RemoteCKAN, NotFound

    try:
        doc = MetatabDoc(m.mt_file, cache=m.cache)
    except (IOError, MetatabError) as e:
        err("Failed to open metatab '{}': {}".format(m.mt_file, e))

    c = RemoteCKAN(m.ckan_url, apikey=m.api_key)

    ckanid = doc.find_first_value('Root.Ckanid')
    identifier = doc.find_first_value('Root.Identifier')

    name = doc.find_first('Root.Name')

    ckan_name = name.value.replace('.', '-')

    id_name = ckanid or ckan_name

    try:
        pkg = c.action.package_show(name_or_id=id_name)
        prt("Updating CKAN dataset for '{}'".format(ckan_name))
    except NotFound:
        pkg = c.action.package_create(name=ckan_name, package_id=identifier)
        prt("Adding CKAN dataset for '{}'".format(ckan_name))

    pkg['title'] = doc.find_first_value('Root.Title')

    if not pkg['title']:
        pkg['title'] = doc.find_first_value('Root.Description')

    try:
        pkg['notes'] = doc.markdown  # doc.find_first_value('Root.Description')
    except OSError as e:
        warn(e)

    pkg['version'] = name.properties.get('version')

    pkg['groups'] = [{'name': g.value} for g in doc['Root'].find('Root.Group')]

    pkg['tags'] = [{'name': g.value} for g in doc['Root'].find('Root.Tag')]

    def get_org(name):
        if not name:
            return None
        try:
            # The body was incomplete in the source; presumably it looked up the
            # organization by name, roughly as follows.
            return c.action.organization_show(id=name)
        except NotFound:
            return None

    org_name = name.get('Origin', doc['Root'].find_first_value('Root.CkanOrg'))

    if org_name:
        org_name_slug = org_name.value.replace('.', '-')
        try:
            owner_org = c.action.organization_show(id=org_name_slug).get('id')
            pkg['owner_org'] = owner_org
        except NotFound:
            warn("Didn't find org for '{}'; not setting organization ".format(org_name_slug))
            org_name_slug = None
    else:
        org_name_slug = None

    extras = {}

    for t in doc.find('*.*', section='Root'):
        if not t.term_is('Root.Distribution'):
            extras[t.qualified_term] = t.value

    for t in name.children:
        extras[t.qualified_term] = t.value

    pkg['extras'] = [{'key': k, 'value': v} for k, v in extras.items()]

    resources = []

    for dist in doc.find("Root.Distribution"):

        package_url, metadata_url = resolve_package_metadata_url(dist.value)

        u = Url(package_url)

        if u.resource_format == 'zip':
            d = dict(
                url=package_url,
                name=basename(package_url),
                format='ZIP',
                mimetype=mimetypes.guess_type(package_url)[0],
                description='ZIP version of package')
            resources.append(d)
            prt("Adding ZIP package ", d['name'])

        elif u.resource_format == 'xlsx':
            d = dict(
                url=package_url,
                name=basename(package_url),
                format='XLSX',
                mimetype=mimetypes.guess_type(package_url)[0],
                description='Excel version of package')
            resources.append(d)
            prt("Adding XLS package ", d['name'])

        elif u.resource_format == 'csv':
            d = dict(
                url=package_url,
                name=basename(package_url),
                format='csv',
                mimetype=mimetypes.guess_type(metadata_url)[0],
                description='CSV Package Metadata in Metatab format')
            resources.append(d)
            prt("Adding {} package {}".format(d['format'], d['name']))

            try:
                p = open_package(package_url)
            except (IOError, MetatabError) as e:
                err("Failed to open package '{}' from reference '{}': {}"
                    .format(package_url, dist.value, e))

            for r in p.resources():
                mimetype = mimetypes.guess_type(r.resolved_url)[0]

                try:
                    ext = mimetypes.guess_extension(mimetype)[1:]
                except (TypeError, AttributeError):  # mimetype or extension may be None
                    ext = None

                d = dict(
                    name=r.name,
                    format=ext,
                    url=r.resolved_url,
                    mimetype=mimetype,
                    description=r.markdown)

                resources.append(d)
                prt("Adding {} resource {}".format(d['format'], d['name']))

    pkg['resources'] = resources

    c.action.package_update(**pkg)

    pkg = c.action.package_show(name_or_id=ckan_name)

    update_dist(doc, [], join(m.ckan_url, 'dataset', ckan_name))

    ##
    ## Add a term with CKAN info.
    doc['Root'].get_or_new_term('CkanId', pkg['id'])

    if org_name_slug is None and pkg.get('organization'):
        doc['Root'].get_or_new_term('CkanOrg', (pkg.get('organization') or {}).get('name'))

    groups = doc['Root'].find('Group')
    for g in groups:
        doc.remove_term(g)

    for group in pkg.get('groups', []):
        doc['Root'].new_term('Group', group['name'])

    write_doc(doc, m.mt_file)