def __init__(self, raw_args):
    """Capture CLI state: parsed arguments, caches, and the resolved
    metatab file / package URLs."""
    self.cwd = getcwd()
    self.raw_args = raw_args
    self.args = parser.parse_args(self.raw_args[1:])

    self.cache = get_cache('metapack')

    # Separate scratch cache for loading packages that have just been
    # written to S3; always start it out empty.
    self.tmp_cache = get_cache('temp')
    clean_cache(self.tmp_cache)

    if self.args.all_s3:
        # --all-s3 implies building every package format for S3.
        self.args.s3 = self.args.all_s3
        for flag in ('excel', 'zip', 'csv', 'fs'):
            setattr(self.args, flag, True)

    self.mtfile_arg = self.args.metatabfile or join(self.cwd,
                                                    DEFAULT_METATAB_FILE)

    self.mtfile_url = Url(self.mtfile_arg)
    self.resource = self.mtfile_url.parts.fragment

    self.package_url, self.mt_file = resolve_package_metadata_url(
        self.mtfile_url.rebuild_url(False, False))

    # Building a CSV package requires a filesystem package first.
    self.args.fs = self.args.csv or self.args.fs
def __init__(self, url=None, downloader=None, **kwargs):
    """Build a metatab: URL, normalizing the path and fragment so the
    URL points at the package's metadata.

    :param url: the package or metadata reference URL
    :param downloader: required downloader object
    """
    kwargs['proto'] = 'metatab'

    u = Url(url, **kwargs)

    assert downloader

    # If there is no file with an extension in the path, assume that this
    # is a filesystem package, and that the path should have
    # DEFAULT_METATAB_FILE appended.
    if file_ext(basename(u.path)) not in ('zip', 'xlsx') + self.simple_file_formats:
        u.path = join(u.path, DEFAULT_METATAB_FILE)

    super().__init__(str(u), downloader=downloader, **kwargs)

    self.scheme_extension = 'metatab'

    # Pick the fragment that locates the metadata within the resource.
    if basename(self.path) == DEFAULT_METATAB_FILE:
        frag = ''
    elif self.resource_format in self.simple_file_formats:
        frag = ''
    elif self.resource_format == 'xlsx':
        frag = 'meta'  # metadata lives in the 'meta' worksheet
    elif self.resource_format == 'zip':
        frag = DEFAULT_METATAB_FILE  # metadata file inside the archive
    else:
        # FIX: previously `frag` was left unassigned here, raising
        # UnboundLocalError for unexpected resource formats.
        frag = ''

    self.fragment = [frag, None]
def update_dist(doc, old_dists, v):
    """Remove old distribution terms matching *v*'s format, then add *v*
    if it isn't already present. Returns True when a new term was added.

    Removal may be attempted twice for the same term, because both
    <name>.csv and <name>/metadata.csv have the same format — hence the
    try/except around remove_term.
    """
    pkg_name = doc.find_first_value("Root.Name")
    new_format = Url(v).resource_format

    for dist in old_dists:
        same_format = Url(dist.value).resource_format == new_format
        if same_format and pkg_name not in dist.value:
            try:
                doc.remove_term(dist)
            except ValueError:
                pass  # already removed on an earlier pass

    if doc.find_first('Root.Distribution', v):
        return False

    doc['Root'].new_term('Root.Distribution', v)
    return True
def __init__(self, args):
    """Record working directory, parsed args, download cache, and the
    resolved metatab file / package URLs."""
    self.cwd = getcwd()
    self.args = args
    self.cache = get_cache('metapack')

    if args.metatabfile:
        self.mtfile_arg = args.metatabfile
    else:
        self.mtfile_arg = join(self.cwd, DEFAULT_METATAB_FILE)

    self.mtfile_url = Url(self.mtfile_arg)
    self.resource = self.mtfile_url.parts.fragment

    self.package_url, self.mt_file = resolve_package_metadata_url(
        self.mtfile_url.rebuild_url(False, False))
def write_doc(doc, mt_file):
    """
    Write a Metatab doc to a CSV file, and update the Modified time.

    :param doc: MetatabDoc to write
    :param mt_file: destination URL or path; only file: URLs are written
    :return: True if the document was written, False otherwise
    """
    import subprocess

    doc['Root']['Modified'] = datetime_now()

    doc['Root'].sort_by_term(order=[
        'Root.Declare',
        'Root.Title',
        'Root.Description',
        'Root.Identifier',
        'Root.Name',
        'Root.Dataset',
        'Root.Origin',
        'Root.Time',
        'Root.Space',
        'Root.Grain',
        'Root.Version',
        'Root.Group',
        'Root.Tag',
        'Root.Keyword',
        'Root.Subject',
        'Root.Created',
        'Root.Modified',
        'Root.Issued',
        'Root.Access',
        'Root.Distribution'
    ])

    # Record the git origin fetch URL, when available.
    # FIX: the original used next() without a default, raising
    # StopIteration when the output had no 'Fetch' line, and did not
    # handle git being absent or the directory not being a repository.
    try:
        out = subprocess.run(['git', 'remote', 'show', 'origin'],
                             stdout=subprocess.PIPE).stdout.decode('utf-8')
        fetchline = next((l.split() for l in out.splitlines() if 'Fetch' in l),
                         None)
    except (OSError, subprocess.SubprocessError):
        fetchline = None

    if fetchline:
        t = doc['Root'].get_or_new_term('GitUrl')
        t.value = fetchline[-1]

    u = Url(mt_file)

    if u.scheme == 'file':
        doc.write_csv(mt_file)
        return True
    else:
        return False
def test_ipy(self):
    """Exercise ipynb URL parsing and the ipynb row generator."""
    from rowgenerators import SourceSpec, Url, RowGenerator, get_cache
    from rowgenerators.fetch import download_and_cache

    urls = ('ipynb+file:foobar.ipynb',
            'ipynb+http://example.com/foobar.ipynb',
            'ipynb:foobar.ipynb')

    for url in urls:
        u = Url(url)
        print(u, u.path, u.resource_url)

        s = SourceSpec(url)
        print(s, s.proto, s.scheme, s.resource_url, s.target_file,
              s.target_format)
        self.assertIn(s.scheme, ('file', 'http'))
        # FIX: assertEquals is a deprecated alias; use assertEqual.
        self.assertEqual('ipynb', s.proto)

    gen = RowGenerator(cache=get_cache(),
                       url='ipynb:scripts/Py3Notebook.ipynb#lst',
                       working_dir=test_data(),
                       generator_args={'mult': lambda x: x * 3})

    rows = gen.generator.execute()

    print(len(rows))
def get_lib_module_dict(doc):
    """Load the 'lib' directory as a python module, so it can be used to
    provide functions for rowpipe transforms.

    :param doc: a Metatab document; its ``ref`` locates the lib directory
    :return: dict of the module's public names, or {} when unavailable
    """

    from os.path import dirname, abspath, join, isdir
    from importlib import import_module
    import sys

    u = Url(doc.ref)

    if u.proto == 'file':

        doc_dir = dirname(abspath(u.parts.path))

        # Add the dir with the metatab file to the system path, so
        # 'lib' can be imported from there.
        sys.path.append(doc_dir)

        if not isdir(join(doc_dir, 'lib')):
            return {}

        try:
            m = import_module("lib")
            # FIX: tolerate a lib module that does not define __all__,
            # instead of raising AttributeError; export the
            # non-underscore names in that case.
            public = getattr(m, '__all__', None)
            if public is None:
                public = [k for k in m.__dict__ if not k.startswith('_')]
            return {k: v for k, v in m.__dict__.items() if k in public}
        except ImportError as e:
            # FIX: corrected typo in message ("form" -> "from").
            err("Failed to import python module from 'lib' directory: ",
                str(e))

    else:
        return {}
def __new__(cls, ref=None, cache=None, callback=None, env=None,
            save_url=None, acl=None):
    """Dispatch construction to the Package subclass implied by *ref*.

    Subclasses are constructed directly; only the base Package class
    inspects the reference URL to choose an implementation.
    """
    if cls != Package:
        return super(Package, cls).__new__(cls)

    b = Bunch(ref.dict) if isinstance(ref, Url) else Bunch(Url(ref).dict)

    if b.resource_format in ('xls', 'xlsx'):
        subclass = ExcelPackage
    elif b.resource_format == 'zip':
        subclass = ZipPackage
    elif b.proto == 'gs':
        subclass = GooglePackage
    elif b.proto == 's3':
        subclass = S3Package
    elif b.resource_format == 'csv' or b.target_format == 'csv':
        subclass = CsvPackage
    else:
        raise PackageError(
            "Can't determine package type for ref '{}' ".format(ref))

    return super(Package, cls).__new__(subclass)
def resolve_package_metadata_url(ref):
    """Re-write a url to a resource to include the likely reference to the
    internal Metatab metadata.

    :param ref: URL or path of a package or metadata file
    :return: tuple of (package_url, metadata_url)
    """

    du = Url(ref)

    if du.resource_format == 'zip':
        # ZIP package: metadata is DEFAULT_METATAB_FILE inside the archive.
        package_url = reparse_url(ref, fragment=False)
        metadata_url = reparse_url(ref, fragment=DEFAULT_METATAB_FILE)

    elif du.target_format == 'xlsx' or du.target_format == 'xls':
        # Excel package: metadata lives in the 'meta' sheet.
        package_url = reparse_url(ref, fragment=False)
        metadata_url = reparse_url(ref, fragment='meta')

    elif du.resource_file == DEFAULT_METATAB_FILE:
        # Ref points directly at a metadata file; the package is its
        # enclosing directory.
        metadata_url = reparse_url(ref)
        package_url = reparse_url(ref,
                                  path=dirname(parse_url_to_dict(ref)['path']),
                                  fragment=False) + '/'

    elif du.target_format == 'csv':
        # CSV package: the CSV file itself carries the metadata.
        package_url = reparse_url(ref, fragment=False)
        metadata_url = reparse_url(ref)

    elif du.proto == 'file':
        p = parse_url_to_dict(ref)

        if isfile(p['path']):
            # An existing file is the metadata; the package is its directory.
            metadata_url = reparse_url(ref)
            package_url = reparse_url(ref, path=dirname(p['path']),
                                      fragment=False)
        else:
            # A directory: a filesystem package expected to contain
            # DEFAULT_METATAB_FILE.
            p['path'] = join(p['path'], DEFAULT_METATAB_FILE)
            package_url = reparse_url(ref, fragment=False,
                                      path=p['path'].rstrip('/') + '/')
            metadata_url = unparse_url_dict(p)

        # Make all of the paths absolute. Saves a lot of headaches later.
        package_url = reparse_url(package_url, path=abspath(
            parse_url_to_dict(package_url)['path']))
        metadata_url = reparse_url(
            metadata_url, path=abspath(parse_url_to_dict(metadata_url)['path']))

    else:
        # Fallback: treat ref as a directory-like URL holding the
        # default metadata file.
        metadata_url = join(ref, DEFAULT_METATAB_FILE)
        package_url = reparse_url(ref, fragment=False)

        # raise PackageError("Can't determine package URLs for '{}'".format(ref))

    return package_url, metadata_url
def _resolved_url(self):
    """Return a URL that properly combines the base_url and a possibly
    relative resource url."""
    from rowgenerators.generators import PROTO_TO_SOURCE_MAP

    if self.base_url:
        u = Url(self.base_url)
        # S3 security is a pain.
        # FIX: removed a leftover debug print that fired for non-private
        # S3 packages; the branch had no other effect.
        # NOTE(review): if special handling for public S3 access was
        # intended here, it was never implemented — confirm.
    else:
        u = Url(self.doc.package_url)  # Url(self.doc.ref)

    if not self._self_url:
        return None

    nu = u.component_url(self._self_url)

    # For some URLs, we need to put the proto back on.
    su = Url(self._self_url)
    if su.proto in PROTO_TO_SOURCE_MAP():
        nu = reparse_url(nu, scheme_extension=su.proto)

    assert nu
    return nu
def set_mt_arg(self, metatabfile):
    """Resolve the metatab file argument and the CKAN connection
    settings; exits via err() when CKAN settings are missing."""
    self.mtfile_arg = metatabfile or join(self.cwd, DEFAULT_METATAB_FILE)

    self.mtfile_url = Url(self.mtfile_arg)
    self.resource = self.mtfile_url.parts.fragment

    self.package_url, self.mt_file = resolve_package_metadata_url(
        self.mtfile_url.rebuild_url(False, False))

    self.api_key = self.args.api or getenv('METAKAN_API_KEY')
    self.ckan_url = self.args.ckan or getenv('METAKAN_CKAN_URL')

    if not self.ckan_url:
        err("Set the --ckan option or the METAKAN_CKAN_URL env var to set the URL of a ckan instance")

    if not self.api_key:
        err("Set the --api option METAKAN_API_KEY env var with the API key to a CKAN instance")
def write_csv(self, path=None):
    """Write this document as CSV to *path*, defaulting to self.ref.

    Only file: URLs are supported; any other scheme raises MetatabError.
    """
    from rowgenerators import Url

    self.cleanse()

    target = self.ref if path is None else path

    u = Url(target)
    if u.scheme != 'file':
        raise MetatabError("Can't write file to URL '{}'".format(target))

    with open(u.parts.path, 'wb') as f:
        f.write(self.as_csv())
class MetapackCliMemo(object):
    """Shared state for metapack CLI commands: working directory, parsed
    arguments, cache, and resolved metatab URLs."""

    def __init__(self, args):
        self.cwd = getcwd()
        self.args = args
        self.cache = get_cache('metapack')

        mtf = args.metatabfile

        # A bare fragment names a resource within the default metatab
        # file in the current directory.
        if mtf and mtf.startswith('#'):
            args.metatabfile = join(self.cwd, DEFAULT_METATAB_FILE) + mtf
            mtf = args.metatabfile

        self.mtfile_arg = mtf if mtf else join(self.cwd, DEFAULT_METATAB_FILE)

        self.mtfile_url = Url(self.mtfile_arg)
        self.resource = self.mtfile_url.parts.fragment

        self.package_url, self.mt_file = resolve_package_metadata_url(
            self.mtfile_url.rebuild_url(False, False))
def get_resource_urls(doc):
    """Collect resource URLs for every distribution in *doc*.

    ZIP packages are skipped; XLSX and CSV package URLs are recorded, and
    for CSV packages every contained resource is added as well.

    :param doc: a Metatab document with Root.Distribution terms
    :return: dict mapping resource file name to URL
    """
    resources = {}

    for dist in doc.find("Root.Distribution"):

        package_url, metadata_url = resolve_package_metadata_url(dist.value)

        u = Url(package_url)

        if u.resource_format == 'zip':
            prt("Skipping ZIP package ", package_url)

        elif u.resource_format == 'xlsx':
            resources[basename(package_url)] = package_url
            prt("Adding XLS package ", package_url)

        elif u.resource_format == 'csv':
            resources[basename(package_url)] = package_url
            prt("Adding CSV package {}".format(basename(package_url)))

            try:
                p = open_package(package_url)
            except (IOError, MetatabError) as e:
                err("Failed to open package '{}' from reference '{}': {}".
                    format(package_url, dist.value, e))

            for r in p.resources():
                # FIX: removed a dead mimetype/extension computation
                # (guarded by a bare `except:`) whose result was never
                # used — the resource key is always <name>.csv because
                # Data.world currently gets the format from the name,
                # not the URL.
                resources[r.name + '.csv'] = r.resolved_url
                prt("Adding CSV resource {}".format(r.name))

        else:
            prt('Skipping {}'.format(package_url))

    return resources
def extract_path_name(ref):
    """Derive a (ref, path, name) triple from a reference URL.

    file: references are made absolute and renamed file://<abspath>;
    other references take their name from the target file/segment.
    """
    du = Url(ref)

    if du.proto == 'file':
        path = abspath(ref)
        name = basename(splitext(path)[0])
        ref = "file://" + path
    else:
        path = ref

        seg = du.target_segment
        if not seg:
            name = splitext(du.target_file)[0]
        else:
            try:
                # Numeric segments are appended to the target file name.
                int(seg)
                name = du.target_file + text_type(seg)
            except ValueError:
                name = seg

    return ref, path, name
def __init__(self, url=None, downloader=None, **kwargs):
    """Parse a pums: URL into its state/record_type/year/release parts,
    validate them, and initialize the base URL."""
    self._state = None
    self._record_type = None
    self._year = None
    self._release = None

    if url:
        u = Url(url).remove_netloc().relify_path()
        # Positional path components map onto the part names in order.
        for part_name, part_value in zip(self.part_names, u.path.split('/')):
            setattr(self, part_name, part_value)

    # Keyword arguments override path-derived parts.
    for key, value in kwargs.items():
        private = "_" + key
        if private in self.part_names:
            setattr(self, private, value)

    messages = self._test_parts()
    if messages:
        raise PumsUrlError('Parsing error: ' + '; '.join(messages))

    urls_s = f"pums:{self._state}/{self._record_type}/{self._year}/{self._release}"

    super().__init__(urls_s, downloader, **kwargs)
class MetapackCliMemo(object):
    """CLI state for the metakan command: resolved metatab URLs plus CKAN
    connection settings from arguments or the environment."""

    def __init__(self, args):
        self.cwd = getcwd()
        self.args = args
        self.cache = get_cache('metapack')
        self.set_mt_arg(args.metatabfile)

    def set_mt_arg(self, metatabfile):
        """Resolve metatab-file URLs and CKAN settings; exits via err()
        when the CKAN URL or API key is missing."""
        self.mtfile_arg = metatabfile or join(self.cwd, DEFAULT_METATAB_FILE)

        self.mtfile_url = Url(self.mtfile_arg)
        self.resource = self.mtfile_url.parts.fragment

        self.package_url, self.mt_file = resolve_package_metadata_url(
            self.mtfile_url.rebuild_url(False, False))

        self.api_key = self.args.api or getenv('METAKAN_API_KEY')
        self.ckan_url = self.args.ckan or getenv('METAKAN_CKAN_URL')

        if not self.ckan_url:
            err("Set the --ckan option or the METAKAN_CKAN_URL env var to set the URL of a ckan instance")

        if not self.api_key:
            err("Set the --api option METAKAN_API_KEY env var with the API key to a CKAN instance")

    def update_mt_arg(self, metatabfile):
        """Return a new memo with a new metatabfile argument"""
        memo = MetapackCliMemo(self.args)
        memo.set_mt_arg(metatabfile)
        return memo
def metatab_build_handler(m):
    """Handle the build-related metatab CLI arguments: --create, --add,
    --schemas, --datapackage and --update.

    :param m: a MetapackCliMemo with parsed arguments
    """

    if m.args.create is not False:

        template = m.args.create if m.args.create else 'metatab'

        if not exists(m.mt_file):
            doc = make_metatab_file(template)

            doc['Root']['Identifier'] = six.text_type(uuid4())
            doc['Root']['Created'] = datetime_now()

            write_doc(doc, m.mt_file)

            prt('Created', m.mt_file)
        else:
            err('File', m.mt_file, 'already exists')

    if m.args.add:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)
        add_resource(m.mt_file, m.args.add, cache=m.cache)

    # FIX: removed an unreachable `if False:  # m.args.resources` block
    # that re-scanned the Resources section; it was dead code behind a
    # permanently-disabled guard.

    if m.args.schemas:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)
        process_schemas(m.mt_file, cache=m.cache, clean=m.args.clean)

    if m.args.datapackage:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        from metatab.datapackage import convert_to_datapackage

        doc = MetatabDoc(m.mt_file)

        u = Url(m.mt_file)

        if u.proto == 'file':
            dpj_file = join(dirname(abspath(u.parts.path)), 'datapackage.json')
        else:
            dpj_file = join(getcwd(), 'datapackage.json')

        try:
            with open(dpj_file, 'w') as f:
                f.write(json.dumps(convert_to_datapackage(doc), indent=4))
        except ConversionError as e:
            err(e)

    if m.mtfile_url.scheme == 'file' and m.args.update:
        update_name(m.mt_file, fail_on_missing=True, force=m.args.force)
def send_to_ckan(m):
    """Create or update a CKAN dataset from the Metatab document named by
    *m*, attach resources from its distributions, then write CKAN ids
    back into the document."""
    from ckanapi import RemoteCKAN, NotFound
    try:
        doc = MetatabDoc(m.mt_file, cache=m.cache)
    except (IOError, MetatabError) as e:
        err("Failed to open metatab '{}': {}".format(m.mt_file, e))

    c = RemoteCKAN(m.ckan_url, apikey=m.api_key)

    ckanid = doc.find_first_value('Root.Ckanid')
    # NOTE(review): 'Identitfier' looks like a typo for 'Identifier'
    # (compare Root.Identifier used elsewhere); as written this is
    # probably always None. Confirm against the term schema.
    identifier = doc.find_first_value('Root.Identitfier')

    name = doc.find_first('Root.Name')

    # CKAN names cannot contain dots.
    ckan_name = name.value.replace('.', '-')

    id_name = ckanid or ckan_name

    # Update the dataset when it exists; otherwise create it.
    try:
        pkg = c.action.package_show(name_or_id=id_name)
        prt("Updating CKAN dataset for '{}'".format(ckan_name))
    except NotFound:
        pkg = c.action.package_create(name=ckan_name, package_id=identifier)
        prt("Adding CKAN dataset for '{}'".format(ckan_name))

    pkg['title'] = doc.find_first_value('Root.Title')

    if not pkg['title']:
        pkg['title'] = doc.find_first_value('Root.Description')

    try:
        pkg['notes'] = doc.markdown  #doc.find_first_value('Root.Description')
    except OSError as e:
        warn(e)

    pkg['version'] = name.properties.get('version')

    pkg['groups'] = [{'name': g.value} for g in doc['Root'].find('Root.Group')]

    pkg['tags'] = [{'name': g.value} for g in doc['Root'].find('Root.Tag')]

    # NOTE(review): get_org is an unfinished stub — the bare `return`
    # yields None and the except clause is unreachable. It is also never
    # called; the org lookup below goes through organization_show.
    def get_org(name):
        if not name:
            return None
        try:
            return
        except NotFound:
            return None

    org_name = name.get('Origin',
                        doc['Root'].find_first_value('Root.CkanOrg'))

    if org_name:
        org_name_slug = org_name.value.replace('.', '-')
        try:
            owner_org = c.action.organization_show(id=org_name_slug).get('id')
            pkg['owner_org'] = owner_org
        except NotFound:
            warn("Didn't find org for '{}'; not setting organization ".format(
                org_name_slug))
            org_name_slug = None
    else:
        org_name_slug = None

    # Everything in Root except distributions becomes a CKAN extra.
    extras = {}

    for t in doc.find('*.*', section='Root'):
        if not t.term_is('Root.Distribution'):
            extras[t.qualified_term] = t.value

    for t in name.children:
        extras[t.qualified_term] = t.value

    pkg['extras'] = [{'key': k, 'value': v} for k, v in extras.items()]

    # Build the CKAN resource list from the document's distributions.
    resources = []

    for dist in doc.find("Root.Distribution"):

        package_url, metadata_url = resolve_package_metadata_url(dist.value)

        u = Url(package_url)

        if u.resource_format == 'zip':
            d = dict(
                url=package_url,
                name=basename(package_url),
                format='ZIP',
                mimetype=mimetypes.guess_type(package_url)[0],
                description='ZIP version of package'
            )
            resources.append(d)
            prt("Adding ZIP package ", d['name'])

        elif u.resource_format == 'xlsx':
            d = dict(
                url=package_url,
                name=basename(package_url),
                format='XLSX',
                mimetype=mimetypes.guess_type(package_url)[0],
                description='Excel version of package'
            )
            resources.append(d)
            prt("Adding XLS package ", d['name'])

        elif u.resource_format == 'csv':

            d = dict(
                url=package_url,
                name=basename(package_url),
                format='csv',
                mimetype=mimetypes.guess_type(metadata_url)[0],
                description='CSV Package Metadata in Metatab format'
            )
            resources.append(d)
            prt("Adding {} package {}".format(d['format'], d['name']))

            try:
                p = open_package(package_url)
            except (IOError, MetatabError) as e:
                err("Failed to open package '{}' from reference '{}': {}".
                    format(package_url, dist.value, e))

            # Each resource in the CSV package becomes its own CKAN resource.
            for r in p.resources():

                mimetype = mimetypes.guess_type(r.resolved_url)[0]

                # NOTE(review): bare except — guess_extension(None) can
                # raise; a narrower except (TypeError, AttributeError)
                # would be safer, but behavior is preserved here.
                try:
                    ext = mimetypes.guess_extension(mimetype)[1:]
                except:
                    ext = None

                d = dict(
                    name=r.name,
                    format=ext,
                    url=r.resolved_url,
                    mimetype=mimetype,
                    description=r.markdown
                )

                resources.append(d)
                prt("Adding {} resource {}".format(d['format'], d['name']))

    pkg['resources'] = resources

    c.action.package_update(**pkg)

    # Re-fetch to pick up server-assigned fields (id, organization, groups).
    pkg = c.action.package_show(name_or_id=ckan_name)

    update_dist(doc, [], join(m.ckan_url, 'dataset', ckan_name))

    ##
    ## Add a term with CKAN info.

    doc['Root'].get_or_new_term('CkanId', pkg['id'])

    if org_name_slug is None and pkg.get('organization'):
        doc['Root'].get_or_new_term('CkanOrg',
                                    (pkg.get('organization') or {}).get('name'))

    # Sync the document's Group terms with what CKAN now reports.
    groups = doc['Root'].find('Group')
    for g in groups:
        doc.remove_term(g)

    for group in pkg.get('groups', []):
        doc['Root'].new_term('Group', group['name'])

    write_doc(doc, m.mt_file)
def metatab():
    """Entry point for the ``metatab`` command-line tool: parse a Metatab
    file and dump it in one of several representations, or create a new
    file from a template."""
    import argparse

    parser = argparse.ArgumentParser(
        prog='metatab',
        description='Matatab file parser, version {}'.format(_meta.__version__))

    parser.add_argument('-C', '--clean-cache', default=False, action='store_true',
                        help="Clean the download cache")

    # The main operations are mutually exclusive; exactly one is required.
    g = parser.add_mutually_exclusive_group(required=True)

    g.add_argument('-i', '--info', default=False, action='store_true',
                   help="Show configuration information")

    g.add_argument('-c', '--create', action='store', nargs='?', default=False,
                   help="Create a new metatab file, from named template. With no argument, uses the 'metatab' template ")

    g.add_argument('-t', '--terms', default=False, action='store_true',
                   help='Parse a file and print out the stream of terms, before interpretation')

    g.add_argument('-I', '--interp', default=False, action='store_true',
                   help='Parse a file and print out the stream of terms, after interpretation')

    g.add_argument('-j', '--json', default=False, action='store_true',
                   help='Parse a file and print out a JSON representation')

    g.add_argument('-y', '--yaml', default=False, action='store_true',
                   help='Parse a file and print out a YAML representation')

    g.add_argument('-R', '--resource', default=False, action='store_true',
                   help='If the URL has no fragment, dump the resources listed in the metatab file. With a fragment, dump a resource as a CSV')

    g.add_argument('-H', '--head', default=False, action='store_true',
                   help="Dump the first 20 lines of a resoruce ")

    g.add_argument('-S', '--schema',
                   help='Dump the schema for one named resource')

    parser.add_argument('-d', '--show-declaration', default=False, action='store_true',
                        help='Parse a declaration file and print out declaration dict. Use -j or -y for the format')

    parser.add_argument('-D', '--declare',
                        help='Parse and incorporate a declaration before parsing the file.' +
                             ' (Adds the declaration to the start of the file as the first term. )')

    parser.add_argument('file', nargs='?', default=DEFAULT_METATAB_FILE,
                        help='Path to a Metatab file')

    args = parser.parse_args(sys.argv[1:])

    # Specing a fragment screws up setting the default metadata file name
    if args.file.startswith('#'):
        args.file = DEFAULT_METATAB_FILE + args.file

    cache = get_cache('metapack')

    if args.info:
        prt('Version : {}'.format(_meta.__version__))
        prt('Cache dir: {}'.format(str(cache.getsyspath('/'))))
        exit(0)

    if args.clean_cache:
        clean_cache(cache)

    if args.create is not False:
        new_metatab_file(args.file, args.create)
        exit(0)

    if args.resource or args.head:

        # --head limits output to the first 20 rows; --resource dumps all.
        limit = 20 if args.head else None

        u = Url(args.file)
        resource = u.parts.fragment
        metadata_url = u.rebuild_url(False, False)

        package_url, metadata_url = resolve_package_metadata_url(metadata_url)

        try:
            doc = MetatabDoc(metadata_url, cache=cache)
        except OSError as e:
            err("Failed to open Metatab doc: {}".format(e))
            return  # Never reached

        if resource:
            dump_resource(doc, resource, limit)
        else:
            dump_resources(doc)

        exit(0)

    if args.show_declaration:
        doc = MetatabDoc()
        doc.load_declarations([args.file])
        print(json.dumps({
            'terms': doc.decl_terms,
            'sections': doc.decl_sections
        }, indent=4))
        exit(0)
    else:
        package_url, metadata_url = resolve_package_metadata_url(args.file)
        try:
            doc = MetatabDoc(metadata_url, cache=cache)
        except IOError as e:
            # NOTE(review): err() is evaluated before the raise; if err()
            # exits the process, the raise never takes effect — confirm
            # which behavior is intended.
            raise err("Failed to open '{}': {}".format(metadata_url, e))

    if args.terms:
        for t in doc._term_parser:
            print(t)

    elif args.json:
        print(json.dumps(doc.as_dict(), indent=4))

    elif args.yaml:
        import yaml
        print(yaml.safe_dump(doc.as_dict(), default_flow_style=False, indent=4))

    elif args.schema:
        dump_schema(doc, args.schema)

    exit(0)
def doc_dir(self):
    """Absolute path of the directory containing the document's
    reference file."""
    from os.path import abspath

    ref_url = Url(self.ref)
    return abspath(dirname(ref_url.parts.path))
def __init__(self, ref=None, decl=None, package_url=None, cache=None,
             clean_cache=False):
    """Parse a Metatab document from *ref*, or build an empty one.

    :param ref: URL or path of the document to load; when None, an empty
        document with only a Root section is created.
    :param decl: a declaration source, or a list of them, loaded before
        parsing.
    :param package_url: URL of the package this document belongs to.
    :param cache: fs cache for downloads; a default is created if omitted.
    :param clean_cache: accepted but not used in this initializer.
    """
    self._cache = cache if cache else get_cache()
    self.decl_terms = {}
    self.decl_sections = {}

    self.terms = []
    self.sections = OrderedDict()
    self.errors = []
    self.package_url = package_url

    #if Url(self.package_url).proto == 'file':
    #    path = abspath(parse_url_to_dict(self.package_url)['path'])
    #    self.package_url = reparse_url(self.package_url, path = path)

    # Normalize *decl* to a list before loading declarations.
    if decl is None:
        self.decls = []
    elif not isinstance(decl, MutableSequence):
        self.decls = [decl]
    else:
        self.decls = decl

    self.load_declarations(self.decls)

    if ref:
        self._ref = ref
        self.root = None
        self._term_parser = TermParser(self._ref, doc=self)
        try:
            self.load_terms(self._term_parser)
        except SourceError as e:
            raise MetatabError(
                "Failed to load terms for document '{}': {}".format(
                    self._ref, e))

        # Track the source file's mtime for local documents; remote or
        # missing files get 0.
        u = Url(self._ref)
        if u.scheme == 'file':
            try:
                self._mtime = getmtime(u.parts.path)
            except (FileNotFoundError, OSError):
                self._mtime = 0
        else:
            self._mtime = 0

    else:
        # No reference: start an empty document with a Root section.
        self._ref = None
        self._term_parser = None
        self.root = SectionTerm('Root', term='Root', doc=self, row=0, col=0,
                                file_name=None, parent=None)
        self.add_section(self.root)
        self._mtime = time()