def add_resource(mt_file, ref, cache):
    """Add a resources entry, downloading and intuiting the file, replacing entries with
    the same reference"""

    from metatab.util import enumerate_contents

    if isinstance(mt_file, MetatabDoc):
        doc = mt_file
    else:
        doc = MetatabDoc(mt_file)

    if 'Resources' not in doc:
        doc.new_section('Resources')

    doc['Resources'].args = [e for e in set(doc['Resources'].args +
                                            ['Name', 'StartLine', 'HeaderLines', 'Encoding']) if e]

    seen_names = set()

    if isdir(ref):
        for f in find_files(ref, DATA_FORMATS):

            if f.endswith(DEFAULT_METATAB_FILE):
                continue

            if doc.find_first('Root.Datafile', value=f):
                prt("Datafile exists for '{}', ignoring".format(f))
            else:
                add_single_resource(doc, f, cache=cache, seen_names=seen_names)
    else:
        for c in enumerate_contents(ref, cache=cache, callback=prt):
            add_single_resource(doc, c.rebuild_url(), cache=cache, seen_names=seen_names)

    write_doc(doc, mt_file)

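# A usage sketch (the paths and URL below are placeholders, not from this repo):
# add_resource() accepts either a Metatab file path or an open MetatabDoc, and either a
# local directory to scan or a URL whose contents are enumerated.
#
#   add_resource('metadata.csv', 'data/', cache=get_cache('metapack'))             # scan a directory for data files
#   add_resource(doc, 'http://example.com/data.zip', cache=get_cache('metapack'))  # enumerate an archive URL
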
def add_single_resource(doc, ref, cache, seen_names):
    from metatab.util import slugify

    t = doc.find_first('Root.Datafile', value=ref)

    if t:
        prt("Datafile exists for '{}', deleting".format(ref))
        doc.remove_term(t)

    term_name = classify_url(ref)

    path, name = extract_path_name(ref)

    # If the name already exists, try to create a new one.
    # 20 attempts ought to be enough.
    if name in seen_names:
        base_name = re.sub(r'-?\d+$', '', name)

        for i in range(1, 20):
            name = "{}-{}".format(base_name, i)
            if name not in seen_names:
                break

    seen_names.add(name)

    encoding = start_line = None
    header_lines = []

    try:
        encoding, ri = run_row_intuit(path, cache)
        prt("Added resource for '{}', name = '{}' ".format(ref, name))
        start_line = ri.start_line
        header_lines = ri.header_lines
    except RowIntuitError as e:
        warn("Failed to intuit '{}'; {}".format(path, e))
    except SourceError as e:
        warn("Source Error: '{}'; {}".format(path, e))
    except Exception as e:
        warn("Error: '{}'; {}".format(path, e))

    if not name:
        from hashlib import sha1
        name = sha1(slugify(path).encode('ascii')).hexdigest()[:12]

        # xlrd gets grouchy if the name doesn't start with a char
        try:
            int(name[0])
            name = 'a' + name[1:]
        except:
            pass

    return doc['Resources'].new_term(term_name, ref, name=name,
                                     startline=start_line,
                                     headerlines=','.join(str(e) for e in header_lines),
                                     encoding=encoding)

def init_metatab(mt_file, alt_mt_file):
    mt_file = alt_mt_file if alt_mt_file else mt_file

    prt("Initializing '{}'".format(mt_file))

    if not exists(mt_file):
        doc = make_metatab_file()

        doc['Root']['Identifier'] = str(uuid4())

        doc.write_csv(mt_file)
    else:
        prt("Doing nothing; file '{}' already exists".format(mt_file))

def metatab_admin_handler(m):
    if m.args.enumerate:
        from metatab.util import enumerate_contents

        specs = list(enumerate_contents(m.args.enumerate, m.cache, callback=prt))

        for s in specs:
            prt(classify_url(s.url), s.target_format, s.url, s.target_segment)

    if m.args.html:
        from metatab.html import html
        doc = MetatabDoc(m.mt_file)

        # print(doc.html)
        prt(html(doc))

    if m.args.markdown:
        from metatab.html import markdown
        doc = MetatabDoc(m.mt_file)

        prt(markdown(doc))

    if m.args.clean_cache:
        clean_cache('metapack')

    if m.args.name:
        doc = MetatabDoc(m.mt_file)
        prt(doc.find_first_value("Root.Name"))
        exit(0)

def update_distributions(m):
    """Add a distribution term for each of the distributions the sync is creating.
    Also updates the 'Issued' time"""

    doc = MetatabDoc(m.mt_file)

    access_value = doc.find_first_value('Root.Access')

    acl = 'private' if access_value == 'private' else 'public'

    b = S3Bucket(m.args.s3, acl=acl)

    updated = False

    old_dists = list(doc.find('Root.Distribution'))

    if m.args.excel is not False:
        p = ExcelPackage(m.mt_file)

        if update_dist(doc, old_dists, b.access_url(p.save_path())):
            prt("Added Excel distribution to metadata")
            updated = True

    if m.args.zip is not False:
        p = ZipPackage(m.mt_file)

        if update_dist(doc, old_dists, b.access_url(p.save_path())):
            prt("Added ZIP distribution to metadata")
            updated = True

    if m.args.fs is not False:
        p = FileSystemPackage(m.mt_file)

        if update_dist(doc, old_dists, b.access_url(p.save_path(), DEFAULT_METATAB_FILE)):
            prt("Added FS distribution to metadata")
            updated = True

    if m.args.csv is not False:
        p = CsvPackage(m.mt_file)

        url = b.access_url(basename(p.save_path()))

        if update_dist(doc, old_dists, url):
            prt("Added CSV distribution to metadata", url)
            updated = True

    doc['Root']['Issued'] = datetime_now()

    if not write_doc(doc, m.mt_file):
        # The mt_file is probably a URL, so we can't write back to it,
        # but we need the updated distributions, so write it elsewhere, then
        # reload it in the next stage.
        second_stage_file = join(PACKAGE_PREFIX, DEFAULT_METATAB_FILE)

        if not exists(dirname(second_stage_file)):
            makedirs(dirname(second_stage_file))

        assert write_doc(doc, second_stage_file)
    else:
        second_stage_file = m.mt_file

    return second_stage_file, updated

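# Illustrative sketch only: update_dist() is defined elsewhere in this package. Based on
# how update_distributions() calls it, it appears to add a Root.Distribution term for a
# URL unless an existing distribution term already points to it, returning True when a
# term was added. The helper name and term API below are assumptions for illustration.
def _example_update_dist(doc, old_dists, url):
    # Skip URLs that are already recorded as distributions
    if any(d.value == url for d in old_dists):
        return False

    doc['Root'].new_term('Root.Distribution', url)
    return True
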
def write(self, body, path, acl=None):
    from botocore.exceptions import ClientError
    import mimetypes

    acl = acl if acl is not None else self._acl

    # if isinstance(body, six.string_types):
    #     with open(body, 'rb') as f:
    #         body = f.read()

    key = join(self._prefix, path).strip('/')

    try:
        o = self._bucket.Object(key)
        if o.content_length == len(body):
            prt("File '{}' already in bucket; skipping".format(key))
            return self.access_url(path)
        else:
            prt("File '{}' already in bucket, but length is different; re-writing"
                .format(key))

    except ClientError as e:
        if int(e.response['Error']['Code']) in (403, 405):
            err("S3 Access failed for '{}:{}': {}\nNOTE: With Docker, this error is often "
                "the result of container clock drift. Check your container clock. "
                .format(self._bucket_name, key, e))
        elif int(e.response['Error']['Code']) != 404:
            err("S3 Access failed for '{}:{}': {}".format(self._bucket_name, key, e))

    ct = mimetypes.guess_type(key)[0]

    try:
        self._bucket.put_object(
            Key=key,
            Body=body,
            ACL=acl,
            ContentType=ct if ct else 'binary/octet-stream')
    except Exception as e:
        self.err("Failed to write '{}': {}".format(key, e))

    return self.access_url(path)

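# A usage sketch, mirroring how update_distributions() constructs the bucket (the bucket
# URL and file name are placeholders):
#
#   b = S3Bucket('s3://example-bucket/packages', acl='public')
#   with open('example.zip', 'rb') as f:
#       url = b.write(f.read(), 'example.zip')   # uploads unless an object of the same length already exists
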
def process_schemas(mt_file, cache, clean=False):
    from rowgenerators import SourceError
    from requests.exceptions import ConnectionError

    doc = MetatabDoc(mt_file)

    try:
        if clean:
            doc['Schema'].clean()
        else:
            doc['Schema']
    except KeyError:
        doc.new_section('Schema', ['DataType', 'Altname', 'Description'])

    for r in doc.resources():

        schema_name = r.get_value('schema', r.get_value('name'))

        schema_term = doc.find_first(term='Table', value=schema_name, section='Schema')

        if schema_term:
            prt("Found table for '{}'; skipping".format(schema_name))
            continue

        path, name = extract_path_name(r.url)

        prt("Processing {}".format(r.url))

        si = SelectiveRowGenerator(islice(r.row_generator, 100),
                                   headers=[int(i) for i in r.get_value('headerlines', '0').split(',')],
                                   start=int(r.get_value('startline', 1)))

        try:
            ti = TypeIntuiter().run(si)
        except SourceError as e:
            warn("Failed to process '{}'; {}".format(path, e))
            continue
        except ConnectionError as e:
            warn("Failed to download '{}'; {}".format(path, e))
            continue

        table = doc['Schema'].new_term('Table', schema_name)

        prt("Adding table '{}' ".format(schema_name))

        for i, c in enumerate(ti.to_rows()):
            raw_alt_name = alt_col_name(c['header'], i)
            alt_name = raw_alt_name if raw_alt_name != c['header'] else ''

            table.new_child('Column', c['header'],
                            datatype=type_map.get(c['resolved_type'], c['resolved_type']),
                            altname=alt_name)

    write_doc(doc, mt_file)

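# Illustrative sketch only: alt_col_name() is defined elsewhere in this package. From its
# use above, it returns a cleaned identifier for a raw column header; process_schemas()
# records it as Altname only when it differs from the header. The rule below is an
# assumption for illustration, not the package's actual implementation.
def _example_alt_col_name(name, i):
    import re

    if not name:
        # Fall back to a positional name for blank headers
        return 'col{}'.format(i)

    # Lower-case and collapse runs of non-word characters to single underscores
    return re.sub(r'_+', '_', re.sub(r'[^\w]', '_', str(name)).lower()).strip('_')
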
def show_credentials(profile):
    import boto3

    session = boto3.Session(profile_name=profile)

    if profile:
        cred_line = " 'eval $(metasync -C -p {} )'".format(profile)
    else:
        cred_line = " 'eval $(metasync -C)'"

    prt("export AWS_ACCESS_KEY_ID={} ".format(session.get_credentials().access_key))
    prt("export AWS_SECRET_ACCESS_KEY={}".format(session.get_credentials().secret_key))
    prt("# Run {} to configure credentials in a shell".format(cred_line))

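# Typical shell use, matching the cred_line hint printed above (the profile name is a
# placeholder):
#
#   eval $(metasync -C -p my-profile)
#   echo $AWS_ACCESS_KEY_ID
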
def run_docker(m):
    """Re-run the metasync command in docker. """

    import botocore.session
    from subprocess import Popen, PIPE, STDOUT

    session = botocore.session.get_session()

    args = ['docker', 'run', '--rm', '-t', '-i',
            '-eAWS_ACCESS_KEY_ID={}'.format(session.get_credentials().access_key),
            '-eAWS_SECRET_ACCESS_KEY={}'.format(session.get_credentials().secret_key),
            'civicknowledge/metatab', 'metasync']

    for a in ('-D', '--docker'):
        try:
            m.raw_args.remove(a)
        except ValueError:
            pass

    args.extend(m.raw_args[1:])

    if m.args.verbose:
        prt("Running Docker Command: ", ' '.join(args))
    else:
        prt("Running In Docker")

    process = Popen(args, stdout=PIPE, stderr=STDOUT)

    with process.stdout:
        for line in iter(process.stdout.readline, b''):
            prt(line.decode('ascii'), end='')

    exitcode = process.wait()  # 0 means success

    exit(exitcode)

def metatab_build_handler(m):
    if m.args.create is not False:

        template = m.args.create if m.args.create else 'metatab'

        if not exists(m.mt_file):

            doc = make_metatab_file(template)

            doc['Root']['Identifier'] = six.text_type(uuid4())
            doc['Root']['Created'] = datetime_now()

            write_doc(doc, m.mt_file)

            prt('Created', m.mt_file)
        else:
            err('File', m.mt_file, 'already exists')

    if m.args.add:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        add_resource(m.mt_file, m.args.add, cache=m.cache)

    if False:  # m.args.resources:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        doc = MetatabDoc(m.mt_file)

        try:
            doc['Schema'].clean()
        except KeyError:
            pass

        for t in list(doc['Resources']):  # w/o list(), will iterate over new terms

            if not t.term_is('root.datafile'):
                continue

            if t.as_dict().get('url'):
                add_resource(doc, t.as_dict()['url'], m.cache)
            else:
                warn("Entry '{}' on row {} is missing a url; skipping".format(t.join, t.row))

        write_doc(doc, m.mt_file)

    if m.args.schemas:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        process_schemas(m.mt_file, cache=m.cache, clean=m.args.clean)

    if m.args.datapackage:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        from metatab.datapackage import convert_to_datapackage

        doc = MetatabDoc(m.mt_file)

        u = Url(m.mt_file)

        if u.proto == 'file':
            dpj_file = join(dirname(abspath(u.parts.path)), 'datapackage.json')
        else:
            dpj_file = join(getcwd(), 'datapackage.json')

        try:
            with open(dpj_file, 'w') as f:
                f.write(json.dumps(convert_to_datapackage(doc), indent=4))
        except ConversionError as e:
            err(e)

    if m.mtfile_url.scheme == 'file' and m.args.update:
        update_name(m.mt_file, fail_on_missing=True, force=m.args.force)

def metatab():
    import argparse

    parser = argparse.ArgumentParser(
        prog='metatab',
        description='Metatab file parser, version {}'.format(_meta.__version__))

    parser.add_argument('-C', '--clean-cache', default=False, action='store_true',
                        help="Clean the download cache")

    g = parser.add_mutually_exclusive_group(required=True)

    g.add_argument('-i', '--info', default=False, action='store_true',
                   help="Show configuration information")

    g.add_argument('-c', '--create', action='store', nargs='?', default=False,
                   help="Create a new metatab file, from named template. With no argument, uses the 'metatab' template")

    g.add_argument('-t', '--terms', default=False, action='store_true',
                   help='Parse a file and print out the stream of terms, before interpretation')

    g.add_argument('-I', '--interp', default=False, action='store_true',
                   help='Parse a file and print out the stream of terms, after interpretation')

    g.add_argument('-j', '--json', default=False, action='store_true',
                   help='Parse a file and print out a JSON representation')

    g.add_argument('-y', '--yaml', default=False, action='store_true',
                   help='Parse a file and print out a YAML representation')

    g.add_argument('-R', '--resource', default=False, action='store_true',
                   help='If the URL has no fragment, dump the resources listed in the metatab file. '
                        'With a fragment, dump a resource as a CSV')

    g.add_argument('-H', '--head', default=False, action='store_true',
                   help="Dump the first 20 lines of a resource")

    g.add_argument('-S', '--schema',
                   help='Dump the schema for one named resource')

    parser.add_argument('-d', '--show-declaration', default=False, action='store_true',
                        help='Parse a declaration file and print out declaration dict. Use -j or -y for the format')

    parser.add_argument('-D', '--declare',
                        help='Parse and incorporate a declaration before parsing the file.'
                             ' (Adds the declaration to the start of the file as the first term.)')

    parser.add_argument('file', nargs='?', default=DEFAULT_METATAB_FILE,
                        help='Path to a Metatab file')

    args = parser.parse_args(sys.argv[1:])

    # Specifying a fragment screws up setting the default metadata file name
    if args.file.startswith('#'):
        args.file = DEFAULT_METATAB_FILE + args.file

    cache = get_cache('metapack')

    if args.info:
        prt('Version : {}'.format(_meta.__version__))
        prt('Cache dir: {}'.format(str(cache.getsyspath('/'))))
        exit(0)

    if args.clean_cache:
        clean_cache(cache)

    if args.create is not False:
        new_metatab_file(args.file, args.create)
        exit(0)

    if args.resource or args.head:

        limit = 20 if args.head else None

        u = Url(args.file)
        resource = u.parts.fragment
        metadata_url = u.rebuild_url(False, False)

        package_url, metadata_url = resolve_package_metadata_url(metadata_url)

        try:
            doc = MetatabDoc(metadata_url, cache=cache)
        except OSError as e:
            err("Failed to open Metatab doc: {}".format(e))
            return  # Never reached

        if resource:
            dump_resource(doc, resource, limit)
        else:
            dump_resources(doc)

        exit(0)

    if args.show_declaration:

        doc = MetatabDoc()
        doc.load_declarations([args.file])

        print(json.dumps({
            'terms': doc.decl_terms,
            'sections': doc.decl_sections
        }, indent=4))
        exit(0)

    else:
        package_url, metadata_url = resolve_package_metadata_url(args.file)

        try:
            doc = MetatabDoc(metadata_url, cache=cache)
        except IOError as e:
            raise err("Failed to open '{}': {}".format(metadata_url, e))

    if args.terms:
        for t in doc._term_parser:
            print(t)

    elif args.json:
        print(json.dumps(doc.as_dict(), indent=4))

    elif args.yaml:
        import yaml
        print(yaml.safe_dump(doc.as_dict(), default_flow_style=False, indent=4))

    elif args.schema:
        dump_schema(doc, args.schema)

    exit(0)

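# Example invocations, based on the flags defined above (file and resource names are
# placeholders; with no file argument the default Metatab file is used):
#
#   metatab -c                        # create a new Metatab file from the 'metatab' template
#   metatab -j metadata.csv           # print the document as JSON
#   metatab -R 'metadata.csv#sales'   # dump the resource named 'sales' as CSV
#   metatab -H 'metadata.csv#sales'   # dump the first 20 lines of that resource
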
def metasync():
    import argparse

    parser = argparse.ArgumentParser(
        prog='metasync',
        description='Create packages and store them in s3 buckets, version {}'.format(_meta.__version__),
    )

    parser.add_argument('-i', '--info', default=False, action='store_true',
                        help="Show configuration information")

    parser.add_argument('-v', '--verbose', default=False, action='store_true',
                        help="For some commands, be more verbose")

    parser.add_argument('-F', '--force', action='store_true', default=False,
                        help='Force building packages, even when they already exist')

    parser.add_argument('-p', '--profile', required=False,
                        help="Name of a BOTO or AWS credentials profile")

    parser.add_argument('-s', '--s3', required=False,
                        help="URL to S3 where packages will be stored")

    parser.add_argument('-S', '--all-s3', required=False,
                        help="Synonym for `metasync -c -e -f -z -s <url>`")

    parser.add_argument('-e', '--excel', action='store_true', default=False,
                        help='Create an excel package from a metatab file and copy it to S3.')

    parser.add_argument('-z', '--zip', action='store_true', default=False,
                        help='Create a zip package from a metatab file and copy it to S3.')

    parser.add_argument('-c', '--csv', action='store_true', default=False,
                        help='Create a csv package from a metatab file and copy it to S3. '
                             'Requires building a file system package')

    parser.add_argument('-f', '--fs', action='store_true', default=False,
                        help='Create a Filesystem package. Unlike -e and -f, only writes the package to S3.')

    parser.add_argument('-D', '--docker', action='store_true', default=False,
                        help="Re-run the metasync command through docker")

    parser.add_argument('-C', '--credentials', action='store_true', default=False,
                        help="Show S3 Credentials and exit. "
                             "Eval this string to setup credentials in other shells.")

    parser.add_argument('metatabfile', nargs='?',
                        help='Path to a Metatab file')

    class MetapackCliMemo(object):
        def __init__(self, raw_args):
            self.cwd = getcwd()

            self.raw_args = raw_args

            self.args = parser.parse_args(self.raw_args[1:])

            self.cache = get_cache('metapack')

            # This one is for loading packages that have just been
            # written to S3.
            self.tmp_cache = get_cache('temp')
            clean_cache(self.tmp_cache)

            if self.args.all_s3:
                self.args.s3 = self.args.all_s3
                self.args.excel = True
                self.args.zip = True
                self.args.csv = True
                self.args.fs = True

            self.mtfile_arg = self.args.metatabfile if self.args.metatabfile else join(self.cwd, DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)
            self.resource = self.mtfile_url.parts.fragment

            self.package_url, self.mt_file = resolve_package_metadata_url(
                self.mtfile_url.rebuild_url(False, False))

            self.args.fs = self.args.csv or self.args.fs

    m = MetapackCliMemo(sys.argv)

    if m.args.credentials:
        show_credentials(m.args.profile)
        exit(0)

    if m.args.docker:
        run_docker(m)

    if m.args.info:
        metatab_info(m.cache)
        exit(0)

    if not m.args.s3:
        doc = MetatabDoc(m.mt_file)
        m.args.s3 = doc['Root'].find_first_value('Root.S3')

    if not m.args.s3:
        err("Must specify either -S or -s")

    if m.args.excel is not False or m.args.zip is not False or m.args.fs is not False:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

    doc = MetatabDoc(m.mt_file)
    doc['Root'].get_or_new_term('Root.S3', m.args.s3)
    write_doc(doc, m.mt_file)

    second_stage_mtfile, distupdated = update_distributions(m)

    if second_stage_mtfile != m.mt_file:
        prt("Building packages from: ", second_stage_mtfile)

    created = create_packages(m, second_stage_mtfile, distupdated=distupdated)

    prt("Synchronized these Package Urls")
    prt(tabulate(created))

    exit(0)

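# Example invocations, based on the flags defined above (the bucket URL is a placeholder):
#
#   metasync -S s3://example-bucket/packages metadata.csv   # same as -c -e -f -z -s <url>
#   metasync -e -s s3://example-bucket/packages             # build and sync only the Excel package
#   metasync -C -p my-profile                                # print credentials for eval in a shell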