def add_resource(mt_file, ref, cache):
    """Add Resources entries for *ref*, downloading and inspecting the file,
    and replacing existing entries that have the same reference.

    *mt_file* may be an open MetapackDoc or a path/URL to one.
    """

    doc = mt_file if isinstance(mt_file, MetapackDoc) else MetapackDoc(mt_file)

    if 'Resources' not in doc:
        doc.new_section('Resources')

    # Make sure the standard header arguments are present, without duplicates.
    doc['Resources'].args = [
        e for e in set(doc['Resources'].args +
                       ['Name', 'StartLine', 'HeaderLines', 'Encoding']) if e
    ]

    seen_names = set()

    u = parse_app_url(ref)

    # File URLs enumerate their entries directly; web URLs need one more
    # level of listing.
    if u.proto == 'file':
        entries = u.list()
    else:
        entries = [ssu for su in u.list() for ssu in su.list()]

    for entry in entries:
        add_single_resource(doc, entry, cache=cache, seen_names=seen_names)

    write_doc(doc, mt_file)
def process_schemas(mt_file, resource=None, cache=None, clean=False, report_found=True, force=False, min_rows=5000, allow_codes=True):
    """Build or rebuild schema entries for the resources of a package.

    Skips resources that already have a complete schema unless *force* is
    set; when *clean* is set, the existing Schema section is cleared first.
    Only writes the document back to *mt_file* when it was opened from a
    path and at least one schema was processed.
    """
    from metapack import MetapackDoc, MetapackResourceUrl, MetapackDocumentUrl

    if isinstance(mt_file, MetapackDoc):
        doc = mt_file
        write_doc_to_file = False
    else:
        doc = MetapackDoc(mt_file)
        write_doc_to_file = True

    # Ensure a Schema section exists; optionally clean it. The bare
    # doc['Schema'] access exists only to raise KeyError when missing.
    try:
        if clean:
            doc['Schema'].clean()
        else:
            doc['Schema']
    except KeyError:
        doc.new_section('Schema', ['DataType', 'AltName', 'Description'])

    schemas_processed = 0

    for res in doc['Resources'].find('Root.Resource'):

        if resource and res.name != resource:
            continue

        schema_term = res.schema_term
        col_count = len(list(res.columns()))
        datatype_count = sum(1 for c in res.columns() if c['datatype'])

        # A complete schema already exists; leave it alone unless forced.
        if schema_term and col_count == datatype_count and force is False:
            if report_found:
                prt("Found table for '{}'; skipping".format(res.schema_name))
            continue

        if col_count != datatype_count:
            prt("Found table for '{}'; but {} columns don't have datatypes"
                .format(res.schema_name, col_count - datatype_count))

        schemas_processed += 1

        resolved = res.resolved_url

        # Drop any cached materialized data so the schema is rebuilt from
        # fresh rows.
        rmtree(get_materialized_data_cache(doc), ignore_errors=True)

        if isinstance(resolved, MetapackDocumentUrl):
            warn('{} is a MetapackDocumentUrl; skipping', res.name)
        elif isinstance(resolved, MetapackResourceUrl):
            _process_metapack_resource(doc, res, force)
        else:
            _process_normal_resource(doc, res, force, skip_start=min_rows,
                                     allow_codes=allow_codes)

    if write_doc_to_file and schemas_processed:
        write_doc(doc, mt_file)
def run_update(args):
    """Dispatch the 'update' CLI subcommand: run each update operation whose
    corresponding flag was given on the command line, in a fixed order."""
    m = MetapackCliMemo(args, downloader)

    m.doc  # Trigger an error if the doc can't be found

    if m.args.schemas:
        update_schemas(m)

    if m.args.schema_properties:
        update_schema_props(m)

    if m.args.clean_properties:
        clean_properties(m)

    if m.args.alt_name:
        move_alt_names(m)

    if m.args.categories:
        update_categories(m)

    if m.args.descriptions:
        update_descriptions(m)

    if m.args.giturl:
        from metapack.cli.core import add_giturl
        add_giturl(m.doc, force=True)
        write_doc(m.doc)

    if m.args.promote:
        update_promote(m)

    if m.args.custom_update:
        update_custom(m)

    if m.args.files:
        add_files(m)

    # Renaming only applies to local (file:) packages.
    if m.mtfile_url.scheme == 'file' and m.args.name:
        if m.args.version:
            mod_version = m.args.version
        elif m.args.increment:
            mod_version = '+'
        else:
            mod_version = False

        update_name(m.mt_file, fail_on_missing=True, force=m.args.force,
                    mod_version=mod_version)

    if m.args.coverage:
        update_coverage(m)

    if m.args.touch:
        touch_metadata(m)

    if m.args.semantic:
        to_semantic_version(m)
def change_cmd(m):
    """Set the value of the first term matching m.term in m.section.

    BUGFIX: the original dereferenced the result of find_first() without
    checking it, so a missing term crashed with a bare AttributeError.
    Now a clear error message is reported instead.
    """
    from metapack.cli.core import err

    doc = m.doc

    t = doc.find_first(m.term, section=m.section)

    if t is None:
        err("Term '{}' not found in section '{}'".format(m.term, m.section))

    t.value = m.value

    if m.args.echo:
        print(doc.as_csv())

    write_doc(doc)
def add_cmd(m):
    """Append a new term to a section, with extra properties supplied on the
    command line as KEY=VALUE arguments."""
    doc = m.doc

    # Parse KEY=VALUE pairs; split on the first '=' only so values may
    # themselves contain '='.
    props = {}
    for pair in m.args.arg:
        key, value = pair.split('=', 1)
        props[key] = value

    doc[m.section].new_term(m.term, m.value, **props)

    if m.args.echo:
        print(doc.as_csv())

    write_doc(doc)
def rewrite_schema(r, df, doc=None):
    """Rebuild the schema for resource *r* from dataframe *df* and re-write
    the document. When *doc* is not given, the source package is opened."""
    from metapack.cli.core import write_doc

    if doc is None:
        doc = open_source_package()

    rebuild_schema(doc, r, df)
    write_doc(doc, doc.ref)
def update_descriptions(m):
    """Copy each resource's computed description into its Description term,
    report what changed, and rewrite the document."""
    doc = m.doc

    for res in doc.resources():
        res['Description'] = res.description

        print(res.name, id(res))
        print("Updated '{}' to '{}'".format(res.name, res.description))

    for ref in doc.references():
        # NOTE(review): this lookup discards its result and looks like a
        # no-op — confirm it has no side effect before removing.
        ref.find_first('Description')
        print(ref.name, id(ref), ref.description)

    write_doc(doc)
def run_url_add(args):
    """Add Resources entries for a URL, downloading and inspecting the file,
    and replacing entries with the same reference."""
    m = MetapackCliMemo(args, downloader)

    update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

    doc = m.mt_file if isinstance(m.mt_file, MetapackDoc) else MetapackDoc(m.mt_file)

    if 'Resources' not in doc:
        doc.new_section('Resources')

    # Ensure the standard header arguments are present, without duplicates.
    doc['Resources'].args = [
        e for e in set(doc['Resources'].args +
                       ['Name', 'StartLine', 'HeaderLines', 'Encoding']) if e
    ]

    seen_names = set()

    u = parse_app_url(args.url)

    # File URLs enumerate their entries directly; web URLs need one more
    # level of listing.
    if u.proto == 'file':
        entries = u.list()
    else:
        entries = [ssu for su in u.list() for ssu in su.list()]

    failed = []

    for entry in entries:
        if not add_single_resource(doc, entry, cache=m.cache,
                                   seen_names=seen_names):
            failed.append(entry)

    if failed:
        prt()
        warn("Found, but failed to add these urls:")
        for entry in failed:
            print(' ', entry)

    write_doc(doc)
def run_url_scrape(args):
    """Scrape a web page for source-data and documentation links and add
    them as Resources / Documentation terms.

    CONSISTENCY FIX: the DownloadPage term was added via doc['resources']
    (lowercase) while every other access in this file uses doc['Resources'];
    normalized to the capitalized form. (Metatab section lookup may be
    case-insensitive, in which case this is purely cosmetic — verify.)
    """
    m = MetapackCliMemo(args, downloader)

    from metapack.util import scrape_urls_from_web_page

    doc = m.doc
    url = m.args.url

    # Record the page the links were scraped from.
    doc['Resources'].new_term('DownloadPage', url)

    d = scrape_urls_from_web_page(url)

    if d.get('error'):
        err(d.get('error'))

    new_resources = 0
    new_documentation = 0

    if not args.no_resources:
        for k, v in d['sources'].items():
            u = parse_app_url(v['url'])
            t = doc['Resources'].new_term('DataFile', v['url'],
                                          name=u.fspath.stem,
                                          description=v.get('description'))
            new_resources += 1
            if args.verbose:
                prt(t, t.props)

    if not args.no_docs:
        for k, v in d['external_documentation'].items():
            term_name = classify_url(v['url'])
            u = parse_app_url(v['url'])
            t = doc['Documentation'].new_term(term_name, v['url'],
                                              name=u.fspath.stem,
                                              description=v.get('description'))
            new_documentation += 1
            if args.verbose:
                prt(t, t.props)

    prt("Added {} resource and {} documentation terms".format(
        new_resources, new_documentation))

    if not args.dry_run:
        write_doc(doc)
def update_promote(m):
    """Promote child terms to header arguments in the selected sections.

    The --promote argument selects sections: empty or 'RR' means Resources
    and References, '*' means every section, anything else names a single
    section (case-insensitive).
    """
    requested = m.args.promote

    if not requested or requested == 'RR':
        wanted = ['resources', 'references']
    elif requested == '*':
        wanted = [name.lower() for name in m.doc.sections.keys()]
    else:
        wanted = [requested.lower()]

    for section_name, section in m.doc.sections.items():
        if section_name.lower() not in wanted:
            continue
        for arg in promote_terms(section):
            prt("Move {} to header in section {} ".format(arg, section_name))

    write_doc(m.doc)
def delete_cmd(m):
    """Delete every term matching m.term in m.section, then verify that the
    deletion actually removed them before rewriting the document."""
    doc = m.doc

    matches = doc.find(m.term, section=m.section)

    if not matches:
        warn("No terms found for: '{}' in section '{}' ".format(m.term, m.section))

    for term in matches:
        doc.remove_term(term)

    # Re-query to confirm nothing matching remains.
    if doc.find(m.term, section=m.section):
        warn("Delete failed")

    if m.args.echo:
        print(doc.as_csv())

    write_doc(doc)
def move_alt_names(m):
    """For every schema column that has an AltName, make the AltName the
    column name — preserving the old name as the description when none is
    set — then remove the AltName child and rewrite the document."""
    doc = m.doc

    for table in doc['Schema'].find('Root.Table'):
        moved = 0

        for col in table.find('Table.Column'):
            alt = col.get('AltName')
            if not alt:
                continue

            if not col.get('Description'):
                # Keep the original (often descriptive) name around,
                # flattened to a single line.
                col.description = (col.name or '').replace('\n', ' ')

            col.name = alt.value
            col.remove_child(alt)
            moved += 1

        prt("Moved {} names in '{}'".format(moved, table.name))

    write_doc(doc)
def run_init_cmd(m):
    """Create (or reuse) a GitHub repository and a local repository, wire up
    the 'origin' remote, push master upstream, and record the git URL in the
    package document."""
    from git.exc import GitCommandError

    g = Github(get_token())

    remote_r = get_or_new_github_repo(g, m)
    local_r = get_or_init_local_repo(m)

    try:
        origin = local_r.remote('origin')
    except (ValueError, GitCommandError) as e:
        # No 'origin' remote yet; create one pointing at the GitHub repo.
        print(e)
        origin = local_r.create_remote('origin', remote_r.clone_url)

    local_r.git.push('--set-upstream', 'origin', 'master')

    add_giturl(m.doc, force=True)
    write_doc(m.doc)

    prt(f'Initialized local and remote {origin.refs.master} at {remote_r.clone_url}')
def convert_metatab_notebook(m):
    """Set the document's Root.Readme term from the package README content.

    DEAD-CODE FIX: the original had a large notebook<->CSV conversion body
    after a bare ``return`` — it was unreachable, and would have crashed
    immediately anyway (``source = None`` followed by ``source.suffix``).
    It has been removed; only the live behavior remains.
    """
    m.doc['Documentation'].get_or_new_term('Root.Readme').value = get_readme(m)
def clean_properties(m):
    """Remove unused schema terms from the document, then rewrite it."""
    document = m.doc
    document.clean_unused_schema_terms()
    write_doc(document)
def new_cmd(args):
    """Create a new Metapack package from command-line arguments and an
    optional defaults document.

    Depending on flags, the package is written as a Jupyter notebook
    (--jupyter), a single CSV file (--csv), or a package directory with a
    pylib stub and README.

    BUGFIXES versus the original:
      * The defaults lookup checked getenv("METAPACK_CONFIG") but then read
        getenv("METAPACK_DEFAULTS") — always None unless both were set. It
        now consistently uses METAPACK_DEFAULTS (the variable actually
        read). TODO(review): confirm the intended variable name.
      * The home-directory fallback was ``elif expanduser('~/.metapack-default.csv'):``
        — a non-empty string, so always truthy (the else branch was dead)
        and spelled differently from the path it assigned
        ('~/.metapack-defaults.csv'). The fallback is now unconditional;
        existence is checked below as before.
      * The "already exists" error in the Jupyter branch reported nb_path
        (the downloaded temp file) instead of new_nb_path, the path being
        checked.
    """
    from metapack import MetapackDoc
    from metapack.util import make_metatab_file, datetime_now, ensure_dir
    from metapack.cli.core import write_doc, prt, err
    from os.path import exists, join, expanduser
    from metatab import DEFAULT_METATAB_FILE
    from os import getenv

    # Locate a defaults document: explicit --config, then the
    # METAPACK_DEFAULTS environment variable, then a dotfile in $HOME.
    if args.config:
        config_file = args.config
    elif getenv("METAPACK_DEFAULTS"):
        config_file = getenv("METAPACK_DEFAULTS")
    else:
        config_file = expanduser('~/.metapack-defaults.csv')

    if config_file and exists(config_file):
        prt(f"Using defaults file '{config_file}'")
        config = MetapackDoc(config_file)
    else:
        config = MetapackDoc()  # Empty defaults

    if args.jupyter:
        import tempfile

        # Download the template notebook into a temp file; it is copied to
        # its final, package-named path once the name is computed below.
        with tempfile.NamedTemporaryFile(suffix='.ipynb', delete=False) as fp:
            r = requests.get(TEMPLATE_NOTEBOOK, allow_redirects=True)
            r.raise_for_status()
            fp.write(r.content)
            nb_path = Path(fp.name)

        doc = MetapackDoc(str(nb_path))
    else:
        doc = make_metatab_file(args.template)

    doc['Root']['Created'] = datetime_now()

    origin = args.origin or config.get_value('Root.Origin')

    if not origin:
        err("Must specify a value for origin, either on command line or in defaults file")

    # Fill in the standard Root terms, preferring command-line values and
    # falling back to the defaults document.
    (doc['Root'].find_first('Root.Origin') or et).value = origin
    (doc['Root'].find_first('Root.Dataset') or et).value = args.dataset
    (doc['Root'].find_first('Root.Space') or et).value = args.space or config.get_value('Root.Space')
    (doc['Root'].find_first('Root.Time') or et).value = args.time or config.get_value('Root.Time')
    (doc['Root'].find_first('Root.Grain') or et).value = args.grain or config.get_value('Root.Grain')
    (doc['Root'].find_first('Root.Variant') or et).value = args.variant or config.get_value('Root.Variant')

    v = doc['Root'].get_or_new_term('Root.Version')
    v.get_or_new_child('Version.Major').value = args.revision or config.get_value('Root.Version')
    v.get_or_new_child('Version.Minor').value = 1
    v.get_or_new_child('Version.Patch').value = 1

    # Copy contacts in from the defaults document.
    if 'Contacts' in config:
        for c in config['Contacts']:
            doc['Contacts'].add_term(c)

    if args.title:
        doc['Root'].find_first('Root.Title').value = args.title.strip()

    nv_name = doc.as_version(None)

    if args.example:
        doc['Resources'].new_term(
            'Root.Datafile',
            'http://public.source.civicknowledge.com/example.com/sources/random-names.csv',
            name='random_names')
        doc['Documentation'].new_term('Root.Homepage', 'http://metatab.org',
                                      title='Metatab Home Page')

    doc.ensure_identifier()
    doc.update_name(create_term=True)

    if getattr(args, 'jupyter'):  # b/c maybe metatab_jupyter is not installed
        from metapack_jupyter.convert import write_metatab_notebook
        from metapack_jupyter.core import edit_notebook, set_cell_source, get_cell_source

        new_nb_path = Path(f'{nv_name}.ipynb')

        doc['Resources'].new_term(
            'Root.Datafile', './' + str(new_nb_path) + "#df",
            name='local_dataframe',
            description='Example of using a local Dataframe')

        if new_nb_path.exists():
            err(f"Directory {new_nb_path} already exists")

        copyfile(nb_path, new_nb_path)

        write_metatab_notebook(doc, new_nb_path)

        # Record the package name inside the notebook's 'init' cell.
        with edit_notebook(new_nb_path) as nb:
            init = get_cell_source(nb, 'init')
            init += f"\nthis_package_name = '{str(new_nb_path.name)}'"
            set_cell_source(nb, 'init', init)

        nb_path.unlink()  # Remove the downloaded template
    else:
        doc['Documentation'].new_term('Root.Documentation', 'file:README.md',
                                      title='README')

        if exists(nv_name):
            err(f"Directory {nv_name} already exists")

        if args.csv:
            fn = doc.nonver_name + '.csv'
            write_doc(doc, fn)
            prt(f"Writing to {fn}")
        else:
            ensure_dir(nv_name)

            # Seed the package's pylib module from the bundled template.
            pylib_dir = join(nv_name, 'pylib')
            ensure_dir(pylib_dir)
            with open(join(pylib_dir, '__init__.py'), 'w') as f_out, \
                    open(pylib.__file__) as f_in:
                f_out.write(f_in.read())

            if args.example:
                doc['Resources'].new_term('Root.Datafile',
                                          'python:pylib#row_generator',
                                          name='row_generator')

            prt(f"Writing to '{nv_name}'")
            write_doc(doc, join(nv_name, DEFAULT_METATAB_FILE))

            add_missing_files(nv_name)

            if args.title:
                readme = '# {}\n'.format(args.title)
            else:
                readme = '# {}\n'.format(doc.get_value('Root.Name'))

            with open(join(nv_name, 'README.md'), 'w') as f:
                f.write(readme)
def update_schema_props(m):
    """Update the document's schema properties (honoring --force), then
    rewrite the document."""
    document = m.doc
    update_schema_properties(document, force=m.args.force)
    write_doc(document)