def convert_documentation(nb_path):
    """Run only the document conversion portion of the notebook conversion.

    The final document will not be complete.
    """

    with open(nb_path) as f:
        nb = nbformat.reads(f.read(), as_version=4)

    doc = ExtractInlineMetatabDoc(package_url="metapack+file:" + dirname(nb_path)).run(nb)

    package_name = doc.as_version(None)

    output_dir = join(getcwd(), package_name)

    de = DocumentationExporter(config=Config(), log=logger, metadata=doc_metadata(doc))

    prt('Converting documentation')
    output, resources = de.from_filename(nb_path)

    fw = FilesWriter()
    fw.build_directory = join(output_dir, 'docs')
    fw.write(output, resources, notebook_name='notebook')

    prt("Wrote documentation to {}".format(fw.build_directory))

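# Usage sketch (not part of the original source): how convert_documentation()
# is meant to be called. 'example.ipynb' is a hypothetical Metatab-annotated
# notebook path; rendered docs land under <cwd>/<package_name>/docs/.

def _example_convert_documentation():
    convert_documentation('example.ipynb')
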
def convert_notebook(nb_path):
    prt('Convert notebook to Metatab source package')

    if not exists(nb_path):
        err("Notebook path does not exist: '{}'".format(nb_path))

    c = Config()

    pe = NotebookExecutor(config=c, log=logger)

    prt('Running the notebook')
    output, resources = pe.from_filename(nb_path)

    fw = FilesWriter()
    fw.build_directory = pe.output_dir
    fw.write(output, resources, notebook_name=DEFAULT_METATAB_FILE)

    de = DocumentationExporter(config=c, log=logger, metadata=doc_metadata(pe.doc))

    prt('Exporting documentation')
    output, resources = de.from_filename(nb_path)

    fw.build_directory = join(pe.output_dir, 'docs')
    fw.write(output, resources, notebook_name='notebook')

    new_mt_file = join(pe.output_dir, DEFAULT_METATAB_FILE)

    doc = MetapackDoc(new_mt_file)

    de.update_metatab(doc, resources)

    for lib_dir in pe.lib_dirs:
        lib_dir = normpath(lib_dir).lstrip('./')

        doc['Resources'].new_term("Root.PythonLib", lib_dir)

        path = abspath(lib_dir)
        dest = join(pe.output_dir, lib_dir)

        ensure_dir(dest)
        copytree(path, join(pe.output_dir, lib_dir))

    doc.write_csv()

    # Reset the input to use the new data
    prt('Running with new package file: {}'.format(new_mt_file))

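# A small, self-contained illustration (not in the original source) of the
# lib_dir normalization used in convert_notebook() above. Note that
# str.lstrip('./') strips any leading '.' and '/' *characters*, not the
# literal prefix './', which is why '../lib' also loses its leading dots.

def _demo_lib_dir_normalization():
    from os.path import normpath

    assert normpath('./lib').lstrip('./') == 'lib'
    assert normpath('lib/').lstrip('./') == 'lib'
    # Character-based stripping, a subtle foot-gun for relative parents:
    assert normpath('../lib').lstrip('./') == 'lib'
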
def __init__(self, args):
    self.cwd = getcwd()
    self.args = args
    self.cache = get_cache('metapack')

    if args.metatabfile and args.metatabfile.startswith('#'):
        # It's just a fragment, default metatab file
        args.metatabfile = join(self.cwd, DEFAULT_METATAB_FILE) + args.metatabfile

    self.mtfile_arg = args.metatabfile if args.metatabfile else join(self.cwd, DEFAULT_METATAB_FILE)

    self.mtfile_url = Url(self.mtfile_arg)
    self.resource = self.mtfile_url.parts.fragment

    self.package_url, self.mt_file = resolve_package_metadata_url(self.mtfile_url.rebuild_url(False, False))

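# Self-contained sketch (not in the original source) of the fragment handling
# above: an argument like '#resource_name' is treated as a fragment on the
# default metatab file in the current directory. 'metadata.csv' and the
# working directory here are assumptions standing in for DEFAULT_METATAB_FILE
# and getcwd(); POSIX paths are assumed.

def _demo_fragment_resolution():
    from os.path import join

    cwd = '/home/user/project'        # hypothetical working directory
    default_metatab_file = 'metadata.csv'

    metatabfile = '#births'           # a fragment-only argument
    if metatabfile and metatabfile.startswith('#'):
        metatabfile = join(cwd, default_metatab_file) + metatabfile

    assert metatabfile == '/home/user/project/metadata.csv#births'
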
def make_metatab_file(template='metatab'):
    import metatab.templates as tmpl

    template_path = join(dirname(tmpl.__file__), template + '.csv')

    doc = MetatabDoc(template_path)

    return doc

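# Usage sketch (not in the original source): create a fresh document from the
# packaged 'metatab' template and write it out. It assumes MetatabDoc.write_csv()
# accepts a target path; 'metadata.csv' is a hypothetical output name.

def _example_make_metatab_file():
    doc = make_metatab_file()          # or make_metatab_file('metatab')
    doc.write_csv('metadata.csv')      # hypothetical output path
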
def download_and_cache(spec, cache_fs, account_accessor=None, clean=False, logger=None,
                       working_dir='', callback=None):

    parts = {}

    working_dir = working_dir if working_dir else ''

    if spec.scheme == 'file':
        parts['cache_path'] = parse_url_to_dict(spec.resource_url)['path']
        parts['download_time'] = None

        locations = {  # What a mess ...
            abspath(parts['cache_path']),
            abspath(parts['cache_path'].lstrip('/')),
            abspath(join(working_dir, parts['cache_path']))
        }

        for l in locations:
            if exists(l):
                parts['sys_path'] = l
                break
        else:
            raise DownloadError(
                ("File resource does not exist. Found none of:"
                 "\n{}\n\nWorking dir = {}\ncache_path={}\nspec_path={}")
                .format('\n'.join(locations), working_dir, parts['cache_path'], spec.path))

    else:
        cache_fs = cache_fs or get_cache()

        try:
            parts['cache_path'], parts['download_time'] = \
                download(spec.resource_url, cache_fs, account_accessor,
                         clean=clean, logger=logger, callback=callback)
        except AccessError as e:
            # Retry with the URL that carries credentials, if the spec has one
            try:
                parts['cache_path'], parts['download_time'] = \
                    download(spec.auth_resource_url, cache_fs, account_accessor,
                             clean=clean, logger=logger, callback=callback)
            except AttributeError:
                # No auth_resource_url on this spec; re-raise the original error
                raise e

        parts['sys_path'] = cache_fs.getsyspath(parts['cache_path'])

    return parts

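# Usage sketch (not in the original source). On the file-scheme branch the
# 'spec' argument only needs .scheme, .resource_url, and .path, so a
# namedtuple stand-in is enough to illustrate it; real callers pass a full
# source-spec object, and the file path here is hypothetical.

def _example_download_and_cache():
    from collections import namedtuple

    Spec = namedtuple('Spec', 'scheme resource_url path')
    spec = Spec(scheme='file',
                resource_url='file:///tmp/data.csv',   # hypothetical local file
                path='/tmp/data.csv')

    parts = download_and_cache(spec, cache_fs=None, working_dir='/tmp')
    # parts['sys_path'] is the resolved local path; parts['download_time'] is None
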
def __init__(self, args):
    self.cwd = getcwd()
    self.args = args

    self.downloader = Downloader.get_instance()
    self.cache = self.downloader.cache

    self.mtfile_arg = self.args.metatabfile if self.args.metatabfile else join(self.cwd, DEFAULT_METATAB_FILE)

    self.mtfile_url = MetapackUrl(self.mtfile_arg, downloader=self.downloader)

    self.resource = self.mtfile_url.target_file

    self.package_url = self.mtfile_url.package_url
    self.mt_file = self.mtfile_url.metadata_url

    self.package_root = self.package_url.join(PACKAGE_PREFIX)

    if not self.args.s3:
        doc = MetapackDoc(self.mt_file)
        self.args.s3 = doc['Root'].find_first_value('Root.S3')

    self.s3_url = parse_app_url(self.args.s3)

    if self.s3_url and not self.s3_url.scheme == 's3':
        self.s3_url = parse_app_url("s3://{}".format(self.args.s3))

    self.doc = MetapackDoc(self.mt_file)

    access_value = self.doc.find_first_value('Root.Access')

    self.acl = 'private' if access_value == 'private' else 'public-read'

    self.bucket = S3Bucket(self.s3_url, acl=self.acl, profile=self.args.profile) if self.s3_url else None

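# Self-contained sketch (not in the original source) of the S3 URL
# normalization above: a bare bucket name or bucket/prefix is rewritten into
# an s3:// URL. urllib.parse.urlparse stands in for parse_app_url, which is an
# external helper; the bucket name is hypothetical.

def _demo_s3_url_normalization():
    from urllib.parse import urlparse

    arg = 'example-bucket/packages'    # hypothetical --s3 argument
    url = urlparse(arg)

    if url.scheme != 's3':
        url = urlparse('s3://{}'.format(arg))

    assert url.scheme == 's3' and url.netloc == 'example-bucket'
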
def metatab_build_handler(m):
    if m.args.create is not False:
        template = m.args.create if m.args.create else 'metatab'

        if not exists(m.mt_file):
            doc = make_metatab_file(template)

            doc['Root']['Identifier'] = six.text_type(uuid4())
            doc['Root']['Created'] = datetime_now()

            write_doc(doc, m.mt_file)

            prt('Created', m.mt_file)
        else:
            err('File', m.mt_file, 'already exists')

    if m.args.add:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)
        add_resource(m.mt_file, m.args.add, cache=m.cache)

    if False:  # m.args.resources:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        doc = MetatabDoc(m.mt_file)

        try:
            doc['Schema'].clean()
        except KeyError:
            pass

        for t in list(doc['Resources']):  # w/o list(), will iterate over new terms

            if not t.term_is('root.datafile'):
                continue

            if t.as_dict().get('url'):
                add_resource(doc, t.as_dict()['url'], m.cache)
            else:
                warn("Entry '{}' on row {} is missing a url; skipping".format(t.join, t.row))

        write_doc(doc, m.mt_file)

    if m.args.schemas:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)
        process_schemas(m.mt_file, cache=m.cache, clean=m.args.clean)

    if m.args.datapackage:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        from metatab.datapackage import convert_to_datapackage

        doc = MetatabDoc(m.mt_file)

        u = Url(m.mt_file)

        if u.proto == 'file':
            dpj_file = join(dirname(abspath(u.parts.path)), 'datapackage.json')
        else:
            dpj_file = join(getcwd(), 'datapackage.json')

        try:
            with open(dpj_file, 'w') as f:
                f.write(json.dumps(convert_to_datapackage(doc), indent=4))
        except ConversionError as e:
            err(e)

    if m.mtfile_url.scheme == 'file' and m.args.update:
        update_name(m.mt_file, fail_on_missing=True, force=m.args.force)

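# Usage sketch (not in the original source): metatab_build_handler() expects a
# context object 'm' carrying parsed CLI args plus the resolved paths from the
# __init__ above. SimpleNamespace stand-ins show which attributes the handler
# actually reads; all values are hypothetical.

def _example_metatab_build_handler():
    from types import SimpleNamespace

    args = SimpleNamespace(create='metatab', add=None, schemas=False,
                           datapackage=False, clean=False, update=False,
                           force=False)
    m = SimpleNamespace(args=args, cache=None,
                        mt_file='metadata.csv',                   # hypothetical
                        mtfile_url=SimpleNamespace(scheme='file'))

    metatab_build_handler(m)   # creates metadata.csv from the template
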
def download(url, cache_fs, account_accessor=None, clean=False, logger=None, callback=None):
    """
    Download a URL and store it in the cache.

    :param url:
    :param cache_fs:
    :param account_accessor: callable of one argument (url) returning dict with credentials.
    :param clean: Remove files from cache and re-download
    :param logger:
    :param callback:
    :return:
    """

    import os.path
    import time
    from fs.errors import DirectoryExpected, NoSysPath, ResourceInvalid, DirectoryExists

    assert isinstance(url, string_types)

    url = url.replace('\\', '/')

    # The fs module gets upset when given byte strings, so we need to decode
    # to unicode. UTF8 is a WAG.
    try:
        parsed = urlparse(url.decode('utf8'))
    except AttributeError:
        parsed = urlparse(url)

    # Create a name for the file in the cache, based on the URL. The '\'
    # replacement is because pyfs only wants to use UNIX path separators, but
    # python os.path.join will use the one specified for the operating system.
    cache_path = join(parsed.netloc, parsed.path.strip('/'))

    # If there is a query, hash it and add it to the path
    if parsed.query:
        hash = hashlib.sha224(parsed.query.encode('utf8')).hexdigest()
        cache_path = join(cache_path, hash)

    if not cache_fs.exists(cache_path):

        cache_dir = os.path.dirname(cache_path)

        try:
            cache_fs.makedirs(cache_dir, recreate=True)
        except DirectoryExpected as e:

            # Probably b/c the dir name is already a file
            dn = os.path.dirname(cache_path)
            bn = os.path.basename(cache_path)

            for i in range(10):
                try:
                    cache_path = join(dn + str(i), bn)
                    cache_fs.makedirs(os.path.dirname(cache_path))
                    break
                except DirectoryExpected:
                    continue
                except DirectoryExists:
                    pass  # ? No idea what's supposed to happen here.

                raise e

    try:
        from filelock import FileLock
        lock = FileLock(cache_fs.getsyspath(cache_path + '.lock'))
    except NoSysPath:
        # mem: caches, and others, don't have sys paths.
        # FIXME: should check for MP operation and raise if there would be
        # contention. Mem caches are only for testing with single processes.
        lock = _NoOpFileLock()

    with lock:
        if cache_fs.exists(cache_path):
            if clean:
                try:
                    cache_fs.remove(cache_path)
                except ResourceInvalid:
                    pass  # Well, we tried.
            else:
                return cache_path, None

        try:
            _download(url, cache_fs, cache_path, account_accessor, logger, callback)
            return cache_path, time.time()

        except HTTPError as e:
            if e.response.status_code == 403:
                raise AccessError("Access error on download: {}".format(e))
            else:
                raise DownloadError("Failed to download: {}".format(e))

        except (KeyboardInterrupt, Exception):
            # This is really important -- it's really bad to have partly
            # downloaded files being confused with fully downloaded ones.
            # FIXME: should also handle signals; deleting partly downloaded
            # files is important. Maybe use a sentinel file, or download to
            # another name and move the file when done.
            if cache_fs.exists(cache_path):
                cache_fs.remove(cache_path)
            raise

    assert False, 'Should never get here'

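# Usage sketch (not in the original source): download() takes a pyfilesystem2
# filesystem as the cache. OSFS over a scratch directory is an assumption
# here, and the URL is purely illustrative.

def _example_download():
    from fs.osfs import OSFS

    cache = OSFS('/tmp/metapack-cache', create=True)   # hypothetical cache dir
    cache_path, download_time = download('http://example.com/data/file.csv', cache)
    print(cache.getsyspath(cache_path), download_time)
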