def open_package(ref, cache=None, clean_cache=False):
    from rowgenerators.util import clean_cache as rg_clean_cache

    package_url, metadata_url = resolve_package_metadata_url(ref)

    cache = cache if cache else get_cache()

    # Honor the clean_cache flag, which the imported helper is here to serve.
    if clean_cache:
        rg_clean_cache(cache)

    return MetatabDoc(metadata_url, package_url=package_url, cache=cache)
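# A minimal usage sketch, assuming open_package() above is importable; the
# package URL is a placeholder, not a real dataset.
def _example_open_package():
    doc = open_package('http://example.com/example-package.zip')
    # MetatabDoc collects parsed terms; print each one.
    for term in doc.terms:
        print(term)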
def get_cache(clean=False):
    from rowgenerators.util import get_cache, clean_cache

    cache = get_cache('metapack')

    if clean:
        clean_cache(cache)

    return cache
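# Sketch: fetch the shared metapack cache and show where it lives on disk.
# getsyspath('/') is the same call the metatab CLI uses for its epilog.
def _example_cache_location():
    cache = get_cache()
    print(cache.getsyspath('/'))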
def __init__(self, ref=None, cache=None, callback=None, env=None):

    self._cache = cache if cache else get_cache('metapack')
    self._ref = ref
    self._doc = None
    self._callback = callback
    self._env = env if env is not None else {}

    self.init_doc()
def download_and_cache(spec, cache_fs, account_accessor=None, clean=False, logger=None,
                       working_dir='', callback=None):

    parts = {}

    working_dir = working_dir if working_dir else ''

    if spec.scheme == 'file':
        parts['cache_path'] = parse_url_to_dict(spec.resource_url)['path']
        parts['download_time'] = None

        # Candidate locations for a local file. What a mess ...
        locations = {
            abspath(parts['cache_path']),
            abspath(parts['cache_path'].lstrip('/')),
            abspath(join(working_dir, parts['cache_path']))
        }

        for l in locations:
            if exists(l):
                parts['sys_path'] = l
                break
        else:
            raise DownloadError(
                ("File resource does not exist. Found none of:"
                 "\n{}\n\nWorking dir = {}\ncache_path={}\nspec_path={}")
                .format('\n'.join(locations), working_dir, parts['cache_path'], spec.path))

    else:
        cache_fs = cache_fs or get_cache()

        try:
            parts['cache_path'], parts['download_time'] = \
                download(spec.resource_url, cache_fs, account_accessor,
                         clean=clean, logger=logger, callback=callback)
        except AccessError as e:
            # Retry with the authenticated URL, if the spec provides one.
            try:
                parts['cache_path'], parts['download_time'] = \
                    download(spec.auth_resource_url, cache_fs, account_accessor,
                             clean=clean, logger=logger, callback=callback)
            except AttributeError:
                raise e

        parts['sys_path'] = cache_fs.getsyspath(parts['cache_path'])

    return parts
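# Hedged sketch of calling download_and_cache() directly: SourceSpec is the
# same class the rowgen CLI constructs, and the URL is a placeholder.
def _example_download_and_cache():
    spec = SourceSpec(url='http://example.com/data.csv')
    parts = download_and_cache(spec, get_cache())
    # 'sys_path' is the local filesystem path of the cached copy.
    print(parts['sys_path'], parts['download_time'])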
def rowgen():
    import argparse

    parser = argparse.ArgumentParser(
        prog='rowgen',
        description='Return CSV rows of data from a rowgenerator URL')

    parser.add_argument('-H', '--head', default=False, action='store_true',
                        help='Display only the first 20 lines, in tabular format')

    parser.add_argument('-e', '--encoding',
                        help='Force the encoding')

    parser.add_argument('-f', '--format',
                        help="Force the file format. Typical values are 'csv', 'xls', 'xlsx'")

    parser.add_argument('-u', '--urlfiletype',
                        help="Force the type of the file downloaded from the url. "
                             "Equivalent to changing the file extension")

    parser.add_argument('-s', '--start',
                        help='Line number where data starts')

    parser.add_argument('-d', '--headers', default=None,
                        help='Comma separated list of header line numbers')

    parser.add_argument('-E', '--enumerate', default=None, action='store_true',
                        help="Download the URL and enumerate its contents as URLs")

    parser.add_argument('-i', '--intuit', default=None, action='store_true',
                        help="Intuit headers, start lines, etc.")

    parser.add_argument('-I', '--info', default=None, action='store_true',
                        help="Print information about the url")

    parser.add_argument('url')

    cache = get_cache()

    args = parser.parse_args(sys.argv[1:])

    ss = SourceSpec(url=args.url, target_format=args.format, encoding=args.encoding,
                    resource_format=args.urlfiletype)

    contents = list(enumerate_contents(ss, cache_fs=cache))

    if args.info:
        prt(tabulate(ss.dict.items()))
        sys.exit(0)

    if args.enumerate:
        for s in contents:
            print(s.rebuild_url())

    elif args.intuit:
        for s in contents:
            try:
                encoding, ri = run_row_intuit(s.rebuild_url(), cache=cache)
                prt("{} headers={} start={} encoding={}".format(
                    s.rebuild_url(),
                    ','.join(str(e) for e in ri.header_lines),
                    ri.start_line,
                    encoding))
            except SourceError as e:
                warn("{}: {}".format(s.rebuild_url(), e))

    elif len(contents) == 1:
        s = contents.pop(0)
        rg = s.get_generator(cache=cache)
        # Honor the -H/--head flag; otherwise print all rows.
        rows = islice(rg, 20) if args.head else rg
        print(tabulate(rows))

    elif len(contents) > 1:
        warn("URL has multiple content files; enumerating instead")
        for s in contents:
            print(s.rebuild_url())
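# Illustrative invocations of the rowgen entry point (URLs are placeholders):
#
#   rowgen -H http://example.com/data.csv        # first 20 rows, tabulated
#   rowgen -E http://example.com/archive.zip     # enumerate contained URLs
#   rowgen -i http://example.com/data.csv        # intuit headers and encoding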
import json
import sys
import logging

from genericpath import exists
from os.path import dirname

from metatab import DEFAULT_METATAB_FILE, MetatabDoc, parse_app_url
from rowgenerators.util import get_cache, clean_cache
from rowgenerators.util import fs_join as join

logger = logging.getLogger('user')
logger_err = logging.getLogger('cli-errors')
debug_logger = logging.getLogger('debug')

cache = get_cache()


def metatab():
    import argparse

    parser = argparse.ArgumentParser(
        prog='metatab',
        description='Metatab file parser',
        epilog='Cache dir: {}\n'.format(str(cache.getsyspath('/'))))

    g = parser.add_mutually_exclusive_group()

    g.add_argument('-C', '--create', action='store',
def __init__(self, ref=None, decl=None, package_url=None, cache=None, clean_cache=False):

    self._cache = cache if cache else get_cache()

    self.decl_terms = {}
    self.decl_sections = {}

    self.terms = []
    self.sections = OrderedDict()
    self.errors = []
    self.package_url = package_url

    # if Url(self.package_url).proto == 'file':
    #     path = abspath(parse_url_to_dict(self.package_url)['path'])
    #     self.package_url = reparse_url(self.package_url, path=path)

    if decl is None:
        self.decls = []
    elif not isinstance(decl, MutableSequence):
        self.decls = [decl]
    else:
        self.decls = decl

    self.load_declarations(self.decls)

    if ref:
        self._ref = ref
        self.root = None
        self._term_parser = TermParser(self._ref, doc=self)
        try:
            self.load_terms(self._term_parser)
        except SourceError as e:
            raise MetatabError(
                "Failed to load terms for document '{}': {}".format(self._ref, e))

        u = Url(self._ref)
        if u.scheme == 'file':
            try:
                self._mtime = getmtime(u.parts.path)
            except (FileNotFoundError, OSError):
                self._mtime = 0
        else:
            self._mtime = 0

    else:
        self._ref = None
        self._term_parser = None
        self.root = SectionTerm('Root', term='Root', doc=self, row=0, col=0,
                                file_name=None, parent=None)
        self.add_section(self.root)
        self._mtime = time()
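# Minimal sketch of constructing a MetatabDoc directly; 'metadata.csv' stands
# in for a local Metatab file reference.
def _example_metatab_doc():
    doc = MetatabDoc('metadata.csv')
    # Sections are kept in an OrderedDict, as initialized in __init__ above.
    for name in doc.sections:
        print(name)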