Example #1
def open_package(ref, cache=None, clean_cache=False):
    from rowgenerators.util import clean_cache as rg_clean_cache

    # Resolve the reference into the package URL and the metadata file URL
    package_url, metadata_url = resolve_package_metadata_url(ref)

    # Fall back to the shared default cache when the caller does not supply one
    cache = cache if cache else get_cache()

    # Apply the otherwise-dangling clean_cache flag, following the pattern in Example #2
    if clean_cache:
        rg_clean_cache(cache)

    return MetatabDoc(metadata_url, package_url=package_url, cache=cache)
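
A minimal usage sketch, assuming a hypothetical package reference; a local metadata path works as well:

doc = open_package('http://example.com/example-package.zip')  # hypothetical URL
print(doc.find_first_value('Root.Name'))  # assumes the standard Root.Name term is present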
Example #2
def get_cache(clean=False):
    # Local import, so the library names don't collide with this wrapper,
    # which shares the name get_cache
    from rowgenerators.util import get_cache, clean_cache

    # Use the 'metapack' subdirectory of the rowgenerators cache
    cache = get_cache('metapack')

    # Optionally purge any previously cached files
    if clean:
        clean_cache(cache)

    return cache
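
A quick sketch of calling this wrapper; clean=True empties the cache before returning it:

cache = get_cache(clean=True)  # fresh 'metapack' cache
print(cache.getsyspath('/'))   # the cache is a filesystem object, as Example #6 shows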
Example #3
    def __init__(self, ref=None, cache=None, callback=None, env=None):

        self._cache = cache if cache else get_cache('metapack')
        self._ref = ref
        self._doc = None
        self._callback = callback
        self._env = env if env is not None else {}

        self.init_doc()
Example #4
def download_and_cache(spec, cache_fs, account_accessor=None, clean=False, logger=None,
                       working_dir='', callback=None):

    parts = {}

    working_dir = working_dir if working_dir else ''

    if spec.scheme == 'file':
        parts['cache_path'] = parse_url_to_dict(spec.resource_url)['path']
        parts['download_time'] = None

        # Candidate locations for the file: the absolute path, the path with the
        # leading '/' stripped, and the path joined to the working directory
        locations = {
            abspath(parts['cache_path']),
            abspath(parts['cache_path'].lstrip('/')),
            abspath(join(working_dir, parts['cache_path']))
        }

        for l in locations:
            if exists(l):
                parts['sys_path'] = l
                break
        else:  # No candidate path exists on disk
            raise DownloadError(("File resource does not exist. Found none of:"
                                 "\n{}\n\nWorking dir = {}\ncache_path={}\nspec_path={}")
                                .format('\n'.join(locations), working_dir, parts['cache_path'], spec.path))

    else:
        cache_fs = cache_fs or get_cache()

        try:
            parts['cache_path'], parts['download_time'] = \
                download(spec.resource_url, cache_fs, account_accessor,
                         clean=clean, logger=logger, callback=callback)
        except AccessError as e:
            # Retry with the authenticated URL; specs without one raise
            # AttributeError, so re-raise the original access error instead
            try:
                parts['cache_path'], parts['download_time'] = \
                    download(spec.auth_resource_url, cache_fs, account_accessor,
                             clean=clean, logger=logger, callback=callback)
            except AttributeError:
                raise e

        parts['sys_path'] = cache_fs.getsyspath(parts['cache_path'])

    return parts
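
A sketch of a call for a local file, reusing the SourceSpec constructor from Example #5; the path is hypothetical and the file must exist, or DownloadError is raised:

ss = SourceSpec(url='file:///tmp/data.csv')  # hypothetical local file
parts = download_and_cache(ss, cache_fs=None, working_dir='/tmp')
print(parts['sys_path'], parts['download_time'])  # download_time is None for file: URLs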
Example #5
def rowgen():
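    # Assumes module-level imports: sys, itertools.islice, tabulate, and the
    # rowgenerators helpers used below (SourceSpec, enumerate_contents,
    # get_cache, run_row_intuit, prt, warn, SourceError)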
    import argparse

    parser = argparse.ArgumentParser(
        prog='rowgen',
        description='Return CSV rows of data from a rowgenerator URL')

    parser.add_argument('-H', '--head', default=False, action='store_true',
                        help='Display only the first 20 lines, in tabular format')

    parser.add_argument('-e', '--encoding',
                        help='Force the encoding')

    parser.add_argument('-f', '--format',
                        help="Force the file format. Typical values are 'csv', 'xls', 'xlsx'")

    parser.add_argument('-u', '--urlfiletype',
                        help="Force the type of the file downloaded from the url. Equivalent to changing the file extension")

    parser.add_argument('-s', '--start',
                        help='Line number where data starts')

    parser.add_argument('-d', '--headers', default=None,
                        help="Comma separated list of header line numbers")

    parser.add_argument('-E', '--enumerate', default=None, action='store_true',
                        help="Download the URL and enumerate its contents as URLs")

    parser.add_argument('-i', '--intuit', default=None, action='store_true',
                        help="Intuit headers, start lines, etc")

    parser.add_argument('-I', '--info', default=None, action='store_true',
                        help="Print information about the url")

    parser.add_argument('url')

    cache = get_cache()

    args = parser.parse_args(sys.argv[1:])

    ss = SourceSpec(url=args.url, target_format=args.format, encoding=args.encoding, resource_format=args.urlfiletype)

    contents = list(enumerate_contents(ss, cache_fs=cache))

    if args.info:
        prt(tabulate(ss.dict.items()))
        sys.exit(0)

    if args.enumerate:
        for s in contents:
            print(s.rebuild_url())

    elif args.intuit:
        for s in contents:

            try:
                encoding, ri = run_row_intuit(s.rebuild_url(), cache=cache)

                prt("{} headers={} start={} encoding={}".format(
                        s.rebuild_url(),
                        ','.join(str(e) for e in ri.header_lines),
                        ri.start_line,
                        encoding))
            except SourceError as e:
                warn("{}: {}".format(s.rebuild_url(), e))

    elif len(contents) == 1:
        s = contents.pop(0)

        rg = s.get_generator(cache=cache)

        print(tabulate(islice(rg, 20)))

    elif len(contents) > 1 and not args.enumerate:
        warn("URL has multiple content files; enumerating instead")
        for s in contents:
            print(s.rebuild_url())
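
As a usage sketch (the URL is hypothetical), `rowgen --intuit http://example.com/data.csv` prints each content file's intuited header lines, start line, and encoding; for a URL with a single content file and no flags, the first 20 rows are printed in tabular form.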
Example #6
import json
import sys
from os.path import exists

from metatab import DEFAULT_METATAB_FILE, MetatabDoc, parse_app_url
from rowgenerators.util import get_cache, clean_cache
from os.path import dirname
from rowgenerators.util import fs_join as join

import logging

logger = logging.getLogger('user')
logger_err = logging.getLogger('cli-errors')
debug_logger = logging.getLogger('debug')

cache = get_cache()


def metatab():
    import argparse
    parser = argparse.ArgumentParser(prog='metatab',
                                     description='Metatab file parser',
                                     epilog='Cache dir: {}\n'.format(
                                         str(cache.getsyspath('/'))))

    g = parser.add_mutually_exclusive_group()

    g.add_argument(
        '-C',
        '--create',
        action='store',
Example #7
    def __init__(self,
                 ref=None,
                 decl=None,
                 package_url=None,
                 cache=None,
                 clean_cache=False):

        self._cache = cache if cache else get_cache()

        self.decl_terms = {}
        self.decl_sections = {}

        self.terms = []
        self.sections = OrderedDict()
        self.errors = []
        self.package_url = package_url

        #if Url(self.package_url).proto == 'file':
        #    path = abspath(parse_url_to_dict(self.package_url)['path'])
        #    self.package_url = reparse_url(self.package_url, path = path)

        # Normalize decl into a list: None -> [], a single declaration -> [decl]
        if decl is None:
            self.decls = []
        elif not isinstance(decl, MutableSequence):
            self.decls = [decl]
        else:
            self.decls = decl

        self.load_declarations(self.decls)

        if ref:
            self._ref = ref
            self.root = None
            self._term_parser = TermParser(self._ref, doc=self)
            try:
                self.load_terms(self._term_parser)
            except SourceError as e:
                raise MetatabError(
                    "Failed to load terms for document '{}': {}".format(
                        self._ref, e))

            u = Url(self._ref)
            if u.scheme == 'file':
                try:
                    self._mtime = getmtime(u.parts.path)
                except (FileNotFoundError, OSError):
                    self._mtime = 0
            else:
                self._mtime = 0

        else:
            self._ref = None
            self._term_parser = None
            self.root = SectionTerm('Root',
                                    term='Root',
                                    doc=self,
                                    row=0,
                                    col=0,
                                    file_name=None,
                                    parent=None)
            self.add_section(self.root)
            self._mtime = time()
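
A sketch of constructing a document directly, mirroring what open_package in Example #1 does; the metadata reference is hypothetical:

doc = MetatabDoc('file:///tmp/metadata.csv')  # hypothetical metadata reference
print(len(doc.terms))  # terms are populated by load_terms() during __init__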