Example 1: add_resource
def add_resource(mt_file, ref, cache):
    """Add a resources entry, downloading the intuiting the file, replacing entries with
    the same reference"""
    from metatab.util import enumerate_contents

    if isinstance(mt_file, MetatabDoc):
        doc = mt_file
    else:
        doc = MetatabDoc(mt_file)

    if 'Resources' not in doc:
        doc.new_section('Resources')

    doc['Resources'].args = [
        e for e in set(doc['Resources'].args + ['Name', 'StartLine', 'HeaderLines', 'Encoding'])
        if e]

    seen_names = set()

    if isdir(ref):
        for f in find_files(ref, DATA_FORMATS):

            if f.endswith(DEFAULT_METATAB_FILE):
                continue

            if doc.find_first('Root.Datafile', value=f):
                prt("Datafile exists for '{}', ignoring".format(f))
            else:
                add_single_resource(doc, f, cache=cache, seen_names=seen_names)
    else:

        for c in enumerate_contents(ref, cache=cache, callback=prt):
            add_single_resource(doc, c.rebuild_url(), cache=cache, seen_names=seen_names)

    write_doc(doc, mt_file)
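
A minimal usage sketch (the file and directory names are hypothetical; get_cache is the same cache helper used in Example 11):

cache = get_cache('metapack')

# Directory: scans for data files, skipping the metatab file itself
add_resource('metadata.csv', 'data/', cache=cache)

# Single URL or archive: enumerates its contents and adds each one
add_resource('metadata.csv', 'http://example.com/data.zip', cache=cache)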
Example 2: add_single_resource
def add_single_resource(doc, ref, cache, seen_names):
    from metatab.util import slugify

    t = doc.find_first('Root.Datafile', value=ref)

    if t:
        prt("Datafile exists for '{}', deleting".format(ref))
        doc.remove_term(t)

    term_name = classify_url(ref)

    path, name = extract_path_name(ref)

    # If the name already exists, try to create a new one;
    # 19 numbered attempts ought to be enough.
    if name in seen_names:
        base_name = re.sub(r'-?\d+$', '', name)

        for i in range(1, 20):
            name = "{}-{}".format(base_name, i)
            if name not in seen_names:
                break

    seen_names.add(name)

    encoding = start_line = None
    header_lines = []

    try:
        encoding, ri = run_row_intuit(path, cache)
        prt("Added resource for '{}', name = '{}' ".format(ref, name))
        start_line = ri.start_line
        header_lines = ri.header_lines
    except RowIntuitError as e:
        warn("Failed to intuit '{}'; {}".format(path, e))

    except SourceError as e:
        warn("Source Error: '{}'; {}".format(path, e))

    except Exception as e:
        warn("Error: '{}'; {}".format(path, e))

    if not name:
        from hashlib import sha1
        name = sha1(slugify(path).encode('ascii')).hexdigest()[:12]

        # xlrd gets grouchy if the name doesn't start with a letter
        try:
            int(name[0])
            name = 'a' + name[1:]
        except ValueError:
            pass

    return doc['Resources'].new_term(term_name, ref, name=name,
                                     startline=start_line,
                                     headerlines=','.join(str(e) for e in header_lines),
                                     encoding=encoding)
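
The collision handling above is simple enough to test in isolation. A standalone sketch of the same renaming logic, runnable without metatab:

import re

def dedup_name(name, seen_names):
    # Mirrors the loop above: strip any trailing '-<n>' suffix, then probe
    # 'base-1' through 'base-19' until an unused name is found.
    if name in seen_names:
        base_name = re.sub(r'-?\d+$', '', name)
        for i in range(1, 20):
            candidate = "{}-{}".format(base_name, i)
            if candidate not in seen_names:
                name = candidate
                break
    seen_names.add(name)
    return name

seen = {'table', 'table-1'}
assert dedup_name('table', seen) == 'table-2'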
Example 3: init_metatab
def init_metatab(mt_file, alt_mt_file):
    mt_file = alt_mt_file if alt_mt_file else mt_file

    prt("Initializing '{}'".format(mt_file))

    if not exists(mt_file):
        doc = make_metatab_file()

        doc['Root']['Identifier'] = str(uuid4())

        doc.write_csv(mt_file)
    else:
        prt("Doing nothing; file '{}' already exists".format(mt_file))
Example 4: metatab_admin_handler
def metatab_admin_handler(m):
    if m.args.enumerate:

        from metatab.util import enumerate_contents

        specs = list(enumerate_contents(m.args.enumerate, m.cache, callback=prt))

        for s in specs:
            prt(classify_url(s.url), s.target_format, s.url, s.target_segment)

    if m.args.html:
        from metatab.html import html
        doc = MetatabDoc(m.mt_file)

        # print(doc.html)
        prt(html(doc))

    if m.args.markdown:
        from metatab.html import markdown

        doc = MetatabDoc(m.mt_file)
        prt(markdown(doc))

    if m.args.clean_cache:
        clean_cache('metapack')

    if m.args.name:
        doc = MetatabDoc(m.mt_file)
        prt(doc.find_first_value("Root.Name"))
        exit(0)
Example 5: update_distributions
def update_distributions(m):
    """Add a distribution term for each of the distributions the sync is creating. Also updates the 'Issued' time"""

    doc = MetatabDoc(m.mt_file)

    access_value = doc.find_first_value('Root.Access')

    acl = 'private' if access_value == 'private' else 'public'

    b = S3Bucket(m.args.s3, acl=acl)

    updated = False

    old_dists = list(doc.find('Root.Distribution'))

    if m.args.excel is not False:
        p = ExcelPackage(m.mt_file)

        if update_dist(doc, old_dists, b.access_url(p.save_path())):
            prt("Added Excel distribution to metadata")
            updated = True

    if m.args.zip is not False:
        p = ZipPackage(m.mt_file)
        if update_dist(doc, old_dists, b.access_url(p.save_path())):
            prt("Added ZIP distribution to metadata")
            updated = True

    if m.args.fs is not False:
        p = FileSystemPackage(m.mt_file)
        if update_dist(doc, old_dists,
                       b.access_url(p.save_path(), DEFAULT_METATAB_FILE)):
            prt("Added FS distribution to metadata")
            updated = True

    if m.args.csv is not False:
        p = CsvPackage(m.mt_file)
        url = b.access_url(basename(p.save_path()))
        if update_dist(doc, old_dists, url):
            prt("Added CSV distribution to metadata", url)
            updated = True

    doc['Root']['Issued'] = datetime_now()

    if not write_doc(doc, m.mt_file):
        # The mt_file is probably a URL, so we can't write back to it,
        # but we need the updated distributions, so write it elsewhere, then
        # reload it in the next stage.
        second_stage_file = join(PACKAGE_PREFIX, DEFAULT_METATAB_FILE)

        if not exists(dirname(second_stage_file)):
            makedirs(dirname(second_stage_file))

        assert write_doc(doc, second_stage_file)

    else:
        second_stage_file = m.mt_file

    return second_stage_file, updated
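
As Example 12 shows, the caller unpacks the returned pair with "second_stage_mtfile, distupdated = update_distributions(m)" and then builds packages from the second-stage file, which can differ from m.mt_file when the original is a URL that can't be written back to.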
Example 6: write (S3 bucket method)
    def write(self, body, path, acl=None):
        from botocore.exceptions import ClientError
        import mimetypes

        acl = acl if acl is not None else self._acl

        #if isinstance(body, six.string_types):
        #    with open(body,'rb') as f:
        #        body = f.read()

        key = join(self._prefix, path).strip('/')

        try:
            o = self._bucket.Object(key)
            if o.content_length == len(body):
                prt("File '{}' already in bucket; skipping".format(key))
                return self.access_url(path)
            else:
                prt("File '{}' already in bucket, but length is different; re-wirtting"
                    .format(key))

        except ClientError as e:
            if int(e.response['Error']['Code']) in (403, 405):
                err("S3 Access failed for '{}:{}': {}\nNOTE: With Docker, this error is often the result of container clock drift. Check your container clock. "
                    .format(self._bucket_name, key, e))
            elif int(e.response['Error']['Code']) != 404:
                err("S3 Access failed for '{}:{}': {}".format(
                    self._bucket_name, key, e))

        ct = mimetypes.guess_type(key)[0]

        try:
            self._bucket.put_object(
                Key=key,
                Body=body,
                ACL=acl,
                ContentType=ct if ct else 'binary/octet-stream')
        except Exception as e:
            self.err("Failed to write '{}': {}".format(key, e))

        return self.access_url(path)
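
The content-type fallback at the end relies only on the standard library; a quick standalone check of the same call:

import mimetypes

ct = mimetypes.guess_type('data/example.csv')[0]  # 'text/csv'
print(ct if ct else 'binary/octet-stream')        # same fallback as above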
Example 7: process_schemas
def process_schemas(mt_file, cache, clean=False):
    from rowgenerators import SourceError
    from requests.exceptions import ConnectionError

    doc = MetatabDoc(mt_file)

    try:
        if clean:
            doc['Schema'].clean()
        else:
            doc['Schema']  # touch the section; raises KeyError if it's missing

    except KeyError:
        doc.new_section('Schema', ['DataType', 'Altname', 'Description'])

    for r in doc.resources():

        schema_name = r.get_value('schema', r.get_value('name'))

        schema_term = doc.find_first(term='Table', value=schema_name, section='Schema')

        if schema_term:
            prt("Found table for '{}'; skipping".format(schema_name))
            continue

        path, name = extract_path_name(r.url)

        prt("Processing {}".format(r.url))

        si = SelectiveRowGenerator(islice(r.row_generator, 100),
                                   headers=[int(i) for i in r.get_value('headerlines', '0').split(',')],
                                   start=int(r.get_value('startline', 1)))

        try:
            ti = TypeIntuiter().run(si)
        except SourceError as e:
            warn("Failed to process '{}'; {}".format(path, e))
            continue
        except ConnectionError as e:
            warn("Failed to download '{}'; {}".format(path, e))
            continue

        table = doc['Schema'].new_term('Table', schema_name)

        prt("Adding table '{}' ".format(schema_name))

        for i, c in enumerate(ti.to_rows()):
            raw_alt_name = alt_col_name(c['header'], i)
            alt_name = raw_alt_name if raw_alt_name != c['header'] else ''

            table.new_child('Column', c['header'],
                            datatype=type_map.get(c['resolved_type'], c['resolved_type']),
                            altname=alt_name)

    write_doc(doc, mt_file)
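
A call sketch, assuming the same cache helper as Example 11 (the file name is hypothetical):

cache = get_cache('metapack')
process_schemas('metadata.csv', cache=cache, clean=True)  # rebuild the Schema section from scratch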
Example 8: show_credentials
def show_credentials(profile):
    import boto3

    session = boto3.Session(profile_name=profile)

    if profile:
        cred_line = " 'eval $(metasync -C -p {} )'".format(profile)
    else:
        cred_line = " 'eval $(metasync -C)'"

    prt("export AWS_ACCESS_KEY_ID={} ".format(
        session.get_credentials().access_key))
    prt("export AWS_SECRET_ACCESS_KEY={}".format(
        session.get_credentials().secret_key))
    prt("# Run {} to configure credentials in a shell".format(cred_line))
Example 9: run_docker
def run_docker(m):
    """Re-run the metasync command in docker. """

    import botocore.session
    from subprocess import Popen, PIPE, STDOUT

    session = botocore.session.get_session()

    args = [
        'docker', 'run', '--rm', '-t', '-i',
        '-eAWS_ACCESS_KEY_ID={}'.format(session.get_credentials().access_key),
        '-eAWS_SECRET_ACCESS_KEY={}'.format(
            session.get_credentials().secret_key), 'civicknowledge/metatab',
        'metasync'
    ]

    for a in ('-D', '--docker'):
        try:
            m.raw_args.remove(a)
        except ValueError:
            pass

    args.extend(m.raw_args[1:])

    if m.args.verbose:
        prt("Running Docker Command: ", ' '.join(args))
    else:
        prt("Running In Docker")

    process = Popen(args, stdout=PIPE, stderr=STDOUT)
    with process.stdout:
        for line in iter(process.stdout.readline, b''):
            prt(line.decode('ascii'), end='')

    exitcode = process.wait()  # 0 means success

    exit(exitcode)
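
The line-by-line output streaming used here is a generic subprocess pattern; a self-contained sketch with a harmless stand-in command:

from subprocess import Popen, PIPE, STDOUT

# Stream a child's combined stdout/stderr as it is produced, as run_docker does.
process = Popen(['echo', 'hello from a docker stand-in'], stdout=PIPE, stderr=STDOUT)
with process.stdout:
    for line in iter(process.stdout.readline, b''):
        print(line.decode('ascii'), end='')
print('exit code:', process.wait())  # 0 means success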
Example 10: metatab_build_handler
def metatab_build_handler(m):
    if m.args.create is not False:

        template = m.args.create if m.args.create else 'metatab'

        if not exists(m.mt_file):

            doc = make_metatab_file(template)

            doc['Root']['Identifier'] = six.text_type(uuid4())

            doc['Root']['Created'] = datetime_now()

            write_doc(doc, m.mt_file)

            prt('Created', m.mt_file)
        else:
            err('File', m.mt_file, 'already exists')

    if m.args.add:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        add_resource(m.mt_file, m.args.add, cache=m.cache)

    if False:  # m.args.resources:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        doc = MetatabDoc(m.mt_file)

        try:
            doc['Schema'].clean()
        except KeyError:
            pass

        for t in list(doc['Resources']):  # w/o list(), will iterate over new terms

            if not t.term_is('root.datafile'):
                continue

            if t.as_dict().get('url'):
                add_resource(doc, t.as_dict()['url'], m.cache)

            else:
                warn("Entry '{}' on row {} is missing a url; skipping".format(t.join, t.row))

        write_doc(doc, m.mt_file)

    if m.args.schemas:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        process_schemas(m.mt_file, cache=m.cache, clean=m.args.clean)

    if m.args.datapackage:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        from metatab.datapackage import convert_to_datapackage

        doc = MetatabDoc(m.mt_file)

        u = Url(m.mt_file)

        if u.proto == 'file':
            dpj_file = join(dirname(abspath(u.parts.path)), 'datapackage.json')
        else:
            dpj_file = join(getcwd(), 'datapackage.json')

        try:
            with open(dpj_file, 'w') as f:
                f.write(json.dumps(convert_to_datapackage(doc), indent=4))
        except ConversionError as e:
            err(e)

    if m.mtfile_url.scheme == 'file' and m.args.update:
        update_name(m.mt_file, fail_on_missing=True, force=m.args.force)
Example 11: metatab
def metatab():
    import argparse
    parser = argparse.ArgumentParser(
        prog='metatab',
        description='Metatab file parser, version {}'.format(_meta.__version__))

    parser.add_argument('-C', '--clean-cache', default=False, action='store_true',
                        help="Clean the download cache")

    g = parser.add_mutually_exclusive_group(required=True)

    g.add_argument('-i', '--info', default=False, action='store_true',
                   help="Show configuration information")

    g.add_argument('-c', '--create', action='store', nargs='?', default=False,
                   help="Create a new metatab file from a named template. With no argument, uses the 'metatab' template")

    g.add_argument('-t', '--terms', default=False, action='store_true',
                   help='Parse a file and print out the stream of terms, before interpretation')

    g.add_argument('-I', '--interp', default=False, action='store_true',
                   help='Parse a file and print out the stream of terms, after interpretation')

    g.add_argument('-j', '--json', default=False, action='store_true',
                   help='Parse a file and print out a JSON representation')

    g.add_argument('-y', '--yaml', default=False, action='store_true',
                   help='Parse a file and print out a YAML representation')

    g.add_argument('-R', '--resource', default=False, action='store_true',
                   help='If the URL has no fragment, dump the resources listed in the metatab file. With a fragment, dump a resource as a CSV')

    g.add_argument('-H', '--head', default=False, action='store_true',
                   help="Dump the first 20 lines of a resource")

    g.add_argument('-S', '--schema',
                   help='Dump the schema for one named resource')

    parser.add_argument('-d', '--show-declaration', default=False, action='store_true',
                        help='Parse a declaration file and print out the declaration dict. Use -j or -y for the format')

    parser.add_argument('-D', '--declare', help='Parse and incorporate a declaration before parsing the file.' +
                                                ' (Adds the declaration to the start of the file as the first term. )')

    parser.add_argument('file', nargs='?', default=DEFAULT_METATAB_FILE, help='Path to a Metatab file')

    args = parser.parse_args(sys.argv[1:])

    # Specifying a fragment screws up setting the default metadata file name
    if args.file.startswith('#'):
        args.file = DEFAULT_METATAB_FILE + args.file

    cache = get_cache('metapack')

    if args.info:
        prt('Version  : {}'.format(_meta.__version__))
        prt('Cache dir: {}'.format(str(cache.getsyspath('/'))))
        exit(0)

    if args.clean_cache:
        clean_cache(cache)

    if args.create is not False:
        new_metatab_file(args.file, args.create)
        exit(0)

    if args.resource or args.head:

        limit = 20 if args.head else None

        u = Url(args.file)
        resource = u.parts.fragment
        metadata_url = u.rebuild_url(False, False)

        package_url, metadata_url = resolve_package_metadata_url(metadata_url)

        try:
            doc = MetatabDoc(metadata_url, cache=cache)
        except OSError as e:
            err("Failed to open Metatab doc: {}".format(e))
            return # Never reached

        if resource:
            dump_resource(doc, resource, limit)
        else:
            dump_resources(doc)


        exit(0)

    if args.show_declaration:

        doc = MetatabDoc()
        doc.load_declarations([args.file])

        print(json.dumps({
            'terms': doc.decl_terms,
            'sections': doc.decl_sections
        }, indent=4))
        exit(0)
    else:

        package_url, metadata_url = resolve_package_metadata_url(args.file)
        try:
            doc = MetatabDoc(metadata_url, cache=cache)
        except IOError as e:
            err("Failed to open '{}': {}".format(metadata_url, e))

    if args.terms:
        for t in doc._term_parser:
            print(t)

    elif args.json:
        print(json.dumps(doc.as_dict(), indent=4))


    elif args.yaml:
        import yaml
        print(yaml.safe_dump(doc.as_dict(), default_flow_style=False, indent=4))


    elif args.schema:
        dump_schema(doc, args.schema)

    exit(0)
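
Given the flags above, typical invocations (file and fragment names hypothetical) are "metatab -j metadata.csv" for a JSON dump, "metatab -R metadata.csv" to list resources, and "metatab -R '#resource_name'" to dump one resource as CSV from the default metadata file.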
Example 12: metasync
def metasync():
    import argparse
    parser = argparse.ArgumentParser(
        prog='metasync',
        description='Create packages and store them in S3 buckets, version {}'.format(_meta.__version__))

    parser.add_argument('-i',
                        '--info',
                        default=False,
                        action='store_true',
                        help="Show configuration information")

    parser.add_argument('-v',
                        '--verbose',
                        default=False,
                        action='store_true',
                        help="For some command, be more verbose")

    parser.add_argument(
        '-F',
        '--force',
        action='store_true',
        default=False,
        help='Force building packages, even when they already exist')

    parser.add_argument('-p',
                        '--profile',
                        help="Name of a BOTO or AWS credentails profile",
                        required=False)

    parser.add_argument('-s',
                        '--s3',
                        help="URL to S3 where packages will be stored",
                        required=False)

    parser.add_argument('-S',
                        '--all-s3',
                        help="Synonym for `metasync -c -e -f -z -s <url>`",
                        required=False)

    parser.add_argument(
        '-e',
        '--excel',
        action='store_true',
        default=False,
        help='Create an Excel package from a Metatab file and copy it to S3.')

    parser.add_argument(
        '-z',
        '--zip',
        action='store_true',
        default=False,
        help='Create a ZIP package from a Metatab file and copy it to S3.')

    parser.add_argument(
        '-c',
        '--csv',
        action='store_true',
        default=False,
        help='Create a CSV package from a Metatab file and copy it to S3. Requires building a filesystem package')

    parser.add_argument(
        '-f',
        '--fs',
        action='store_true',
        default=False,
        help='Create a Filesystem package. Unlike -e and -z, only writes the package to S3.')

    parser.add_argument('-D',
                        '--docker',
                        help="Re-run the metasync command through docker",
                        action='store_true',
                        default=False)

    parser.add_argument(
        '-C',
        '--credentials',
        help="Show S3 Credentials and exit. "
        "Eval this string to setup credentials in other shells.",
        action='store_true',
        default=False)

    parser.add_argument('metatabfile',
                        nargs='?',
                        help='Path to a Metatab file')

    class MetapackCliMemo(object):
        def __init__(self, raw_args):
            self.cwd = getcwd()

            self.raw_args = raw_args

            self.args = parser.parse_args(self.raw_args[1:])

            self.cache = get_cache('metapack')

            # This one is for loading packages that have just been
            # written to S3.
            self.tmp_cache = get_cache('temp')
            clean_cache(self.tmp_cache)

            if self.args.all_s3:
                self.args.s3 = self.args.all_s3
                self.args.excel = True
                self.args.zip = True
                self.args.csv = True
                self.args.fs = True

            self.mtfile_arg = self.args.metatabfile if self.args.metatabfile else join(
                self.cwd, DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)
            self.resource = self.mtfile_url.parts.fragment

            self.package_url, self.mt_file = resolve_package_metadata_url(
                self.mtfile_url.rebuild_url(False, False))

            self.args.fs = self.args.csv or self.args.fs

    m = MetapackCliMemo(sys.argv)

    if m.args.credentials:
        show_credentials(m.args.profile)
        exit(0)

    if m.args.docker:
        run_docker(m)

    if m.args.info:
        metatab_info(m.cache)
        exit(0)

    if not m.args.s3:
        doc = MetatabDoc(m.mt_file)
        m.args.s3 = doc['Root'].find_first_value('Root.S3')

    if not m.args.s3:
        err("Must specify either -S or -s")

    if m.args.excel is not False or m.args.zip is not False or m.args.fs is not False:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

    doc = MetatabDoc(m.mt_file)
    doc['Root'].get_or_new_term('Root.S3', m.args.s3)
    write_doc(doc, m.mt_file)

    second_stage_mtfile, distupdated = update_distributions(m)

    if second_stage_mtfile != m.mt_file:
        prt("Building packages from: ", second_stage_mtfile)

    created = create_packages(m, second_stage_mtfile, distupdated=distupdated)

    prt("Synchronized these Package Urls")
    prt(tabulate(created))

    exit(0)
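
A typical full sync, using the -S synonym above (bucket URL and file name hypothetical): "metasync -S s3://my-bucket/prefix metadata.csv", which builds the Excel, ZIP, CSV, and filesystem packages and records their distribution URLs in the metadata.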