Example #1
def drop_sql(args, doc, r):
    """Generate a DROP TABLE IF EXISTS statement for a resource's table."""

    try:
        st = r.schema_term
    except AttributeError:
        return ''

    if not st:
        err(f"Resource '{r.name}' does not have a schema term")

    table_name = mk_table_name(r, doc)

    table = Table(table_name, MetaData(bind=None))

    dialect = dialect_map.get(args.dialect, mysql.dialect())

    lines = str(DropTable(table).compile(dialect=dialect)) \
        .replace('DROP TABLE', 'DROP TABLE IF EXISTS')

    out = []
    for line in lines.splitlines():
        if line.strip():
            out.append(line + ';')

    return '\n'.join(out)
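
# A self-contained sketch of the compile-with-a-dialect pattern used above.
# The table name is hypothetical; mk_table_name and dialect_map are helpers
# from the surrounding module and are not reproduced here.
def _drop_sql_sketch():
    from sqlalchemy import Column, MetaData, Table, Text
    from sqlalchemy.dialects import postgresql
    from sqlalchemy.schema import DropTable

    t = Table('example', MetaData(), Column('name', Text))
    # Compiling the DDL element against an explicit dialect renders
    # dialect-specific SQL.
    return str(DropTable(t).compile(dialect=postgresql.dialect())).strip()
    # -> 'DROP TABLE example'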
Example #2
def convert_hugo(nb_path, hugo_path):
    """Render a Jupyter notebook as Hugo Markdown with the HugoExporter."""
    from os import environ
    from os.path import abspath

    # Total hack. We'd like -H to be allowed with no argument, falling back to
    # the env var, but I don't know how to do that. This is the case where the
    # user types -H nb_path, so just go with it.
    if hugo_path and not nb_path:
        nb_path = hugo_path
        hugo_path = environ.get('METAPACK_HUGO_DIR')

    if not hugo_path:
        err("Must specify value for -H or the METAPACK_HUGO_DIR environment var")

    if not exists(nb_path):
        err("Notebook path does not exist: '{}' ".format(nb_path))

    c = Config()
    c.HugoExporter.hugo_dir = abspath(
        hugo_path)  # Exports assume rel path is rel to notebook
    he = HugoExporter(config=c, log=logger)

    output, resources = he.from_filename(nb_path)

    prt('Writing Notebook to Hugo Markdown')

    prt('    Writing ',
        resources['unique_key'] + resources['output_extension'])
    for k in resources['outputs']:
        prt('    Writing ', k)

    fw = FilesWriter()
    fw.write(output, resources, notebook_name=resources['unique_key'])
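
# A minimal usage sketch (paths hypothetical): pass both paths explicitly, or
# pass only the notebook and let METAPACK_HUGO_DIR supply the Hugo site dir,
# which is the -H fallback case handled above:
#
#     convert_hugo('notebooks/analysis.ipynb', '/sites/my-hugo-site')
#
#     import os
#     os.environ['METAPACK_HUGO_DIR'] = '/sites/my-hugo-site'
#     convert_hugo(None, 'notebooks/analysis.ipynb')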
Example #3
def write_eda_notebook(m):
    # Get the EDA notebook file from Github

    url = "https://raw.githubusercontent.com/Metatab/exploratory-data-analysis/master/eda.ipynb"

    resource = m.get_resource()

    if not resource:
        warn('Must specify a resource. Select one of:')
        list_rr(m.doc)
        sys.exit(0)

    r = requests.get(url, allow_redirects=True)
    r.raise_for_status()

    nb_path = Path('notebooks/{}-{}.ipynb'.format(
        splitext(basename(url))[0], resource.name))

    ensure_dir(nb_path.parent)

    if nb_path.exists():
        err("Notebook {} already exists".format(nb_path))

    with nb_path.open('wb') as f:
        f.write(r.content)

    prt('Wrote {}'.format(nb_path))

    with edit_notebook(nb_path) as nb:
        set_cell_source(nb, 'resource_name',
                        "resource_name='{}'".format(resource.name))
Example #4
def create_sql(args, doc, r):
    """Generate a CREATE TABLE statement, preceded by a comment header, for a resource."""

    try:
        st = r.schema_term
    except AttributeError:
        return ''

    if not st:
        err(f"Resource '{r.name}' does not have a schema term")

    table_name = mk_table_name(r, doc)

    table = Table(table_name, MetaData(bind=None))

    comment_rows = []

    for col in r.columns():
        sql_type = type_map.get(col['datatype'], Text)

        table.append_column(Column(col['header'], sql_type,
                                   comment=col.get('description')))

        comment_rows.append((col['header'], sql_type.__name__,
                             col.get('description')))

    dialect = dialect_map.get(args.dialect, mysql.dialect())

    comment = dedent(f"""
Table:       {table_name}
Description: {r.description}
Dataset:     {r.doc.name}
Columns:
{tabulate(comment_rows, tablefmt='simple')}""")

    return textwrap.indent(comment, '-- ') + '\n' + \
        str(CreateTable(table).compile(dialect=dialect)).strip() + ';'
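
# A self-contained sketch of building and compiling a table the same way.
# Names are hypothetical; type_map, tabulate and mk_table_name are helpers
# from the surrounding module.
def _create_sql_sketch():
    from sqlalchemy import Column, Integer, MetaData, Table, Text
    from sqlalchemy.dialects import mysql
    from sqlalchemy.schema import CreateTable

    t = Table('example', MetaData(),
              Column('id', Integer, comment='Row id'),
              Column('name', Text, comment='Full name'))
    # The mysql dialect renders the column comments as COMMENT clauses.
    return str(CreateTable(t).compile(dialect=mysql.dialect())).strip() + ';'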
Example #5
def get_config():
    """Return the metatab configuration, failing if it or the Github token is missing."""
    config = _get_config()

    if config is None:
        err("No metatab configuration found. Can't get Github credentials. "
            "Maybe create '~/.metapack.yaml'")

    if not config.get('github', {}).get('token'):
        err('No token set in config file at github.token')

    return config
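
# A minimal ~/.metapack.yaml that satisfies the lookups above. The shape is
# inferred from config.get('github', {}).get('token'); the token value is a
# placeholder:
#
#     github:
#       token: <your GitHub personal access token>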
Example #6
def extract_metatab(nb_path):
    """Extract the inline Metatab document from a notebook."""
    if not exists(nb_path):
        err("Notebook path does not exist: '{}' ".format(nb_path))

    with open(nb_path) as f:
        nb = nbformat.reads(f.read(), as_version=4)

    return ExtractInlineMetatabDoc(package_url="metapack+file:" +
                                   dirname(nb_path)).run(nb)
Example #7
def convert_notebook(nb_path):
    prt('Convert notebook to Metatab source package')

    if not exists(nb_path):
        err("Notebook path does not exist: '{}' ".format(nb_path))

    c = Config()

    pe = NotebookExecutor(config=c, log=logger)

    prt('Running the notebook')
    output, resources = pe.from_filename(nb_path)

    fw = FilesWriter()
    fw.build_directory = pe.output_dir

    fw.write(output, resources, notebook_name=DEFAULT_METATAB_FILE)

    de = DocumentationExporter(config=c,
                               log=logger,
                               metadata=doc_metadata(pe.doc))

    prt('Exporting documentation')
    output, resources = de.from_filename(nb_path)

    fw.build_directory = join(pe.output_dir, 'docs')
    fw.write(output, resources, notebook_name='notebook')

    new_mt_file = join(pe.output_dir, DEFAULT_METATAB_FILE)

    doc = MetapackDoc(new_mt_file)

    de.update_metatab(doc, resources)

    for lib_dir in pe.lib_dirs:
        # Note: lstrip('./') strips any leading '.' and '/' characters, not
        # the literal './' prefix.
        lib_dir = normpath(lib_dir).lstrip('./')

        doc['Resources'].new_term("Root.PythonLib", lib_dir)

        path = abspath(lib_dir)
        dest = join(pe.output_dir, lib_dir)

        ensure_dir(dest)
        copytree(path, dest)

    doc.write_csv()

    # Reset the input to use the new data

    prt('Running with new package file: {}'.format(new_mt_file))
Example #8
def run_colmap_new(args):
    """Build a new column-map CSV from the headers of all matching resources."""
    m = MetapackCliMemo(args, downloader)

    resources = get_resources(m)

    if not resources:
        err(f"No resources found with colmap name '{m.args.colmap_name}'")

    # Collect all of the headers, into a list of headers,
    # and the union of all of them in col_index
    col_index = []
    headers = []

    for r in resources:
        h = r.headers

        col_index += [
            alt_col_name(c) for c in h if alt_col_name(c) not in col_index
        ]
        headers.append(h)

    # Create lists, of the same length as the index, of the source
    # column names, at the same position as the alt_col_name is in the col_index
    data = [col_index]

    for header in headers:
        new_row = [None] * len(col_index)
        for c in header:
            new_row[col_index.index(alt_col_name(c))] = c

        data.append(new_row)

    t = [['index'] + [r.name for r in resources]] + list(
        zip(*data))  # zip transposes rows into columns.

    path = Path(f"colmap-{m.args.colmap_name}.csv")

    if m.args.print:
        from tabulate import tabulate
        prt(tabulate(t[1:], headers=t[0]))
    else:
        if path.exists() and not m.args.force:
            err(f"Col map file '{str(path)}' already exists. Use -f to overwrite")

        else:
            with path.open('w') as f:
                csv.writer(f).writerows(t)
            prt(f"Wrote {str(path)}")
Example #9
def run_url_scrape(args):
    m = MetapackCliMemo(args, downloader)

    from metapack.util import scrape_urls_from_web_page

    doc = m.doc
    url = m.args.url

    doc['resources'].new_term('DownloadPage', url)

    d = scrape_urls_from_web_page(url)

    if d.get('error'):
        err(d.get('error'))

    new_resources = 0
    new_documentation = 0

    if not args.no_resources:
        for v in d['sources'].values():
            u = parse_app_url(v['url'])
            t = doc['Resources'].new_term('DataFile',
                                          v['url'],
                                          name=u.fspath.stem,
                                          description=v.get('description'))
            new_resources += 1
            if args.verbose:
                prt(t, t.props)

    if not args.no_docs:
        for v in d['external_documentation'].values():
            term_name = classify_url(v['url'])
            u = parse_app_url(v['url'])
            t = doc['Documentation'].new_term(term_name,
                                              v['url'],
                                              name=u.fspath.stem,
                                              description=v.get('description'))
            new_documentation += 1
            if args.verbose:
                prt(t, t.props)

    prt("Added {} resource and {} documentation terms".format(
        new_resources, new_documentation))

    if not args.dry_run:
        write_doc(doc)
Example #10
    def __init__(self, args, downloader):
        super().__init__(args, downloader)

        self.term = self.args.term[0]
        self.value = self.args.value[0] if hasattr(self.args,
                                                   'value') else None

        parts = self.term.split('.')

        if len(parts) != 2 and len(parts) != 3:
            err('Term arg must have 2 or 3 words separated by periods')

        if len(parts) == 3:
            self.section, parts = parts[0], parts[1:]
            self.term = '.'.join(parts)
        else:
            self.section = 'Root'
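
# A sketch of how the parsing above treats 2- and 3-part terms (term strings
# hypothetical):
#
#     'Root.Title'              -> section 'Root' (default), term 'Root.Title'
#     'Contacts.Wrangler.Email' -> section 'Contacts', term 'Wrangler.Email'
#     'Title'                   -> rejected: needs 2 or 3 dot-separated parts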
Example #11
def get_col_map(r):
    # Short-circuit: the column map now comes directly from the resource.
    # Everything below is unreachable legacy code for reading a colmap CSV.
    return r.header_map

    cm_name = r.get_value('colmap')

    if not cm_name:
        err(f"Resource '{r.name}' does not have a ColMap property")

    path = Path(f"colmap-{cm_name}.csv")

    if not path.exists():
        err(f"Colmap file '{str(path)}' does nto exist")

    with path.open() as f:
        cm = {}
        for row in csv.DictReader(f):
            if row[r.name]:
                cm[row[r.name]] = row['index']

    return cm
Example #12
def write_notebook(m):
    # Get the package notebook template from Github

    url = "https://raw.githubusercontent.com/Metatab/notebook-templates/master/package-notebook.ipynb"

    r = requests.get(url, allow_redirects=True)
    r.raise_for_status()

    p = Path(m.args.new_notebook)

    nb_path = f'notebooks/{p.stem}.ipynb'

    ensure_dir(dirname(nb_path))

    if exists(nb_path):
        err("Notebook {} already exists".format(nb_path))

    with open(nb_path, 'wb') as f:
        f.write(r.content)

    prt('Wrote {}'.format(nb_path))
Example #13
def load_sql(args, doc, r):
    """Generate dialect-specific SQL or shell commands to load a resource's data."""

    try:
        st = r.schema_term
    except AttributeError:
        return ''

    if not st:
        err(f"Resource '{r.name}' does not have a schema term")

    table_name = mk_table_name(r, doc)

    if args.dialect == 'redshift':

        if args.access_key and args.secret:
            access_key, secret = args.access_key, args.secret
        elif args.s3profile:
            access_key, secret = get_credentials(args.s3profile)
        else:
            err('For redshift loading, must specify --access_key and --secret or --profile')

        if r.get_value('s3url'):
            cred = f"ACCESS_KEY_ID '{access_key}' SECRET_ACCESS_KEY '{secret}' ;"
            return f"""COPY {table_name} FROM '{r.s3url}' CSV {cred}"""

    elif args.dialect == 'postgresql':
        if args.load_prog:
            url = r.url

            return f"""COPY {table_name} FROM PROGRAM '{ args.load_prog} "{url}"' WITH CSV HEADER ENCODING 'utf8'; """

    elif args.dialect == 'sqlite':

        u = r.resolved_url.get_resource().get_target()
        return f".mode csv {table_name}\n.import  '{str(u.fspath)}' {table_name}"

    return ''
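
# For illustration, the commands the branches above generate (table, file,
# URL, and loader-program names hypothetical):
#
#   redshift:   COPY my_table FROM 's3://bucket/data.csv' CSV ACCESS_KEY_ID '...' SECRET_ACCESS_KEY '...' ;
#   postgresql: COPY my_table FROM PROGRAM 'curl "http://example.com/data.csv"' WITH CSV HEADER ENCODING 'utf8';
#   sqlite:     .mode csv my_table
#               .import 'data.csv' my_table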
Example #14
def convert_metatab_notebook(m):
    m.doc['Documentation'].get_or_new_term('Root.Readme').value = get_readme(m)

    # Short-circuit: only the README update above is active. Everything below
    # is unreachable, partially stubbed-out legacy code (source is None, so it
    # would fail if it ever ran).
    return

    source = None  # Path(source)

    if source.suffix == '.csv':
        dest = source.with_suffix('.ipynb')
        doc = MetapackDoc(source)
        doc.ensure_identifier()
        doc.update_name(create_term=True)
        # _write_metatab_notebook(doc, dest)

    elif source.suffix == '.ipynb':
        dest = source.with_suffix('.csv')

        doc = None  # extract_notebook_metatab(source)
        doc.ensure_identifier()
        doc.update_name(create_term=True)
        write_doc(doc, dest)

    else:
        err("Source file must be either .ipynb or .csv")
Example #15
def _build_cmd(args):
    """Build the package, printing casting errors and re-raising when requested."""
    from rowgenerators.rowpipe.exceptions import TooManyCastingErrors

    downloader.set_callback(build_downloader_callback)

    m = MetapackCliMemo(args, downloader)

    if m.args.profile:
        from metatab.s3 import set_s3_profile
        set_s3_profile(m.args.profile)

    if m.args.clean_cache:
        clean_cache('metapack')

    try:
        changes = metatab_derived_handler(m)
        prt(f"{changes} changes")

    except TooManyCastingErrors as e:
        prt('Casting Errors:')
        for error in e.errors:
            prt(error)
        if m.args.exceptions:
            raise e
        else:
            err(e)
    except Exception as e:
        if m.args.exceptions:
            raise e
        else:
            err(e)

    clean_cache(m.cache)

    return changes
Example #16
def update_custom(m):
    """Run the package's custom_update() function from its pylib module."""
    try:
        r = m.doc.get_lib_module_dict()
        r['custom_update'](m.doc, m.args.remainder)
    except ImportError:
        err('No custom function')
Example #17
def new_cmd(args):
    from metapack import MetapackDoc
    from metapack.util import make_metatab_file, datetime_now, ensure_dir
    from metapack.cli.core import write_doc, prt, err
    from os.path import exists, join, expanduser
    from metatab import DEFAULT_METATAB_FILE
    from os import getenv

    if args.config:
        config_file = args.config
    elif getenv("METAPACK_DEFAULTS"):
        config_file = getenv("METAPACK_DEFAULTS")
    elif exists(expanduser('~/.metapack-defaults.csv')):
        config_file = expanduser('~/.metapack-defaults.csv')
    else:
        config_file = None

    if config_file and exists(config_file):
        prt(f"Using defaults file '{config_file}'")
        config = MetapackDoc(config_file)
    else:
        config = MetapackDoc()

    if args.jupyter:
        import tempfile

        with tempfile.NamedTemporaryFile(suffix='.ipynb', delete=False) as fp:

            r = requests.get(TEMPLATE_NOTEBOOK, allow_redirects=True)
            r.raise_for_status()

            fp.write(r.content)
            nb_path = Path(fp.name)

        doc = MetapackDoc(str(nb_path))

    else:

        doc = make_metatab_file(args.template)

    doc['Root']['Created'] = datetime_now()

    origin = args.origin or config.get_value('Root.Origin')

    if not origin:
        err("Must specify a value for origin, either on command line or in defaults file")

    (doc['Root'].find_first('Root.Origin') or et).value = origin
    (doc['Root'].find_first('Root.Dataset') or et).value = args.dataset
    (doc['Root'].find_first('Root.Space')
     or et).value = args.space or config.get_value('Root.Space')
    (doc['Root'].find_first('Root.Time')
     or et).value = args.time or config.get_value('Root.Time')
    (doc['Root'].find_first('Root.Grain')
     or et).value = args.grain or config.get_value('Root.Grain')
    (doc['Root'].find_first('Root.Variant')
     or et).value = args.variant or config.get_value('Root.Variant')

    v = doc['Root'].get_or_new_term('Root.Version')
    v.get_or_new_child('Version.Major').value = \
        args.revision or config.get_value('Root.Version')
    v.get_or_new_child('Version.Minor').value = 1
    v.get_or_new_child('Version.Patch').value = 1

    # Copy contacts in
    if 'Contacts' in config:
        for c in config['Contacts']:
            doc['Contacts'].add_term(c)

    if args.title:
        doc['Root'].find_first('Root.Title').value = args.title.strip()

    nv_name = doc.as_version(None)

    if args.example:
        doc['Resources'].new_term(
            'Root.Datafile',
            'http://public.source.civicknowledge.com/example.com/sources/random-names.csv',
            name='random_names')

        doc['Documentation'].new_term('Root.Homepage',
                                      'http://metatab.org',
                                      title='Metatab Home Page')

    doc.ensure_identifier()
    doc.update_name(create_term=True)

    if getattr(args, 'jupyter', False):  # b/c maybe metatab_jupyter is not installed

        from metapack_jupyter.convert import write_metatab_notebook
        from metapack_jupyter.core import edit_notebook, set_cell_source, get_cell_source

        new_nb_path = Path(f'{nv_name}.ipynb')

        doc['Resources'].new_term(
            'Root.Datafile',
            './' + str(new_nb_path) + "#df",
            name='local_dataframe',
            description='Example of using a local Dataframe')

        if new_nb_path.exists():
            err(f"Notebook {new_nb_path} already exists")

        copyfile(nb_path, new_nb_path)

        write_metatab_notebook(doc, new_nb_path)

        with edit_notebook(new_nb_path) as nb:
            init = get_cell_source(nb, 'init')
            init += f"\nthis_package_name = '{str(new_nb_path.name)}'"
            set_cell_source(nb, 'init', init)

        nb_path.unlink()
    else:

        doc['Documentation'].new_term('Root.Documentation',
                                      'file:README.md',
                                      title='README')

        if exists(nv_name):
            err(f"Directory {nv_name} already exists")

        if args.csv:
            fn = doc.nonver_name + '.csv'
            write_doc(doc, fn)
            prt(f"Writing to {fn}")

        else:
            ensure_dir(nv_name)

            pylib_dir = join(nv_name, 'pylib')
            ensure_dir(pylib_dir)
            with open(join(pylib_dir, '__init__.py'),
                      'w') as f_out, open(pylib.__file__) as f_in:
                f_out.write(f_in.read())

            if args.example:
                doc['Resources'].new_term('Root.Datafile',
                                          'python:pylib#row_generator',
                                          name='row_generator')

            prt(f"Writing to '{nv_name}'")

            write_doc(doc, join(nv_name, DEFAULT_METATAB_FILE))

            add_missing_files(nv_name)

            if args.title:
                readme = '# {}\n'.format(args.title)
            else:
                readme = '# {}\n'.format(doc.get_value('Root.Name'))

            with open(join(nv_name, 'README.md'), 'w') as f:
                f.write(readme)
Example #18
def upload_packages(m):
    """Upload generated packages to S3, returning distribution URLs and the FS package builder."""
    dist_urls = []
    fs_p = None

    files_processed = []

    # For each package in _packages with the same name as this document...
    for ptype, purl, cache_path in generate_packages(m):

        au = m.bucket.access_url(cache_path)

        # Just copy the Excel and Zip files directly to S3
        if ptype in ('xlsx', 'zip'):
            with open(purl.path, mode='rb') as f:
                access_url = m.bucket.write(f.read(), basename(purl.path),
                                            m.acl)

                if m.bucket.last_reason:
                    files_processed.append([
                        *m.bucket.last_reason, access_url,
                        '/'.join(purl.path.split(os.sep)[-2:])
                    ])

                prt("Added {} distribution: {} ".format(ptype, au))
                dist_urls.append(au)

        elif ptype == 'fs':
            # Write all of the FS package files to S3

            try:
                s3_package_root = MetapackPackageUrl(str(m.s3_url),
                                                     downloader=m.downloader)

                # fake-out: it's not actually an S3 CSV package; it's a FS package on S3.
                fs_p = S3CsvPackageBuilder(purl.metadata_url,
                                           s3_package_root,
                                           callback=prt,
                                           env={},
                                           acl='public-read')

                url = fs_p.save()

                prt("Packaged saved to: {}".format(url))

                # fs_url = MetapackUrl(url, downloader=purl.metadata_url.downloader)

            except NoCredentialsError:
                err("Failed to find boto credentials for S3. "
                    "See http://boto3.readthedocs.io/en/latest/guide/configuration.html")

            # A crappy hack. make_s3_package should return the correct url
            if fs_p:
                if m.acl == 'private':
                    au = fs_p.private_access_url.inner
                else:
                    au = fs_p.public_access_url.inner

                dist_urls.append(au)

    if fs_p:
        fs_p.files_processed += files_processed  # Ugly encapsulating-breaking hack.

    return dist_urls, fs_p