def test_line_doc_parts(self):
    """Load several line-oriented document fragments into a single doc."""
    doc = MetatabDoc(TextRowGenerator("Declare: metatab-latest"))

    fragments = [
        'line/line-oriented-doc-root.txt',
        'line/line-oriented-doc-contacts.txt',
        'line/line-oriented-doc-references-1.txt',
        'line/line-oriented-doc-references-2.txt',
        'line/line-oriented-doc-bib.txt',
    ]

    for fragment in fragments:
        with open(test_data(fragment)) as f:
            text = f.read()
        parser = TermParser(TextRowGenerator(text), resolver=doc.resolver, doc=doc)
        doc.load_terms(parser)

    self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249',
                     doc.get_value('Root.Identifier'))
    self.assertEqual(152, len(doc.terms))
    self.assertEqual(5, len(list(doc['References'])))
    self.assertEqual(5, len(list(doc['References'].find('Root.Resource'))))
def test_write_line_doc(self):
    """Convert CSV files to text lines and back to text lines"""
    # Renamed from `all`, which shadowed the builtin; also dropped a
    # leftover debug print and a duplicated assertion.
    sources = [
        'example1.csv', 'example2.csv', 'example1-web.csv',
        'children.csv', 'children2.csv', 'issue1.csv'
    ]

    self.maxDiff = None

    for f in sources:
        path = test_data(f)

        doc1 = MetatabDoc(path)
        doc1_lines = doc1.as_lines()

        # Round-trip: parse the rendered lines, then render them again.
        doc2 = MetatabDoc(TextRowGenerator(doc1_lines))
        doc2_lines = doc2.as_lines()

        self.assertEqual(doc1_lines, doc2_lines)
        self.compare_dict(doc1.as_dict(), doc2.as_dict())
        self.assertEqual(doc1.as_csv(), doc2.as_csv())
def add_resource(mt_file, ref, cache):
    """Add a resources entry, downloading and intuiting the file,
    replacing entries with the same reference.

    :param mt_file: Path to a Metatab file, or an already-open MetatabDoc.
    :param ref: A file path, directory, or URL to add as a resource.
    :param cache: Download cache passed to the content enumerator.
    """
    from metatab.util import enumerate_contents

    if isinstance(mt_file, MetatabDoc):
        doc = mt_file
    else:
        doc = MetatabDoc(mt_file)

    if 'Resources' not in doc:  # idiomatic form of `not ... in`
        doc.new_section('Resources')

    # Ensure the section declares the argument columns we will write,
    # de-duplicated and with empty entries dropped.
    doc['Resources'].args = [
        e for e in
        set(doc['Resources'].args + ['Name', 'StartLine', 'HeaderLines', 'Encoding'])
        if e
    ]

    seen_names = set()

    if isdir(ref):
        # A directory: add every recognized data file that isn't already present.
        for f in find_files(ref, DATA_FORMATS):
            if f.endswith(DEFAULT_METATAB_FILE):
                continue
            if doc.find_first('Root.Datafile', value=f):
                prt("Datafile exists for '{}', ignoring".format(f))
            else:
                add_single_resource(doc, f, cache=cache, seen_names=seen_names)
    else:
        # A single reference: enumerate its contents (e.g. sheets in a workbook).
        for c in enumerate_contents(ref, cache=cache, callback=prt):
            add_single_resource(doc, c.rebuild_url(), cache=cache, seen_names=seen_names)

    write_doc(doc, mt_file)
def test_datapackage_declare(self):
    import datapackage

    doc = MetatabDoc(test_data('datapackage_ex2.csv'))
    d = doc.as_dict()

    f = open('/tmp/package.json', 'w')  # NamedTemporaryFile(delete=False)
    f.write(json.dumps(d, indent=4))
    f.close()

    try:
        dp = datapackage.DataPackage(f.name)
        dp.validate()
    except Exception:
        # Was a bare `except:`, which also swallows SystemExit/KeyboardInterrupt.
        # Dump the offending JSON for debugging, then re-raise.
        with open(f.name) as f2:
            print(f2.read())
        raise

    print(f.name)
    # unlink(f.name)

    doc = MetatabDoc(test_data('example1.csv'))
    from metatab.datapackage import convert_to_datapackage
    print(json.dumps(convert_to_datapackage(doc), indent=4))
def test_line_doc(self):
    """Parse a single line-oriented document and verify term counts."""
    doc = MetatabDoc(TextRowGenerator("Declare: metatab-latest"))

    with open(test_data('line/line-oriented-doc.txt')) as f:
        text = f.read()

    parser = TermParser(TextRowGenerator(text), resolver=doc.resolver, doc=doc)
    doc.load_terms(parser)

    self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249',
                     doc.get_value('Root.Identifier'))
    self.assertEqual(152, len(doc.terms))

    references = doc['References']
    self.assertEqual(5, len(list(references)))
    self.assertEqual(5, len(list(references.find('Root.Reference'))))
    # References are Resources
    self.assertEqual(5, len(list(references.find('Root.Resource'))))

    rt = list(references.find('Root.Resource'))[0]
    print(type(rt))
def update_distributions(m):
    """Add a distribution term for each of the distributions the sync is creating.
    Also updates the 'Issued' time"""

    doc = MetatabDoc(m.mt_file)

    access_value = doc.find_first_value('Root.Access')

    # Map the document's access setting onto the S3 canned ACL.
    acl = 'private' if access_value == 'private' else 'public'

    b = S3Bucket(m.args.s3, acl=acl)

    updated = False

    # Existing Distribution terms; update_dist presumably compares new URLs
    # against these to decide whether a term must be added.
    old_dists = list(doc.find('Root.Distribution'))

    # Each package format the user asked for contributes a distribution URL.
    # The `is not False` tests distinguish "flag absent" from "flag disabled".
    if m.args.excel is not False:
        p = ExcelPackage(m.mt_file)
        if update_dist(doc, old_dists, b.access_url(p.save_path())):
            prt("Added Excel distribution to metadata")
            updated = True

    if m.args.zip is not False:
        p = ZipPackage(m.mt_file)
        if update_dist(doc, old_dists, b.access_url(p.save_path())):
            prt("Added ZIP distribution to metadata")
            updated = True

    if m.args.fs is not False:
        p = FileSystemPackage(m.mt_file)
        if update_dist(doc, old_dists, b.access_url(p.save_path(), DEFAULT_METATAB_FILE)):
            prt("Added FS distribution to metadata")
            updated = True

    if m.args.csv is not False:
        p = CsvPackage(m.mt_file)
        url = b.access_url(basename(p.save_path()))
        if update_dist(doc, old_dists, url):
            prt("Added CSV distribution to metadata", url)
            updated = True

    # Stamp the publication time regardless of whether distributions changed.
    doc['Root']['Issued'] = datetime_now()

    if not write_doc(doc, m.mt_file):
        # The mt_file is probably a URL, so we can't write back to it,
        # but we need the updated distributions, so write it elsewhere, then
        # reload it in the next stage.
        second_stage_file = join(PACKAGE_PREFIX, DEFAULT_METATAB_FILE)

        if not exists(dirname(second_stage_file)):
            makedirs(dirname(second_stage_file))

        assert write_doc(doc, second_stage_file)

    else:
        second_stage_file = m.mt_file

    return second_stage_file, updated
def load_doc(self, ref):
    """Set ``self._doc`` from *ref* and return ``self`` for chaining.

    A string is treated as a path/URL and parsed into a MetatabDoc;
    anything else is assumed to already be a document and adopted as-is.
    """
    is_path = isinstance(ref, string_types)
    self._doc = MetatabDoc(ref, cache=self._cache) if is_path else ref
    return self
def test_open(self):
    """Opening a doc exposes Root.Identifier at both doc and section level."""
    doc = MetatabDoc(test_data('almost-everything.csv'))

    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual('9FC11204-B291-4E0E-A841-5372090ADEC0',
                     doc.find_first_value('Root.Identifier'))
    self.assertEqual('9FC11204-B291-4E0E-A841-5372090ADEC0',
                     doc['Root'].find_first_value('Root.Identifier'))
def test_new(self):
    """Build a doc from the bundled metatab.csv template and cleanse it."""
    import metatab.templates as tmpl

    template = join(dirname(tmpl.__file__), 'metatab.csv')

    doc = MetatabDoc(template)
    doc.cleanse()

    print(doc.as_csv()[:200])
def test_acessors(self):
    """Exercise arg_props / props / all_props and attribute/item access on a term."""
    doc = MetatabDoc(test_data('properties.csv'))

    c = doc.find_first('Root.Citation', name='ipums')

    # assertEquals (deprecated alias) replaced with assertEqual throughout.

    # Arg_props not include Author, Title or Year, which are children, but not arg props
    self.assertEqual(
        sorted([
            'type', 'month', 'publisher', 'journal', 'version', 'volume',
            'number', 'pages', 'accessdate', 'location', 'url', 'doi',
            'issn', 'name'
        ]), sorted(list(c.arg_props.keys())))

    # Props includes just the children that actually have values
    self.assertEqual(
        sorted([
            'type', 'publisher', 'version', 'accessdate', 'url', 'doi',
            'author', 'title', 'year'
        ]), sorted(list(c.props.keys())))

    # All props includes values for all of the children and all of the property args
    self.assertEqual(
        sorted([
            'type', 'month', 'publisher', 'journal', 'version', 'volume',
            'number', 'pages', 'accessdate', 'location', 'url', 'doi',
            'issn', 'name', 'author', 'title', 'year'
        ]), sorted(list(c.all_props.keys())))

    # Attribute acessors
    self.assertEqual('dataset', c.type)
    self.assertEqual('2017', c.year)
    self.assertEqual('Integrated Public Use Microdata Series', c.title)
    self.assertEqual('University of Minnesota', c.publisher)

    # These are properties of Term
    self.assertEqual(c.join, 'root.citation')
    self.assertTrue(c.term_is('Root.Citation'))

    # Item style acessors
    self.assertEqual('dataset', c['type'].value)
    self.assertTrue(c['type'].term_is('Citation.Type'))
    self.assertEqual('2017', c['year'].value)
    self.assertEqual('Integrated Public Use Microdata Series', c['title'].value)
    self.assertEqual('University of Minnesota', c['publisher'].value)
    self.assertTrue(c['publisher'].term_is('Citation.Publisher'))

    c.foo = 'bar'
    c.type = 'foobar'

    self.assertEqual('foobar', c.type)
    self.assertEqual('foobar', c['type'].value)
def init_doc(self):
    # Load from the stored reference when one was given; otherwise build a
    # fresh, empty document and give it a default Declare term.
    # NOTE(review): the collapsed source makes the nesting of the Declare
    # check ambiguous; it is placed in the else-branch here — confirm.
    if self._ref:
        self.load_doc(self._ref)
    else:
        self._doc = MetatabDoc()
        if not self._doc.find("Root.Declare"):
            # FIXME. SHould really have a way to insert this term as the first term.
            self.sections.root.new_term('Declare', 'metatab-latest')
            self._doc.load_declarations(['metatab-latest'])
    return self.doc
def __iter__(self):
    """Iterate over all of the lines in the file.

    Reads a YAML file, maps its top-level keys into Metatab sections,
    builds terms for every entry, then yields the document's rows.
    """
    import yaml
    from metatab import MetatabDoc

    with open(self.url.fspath) as f:
        # safe_load: never execute arbitrary YAML tags from the input file
        # (plain yaml.load without a Loader is unsafe and deprecated).
        d = yaml.safe_load(f)

    decl = d.get('declare', 'metatab-latest')

    doc = MetatabDoc(decl=decl)

    # yield from doc.rows

    section_names = ['root', 'contacts', 'documentation', 'resources',
                     'references', 'schema']

    # Build the term-name -> section map and materialize each section.
    for section_name in section_names:
        section = doc.decl_sections[section_name]

        for tn in section.get('terms', []):
            self.section_map[tn.lower()] = section_name

        self.sections[section_name] = doc.get_or_new_section(
            section_name, section['args'])

    # Tracks the most recent term created for each name, so child terms
    # can be attached to their parent.
    last_term = {}

    for term_name, value, parent in self.yield_dict(doc, d):
        # Unknown term names fall back to the root section.
        section = self.sections.get(self.section_map.get(term_name) or 'root')

        if parent is None:
            term = section.new_term(term_name, value)
        else:
            parent_term = last_term[parent]
            term = parent_term.new_child(term_name, value)

        last_term[term_name] = term

    yield from doc.rows
def test_includes(self):
    """Included files contribute their terms to the including doc."""
    doc = MetatabDoc(test_data('include1.csv'))
    d = doc.as_dict()

    for t in doc['root'].terms:
        print(t)

    print(d)

    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(['Include File 1', 'Include File 2', 'Include File 3'],
                     d['note'])

    self.assertTrue(any('include2.csv' in e for e in d['include']))
    self.assertTrue(any('include3.csv' in e for e in d['include']))
def x_test_metatab_line(self):
    """Disabled (x_ prefix): run schema processing on a text-line document."""
    from metatab.generate import TextRowGenerator
    from metatab.cli.core import process_schemas
    from metatab import MetatabDoc

    cli_init()

    doc = MetatabDoc(
        TextRowGenerator(test_data('simple-text.txt'), 'simple-text.txt'))

    process_schemas(doc)

    resource = doc.resource('resource')
    for column in resource.columns():
        print(column)
def errs(fn):
    """Parse *fn*, expecting an IncludeError; return the parser's errors dict."""
    with self.assertRaises(IncludeError):
        target = MetatabDoc()
        parser = TermParser(fn, resolver=WebResolver, doc=target)
        list(parser)  # force the parse so the error is raised
    return parser.errors_as_dict()
def make_metatab_file(template='metatab'):
    """Return a MetatabDoc built from a named template in metatab.templates."""
    import metatab.templates as tmpl

    template_path = join(dirname(tmpl.__file__), template + '.csv')
    return MetatabDoc(template_path)
def test_children(self):
    """Every child of 'parent' carries both props and the parent value."""
    doc = MetatabDoc(test_data('children.csv'))

    for t in doc.terms:
        print(t)

    import json
    print(json.dumps(doc.as_dict(), indent=4))

    for t in doc.as_dict()['parent']:
        # assertEquals is a deprecated alias; use assertEqual.
        self.assertEqual(
            {'prop1': 'prop1', 'prop2': 'prop2', '@value': 'parent'}, t)
def metatab_admin_handler(m):
    """Dispatch the admin subcommands selected via m.args."""
    if m.args.enumerate:
        from metatab.util import enumerate_contents

        specs = list(enumerate_contents(m.args.enumerate, m.cache, callback=prt))
        for spec in specs:
            prt(classify_url(spec.url), spec.target_format, spec.url,
                spec.target_segment)

    if m.args.html:
        from metatab.html import html
        doc = MetatabDoc(m.mt_file)
        # print(doc.html)
        prt(html(doc))

    if m.args.markdown:
        from metatab.html import markdown
        doc = MetatabDoc(m.mt_file)
        prt(markdown(doc))

    if m.args.clean_cache:
        clean_cache('metapack')

    if m.args.name:
        doc = MetatabDoc(m.mt_file)
        prt(doc.find_first_value("Root.Name"))
        exit(0)
def test_new_parser(self):
    """Print terms and declared terms from a short document."""
    import json

    doc = MetatabDoc(test_data('short.csv'))

    for term in doc.terms:
        print(term)

    print(json.dumps(doc.decl_terms, indent=4))
def test_datapackage_convert(self):
    """Convert a Metatab doc to a datapackage and validate it."""
    import datapackage
    from metatab.datapackage import convert_to_datapackage

    doc = MetatabDoc(test_data('example1.csv'))

    package_dict = convert_to_datapackage(doc)
    print(json.dumps(package_dict, indent=4))

    datapackage.DataPackage(package_dict).validate()
def metatab_derived_handler(m, skip_if_exists=None):
    """Create local Zip, Excel and Filesystem packages

    :param m: CLI memo object carrying args, cache and the mt_file path
    :param skip_if_exists: When true, don't rebuild a package that already
        exists; overridden to False by the --force flag.
    :return: list of (format, url, created) tuples, one entry per package built
    """
    from metatab.package import PackageError

    create_list = []
    url = None

    doc = MetatabDoc(m.mt_file)

    # Environment dict from the package's lib module, used when running
    # transform programs during the filesystem build.
    env = get_lib_module_dict(doc)

    if (m.args.excel is not False or m.args.zip is not False or
            (hasattr(m.args, 'filesystem') and m.args.filesystem is not False)):
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

    if m.args.force:
        skip_if_exists = False

    try:
        # Always create a filesystem package before ZIP or Excel, so we can use it as a source for
        # data for the other packages. This means that Transform processes and programs only need
        # to be run once.

        if any([m.args.filesystem, m.args.excel, m.args.zip]):
            _, url, created = make_filesystem_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('fs', url, created))

            # Subsequent builds read from the freshly-built FS package.
            m.mt_file = url

            env = {}  # Don't need it anymore, since no more programs will be run.

        if m.args.excel is not False:
            _, url, created = make_excel_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('xlsx', url, created))

        if m.args.zip is not False:
            _, url, created = make_zip_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('zip', url, created))

        if m.args.csv is not False:
            _, url, created = make_csv_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('csv', url, created))

    except PackageError as e:
        err("Failed to generate package: {}".format(e))

    return create_list
def get_metatab_doc(nb_path):
    """Read a notebook and extract the metatab document. Only returns the first document"""
    from metatab.generate import CsvDataRowGenerator
    from metatab.rowgenerators import TextRowGenerator
    from metatab import MetatabDoc

    with open(nb_path) as f:
        notebook = nbformat.reads(f.read(), as_version=4)

    # Scan cells for the first one that starts with the %%metatab magic.
    for cell in notebook.cells:
        cell_text = ''.join(cell['source']).strip()
        if cell_text.startswith('%%metatab'):
            return MetatabDoc(TextRowGenerator(cell_text))
def test_parse_everything(self):
    """Round-trip each fixture through as_dict and compare to stored JSON."""
    import json

    # Renamed from `all`, which shadowed the builtin.
    filenames = [
        'example1.csv', 'example2.csv', 'example1-web.csv',
        'include1.csv', 'include2.csv', 'include3.csv',
        'children.csv', 'children2.csv', 'issue1.csv'
    ]

    # These are currently broken -- as_dict doesn't work properly with the
    # datapackage-latest decl.
    datapackages = [
        'datapackage_ex1.csv', 'datapackage_ex1_web.csv', 'datapackage_ex2.csv'
    ]

    for fn in filenames:
        print('Testing ', fn)
        path = test_data(fn)
        json_path = test_data('json', fn.replace('.csv', '.json'))

        doc = MetatabDoc(path)
        d = doc.as_dict()

        if not exists(json_path):
            # First run for this fixture: write the expected JSON.
            with open(json_path, 'w') as f:
                print("Writing", json_path)
                json.dump(d, f, indent=4)

        with open(json_path) as f:
            d2 = json.load(f)

        self.compare_dict(d, d2)
def test_versions(self):
    """as_version applies increments, replacements, and removal of the version."""
    doc = MetatabDoc(test_data('example1.csv'))

    self.assertEqual('201404', doc.find_first_value('Root.Version'))

    cases = [
        ('+5', 'example.com-voters-2002_2014-ca-county-201409'),
        ('-5', 'example.com-voters-2002_2014-ca-county-201399'),
        ('foobar', 'example.com-voters-2002_2014-ca-county-foobar'),
        (None, 'example.com-voters-2002_2014-ca-county'),
    ]
    for spec, expected in cases:
        self.assertEqual(expected, doc.as_version(spec))
def metatab_query_handler(m):
    """Dump a single resource, or all resources, from the document."""
    if not (m.args.resource or m.args.head):
        return

    # --head limits output to the first 20 rows; otherwise no limit.
    limit = 20 if m.args.head else None

    try:
        doc = MetatabDoc(m.mt_file, cache=m.cache)
    except OSError as e:
        err("Failed to open Metatab doc: {}".format(e))
        return

    if m.resource:
        dump_resource(doc, m.resource, limit)
    else:
        dump_resources(doc)
def process_schemas(mt_file, cache, clean=False):
    """Intuit column types for each resource and write a Schema section.

    :param mt_file: Path to the Metatab file, updated in place.
    :param cache: NOTE(review): not referenced in the visible body; presumably
        kept for CLI signature compatibility — confirm.
    :param clean: When True, clear any existing Schema section first.
    """
    from rowgenerators import SourceError
    from requests.exceptions import ConnectionError

    doc = MetatabDoc(mt_file)

    try:
        if clean:
            doc['Schema'].clean()
        else:
            doc['Schema']  # probe; raises KeyError when the section is missing

    except KeyError:
        doc.new_section('Schema', ['DataType', 'Altname', 'Description'])

    for r in doc.resources():
        schema_name = r.get_value('schema', r.get_value('name'))

        schema_term = doc.find_first(term='Table', value=schema_name, section='Schema')

        if schema_term:
            # A table already exists for this resource; leave it alone.
            prt("Found table for '{}'; skipping".format(schema_name))
            continue

        path, name = extract_path_name(r.url)

        prt("Processing {}".format(r.url))

        # Sample the first 100 rows, honoring the resource's header/start hints.
        si = SelectiveRowGenerator(islice(r.row_generator, 100),
                                   headers=[int(i) for i in r.get_value('headerlines', '0').split(',')],
                                   start=int(r.get_value('startline', 1)))

        try:
            ti = TypeIntuiter().run(si)
        except SourceError as e:
            warn("Failed to process '{}'; {}".format(path, e))
            continue
        except ConnectionError as e:
            warn("Failed to download '{}'; {}".format(path, e))
            continue

        table = doc['Schema'].new_term('Table', schema_name)

        prt("Adding table '{}' ".format(schema_name))

        for i, c in enumerate(ti.to_rows()):
            raw_alt_name = alt_col_name(c['header'], i)
            # Only record an altname when it differs from the original header.
            alt_name = raw_alt_name if raw_alt_name != c['header'] else ''

            table.new_child('Column', c['header'],
                            datatype=type_map.get(c['resolved_type'], c['resolved_type']),
                            altname=alt_name)

    write_doc(doc, mt_file)
def test_update_name(self):
    """update_name rebuilds Root.Name and reports when Root.Dataset is gone."""
    for fn in ('name.csv', 'name2.csv'):
        doc = MetatabDoc(test_data(fn))

        updates = doc.update_name()
        name = doc.find_first_value("Root.Name")

        # assertEquals is a deprecated alias; use assertEqual.
        self.assertEqual('example.com-foobar-2017-ca-people-1', name)
        self.assertEqual(['Changed Name'], updates)

        try:
            doc.remove_term(doc.find_first('Root.Dataset'))
        except ValueError:
            # Dataset lives as a child of Name in some fixtures; remove it there.
            nv = doc.find_first('Root.Name')
            nv.remove_child(nv.find_first('Name.Dataset'))

        updates = doc.update_name()
        self.assertIn("No Root.Dataset, so can't update the name", updates)
def metaworld():
    """CLI entry point: publish a Metatab package to Data.World."""
    import argparse

    parser = argparse.ArgumentParser(
        prog='metakan',
        description='Publish packages to Data.World, version {}'.format(
            _meta.__version__))

    parser.add_argument('-i', '--info', default=False, action='store_true',
                        help="Show package information")

    parser.add_argument('metatabfile', nargs='?', default=DEFAULT_METATAB_FILE,
                        help='Path to a Metatab file')

    class MetapackCliMemo(object):
        # Bundles the parsed args with paths/URLs derived from the Metatab file.
        def __init__(self, args):
            self.cwd = getcwd()
            self.args = args
            self.cache = get_cache('metapack')

            # Fall back to the default file in the current directory.
            self.mtfile_arg = args.metatabfile if args.metatabfile else join(
                self.cwd, DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)
            # The URL fragment selects a single resource within the package.
            self.resource = self.mtfile_url.parts.fragment

            self.package_url, self.mt_file = resolve_package_metadata_url(
                self.mtfile_url.rebuild_url(False, False))

    m = MetapackCliMemo(parser.parse_args(sys.argv[1:]))

    try:
        doc = MetatabDoc(m.mt_file, cache=m.cache)
    except (IOError, MetatabError) as e:
        err("Failed to open metatab '{}': {}".format(m.mt_file, e))

    if m.args.info:
        package_info(doc)
    else:
        send_to_dw(doc)

    exit(0)
def test_sections(self):
    """Sections can be listed, deleted by name, and iterated."""
    doc = MetatabDoc(test_data('example1.csv'))

    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(
        ['root', u'resources', u'contacts', u'notes', u'schema'],
        list(doc.sections.keys()))

    del doc['Resources']

    self.assertEqual(['root', u'contacts', u'notes', u'schema'],
                     list(doc.sections.keys()))

    notes = list(doc['notes'])
    self.assertEqual(2, len(notes))

    for sname, s in doc.sections.items():
        print(sname, s.value)
def test_find(self):
    """find/find_first locate terms, including via derived-term classes."""
    doc = MetatabDoc(test_data('example1.csv'))

    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual('cdph.ca.gov-hci-registered_voters-county',
                     doc.find_first('Root.Identifier').value)

    doc = MetatabDoc(test_data('resources.csv'))

    self.assertEqual(
        {
            'root.downloadpage', 'root.supplementarydata', 'root.api',
            'root.citation', 'root.datafile', 'root.datadictionary',
            'root.image', 'root.reference', 'root.documentation',
            'root.homepage', 'root.webpage', 'root.sql', 'root.dsn'
        },
        doc.derived_terms['root.resource'])

    self.assertEqual([
        'example1', 'example10', 'example2', 'example3', 'example4',
        'example5', 'example6', 'example7', 'example8', 'example9'
    ], sorted([t.name for t in doc.find('root.resource')]))

    self.assertEqual(['example1', 'example2'],
                     [t.name for t in doc.find('root.datafile')])