def test_sync_csv_package(self):
    """Build a CSV package from a remote metadata file and list its resources.

    The remote metadata URL is resolved first, then a ``CsvPackageBuilder``
    saves a package under the local example ``_packages`` directory.
    Nothing is asserted; the test passes when no step raises.
    """
    from metapack_build.package import CsvPackageBuilder

    package_root = MetapackPackageUrl(
        test_data('packages/example.com/example.com-simple_example-2017-us/_packages'),
        downloader=downloader)

    remote_metadata = 'http://library.metatab.org/example.com-simple_example-2017-us-2/metadata.csv'
    u = MetapackUrl(remote_metadata, downloader=downloader)

    # Resolve the source up front; the returned target itself is not used.
    u.get_resource().get_target()

    builder = CsvPackageBuilder(
        u,
        package_root,
        resource_root=u.dirname().as_type(MetapackPackageUrl))
    saved_url = builder.save()

    # Dump the name and URL of every resource in the saved package.
    for res in saved_url.metadata_url.doc.resources():
        print(res.name, res.url)
def test_build_notebook_package(self):
    """Read the notebook example package and build a filesystem package.

    Checks that the ``basic_a`` resource yields 2501 rows, then builds a
    filesystem package from the same metadata and prints its URL.

    The test is reported as skipped when any import fails.
    """
    try:
        from metapack import MetapackDocumentUrl, get_cache
        from metapack_build.build import make_filesystem_package

        m = MetapackDocumentUrl(
            test_data('packages/example.com/example.com-notebook/metadata.csv'),
            downloader=downloader)

        doc = MetapackDoc(m)
        r = doc.resource('basic_a')
        self.assertEqual(2501, len(list(r)))

        package_dir = m.package_url.join_dir(PACKAGE_PREFIX)

        # Only the URL of the built package is of interest here.
        _, fs_url, _ = make_filesystem_package(
            m, package_dir, get_cache(), {}, False, False, False)

        print(fs_url)
    except ImportError:
        # unittest.skip() merely builds a decorator; skipTest() is what
        # marks an already-running test as skipped.
        self.skipTest("Pandas not installed")
def test_fixed_resource(self):
    """Check the ``simple-fixed`` resource of the full example package.

    Verifies the resource URL and resolved URL, that the row generator is a
    ``FixedSource``, and one value taken from the first ten rows.
    """
    from itertools import islice
    from rowgenerators.generator.fixed import FixedSource

    expected_url = 'fixed+http://public.source.civicknowledge.com/example.com/sources/simple-example.txt'

    m = MetapackUrl(
        test_data('packages/example.com/example.com-full-2017-us/metadata.csv'),
        downloader=downloader)
    resource = MetapackDoc(m).resource('simple-fixed')

    # Both the declared and the resolved URL carry the same value.
    self.assertEqual(expected_url, str(resource.url))
    self.assertEqual(expected_url, str(resource.resolved_url))

    generator = resource.row_generator
    print(resource.row_processor_table())
    self.assertIsInstance(generator, FixedSource)

    first_rows = list(islice(resource, 10))

    print('----')
    for row in first_rows:
        print(row)

    self.assertEqual('f02d53a3-6bbc-4095-a889-c4dde0ccf5', first_rows[5][1])
def test_build_geo_package(self):
    """Build a filesystem package from the census-regions geo example.

    After the build, the ``sra`` resource of the new package must yield 41
    dict rows, and the ``geometry`` value of the second row must be a
    ``ShapeValue``.
    """
    from rowgenerators.valuetype import ShapeValue

    source = MetapackUrl(
        test_data('packages/sangis.org/sangis.org-census_regions/metadata.csv'),
        downloader=downloader)
    target_dir = source.package_url.join_dir(PACKAGE_PREFIX)

    _, fs_url, _created = make_filesystem_package(
        source, target_dir, downloader.cache, {}, True)
    print(fs_url)

    sra = MetapackDoc(fs_url).resource('sra')
    records = list(sra.iterdict)

    self.assertEqual(41, len(records))
    self.assertIsInstance(records[1]['geometry'], ShapeValue)
def test_build_simple_package(self):
    """Build filesystem, Excel, ZIP and CSV packages from the simple example.

    A filesystem package is built first; the Excel, ZIP and CSV packages
    are then built from it. For each derived package the resource names and
    resource URLs found in the package document are compared against the
    expected values.
    """
    cli_init()

    cache = Downloader().cache

    # Every package flavour must expose the same three resources.
    expected_names = ['random-names', 'renter_cost', 'unicode-latin1']

    m = MetapackUrl(
        test_data('packages/example.com/example.com-simple_example-2017-us'),
        downloader=downloader)

    package_dir = m.package_url.join_dir(PACKAGE_PREFIX)

    _, fs_url, _ = make_filesystem_package(m, package_dir, cache, {}, False)

    fs_doc = MetapackDoc(fs_url, cache=downloader.cache)

    # Lookup only; the returned resource is not inspected.
    fs_doc.resource('random-names')

    # Excel: the expected resource URLs are the same strings as the names.
    _, url, _ = make_excel_package(fs_url, package_dir, cache, {}, False)

    self.assertEqual(expected_names, [r.name for r in url.doc.resources()])
    self.assertEqual(expected_names, [r.url for r in url.doc.resources()])

    # ZIP: the expected resource URLs are relative paths under data/.
    _, url, _ = make_zip_package(fs_url, package_dir, cache, {}, False)

    self.assertEqual(expected_names, [r.name for r in url.doc.resources()])
    self.assertEqual(
        [
            'data/random-names.csv',
            'data/renter_cost.csv',
            'data/unicode-latin1.csv',
        ],
        [r.url for r in url.doc.resources()])

    # CSV: only the last 50 characters of each URL are compared.
    _, url, _ = make_csv_package(fs_url, package_dir, cache, {}, False)

    self.assertEqual(expected_names, [r.name for r in url.doc.resources()])
    self.assertEqual(
        [
            'com-simple_example-2017-us-2/data/random-names.csv',
            '.com-simple_example-2017-us-2/data/renter_cost.csv',
            'm-simple_example-2017-us-2/data/unicode-latin1.csv',
        ],
        [str(r.url)[-50:] for r in url.doc.resources()])
def test_metapack_resources(self):
    """Print the resources of the ``metab_reuse`` example package.

    Also prints whether the target of the metadata URL exists. Nothing is
    asserted; the test passes when no step raises.
    """
    cli_init()

    metadata_path = test_data(
        'packages/example.com/example.com-metab_reuse/metadata.csv')
    m = MetapackUrl(metadata_path, downloader=downloader)

    resources = m.doc.resources()
    print(resources)

    target = m.get_resource().get_target()
    print(target.exists())
def test_build_transform_package(self):
    """Build a filesystem package from the transforms example.

    Prints the URL of the built package. Nothing is asserted; the test
    passes when the build does not raise.
    """
    source = MetapackUrl(
        test_data('packages/example.com/example.com-transforms/metadata.csv'),
        downloader=downloader)

    target_dir = source.package_url.join_dir(PACKAGE_PREFIX)

    _, fs_url, _created = make_filesystem_package(
        source, target_dir, downloader.cache, {}, False)

    print(fs_url)
def test_html(self):
    """Check the HTML representations of a package and one of its resources.

    The package HTML must be longer than 4500 characters and the HTML of
    the ``random-names`` resource longer than 400.
    """
    p = open_package(
        test_data('packages/example.com/example.com-full-2017-us/metadata.csv'))

    # Render once; assertGreater reports both values when it fails.
    package_html_len = len(p._repr_html_())
    self.assertGreater(package_html_len, 4500)

    print([e.name for e in p.find('Root.Resource')])

    r = p.find_first('Root.Resource', name='random-names')

    resource_html_len = len(r._repr_html_())
    self.assertGreater(resource_html_len, 400)
def test_nbconvert(self):
    """Run ``convert_documentation`` on the ConversionTest notebook.

    Nothing is asserted; the test passes when the conversion does not raise.
    """
    from collections import namedtuple
    from metapack.jupyter.convert import convert_documentation

    cli_init()

    # Minimal stand-in object that carries only the ``mt_file`` attribute.
    M = namedtuple('M', 'mt_file')

    notebook_path = test_data('notebooks/ConversionTest.ipynb')
    args = M(mt_file=parse_app_url(notebook_path))

    convert_documentation(args.mt_file.path)
def test_line_doc_parts(self):
    """Assemble one document from several line-oriented parts and check it.

    Each part file is read and loaded into the same ``MetapackDoc``. The
    identifier, the total term count and the contents of the ``References``
    and ``Resources`` sections are then verified.
    """
    doc = MetapackDoc(TextRowGenerator("Declare: metatab-latest"))

    part_files = (
        'line/line-oriented-doc-root.txt',
        'line/line-oriented-doc-contacts.txt',
        'line/line-oriented-doc-datafiles.txt',
        'line/line-oriented-doc-references-1.txt',
        'line/line-oriented-doc-references-2.txt',
        'line/line-oriented-doc-bib.txt',
    )

    for part in part_files:
        with open(test_data(part)) as fh:
            part_text = fh.read()

        parser = TermParser(TextRowGenerator(part_text),
                            resolver=doc.resolver, doc=doc)
        doc.load_terms(parser)

    self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249',
                     doc.get_value('Root.Identifier'))
    self.assertEqual(157, len(doc.terms))

    # Each section holds five terms. They are found both under their own
    # term name and under 'Root.Resource', and the first 'Root.Resource'
    # hit must be an instance of the section's term class.
    section_checks = (
        ('References', 'Root.Reference', Reference),
        ('Resources', 'Root.Datafile', Resource),
    )

    for section_name, own_term, term_class in section_checks:
        self.assertEqual(5, len(list(doc[section_name])))
        self.assertEqual(5, len(list(doc[section_name].find(own_term))))
        self.assertEqual(5, len(list(doc[section_name].find('Root.Resource'))))

        first_term = list(doc[section_name].find('Root.Resource'))[0]
        self.assertIsInstance(first_term, term_class)

    doc._repr_html_()  # Check no exceptions
def test_build_dataframe(self):
    """Check the grand totals of three dataframes from the python example.

    Each named resource is loaded as a dataframe and the sum over all of
    its cells is compared with the expected value.
    """
    p = open_package(
        test_data('packages/example.com/example.com-python/metadata.csv'))

    expected_totals = (
        ('simple', 270),
        ('explicit_dataframe_source', 435),
        ('implicit_dataframe_source', 435),
    )

    for resource_name, total in expected_totals:
        frame = p.resource(resource_name).dataframe()
        self.assertEqual(total, frame.sum().sum())
def test_petl(self):
    """Load the ``simple-example`` resource as a petl table and print it.

    Nothing is asserted; the test passes when no step raises.
    """
    from petl import look

    m = MetapackUrl(
        test_data('packages/example.com/example.com-full-2017-us/metadata.csv'),
        downloader=downloader)

    resource = MetapackDoc(m).resource('simple-example')

    # Resolve the resource's target before building the table.
    resource.resolved_url.get_resource().get_target()

    table = resource.petl()

    print(look(table))
def x_test_metatab_line(self):
    """Print the columns of the ``resource`` resource in ``simple-text.txt``.

    NOTE(review): the ``x_`` prefix presumably keeps the test loader from
    collecting this method -- confirm before re-enabling.
    """
    from metatab.generate import TextRowGenerator
    from metatab.cli.core import process_schemas
    from metatab import MetatabDoc

    cli_init()

    row_source = TextRowGenerator(test_data('simple-text.txt'), 'simple-text.txt')
    doc = MetatabDoc(row_source)

    process_schemas(doc)

    for column in doc.resource('resource').columns():
        print(column)
def test_read_geo_packages(self):
    """Read census and geographic references from the line-oriented document.

    Loads the ``B09020`` reference as a dataframe and the ``sra_geo`` and
    ``ri_tracts`` references as geoframes, checking row counts and geometry
    types.

    The test is reported as skipped when ``publicdata`` cannot be imported
    or when loading the census dataframe raises ``HTTPError``.
    """
    import warnings

    from requests.exceptions import HTTPError

    # catch_warnings() restores the previous filters on exit, so the
    # blanket "ignore" does not leak into tests that run afterwards.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        try:
            from publicdata.census.dataframe import CensusDataFrame
        except ImportError:
            # Returning unittest.skip(...) would not skip anything;
            # skipTest() raises the exception the runner looks for.
            self.skipTest("Public data isn't installed")

        with open(test_data('line', 'line-oriented-doc.txt')) as f:
            text = f.read()

        doc = MetapackDoc(TextRowGenerator("Declare: metatab-latest\n" + text))

        r = doc.reference('B09020')

        try:
            df = r.dataframe()
        except HTTPError:
            # The Census reporter URLs fail sometimes.
            self.skipTest("Census Reporter vanished")

        self.assertIsInstance(df, CensusDataFrame)

        r = doc.reference('sra_geo')
        gf = r.geoframe()

        self.assertEqual(41, len(gf.geometry.geom_type))
        self.assertEqual({'Polygon'}, set(gf.geometry.geom_type))

        r = doc.reference('ri_tracts')
        gf = r.geoframe()

        self.assertEqual(244, len(gf.geometry.geom_type))

        geom_types = sorted(set(gf.geometry.geom_type))
        print(geom_types)
        self.assertEqual(['MultiPolygon', 'Polygon'], geom_types)

        print(gf.head())
def test_gen_line_rows(self):
    """Parse the line-oriented document path as a ``metapack`` URL.

    The parsed URL, its resource and that resource's target must all be
    ``MetapackDocumentUrl`` instances, the URL's generator must be a
    ``TextRowGenerator``, and the document built from the URL must carry
    the expected identifier.
    """
    from metatab import parse_app_url
    from metapack import MetapackDocumentUrl
    from metatab.rowgenerators import TextRowGenerator

    doc_path = test_data('line', 'line-oriented-doc.txt')
    u = parse_app_url(doc_path, proto='metapack')

    self.assertIsInstance(u, MetapackDocumentUrl)
    self.assertIsInstance(u.get_resource(), MetapackDocumentUrl)
    self.assertIsInstance(u.get_resource().get_target(), MetapackDocumentUrl)

    self.assertIsInstance(u.generator, TextRowGenerator)

    identifier = MetapackDoc(u).get_value('Root.Identifier')
    self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249', identifier)
def test_open_package(self):
    """Open the full example package and check its terms and resources.

    Covers the type of the first datafile term, the package name, the number
    of resource terms, the presence of the expected datafile names, lookup
    by name, the resolved URL of the first datafile, the row count of
    ``unicode-latin1`` and the names of the bibliography entries.
    """
    from metapack import open_package
    from metapack.terms import Resource

    expected_datafiles = (
        'renter_cost',
        'simple-example-altnames',
        'simple-example',
        'unicode-latin1',
        'unicode-utf8',
        'renter_cost_excel07',
        'renter_cost_excel97',
        'renter_cost-2',
        'random-names',
        'random-names-fs',
        'random-names-csv',
        'random-names-xlsx',
        'random-names-zip',
        'sra',
    )

    p = open_package(
        test_data('packages/example.com/example.com-full-2017-us/metadata.csv'))

    self.assertEqual(Resource, type(p.find_first('root.datafile')))

    self.assertEqual('example.com-full-2017-us-1',
                     p.find_first('Root.Name').value)

    self.assertEqual(16, len(list(p['Resources'].find('Root.Resource'))))

    all_names = [r.name for r in p.find('Datafile')]
    for expected in expected_datafiles:
        self.assertIn(expected, all_names)

    self.assertIsInstance(p.resource('random-names'), Resource)
    self.assertEqual('random-names', p.resource('random-names').name)

    first_datafile = p.find_first('Root.DataFile')
    print(first_datafile.resolved_url)

    self.assertEqual(
        'http://public.source.civicknowledge.com/example.com/sources/test_data.zip#renter_cost.csv',
        str(first_datafile.resolved_url))

    # Only 'unicode-latin1' has its declared row count compared with the
    # number of rows it actually yields.
    for datafile in p.find('Root.DataFile'):
        if datafile.name == 'unicode-latin1':
            self.assertEqual(int(datafile.nrows), len(list(datafile)))

    self.assertEqual(['ipums', 'bordley', 'mcdonald', 'majumder'],
                     [c.name for c in p['Bibliography']])
def test_line_doc(self):
    """Load the line-oriented document and exercise two of its references.

    The ``tracts`` reference is read as rows, as a dataframe and through
    ``read_csv``. The ``incv`` reference is then resolved, and a path
    derived from it is put at the front of ``sys.path`` if not yet there.
    """
    from os.path import splitext, basename
    import sys

    with open(test_data('line', 'line-oriented-doc.txt')) as fh:
        body = fh.read()

    doc = MetapackDoc(TextRowGenerator("Declare: metatab-latest\n" + body))

    tracts_ref = doc.reference('tracts')

    self.assertEqual(628, len(list(tracts_ref)))

    # Both loaders must produce the same truncated longitude sum.
    for load in (tracts_ref.dataframe, tracts_ref.read_csv):
        frame = load()
        self.assertEqual(-73427, frame.lon.sum().astype(int))

    tracts_ref.dataframe()

    # Test loading a Python Library from a package.

    ref = doc.reference('incv')
    self.assertIsNotNone(ref)

    ref_resource = parse_app_url(ref.url).inner.clear_fragment().get_resource()

    # The path has to be a Metatab ZIP archive, and the root directory must
    # be the same as the name of the path
    pkg_name = splitext(basename(ref_resource.path))[0]

    lib_path = ref_resource.join(pkg_name).path

    if lib_path not in sys.path:
        sys.path.insert(0, lib_path)
def test_build_package(self):
    """Build a filesystem package from the full example package.

    Prints the ``created`` value returned by the build. The test is
    reported as skipped, with the error text as the reason, when the build
    raises ``ImportError``.
    """
    try:
        cli_init()

        m = MetapackUrl(
            test_data('packages/example.com/example.com-full-2017-us/metadata.csv'),
            downloader=downloader)

        package_dir = m.package_url.join_dir(PACKAGE_PREFIX)

        cache = Downloader().cache

        _, _, created = make_filesystem_package(m, package_dir, cache, {}, False)
    except ImportError as e:
        # unittest.skip() merely builds a decorator; skipTest() is what
        # marks an already-running test as skipped.
        self.skipTest(str(e))

    print(created)
def test_line_oriented(self):
    """Load the complete line-oriented document and check its references.

    Verifies the identifier, the total term count, and that the
    ``References`` section holds six terms that are found both as
    ``Root.Reference`` and as ``Root.Resource``.
    """
    doc_path = test_data('line', 'line-oriented-doc.txt')
    doc = MetapackDoc(TextRowGenerator(doc_path))

    self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249',
                     doc.get_value('Root.Identifier'))
    self.assertEqual(153, len(doc.terms))

    self.assertEqual(6, len(list(doc['References'])))

    # The same six terms are found under both term names.
    for term_name in ('Root.Reference', 'Root.Resource'):
        self.assertEqual(6, len(list(doc['References'].find(term_name))))

    first_term = list(doc['References'].find('Root.Resource'))[0]
    self.assertIsInstance(first_term, Reference)
def test_nbconvert_package(self):
    """Run ``convert_notebook`` on the ConversionTest notebook.

    Nothing is asserted about the result. The test is reported as skipped
    when the conversion raises ``ImportError`` or ``FileNotFoundError``.
    """
    try:
        from collections import namedtuple
        from metapack.jupyter.convert import convert_notebook

        cli_init()

        # Stand-in object carrying the three named attributes.
        M = namedtuple('M', 'mt_file mtfile_arg init_stage2')

        fn = test_data('notebooks/ConversionTest.ipynb')

        m = M(mt_file=parse_app_url(fn),
              mtfile_arg=parse_app_url(fn),
              init_stage2=lambda x, y: None)

        convert_notebook(m.mt_file.path)
    except (ImportError, FileNotFoundError):
        # unittest.skip() merely builds a decorator; skipTest() is what
        # marks an already-running test as skipped.
        self.skipTest("Pandoc is not installed")
def test_resolve_resource_urls(self):
    """Test how resources are resolved in packages.

    A target can be given as:

    - a name, for Excel and CSV packages
    - a path, for ZIP and filesystem packages
    - a web URL, for any kind of package

    Each row of ``packages.csv`` supplies a package URL, a target file, the
    expected resolved URL, and flags saying whether resolving the target or
    generating its rows is expected to fail.
    """
    with open(test_data('packages.csv')) as fh:
        # Numbering starts at 2, presumably so that it matches the line in
        # the CSV file once the header row is counted.
        for line_no, record in enumerate(DictReader(fh), 2):

            pkg_url = MetapackPackageUrl(record['url'], downloader=Downloader())

            try:
                resolved = pkg_url.resolve_url(record['target_file'])
                self.assertFalse(bool(record['resolve_error']))
            except ResourceError:
                # A failure to resolve is fine only when the row expects it.
                self.assertTrue(bool(record['resolve_error']))
                continue
            except DownloadError:
                raise

            # Testing containment because the resolved URL can have a path
            # in the local filesystem, which changes depending on where the
            # test is run.
            self.assertTrue(record['resolved_url'] in str(resolved),
                            (line_no, record['resolved_url'], str(resolved)))

            try:
                rows = get_generator(resolved.get_resource().get_target())

                self.assertEqual(101, len(list(rows)))
                self.assertFalse(bool(record['generate_error']))
            except DownloadError:
                raise
            except RowGeneratorError:
                # A generator failure is fine only when the row expects it.
                self.assertTrue(bool(record['generate_error']))
                continue
def test_program_resource(self):
    """Check the ``rowgen`` program resource of the full example package.

    Currently disabled: the test reports itself as skipped before doing any
    work. The body below is kept for when program resources work again.
    """
    # A bare ``return`` here made the runner count this as a passing test;
    # skipping keeps the broken feature visible in the test report.
    self.skipTest("Program resources are completely broken right now")

    m = MetapackUrl(
        test_data('packages/example.com/example.com-full-2017-us/metadata.csv'),
        downloader=downloader)

    doc = MetapackDoc(m)

    r = doc.resource('rowgen')

    self.assertEqual('program+file:scripts/rowgen.py', str(r.url))

    print(r.resolved_url)

    g = r.row_generator

    print(type(g))

    for row in r:
        print(row)
def test_notebook_url(self):
    """Execute the GenerateDataTest notebook and check the CSV files it writes.

    The notebook path must parse to a ``JupyterNotebookUrl``. After
    execution, ``dfa.csv`` and ``dfb.csv`` must exist in the output
    directory, and the rows of ``dfa.csv`` are printed.

    The test is reported as skipped when an ``ImportError`` is raised.
    """
    import tempfile
    from os.path import exists, join

    try:
        from metapack.appurl import JupyterNotebookUrl
        from metapack.jupyter.exec import execute_notebook

        u = parse_app_url(test_data('notebooks', 'GenerateDataTest.ipynb'))

        self.assertIsInstance(u, JupyterNotebookUrl)

        # A fresh directory per run: files left in a fixed /tmp location by
        # an earlier run could otherwise satisfy the checks below.
        with tempfile.TemporaryDirectory() as tmp_root:
            # Not created up front, mirroring a first run against a
            # not-yet-existing output directory.
            out_dir = join(tmp_root, 'nbtest')

            execute_notebook(u.path, out_dir, ['dfa', 'dfb'], True)

            dfa_path = join(out_dir, 'dfa.csv')
            dfb_path = join(out_dir, 'dfb.csv')

            self.assertTrue(exists(dfa_path))
            self.assertTrue(exists(dfb_path))

            g = get_generator(parse_app_url(dfa_path))

            print(list(g))
    except ImportError:
        # unittest.skip() merely builds a decorator; skipTest() is what
        # marks an already-running test as skipped.
        self.skipTest("Missing pandas or jupyter client")
def x_test_ipy(self):
    """Parse several ``ipynb`` URLs and execute a notebook row generator.

    For each URL the parsed pieces are printed, and the source spec must
    have a ``file`` or ``http`` scheme and the ``ipynb`` proto. A
    ``RowGenerator`` for ``Py3Notebook.ipynb`` is then executed and the
    number of rows it returns is printed.

    NOTE(review): the ``x_`` prefix presumably keeps the test loader from
    collecting this method -- confirm before re-enabling.
    """
    from rowgenerators import SourceSpec, Url, RowGenerator, get_cache

    urls = (
        'ipynb+file:foobar.ipynb',
        'ipynb+http://example.com/foobar.ipynb',
        'ipynb:foobar.ipynb',
    )

    for url in urls:
        u = Url(url)
        print(u, u.path, u.resource_url)

        s = SourceSpec(url)
        print(s, s.proto, s.scheme, s.resource_url, s.target_file, s.target_format)

        self.assertIn(s.scheme, ('file', 'http'))
        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual('ipynb', s.proto)

    gen = RowGenerator(cache=get_cache(),
                       url='ipynb:scripts/Py3Notebook.ipynb#lst',
                       working_dir=test_data(),
                       generator_args={'mult': lambda x: x * 3})

    rows = gen.generator.execute()

    print(len(rows))
def test_dataframe(self):
    """Check summary statistics of the ``random-names`` resource.

    The resource is loaded through ``dataframe()`` and through
    ``read_csv()``. In both cases the ``Size`` column must have a count of
    100 and a mean of 49.8032 after rounding to four places.

    The test is reported as skipped when an ``ImportError`` is raised.
    """
    try:
        p = open_package(
            test_data('packages/example.com/example.com-full-2017-us/metadata.csv'))

        r = p.resource('random-names')

        # Both loaders must produce the same statistics.
        for load in (r.dataframe, r.read_csv):
            stats = load().describe()

            # assertEqual shows both values on failure; assertTrue(a == b)
            # would only report "False is not true".
            self.assertEqual(100, stats.loc['count', 'Size'])
            self.assertEqual(49.8032, stats.loc['mean', 'Size'].round(4))
    except ImportError:
        # unittest.skip() merely builds a decorator; skipTest() is what
        # marks an already-running test as skipped.
        self.skipTest("Pandas not installed")