def test_RowGenerator(self):
    """Check row counts from the census RowGenerator, as a list and a dataframe."""
    import warnings
    warnings.simplefilter("ignore")

    from rowgenerators import RowGenerator

    generator = RowGenerator('census://CA/140/B17001')

    # 8058 raw rows; the dataframe has one fewer, presumably the header — TODO confirm.
    self.assertEqual(8058, len(list(generator)))

    frame = generator.dataframe()
    self.assertEqual(8057, len(frame))
def test_ipy(self):
    """Exercise Url/SourceSpec parsing of ipynb URLs, then run a notebook generator."""
    from rowgenerators import SourceSpec, Url, RowGenerator, get_cache

    urls = ('ipynb+file:foobar.ipynb',
            'ipynb+http://example.com/foobar.ipynb',
            'ipynb:foobar.ipynb')

    for url in urls:
        u = Url(url)
        print(u, u.path, u.resource_url)

        s = SourceSpec(url)
        print(s, s.proto, s.scheme, s.resource_url, s.target_file, s.target_format)
        self.assertIn(s.scheme, ('file', 'http'))
        # assertEquals is a deprecated unittest alias; use assertEqual.
        self.assertEqual('ipynb', s.proto)

    # The generator_args entry is passed through to the notebook execution.
    gen = RowGenerator(cache=get_cache(),
                       url='ipynb:scripts/Py3Notebook.ipynb#lst',
                       working_dir=test_data(),
                       generator_args={'mult': lambda x: x * 3})

    rows = gen.generator.execute()
    print(len(rows))
def dataframe(self, limit=None):
    """Return a pandas dataframe from the resource.

    :param limit: If given, read at most this many data rows.
    """
    from .pands import MetatabDataFrame

    d = self.properties

    d['url'] = self.resolved_url
    d['working_dir'] = self._doc.doc_dir

    rg = RowGenerator(**d)

    headers = self.headers

    if headers:
        # There are several args for SelectiveRowGenerator, but only
        # start is really important.
        start = d.get('start', 1)

        rg = islice(rg, start, limit)

    else:
        # No declared headers; take them from the first row of the file.
        headers = next(rg)

    rp_table = self.row_processor_table()

    if rp_table:
        rg = RowProcessor(rg, rp_table, source_headers=headers, env={})

    df = MetatabDataFrame(list(rg), columns=headers, metatab_resource=self)

    # BUG FIX: when rg is an islice (headers declared, no row-processor
    # table) it has no .errors attribute, so the original raised
    # AttributeError here. getattr with a default keeps the old
    # behavior for RowGenerator/RowProcessor, which do carry .errors.
    self.errors = df.metatab_errors = getattr(rg, 'errors', None) or {}

    return df
def test_metapack(self):
    """Resolve metatab package URLs and read the same resource from each package format."""
    from metatab import open_package, resolve_package_metadata_url

    cache = cache_fs()

    url = 'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1#random-names'

    rg = RowGenerator(cache=cache, url=url)

    package_url, metadata_url = resolve_package_metadata_url(
        rg.generator.spec.resource_url)

    # assertEquals is a deprecated unittest alias; use assertEqual.
    self.assertEqual(
        'http://library.metatab.org/example.com-simple_example-2017-us-1/',
        package_url)
    self.assertEqual(
        'http://library.metatab.org/example.com-simple_example-2017-us-1/metadata.csv',
        metadata_url)

    doc = open_package(rg.generator.spec.resource_url, cache=cache)

    self.assertEqual(
        'http://library.metatab.org/example.com-simple_example-2017-us-1/data/random-names.csv',
        doc.resource('random-names').resolved_url)

    # The same resource, packaged three ways: directory, zip, xlsx.
    urls = [
        'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1#random-names',
        'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1.zip#random-names',
        'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1.xlsx#random-names'
    ]

    for url in urls:
        gen = None
        try:
            gen = RowGenerator(cache=cache, url=url)
            rows = list(gen)
            self.assertEqual(101, len(rows))
        except Exception:
            # Report which URL failed, then re-raise. The original bare
            # except: also trapped KeyboardInterrupt/SystemExit.
            print("ERROR URL", url)
            print("Row Generator ", gen)
            raise
def run_row_intuit(path, cache):
    """Try candidate encodings in order; return (encoding, intuiter) for the first that works."""
    candidates = ('ascii', 'utf8', 'latin1')

    for encoding in candidates:
        try:
            sample = islice(RowGenerator(url=path, encoding=encoding, cache=cache), 5000)
            return encoding, RowIntuiter().run(list(sample))
        except TextEncodingError:
            continue

    raise Exception('Failed to convert with any encoding')
def test_register(self):
    """Register a custom proto and verify the generator parses the URL components."""
    from pandasreporter import CensusReporterSource, get_cache

    register_proto('censusreporter', CensusReporterSource)

    url = 'censusreporter:B17001/140/05000US06073'

    gen = RowGenerator(cache=get_cache(), url=url)

    # assertEquals is a deprecated unittest alias; use assertEqual.
    self.assertEqual('B17001', gen.generator.table_id)
    self.assertEqual('140', gen.generator.summary_level)
    self.assertEqual('05000US06073', gen.generator.geoid)

    for row in gen:
        print(row)
def run_row_intuit(path, cache):
    """Find an encoding that lets the source's rows be intuited.

    Tries each candidate encoding in order and returns ``(encoding, RowIntuiter)``
    for the first that succeeds.

    :param path: URL or path for the RowGenerator.
    :param cache: cache passed through to the RowGenerator.
    :raises RowIntuitError: if every candidate encoding fails.
    """
    from rowgenerators import RowGenerator
    from tableintuit import RowIntuiter
    from itertools import islice
    from rowgenerators import TextEncodingError

    for encoding in ('ascii', 'utf8', 'latin1'):
        try:
            # Sample at most 5000 rows for intuition.
            rows = list(islice(RowGenerator(url=path, encoding=encoding, cache=cache), 5000))
            # rows is already a list; the original re-wrapped it with list().
            return encoding, RowIntuiter().run(rows)
        except (TextEncodingError, UnicodeEncodeError):
            # Wrong encoding guess; try the next candidate.
            pass

    raise RowIntuitError('Failed to convert with any encoding')
def test_source_spec_url(self):
    """Verify target_file/target_segment parsing, including deepcopy round-trips."""
    from rowgenerators import SourceSpec, RowGenerator
    from copy import deepcopy

    spec = SourceSpec(url='http://foobar.com/a/b.csv')
    self.assertEqual('b.csv', spec.target_file)
    self.assertIsNone(spec.target_segment)

    # Fragment names the file inside the archive.
    spec = SourceSpec(url='http://foobar.com/a/b.zip#a')
    print(spec._url)
    self.assertEqual('a', spec.target_file)
    self.assertIsNone(spec.target_segment)

    copied = deepcopy(spec)
    self.assertEqual(spec.target_file, copied.target_file)
    self.assertIsNone(spec.target_segment)

    # "file;segment" fragment form.
    spec = SourceSpec(url='http://foobar.com/a/b.zip#a;b')
    self.assertEqual('a', spec.target_file)
    self.assertEqual('b', spec.target_segment)

    copied = deepcopy(spec)
    self.assertEqual(spec.target_file, copied.target_file)
    self.assertEqual(spec.target_segment, copied.target_segment)

    spec = RowGenerator(
        url=
        'http://public.source.civicknowledge.com/example.com/sources/test_data.zip#renter_cost_excel07.xlsx'
    )
    self.assertEqual('renter_cost_excel07.xlsx', spec.target_file)

    copied = deepcopy(spec)
    self.assertEqual(spec.target_file, copied.target_file)

    # NOTE(review): placeholder loop — these URLs are not exercised yet.
    for url in ('http://example.com/foo/archive.zip',
                'http://example.com/foo/archive.zip#file.xlsx',
                'http://example.com/foo/archive.zip#file.xlsx;0',
                'socrata+http://example.com/foo/archive.zip'):
        pass

    print(
        SourceSpec(
            url='socrata+http://chhs.data.ca.gov/api/views/tthg-z4mf').
        __dict__)
def test_zip(self):
    """Enumerate the members of a remote zip and count rows in each parseable one."""
    from rowgenerators import enumerate_contents, RowGenerator, SourceError, TextEncodingError

    zip_url = 'http://public.source.civicknowledge.com/example.com/sources/test_data.zip'
    cache = cache_fs()

    for member in enumerate_contents(zip_url, cache):
        print(member.url, member.encoding)

        # Skip formats we don't expect to parse.
        if member.target_format in ('foo', 'txt'):
            continue

        gen = RowGenerator(url=member.url)

        try:
            print(len(list(gen)))
        except (UnicodeDecodeError, TextEncodingError) as e:
            print("UERROR", member.name, e)
        except SourceError as e:
            print("ERROR", member.name, e)
def test_notebook(self):
    """Parse ipynb URLs and execute a notebook-backed row generator."""
    urls = ('ipynb+file:foobar.ipynb',
            'ipynb+http://example.com/foobar.ipynb',
            'ipynb:foobar.ipynb')

    for url in urls:
        u = Url(url)
        print(u, u.path, u.resource_url)

        s = SourceSpec(url)
        print(s, s.proto, s.scheme, s.resource_url, s.target_file, s.target_format)
        self.assertIn(s.scheme, ('file', 'http'))
        # assertEquals is a deprecated unittest alias; use assertEqual.
        self.assertEqual('ipynb', s.proto)

        # print(download_and_cache(s, cache_fs()))

    gen = RowGenerator(cache=cache_fs(),
                       url='ipynb:Py3Notebook.ipynb#lst',
                       working_dir=script_path(),
                       generator_args={'mult': lambda x: x * 3})

    print(gen.generator.execute())
def test_shapefile(self):
    """Read a geojson shapefile (plain and zipped) and check the summed longitudes."""
    url = "shape+http://s3.amazonaws.com/test.library.civicknowledge.com/census/tl_2016_us_state.geojson"

    gen = RowGenerator(url=url, cache=cache_fs())
    self.assertTrue(gen.is_geo)

    print("HEADERS", gen.headers)

    x = 0
    for row in gen.iter_rp():
        x += float(row['INTPTLON'])

    # assertEquals is a deprecated unittest alias; use assertEqual.
    self.assertEqual(-4776, int(x))

    url = "shape+http://s3.amazonaws.com/test.library.civicknowledge.com/census/tl_2016_us_state.geojson.zip"

    gen = RowGenerator(url=url, cache=cache_fs())
    self.assertTrue(gen.is_geo)

    x = 0
    for row in gen.iter_rp():
        x += float(row['INTPTLON'])

    self.assertEqual(-4776, int(x))

    return

    # NOTE(review): everything below the bare return is unreachable —
    # apparently a disabled check for the raw .zip shapefile. Either
    # delete it or re-enable it.
    url = "shape+http://s3.amazonaws.com/test.library.civicknowledge.com/census/tl_2016_us_state.zip"

    gen = RowGenerator(url=url, cache=cache_fs())

    for row in gen:
        print(row)
def test_run_sources(self):
    """Run every configured source spec and check the expected row counts."""
    cache = cache_fs()

    for sd in sources():

        # Don't have the column map yet.
        if sd['name'] in ('simple_fixed', 'facilities'):
            continue

        # BUG FIX: rows must exist before the try; if RowGenerator
        # construction raised, the handler's rows[0] hit a NameError.
        rows = []

        try:
            # Constructing the spec first — presumably for validation; TODO confirm.
            ss = SourceSpec(**sd)
            gen = RowGenerator(cache=cache, **sd)
            rows = list(gen)
            # assertEquals is a deprecated unittest alias; use assertEqual.
            self.assertEqual(int(sd['n_rows']), len(rows))
        except Exception as e:
            print('---')
            print(sd['name'], e)
            # Only show first/last rows if any were actually read
            # (rows[0] on an empty list would raise IndexError here).
            if rows:
                print(rows[0])
                print(rows[-1])
def test_program(self):
    """Check program: URL path parsing, then run a program-backed generator."""
    urls = (
        ('program:rowgen.py', 'rowgen.py'),
        ('program:/rowgen.py', '/rowgen.py'),
        ('program:///rowgen.py', '/rowgen.py'),
        ('program:/a/b/c/rowgen.py', '/a/b/c/rowgen.py'),
        ('program:/a/b/c/rowgen.py', '/a/b/c/rowgen.py'),
        ('program:a/b/c/rowgen.py', 'a/b/c/rowgen.py'),
        ('program+http://foobar.com/a/b/c/rowgen.py', '/a/b/c/rowgen.py'),
    )

    for u, v in urls:
        url = Url(u)
        # assertEquals is a deprecated unittest alias; use assertEqual.
        self.assertEqual(url.path, v, u)

    cache = cache_fs()

    # Command-line style options...
    options = {'-a': 'a', '-b': 'b', '--foo': 'foo', '--bar': 'bar'}

    # ...plus entries without a dash prefix.
    options.update({
        'ENV1': 'env1',
        'ENV2': 'env2',
        'prop1': 'prop1',
        'prop2': 'prop2'
    })

    gen = RowGenerator(cache=cache,
                       url='program:rowgen.py',
                       working_dir=script_path(),
                       generator_args=options)

    rows = list(gen)

    for row in rows:
        print(row)
def row_generator(self):
    """Build a RowGenerator for this resource from its properties and document context."""
    d = self.properties

    d['url'] = self.resolved_url
    d['target_format'] = d.get('format')
    d['target_segment'] = d.get('segment')
    d['target_file'] = d.get('file')
    # BUG FIX: the key was misspelled 'engoding', and a stray trailing
    # comma made the value a one-element tuple.
    d['encoding'] = d.get('encoding', 'utf8')

    generator_args = dict(d.items())

    # For the ProgramSource generator, these become values in a
    # JSON-encoded dict in the PROPERTIES env var.
    generator_args['working_dir'] = self._doc.doc_dir
    generator_args['metatab_doc'] = self._doc.ref
    generator_args['metatab_package'] = str(self._doc.package_url)

    # These become their own env vars.
    generator_args['METATAB_DOC'] = self._doc.ref
    generator_args['METATAB_PACKAGE'] = str(self._doc.package_url)

    d['cache'] = self._doc._cache
    d['working_dir'] = self._doc.doc_dir
    d['generator_args'] = generator_args

    return RowGenerator(**d)
table = e.table # Should also be the same if documentation: ep.sections.documentation.new_term('Documentation', documentation) ep.sections.root.new_term('Title', desc) ep.sections.root.new_term('Name', table) ep.doc.new_section('Schema', 'DataType WidthFormat Description Coding'.split()) t = ep.sections.schema.new_term("Root.Table", table) r = ep.sections.resources # Make sure the section exists for r in ep.resources: if "DataDictionary" in r.url: for row in DictRowGenerator(RowGenerator(r.url)): try: t.new_child('Table.Column', row.get('Variable', row['Name']), datatype=row['Type'], widthformat=row['Width/Format'], description=row['Definition'], coding=row['Coding'] ) except KeyError as e: warn("Different keys for '{}': {} ".format(table, e)) ep.doc.remove_term(r.term) elif ("Filter" in r.url