예제 #1
0
    def test_RowGenerator(self):
        """Check row counts for a census URL, both iterated and as a dataframe.

        Iterating the generator is expected to yield 8058 rows, while the
        dataframe built from it is expected to hold 8057 rows (one fewer,
        presumably because the header row is not a data row -- confirm).
        """
        import warnings

        # Scope the filter change to this test; a bare simplefilter() call
        # would keep silencing warnings for everything that runs afterwards
        # in the same process.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            from rowgenerators import RowGenerator

            rg = RowGenerator('census://CA/140/B17001')

            self.assertEqual(8058, len(list(rg)))

            df = rg.dataframe()

            self.assertEqual(8057, len(df))
예제 #2
0
    def test_ipy(self):
        """Check parsing of ``ipynb`` URLs and run a notebook-backed generator.

        Each URL form must parse to the ``ipynb`` proto with a ``file`` or
        ``http`` scheme. A notebook source is then executed and the number
        of returned rows is printed (no assertion is made on it).
        """
        from rowgenerators import SourceSpec, Url, RowGenerator, get_cache

        from rowgenerators.fetch import download_and_cache

        urls = ('ipynb+file:foobar.ipynb',
                'ipynb+http://example.com/foobar.ipynb', 'ipynb:foobar.ipynb')

        for url in urls:
            u = Url(url)
            print(u, u.path, u.resource_url)

            s = SourceSpec(url)
            print(s, s.proto, s.scheme, s.resource_url, s.target_file,
                  s.target_format)
            self.assertIn(s.scheme, ('file', 'http'))
            # assertEquals is a deprecated alias that was removed in
            # Python 3.12; assertEqual is the supported spelling.
            self.assertEqual('ipynb', s.proto)

        gen = RowGenerator(cache=get_cache(),
                           url='ipynb:scripts/Py3Notebook.ipynb#lst',
                           working_dir=test_data(),
                           generator_args={'mult': lambda x: x * 3})

        rows = gen.generator.execute()

        print(len(rows))
예제 #3
0
파일: doc.py 프로젝트: rkiyengar/metatab-py
    def dataframe(self, limit=None):
        """Return a pandas dataframe built from the resource.

        :param limit: When the resource declares headers, this is passed as
            the *stop* index of ``islice`` over the row generator (so it is an
            absolute row index, not a row count). It is not applied when the
            headers are taken from the first row of the file.
        :return: A ``MetatabDataFrame`` holding the rows. Any row-processing
            errors are stored on both ``self.errors`` and
            ``df.metatab_errors``; an empty dict is stored when there are none.
        """

        from .pands import MetatabDataFrame

        # Work on a copy so the keys added below never leak into whatever
        # mapping ``self.properties`` hands back.
        d = dict(self.properties)

        d['url'] = self.resolved_url
        d['working_dir'] = self._doc.doc_dir

        rg = RowGenerator(**d)

        headers = self.headers

        if headers:
            # There are several args for SelectiveRowGenerator, but only
            # start is really important.
            start = d.get('start', 1)

            rg = islice(rg, start, limit)

        else:
            # Get the headers from the first row of the file
            headers = next(rg)

        rp_table = self.row_processor_table()

        if rp_table:
            rg = RowProcessor(rg, rp_table, source_headers=headers, env={})

        df = MetatabDataFrame(list(rg), columns=headers, metatab_resource=self)

        # ``rg`` may be an ``islice`` at this point, which has no ``errors``
        # attribute, so look the attribute up defensively.
        errors = getattr(rg, 'errors', None) or {}
        self.errors = df.metatab_errors = errors

        return df
예제 #4
0
    def test_metapack(self):
        """Resolve a ``metatab+http`` URL and read it from several package forms.

        Checks the package and metadata URLs derived from the resource URL,
        the resolved URL of the ``random-names`` resource, and that the
        directory, ``.zip`` and ``.xlsx`` package forms each yield 101 rows.
        """

        from metatab import open_package, resolve_package_metadata_url

        cache = cache_fs()

        url = 'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1#random-names'

        rg = RowGenerator(cache=cache, url=url)

        package_url, metadata_url = resolve_package_metadata_url(
            rg.generator.spec.resource_url)

        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(
            'http://library.metatab.org/example.com-simple_example-2017-us-1/',
            package_url)
        self.assertEqual(
            'http://library.metatab.org/example.com-simple_example-2017-us-1/metadata.csv',
            metadata_url)

        doc = open_package(rg.generator.spec.resource_url, cache=cache)

        self.assertEqual(
            'http://library.metatab.org/example.com-simple_example-2017-us-1/data/random-names.csv',
            doc.resource('random-names').resolved_url)

        urls = [
            'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1#random-names',
            'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1.zip#random-names',
            'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1.xlsx#random-names'
        ]

        for url in urls:
            gen = None
            try:
                gen = RowGenerator(cache=cache, url=url)

                rows = list(gen)

                self.assertEqual(101, len(rows))
            except Exception:
                # Report which URL failed before letting the failure propagate.
                print("ERROR URL", url)
                print("Row Generator ", gen)
                raise
예제 #5
0
def run_row_intuit(path, cache):
    """Run the row intuiter over a sample of ``path``, trying several encodings.

    The encodings ``ascii``, ``utf8`` and ``latin1`` are tried in that order.
    For each one, up to the first 5000 rows are read and handed to
    ``RowIntuiter().run``; an encoding that raises ``TextEncodingError`` is
    skipped.

    :return: ``(encoding, intuiter_result)`` for the first encoding that works.
    :raises Exception: if every encoding fails with ``TextEncodingError``.
    """
    candidate_encodings = ('ascii', 'utf8', 'latin1')
    sample_size = 5000

    for encoding in candidate_encodings:
        try:
            source = RowGenerator(url=path, encoding=encoding, cache=cache)
            sample = list(islice(source, sample_size))
            return encoding, RowIntuiter().run(sample)
        except TextEncodingError:
            # This encoding cannot read the file; move on to the next one.
            continue

    raise Exception('Failed to convert with any encoding')
예제 #6
0
    def test_register(self):
        """Register the ``censusreporter`` proto and check URL decomposition.

        After registration, a ``censusreporter:`` URL must produce a generator
        whose ``table_id``, ``summary_level`` and ``geoid`` match the three
        path components of the URL. The rows are then printed.
        """

        from pandasreporter import CensusReporterSource, get_cache

        register_proto('censusreporter', CensusReporterSource)

        url = 'censusreporter:B17001/140/05000US06073'

        gen = RowGenerator(cache=get_cache(), url=url)

        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual('B17001', gen.generator.table_id)
        self.assertEqual('140', gen.generator.summary_level)
        self.assertEqual('05000US06073', gen.generator.geoid)

        for row in gen:
            print(row)
예제 #7
0
def run_row_intuit(path, cache):
    """Run the row intuiter over a sample of ``path``, trying several encodings.

    The encodings ``ascii``, ``utf8`` and ``latin1`` are tried in that order.
    For each one, up to the first 5000 rows are read and handed to
    ``RowIntuiter().run``.

    :param path: URL or path passed to ``RowGenerator`` as ``url``.
    :param cache: Cache object passed through to ``RowGenerator``.
    :return: ``(encoding, intuiter_result)`` for the first encoding that works.
    :raises RowIntuitError: if no encoding can read the sample.
    """
    from rowgenerators import RowGenerator
    from tableintuit import RowIntuiter
    from itertools import islice
    from rowgenerators import TextEncodingError

    for encoding in ('ascii', 'utf8', 'latin1'):
        try:
            rows = list(islice(RowGenerator(url=path,
                                            encoding=encoding,
                                            cache=cache,
                                            ), 5000))
            # ``rows`` is already a list, so it is passed on without copying.
            return encoding, RowIntuiter().run(rows)
        except (TextEncodingError, UnicodeError):
            # UnicodeError is the common base of UnicodeEncodeError and
            # UnicodeDecodeError, so a failed decode also moves on to the
            # next candidate encoding instead of aborting the search.
            pass

    raise RowIntuitError('Failed to convert with any encoding')
예제 #8
0
    def test_source_spec_url(self):
        """Check how URL fragments map onto target_file / target_segment.

        Covers a plain URL, a ``#file`` fragment and a ``#file;segment``
        fragment, and checks that ``deepcopy`` preserves both attributes.
        """
        from rowgenerators import SourceSpec, RowGenerator
        from copy import deepcopy

        ss = SourceSpec(url='http://foobar.com/a/b.csv')
        self.assertEqual('b.csv', ss.target_file)
        self.assertIsNone(ss.target_segment)

        ss = SourceSpec(url='http://foobar.com/a/b.zip#a')
        print(ss._url)
        self.assertEqual('a', ss.target_file)
        self.assertIsNone(ss.target_segment)

        ss2 = deepcopy(ss)
        self.assertEqual(ss.target_file, ss2.target_file)
        # Check the copy: the original's segment was already asserted above.
        self.assertIsNone(ss2.target_segment)

        ss = SourceSpec(url='http://foobar.com/a/b.zip#a;b')
        self.assertEqual('a', ss.target_file)
        self.assertEqual('b', ss.target_segment)

        ss2 = deepcopy(ss)
        self.assertEqual(ss.target_file, ss2.target_file)
        self.assertEqual(ss.target_segment, ss2.target_segment)

        ss = RowGenerator(
            url=
            'http://public.source.civicknowledge.com/example.com/sources/test_data.zip#renter_cost_excel07.xlsx'
        )
        self.assertEqual('renter_cost_excel07.xlsx', ss.target_file)

        ss2 = deepcopy(ss)
        self.assertEqual(ss.target_file, ss2.target_file)

        # TODO: these URLs are listed but nothing is asserted about them yet.
        for url in ('http://example.com/foo/archive.zip',
                    'http://example.com/foo/archive.zip#file.xlsx',
                    'http://example.com/foo/archive.zip#file.xlsx;0',
                    'socrata+http://example.com/foo/archive.zip'):
            pass

        print(
            SourceSpec(
                url='socrata+http://chhs.data.ca.gov/api/views/tthg-z4mf').
            __dict__)
예제 #9
0
    def test_zip(self):
        """Smoke-test reading every entry of a remote ZIP archive.

        Each entry reported by ``enumerate_contents`` is printed; entries
        whose target format is ``foo`` or ``txt`` are skipped. The remaining
        entries are read in full and their row count printed. Encoding and
        source errors are printed rather than failing the test.
        """

        from rowgenerators import enumerate_contents, RowGenerator, SourceError, TextEncodingError

        archive_url = 'http://public.source.civicknowledge.com/example.com/sources/test_data.zip'
        skipped_formats = ('foo', 'txt')

        for entry in enumerate_contents(archive_url, cache_fs()):

            print(entry.url, entry.encoding)

            if entry.target_format not in skipped_formats:
                row_source = RowGenerator(url=entry.url)
                try:
                    print(len(list(row_source)))
                except (UnicodeDecodeError, TextEncodingError) as err:
                    print("UERROR", entry.name, err)
                except SourceError as err:
                    print("ERROR", entry.name, err)
예제 #10
0
    def test_notebook(self):
        """Check parsing of ``ipynb`` URLs and execute a notebook source.

        Each URL form must parse to the ``ipynb`` proto with a ``file`` or
        ``http`` scheme. A notebook-backed generator is then executed and its
        result printed (no assertion is made on it).
        """

        urls = ('ipynb+file:foobar.ipynb',
                'ipynb+http://example.com/foobar.ipynb', 'ipynb:foobar.ipynb')

        for url in urls:
            u = Url(url)
            print(u, u.path, u.resource_url)

            s = SourceSpec(url)
            print(s, s.proto, s.scheme, s.resource_url, s.target_file,
                  s.target_format)
            self.assertIn(s.scheme, ('file', 'http'))
            # assertEquals is a deprecated alias that was removed in
            # Python 3.12; assertEqual is the supported spelling.
            self.assertEqual('ipynb', s.proto)

        gen = RowGenerator(cache=cache_fs(),
                           url='ipynb:Py3Notebook.ipynb#lst',
                           working_dir=script_path(),
                           generator_args={'mult': lambda x: x * 3})

        print(gen.generator.execute())
예제 #11
0
    def test_shapefile(self):
        """Read a state shapes file as GeoJSON and as zipped GeoJSON.

        For both URLs the generator must report ``is_geo`` and the
        ``INTPTLON`` values of all rows must sum to -4776 after truncation
        to an int.
        """

        url = "shape+http://s3.amazonaws.com/test.library.civicknowledge.com/census/tl_2016_us_state.geojson"

        gen = RowGenerator(url=url, cache=cache_fs())

        self.assertTrue(gen.is_geo)

        print("HEADERS", gen.headers)

        x = sum(float(row['INTPTLON']) for row in gen.iter_rp())

        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(-4776, int(x))

        url = "shape+http://s3.amazonaws.com/test.library.civicknowledge.com/census/tl_2016_us_state.geojson.zip"

        gen = RowGenerator(url=url, cache=cache_fs())

        self.assertTrue(gen.is_geo)

        x = sum(float(row['INTPTLON']) for row in gen.iter_rp())

        self.assertEqual(-4776, int(x))

        # TODO: a third case that iterated and printed the rows of
        # .../census/tl_2016_us_state.zip used to follow an unconditional
        # ``return`` here and therefore never ran; re-add it as a real check
        # once that source is expected to work.
예제 #12
0
    def test_run_sources(self):
        """Read every configured source and check its row count.

        Each source description from ``sources()`` is read in full and the
        number of rows compared with its ``n_rows`` entry. On failure the
        source name, the error and the first and last rows read so far are
        printed, and the failure is then re-raised so the test actually fails.
        """

        cache = cache_fs()

        for sd in sources():
            # Don't have the column map yet.
            if sd['name'] in ('simple_fixed', 'facilities'):
                continue

            # Reset per source so the diagnostics below never show rows left
            # over from a previous iteration (or hit an unbound name).
            rows = []

            try:
                # The result is unused; constructing it checks that
                # SourceSpec accepts the same keyword arguments.
                SourceSpec(**sd)

                gen = RowGenerator(cache=cache, **sd)

                rows = list(gen)

                self.assertEqual(int(sd['n_rows']), len(rows))
            except Exception as e:
                print('---')
                print(sd['name'], e)
                if rows:
                    print(rows[0])
                    print(rows[-1])
                # Swallowing the error here would let the test pass even
                # when an assertion failed.
                raise
예제 #13
0
    def test_program(self):
        """Check ``program:`` URL path parsing and run a program source.

        Each ``(url, expected_path)`` pair is parsed with ``Url`` and its
        ``path`` compared with the expected value. A ``program:rowgen.py``
        generator is then run with a mix of option-style, upper-case and
        lower-case argument keys, and the rows it produces are printed.
        """

        # NOTE(review): the '/a/b/c/rowgen.py' case appears twice; possibly a
        # different URL form was intended for one of them.
        urls = (
            ('program:rowgen.py', 'rowgen.py'),
            ('program:/rowgen.py', '/rowgen.py'),
            ('program:///rowgen.py', '/rowgen.py'),
            ('program:/a/b/c/rowgen.py', '/a/b/c/rowgen.py'),
            ('program:/a/b/c/rowgen.py', '/a/b/c/rowgen.py'),
            ('program:a/b/c/rowgen.py', 'a/b/c/rowgen.py'),
            ('program+http://foobar.com/a/b/c/rowgen.py', '/a/b/c/rowgen.py'),
        )

        for u, v in urls:
            url = Url(u)

            # assertEquals is a deprecated alias that was removed in
            # Python 3.12; the third argument is the failure message.
            self.assertEqual(url.path, v, u)

        cache = cache_fs()

        options = {'-a': 'a', '-b': 'b', '--foo': 'foo', '--bar': 'bar'}

        options.update({
            'ENV1': 'env1',
            'ENV2': 'env2',
            'prop1': 'prop1',
            'prop2': 'prop2'
        })

        gen = RowGenerator(cache=cache,
                           url='program:rowgen.py',
                           working_dir=script_path(),
                           generator_args=options)

        rows = list(gen)

        for row in rows:
            print(row)
예제 #14
0
파일: doc.py 프로젝트: rkiyengar/metatab-py
    def row_generator(self):
        """Build a ``RowGenerator`` for this resource.

        The resource properties are extended with the resolved URL, the
        ``target_*`` values taken from the ``format`` / ``segment`` / ``file``
        properties, and an encoding that defaults to ``utf8``. A copy of those
        values, plus document and package references, is passed along as
        ``generator_args``.

        :return: A ``RowGenerator`` constructed from those keyword arguments.
        """
        # Work on a copy so the keys added below never leak into whatever
        # mapping ``self.properties`` hands back.
        d = dict(self.properties)

        d['url'] = self.resolved_url
        d['target_format'] = d.get('format')
        d['target_segment'] = d.get('segment')
        d['target_file'] = d.get('file')
        # Previously this was stored under the misspelled key 'engoding' and,
        # because of a trailing comma, as a 1-tuple, so the default encoding
        # was never applied.
        d['encoding'] = d.get('encoding', 'utf8')

        # Snapshot taken before 'cache' and 'generator_args' are added to d.
        generator_args = dict(d)
        # For the ProgramSource generator, these become values in a
        # JSON-encoded dict passed through an environment variable (the
        # original comment names it PROPERTIE, presumably PROPERTIES).
        generator_args['working_dir'] = self._doc.doc_dir
        generator_args['metatab_doc'] = self._doc.ref
        generator_args['metatab_package'] = str(self._doc.package_url)

        # These become their own env vars.
        generator_args['METATAB_DOC'] = self._doc.ref
        generator_args['METATAB_PACKAGE'] = str(self._doc.package_url)

        d['cache'] = self._doc._cache
        d['working_dir'] = self._doc.doc_dir
        d['generator_args'] = generator_args

        return RowGenerator(**d)
예제 #15
0
        table = e.table # Should also be the same

    if documentation:
        ep.sections.documentation.new_term('Documentation', documentation)

    ep.sections.root.new_term('Title', desc)
    ep.sections.root.new_term('Name', table)

    ep.doc.new_section('Schema', 'DataType WidthFormat Description Coding'.split())

    t = ep.sections.schema.new_term("Root.Table", table)
    r = ep.sections.resources # Make sure the section exists

    for r in ep.resources:
        if "DataDictionary" in r.url:
            for row in DictRowGenerator(RowGenerator(r.url)):
                try:
                    t.new_child('Table.Column',
                                row.get('Variable', row['Name']),
                                datatype=row['Type'],
                                widthformat=row['Width/Format'],
                                description=row['Definition'],
                                coding=row['Coding']
                                )
                except KeyError as e:
                    warn("Different keys for '{}': {} ".format(table, e))


            ep.doc.remove_term(r.term)

        elif ("Filter" in r.url