Пример #1
0
    def join_dir(self, s):
        """Join *s* onto the directory of this URL's key, returning a new s3: URL.

        :param s: A URL object (with ``.path``/``.netloc``) or a URL string.
        :return: ``s`` unchanged when it is absolute (has a netloc); otherwise a
            new ``s3://`` URL whose key is ``dirname(self.key)`` joined with the
            path of ``s``.
        """

        # Accept either a parsed URL object or a plain string. Parse strings
        # once so the path and netloc checks see the same object; the original
        # code read `s.netloc` directly, which raised AttributeError whenever
        # `s` was a string.
        try:
            path = s.path
            netloc = s.netloc
        except AttributeError:
            u = parse_app_url(s)
            path = u.path
            netloc = u.netloc

        # If there is a netloc, it's an absolute URL; return it untouched.
        if netloc:
            return s

        new_key = join(dirname(self.key), path)

        return parse_app_url('s3://{bucket}/{key}'.format(
            bucket=self.bucket_name.strip('/'), key=new_key.lstrip('/')))
Пример #2
0
    def test_fixed(self):
        """Build a fixed-width table and generate rows from a fixed+file URL."""
        from itertools import islice

        from rowgenerators import Table

        table = Table()
        for col_name, col_type, col_width in (('id', int, 6),
                                              ('uuid', str, 34),
                                              ('int', int, 3),
                                              ('float', float, 14)):
            table.add_column(col_name, col_type, col_width)

        print(str(table))

        parse = table.make_fw_row_parser()

        url = parse_app_url(
            'fixed+file:/Volumes/Storage/Downloads/test_data/fixed/simple-example.txt'
        )

        print(url.get_resource())
        print(url.get_resource().get_target())

        gen = get_generator(url, table=table)

        print(type(gen))

        for record in islice(gen, 10):
            print(record)
Пример #3
0
    def test_program(self):
        """Run rowgen.py as a 'program' URL and check the rows it emits."""

        url = parse_app_url(script_path('rowgen.py'))
        url.scheme_extension = 'program'

        env = {
            '--long-arg': 'a',
            '-s': 'a',
            'ENV_VAR': 'a',
            'prop1': 'a',
            'prop2': 'a'
        }

        gen = get_generator(url, working_dir=dirname(url.path), env=env)

        print(type(gen))

        collected = {}

        # Key each row by "<type>-<k>" so specific rows can be asserted below.
        for row in gen.iter_rp:
            collected['{}-{}'.format(row['type'], row['k'])] = row.v
            print(row)

        self.assertEqual('a', collected['prop-prop1'])
        self.assertEqual('{"prop1": "a", "prop2": "a"}',
                         collected['env-PROPERTIES'])
Пример #4
0
def get_file(url_str):
    """Resolve *url_str* to a downloaded resource and return its target URL."""

    from appurl import parse_app_url

    resource = parse_app_url(url_str).get_resource()

    return resource.get_target()
Пример #5
0
    def test_geo(self):
        """Fetch a censusreporter table as a dataframe and print its geo length."""

        url = parse_app_url('censusreporter://B01001/140/05000US06073')

        frame = url.generator.dataframe()

        print(len(frame.geo))
Пример #6
0
    def test_app_urls(self):
        """Check that database URLs parse to the expected driver and dialect.

        Reads url/driver/dialect triples from ``database_urls.csv`` and
        compares the parsed URL's attributes against the expected values.
        """

        with open(test_data('database_urls.csv')) as f:
            for e in DictReader(f):

                u = parse_app_url(e['url'])

                # assertEquals is a deprecated unittest alias; use assertEqual.
                self.assertEqual(str(e['driver']), str(u.driver))
                self.assertEqual(str(e['dialect']), str(u.dialect))
Пример #7
0
def extract(resource, doc, *args, **kwargs):
    """Extract rows from an FFIEC disclosure file, from a collection of Root.References,
    for a given prefix

    This function is used as a program URL in a Root.DataFile term:

    Section: Resources
    DataFile:       python:publicdata.ffiec#extract
    Datafile.Name:  sb_loan_orig
    Datafile.Schema:cra_disclosure
    Datafile.Prefix:D1-1

    The schema for the table must be specified, because the rows are fixed width, so
    the schema must have a Column.Width for each column.

    The function also expects that all of the references in the document refer to FFIEC files, such as:

    Section: References
    Reference: https://www.ffiec.gov/cra/xls/15exp_discl.zip
    Reference.Name: discl_15
    Reference: https://www.ffiec.gov/cra/xls/14exp_discl.zip
    Reference.Name: discl_14
    Reference: https://www.ffiec.gov/cra/xls/13exp_discl.zip
    Reference.Name: discl_13
    Reference: https://www.ffiec.gov/cra/xls/12exp_discl.zip
    Reference.Name: discl_12
    Reference: https://www.ffiec.gov/cra/xls/11exp_discl.zip
    Reference.Name: discl_11
    Reference: https://www.ffiec.gov/cra/xls/10exp_discl.zip
    Reference.Name: discl_10

    :param resource: The DataFile term being processed; supplies the prefix,
        the row-processor table, and the optional 'test' flag.
    :param doc: The metatab document whose references are scanned.
    :yields: The table headers, then one parsed row per matching line.
    """

    # In test mode only the first 10 lines of each file are scanned.
    test = bool(resource.get_value('test', False))

    prefix = resource.prefix

    table = resource.row_processor_table()

    yield table.headers

    parser = table.make_fw_row_parser(ignore_empty=True)

    for r in doc.references():

        print("Processing ", r.name)

        t = parse_app_url(r.url).get_resource().get_target()

        # 'rU' mode was deprecated in 3.4 and removed in Python 3.11;
        # universal newlines are the default in text mode, so 'r' is
        # equivalent.
        with open(t.path, 'r') as f:

            # Iterate the file lazily instead of materializing it with
            # readlines().
            for line in (islice(f, 10) if test else f):
                # Only the lines for the requested record prefix are parsed.
                if not line.startswith(prefix + ' '):
                    continue

                yield parser(line)
Пример #8
0
def enumerate_contents(base_spec, cache_fs, callback=None):
    """Inspect the URL, and if it is a container ( ZIP Or Excel ) inspect each of the contained
    files. Yields all of the lower-level URLs"""

    # Strings (or anything that isn't already a Url) are parsed first.
    if not isinstance(base_spec, Url):
        base_spec = parse_app_url(url=base_spec)

    # Two levels of inspection: the container itself, then each entry.
    for spec in inspect(base_spec, cache_fs, callback=callback):
        yield from inspect(spec, cache_fs, callback=callback)
Пример #9
0
    def test_entrypoints(self):
        """Check that get_generator dispatches to the expected Source class."""
        from rowgenerators.generator.iterator import IteratorSource
        from rowgenerators.generator.generator import GeneratorSource
        from rowgenerators.generator.csv import CsvSource

        csv_url = 'http://public.source.civicknowledge.com/example.com/sources/unicode-utf8.csv'

        def gen():
            yield None

        self.assertIsInstance(get_generator([]), IteratorSource)
        self.assertIsInstance(get_generator(gen()), GeneratorSource)

        target = parse_app_url(csv_url).get_resource().get_target()
        self.assertIsInstance(get_generator(target), CsvSource)
Пример #10
0
    def get_resource(self):
        """Get the contents of resource and save it to the cache, returning a file-like object"""

        from appurl import parse_app_url

        self._resource = self._downloader.download(self.inner)

        # Re-parse the downloaded file's system path, carrying the fragment
        # and scheme details over from this URL.
        return parse_app_url(
            self._resource.sys_path,
            fragment=self.fragment,
            fragment_query=self.fragment_query,
            scheme_extension=self.scheme_extension,
            downloader=self.downloader,
        )
Пример #11
0
    def __init__(self, url=None, downloader=None, **kwargs):
        # Build a Google Spreadsheet ('gs') URL wrapper and derive the
        # equivalent web export URL for the spreadsheet.
        super().__init__(url, downloader, **kwargs)

        self._proto = 'gs'

        # The spreadsheet key may appear as the path ('gs://key', no netloc)
        # or as the netloc ('gs:key'), depending on how the URL was written.
        self.key = self.path or self.netloc  # former without '://', later with ':'
        # gid identifies a specific worksheet within the spreadsheet.
        self.gid = self.target_file

        if self.gid:
            # NOTE(review): 'gid_siffix' looks like a typo for 'gid_suffix',
            # but it must match the attribute declared on the class — confirm
            # before renaming.
            web_url = (self.url_template + self.gid_siffix).format(
                key=self.key, gid=self.gid)
        else:
            web_url = self.url_template.format(key=self.key)

        # NOTE(review): when gid is falsy this still embeds it in the target
        # file name (e.g. 'key-None.csv') — confirm that is intended.
        web_url += "#target_file={}-{}.csv".format(self.key, self.gid)

        self.web_url = parse_app_url(web_url)
Пример #12
0
    def get_resource(self):
        """Fetch the API response (from cache when available) and return it as
        a CensusReporterJsonUrl pointing at the cached JSON file.

        NOTE(review): the final ``cache.getsyspath`` call requires a cache; if
        ``self.downloader.cache`` is falsy this raises AttributeError —
        confirm a cache is always configured for this downloader.
        """
        cache = self.downloader.cache

        # Download only when the response is not already cached. (The original
        # used an empty `if ...: pass / else:` branch for the same logic.)
        if not (cache and cache.exists(self.cache_key)):
            r = requests.get(self.resource_url)
            r.raise_for_status()
            data = r.json()

            if cache:
                cache.makedirs(dirname(self.cache_key), recreate=True)
                cache.settext(self.cache_key, json.dumps(data, indent=4))

        return parse_app_url(
            cache.getsyspath(self.cache_key),
            fragment=["/".join(self._parts), None],
        ).as_type(CensusReporterJsonUrl)
Пример #13
0
    def test_google(self):
        """Resolve a gs: spreadsheet URL to resource and target, then print rows."""

        u = parse_app_url('gs:1qjjtkMqpxtkDp3qZlkF7P8Tm8VtfIwiWW-OqJ2J91yE#2038675149')

        web = u.web_url
        print(type(web), web)

        resource = u.get_resource()
        print(type(resource), resource.path)

        target = resource.get_target()
        print(type(target), target.path)

        for row in target.generator:
            print(row)
Пример #14
0
    def test_sources(self):
        """For each entry in sources.csv, check the generator class and row count."""
        from csv import DictReader

        with open(data_path('sources.csv')) as f:
            for e in DictReader(f):

                # Rows without a url_class are placeholders; skip them.
                if not e['url_class']:
                    print()
                    continue

                u = parse_app_url(e['url'])
                r = u.get_resource()
                t = r.get_target()

                g = get_generator(t)

                # assertEquals is a deprecated unittest alias; use assertEqual.
                self.assertEqual(e['gen_class'], g.__class__.__name__)

                self.assertEqual(int(e['n_rows']), (len(list(g))))
Пример #15
0
    def test_geo(self):
        """Resolve a shape+http URL and verify the shapefile generator's schema,
        headers, and row count."""

        from rowgenerators.generator.shapefile import ShapefileSource
        from rowgenerators.appurl.shapefile import ShapefileUrl

        us = 'shape+http://s3.amazonaws.com/public.source.civicknowledge.com/sangis.org/Subregional_Areas_2010.zip'
        u = parse_app_url(us)

        r = u.get_resource()

        self.assertIsInstance(r, ShapefileUrl)

        t = r.get_target()

        self.assertIsInstance(t, ShapefileUrl)

        self.assertTrue(
            str(t).endswith(
                'public.source.civicknowledge.com/sangis.org/Subregional_Areas_2010.zip#SRA2010tiger.shp'
            ))

        g = get_generator(t)

        self.assertIsInstance(g, ShapefileSource)

        self.assertEqual([{
            'name': 'id',
            'type': 'int'
        }, {
            'name': 'SRA',
            'type': 'int'
        }, {
            'name': 'NAME',
            'type': 'str'
        }, {
            'name': 'geometry',
            'type': 'geometry_type'
        }], g.columns)
        self.assertEqual(['id', 'SRA', 'NAME', 'geometry'], g.headers)

        # assertEquals is a deprecated unittest alias; use assertEqual.
        self.assertEqual(42, len(list(g)))
Пример #16
0
    def test_geo(self):
        """Resolve a shape+http URL, print its stages, and check the row count."""

        us = 'shape+http://s3.amazonaws.com/public.source.civicknowledge.com/sangis.org/Subregional_Areas_2010.zip'
        u = parse_app_url(us)

        r = u.get_resource()

        print(type(r), r)

        t = r.get_target()

        print(type(t), t)

        g = get_generator(t)

        print(type(g))

        print(g.columns)
        print(g.headers)

        # assertEquals is a deprecated unittest alias; use assertEqual.
        self.assertEqual(42, len(list(g)))
Пример #17
0
    def test_basic(self):
        """Exercise the censusreporter URL, its generator, and the dataframe
        margin-of-error helpers (sum_m, ratio, proportion)."""
        from publicdata.censusreporter.url import CensusReporterURL
        from publicdata.censusreporter.generator import CensusReporterSource

        u = parse_app_url('censusreporter://B01001/140/05000US06073')

        self.assertEqual(629, len(list(u.generator)))
        self.assertIsInstance(u, CensusReporterURL)
        self.assertIsInstance(u.generator, CensusReporterSource)

        B01001 = u.generator.dataframe()

        self.assertEqual(3223096.0, B01001.B01001001.sum())

        cols = [
            'geoid',
            'B01001001',  # Total Population
            'B01001002',  # Total Male
            'B01001026',  # Total Female
            'B01001013', 'B01001014',  # Males, 35-39 and 40-44
            'B01001037', 'B01001038'  # Female, 35-39 and 40-44
        ]

        df = B01001[cols].copy()

        # Derived estimate columns, each paired with its 90% margin of error.
        df['male_35_44'], df['male_35_44_m90'] = df.sum_m('B01001013', 'B01001014')
        df['female_35_44'], df['female_35_44_m90'] = df.sum_m('B01001037', 'B01001038')
        df['m_ratio'], df['m_ratio_m90'] = df.ratio('male_35_44', 'B01001002')

        print(len(df.proportion('male_35_44', 'female_35_44')))

        df['mf_proprtion'], df['mf_proprtion_m90'] = df.proportion('male_35_44', 'female_35_44')

        self.assertEqual(211707.0, df.female_35_44.dropna().sum())
        self.assertEqual(82, int(df.m_ratio.dropna().sum()))
Пример #18
0
    def test_census_shapes(self):
        """Check that a censusreportergeo URL resolves to a shapefile resource
        and that its generator yields the expected number of rows."""
        from publicdata.censusreporter.url import CensusReporterShapeURL
        from rowgenerators.appurl.shapefile import ShapefileUrl, ShapefileShpUrl
        from rowgenerators.generator.shapefile import ShapefileSource

        u = parse_app_url('censusreportergeo://B01003/140/05000US06073')

        self.assertTrue(str(u.resource_url).endswith('&format=shp'))

        self.assertIsInstance(u, CensusReporterShapeURL)

        r = u.get_resource()

        self.assertIsInstance(r, ShapefileUrl)

        self.assertTrue(str(r).endswith('/latest.zip#.%2A%5C.shp%24'), str(r))

        g = r.generator

        self.assertIsInstance(g, ShapefileSource)

        # assertEquals is a deprecated unittest alias; use assertEqual.
        # (A redundant trailing bare `return` was removed.)
        self.assertEqual(629, (len(list(g))))
Пример #19
0
    def resource_url(self):

        return parse_app_url("http://{host}/1.0/data/download/latest?table_ids={table_id}&geo_ids={sl}|{geoid}&format=shp" \
            .format(host=self.api_host,table_id=self.table_id, sl=self.summary_level, geoid=self.geoid),
                      downloader=self.downloader)
Пример #20
0
def process_schema(doc, resource, df):
    """Add schema entries to a metatab doc from a dataframe.

    Intuits column types from *df* and appends a Table term (with Column
    children) to the doc's Schema section. Does nothing if a Table for the
    resource's schema name already exists.

    :param doc: The metatab document to update.
    :param resource: The resource term supplying the schema name and URL.
    :param df: The dataframe whose columns are intuited.
    :return: The new Table term, or None when the schema already exists or
        type intuition fails.
    """
    from rowgenerators import SourceError
    from requests.exceptions import ConnectionError

    from metapack.cli.core import extract_path_name, type_map
    from metapack_build.core import alt_col_name
    from tableintuit import TypeIntuiter
    from rowgenerators.generator.python import PandasDataframeSource
    from appurl import parse_app_url

    # Ensure the Schema section exists before adding terms to it.
    try:
        doc['Schema']
    except KeyError:
        doc.new_section('Schema', ['DataType', 'Altname', 'Description'])

    schema_name = resource.get_value('schema', resource.get_value('name'))

    schema_term = doc.find_first(term='Table',
                                 value=schema_name,
                                 section='Schema')

    if schema_term:
        logger.info("Found table for '{}'; skipping".format(schema_name))
        return

    path, name = extract_path_name(resource.url)

    logger.info("Processing {}".format(resource.url))

    si = PandasDataframeSource(
        parse_app_url(resource.url),
        df,
        cache=doc._cache,
    )

    # Type intuition is best-effort: log and bail out on known failures.
    try:
        ti = TypeIntuiter().run(si)
    except SourceError as e:
        # logger.warn is a deprecated alias; use logger.warning.
        logger.warning("Failed to process '{}'; {}".format(path, e))
        return
    except ConnectionError as e:
        logger.warning("Failed to download '{}'; {}".format(path, e))
        return

    table = doc['Schema'].new_term('Table', schema_name)

    logger.info("Adding table '{}' to metatab schema".format(schema_name))

    for i, c in enumerate(ti.to_rows()):
        # Only record an altname when it differs from the original header.
        raw_alt_name = alt_col_name(c['header'], i)
        alt_name = raw_alt_name if raw_alt_name != c['header'] else ''

        t = table.new_child('Column', c['header'],
                            datatype=type_map.get(c['resolved_type'], c['resolved_type']),
                            altname=alt_name,
                            description=df[c['header']].description \
                                        if hasattr(df, 'description') and df[c['header']].description else ''
                            )

    return table
Пример #21
0
    def __init__(self,
                 url,
                 name=None,
                 proto=None,
                 resource_format=None,
                 target_file=None,
                 target_segment=None,
                 target_format=None,
                 encoding=None,
                 columns=None,
                 generator_args=None,
                 **kwargs):
        """Construct a source specification around a URL.

        :param url: A Url object (used as-is), or a URL string to be parsed.
        :param name: An optional name for the source; a UUID string is
            generated when omitted.
        :param proto: Either the scheme of the url, or the scheme extension.
            One of http, https, gs, socrata. Forces how the URL is interpreted.
        :param resource_format: The file format of the object the URL points
            to, such as a ZIP file, which may have an internal file of another
            type. Normalized to lower case.
        :param target_file: A reference to an internal file in a Zip archive.
            May be a string, or a regular expression.
        :param target_segment: A reference to a worksheet in a spreadsheet.
            May be a string or a number.
        :param target_format: Forces the target file format, which is usually
            taken from the file extension. Normalized to lower case.
        :param encoding: The file encoding.
        :param columns: Optional column specifications, stored on the source.
        :param generator_args: Stored and made available to generators.
        :param kwargs: Stored and made available to generators.
        """

        if isinstance(url, Url):
            self._url = url
        else:
            # Format strings are lower-cased before parsing; None passes
            # through unchanged.
            self._url = parse_app_url(
                url,
                proto=proto,
                resource_format=(resource_format.lower()
                                 if resource_format else resource_format),
                target_file=target_file,
                target_segment=target_segment,
                target_format=(target_format.lower()
                               if target_format else target_format),
                encoding=encoding)

        self.name = name or str(uuid4())
        self.columns = columns
        self.download_time = None  # Set externally

        self.generator_args = generator_args
        self.kwargs = kwargs
Пример #22
0
def get_generator(source, **kwargs):
    """Return a row-generating Source for *source*.

    :param source: A Source instance (returned as-is), a URL string, a parsed
        Url, a generator, or any other iterable.
    :param kwargs: Passed through to the generator class constructor.
    :raises RowGeneratorError: If the source type is unknown, no generator
        class matches, or the matched class fails to instantiate.

    Generator classes are discovered from the 'rowgenerators' entry-point
    group. Candidate names are built from the source's target format, scheme
    extension, scheme, and class name; among matching classes, the one with
    the lowest ``priority`` wins.
    """
    # collections.Iterable was removed in Python 3.10; import the ABC from
    # collections.abc instead.
    from collections.abc import Iterable

    from rowgenerators import Source
    names = []

    if isinstance(source, Source):
        return source

    if isinstance(source, str):
        # Resolve the string to a concrete target URL and key off its format.
        ref = parse_app_url(source).get_resource().get_target()
        try:
            names.append('.{}'.format(ref.target_format))
        except AttributeError:
            pass

    elif inspect.isgenerator(source):
        names.append('<generator>')
        ref = source

    elif isinstance(source, Iterable):
        names.append('<iterator>')
        ref = source

    elif hasattr(source, '__iter__'):
        names.append('<iterator>')
        ref = source

    elif isinstance(source, Url):
        ref = source
        # Collect every entry-point name the URL could be registered under.
        try:
            names.append('.{}'.format(ref.target_format))
        except AttributeError:
            pass

        try:
            names.append('{}+'.format(ref.scheme_extension))
        except AttributeError:
            pass

        try:
            names.append('{}:'.format(ref.scheme))
        except AttributeError:
            pass

        try:
            names.append('<{}>'.format(ref.__class__.__name__))
        except AttributeError:
            pass

    else:
        raise RowGeneratorError("Unknown arg type for source: '{}'".format(
            type(source)))

    classes = sorted([
        ep.load()
        for ep in iter_entry_points(group='rowgenerators') if ep.name in names
    ],
                     key=lambda cls: cls.priority)

    if not classes:
        raise RowGeneratorError(
            "Can't find generator for source '{}' \nproto={}, resource_format={}, target_format={} "
            .format(source, ref.proto, ref.resource_format, ref.target_format))

    try:
        return classes[0](ref, **kwargs)
    except Exception as e:

        raise RowGeneratorError(
            "Failed to instantiate generator for class '{}', ref '{}'".format(
                classes[0], ref)) from e