Example #1
    def test_urls(self):

        import rowgenerators as rg

        gdf = rg.geoframe('censusgeo://CA/140')
        print(gdf.set_index('geoid').head())

        u = rg.parse_app_url('census://CA/140/B17001')
        t = u.get_resource().get_target()
        print(t, t.year, t.release)
        self.assertEqual('census://CA/140/B17001', str(t))
        self.assertEqual(2016, t.year)
        self.assertEqual(5, t.release)

        u = rg.parse_app_url('census://2015/3/CA/140/B17001')
        t = u.get_resource().get_target()
        print(t, t.year, t.release)
        self.assertEqual('census://2015/3/CA/140/B17001', str(t))
        self.assertEqual(2015, t.year)
        self.assertEqual(3, t.release)

        gdf = t.geoframe()
        self.assertEqual(43.083, gdf.area.sum().round(3))

        gdf = rg.geoframe('census://CA/140/B17001')
        self.assertEqual(43.083, gdf.area.sum().round(3))

        gdf = rg.geoframe('censusgeo://CA/140')
        self.assertEqual(43.083, gdf.area.sum().round(3))
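
Note that the short form 'census://CA/140/B17001' resolves to year 2016 and release 5, so when the year and release are omitted the parser evidently falls back to the 2016 5-year release; the long form 'census://2015/3/CA/140/B17001' makes them explicit.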
Example #2
    def __iter__(self):
        """Iterate the estimates and margins, interleaved"""

        yield self.file_headers

        for e, m in zip(parse_app_url(self.est_url).generator, parse_app_url(self.margin_url).generator):
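            # The first six columns of each row are file metadata (STUSAB,
            # LOGRECNO, etc.); interleave the remaining estimate and margin
            # columns.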
            yield e[:6] + list(ileave(e[6:], m[6:]))
Example #3
    def __iter__(self):

        yield self.file_headers

        for e, m in zip(
                parse_app_url(self.est_url).generator,
                parse_app_url(self.margin_url).generator):
            yield e[:6] + list(ileave(e[6:], m[6:]))
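
Both iterators above depend on an ileave() helper that is not shown in these examples. A minimal sketch of such an interleaving function, assuming it simply round-robins its arguments:

from itertools import chain

def ileave(*iterables):
    """Interleave values: ileave('abc', '123') -> a, 1, b, 2, c, 3"""
    return chain.from_iterable(zip(*iterables))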
Example #4
    def test_geo_dataframe(self):

        u = parse_app_url('census://2016/5/RI/140/B01002')

        self.assertEqual(244, len(u.geoframe().geometry))

        u = parse_app_url('censusgeo://2016/5/RI/140')

        self.assertEqual(244, len(u.geoframe().geometry))
Example #5
    def __iter__(self):
        headers = list(parse_app_url(self.geo_header_url).generator)

        yield headers[0]

        t = parse_app_url(self.geo_url).get_resource().get_target()
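        # The Census geography files use Latin-1 (e.g. for accented place
        # names), so they can't be read with the default UTF-8 encoding.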
        t.encoding = 'latin1'

        yield from t.generator
Example #6
    def mt_open_package(self, line):
        """Find the metatab file for this package, open it, and load it into the namespace. """

        from metapack.jupyter.ipython import open_package

        parse_argstring(self.mt_open_package, line)
        self.shell.user_ns[MT_DOC_VAR] = open_package(self.shell.user_ns)

        if self.mt_doc.package_url:
            parse_app_url(self.mt_doc.package_url)
Example #7
    def _load_documentation_files(self):

        from metapack_jupyter.exporters import DocumentationExporter

        notebook_docs = []

        # First find and remove notebooks from the docs. These will get
        # processed to create normal documents.
        try:
            for term in list(
                    self.doc['Documentation'].find('Root.Documentation')):

                u = parse_app_url(term.value)
                if u is not None and u.target_format == 'ipynb' and u.proto == 'file':
                    notebook_docs.append(term)
                    self.doc.remove_term(term)
        except KeyError:
            self.warn("No documentation defined in metadata")

        # Process all of the normal files
        super()._load_documentation_files()

        fw = FilesWriter()
        fw.build_directory = join(self.package_path.path, 'docs')

        # Now, generate the notebook documents directly into the filesystem package
        for term in notebook_docs:

            de = DocumentationExporter(
                base_name=term.name or slugify(term.title))

            u = parse_app_url(term.value)

            nb_path = join(self.source_dir,
                           u.path)  # Only works if the path is relative.

            try:
                output, resources = de.from_filename(nb_path)
                fw.write(output,
                         resources,
                         notebook_name=de.base_name + '_full')  # Write notebook html with inputs

                de.update_metatab(self.doc, resources)
            except Exception as e:
                from metapack.cli.core import warn
                warn("Failed to convert document for {}: {}".format(
                    term.name, e))
Example #8
def add_resource(mt_file, ref, cache):
    """Add a resources entry, downloading the intuiting the file, replacing entries with
    the same reference"""

    if isinstance(mt_file, MetapackDoc):
        doc = mt_file
    else:
        doc = MetapackDoc(mt_file)

    if 'Resources' not in doc:
        doc.new_section('Resources')

    doc['Resources'].args = [
        e for e in set(doc['Resources'].args +
                       ['Name', 'StartLine', 'HeaderLines', 'Encoding']) if e
    ]

    seen_names = set()

    u = parse_app_url(ref)

    # Web and file URLs don't list their contents the same way.

    if u.proto == 'file':
        entries = u.list()
    else:
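        # A web URL lists collections of URLs, each of which must be listed
        # in turn to get the individual file entries.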
        entries = [ssu for su in u.list() for ssu in su.list()]

    for e in entries:
        add_single_resource(doc, e, cache=cache, seen_names=seen_names)

    write_doc(doc, mt_file)
Example #9
def maybe_trial_build(m):
    '''Update the metadata for a trial build, then restore it'''
    from shutil import copyfile

    if not m.args.trial:
        yield False, m.mt_file
        return

    if not m.doc._has_semver():
        raise MetapackError(
            "To use trial builds, package must have a semantic version ")

    prt('Building a trial')

    mt_file = Path(m.mt_file.fspath).parent.joinpath('trial.csv')

    copyfile(m.mt_file.fspath, mt_file)

    doc = MetapackDoc(mt_file)
    version = doc['Root'].find_first('Root.Version')
    vb = version.get_or_new_child('Version.Build')
    vb.value = 'trial'

    try:
        doc.update_name()
        doc.write()

        yield True, parse_app_url(str(mt_file), downloader)
    finally:
        mt_file.unlink()
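
The single yield inside a try/finally suggests this function is used as a context manager (via contextlib.contextmanager, which is not shown here). A hypothetical usage sketch:

with maybe_trial_build(m) as (is_trial, mt_file):
    # mt_file is either the original metadata file or the temporary
    # trial.csv copy, which is unlinked when the block exits
    build(mt_file)  # hypothetical build step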
Example #10
    def __init__(self,
                 source_ref=None,
                 package_root=None,
                 callback=None,
                 env=None):
        from metapack.doc import MetapackDoc

        self._downloader = source_ref._downloader
        self._cache = self._downloader.cache

        self._source_ref = source_ref
        self.source_dir = dirname(parse_app_url(self._source_ref).path)

        self.package_root = package_root
        self._callback = callback
        self._env = env if env is not None else {}

        self._source_doc = MetapackDoc(
            self._source_ref, cache=self._cache)  # this one stays constant

        self._doc = MetapackDoc(self._source_ref,
                                cache=self._cache)  # This one gets edited

        self._last_write_path = None

        if not self.doc.find_first_value('Root.Name'):
            raise PackageError("Package must have Root.Name term defined")
Example #11
    def test_age_dimensions(self):

        '''Check that every table with 'year' in the title gets a parsed age range'''

        tm = TableMeta(2016, 5)

        age_tables = []
        for t_id, table in tm.tables.items():
            if 'by age' in table.title.lower():
                age_tables.append(t_id)

        for at in age_tables:
            u = parse_app_url('census://2016/5/RI/40/{}'.format(at.lower()))
            g = u.generator
            t = g.table

            parse_errors = []

            for c in t.columns:
                if ('_m90' not in c.unique_id
                        and 'year' in c.description
                        and not c.age_range
                        and '1 year ago' not in c.description
                        and 'year-round' not in c.description):
                    parse_errors.append(c)

            for parse_error in parse_errors:
                print(parse_error.row)

            self.assertEqual(0, len(parse_errors))
Example #12
    def test_appurl_US(self):
        from rowgenerators import parse_app_url
        from rowgenerators.appurl.web.download import logger as download_logger
        from publicdata.census.files import logger

        logging.basicConfig()

        logger.setLevel(logging.DEBUG)

        # Iterate over all counties in the US
        u = parse_app_url('census://2016/5/US/county/B01003')

        rows = list(u.generator)

        states = set()
        counties = set()
        for row in rows[1:]:
            states.add(row[1])
            counties.add(row[3])

        from collections import Counter

        c = Counter(row[3] for row in rows[1:])

        for k, v in c.items():
            if v > 1:
                print(k, v)

        self.assertEqual(52, len(states))
        self.assertEqual(3220, len(counties))
        self.assertEqual(3220, len(rows[1:]))
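
(The 52 "states" counted here are the 50 states plus the District of Columbia and Puerto Rico.)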
Example #13
    def save(self, path=None):
        from metapack import MetapackPackageUrl

        # HACK ...
        if not self.doc.ref:
            self.doc._ref = self.package_path  # Really should not do this but ...

        self.check_is_ready()

        self.load_declares()

        self.doc.cleanse()

        self._load_resources()

        self._relink_documentation()

        self._clean_doc()

        if path is None:
            if self.package_path.inner.proto == 'file':
                path = self.package_path.path
            else:
                raise PackageError("Can't write doc to path: '{}'".format(path))

        self.doc['Root'].get_or_new_term('Root.Issued').value = datetime_now()

        self._last_write_path = path

        self.doc.write_csv(path)

        return parse_app_url(abspath(path)).as_type(MetapackPackageUrl)
Example #14
def acs_dataframe(year, release, stateab, summary_level, table):
    """
    Return a dataframe with ACS data

    :param year: ACS year
    :param release: Release, either 5 or 1
    :param stateab:  State abbreviation, or US
    :param summary_level: Summary level, either a number or string
    :param table: Table ID
    :return: A Pandas dataframe
    """

    u = parse_app_url('census://{}/{}/{}/{}/{}'.format(
        year, release, stateab, summary_level, table))

    return u.generator.dataframe()
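
For example, for Rhode Island census tracts (summary level 140) in the 2016 5-year release:

df = acs_dataframe(2016, 5, 'RI', 140, 'B01002')

assert int(df['B01002_001'].sum()) == 9708      # estimate column total
assert int(df['B01002_001_m90'].sum()) == 809   # 90% margin-of-error total
assert int(df['B01002_002'].sum()) == 9375
assert int(df['B01002_002_m90'].sum()) == 1171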
Example #15
    def load(self, url, load_all_resources=False):
        """Load a package and possibly one or all resources, from a url"""

        u = parse_app_url(url)

        d = MetapackDoc(u.clear_fragment())

        db_doc = self.document(name=d.get_value('Root.Name'))

        if not db_doc:
            self.add_doc(d)
            db_doc = self.document(name=d.get_value('Root.Name'))
            assert db_doc

        resources = []

        if load_all_resources:

            for r in self.resources(db_doc):
                self.load_resource(r)
                resources.append(r)

        elif u.target_file:

            r = self.resource(db_doc, u.target_file)

            self.load_resource(r)

            resources.append(r)

        return (db_doc, resources)
Example #16
    def resource_url(self):
        predicates = {}

        url = self.dataset.fetch_url(*self.target_file.split(','),
                                     geo_for=self.geo_for,
                                     geo_in=self.geo_in,
                                     **predicates)

        return parse_app_url(url, downloader=self.downloader)
Example #17
    def __init__(self, year, release, stusab, summary_level, seq):

        assert seq is not None

        super().__init__(year, release, stusab, summary_level, seq)

        # Url to the estimates
        self.est_url = seq_estimate_url(self.year, self.release, self.stusab, self.summary_level, self.seq)

        # Url to the margins
        self.margin_url = seq_margin_url(self.year, self.release, self.stusab, self.summary_level, self.seq)

        # Url to the file header, which includes fancy descriptions
        # The file is a 2-row Excel file, intended to be used as the headers
        # for the data files. The first row is the column ids, and the second is
        # the titles. The first 6 columns are for STUSAB, SEQUENCE, LOGRECNO, etc,
        # so they are cut off.
        self.header_url = seq_header_url(self.year, self.release, self.stusab, self.summary_level, self.seq)

        # There are only two rows in the file, the first is the file headers ( column IDs )
        # and the second is longer descriptions
        self._file_headers, _descriptions = list(parse_app_url(self.header_url).generator)

        # At least some of the fields have '%' as a separator instead of ' - '
        self._descriptions = [c.replace('%', ' -') for c in _descriptions]
Example #18
    def test_build_s3_package(self):
        from metapack_build.build import make_s3_csv_package

        cache = Downloader().cache

        fs_url = MetapackUrl(
            '/Volumes/Storage/proj/virt-proj/metapack/metapack/test-data/packages/example.com/'
            'example-package/_packages/example.com-example_data_package-2017-us-1/metadata.csv',
            downloader=downloader)

        # _, url, created =  make_excel_package(fs_url,package_dir,get_cache(), {}, False)

        # _, url, created = make_zip_package(fs_url, package_dir, get_cache(), {}, False)

        # _, url, created = make_csv_package(fs_url, package_dir, get_cache(), {}, False)

        package_dir = parse_app_url(
            's3://test.library.civicknowledge.com/metatab',
            downloader=downloader)

        _, url, created = make_s3_csv_package(fs_url, package_dir, cache, {},
                                              False)

        print(url)
        print(created)
Example #19
    def _write_path(self, path):

        if path:
            u = parse_app_url(str(path))
        else:
            u = self.ref

        if u.scheme != 'file':
            raise MetatabError("Can't write file to URL '{}'".format(str(path)))

        path = u.fspath

        if path is None:

            try:
                path = pathlib.Path(self.ref.fspath)
            except AttributeError:

                if isinstance(self.ref, str):
                    path = pathlib.Path(self.ref)
                else:
                    path = pathlib.Path(DEFAULT_METATAB_FILE)

            return path
        else:
            return pathlib.Path(str(path))
Example #20
    def save(self, path=None):

        self.check_is_ready()

        # Resets the ref so that resource.resolved_url link to the resources as written in S3
        self._doc._ref = self.access_url.join('metatab.csv')

        # Copy all of the files from the Filesystem package
        for root, dirs, files in walk(self.source_dir):
            for f in files:
                source = join(root, f)
                rel = source.replace(self.source_dir, '').strip('/')

                with open(source, 'rb') as fp:
                    self.write_to_s3(rel, fp)

        # Re-write the URLS for the datafiles
        for r in self.datafiles:
            r.url = self.bucket.access_url(r.url)
            # s3_url = self.bucket.private_access_url(r.url)
            # r.new_child('S3Url', s3_url)

        # Re-write the HTML index file.
        self._write_html()

        # Rewrite Documentation urls:
        for r in self.doc.find(['Root.Documentation', 'Root.Image']):

            url = parse_app_url(r.url)
            if url.proto == 'file':
                r.url = self.bucket.access_url(url.path)

        return self.access_url
Example #21
    def geo_url(self):
        """Return a url for the geofile for this Census file"""
        from geoid.acs import AcsGeoid

        us = tiger_url(self.year, self.summary_level,
                       AcsGeoid.parse(self.geoid).stusab)

        return parse_app_url(us)
Example #22
    def test_geo_dataframe(self):

        u = parse_app_url('census://2016/5/RI/140/B01002')

        gdf = u.generator.geoframe

        print(gdf.head())
        print(gdf.geometry.head())
Example #23
    def _load_files(self):
        """Load other files"""
        def copy_dir(path):
            for (dr, _, files) in walk(path):
                for fn in files:

                    if any(e in fn for e in self.excludes):
                        continue

                    relpath = dr.replace(self.source_dir, '').strip('/')
                    src = parse_app_url(join(dr, fn))
                    dest = join(relpath, fn)

                    resource = src.get_resource()

                    self._load_file(dest, resource.read())

        for term in self.resources(term='Root.Pythonlib'):

            uv = parse_app_url(term.value)
            ur = parse_app_url(self.source_dir)

            # In the case that the input doc is a file, and the ref is to a file,
            # try interpreting the file as relative.
            if ur.proto == 'file' and uv.proto == 'file':

                # Either a file or a directory
                path = join(self.source_dir, uv.path)
                if isdir(path):
                    copy_dir(path)

            else:
                # Load it as a URL
                f = self._get_ref_contents(term)
                try:
                    self._load_file(term.value, f.read())
                except Exception as e:
                    raise PackageError(
                        "Failed to load file for '{}': {} ".format(
                            term.value, e))

        # Copy the whole notebooks directory, excluding some files.
        nb_dir = join(self.source_dir, 'notebooks')

        if exists(nb_dir) and isdir(nb_dir):
            copy_dir(nb_dir)
Example #24
    def shape_url(self):
        """Return the shapefile URL"""
        from geoid.acs import AcsGeoid

        us = tiger_url(self.year, self.summary_level,
                       AcsGeoid.parse(self.geoid).stusab)

        return parse_app_url(us)
Example #25
def run_url_scrape(args):
    m = MetapackCliMemo(args, downloader)

    from metapack.util import scrape_urls_from_web_page

    doc = m.doc
    url = m.args.url

    doc['resources'].new_term('DownloadPage', url)

    d = scrape_urls_from_web_page(url)

    if d.get('error'):
        err(d.get('error'))

    new_resources = 0
    new_documentation = 0

    if not args.no_resources:
        for k, v in d['sources'].items():
            u = parse_app_url(v['url'])
            t = doc['Resources'].new_term('DataFile',
                                          v['url'],
                                          name=u.fspath.stem,
                                          description=v.get('description'))
            new_resources += 1
            if args.verbose:
                prt(t, t.props)

    if not args.no_docs:
        for k, v in d['external_documentation'].items():
            term_name = classify_url(v['url'])
            u = parse_app_url(v['url'])
            t = doc['Documentation'].new_term(term_name,
                                              v['url'],
                                              name=u.fspath.stem,
                                              description=v.get('description'))
            new_documentation += 1
            if args.verbose:
                prt(t, t.props)

    prt("Added {} resource and {} documentation terms".format(
        new_resources, new_documentation))

    if not args.dry_run:
        write_doc(doc)
Example #26
    def test_titles(self):
        import rowgenerators as rg
        #df = rg.dataframe(f'census:/2017/1/CA/50/B22003')
        #print(df.titles.head().T)

        u = parse_app_url('census:/2017/1/CA/50/B22003')
        for e in u.generator.table.columns:
            print(e.row)
Example #27
    def test_appurl(self):
        from publicdata.census.util import sub_geoids, sub_summarylevel

        from rowgenerators import parse_app_url
        from publicdata.census.exceptions import CensusParsingException

        #self.assertEqual(245,list(parse_app_url('census://2016/5/RI/140/B17001').generator))

        #self.assertEqual(245, list(parse_app_url('census://RI/140/B17001').generator))

        with self.assertRaises(ValueError):
            sub_geoids('foobar')

        u = parse_app_url('census://RI/140/B17001')
        self.assertEqual('B17001', u.tableid)
        self.assertEqual('04000US44', u.geoid)

        u = parse_app_url('census://B17001/140/RI')
        self.assertEqual('B17001', u.tableid)
        self.assertEqual('04000US44', u.geoid)

        u = parse_app_url('census://140/RI/B17001')
        self.assertEqual('B17001', u.tableid)
        self.assertEqual('04000US44', u.geoid)

        with self.assertRaises(CensusParsingException):
            parse_app_url('census://B17001/Frop/140')

        with self.assertRaises(CensusParsingException):
            parse_app_url('census://BINGO/RI/140')
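
The state, summary level, and table ID components of a census:// URL are evidently recognized by their form rather than their position, so the three orderings above all parse to the same tableid and geoid, while unrecognizable components ('Frop', 'BINGO') raise CensusParsingException.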
Example #28
    def resolved_url(self):
        """Return a URL to the PUMS file"""
        # '{year}/{release}-Year/csv_{record_type}{state}.zip'
        us = self.url_proto.format(year=self._year,
                                   release=self._release,
                                   record_type=self.record_type.lower(),
                                   state=self._state.lower())

        return parse_app_url(us)
Example #29
    def test_create(self):

        from publicdata.nlsy import NLSY97

        u = parse_app_url('nlsy+file:test_data/test-package/')

        nlsy = u.nlsy

        print(nlsy)
Example #30
    def __init__(self, ref=None, decl=None, package_url=None, cache=None, resolver=None, clean_cache=False):

        self._input_ref = ref

        self._cache = cache if cache else get_cache()

        self.decl_terms = {}
        self.decl_sections = {}

        self.terms = []
        self.sections = OrderedDict()
        self.super_terms = {}
        self.derived_terms = {}
        self.errors = []
        self.package_url = package_url

        self.resolver = resolver or WebResolver()

        if decl is None:
            self.decls = []
        elif not isinstance(decl, MutableSequence):
            self.decls = [decl]
        else:
            self.decls = decl

        self.root = RootSectionTerm(doc=self)
        self.add_section(self.root)

        self.load_declarations(self.decls)

        if ref:
            try:
                self._ref = parse_app_url(ref)

                if self._ref.scheme == 'file':
                    try:
                        self._mtime = getmtime(self._ref.path)
                    except (FileNotFoundError, OSError):
                        self._mtime = 0
                else:
                    self._mtime = 0

            except AppUrlError:  # ref is probably a generator, not a string or Url
                self._ref = None

            self._term_parser = TermParser(ref, resolver=self.resolver, doc=self)

            try:
                self.load_terms(self._term_parser)
            except SourceError as e:
                raise MetatabError("Failed to load terms for document '{}': {}".format(self._ref, e))


        else:
            self._ref = None
            self._term_parser = None
            self._mtime = time()