def meta_add_5yr_sources(self):
    """Create source records for the 5-year release.

    The 5 year release has a different structure because the files are
    bigger: each state page splits into a large ('L') and a small ('S')
    geography grouping, so two sources are created per state.

    Side effects: merges DataSource rows into the ORM session and
    commits them.
    """
    from ambry.orm import DataSource
    from ambry.util import scrape_urls_from_web_page

    year = self.year
    span = 5

    source = self.source('dnlpage{}{}'.format(year, span))

    self.log("Loading from {}".format(source.url))

    state_entries = scrape_urls_from_web_page(source.url)['links']

    s = self.session

    for state_name, parts in state_entries.items():

        if state_name.endswith('/'):
            state_name = state_name.replace('/', '')

        url = parts['url']

        for suffix, size in (('All_Geographies_Not_Tracts_Block_Groups', 'L'),
                             ('Tracts_Block_Groups_Only', 'S')):

            # Join URL components explicitly: os.path.join uses the OS
            # path separator, which is wrong for URLs on Windows.
            gurl = '{}/{}'.format(url.rstrip('/'), suffix)

            table_urls = scrape_urls_from_web_page(gurl)['sources']

            for k, v in table_urls.items():
                if k.startswith('g{}{}'.format(year, span)):

                    self.log('Found: {}{}'.format(k, size))

                    d = {
                        'name': k + size,
                        'source_table_name': 'geoschema',
                        'dest_table_name': 'geoschema',
                        'filetype': 'fixed',
                        # Raw string: '\.' is an invalid escape sequence
                        # in a plain string literal.
                        'file': r'g{}.*\.txt'.format(year),
                        'encoding': 'latin1',
                        'time': year,
                        'grain': span,
                        'url': v['url']
                    }

                    ds = self._dataset.source_file(d['name'])

                    if ds:
                        ds.update(**d)
                    else:
                        ds = DataSource(**d)
                        ds.d_vid = self.dataset.vid

                    s.merge(ds)

    # Persist the merged sources, consistent with meta_add_sources().
    s.commit()
def _meta_add_5yr_sources(self):
    """Create source records for the 5-year release.

    The 5 year release has a different structure because the files are
    bigger. Each link on the download page encodes a state name plus a
    geography-grouping suffix; the suffix maps to a one-letter size code
    ('L' or 'S') that becomes part of the source name.

    Side effects: merges source rows into the ORM session, commits, and
    regenerates the sources build file.
    """
    from ambry.util import scrape_urls_from_web_page
    from ambry.orm.exc import NotFoundError

    year = self.year
    span = 5

    source = self.source("dnlpage{}{}".format(year, span))

    self.log("Loading from {}".format(source.url))

    name_map = {
        "All_Geographies_Not_Tracts_Block_Groups": "L",
        "Tracts_Block_Groups_Only": "S",
    }

    def parse_name(inp):
        # Return (state_name, size_code), or (None, None) when the link
        # does not carry one of the known geography suffixes.
        for suffix, code in name_map.items():
            if inp.endswith(suffix):
                return inp.replace("_" + suffix, ""), code
        return (None, None)

    for link_name, parts in scrape_urls_from_web_page(source.url)["sources"].items():
        url = parts["url"]

        state_name, size_code = parse_name(link_name)

        # BUG FIX: unrecognized links previously fell through and
        # produced bogus sources named 'NoneNone_...'; skip them.
        if state_name is None:
            continue

        d = {
            "name": "{}{}_{}{}".format(state_name, size_code, year, span),
            "source_table_name": "geofile",
            "dest_table_name": "geofile",
            "filetype": "csv",
            # Raw string: '\.' is an invalid escape in a plain literal.
            "file": r"g{}.*\.csv".format(year),
            "encoding": "latin1",
            "time": str(year) + str(span),
            "start_line": 0,
            "url": url,
        }

        # EAFP: update the source record if it exists, create otherwise.
        try:
            s = self._dataset.source_file(d["name"])
            s.update(**d)
        except NotFoundError:
            s = self.dataset.new_source(**d)

        self.session.merge(s)
        self.log(s.name)

    self.commit()
    self.build_source_files.sources.objects_to_record()
    self.commit()
def _meta_add_13yr_sources(self, span):
    """Run once to create the sources.csv file for a 1- or 3-year span.

    Scrapes the web page with the links to the files.

    :param span: release span in years (1 or 3), appended to the
        download-page source name and to each generated source name.

    Side effects: merges source rows into the ORM session, commits, and
    regenerates the sources build file.
    """
    from ambry.util import scrape_urls_from_web_page
    from ambry.orm.exc import NotFoundError

    source = self.source("dnlpage{}{}".format(self.year, span))

    entries = scrape_urls_from_web_page(source.url)["sources"]

    for k, v in entries.items():
        d = {
            "name": k.lower() + "_{}{}".format(self.year, span),
            "source_table_name": "geofile",
            "dest_table_name": "geofile",
            "filetype": "csv",
            # Raw string: '\.' is an invalid escape in a plain literal.
            "file": r"g{}.*\.csv".format(self.year),
            "encoding": "latin1",
            "time": str(self.year) + str(span),
            "start_line": 0,
            "url": v["url"],
        }

        # EAFP: update the source record if it exists, create otherwise.
        try:
            s = self._dataset.source_file(d["name"])
            s.update(**d)
        except NotFoundError:
            s = self.dataset.new_source(**d)

        self.session.merge(s)

    self.commit()
    self.build_source_files.sources.objects_to_record()
    self.commit()
def meta_add_sources(self):
    """Run once to create the sources.csv file for the 1- and 3-year spans.

    Side effects: merges DataSource rows into the ORM session and
    commits after each span.
    """
    from ambry.orm import DataSource
    from ambry.util import scrape_urls_from_web_page

    year = self.year

    for span in [1, 3]:

        source = self.source('dnlpage{}{}'.format(year, span))

        entries = scrape_urls_from_web_page(source.url)['sources']

        s = self.session

        for k, v in entries.items():
            d = {
                'name': k.lower() + "_{}{}".format(year, span),
                'source_table_name': 'geoschema',
                'dest_table_name': 'geoschema',
                'filetype': 'fixed',
                # Raw string: '\.' is an invalid escape sequence in a
                # plain string literal.
                'file': r'g{}.*\.txt'.format(self.year),
                'encoding': 'latin1',
                'time': year,
                'grain': span,
                'url': v['url']
            }

            ds = self._dataset.source_file(d['name'])

            if ds:
                ds.update(**d)
            else:
                ds = DataSource(**d)
                ds.d_vid = self.dataset.vid

            s.merge(ds)

        s.commit()