def meta_add_5yr_sources(self):
    """Create source records for the 5-year release.

    The 5 year release has a different structure because the files are
    bigger: each state page splits into a large ('L') and a small ('S')
    geography grouping, so two sources are created per state.

    Side effects: merges DataSource rows into the ORM session and
    commits them.
    """
    from ambry.orm import DataSource
    from ambry.util import scrape_urls_from_web_page

    year = self.year
    span = 5

    source = self.source('dnlpage{}{}'.format(year, span))

    self.log("Loading from {}".format(source.url))

    state_entries = scrape_urls_from_web_page(source.url)['links']

    s = self.session

    for state_name, parts in state_entries.items():

        if state_name.endswith('/'):
            state_name = state_name.replace('/', '')

        url = parts['url']

        for suffix, size in (('All_Geographies_Not_Tracts_Block_Groups', 'L'),
                             ('Tracts_Block_Groups_Only', 'S')):

            # Join URL components explicitly: os.path.join uses the OS
            # path separator, which is wrong for URLs on Windows.
            gurl = '{}/{}'.format(url.rstrip('/'), suffix)

            table_urls = scrape_urls_from_web_page(gurl)['sources']

            for k, v in table_urls.items():
                if k.startswith('g{}{}'.format(year, span)):

                    self.log('Found: {}{}'.format(k, size))

                    d = {
                        'name': k + size,
                        'source_table_name': 'geoschema',
                        'dest_table_name': 'geoschema',
                        'filetype': 'fixed',
                        # Raw string: '\.' is an invalid escape sequence
                        # in a plain string literal.
                        'file': r'g{}.*\.txt'.format(year),
                        'encoding': 'latin1',
                        'time': year,
                        'grain': span,
                        'url': v['url']
                    }

                    ds = self._dataset.source_file(d['name'])

                    if ds:
                        ds.update(**d)
                    else:
                        ds = DataSource(**d)
                        ds.d_vid = self.dataset.vid

                    s.merge(ds)

    # Persist the merged sources, consistent with meta_add_sources().
    s.commit()
def _meta_add_5yr_sources(self):
    """Create source records for the 5-year release.

    The 5 year release has a different structure because the files are
    bigger. Each link on the download page encodes a state name plus a
    geography-grouping suffix; the suffix maps to a one-letter size code
    ('L' or 'S') that becomes part of the source name.

    Side effects: merges source rows into the ORM session, commits, and
    regenerates the sources build file.
    """
    from ambry.util import scrape_urls_from_web_page
    from ambry.orm.exc import NotFoundError

    year = self.year
    span = 5

    source = self.source("dnlpage{}{}".format(year, span))

    self.log("Loading from {}".format(source.url))

    name_map = {
        "All_Geographies_Not_Tracts_Block_Groups": "L",
        "Tracts_Block_Groups_Only": "S",
    }

    def parse_name(inp):
        # Return (state_name, size_code), or (None, None) when the link
        # does not carry one of the known geography suffixes.
        for suffix, code in name_map.items():
            if inp.endswith(suffix):
                return inp.replace("_" + suffix, ""), code
        return (None, None)

    for link_name, parts in scrape_urls_from_web_page(source.url)["sources"].items():
        url = parts["url"]

        state_name, size_code = parse_name(link_name)

        # BUG FIX: unrecognized links previously fell through and
        # produced bogus sources named 'NoneNone_...'; skip them.
        if state_name is None:
            continue

        d = {
            "name": "{}{}_{}{}".format(state_name, size_code, year, span),
            "source_table_name": "geofile",
            "dest_table_name": "geofile",
            "filetype": "csv",
            # Raw string: '\.' is an invalid escape in a plain literal.
            "file": r"g{}.*\.csv".format(year),
            "encoding": "latin1",
            "time": str(year) + str(span),
            "start_line": 0,
            "url": url,
        }

        # EAFP: update the source record if it exists, create otherwise.
        try:
            s = self._dataset.source_file(d["name"])
            s.update(**d)
        except NotFoundError:
            s = self.dataset.new_source(**d)

        self.session.merge(s)
        self.log(s.name)

    self.commit()
    self.build_source_files.sources.objects_to_record()
    self.commit()
def _meta_add_13yr_sources(self, span):
    """Run once to create the sources.csv file for a 1- or 3-year span.

    Scrapes the web page with the links to the files.

    :param span: release span in years (1 or 3), appended to the
        download-page source name and to each generated source name.

    Side effects: merges source rows into the ORM session, commits, and
    regenerates the sources build file.
    """
    from ambry.util import scrape_urls_from_web_page
    from ambry.orm.exc import NotFoundError

    source = self.source("dnlpage{}{}".format(self.year, span))

    entries = scrape_urls_from_web_page(source.url)["sources"]

    for k, v in entries.items():
        d = {
            "name": k.lower() + "_{}{}".format(self.year, span),
            "source_table_name": "geofile",
            "dest_table_name": "geofile",
            "filetype": "csv",
            # Raw string: '\.' is an invalid escape in a plain literal.
            "file": r"g{}.*\.csv".format(self.year),
            "encoding": "latin1",
            "time": str(self.year) + str(span),
            "start_line": 0,
            "url": v["url"],
        }

        # EAFP: update the source record if it exists, create otherwise.
        try:
            s = self._dataset.source_file(d["name"])
            s.update(**d)
        except NotFoundError:
            s = self.dataset.new_source(**d)

        self.session.merge(s)

    self.commit()
    self.build_source_files.sources.objects_to_record()
    self.commit()
def meta_add_sources(self):
    """Run once to create the sources.csv file for the 1- and 3-year spans.

    Side effects: merges DataSource rows into the ORM session and
    commits after each span.
    """
    from ambry.orm import DataSource
    from ambry.util import scrape_urls_from_web_page

    year = self.year

    for span in [1, 3]:

        source = self.source('dnlpage{}{}'.format(year, span))

        entries = scrape_urls_from_web_page(source.url)['sources']

        s = self.session

        for k, v in entries.items():
            d = {
                'name': k.lower() + "_{}{}".format(year, span),
                'source_table_name': 'geoschema',
                'dest_table_name': 'geoschema',
                'filetype': 'fixed',
                # Raw string: '\.' is an invalid escape sequence in a
                # plain string literal.
                'file': r'g{}.*\.txt'.format(self.year),
                'encoding': 'latin1',
                'time': year,
                'grain': span,
                'url': v['url']
            }

            ds = self._dataset.source_file(d['name'])

            if ds:
                ds.update(**d)
            else:
                ds = DataSource(**d)
                ds.d_vid = self.dataset.vid

            s.merge(ds)

        s.commit()