def getFiveDigitZCTAs(depends_on="http://www.census.gov/geo/www/cob/z52000.html", creates=root + "FiveDigitZCTAs/"):
    """Scrape the Census ZCTA index page and download/unzip every shapefile archive it links to."""
    MakeDir(creates)
    wget(depends_on, creates + "__Index.html")
    Soup = BS.BeautifulSoup(open(creates + "__Index.html"))
    # Pair each "_shp" link's display text (up to the first " - ") with its href.
    A = [(Contents(x.findParent()).split(" - ")[0].strip(), str(dict(x.attrs)["href"]))
         for x in Soup.findAll("a") if "_shp" in str(x)]
    for (name, url) in A:
        print "Downloading", name
        wget("http://www.census.gov" + url, creates + url.split("/")[-1])
        os.system("cd " + creates + " ; unzip " + url.split("/")[-1])
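# Usage sketch for the scrape-and-unzip pattern above (`root` is the
# module-level output directory these functions assume). Calling with the
# defaults leaves __Index.html plus one unzipped shapefile archive per
# "_shp" link in root + "FiveDigitZCTAs/":
#
#     getFiveDigitZCTAs()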
def WgetMultiple(link, fname, maxtries=10):
    """Download `link` to `fname`, retrying up to `maxtries` times.

    A download counts as successful when the saved file starts with an HTML
    doctype (the pages fetched through this helper are all HTML indexes).
    """
    link = link if is_string_like(link) else link['URL']
    opstring = '--user-agent="Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7"'
    time.sleep(5)
    for i in range(maxtries):
        wget(link, fname, opstring)
        F = open(fname, 'r').read().strip()
        if F.startswith('<!DOCTYPE HTML'):
            return
        print 'download of ' + link + ' failed: ' + F[:20]
        time.sleep(15)
    print 'download of ' + link + ' failed after ' + str(maxtries) + ' attempts'
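# A minimal self-contained sketch of the same retry-and-validate pattern using
# only the standard library; urllib.urlretrieve stands in for this module's
# wget helper, and the doctype check mirrors the success test above:

import time
import urllib

def fetch_with_retries(url, fname, maxtries=10, pause=15):
    """Download url to fname, retrying until the payload looks like HTML."""
    for i in range(maxtries):
        urllib.urlretrieve(url, fname)
        payload = open(fname).read().strip()
        if payload.startswith('<!DOCTYPE HTML'):
            return True
        time.sleep(pause)  # back off before the next attempt
    return False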
def bls_downloader(download_dir, code):
    """Download a BLS time.series dataset (identified by `code`) and stage it for parsing."""
    MakeDirs(download_dir)
    download_dir += ('/' if download_dir[-1] != '/' else '')
    MakeDir(download_dir + 'RawDownloads/')
    get = "ftp://ftp.bls.gov/pub/time.series/" + code + '/'
    WgetMultiple(get, download_dir + 'RawDownloads/index.html')
    Soup = BeautifulSoup(open(download_dir + 'RawDownloads/index.html'))
    A = Soup.findAll('a')
    Records = [(Contents(a), str(dict(a.attrs)['href'])) for a in A]
    # Skip 'Current' snapshots; if a consolidated 'AllData' file exists,
    # prefer it and drop the individual '.data.' files.
    Records = [r for r in Records if 'Current' not in r[0].split('.')]
    RecordsR = [r for r in Records if 'AllData' in r[0]]
    if RecordsR:
        Records = RecordsR + [r for r in Records if '.data.' not in r[0]]
    T = tb.tabarray(records=Records, names=['File', 'URL'])
    for (f, u) in T:
        wget(u, download_dir + 'RawDownloads/' + f + '.txt')
    makemetadata(code, download_dir + 'RawDownloads/',
                 download_dir + 'metadata.pickle', download_dir + 'filenames.tsv')
    MakeDir(download_dir + '__FILES__')
    processtextfile(download_dir + 'RawDownloads/',
                    download_dir + '__FILES__/documentation.txt')
    # Move the '.data.' files into __PARSE__ for the downstream parser.
    MakeDir(download_dir + '__PARSE__')
    for l in listdir(download_dir + 'RawDownloads/'):
        if '.data.' in l:
            Rename(download_dir + 'RawDownloads/' + l, download_dir + '__PARSE__/' + l)
    SPs = [download_dir + 'RawDownloads/' + l
           for l in listdir(download_dir + 'RawDownloads/') if l.endswith('.series.txt')]
    assert len(SPs) == 1, 'Wrong number of series paths.'
    serpath = SPs[0]
    parse_series(download_dir + 'RawDownloads/', download_dir + 'series.txt')
    delete(serpath)
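# Usage sketch: a BLS time.series code like 'cu' (Consumer Price Index, All
# Urban Consumers) names a directory under ftp.bls.gov/pub/time.series/; the
# exact code and target directory here are illustrative, not fixed by this
# module:
#
#     bls_downloader(root + 'BLS_cu', 'cu')
#
# Afterwards download_dir holds RawDownloads/ (everything fetched from the FTP
# listing), __PARSE__/ (the '.data.' files, staged for parsing), and
# __FILES__/documentation.txt, plus metadata.pickle, filenames.tsv, and
# series.txt at the top level.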
def getStates(depends_on="http://www.census.gov/geo/cob/bdy/st/st00shp/st99_d00_shp.zip", creates=root + "States/"):
    MakeDir(creates)
    wget(depends_on, creates + "st99_d00_shp.zip")
    os.system("cd " + creates + " ; unzip st99_d00_shp.zip")
def getMSA(depends_on="http://www.census.gov/geo/www/cob/mmsa2003.html", creates=root + "MSA/"):
    MakeDir(creates)
    for x in ["cs99_03c_shp.zip", "cb99_03c_shp.zip", "md99_03c_shp.zip"]:
        wget("http://www.census.gov/geo/cob/bdy/metroarea/2003/shp/" + x, creates + x)
        os.system("cd " + creates + " ; unzip " + x)
def getCounties(depends_on="http://www.census.gov/geo/cob/bdy/co/co00shp/co99_d00_shp.zip", creates=root + "Counties/"):
    MakeDir(creates)
    wget(depends_on, creates + "co99_d00_shp.zip")
    os.system("cd " + creates + " ; unzip co99_d00_shp.zip")
def getCensusRegions(depends_on="http://www.census.gov/geo/cob/bdy/rg/rg99_d00_shp.zip", creates=root + "CensusRegions/"):
    MakeDir(creates)
    wget(depends_on, creates + "rg99_d00_shp.zip")
    os.system("cd " + creates + " ; unzip rg99_d00_shp.zip")
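# getStates, getMSA, getCounties, and getCensusRegions all repeat the same
# fetch-then-unzip step. A hypothetical consolidation of the pattern, using
# the standard-library zipfile module instead of shelling out to unzip (which
# also avoids quoting problems if `creates` ever contains spaces):

import zipfile

def fetch_and_unzip(url, creates, fname=None):
    """Download the zip archive at `url` into `creates` and extract it there."""
    fname = fname or url.split('/')[-1]
    MakeDir(creates)
    wget(url, creates + fname)
    zipfile.ZipFile(creates + fname).extractall(creates)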
def modwget(dd, path):
    """Fetch dd['URL'] to `path`, passing dd['opstring'] to wget when the record has one."""
    if 'opstring' in dd.dtype.names:
        wget(dd['URL'], path, opstring=dd['opstring'])
    else:
        wget(dd['URL'], path)
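# modwget expects one record of a tabarray (or any numpy record) whose dtype
# has a 'URL' field and, optionally, an 'opstring' field of extra wget flags.
# A sketch with a placeholder URL and flag:
#
#     T = tb.tabarray(records=[('http://example.com/data.txt', '--no-clobber')],
#                     names=['URL', 'opstring'])
#     modwget(T[0], root + 'data.txt')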