# --- Merge Bureau of Economic Analysis (BEA) metro-area GDP data ---
raw_bea = gbd.get_bea_data('http://www.bea.gov/newsreleases/regional/gdp_metro/2015/xls/gdp_metro0915.xls')
bea_df = gbd.clean_me(raw_bea)
bea_df = bea_df[:-2]  # drop the last two rows (presumably spreadsheet footnote rows -- confirm)
# Keep only metros with 2014 GDP above 20,000 and place them alongside the existing frame.
next_df = pd.concat([new_df, bea_df[bea_df['bea_2014'] > 20000]], axis=1)
print('Bureau of Economic Affairs data merged!')  # print() is valid on both Python 2 and 3

# --- Incorporate Numbeo cost-of-living data ---
url_prefix = 'http://www.numbeo.com/cost-of-living/region_rankings.jsp?title='
# FIX: was '®ion=021' -- mojibake from the '&reg' in '&region=021' being
# decoded as the HTML entity '®'. The intended query parameter is '&region=021'.
url_suffix = '&region=021'
year_list = ['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
urls = ns.build_urls(year_list)
for url in urls:
    soup_can = ns.get_pages(url)
# NOTE(review): soup_can is overwritten on each pass of the loop above, so only
# the last URL's pages reach the comprehension below -- verify against
# ns.get_pages; the flattened source is ambiguous here.
table_list = [ns.clean_up(soup) for soup in soup_can]
zipped = list(zip(year_list, table_list))
df_dict = ns.build_data_frames(zipped)

# Suffix each per-year frame's metric columns with its year so the frames can
# be merged side by side later; 'Rank' and 'City' stay unsuffixed as keys.
for item in year_list:
    columns = ns.fix_em(['Rank', 'City', 'Cost of Living Index', 'Rent Index',
                         'Cost of Living Plus Rent Index', 'Groceries Index',
                         'Restaurant Price Index', 'Local Purchasing Power Index'])
    first_cols = columns[:2]
    first_cols.extend([column + '_{}'.format(item) for column in columns[2:]])
    df_dict[item].columns = first_cols


def clean_up_df(df):
    """Split a 'City, ST' column into snake_cased city/state and drop 'rank'.

    Mutates *df* in place and also returns it for convenience.
    """
    df['state'] = df['city'].apply(lambda x: x.split(',')[1].strip().lower().replace(' ', '_'))
    df['city'] = df['city'].apply(lambda x: x.split(',')[0].lower().replace(' ', '_'))
    del df['rank']
    return df
# NOTE(review): this line appears to be a truncated, re-quoted duplicate of the
# script directly above (it stops mid-expression -- the ns.fix_em( call never
# closes, so this is not valid Python as written). Looks like a paste/merge
# artifact; confirm intent and remove or reconcile with the block above.
raw_bea = gbd.get_bea_data("http://www.bea.gov/newsreleases/regional/gdp_metro/2015/xls/gdp_metro0915.xls") bea_df = gbd.clean_me(raw_bea) bea_df = bea_df[:-2] next_df = pd.concat([new_df, bea_df[bea_df["bea_2014"] > 20000]], axis=1) print "Bureau of Economic Affairs data merged!" # incorporate numbeo data: url_prefix = "http://www.numbeo.com/cost-of-living/region_rankings.jsp?title=" url_suffix = "®ion=021" year_list = ["2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016"] urls = ns.build_urls(year_list) for url in urls: soup_can = ns.get_pages(url) table_list = [ns.clean_up(soup) for soup in soup_can] zipped = list(zip(year_list, table_list)) df_dict = ns.build_data_frames(zipped) for item in year_list: columns = ns.fix_em( [ "Rank", "City", "Cost of Living Index", "Rent Index", "Cost of Living Plus Rent Index", "Groceries Index", "Restaurant Price Index", "Local Purchasing Power Index", ]
def get_walk_data(url):
    """Download *url*, parse it with lxml, and return clean_up() of the soup.

    Parameters
    ----------
    url : str
        Page to fetch.

    Returns
    -------
    Whatever the module-level clean_up() helper produces for the parsed page.
    """
    # timeout keeps a dead or stalled server from hanging the scrape forever;
    # the previous call had no timeout at all.
    doc = requests.get(url, timeout=30).text
    soup = BeautifulSoup(doc, 'lxml')
    return clean_up(soup)