import get_bea_data as gbd
import population_cleanup as pc
import recent_pop_cleanup as rpc
import glob
import walkscore as ws
import os

import pandas as pd  # fix: `pd` was used below but never imported
# TODO(review): `ns` (the numbeo scraper module) is used below but never
# imported anywhere in this chunk — confirm its module name and import it
# here, e.g. `import numbeo_scrape as ns`.

# Read in population (1790 - 2010) and RJ Metrics meetup info (2013 - 2014)
# and merge the two frames column-wise on their shared city index.
census_pop_df = pc.get_pop_data('data/1790-2010_MASTER.csv')
rj_df = pc.get_rj_data('data/rj_metrics.txt')
new_df = pd.concat([census_pop_df, rj_df], axis=1)
print('Census data merged to RJ metrics data!')  # fix: Py2 print statement

# Clean and join Bureau of Economic Affairs GDP-by-metro data.
raw_bea = gbd.get_bea_data('http://www.bea.gov/newsreleases/regional/gdp_metro/2015/xls/gdp_metro0915.xls')
bea_df = gbd.clean_me(raw_bea)
bea_df = bea_df[:-2]  # drop the spreadsheet's two trailing (footnote) rows
# Keep only metro areas whose 2014 BEA figure exceeds 20000.
next_df = pd.concat([new_df, bea_df[bea_df['bea_2014'] > 20000]], axis=1)
print('Bureau of Economic Affairs data merged!')

# Incorporate numbeo cost-of-living data, one ranking page per year.
url_prefix = 'http://www.numbeo.com/cost-of-living/region_rankings.jsp?title='
# fix: the suffix read '®ion=021' — mojibake for '&region=021'
# (the '&reg' in the query string had been decoded into the (R) sign).
url_suffix = '&region=021'
year_list = ['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
urls = ns.build_urls(year_list)
# NOTE(review): original indentation was lost; the grouping below assumes
# all three statements belong to the loop body — confirm against history.
for url in urls:
    soup_can = ns.get_pages(url)
    table_list = [ns.clean_up(soup) for soup in soup_can]
    zipped = list(zip(year_list, table_list))
# NOTE(review): this chunk opens mid-way through get_pop_data, whose `def`
# line lies above the visible range.  Its dangling tail (a column-dropping
# df.drop(...) call — with 'Name_2010' commented out of the drop list —
# followed by `return df`) cannot stand alone, so it is preserved here as
# a comment rather than guessed at:
#
#     # 'Name_2010',
#     df.drop(['Place Type', 'CityST', 'ID', 'LAT_BING', 'LON_BING',
#              '1790', '1800', '1810', '1820', '1830', '1840', '1850',
#              '1860', '1870', '1880', '1890', '1900', '1910', '1920',
#              '1930', '1940', ], axis=1, inplace=True)
#     return df


def get_rj_data(file_path='data/rj_metrics.txt'):
    '''Load and clean the RJ Metrics meetup file.

    INPUT: file_path -- path to a tab-separated file with a 'City' column
           formatted like 'New York, NY'
    OUTPUT: DataFrame indexed by a normalised city name (lowercase, spaces
            and hyphens replaced by underscores) with a 'state' column
            holding the text after the last comma (leading space kept).
    '''
    rj_df = pd.read_table(file_path)
    # State is everything after the last comma (e.g. ' NY'); the leading
    # space is NOT stripped, preserving the original behaviour.
    rj_df['state'] = rj_df['City'].apply(lambda x: x.split(',')[-1])
    # City key: lowercase text before the first comma, then made join-
    # friendly by turning spaces and hyphens into underscores.
    rj_df['city'] = rj_df['City'].apply(lambda x: x.lower().split(',')[0])
    rj_df['city'] = rj_df['city'].apply(
        lambda x: x.replace(' ', '_').replace('-', '_'))
    rj_df.drop('City', axis=1, inplace=True)
    rj_df.set_index(['city'], inplace=True)
    return rj_df


if __name__ == '__main__':
    # fix: the original referenced `df` and `rj_df` without ever assigning
    # them (a guaranteed NameError); build them from this module's cleaners.
    df = get_pop_data('data/1790-2010_MASTER.csv')  # defined above this chunk
    rj_df = get_rj_data('data/rj_metrics.txt')
    new_df = pd.concat([df, rj_df], axis=1)
    # Keep only the cities that have population data present.
    meetup_df = new_df[new_df['Pop'].notnull()]
    cities = list(meetup_df.index)

    # bureau of economic affairs clean and join
    url = 'http://www.bea.gov/newsreleases/regional/gdp_metro/2015/xls/gdp_metro0915.xls'
    raw_bea = gbd.get_bea_data(url)
    bea_df = gbd.clean_me(raw_bea)
    # fix: join BEA figures column-wise on the shared city index (axis=1),
    # matching the sibling script; axis=0 stacked the frames row-wise.
    next_df = pd.concat([new_df, bea_df], axis=1)
# NOTE(review): this chunk opens mid-statement — the two lines below are the
# tail of a column-cleanup function whose `def` line lies above the visible
# range; they close a df.drop(...) call and return the cleaned frame.
         axis=1, inplace=True)
    return df


def get_rj_data(file_path='data/rj_metrics.txt'):
    '''Load and clean the RJ Metrics meetup file.

    INPUT: File path to rj metrics text file (tab-separated, with a 'City'
           column formatted like 'New York, NY')
    OUTPUT: Cleaned dataFrame of file, indexed by a normalised city name
            (lowercase, spaces/hyphens replaced with underscores), with a
            'state' column holding the text after the last comma (its
            leading space is kept).
    '''
    rj_df = pd.read_table(file_path)
    # State: everything after the last comma — keeps the leading space.
    rj_df['state'] = (rj_df['City'].apply(lambda x: x.split(',')[-1]))
    # City key: lowercase text before the first comma, then spaces and
    # hyphens become underscores so the key is join-friendly.
    rj_df['city'] = rj_df['City'].apply(lambda x: x.lower().split(',')[0])
    rj_df['city'] = rj_df['city'].apply(
        lambda x: x.replace(' ', '_').replace('-', '_'))
    rj_df.drop('City', axis=1, inplace=True)
    rj_df.set_index(['city'], inplace=True)
    return rj_df


if __name__ == '__main__':
    # NOTE(review): `df` and `rj_df` are referenced but never assigned in
    # this guard — this raises NameError as written; presumably the cleaned
    # population and meetup frames were meant to be built here first.
    new_df = pd.concat([df, rj_df], axis=1)
    # Keep only cities whose population column is present.
    meetup_df = new_df[new_df['Pop'].notnull()]
    cities = list(meetup_df.index)

    # bureau of economic affairs clean and join
    url = 'http://www.bea.gov/newsreleases/regional/gdp_metro/2015/xls/gdp_metro0915.xls'
    raw_bea = gbd.get_bea_data(url)
    bea_df = gbd.clean_me(raw_bea)
    # NOTE(review): axis=0 stacks the two frames row-wise; the merge above
    # uses axis=1 (column join on the city index) — confirm intent.
    next_df = pd.concat([new_df, bea_df], axis=0)
import get_bea_data as gbd
import population_cleanup as pc
import recent_pop_cleanup as rpc
import glob
import walkscore as ws
import os

import pandas as pd  # fix: `pd` was used below but never imported
# TODO(review): `ns` (the numbeo scraper module) is used below but never
# imported anywhere in this chunk — confirm its module name and import it
# here, e.g. `import numbeo_scrape as ns`.

# Read in population (1790 - 2010) and RJ Metrics meetup info (2013 - 2014)
# and merge the two frames column-wise on their shared city index.
census_pop_df = pc.get_pop_data("data/1790-2010_MASTER.csv")
rj_df = pc.get_rj_data("data/rj_metrics.txt")
new_df = pd.concat([census_pop_df, rj_df], axis=1)
print("Census data merged to RJ metrics data!")  # fix: Py2 print statement

# Clean and join Bureau of Economic Affairs GDP-by-metro data.
raw_bea = gbd.get_bea_data("http://www.bea.gov/newsreleases/regional/gdp_metro/2015/xls/gdp_metro0915.xls")
bea_df = gbd.clean_me(raw_bea)
bea_df = bea_df[:-2]  # drop the spreadsheet's two trailing (footnote) rows
# Keep only metro areas whose 2014 BEA figure exceeds 20000.
next_df = pd.concat([new_df, bea_df[bea_df["bea_2014"] > 20000]], axis=1)
print("Bureau of Economic Affairs data merged!")

# Incorporate numbeo cost-of-living data, one ranking page per year.
url_prefix = "http://www.numbeo.com/cost-of-living/region_rankings.jsp?title="
# fix: the suffix read "®ion=021" — mojibake for "&region=021"
# (the "&reg" in the query string had been decoded into the (R) sign).
url_suffix = "&region=021"
year_list = ["2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016"]
urls = ns.build_urls(year_list)
# NOTE(review): original indentation was lost; the grouping below assumes
# all three statements belong to the loop body — confirm against history.
for url in urls:
    soup_can = ns.get_pages(url)
    table_list = [ns.clean_up(soup) for soup in soup_can]
    zipped = list(zip(year_list, table_list))