def get_grad_data(url, cols, skip_rows=5):
    '''
    INPUT: Target url to scrape, column names, and rows to skip (default=5)
    OUTPUT: Dataframe of scraped info

    Results are cached as a CSV under ./data/biggestuscities; a cached
    file short-circuits the scrape entirely.
    '''
    stem = url.split('/')[-1].replace('-', '_')
    cache_dir = os.getcwd() + '/data/biggestuscities'
    csv_path = '{}/{}.csv'.format(cache_dir, stem)
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    # Cached copy wins: skip the network round-trip and reload from disk.
    if os.path.isfile(csv_path):
        return pd.read_csv(csv_path)
    soup = ns.get_pages(url)
    # Text of every <table> on the first returned page.
    tabs = [tag.text for tag in soup[0].findAll('table')]
    return process_df(clean_top_100(tabs, skip_rows), csv_path, cols)
def get_grad_data(url, cols, skip_rows=5):
    '''
    INPUT: Target url to scrape, column names, and rows to skip (default=5)
    OUTPUT: Dataframe of scraped info

    Caches the scraped table as a CSV under ./data/biggestuscities and
    returns the cached copy on subsequent calls.

    NOTE(review): this is a duplicate of the other get_grad_data
    definition in this file — consider consolidating to one.
    '''
    file_name = url.split('/')[-1].replace('-', '_')
    path = os.getcwd() + '/data/biggestuscities'
    file_path = '{}/{}.csv'.format(path, file_name)
    if not os.path.exists(path):
        os.makedirs(path)
    if not os.path.isfile(file_path):
        soup = ns.get_pages(url)
        table = soup[0].findAll('table')
        # Comprehension replaces the manual append loop (same result,
        # and matches the sibling definition's idiom).
        tabs = [tag.text for tag in table]
        clean_table = clean_top_100(tabs, skip_rows)
        return process_df(clean_table, file_path, cols)
    else:
        return pd.read_csv(file_path)
# clean and join bureau of economic affairs info
raw_bea = gbd.get_bea_data('http://www.bea.gov/newsreleases/regional/gdp_metro/2015/xls/gdp_metro0915.xls')
bea_df = gbd.clean_me(raw_bea)
# Drop the two trailing rows of the cleaned BEA frame.
bea_df = bea_df[:-2]
# Keep only metros with 2014 GDP above 20000 and join onto new_df.
next_df = pd.concat([new_df, bea_df[bea_df['bea_2014'] > 20000]], axis=1)
# Parenthesized form is valid in both Python 2 and 3.
print('Bureau of Economic Affairs data merged!')

# incorporate numbeo data:
url_prefix = 'http://www.numbeo.com/cost-of-living/region_rankings.jsp?title='
# BUG FIX: the suffix had been HTML-entity-mangled ('&reg' decoded to the
# registered-trademark sign, leaving '®ion=021'); restore the literal
# '&region=021' query parameter.
url_suffix = '&region=021'
year_list = ['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']
urls = ns.build_urls(year_list)

# BUG FIX: the original rebound table_list on every pass, so only the last
# url's tables survived and zip(year_list, table_list) paired years with
# the wrong (or missing) data. Accumulate one cleaned table per scraped
# page instead. (Assumes ns.get_pages returns a list of soups per url, as
# the soup[0] indexing elsewhere in this file suggests — confirm.)
table_list = []
for url in urls:
    soup_can = ns.get_pages(url)
    table_list.extend(ns.clean_up(soup) for soup in soup_can)
zipped = list(zip(year_list, table_list))
df_dict = ns.build_data_frames(zipped)

# Suffix every per-year metric column (all but Rank and City) with its
# year so the yearly frames can later be joined without name collisions.
for item in year_list:
    columns = ns.fix_em(['Rank', 'City', 'Cost of Living Index',
                         'Rent Index', 'Cost of Living Plus Rent Index',
                         'Groceries Index', 'Restaurant Price Index',
                         'Local Purchasing Power Index'])
    first_cols = columns[:2]
    first_cols.extend([column + '_{}'.format(item) for column in columns[2:]])
    df_dict[item].columns = first_cols


def clean_up_df(df):
    '''
    INPUT: Dataframe with a 'city' column holding "City, State" strings
           and a 'rank' column
    OUTPUT: None (mutates df in place)

    Splits 'city' into lowercase snake_case 'city' and 'state' columns
    and drops the 'rank' column.
    '''
    df['state'] = df['city'].apply(lambda x: x.split(',')[1].strip().lower().replace(' ', '_'))
    df['city'] = df['city'].apply(lambda x: x.split(',')[0].lower().replace(' ', '_'))
    del df['rank']
# clean and join bureau of economic affairs info raw_bea = gbd.get_bea_data("http://www.bea.gov/newsreleases/regional/gdp_metro/2015/xls/gdp_metro0915.xls") bea_df = gbd.clean_me(raw_bea) bea_df = bea_df[:-2] next_df = pd.concat([new_df, bea_df[bea_df["bea_2014"] > 20000]], axis=1) print "Bureau of Economic Affairs data merged!" # incorporate numbeo data: url_prefix = "http://www.numbeo.com/cost-of-living/region_rankings.jsp?title=" url_suffix = "®ion=021" year_list = ["2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016"] urls = ns.build_urls(year_list) for url in urls: soup_can = ns.get_pages(url) table_list = [ns.clean_up(soup) for soup in soup_can] zipped = list(zip(year_list, table_list)) df_dict = ns.build_data_frames(zipped) for item in year_list: columns = ns.fix_em( [ "Rank", "City", "Cost of Living Index", "Rent Index", "Cost of Living Plus Rent Index", "Groceries Index", "Restaurant Price Index", "Local Purchasing Power Index",