Example #1
import os

import pandas as pd

# ns, clean_top_100, and process_df are project-local scraping helpers
def get_grad_data(url, cols, skip_rows=5):
    '''
    INPUT: Target url to scrape, column names, and rows to skip (default=5)
    OUTPUT: Dataframe of scraped info
    '''
    file_name = url.split('/')[-1].replace('-', '_')
    path = os.getcwd() + '/data/biggestuscities'
    file_path = '{}/{}.csv'.format(path, file_name)

    if not os.path.exists(path):
        os.makedirs(path)
    # scrape and cache only when there is no CSV on disk yet
    if not os.path.isfile(file_path):
        soup = ns.get_pages(url)
        table = soup[0].findAll('table')
        tabs = [tag.text for tag in table]
        clean_table = clean_top_100(tabs, skip_rows)
        return process_df(clean_table, file_path, cols)
    else:
        return pd.read_csv(file_path)
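
# A minimal usage sketch of the cached-scrape helper above; the URL path and
# column names are placeholders, not the project's real values:
top_100 = get_grad_data('http://www.biggestuscities.com/top-100',  # placeholder
                        cols=['rank', 'city', 'population'])
print(top_100.head())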
Example #3
import pandas as pd

# gbd and ns are project-local helper modules (BEA download/cleanup and
# numbeo scraping, respectively)

# clean and join Bureau of Economic Analysis (BEA) info

raw_bea = gbd.get_bea_data('http://www.bea.gov/newsreleases/regional/gdp_metro/2015/xls/gdp_metro0915.xls')
bea_df = gbd.clean_me(raw_bea)
bea_df = bea_df[:-2]  # drop the trailing two rows
# new_df is built earlier in the script; keep only rows with bea_2014 > 20000
next_df = pd.concat([new_df, bea_df[bea_df['bea_2014'] > 20000]], axis=1)
print('Bureau of Economic Analysis data merged!')
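
# How the mask-then-concat step behaves, shown on toy frames (illustrative
# data only; values and index labels are made up):
left = pd.DataFrame({'pop_2015': [100, 200]}, index=['a', 'b'])
right = pd.DataFrame({'bea_2014': [30000, 15000]}, index=['a', 'b'])
# the mask drops row 'b' from `right`, so after the axis=1 concat that row
# carries NaN in the bea_2014 column
merged_demo = pd.concat([left, right[right['bea_2014'] > 20000]], axis=1)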
# incorporate numbeo data:

url_prefix = 'http://www.numbeo.com/cost-of-living/region_rankings.jsp?title='
url_suffix = '&region=021'
year_list = ['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']

urls = ns.build_urls(year_list)
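
# ns.build_urls likely wraps the module-level prefix/suffix around each year;
# a hypothetical sketch of that behavior (not the project's actual code):
def _build_urls_sketch(years, prefix=url_prefix, suffix=url_suffix):
    # e.g. ...region_rankings.jsp?title=2009&region=021
    return ['{}{}{}'.format(prefix, year, suffix) for year in years]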
soup_can = []
for url in urls:
    # ns.get_pages returns a list of soups (see Example #1), so collect them
    # all instead of overwriting soup_can on each pass
    soup_can.extend(ns.get_pages(url))
table_list = [ns.clean_up(soup) for soup in soup_can]
zipped = list(zip(year_list, table_list))
df_dict = ns.build_data_frames(zipped)

# the header list is identical for every year, so build it once
columns = ns.fix_em(['Rank', 'City', 'Cost of Living Index', 'Rent Index',
                     'Cost of Living Plus Rent Index', 'Groceries Index',
                     'Restaurant Price Index', 'Local Purchasing Power Index'])
for item in year_list:
    # keep rank/city unchanged; suffix the metric columns with the year
    first_cols = columns[:2]
    first_cols.extend([column + '_{}'.format(item) for column in columns[2:]])
    df_dict[item].columns = first_cols

def clean_up_df(df):
    # split 'City, State' into snake_case city/state columns, in place;
    # state must be derived first, before 'city' is overwritten
    df['state'] = df['city'].apply(lambda x: x.split(',')[1].strip().lower().replace(' ', '_'))
    df['city'] = df['city'].apply(lambda x: x.split(',')[0].lower().replace(' ', '_'))
    del df['rank']
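
# A quick, self-contained check of clean_up_df on made-up data:
demo_df = pd.DataFrame({'rank': [1], 'city': ['New York, New York']})
clean_up_df(demo_df)
# demo_df now has city == 'new_york', state == 'new_york', and no 'rank' column
print(demo_df)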