# Build and fill `sal_perc_df`: one row per (team, year) cell of `df` that
# equals 1, holding the team, the year and up to 40 per-player salary
# percentages scraped for that team-season.

# say that maximum of 40 players are being paid (should be ~ 12);
# the two extra columns hold the team and the year
ncol = 42
n_rows = int((df.values == 1).sum())
sal_perc_df = pd.DataFrame(
    np.ones((n_rows, ncol), dtype=np.float64),
    columns=['team', 'year'] + ['p' + str(i) for i in range(1, ncol - 1)])
# Team names are strings (they are concatenated into the URL below), so the
# column is made object-typed up front instead of relying on pandas silently
# upcasting a float64 column on the first string assignment.
sal_perc_df['team'] = sal_perc_df['team'].astype(object)
rc = 0  # position of the next row of sal_perc_df to fill

# fill the salary df
scrape_bad = False
for i in range(0, df.shape[0]):
    for j in range(0, df.shape[1]):
        if df.iloc[i, j] == 1:
            # .at writes into sal_perc_df itself; chained indexing
            # (frame[col][row] = value) may assign to a temporary copy.
            # NOTE(review): 'year' stays float64 -- confirm df.columns holds
            # numeric years rather than strings.
            sal_perc_df.at[rc, 'team'] = df.index[i]
            sal_perc_df.at[rc, 'year'] = df.columns[j]
            # scrape salary data
            url = base + str(df.columns[j]) + '/' + df.index[i] + '.jsp'
            try:
                sal_percs = cs.extractSalaryPercents(url)
            except Exception:
                # Not a bare except: Ctrl-C / SystemExit must still
                # propagate instead of being reported as a scraping error.
                print('Scraping error on url <' + url + '>')
                print('Improve your extract function, you noob.')
                scrape_bad = True
                break
            # pad with zeros up to the ncol - 2 player columns
            sal_percs += [0] * ((ncol - 2) - len(sal_percs))
            sal_perc_df.iloc[rc, 2:] = sal_percs  # fill that row in
            rc += 1
            print('Finished payroll for ' + df.index[i] + ', ' + str(df.columns[j]))
        else:
            print('Skipping 404 page')
    # the inner `break` only leaves the column loop; stop the row loop too
    if scrape_bad:
        break
# Example no. 2
# 0
            2:ncol]
        sal_perc_df_full.iloc[inds, 2:ncol] = old_percs
        old_count = old_count + num_seasons

# now we need to fill in salary data for Charlotte Bobcats,
# New Orleans Hornets/New Orleans Pelicans, Seattle Supersonics/
# Oklahoma City Thunder
#
# The row ranges below are hard-coded positions in sal_perc_df_full.
# `.values` strips the source frame's labels so each copy is purely
# positional and cannot be affected by index/column alignment.

# NO Hornets (the last 'hornets' row of sal_perc_df is left out)
sal_perc_df_full.iloc[144:150, 2:] = (
    sal_perc_df[sal_perc_df['team'] == 'hornets'].iloc[0:-1, 2:].values)
# NO Pelicans
sal_perc_df_full.iloc[150:152, 2:] = (
    sal_perc_df[sal_perc_df['team'] == 'pelicans'].iloc[:, 2:].values)
# OKC
sal_perc_df_full.iloc[218:224, 2:] = (
    sal_perc_df[sal_perc_df['team'] == 'thunder'].iloc[:, 2:].values)

# now salaries that we haven't collected:
# Supersonics data can't be found on ShamSports, so we're making it nan
sal_perc_df_full.iloc[216:218, 2:] = float('nan')

# Charlotte Bobcats (only changing to Charlotte Hornets for
# 2014-2015 season): rows 168..175, one per season page 2007..2014
for offset, year in enumerate(range(2007, 2015)):
    sal_percs = cs.extractSalaryPercents('http://data.shamsports.com/content/pages/data/salaries/' + str(year) + '/bobcats.jsp')
    # pad with zeros up to the ncol - 2 player columns
    sal_percs += [0] * ((ncol - 2) - len(sal_percs))
    sal_perc_df_full.iloc[168 + offset, 2:] = sal_percs

# `with` guarantees the pickle file is flushed and closed
with open('fixed_payroll_shamsports.p', 'wb') as out_file:
    pickle.dump(sal_perc_df_full, out_file)