def generate_game_data(filename, teamno, folderpath=PLAY_BY_PLAY_DIRECTORY):
    '''Aggregate one team's stats.nba.com lineup data over a single game.

    Args:
        filename: name of the play-by-play CSV. ``filename[9:12]`` and
            ``filename[12:15]`` are used as the two team abbreviations.
        teamno: 1 selects ``filename[9:12]`` as this team; any other value
            selects ``filename[12:15]``.
        folderpath: prefix prepended to ``filename`` to locate the CSV.

    Returns:
        A pandas Series holding the sum over all lineups of the records
        returned by ``get_lineup_data`` (including the summed ``interval``),
        followed by "Actual PTS", "Actual PTS scaled", "Actual PTD scaled",
        "Actual PTD", and the opponent's season-mean entries from "OPP_FGM"
        onwards.  An empty Series is returned when any lineup has no data or
        when the game has no lineups at all.
    '''
    df = pd.read_csv(folderpath + filename)
    teamname = filename[9:12] if teamno == 1 else filename[12:15]
    team = BasketballGame(filename, df, teamname, teamno)

    # Season means of every team; keep only the opponent's row and prefix
    # its labels with OPP_ so they cannot clash with this team's columns.
    ref = pd.read_csv(STATS_NBA_PATH % "2008-09_team_mean.csv")
    opponent_team_mean = ref[ref['TEAM_ABBREVIATION'] == team.oppteamname].squeeze()
    opponent_team_mean.index = ['OPP_%s' % elt for elt in opponent_team_mean.index]

    gamestats = []
    for lineup_no in range(1, team.totalcombi + 1):  # every lineup of the game
        lineup = team.give_nth_combination(lineup_no)
        player_names = get_player_list(lineup, team.teamno)
        interval = calculate_interval(team, lineup_no)
        lineup_data = get_lineup_data(teamname, player_names, interval, type="none")
        if lineup_data.empty:
            # One lineup without data invalidates the whole game.
            return pd.Series()
        # NOTE: the record keeps the stats.nba.com labels (PLUS_MINUS, PTS,
        # FG_PCT, ...); no renaming to other conventions is applied here.
        lineup_data.name = lineup_no
        lineup_data["interval"] = interval
        gamestats.append(lineup_data.T)

    if not gamestats:
        # No lineups: the sum below would have no "interval" entry to scale by.
        return pd.Series()

    # Sum every lineup's record into one record for the game.
    game_agg = pd.DataFrame(gamestats).sum()

    # The "Actual" entries come from the game itself, not from the lineup
    # records; the scaled variants are weighted by the share of the game's
    # interval covered by the summed lineup intervals.
    covered = game_agg["interval"] / float(team.interval)
    game_agg["Actual PTS"] = team.P
    game_agg["Actual PTS scaled"] = team.P * covered
    game_agg["Actual PTD scaled"] = team.PTD * covered
    game_agg["Actual PTD"] = team.PTD

    # pd.concat replaces Series.append, which newer pandas no longer provides.
    return pd.concat([game_agg, opponent_team_mean["OPP_FGM":]])
def opplineup(filename):
    '''Collect per-lineup stats.nba.com data for both teams of one game.

    The play-by-play CSV ``filename`` is read from PLAY_BY_PLAY_DIRECTORY and
    split by its ``lineup_no`` column.  For each lineup and each of the two
    teams (abbreviations ``filename[9:12]`` and ``filename[12:15]``), the
    record returned by ``get_lineup_data`` is extended with the lineup's
    ``interval`` and its points per minute (``PPM``).

    Args:
        filename: name of the play-by-play CSV of the game.

    Returns:
        A DataFrame with one row per (lineup, team) record that was found.
        An empty ``pd.DataFrame`` is returned when the game is skipped because
        a team is not in ALLTEAMS, and an empty ``pd.Series`` when no lineup
        produced any data.
    '''
    df = pd.read_csv(PLAY_BY_PLAY_DIRECTORY + filename)
    first_team = filename[9:12]
    second_team = filename[12:15]

    # Skip as soon as either team is unknown, as the message states.
    if first_team not in ALLTEAMS or second_team not in ALLTEAMS:
        print('%s, one of the team not available, skip!' % filename)
        return pd.DataFrame()

    add_aid_rows(df, [first_team, second_team])
    # The first row's lineup_no is taken as the number of lineups in the game.
    total_lineup = df.head(1).lineup_no.item() + 1

    out = []
    for lineup_no in range(1, int(total_lineup)):
        lineup_df = df[df.lineup_no == lineup_no]

        # Difference in ``timeleft`` between the lineup's first and last rows.
        # The PPM formula below multiplies by 60, i.e. treats it as seconds.
        # Left as None for a lineup without rows; it is never read then,
        # because both per-team slices are empty as well.
        if lineup_df.empty:
            interval = None
        else:
            interval = (lineup_df.head(1).timeleft.item()
                        - lineup_df.tail(1).timeleft.item())

        rows_by_team = [
            (first_team, lineup_df[lineup_df.team == first_team]),
            (second_team, lineup_df[lineup_df.team == second_team]),
        ]
        # The loop names are deliberately distinct from first_team and
        # second_team: a for-loop target stays bound after the loop, so
        # reusing the outer name would make every later lineup filter on the
        # second team only.
        for team_no, (name, team_data) in enumerate(rows_by_team, 1):
            if team_data.empty:
                print('%s, %s, %s, team not in this lineup data'
                      % (filename, lineup_no, name))
                continue

            team_players = get_player_list(team_data, team_no)
            # NOTE(review): generate_game_data passes an interval as third
            # positional argument to get_lineup_data; confirm which call
            # form matches its signature.
            lineup_data = get_lineup_data(name, team_players, type="none")
            if lineup_data.empty:
                continue

            team_points = score_calculate(team_data)
            lineup_data['interval'] = interval
            # Points per minute; undefined (NaN) for a zero-length lineup
            # instead of dividing by zero.
            if interval:
                lineup_data['PPM'] = float(team_points) / interval * 60
            else:
                lineup_data['PPM'] = float('nan')
            out.append(lineup_data)

    if not out:
        return pd.Series()
    # Each record becomes one column; transpose so each one is a row.
    return pd.concat(out, axis=1).T