def ANALYSIS_all(division='men', my_data=None, metric=True):
    """Plot the Open, Regionals and Games data for one division together.

    Each competition's analysis dataframe is fetched with
    ``get_analysis_dataframe`` and passed through ``clear_outliers`` before
    all three are handed to ``box_plots_all``.

    Args:
        division: Division name forwarded to ``get_analysis_dataframe``; its
            title-cased form is passed to ``box_plots_all``.
        my_data: Mapping forwarded unchanged to ``box_plots_all``
            (presumably one athlete's own stats to overlay -- confirm against
            ``box_plots_all``). Defaults to a fresh empty dict per call.
        metric: Flag forwarded unchanged to ``box_plots_all`` (presumably
            selects metric units -- confirm against ``box_plots_all``).
    """
    # A literal ``{}`` default would be one dict shared by every call; build a
    # fresh one so nothing downstream can leak state between calls.
    if my_data is None:
        my_data = {}

    # Fetch and clean each competition in turn: open, regionals, games.
    open_data, regionals_data, games_data = (
        clear_outliers(
            get_analysis_dataframe(competition=stage, division=division))
        for stage in ('open', 'regionals', 'games')
    )

    box_plots_all(open_data, regionals_data, games_data, division.title(),
                  my_data, metric)
def ANALYSIS_all_imputed(division='men', my_data=None, metric=True):
    """Plot Open, Regionals and Games data using imputed Open athlete stats.

    The Open data comes from ``get_imputed_dataframe``. For Regionals and
    Games only ``userid``, ``overallrank`` and ``overallscore`` are kept from
    their own dataframes; every other column is taken from the imputed Open
    data via an inner join on ``userid``, so athletes missing from the Open
    data are dropped.

    Args:
        division: Division name forwarded to the dataframe loaders; its
            title-cased form, prefixed with ``"Imputed "``, is passed to
            ``box_plots_all``.
        my_data: Mapping forwarded unchanged to ``box_plots_all``
            (presumably one athlete's own stats to overlay -- confirm against
            ``box_plots_all``). Defaults to a fresh empty dict per call.
        metric: Flag forwarded unchanged to ``box_plots_all`` (presumably
            selects metric units -- confirm against ``box_plots_all``).
    """
    # A literal ``{}`` default would be one dict shared by every call; build a
    # fresh one so nothing downstream can leak state between calls.
    if my_data is None:
        my_data = {}

    result_cols = ['overallrank', 'overallscore']

    open_data = get_imputed_dataframe(division=division, competition='open')
    regionals_data = get_analysis_dataframe(
        division=division, competition='regionals')
    games_data = get_analysis_dataframe(division=division, competition='games')

    # Use imputed values from open data to fill in athlete stats for
    # regionals/games data. The Open rank/score columns are dropped first so
    # that each later stage keeps its own results after the merge.
    open_stats = open_data.drop(result_cols, axis=1)
    regionals_data = pd.merge(
        open_stats,
        regionals_data[['userid'] + result_cols],
        on='userid',
        how='inner')
    games_data = pd.merge(
        open_stats,
        games_data[['userid'] + result_cols],
        on='userid',
        how='inner')

    box_plots_all(open_data, regionals_data, games_data,
                  "Imputed " + division.title(), my_data, metric)
def ANALYSIS_games(division='men'):
    """Box-plot the Games data for ``division`` after clearing outliers."""
    raw = get_analysis_dataframe(competition='games', division=division)
    box_plots(clear_outliers(raw), 'Games')
def ANALYSIS_regionals(division='men'):
    """Box-plot the Regionals data for ``division`` after clearing outliers."""
    raw = get_analysis_dataframe(competition='regionals', division=division)
    box_plots(clear_outliers(raw), 'Regionals')
def ANALYSIS_open(division='men'):
    """Box-plot the Open data for ``division`` after clearing outliers."""
    raw = get_analysis_dataframe(competition='open', division=division)
    box_plots(clear_outliers(raw), 'Open')
def _get_imputed_dataframe(*args, **kwargs):
    """Load an analysis dataframe and impute its missing values.

    The frame returned by ``get_analysis_dataframe(*args, **kwargs)`` is cast
    to float and cleaned with ``clear_outliers``. Missing values are then
    filled in two stages:

    1. For each ``(predictors, targets)`` pair in ``Xys``, a
       ``RANSACRegressor`` is fitted on the rows where all of those columns
       are present, and used to predict the targets for rows where every
       target is NaN but every predictor is present.
    2. The result is passed through ``RecursiveKNN.complete`` (presumably
       fills whatever is still missing -- confirm against ``RecursiveKNN``).

    Args:
        *args: Forwarded to ``get_analysis_dataframe``.
        **kwargs: Forwarded to ``get_analysis_dataframe``.

    Returns:
        A ``pandas.DataFrame`` with the same index and columns as the cleaned
        input frame.
    """

    def impute_rows(data, X_cols, y_cols):
        """Predict ``y_cols`` for rows lacking them but having all ``X_cols``.

        ``data`` is a 2-D float array; ``X_cols`` / ``y_cols`` are lists of
        integer column positions. Returns ``(rows_idx, y_cols, y_pred)``.
        """
        X_missing = np.isnan(data[:, X_cols]).any(axis=1)
        y_isnan = np.isnan(data[:, y_cols])

        # Rows to fill: every target is NaN while every predictor is present.
        # argwhere on a 1-D mask yields a (k, 1) column, which broadcasts
        # against the column list to select a (k, len(cols)) block.
        rows_idx = np.argwhere(
            np.logical_and(y_isnan.all(axis=1), ~X_missing))
        y_pred = np.zeros((len(rows_idx), len(y_cols)))

        if len(rows_idx) > 0:
            print("\tImputing", len(rows_idx), "rows")
            # Training rows: predictors and targets all present.
            full_rows = np.argwhere(
                np.logical_and(~X_missing, ~y_isnan.any(axis=1)))
            reg = RANSACRegressor()
            reg.fit(data[full_rows, X_cols], data[full_rows, y_cols])
            # Negative predictions are clipped to zero.
            y_pred = reg.predict(data[rows_idx, X_cols]).clip(min=0)

        return (rows_idx, y_cols, y_pred)

    def impute_update_data(data, X_cols, y_cols):
        """Return a copy of ``data`` with ``y_cols`` imputed from ``X_cols``.

        ``X_cols`` / ``y_cols`` are lists of column labels of ``data``.
        """
        print(X_cols, "predicting", y_cols)
        cols = list(data)
        X_cols = [cols.index(x) for x in X_cols]
        y_cols = [cols.index(y) for y in y_cols]
        # ``DataFrame.as_matrix()`` has been removed from pandas; ``.values``
        # is available on old and new releases. Copy so the array is writable
        # and the input frame is left untouched.
        matrix = data.values.copy()
        rows_idx, y_cols, y_pred = impute_rows(matrix, X_cols, y_cols)
        matrix[rows_idx, y_cols] = y_pred
        return pd.DataFrame(matrix, index=data.index, columns=data.columns)

    data = get_analysis_dataframe(*args, **kwargs)
    data = data.astype(float)
    data = clear_outliers(data)

    # (predictor columns, target columns) pairs, applied in order. A target
    # that appears more than once is only filled by a later pair for rows an
    # earlier pair left empty. Commented-out pairs are disabled.
    Xys = [
        #(['Height'],['Weight']),
        #(['Weight'],['Height']),
        (['Snatch'], ['Clean and Jerk']),
        (['Clean and Jerk'], ['Snatch']),
        (['Snatch', 'Clean and Jerk'], ['Back Squat']),
        (['Snatch', 'Clean and Jerk', 'Back Squat'], ['Deadlift']),
        (['Back Squat'], ['Deadlift']),
        (['Deadlift'], ['Back Squat']),
        #(['Run 5k'],['Sprint 400m']),
        #(['Sprint 400m'],['Run 5k']),
        (['Weight', 'Snatch', 'Clean and Jerk', 'Back Squat', 'Deadlift'],
         ['Max Pull-ups']),
        (['Weight', 'Back Squat', 'Deadlift'], ['Max Pull-ups']),
        (['Weight', 'Snatch', 'Clean and Jerk'], ['Max Pull-ups']),
        #(['Filthy 50'],['Fight Gone Bad']),
        #(['Fight Gone Bad'],['Filthy 50']),
        (['Max Pull-ups', 'Clean and Jerk'], ['Fran']),
        (['Clean and Jerk', 'Fran'], ['Grace']),
        (['Max Pull-ups', 'Sprint 400m', 'Run 5k'], ['Helen']),
        #(['Max Pull-ups', 'Grace'],['Fran']),
    ]
    for x, y in Xys:
        data = impute_update_data(data, x, y)
        # NOTE(review): outliers are assumed to be cleared after every
        # imputation step (inside this loop) rather than once after it --
        # confirm against the intended behavior.
        data = clear_outliers(data)

    imputer = RecursiveKNN(
        verbose=1, n_jobs=4, feature_selector=DecisionTreeRegressor)
    data = pd.DataFrame(
        imputer.complete(data), index=data.index, columns=data.columns)
    return data
'Max Pull-ups': 25, 'Fran': 5 } pan = { 'Age': 22, 'Height': 158, 'Weight': 53, 'Back Squat': 57, 'Clean and Jerk': 35, 'Snatch': 28, 'Deadlift': 70, 'Max Pull-ups': 0 } fraser = get_analysis_dataframe(division='men', competition='games').iloc[0].dropna().drop( ['overallscore', 'userid', 'overallrank']).to_dict() tct = get_analysis_dataframe(division='women', competition='games').iloc[0].dropna().drop( ['overallscore', 'userid', 'overallrank']).to_dict() sara = get_analysis_dataframe(division='women', competition='open').iloc[0].dropna().drop( ['overallscore', 'userid', 'overallrank']).to_dict() import xgboost as xgb @memoize() def get_fitted_model(data):