def pull_stats():
    """Return per-bout fighter statistics with matching opponent ("d_") columns.

    If ``test_data/stats.csv`` exists under ``cur_path`` it is loaded and
    returned as-is.  Otherwise the statistics are queried from the database,
    every column named in the module-level ``cols`` is divided by the bout
    ``length``, each row is joined to the same bout's row for the opponent
    (whose ``cols`` columns are prefixed with ``d_``), and the result is
    written to the cache file before being returned.

    Returns:
        pandas.DataFrame: the cached or freshly built statistics frame.
    """
    cache_path = os.path.join(cur_path, 'test_data', 'stats.csv')
    if os.path.exists(cache_path):
        stats = pd.read_csv(cache_path)
        # The cache is written with its index, which reads back as 'Unnamed: 0'.
        stats.drop('Unnamed: 0', axis=1, inplace=True)
        return stats

    stats = pg_query(PSQL.client, 'SELECT bs.bout_id, date, fighter_id, kd, ssa, sss, tsa, tss, sub, pas, rev, headssa, headsss, bodyssa, bodysss, legssa, legsss, disssa, dissss, clinssa, clinsss, gndssa, gndsss, tda, tds FROM ufc.bout_stats bs join ufc.bouts b on b.bout_id = bs.bout_id join ufc.fights f on f.fight_id = b.fight_id where champ is false;')
    stats.columns = ['bout_id', 'fight_date', 'fighter_id', 'kd', 'ssa', 'sss', 'tsa', 'tss', 'sub', 'pas', 'rev', 'headssa', 'headsss', 'bodyssa', 'bodysss', 'legssa', 'legsss', 'disssa', 'dissss', 'clinssa', 'clinsss', 'gndssa', 'gndsss', 'tda', 'tds']

    results = pg_query(PSQL.client, "select b.bout_id, weight_id, winner, loser from ufc.bouts b join ufc.bout_results br on br.bout_id = b.bout_id join ufc.fights f on f.fight_id = b.fight_id")
    results.columns = ['bout_id', 'weight_id', 'winner', 'loser']
    result_rows = results[['bout_id', 'winner', 'loser']].values

    # Key is bout_id concatenated with a fighter id; value is the other fighter.
    opponents = {bout + won: lost for bout, won, lost in result_rows}
    opponents.update({bout + lost: won for bout, won, lost in result_rows})

    # Fighters without a recorded result map to NaN and are dropped below
    # (dropna also removes rows with a missing value in any other column).
    stats['opponent_id'] = (stats['bout_id'] + stats['fighter_id']).map(opponents)
    stats.dropna(inplace=True)

    bout_len = pg_query(PSQL.client, "SELECT bout_id, length from ufc.bout_results")
    bout_len.columns = ['bout_id', 'length']
    stats = pd.merge(stats, bout_len, on='bout_id')
    for col in cols:
        stats[col] = stats[col] / stats['length']
    stats.drop('length', axis=1, inplace=True)

    # Re-key every row by the opponent so that, after the merge, each fighter
    # row also carries what the opponent recorded in the same bout.
    def_stats = stats.drop('fighter_id', axis=1)
    def_stats = def_stats.rename(columns={'opponent_id': 'fighter_id'})
    def_stats = def_stats.rename(columns={col: 'd_' + col for col in cols})
    stats = pd.merge(stats, def_stats, on=['fighter_id', 'bout_id', 'fight_date'])

    stats.to_csv(cache_path)
    return stats
def pull_adj_avg_data():
    """Load ``ufc.adj_avg_stats`` and attach opponent ids and fight dates.

    The adjusted-average table is read with ``Select *`` and its columns are
    renamed positionally: ``fighter_id``, ``bout_id``, the ``adj_avg_d_*``
    columns, then the ``adj_avg_o_*`` columns.  It is then inner-merged with
    ``ufc.bout_fighter_xref`` (adds ``opponent_id``) and with the bout dates
    (adds ``fight_date``).

    Returns:
        pandas.DataFrame: the merged frame.
    """
    stat_names = [
        'bodyssa', 'bodysss', 'clinssa', 'clinsss', 'disssa', 'dissss',
        'gndssa', 'gndsss', 'headssa', 'headsss', 'kd', 'legssa', 'legsss',
        'pas', 'rev', 'ssa', 'sss', 'sub', 'tda', 'tds', 'tsa', 'tss',
    ]
    column_names = ['fighter_id', 'bout_id']
    column_names += ['adj_avg_d_' + stat for stat in stat_names]
    column_names += ['adj_avg_o_' + stat for stat in stat_names]

    adj_stats = pg_query(PSQL.client, 'Select * from ufc.adj_avg_stats')
    adj_stats.columns = column_names

    xref = pg_query(PSQL.client, 'Select * from ufc.bout_fighter_xref')
    xref.columns = ['bout_id', 'fighter_id', 'opponent_id']
    adj_stats = pd.merge(adj_stats, xref, on=['bout_id', 'fighter_id'])

    fight_dates = pg_query(
        PSQL.client,
        'Select bout_id, date from ufc.fights f join ufc.bouts b on b.fight_id = f.fight_id'
    )
    fight_dates.columns = ['bout_id', 'fight_date']
    return pd.merge(adj_stats, fight_dates, on='bout_id')
def store_meta_res(domain):
    """Score every final model for ``domain`` and write the meta-data table.

    For each model folder under ``model_tuning/modelling/<domain>/final/models``
    the stored model, feature list and scaler are loaded, ``cross_validate``
    produces its predictions, and the absolute error against the target is
    computed per row.  The per-model errors are joined with the validation
    features and with one-hot weight-class columns, and the result is written
    to ``data/meta/meta_<domain>.csv`` under ``cur_path``.

    Args:
        domain: target name; it selects the model folder and is the target
            column in the prediction frame.  The original only handled
            ``'winner'`` and ``'length'``, with identical scoring for both.
    """
    X, Y = pull_val_data(domain)
    pred_df = pd.DataFrame(Y)

    final_folder = os.path.join(cur_path, 'model_tuning', 'modelling', domain, 'final')
    final_model_folder = os.path.join(final_folder, 'models')
    feats_folder = os.path.join(final_folder, 'features')

    for mod_name in os.listdir(final_model_folder):
        if mod_name == '.DS_Store':
            continue
        model_dir = os.path.join(final_model_folder, mod_name)
        # NOTE(review): uses whichever entry os.listdir returns first; this
        # assumes each model/scaler folder holds exactly one file - confirm.
        model = load(os.path.join(model_dir, os.listdir(model_dir)[0]))

        with open(os.path.join(feats_folder, '%s.json' % (mod_name)), 'r') as fp:
            feats = json.load(fp)
        # NOTE(review): JSON keys are strings, so max() compares them
        # lexicographically ('9' > '10') - confirm the key format.
        feats = feats[max(feats.keys())]

        scale_folder = os.path.join(final_folder, 'scalers', mod_name)
        scale = load(os.path.join(scale_folder, os.listdir(scale_folder)[0]))

        mod_preds = cross_validate(X[feats], Y, model, scale)
        mod_preds.rename(columns={0: mod_name}, inplace=True)
        pred_df = pred_df.join(mod_preds)

    pred_cols = [col for col in list(pred_df) if col != domain]

    # Absolute error of every model's prediction against the target, per row.
    mod_scores = pred_df[pred_cols].sub(pred_df[domain], axis=0).abs()
    meta_data = mod_scores.join(X)

    PSQL = db_connection('psql')
    bouts = pg_query(
        PSQL.client,
        "select b.bout_id, weight_desc from ufc.bouts b join ufc.bout_results br on br.bout_id = b.bout_id join ufc.fights f on f.fight_id = b.fight_id join ufc.weights w on b.weight_id = w.weight_id"
    )
    bouts.columns = ['bout_id', 'weight_id']

    # One-hot encode the weight description and index it by bout id.
    weights = pd.get_dummies(bouts['weight_id'])
    weights['index'] = bouts['bout_id']
    weights.drop_duplicates(inplace=True)
    weights.set_index('index', inplace=True)

    meta_data = meta_data.join(weights)
    meta_data.to_csv(
        os.path.join(cur_path, 'data', 'meta', 'meta_%s.csv' % (domain)))
def _fighter_stat_averages(stats, fighter):
    """Return the mean of every ``cols`` statistic for one fighter.

    ``avg_o_<col>`` is averaged over the rows where ``fighter_id`` is the
    fighter; ``avg_d_<col>`` is averaged over the ``d_<col>`` values of the
    rows where ``opponent_id`` is the fighter.  All rows present in ``stats``
    are used; there is no date filter.

    Raises:
        ValueError: if the two row sets differ in length, or in bout id or
            fight date once sorted by fight date.
    """
    ostat = stats.loc[stats['fighter_id'] == fighter].sort_values('fight_date')
    dstat = stats.loc[stats['opponent_id'] == fighter].sort_values('fight_date')

    if len(dstat) != len(ostat):
        raise ValueError(
            'fighter %s has %d own rows but %d opponent rows'
            % (fighter, len(ostat), len(dstat)))
    key_cols = ['bout_id', 'fight_date']
    if (dstat[key_cols].values != ostat[key_cols].values).any():
        raise ValueError(
            'own and opponent rows of fighter %s do not line up' % (fighter,))

    averages = {}
    for col in cols:
        averages['avg_o_' + col] = ostat[col].mean()
    for col in cols:
        averages['avg_d_' + col] = dstat['d_' + col].mean()
    return averages


def pull_avg_data(stats=None):
    """Build one row of averaged statistics per fighter and save it as CSV.

    Args:
        stats: statistics frame as produced by ``pull_stats``.  When omitted,
            ``test_data/stats.csv`` is read instead.  (The original always
            read the file and ignored this argument.)

    Returns:
        pandas.DataFrame: one row per fighter holding ``fight_date``,
        ``opponent_id``, the ``avg_o_*`` / ``avg_d_*`` means, ``fighter_id``
        and ``bout_id``.  The same frame is written to
        ``test_data/avg_data_TEST.csv``.

    Raises:
        ValueError: if a fighter's own and opponent rows are inconsistent.
    """
    if stats is None:
        stats = pd.read_csv(os.path.join(cur_path, 'test_data', 'stats.csv'))
        stats.drop('Unnamed: 0', axis=1, inplace=True)

    fut_bouts = pg_query(
        PSQL.client,
        "SELECT date, bx.bout_id, bx.fighter_id, opponent_id FROM ufc.bout_fighter_xref bx join ufc.bouts b on b.bout_id = bx.bout_id join ufc.fights f on f.fight_id = b.fight_id"
    )
    fut_bouts.columns = ['date', 'bout_id', 'fighter_id', 'opponent_id']

    # The averages do not depend on the bout, so compute them once per fighter.
    averages_by_fighter = {}
    latest_row = {}
    for date, bout, fighter, opponent in fut_bouts.values:
        if fighter not in averages_by_fighter:
            averages_by_fighter[fighter] = _fighter_stat_averages(stats, fighter)
        row = {'fight_date': date, 'opponent_id': opponent}
        row.update(averages_by_fighter[fighter])
        row['fighter_id'] = fighter
        row['bout_id'] = bout
        # NOTE(review): as in the original, a later bout row of the same
        # fighter replaces the earlier one, so only the last bout returned by
        # the query survives per fighter - confirm this is intended.
        latest_row[fighter] = row

    avg_data = pd.DataFrame.from_dict(dict(enumerate(latest_row.values()))).T
    avg_data.to_csv(os.path.join(cur_path, 'test_data', 'avg_data_TEST.csv'))
    return avg_data
# Script fragment: score each model's prediction per bout, build the
# meta-data table and write it to disk.  `pred_df`, `pred_cols`, `X`,
# `logloss` and `cur_path` are defined outside this fragment.
mod_scores = {}
for idx in pred_df.index:
    mod_scores[idx] = {}
    row = pred_df.loc[idx]
    for mod in pred_cols:
        # Per-row loss of this model's prediction against the 'winner' column.
        row_score = logloss(row['winner'], row[mod])
        mod_scores[idx][mod] = row_score
# Rows are bouts, columns are models.
mod_scores = pd.DataFrame.from_dict(mod_scores).T
meta_data = mod_scores.join(X)
# The outer list has a single element (the list of all non-score columns),
# so this loop runs once and standardises those columns in one call.
for col in [[i for i in list(meta_data) if i not in pred_cols]]:
    meta_data[col] = StandardScaler().fit_transform(meta_data[col])
PSQL = db_connection('psql')
bouts = pg_query(
    PSQL.client,
    "select b.bout_id, weight_desc from ufc.bouts b join ufc.bout_results br on br.bout_id = b.bout_id join ufc.fights f on f.fight_id = b.fight_id join ufc.weights w on b.weight_id = w.weight_id"
)
bouts.columns = ['bout_id', 'weight_id']
# One-hot encode the weight description and index it by bout id.
weights = pd.get_dummies(bouts['weight_id'])
weights['index'] = bouts['bout_id']
weights.drop_duplicates(inplace=True)
weights.set_index('index', inplace=True)
meta_data = meta_data.join(weights)
meta_data.to_csv(os.path.join(cur_path, 'test_data', 'pred_res_winner.csv'))
# NOTE(review): bare undefined name - raises NameError here, so nothing below
# runs.  Presumably a deliberate stop marker left in while developing; confirm
# and remove.
adsfasfda
model_feats = {}
for mod in pred_cols:
    # NOTE(review): 0.693 is approximately ln(2); presumably the log loss of a
    # 50/50 guess, making meta_y the improvement over that baseline - confirm.
    meta_y = (meta_data[mod] - 0.693) * -1
def _finite_or_zero(x):
    """Return ``x`` unless it is NaN or +/-infinity, in which case return 0."""
    # ``x == x`` is False only for NaN.
    return x if x == x and x not in [np.inf, -np.inf] else 0


def _add_ratio_features(frame, prefix):
    """Add the derived ratio columns for one family of averages, in place.

    Source columns are read as ``<prefix>_o_<stat>`` and ``<prefix>_d_<stat>``
    (``prefix`` is ``'avg'`` or ``'adj_avg'``).  Two groups are added:

    * ``<prefix>_{o,d}_acc_*``: the second stat of each pair divided by the
      first (e.g. ``sss / ssa``);
    * ``<prefix>_{o,d}_share_*_a`` and ``..._s``: the first stat divided by
      ``ssa`` and the second stat divided by ``sss``.

    NaN and infinite ratios are replaced by 0.
    """
    acc_stat_dict = {'acc_ss': ['ssa', 'sss'], 'acc_headss': ['headssa', 'headsss'], 'acc_bodyss': ['bodyssa', 'bodysss'], 'acc_legss': ['legssa', 'legsss'], 'acc_disss': ['disssa', 'dissss'], 'acc_clinss': ['clinssa', 'clinsss'], 'acc_gndss': ['gndssa', 'gndsss'], 'acc_td': ['tda', 'tds']}
    share_ss_dict = {'share_headss': ['headssa', 'headsss'], 'share_bodyss': ['bodyssa', 'bodysss'], 'share_legss': ['legssa', 'legsss'], 'share_disss': ['disssa', 'dissss'], 'share_clinss': ['clinssa', 'clinsss'], 'share_gndss': ['gndssa', 'gndsss']}
    sides = ('_o_', '_d_')

    for name, (denom_stat, numer_stat) in acc_stat_dict.items():
        for side in sides:
            base = prefix + side
            frame[base + name] = (frame[base + numer_stat] / frame[base + denom_stat]).apply(_finite_or_zero)

    for name, (first_stat, second_stat) in share_ss_dict.items():
        for suffix, stat, total in (('_a', first_stat, 'ssa'), ('_s', second_stat, 'sss')):
            for side in sides:
                base = prefix + side
                frame[base + name + suffix] = (frame[base + stat] / frame[base + total]).apply(_finite_or_zero)


def pull_pred_data(fight_id='351264d11286d09a'):
    """Build the prediction feature rows for every bout of one fight card.

    Averaged and adjusted-averaged statistics are restricted to the bouts of
    ``fight_id`` and extended with ratio features, fighter age, reach and
    height, and per-fighter history features (``len_avg``, ``win_avg``,
    ``win_streak``).  For every bout with exactly two fighter rows the two
    rows are combined into ``*_diff``, ``*_avg`` and ``*_xdif`` features.

    Args:
        fight_id: value matched against ``b.fight_id`` in the bout query.
            Defaults to the id that was previously hard-coded.  It is
            interpolated into the SQL text, so only pass trusted values.

    Returns:
        tuple: ``(pred_data, META_X)``.  ``pred_data`` is indexed by
        ``bout_id``; ``META_X`` is ``pred_data`` joined with one-hot
        weight-class columns.

    Raises:
        ValueError: if a fighter in the historical stats is neither the
            winner nor the loser of a bout that has a recorded result.
    """
    avg_data = pull_avg_data()
    adj_avg_data = pull_adj_avg_data()

    nxt_bouts = pg_query(PSQL.client, "select bx.bout_id, fighter_id, opponent_id, date from ufc.bout_fighter_xref bx join ufc.bouts b on b.bout_id = bx.bout_id join ufc.fights f on f.fight_id = b.fight_id where b.fight_id = '%s';" % (fight_id))
    nxt_bouts.columns = ['bout_id', 'fighter_id', 'opponent_id', 'fight_date']
    bout_keys = ['bout_id', 'fighter_id', 'opponent_id', 'fight_date']
    avg_data = pd.merge(avg_data, nxt_bouts, on=bout_keys, how='inner')
    adj_avg_data = pd.merge(adj_avg_data, nxt_bouts, on=bout_keys, how='inner')

    _add_ratio_features(avg_data, 'avg')
    _add_ratio_features(adj_avg_data, 'adj_avg')

    data = pd.merge(avg_data, adj_avg_data, on=['bout_id', 'fighter_id', 'fight_date', 'opponent_id'])
    data.dropna(inplace=True)

    fighters = pg_query(PSQL.client, 'select fighter_id, height, reach, stance, dob from ufc.fighters')
    fighters.columns = ['fighter_id', 'height', 'reach', 'stance', 'dob']
    fighter_dob = {i: j for i, j in fighters[['fighter_id', 'dob']].values}
    fighter_reach = {i: j for i, j in fighters[['fighter_id', 'reach']].values}
    fighter_height = {i: j for i, j in fighters[['fighter_id', 'height']].values}

    # Age in years on fight day; the date is reduced to its YYYY-MM-DD part.
    fighter_ages = {}
    for row_no, (bout, fighter, date) in enumerate(data[['bout_id', 'fighter_id', 'fight_date']].values):
        fight_day = datetime.strptime(str(date).split(' ')[0], '%Y-%m-%d')
        fighter_ages[row_no] = {'bout_id': bout, 'fighter_id': fighter, 'age': (fight_day - fighter_dob[fighter]).days/365}
    data = pd.merge(data, pd.DataFrame.from_dict(fighter_ages).T, on=['bout_id', 'fighter_id'])
    data['reach'] = data['fighter_id'].apply(lambda x: fighter_reach[x])
    data['height'] = data['fighter_id'].apply(lambda x: fighter_height[x])

    # Historical stats, labelled with win/loss and bout length, restricted to
    # the fighters that appear on this card.
    stats = pull_stats()
    bouts = pg_query(PSQL.client, "select b.bout_id, weight_id, winner, loser from ufc.bouts b join ufc.bout_results br on br.bout_id = b.bout_id join ufc.fights f on f.fight_id = b.fight_id")
    bouts.columns = ['bout_id', 'weight_id', 'winner', 'loser']
    bouts = {i: {'winner': j, 'loser': k} for i, j, k in bouts[['bout_id', 'winner', 'loser']].values}
    winner = {}
    for b, f in stats[['bout_id', 'fighter_id']].values:
        if b in bouts:
            if bouts[b]['winner'] == f:
                winner[len(winner)] = {'bout_id': b, 'fighter_id': f, 'won': 1}
            elif bouts[b]['loser'] == f:
                winner[len(winner)] = {'bout_id': b, 'fighter_id': f, 'won': 0}
            else:
                raise ValueError('fighter %s is neither winner nor loser of bout %s' % (f, b))
    winner = pd.DataFrame.from_dict(winner).T
    stats = pd.merge(stats, winner, on=['bout_id', 'fighter_id'])

    bout_len = pg_query(PSQL.client, "SELECT bout_id, length from ufc.bout_results")
    bout_len.columns = ['bout_id', 'length']
    stats = pd.merge(stats, bout_len, on='bout_id')
    stats = pd.merge(stats, data['fighter_id'], on='fighter_id', how='inner')

    # One history row per fighter, keyed by that fighter's first bout in `data`.
    streak_data = {}
    for fighter in stats.fighter_id.unique():
        history = stats.loc[stats['fighter_id'] == fighter][['bout_id', 'fight_date', 'length', 'won']].sort_values('fight_date')
        next_bout = data.loc[data['fighter_id'] == fighter]['bout_id'].values[0]
        last_res = history.iloc[-1]['won']
        streak_count = 0
        for res in reversed(history['won'].values):
            if res == last_res:
                streak_count += 1
            else:
                break
        if last_res == 0:
            # A run of losses is reported as a negative streak.
            streak_count *= -1
        streak_data[fighter] = {next_bout: {'len_avg': history['length'].mean(), 'win_avg': history['won'].mean(), 'win_streak': streak_count}}

    streak_avg_data = {}
    for fighter, by_bout in streak_data.items():
        for bout, row in by_bout.items():
            row['fighter_id'] = fighter
            row['bout_id'] = bout
            streak_avg_data[len(streak_avg_data)] = row
    streak_avg_data = pd.DataFrame.from_dict(streak_avg_data).T
    data = pd.merge(data, streak_avg_data, on=['bout_id', 'fighter_id'])

    pred_data = {}
    hold_cols = ['bout_id', 'fighter_id', 'fight_date', 'opponent_id']
    for bout in data['bout_id'].unique():
        # sample(frac=1) shuffles the two rows, so which fighter comes first
        # is random.
        bout_data = data.loc[data['bout_id'] == bout].sample(frac=1)
        if len(bout_data) != 2:
            continue
        bout_data = bout_data.reset_index(drop=True)
        bout_meta = bout_data[hold_cols]
        bout_data = bout_data[[c for c in list(bout_data) if c not in hold_cols]]
        first, second = bout_data.T[0], bout_data.T[1]

        bout_preds = {}
        for k, v in (first - second).to_dict().items():
            bout_preds[k+'_diff'] = v
        for k, v in ((first + second)/2).to_dict().items():
            bout_preds[k+'_avg'] = v
        # Cross difference: first fighter's "o" value against the second
        # fighter's matching "d" value, and vice versa.
        for col in list(bout_data):
            if '_o_' in col:
                bout_preds[col+'_xdif'] = bout_data.loc[0][col] - bout_data.loc[1][col.replace('_o_', '_d_')]
            elif '_d_' in col:
                bout_preds[col+'_xdif'] = bout_data.loc[0][col] - bout_data.loc[1][col.replace('_d_', '_o_')]
        for k, v in bout_meta.T[0].to_dict().items():
            bout_preds[k] = v
        bout_preds.pop('bout_id')
        pred_data[bout] = bout_preds

    pred_data = pd.DataFrame.from_dict(pred_data).T
    pred_data.reset_index(inplace=True)
    pred_data.rename(columns={'index': 'bout_id'}, inplace=True)
    pred_data.set_index('bout_id', inplace=True)

    bouts = pg_query(PSQL.client, "select b.bout_id, weight_desc from ufc.bouts b join ufc.fights f on f.fight_id = b.fight_id join ufc.weights w on b.weight_id = w.weight_id")
    bouts.columns = ['bout_id', 'weight_id']
    weights = pd.get_dummies(bouts['weight_id'])
    weights['index'] = bouts['bout_id']
    weights.drop_duplicates(inplace=True)
    weights.set_index('index', inplace=True)
    META_X = pred_data.join(weights)
    return (pred_data, META_X)
# Script fragment: produce win/length predictions for the upcoming bouts and
# prompt for bookmaker odds.  `comb_preds` and `odds_converter` are defined
# outside this fragment.
PRED_X, META_X= pull_pred_data()
meta = PRED_X[['fighter_id', 'opponent_id']]
# comb_preds returns a pair for each domain: predictions and predicted errors.
win_predictions, win_predicted_errors = comb_preds('winner', PRED_X, META_X)
len_predictions, len_predicted_errors = comb_preds('length', PRED_X, META_X)
# The names become the column names when joined below.
win_predictions.name = 'win'
win_predicted_errors.name = 'win_error'
len_predictions.name = 'length'
len_predicted_errors.name = 'length_error'
predictions = meta.join(win_predictions).join(win_predicted_errors).join(len_predictions).join(len_predicted_errors)
# Replace fighter ids with names for display.
fighters = pg_query(PSQL.client, "select fighter_id, name from ufc.fighters")
fighters = {k:v for k,v in fighters.values}
predictions['fighter_id'] = predictions['fighter_id'].apply(lambda x: fighters[x])
predictions['opponent_id'] = predictions['opponent_id'].apply(lambda x: fighters[x])
# Reverse lookup (name -> fighter id) built from the same query.
f_to_code = pg_query(PSQL.client, "select fighter_id, name from ufc.fighters")
f_to_code = {v:k for k,v in f_to_code.values}
odds = {}
for bout in predictions.index:
    print('Fight: %s VS %s' % (predictions.loc[bout]['fighter_id'], predictions.loc[bout]['opponent_id']))
    win_odds = input('Vegas odds for %s: ' % (predictions.loc[bout]['fighter_id']))
    win_prob = odds_converter(win_odds)
    # NOTE(review): unlike win_prob, this stores the raw input string and is
    # not passed through odds_converter in the visible code - confirm.
    lose_prob = input('Vegas odds for %s: ' % (predictions.loc[bout]['opponent_id']))
def _finite_or_zero(x):
    """Return ``x`` unless it is NaN or +/-infinity, in which case return 0."""
    # ``x == x`` is False only for NaN.
    return x if x == x and x not in [np.inf, -np.inf] else 0


def _add_ratio_features(frame, prefix):
    """Add the derived ratio columns for one family of averages, in place.

    Source columns are read as ``<prefix>_o_<stat>`` and ``<prefix>_d_<stat>``
    (``prefix`` is ``'avg'`` or ``'adj_avg'``).  Two groups are added:

    * ``<prefix>_{o,d}_acc_*``: the second stat of each pair divided by the
      first (e.g. ``sss / ssa``);
    * ``<prefix>_{o,d}_share_*_a`` and ``..._s``: the first stat divided by
      ``ssa`` and the second stat divided by ``sss``.

    NaN and infinite ratios are replaced by 0.
    """
    acc_stat_dict = {'acc_ss': ['ssa', 'sss'], 'acc_headss': ['headssa', 'headsss'], 'acc_bodyss': ['bodyssa', 'bodysss'], 'acc_legss': ['legssa', 'legsss'], 'acc_disss': ['disssa', 'dissss'], 'acc_clinss': ['clinssa', 'clinsss'], 'acc_gndss': ['gndssa', 'gndsss'], 'acc_td': ['tda', 'tds']}
    share_ss_dict = {'share_headss': ['headssa', 'headsss'], 'share_bodyss': ['bodyssa', 'bodysss'], 'share_legss': ['legssa', 'legsss'], 'share_disss': ['disssa', 'dissss'], 'share_clinss': ['clinssa', 'clinsss'], 'share_gndss': ['gndssa', 'gndsss']}
    sides = ('_o_', '_d_')

    for name, (denom_stat, numer_stat) in acc_stat_dict.items():
        for side in sides:
            base = prefix + side
            frame[base + name] = (frame[base + numer_stat] / frame[base + denom_stat]).apply(_finite_or_zero)

    for name, (first_stat, second_stat) in share_ss_dict.items():
        for suffix, stat, total in (('_a', first_stat, 'ssa'), ('_s', second_stat, 'sss')):
            for side in sides:
                base = prefix + side
                frame[base + name + suffix] = (frame[base + stat] / frame[base + total]).apply(_finite_or_zero)


def _label_winners(frame, results):
    """Return a frame of ``bout_id``, ``fighter_id``, ``won`` (1 or 0).

    Rows of ``frame`` whose bout has no entry in ``results`` are left out.

    Raises:
        ValueError: if a fighter is neither the winner nor the loser of a
            bout that has a recorded result.
    """
    labelled = {}
    for bout, fighter in frame[['bout_id', 'fighter_id']].values:
        if bout in results:
            if results[bout]['winner'] == fighter:
                labelled[len(labelled)] = {'bout_id': bout, 'fighter_id': fighter, 'won': 1}
            elif results[bout]['loser'] == fighter:
                labelled[len(labelled)] = {'bout_id': bout, 'fighter_id': fighter, 'won': 0}
            else:
                raise ValueError('fighter %s is neither winner nor loser of bout %s' % (fighter, bout))
    return pd.DataFrame.from_dict(labelled).T


def _signed_streak(results):
    """Length of the trailing run of equal results; negative if it ends in 0."""
    last_res = results[-1]
    run = 0
    for res in reversed(results):
        if res == last_res:
            run += 1
        else:
            break
    if last_res == 0:
        run *= -1
    return run


def _paired_bout_rows(data, hold_cols, swap):
    """Combine the two fighter rows of every bout into one feature row.

    Bouts that do not have exactly two rows in ``data`` are skipped.  With
    ``swap=True`` the two rows are exchanged first, giving the mirrored
    example.  The ``won`` difference is returned as ``winner`` and the
    ``length`` average as ``length``.
    """
    rows = {}
    for bout in data['bout_id'].unique():
        bout_data = data.loc[data['bout_id'] == bout]
        if len(bout_data) != 2:
            continue
        if swap:
            bout_data = bout_data.iloc[[1, 0]]
        bout_data = bout_data.reset_index(drop=True)
        bout_meta = bout_data[hold_cols]
        bout_data = bout_data[[c for c in list(bout_data) if c not in hold_cols]]
        first, second = bout_data.T[0], bout_data.T[1]

        bout_preds = {}
        for k, v in (first - second).to_dict().items():
            bout_preds[k + '_diff'] = v
        for k, v in ((first + second) / 2).to_dict().items():
            bout_preds[k + '_avg'] = v
        # won_diff and length_avg are kept as the targets; their counterparts
        # are dropped.
        bout_preds.pop('won_avg')
        bout_preds.pop('length_diff')

        # Cross difference: first fighter's "o" value against the second
        # fighter's matching "d" value, and vice versa.
        for col in list(bout_data):
            if '_o_' in col:
                bout_preds[col + '_xdif'] = bout_data.loc[0][col] - bout_data.loc[1][col.replace('_o_', '_d_')]
            elif '_d_' in col:
                bout_preds[col + '_xdif'] = bout_data.loc[0][col] - bout_data.loc[1][col.replace('_d_', '_o_')]
        for k, v in bout_meta.T[0].to_dict().items():
            bout_preds[k] = v
        bout_preds.pop('bout_id')
        rows[bout] = bout_preds

    frame = pd.DataFrame.from_dict(rows).T
    frame.reset_index(inplace=True)
    frame.rename(columns={'index': 'bout_id'}, inplace=True)
    frame.rename(columns={'won_diff': 'winner'}, inplace=True)
    frame.rename(columns={'length_avg': 'length'}, inplace=True)
    return frame


def pull_pred_data(only_avg=False):
    """Build the winner training table and write it to ``winner_data.csv``.

    Averaged statistics are extended with ratio features, fighter age, reach
    and height, the bout outcome and length, and history features computed
    only from each fighter's earlier bouts (``len_avg``, ``win_avg``,
    ``win_streak``).  Every two-fighter bout then yields two rows, one per
    fighter ordering.  Every column except ``length`` is written, indexed by
    ``bout_id``, to ``data/only_avg/winner_data.csv`` (or
    ``data/winner_data.csv`` when adjusted averages are included).

    Args:
        only_avg: meant to select whether adjusted averages are left out.
            NOTE(review): the body forces this to True, so the argument
            currently has no effect; kept as in the original - confirm before
            removing the override.

    Raises:
        ValueError: if a fighter is neither the winner nor the loser of a
            bout that has a recorded result.
    """
    only_avg = True
    avg_data = pull_avg_data()
    if not only_avg:
        adj_avg_data = pull_adj_avg_data()

    _add_ratio_features(avg_data, 'avg')
    if not only_avg:
        _add_ratio_features(adj_avg_data, 'adj_avg')
        data = pd.merge(avg_data, adj_avg_data, on=['bout_id', 'fighter_id', 'fight_date', 'opponent_id'])
    else:
        data = avg_data
    data.dropna(inplace=True)

    fighters = pg_query(
        PSQL.client,
        'select fighter_id, height, reach, stance, dob from ufc.fighters')
    fighters.columns = ['fighter_id', 'height', 'reach', 'stance', 'dob']
    fighter_dob = {i: j for i, j in fighters[['fighter_id', 'dob']].values}
    fighter_reach = {i: j for i, j in fighters[['fighter_id', 'reach']].values}
    fighter_height = {i: j for i, j in fighters[['fighter_id', 'height']].values}

    # Age in years on fight day; the date is reduced to its YYYY-MM-DD part.
    fighter_ages = {}
    for row_no, (bout, fighter, date) in enumerate(data[['bout_id', 'fighter_id', 'fight_date']].values):
        fight_day = datetime.strptime(str(date).split(' ')[0], '%Y-%m-%d')
        fighter_ages[row_no] = {
            'bout_id': bout,
            'fighter_id': fighter,
            'age': (fight_day - fighter_dob[fighter]).days / 365
        }
    data = pd.merge(data, pd.DataFrame.from_dict(fighter_ages).T, on=['bout_id', 'fighter_id'])
    data['reach'] = data['fighter_id'].apply(lambda x: fighter_reach[x])
    data['height'] = data['fighter_id'].apply(lambda x: fighter_height[x])

    bouts = pg_query(
        PSQL.client,
        "select b.bout_id, weight_id, winner, loser from ufc.bouts b join ufc.bout_results br on br.bout_id = b.bout_id join ufc.fights f on f.fight_id = b.fight_id"
    )
    bouts.columns = ['bout_id', 'weight_id', 'winner', 'loser']
    bouts = {i: {'winner': j, 'loser': k} for i, j, k in bouts[['bout_id', 'winner', 'loser']].values}

    data = pd.merge(data, _label_winners(data, bouts), on=['bout_id', 'fighter_id'])
    bout_len = pg_query(PSQL.client, "SELECT bout_id, length from ufc.bout_results")
    bout_len.columns = ['bout_id', 'length']
    data = pd.merge(data, bout_len, on='bout_id')

    stats = pull_stats()
    stats = pd.merge(stats, bout_len, on='bout_id')
    stats = pd.merge(stats, _label_winners(stats, bouts), on=['bout_id', 'fighter_id'])

    # History features for a bout use only the fighter's earlier bouts, so a
    # fighter's first bout gets no entry.
    streak_data = {}
    for fighter in stats.fighter_id.unique():
        history = stats.loc[stats['fighter_id'] == fighter][['bout_id', 'fight_date', 'length', 'won']].sort_values('fight_date')
        f_streak = {}
        for pos in range(1, len(history)):
            earlier = history.iloc[:pos]
            f_streak[history.iloc[pos]['bout_id']] = {
                'len_avg': earlier['length'].mean(),
                'win_avg': earlier['won'].mean(),
                'win_streak': _signed_streak(earlier['won'].values),
            }
        if f_streak:
            streak_data[fighter] = f_streak

    streak_avg_data = {}
    for fighter, by_bout in streak_data.items():
        for bout, row in by_bout.items():
            row['fighter_id'] = fighter
            row['bout_id'] = bout
            streak_avg_data[len(streak_avg_data)] = row
    streak_avg_data = pd.DataFrame.from_dict(streak_avg_data).T
    data = pd.merge(data, streak_avg_data, on=['bout_id', 'fighter_id'])

    hold_cols = ['bout_id', 'fighter_id', 'fight_date', 'opponent_id']
    pred_data_1 = _paired_bout_rows(data, hold_cols, swap=False)
    pred_data_2 = _paired_bout_rows(data, hold_cols, swap=True)
    # DataFrame.append was removed in pandas 2.0; concat gives the same frame.
    pred_data = pd.concat([pred_data_1, pred_data_2])

    pred_data_winner = pred_data[[c for c in list(pred_data) if c != 'length']].set_index('bout_id')
    if only_avg:
        pred_data_winner.to_csv(
            os.path.join(cur_path, 'data', 'only_avg', 'winner_data.csv'))
    else:
        pred_data_winner.to_csv(
            os.path.join(cur_path, 'data', 'winner_data.csv'))
from _connections import db_connection from pop_psql import pg_query pred_data = pd.read_csv( os.path.join(cur_path, 'test_data', 'pred_data_TEST.csv')) pred_data.drop('Unnamed: 0', axis=1, inplace=True) pred_data.set_index('bout_id', inplace=True) hold_cols = ['fighter_id', 'fight_date', 'opponent_id'] meta = pred_data[hold_cols] X = pred_data[[i for i in list(pred_data) if i not in hold_cols]] PSQL = db_connection('psql') bouts = pg_query( PSQL.client, "select b.bout_id, weight_desc from ufc.bouts b join ufc.fights f on f.fight_id = b.fight_id join ufc.weights w on b.weight_id = w.weight_id" ) bouts.columns = ['bout_id', 'weight_id'] weights = pd.get_dummies(bouts['weight_id']) weights['index'] = bouts['bout_id'] weights.drop_duplicates(inplace=True) weights.set_index('index', inplace=True) meta_X = X.join(weights) length_preds = {} length_error_preds = {} for mod in os.listdir(os.path.join(cur_path, 'fit_models', 'length')): length_model = load(os.path.join(cur_path, 'fit_models', 'length', mod)) feats_folder = os.path.join(cur_path, 'modelling', 'length', 'final', 'features') with open(os.path.join(feats_folder, '%s.json' % (mod.split('.')[0])),
def _finite_or_zero(x):
    """Return ``x`` unless it is NaN or +/-infinity, in which case return 0."""
    # ``x == x`` is False only for NaN.
    return x if x == x and x not in [np.inf, -np.inf] else 0


def _add_ratio_features(frame, prefix):
    """Add the derived ratio columns for one family of averages, in place.

    Source columns are read as ``<prefix>_o_<stat>`` and ``<prefix>_d_<stat>``
    (``prefix`` is ``'avg'`` or ``'adj_avg'``).  Two groups are added:

    * ``<prefix>_{o,d}_acc_*``: the second stat of each pair divided by the
      first (e.g. ``sss / ssa``);
    * ``<prefix>_{o,d}_share_*_a`` and ``..._s``: the first stat divided by
      ``ssa`` and the second stat divided by ``sss``.

    NaN and infinite ratios are replaced by 0.
    """
    acc_stat_dict = {'acc_ss': ['ssa', 'sss'], 'acc_headss': ['headssa', 'headsss'], 'acc_bodyss': ['bodyssa', 'bodysss'], 'acc_legss': ['legssa', 'legsss'], 'acc_disss': ['disssa', 'dissss'], 'acc_clinss': ['clinssa', 'clinsss'], 'acc_gndss': ['gndssa', 'gndsss'], 'acc_td': ['tda', 'tds']}
    share_ss_dict = {'share_headss': ['headssa', 'headsss'], 'share_bodyss': ['bodyssa', 'bodysss'], 'share_legss': ['legssa', 'legsss'], 'share_disss': ['disssa', 'dissss'], 'share_clinss': ['clinssa', 'clinsss'], 'share_gndss': ['gndssa', 'gndsss']}
    sides = ('_o_', '_d_')

    for name, (denom_stat, numer_stat) in acc_stat_dict.items():
        for side in sides:
            base = prefix + side
            frame[base + name] = (frame[base + numer_stat] / frame[base + denom_stat]).apply(_finite_or_zero)

    for name, (first_stat, second_stat) in share_ss_dict.items():
        for suffix, stat, total in (('_a', first_stat, 'ssa'), ('_s', second_stat, 'sss')):
            for side in sides:
                base = prefix + side
                frame[base + name + suffix] = (frame[base + stat] / frame[base + total]).apply(_finite_or_zero)


def pull_pred_data(avg_data, adj_avg_data):
    """Build per-bout prediction features from the cached CSVs and save them.

    Reads ``test_data/avg_data_TEST.csv`` and
    ``test_data/adj_avg_data_TEST.csv``, adds ratio features, fighter age,
    reach and height, combines the two fighter rows of each bout into
    ``*_diff``, ``*_avg`` and ``*_xdif`` features, and writes the result to
    ``test_data/pred_data_TEST.csv``.  Nothing is returned.

    Args:
        avg_data: ignored; replaced by the contents of the cached CSV.
        adj_avg_data: ignored; replaced by the contents of the cached CSV.
            NOTE(review): both arguments are overwritten, as in the original.
            The age calculation calls ``.split`` on ``fight_date``, which
            works on the string dates the CSV produces, so the reads are kept.
    """
    avg_data = pd.read_csv(
        os.path.join(cur_path, 'test_data', 'avg_data_TEST.csv'))
    avg_data.drop('Unnamed: 0', axis=1, inplace=True)
    adj_avg_data = pd.read_csv(
        os.path.join(cur_path, 'test_data', 'adj_avg_data_TEST.csv'))
    adj_avg_data.drop('Unnamed: 0', axis=1, inplace=True)

    _add_ratio_features(avg_data, 'avg')
    _add_ratio_features(adj_avg_data, 'adj_avg')

    data = pd.merge(avg_data, adj_avg_data, on=['bout_id', 'fighter_id', 'fight_date', 'opponent_id'])
    data.dropna(inplace=True)

    fighters = pg_query(
        PSQL.client,
        'select fighter_id, height, reach, stance, dob from ufc.fighters')
    fighters.columns = ['fighter_id', 'height', 'reach', 'stance', 'dob']
    fighter_dob = {i: j for i, j in fighters[['fighter_id', 'dob']].values}
    fighter_reach = {i: j for i, j in fighters[['fighter_id', 'reach']].values}
    fighter_height = {i: j for i, j in fighters[['fighter_id', 'height']].values}

    # Age in years on fight day; the date is reduced to its YYYY-MM-DD part.
    fighter_ages = {}
    for row_no, (bout, fighter, date) in enumerate(data[['bout_id', 'fighter_id', 'fight_date']].values):
        fight_day = datetime.strptime(date.split(' ')[0], '%Y-%m-%d')
        fighter_ages[row_no] = {
            'bout_id': bout,
            'fighter_id': fighter,
            'age': (fight_day - fighter_dob[fighter]).days / 365
        }
    data = pd.merge(data, pd.DataFrame.from_dict(fighter_ages).T, on=['bout_id', 'fighter_id'])
    data['reach'] = data['fighter_id'].apply(lambda x: fighter_reach[x])
    data['height'] = data['fighter_id'].apply(lambda x: fighter_height[x])

    pred_data = {}
    hold_cols = ['bout_id', 'fighter_id', 'fight_date', 'opponent_id']
    for bout in data['bout_id'].unique():
        # sample(frac=1) shuffles the two rows, so which fighter comes first
        # is random.
        bout_data = data.loc[data['bout_id'] == bout].sample(frac=1)
        if len(bout_data) != 2:
            continue
        bout_data = bout_data.reset_index(drop=True)
        bout_meta = bout_data[hold_cols]
        bout_data = bout_data[[c for c in list(bout_data) if c not in hold_cols]]
        first, second = bout_data.T[0], bout_data.T[1]

        bout_preds = {}
        for k, v in (first - second).to_dict().items():
            bout_preds[k + '_diff'] = v
        for k, v in ((first + second) / 2).to_dict().items():
            bout_preds[k + '_avg'] = v
        # Cross difference: first fighter's "o" value against the second
        # fighter's matching "d" value, and vice versa.
        for col in list(bout_data):
            if '_o_' in col:
                bout_preds[col + '_xdif'] = bout_data.loc[0][col] - bout_data.loc[1][col.replace('_o_', '_d_')]
            elif '_d_' in col:
                bout_preds[col + '_xdif'] = bout_data.loc[0][col] - bout_data.loc[1][col.replace('_d_', '_o_')]
        for k, v in bout_meta.T[0].to_dict().items():
            bout_preds[k] = v
        bout_preds.pop('bout_id')
        pred_data[bout] = bout_preds

    pred_data = pd.DataFrame.from_dict(pred_data).T
    pred_data.reset_index(inplace=True)
    pred_data.rename(columns={'index': 'bout_id'}, inplace=True)
    pred_data.to_csv(os.path.join(cur_path, 'test_data', 'pred_data_TEST.csv'))
def _finite_or_zero(value):
    """Return ``value`` unchanged, or 0 when it is NaN or +/- infinity."""
    # ``value == value`` is False only for NaN.
    return value if value == value and value not in [np.inf, -np.inf] else 0


def _add_ratio_features(frame, prefix, acc_stat_dict, share_ss_dict):
    """Add the derived ratio columns to ``frame`` in place.

    For every ``name -> [a_col, s_col]`` entry and for both the ``_o_`` and
    ``_d_`` column families under ``prefix``:

    * ``acc_stat_dict`` adds ``<prefix>_<side>_<name>`` = ``s_col / a_col``;
    * ``share_ss_dict`` adds ``<...>_<name>_a`` = ``a_col / ssa`` and
      ``<...>_<name>_s`` = ``s_col / sss``.

    NaN and infinite results (e.g. from a zero denominator) are stored as 0.
    """
    sides = ('_o_', '_d_')
    for name, (a_col, s_col) in acc_stat_dict.items():
        for side in sides:
            base = prefix + side
            ratio = frame[base + s_col] / frame[base + a_col]
            frame[base + name] = ratio.apply(_finite_or_zero)
    for name, (a_col, s_col) in share_ss_dict.items():
        # Same creation order as before: o_a, d_a, o_s, d_s for each name.
        for suffix, part_col, total_col in (('_a', a_col, 'ssa'),
                                            ('_s', s_col, 'sss')):
            for side in sides:
                base = prefix + side
                ratio = frame[base + part_col] / frame[base + total_col]
                frame[base + name + suffix] = ratio.apply(_finite_or_zero)


def pull_pred_data(avg_data, adj_avg_data):
    """Build (or load from cache) the per-bout prediction data sets.

    If both ``test_data/pred_data_winner.csv`` and
    ``test_data/pred_data_length.csv`` exist under ``cur_path`` they are read
    and returned as-is; ``avg_data`` and ``adj_avg_data`` are not touched.

    Otherwise the data is rebuilt:

    1. ratio columns are added to ``avg_data`` (``avg_`` prefix) and
       ``adj_avg_data`` (``adj_avg_`` prefix) -- both frames are modified
       in place;
    2. the two frames are inner-merged on bout/fighter/date/opponent and rows
       containing NaN are dropped;
    3. age, reach, height and a ``won`` flag are attached per fighter row
       using the ``ufc.fighters`` / ``ufc.bouts`` tables;
    4. every bout with exactly two rows is collapsed into one record holding
       ``_diff`` (row 0 - row 1), ``_avg`` and ``_xdif`` (row 0 ``_o_`` vs
       row 1 ``_d_`` and vice versa) features, with row order shuffled by
       ``DataFrame.sample`` so either fighter can end up first;
    5. bout ``length`` is merged in, ``won_diff`` is renamed to ``winner``
       and ``won_avg`` is dropped.

    Both result frames are written to the cache files before returning.

    Args:
        avg_data: DataFrame with ``bout_id``, ``fighter_id``, ``fight_date``,
            ``opponent_id`` and the ``avg_o_*`` / ``avg_d_*`` stat columns.
        adj_avg_data: DataFrame with the same key columns and the
            ``adj_avg_o_*`` / ``adj_avg_d_*`` stat columns.

    Returns:
        Tuple ``(pred_data_winner, pred_data_length)``: the same per-bout
        frame without the ``length`` column and without the ``winner``
        column respectively.

    Raises:
        ValueError: if a fighter row belongs to a known bout but is neither
            its recorded winner nor its recorded loser.
        KeyError: if a fighter in the data is missing from ``ufc.fighters``.
    """
    winner_path = os.path.join(cur_path, 'test_data', 'pred_data_winner.csv')
    # The cache check previously looked for this file directly under
    # ``cur_path``, so the cache was effectively never used.
    length_path = os.path.join(cur_path, 'test_data', 'pred_data_length.csv')

    if os.path.exists(winner_path) and os.path.exists(length_path):
        pred_data_winner = pd.read_csv(winner_path)
        pred_data_winner.drop('Unnamed: 0', inplace=True, axis=1)
        pred_data_length = pd.read_csv(length_path)
        pred_data_length.drop('Unnamed: 0', inplace=True, axis=1)
        return (pred_data_winner, pred_data_length)

    acc_stat_dict = {
        'acc_ss': ['ssa', 'sss'],
        'acc_headss': ['headssa', 'headsss'],
        'acc_bodyss': ['bodyssa', 'bodysss'],
        'acc_legss': ['legssa', 'legsss'],
        'acc_disss': ['disssa', 'dissss'],
        'acc_clinss': ['clinssa', 'clinsss'],
        'acc_gndss': ['gndssa', 'gndsss'],
        'acc_td': ['tda', 'tds'],
    }
    share_ss_dict = {
        'share_headss': ['headssa', 'headsss'],
        'share_bodyss': ['bodyssa', 'bodysss'],
        'share_legss': ['legssa', 'legsss'],
        'share_disss': ['disssa', 'dissss'],
        'share_clinss': ['clinssa', 'clinsss'],
        'share_gndss': ['gndssa', 'gndsss'],
    }

    _add_ratio_features(avg_data, 'avg', acc_stat_dict, share_ss_dict)
    _add_ratio_features(adj_avg_data, 'adj_avg', acc_stat_dict, share_ss_dict)

    key_cols = ['bout_id', 'fighter_id', 'fight_date', 'opponent_id']
    data = pd.merge(avg_data, adj_avg_data, on=key_cols)
    data.dropna(inplace=True)

    # Fighter attributes; only dob, reach and height are used below.
    fighters = pg_query(
        PSQL.client,
        'select fighter_id, height, reach, stance, dob from ufc.fighters')
    fighters.columns = ['fighter_id', 'height', 'reach', 'stance', 'dob']
    fighter_dob = dict(zip(fighters['fighter_id'], fighters['dob']))
    fighter_reach = dict(zip(fighters['fighter_id'], fighters['reach']))
    fighter_height = dict(zip(fighters['fighter_id'], fighters['height']))

    # Age in years on fight night.  Only the part before the first space is
    # parsed so 'YYYY-MM-DD HH:MM:SS' strings are accepted as well.
    fighter_ages = {
        idx: {
            'bout_id': bout,
            'fighter_id': fighter,
            'age': (datetime.strptime(date.split(' ')[0], '%Y-%m-%d')
                    - fighter_dob[fighter]).days / 365,
        }
        for idx, (bout, fighter, date) in enumerate(
            data[['bout_id', 'fighter_id', 'fight_date']].values)
    }
    data = pd.merge(data, pd.DataFrame.from_dict(fighter_ages).T,
                    on=['bout_id', 'fighter_id'])

    data['reach'] = data['fighter_id'].apply(lambda x: fighter_reach[x])
    data['height'] = data['fighter_id'].apply(lambda x: fighter_height[x])

    bouts = pg_query(
        PSQL.client,
        "select b.bout_id, weight_id, winner, loser from ufc.bouts b join ufc.bout_results br on br.bout_id = b.bout_id join ufc.fights f on f.fight_id = b.fight_id")
    bouts.columns = ['bout_id', 'weight_id', 'winner', 'loser']
    bouts = {i: {'winner': j, 'loser': k}
             for i, j, k in bouts[['bout_id', 'winner', 'loser']].values}

    # One record per fighter row whose bout has a recorded result; rows of
    # unknown bouts get no record and fall out in the inner merge below.
    won_records = []
    for b, f in data[['bout_id', 'fighter_id']].values:
        if b not in bouts:
            continue
        if bouts[b]['winner'] == f:
            won_records.append({'bout_id': b, 'fighter_id': f, 'won': 1})
        elif bouts[b]['loser'] == f:
            won_records.append({'bout_id': b, 'fighter_id': f, 'won': 0})
        else:
            raise ValueError(
                'fighter %s is neither winner nor loser of bout %s' % (f, b))
    winner = pd.DataFrame.from_dict(dict(enumerate(won_records))).T
    data = pd.merge(data, winner, on=['bout_id', 'fighter_id'])

    bout_len = pg_query(PSQL.client,
                        "SELECT bout_id, length from ufc.bout_results")
    bout_len.columns = ['bout_id', 'length']

    pred_data = {}
    hold_cols = ['bout_id', 'fighter_id', 'fight_date', 'opponent_id']
    # sort=False keeps bouts in order of first appearance, as unique() did.
    for bout, bout_data in data.groupby('bout_id', sort=False):
        # Shuffle before the size check so every bout draws from the RNG.
        bout_data = bout_data.sample(frac=1)
        if len(bout_data) != 2:
            continue
        bout_data = bout_data.reset_index(drop=True)
        bout_meta = bout_data[hold_cols]
        bout_data = bout_data[[c for c in list(bout_data)
                               if c not in hold_cols]]

        rows = bout_data.T
        first, second = rows[0], rows[1]

        bout_preds = {}
        for k, v in (first - second).to_dict().items():
            bout_preds[k + '_diff'] = v
        for k, v in ((first + second) / 2).to_dict().items():
            bout_preds[k + '_avg'] = v
        # Cross differences: row 0's '_o_' stat against row 1's matching
        # '_d_' stat, and the other way round.
        for col in list(bout_data):
            if '_o_' in col:
                bout_preds[col + '_xdif'] = (
                    first[col] - second[col.replace('_o_', '_d_')])
            elif '_d_' in col:
                bout_preds[col + '_xdif'] = (
                    first[col] - second[col.replace('_d_', '_o_')])

        # Metadata comes from row 0 of the shuffled pair.
        for k, v in bout_meta.T[0].to_dict().items():
            bout_preds[k] = v
        bout_preds.pop('bout_id')
        pred_data[bout] = bout_preds

    pred_data = pd.DataFrame.from_dict(pred_data).T
    pred_data.reset_index(inplace=True)
    pred_data.rename(columns={'index': 'bout_id'}, inplace=True)
    pred_data = pd.merge(pred_data, bout_len, on='bout_id')
    # won_diff is won(row 0) - won(row 1): 1 if row 0 won, -1 if it lost.
    pred_data.rename(columns={'won_diff': 'winner'}, inplace=True)
    pred_data.drop('won_avg', axis=1, inplace=True)

    pred_data_length = pred_data[[c for c in list(pred_data)
                                  if c != 'winner']]
    pred_data_winner = pred_data[[c for c in list(pred_data)
                                  if c != 'length']]
    pred_data_winner.to_csv(winner_path)
    pred_data_length.to_csv(length_path)
    return (pred_data_winner, pred_data_length)