예제 #1
0
파일: proc.py 프로젝트: eghensley/ufc
def pull_stats():
    """Return per-bout fighter stats paired with the opponent's stats.

    If ``<cur_path>/test_data/stats.csv`` exists it is loaded and returned
    as-is (minus the saved index column).  Otherwise the stats are queried
    from the database, every column named in the module-level ``cols`` is
    divided by the bout ``length``, the opponent's values are attached as
    ``d_<col>`` columns, and the result is written to that CSV before being
    returned.

    Returns:
        pandas.DataFrame: one row per (bout, fighter) that has a known
        opponent and no missing values.
    """
    cache_path = os.path.join(cur_path, 'test_data', 'stats.csv')
    if os.path.exists(cache_path):
        stats = pd.read_csv(cache_path)
        # The cache is written with its index, which reads back as 'Unnamed: 0'.
        stats.drop('Unnamed: 0', axis=1, inplace=True)
        return stats

    stats = pg_query(PSQL.client, 'SELECT bs.bout_id, date, fighter_id, kd, ssa, sss, tsa, tss, sub, pas, rev, headssa, headsss, bodyssa, bodysss, legssa, legsss, disssa, dissss, clinssa, clinsss, gndssa, gndsss, tda, tds FROM ufc.bout_stats bs join ufc.bouts b on b.bout_id = bs.bout_id join ufc.fights f on f.fight_id = b.fight_id where champ is false;')
    stats.columns = ['bout_id', 'fight_date', 'fighter_id', 'kd', 'ssa', 'sss', 'tsa', 'tss', 'sub', 'pas', 'rev', 'headssa', 'headsss', 'bodyssa', 'bodysss', 'legssa', 'legsss', 'disssa', 'dissss', 'clinssa', 'clinsss', 'gndssa', 'gndsss', 'tda', 'tds']

    bouts = pg_query(PSQL.client, "select b.bout_id, weight_id, winner, loser from ufc.bouts b join ufc.bout_results br on br.bout_id = b.bout_id join ufc.fights f on f.fight_id = b.fight_id")
    bouts.columns = ['bout_id', 'weight_id', 'winner', 'loser']
    result_rows = bouts[['bout_id', 'winner', 'loser']].values

    # Map "<bout_id><fighter_id>" -> opponent id.  Winner keys are filled
    # first and loser keys second, so loser entries win on a key collision.
    opponents = {bout + winner: loser for bout, winner, loser in result_rows}
    opponents.update({bout + loser: winner for bout, winner, loser in result_rows})

    lookup_keys = stats['bout_id'] + stats['fighter_id']
    stats['opponent_id'] = lookup_keys.apply(lambda key: opponents.get(key, np.nan))
    # Drops rows without a known opponent, and any row with another missing value.
    stats.dropna(inplace=True)

    bout_len = pg_query(PSQL.client, "SELECT bout_id, length from ufc.bout_results")
    bout_len.columns = ['bout_id', 'length']
    stats = pd.merge(stats, bout_len, left_on='bout_id', right_on='bout_id')
    # Normalise every stat column by the bout length.
    for col in cols:
        stats[col] = stats[col] / stats['length']
    stats.drop('length', axis=1, inplace=True)

    # The "defensive" view of a row is the same numbers re-keyed to the
    # opponent: opponent_id becomes fighter_id and each stat becomes d_<stat>.
    renames = {'opponent_id': 'fighter_id'}
    renames.update({col: 'd_' + col for col in cols})
    def_stats = stats.drop('fighter_id', axis=1).rename(columns=renames)

    join_keys = ['fighter_id', 'bout_id', 'fight_date']
    stats = pd.merge(stats, def_stats, left_on=join_keys, right_on=join_keys)
    stats.to_csv(cache_path)
    return stats
예제 #2
0
def pull_adj_avg_data():
    """Load ``ufc.adj_avg_stats`` with opponent ids and fight dates attached.

    The stats table is relabelled positionally as ``fighter_id``, ``bout_id``
    followed by ``adj_avg_d_<stat>`` and then ``adj_avg_o_<stat>`` columns,
    inner-joined to ``ufc.bout_fighter_xref`` on (bout_id, fighter_id), and
    finally joined to each bout's fight date.

    Returns:
        pandas.DataFrame: the joined frame, including ``opponent_id`` and
        ``fight_date`` columns.
    """
    # Stat suffixes in the positional order the table columns are labelled.
    stat_names = (
        'bodyssa', 'bodysss', 'clinssa', 'clinsss', 'disssa', 'dissss',
        'gndssa', 'gndsss', 'headssa', 'headsss', 'kd', 'legssa', 'legsss',
        'pas', 'rev', 'ssa', 'sss', 'sub', 'tda', 'tds', 'tsa', 'tss',
    )

    adj = pg_query(PSQL.client, 'Select * from ufc.adj_avg_stats')
    adj.columns = ['fighter_id', 'bout_id'] + [
        'adj_avg_%s_%s' % (side, stat)
        for side in ('d', 'o')
        for stat in stat_names
    ]

    xref = pg_query(PSQL.client, 'Select * from ufc.bout_fighter_xref')
    xref.columns = ['bout_id', 'fighter_id', 'opponent_id']
    pair_keys = ['bout_id', 'fighter_id']
    adj = adj.merge(xref, left_on=pair_keys, right_on=pair_keys)

    fight_dates = pg_query(
        PSQL.client,
        'Select bout_id, date from ufc.fights f join ufc.bouts b on b.fight_id = f.fight_id'
    )
    fight_dates.columns = ['bout_id', 'fight_date']

    return adj.merge(fight_dates, left_on='bout_id', right_on='bout_id')
예제 #3
0
def store_meta_res(domain):
    """Score every final model for ``domain`` and save the meta-model data.

    For each model directory under
    ``<cur_path>/model_tuning/modelling/<domain>/final/models`` the model,
    its feature list and its scaler are loaded and passed to
    ``cross_validate``.  The per-row absolute error of each model's
    prediction against the ``domain`` column is then joined with the feature
    matrix ``X`` and one-hot weight-class columns, and written to
    ``<cur_path>/data/meta/meta_<domain>.csv``.

    Args:
        domain: name of the target; it is used both as a path component and
            as the target column name of the frame built from ``Y``.

    Returns:
        None.  The result is only written to disk.
    """
    X, Y = pull_val_data(domain)
    pred_df = pd.DataFrame(Y)

    # These directories do not depend on the model, so build them once.
    final_dir = os.path.join(cur_path, 'model_tuning', 'modelling', domain,
                             'final')
    final_model_folder = os.path.join(final_dir, 'models')
    feats_folder = os.path.join(final_dir, 'features')

    for mod_name in os.listdir(final_model_folder):
        if mod_name == '.DS_Store':
            continue
        model_dir = os.path.join(final_model_folder, mod_name)
        # The first file found in the model directory is taken as the model.
        model = load(os.path.join(model_dir, os.listdir(model_dir)[0]))

        with open(os.path.join(feats_folder, '%s.json' % (mod_name)),
                  'r') as fp:
            feats = json.load(fp)
        # NOTE(review): JSON keys are strings, so max() is lexicographic
        # ('9' > '10'); confirm this picks the intended feature set.
        feats = feats[max(feats.keys())]

        scale_folder = os.path.join(final_dir, 'scalers', mod_name)
        scale = load(os.path.join(scale_folder, os.listdir(scale_folder)[0]))

        mod_preds = cross_validate(X[feats], Y, model, scale)
        mod_preds.rename(columns={0: mod_name}, inplace=True)
        pred_df = pred_df.join(mod_preds)

    pred_cols = [col for col in list(pred_df) if col != domain]

    # Absolute error of each model per row.  The original computed the same
    # value in two identical branches and left the score undefined for any
    # other domain; this form works for every domain.
    mod_scores = pred_df[pred_cols].sub(pred_df[domain], axis=0).abs()

    meta_data = mod_scores.join(X)
    PSQL = db_connection('psql')
    bouts = pg_query(
        PSQL.client,
        "select b.bout_id, weight_desc from ufc.bouts b join ufc.bout_results br on br.bout_id = b.bout_id join ufc.fights f on f.fight_id = b.fight_id join ufc.weights w on b.weight_id = w.weight_id"
    )
    bouts.columns = ['bout_id', 'weight_id']
    # One-hot encode the weight description and index it by bout id so it
    # can be joined onto the bout-indexed meta data.
    weights = pd.get_dummies(bouts['weight_id'])
    weights['index'] = bouts['bout_id']
    weights.drop_duplicates(inplace=True)
    weights.set_index('index', inplace=True)
    meta_data = meta_data.join(weights)
    meta_data.to_csv(
        os.path.join(cur_path, 'data', 'meta', 'meta_%s.csv' % (domain)))
예제 #4
0
def pull_avg_data(stats=None):
    """Build mean offensive/defensive stats for every fighter with a bout.

    For each row of ``ufc.bout_fighter_xref`` the fighter's own rows in
    ``stats`` are averaged into ``avg_o_<col>`` and the rows where the
    fighter is the opponent are averaged (over ``d_<col>``) into
    ``avg_d_<col>``, for every column in the module-level ``cols``.  The
    result is written to ``<cur_path>/test_data/avg_data_TEST.csv``.

    Args:
        stats: frame with the layout produced by ``pull_stats``.  When
            omitted, the cached ``<cur_path>/test_data/stats.csv`` is
            loaded.  (Previously the argument was always overwritten by
            that file.)

    Returns:
        pandas.DataFrame: one row per fighter.

    Raises:
        ValueError: if a fighter's offensive and defensive rows do not
            describe the same bouts in the same order.
    """
    if stats is None:
        stats = pd.read_csv(os.path.join(cur_path, 'test_data', 'stats.csv'))
        stats.drop('Unnamed: 0', axis=1, inplace=True)

    fut_bouts = pg_query(
        PSQL.client,
        "SELECT date, bx.bout_id, bx.fighter_id, opponent_id FROM ufc.bout_fighter_xref bx join ufc.bouts b on b.bout_id = bx.bout_id join ufc.fights f on f.fight_id = b.fight_id"
    )
    fut_bouts.columns = ['date', 'bout_id', 'fighter_id', 'opponent_id']

    # NOTE(review): rows are keyed by fighter only, so when a fighter has
    # several bouts just the last one iterated is kept.  This matches the
    # original behaviour; confirm it is intended.
    row_by_fighter = {}
    for date, bout, fighter, opponent in fut_bouts.values:
        # sort_values returns a new frame, avoiding in-place edits on a slice.
        ostat = stats.loc[stats['fighter_id'] == fighter].sort_values('fight_date')
        dstat = stats.loc[stats['opponent_id'] == fighter].sort_values('fight_date')

        if len(dstat) != len(ostat):
            raise ValueError(
                'fighter %s has %d offensive rows but %d defensive rows'
                % (fighter, len(ostat), len(dstat)))

        if (dstat[['bout_id', 'fight_date']].values !=
                ostat[['bout_id', 'fight_date']].values).any():
            raise ValueError(
                'offensive and defensive rows of fighter %s are misaligned'
                % (fighter,))

        # Key insertion order fixes the column order of the final frame.
        row = {'fight_date': date, 'opponent_id': opponent}
        for col in cols:
            row['avg_o_' + col] = ostat[col].mean()
        for col in cols:
            row['avg_d_' + col] = dstat['d_' + col].mean()
        row['fighter_id'] = fighter
        row['bout_id'] = bout
        row_by_fighter[fighter] = row

    avg_data = pd.DataFrame.from_dict(
        dict(enumerate(row_by_fighter.values()))).T
    avg_data.to_csv(os.path.join(cur_path, 'test_data', 'avg_data_TEST.csv'))
    return avg_data
예제 #5
0
# Script fragment: relies on pred_df, pred_cols, X, logloss, StandardScaler,
# db_connection, pg_query and cur_path being defined earlier in the session.

# Per-row log loss of every model's prediction against the 'winner' column.
mod_scores = {}
for idx in pred_df.index:
    mod_scores[idx] = {}
    row = pred_df.loc[idx]
    for mod in pred_cols:
        row_score = logloss(row['winner'], row[mod])
        mod_scores[idx][mod] = row_score
# Transpose so that rows are pred_df indices and columns are models.
mod_scores = pd.DataFrame.from_dict(mod_scores).T

meta_data = mod_scores.join(X)
# The outer list has exactly one element (the list of all non-prediction
# columns), so this loop runs once and scales those columns in one call.
for col in [[i for i in list(meta_data) if i not in pred_cols]]:
    meta_data[col] = StandardScaler().fit_transform(meta_data[col])

PSQL = db_connection('psql')
bouts = pg_query(
    PSQL.client,
    "select b.bout_id, weight_desc from ufc.bouts b join ufc.bout_results br on br.bout_id = b.bout_id join ufc.fights f on f.fight_id = b.fight_id join ufc.weights w on b.weight_id = w.weight_id"
)
# The second column holds weight_desc from the query but is labelled 'weight_id'.
bouts.columns = ['bout_id', 'weight_id']
# One-hot encode the weight description and index it by bout id for the join.
weights = pd.get_dummies(bouts['weight_id'])
weights['index'] = bouts['bout_id']
weights.drop_duplicates(inplace=True)
weights.set_index('index', inplace=True)
meta_data = meta_data.join(weights)
meta_data.to_csv(os.path.join(cur_path, 'test_data', 'pred_res_winner.csv'))

# NOTE(review): undefined bare name; evaluating it raises NameError, so
# nothing below runs when the file is executed top to bottom. Presumably a
# deliberate stop marker for interactive use -- confirm before removing.
adsfasfda

model_feats = {}
for mod in pred_cols:

    # NOTE(review): 0.693 is approximately ln(2); presumably the log loss of
    # a 50/50 guess, making meta_y the improvement over it -- confirm.
    meta_y = (meta_data[mod] - 0.693) * -1
예제 #6
0
def _finite_or_zero(value):
    """Return ``value`` unless it is NaN or +/-infinity, in which case 0."""
    if value == value and value not in [np.inf, -np.inf]:
        return value
    return 0


def _add_ratio_features(frame, prefix, acc_stats, share_stats):
    """Add accuracy and share ratio columns to ``frame`` in place.

    For every ``name: [attempted, landed]`` pair in ``acc_stats`` the column
    ``<prefix>_<side>_<name>`` is landed / attempted.  For every pair in
    ``share_stats`` the ``..._a`` column is attempted / ``ssa`` and the
    ``..._s`` column is landed / ``sss``.  ``side`` is ``o`` then ``d``.
    NaN and infinite ratios are replaced by 0.
    """
    for name, (attempted, landed) in acc_stats.items():
        for side in ('o', 'd'):
            stem = '%s_%s_' % (prefix, side)
            ratio = frame[stem + landed] / frame[stem + attempted]
            frame[stem + name] = ratio.apply(_finite_or_zero)

    for name, (attempted, landed) in share_stats.items():
        for suffix, part, total in (('_a', attempted, 'ssa'),
                                    ('_s', landed, 'sss')):
            for side in ('o', 'd'):
                stem = '%s_%s_' % (prefix, side)
                ratio = frame[stem + part] / frame[stem + total]
                frame[stem + name + suffix] = ratio.apply(_finite_or_zero)


def _add_fighter_attributes(data):
    """Return ``data`` with ``age``, ``reach`` and ``height`` columns added.

    Age is the number of days between ``fight_date`` and the fighter's date
    of birth divided by 365.  A fighter missing from ``ufc.fighters`` raises
    ``KeyError``.
    """
    fighters = pg_query(PSQL.client, 'select fighter_id, height, reach, stance, dob from ufc.fighters')
    fighters.columns = ['fighter_id', 'height', 'reach', 'stance', 'dob']

    born = {fid: dob for fid, dob in fighters[['fighter_id', 'dob']].values}
    age_rows = []
    for bout, fighter, date in data[['bout_id', 'fighter_id', 'fight_date']].values:
        # Only the date part is parsed; any time-of-day suffix is dropped.
        fight_day = datetime.strptime(str(date).split(' ')[0], '%Y-%m-%d')
        age_rows.append({'bout_id': bout,
                         'fighter_id': fighter,
                         'age': (fight_day - born[fighter]).days / 365})
    ages = pd.DataFrame(age_rows, columns=['bout_id', 'fighter_id', 'age'])
    data = pd.merge(data, ages, on=['bout_id', 'fighter_id'])

    for attr in ('reach', 'height'):
        lookup = {fid: val for fid, val in fighters[['fighter_id', attr]].values}
        data[attr] = data['fighter_id'].apply(lambda fid: lookup[fid])
    return data


def _label_winners(pairs, results):
    """Return a frame of (bout_id, fighter_id, won) for ``pairs``.

    ``results`` maps bout id to ``{'winner': ..., 'loser': ...}``.  Pairs
    whose bout has no result are skipped.

    Raises:
        ValueError: if the fighter is neither the winner nor the loser.
    """
    rows = []
    for bout, fighter in pairs:
        outcome = results.get(bout)
        if outcome is None:
            continue
        if outcome['winner'] == fighter:
            won = 1
        elif outcome['loser'] == fighter:
            won = 0
        else:
            raise ValueError(
                'fighter %s is neither winner nor loser of bout %s'
                % (fighter, bout))
        rows.append({'bout_id': bout, 'fighter_id': fighter, 'won': won})
    return pd.DataFrame(rows, columns=['bout_id', 'fighter_id', 'won'])


def _signed_streak(results):
    """Return the length of the trailing run of equal results.

    ``results`` is a non-empty chronological sequence of 1 (win) / 0 (loss).
    The count is negated when the most recent result is a loss.
    """
    last = results[-1]
    run = 0
    for res in reversed(results):
        if res != last:
            break
        run += 1
    return -run if last == 0 else run


def _matchup_features(pair, hold_cols, drop=()):
    """Turn a two-row frame (index 0 and 1) into one feature dict.

    Every non-``hold_cols`` column yields ``<col>_diff`` (row 0 - row 1) and
    ``<col>_avg``; keys listed in ``drop`` are then removed.  Columns
    containing ``_o_`` / ``_d_`` also yield ``<col>_xdif``: row 0's value
    minus row 1's value of the mirrored ``_d_`` / ``_o_`` column.  Row 0's
    ``hold_cols`` values are copied over, except ``bout_id``.
    """
    meta = pair[hold_cols]
    feats = pair[[col for col in list(pair) if col not in hold_cols]]
    first, second = feats.loc[0], feats.loc[1]

    row = {}
    for col, val in (first - second).to_dict().items():
        row[col + '_diff'] = val
    for col, val in ((first + second) / 2).to_dict().items():
        row[col + '_avg'] = val
    for key in drop:
        row.pop(key)

    for col in list(feats):
        if '_o_' in col:
            mirror = col.replace('_o_', '_d_')
        elif '_d_' in col:
            mirror = col.replace('_d_', '_o_')
        else:
            continue
        row[col + '_xdif'] = first[col] - second[mirror]

    for col, val in meta.loc[0].to_dict().items():
        row[col] = val
    row.pop('bout_id')
    return row


def pull_pred_data(fight_id='351264d11286d09a'):
    """Build the prediction feature matrix for the bouts of one fight card.

    Average and adjusted-average stats are restricted to the bouts of
    ``fight_id``, extended with ratio features, fighter age/reach/height and
    career length/win averages plus current streak, then each two-fighter
    bout is collapsed into a single row of difference/average features.

    Args:
        fight_id: id of the fight card (``ufc.bouts.fight_id``).  Defaults
            to the id that was previously hard-coded.

    Returns:
        tuple: ``(pred_data, META_X)`` where ``pred_data`` is indexed by
        ``bout_id`` and ``META_X`` is ``pred_data`` joined with one-hot
        weight-class columns.

    Raises:
        ValueError: if ``fight_id`` is not alphanumeric, or a fighter is
            neither winner nor loser of a recorded bout.
    """
    # The id is interpolated into SQL text, so refuse anything that is not a
    # plain alphanumeric token.
    if not str(fight_id).isalnum():
        raise ValueError('fight_id must be alphanumeric, got %r' % (fight_id,))

    keys = ['bout_id', 'fighter_id', 'opponent_id', 'fight_date']
    avg_data = pull_avg_data()
    adj_avg_data = pull_adj_avg_data()

    nxt_bouts = pg_query(PSQL.client, "select bx.bout_id, fighter_id, opponent_id, date from ufc.bout_fighter_xref bx join ufc.bouts b on b.bout_id = bx.bout_id join ufc.fights f on f.fight_id = b.fight_id where b.fight_id = '%s';" % (fight_id,))
    nxt_bouts.columns = list(keys)

    avg_data = pd.merge(avg_data, nxt_bouts, on=keys, how='inner')
    adj_avg_data = pd.merge(adj_avg_data, nxt_bouts, on=keys, how='inner')

    # name -> [attempted column, landed column]
    acc_stat_dict = {'acc_ss': ['ssa', 'sss'],
                     'acc_headss': ['headssa', 'headsss'],
                     'acc_bodyss': ['bodyssa', 'bodysss'],
                     'acc_legss': ['legssa', 'legsss'],
                     'acc_disss': ['disssa', 'dissss'],
                     'acc_clinss': ['clinssa', 'clinsss'],
                     'acc_gndss': ['gndssa', 'gndsss'],
                     'acc_td': ['tda', 'tds']}
    share_ss_dict = {'share_headss': ['headssa', 'headsss'],
                     'share_bodyss': ['bodyssa', 'bodysss'],
                     'share_legss': ['legssa', 'legsss'],
                     'share_disss': ['disssa', 'dissss'],
                     'share_clinss': ['clinssa', 'clinsss'],
                     'share_gndss': ['gndssa', 'gndsss']}

    _add_ratio_features(avg_data, 'avg', acc_stat_dict, share_ss_dict)
    _add_ratio_features(adj_avg_data, 'adj_avg', acc_stat_dict, share_ss_dict)

    data = pd.merge(avg_data, adj_avg_data, on=keys)
    data.dropna(inplace=True)
    data = _add_fighter_attributes(data)

    # Historical results of the fighters on this card.
    stats = pull_stats()
    bouts = pg_query(PSQL.client, "select b.bout_id, weight_id, winner, loser from ufc.bouts b join ufc.bout_results br on br.bout_id = b.bout_id join ufc.fights f on f.fight_id = b.fight_id")
    bouts.columns = ['bout_id', 'weight_id', 'winner', 'loser']
    results = {bout: {'winner': winner, 'loser': loser}
               for bout, winner, loser in bouts[['bout_id', 'winner', 'loser']].values}
    winner = _label_winners(stats[['bout_id', 'fighter_id']].values, results)
    stats = pd.merge(stats, winner, on=['bout_id', 'fighter_id'])

    bout_len = pg_query(PSQL.client, "SELECT bout_id, length from ufc.bout_results")
    bout_len.columns = ['bout_id', 'length']
    stats = pd.merge(stats, bout_len, on='bout_id')

    # Keep only the history of fighters that appear in the prediction data.
    stats = pd.merge(stats, data['fighter_id'], on='fighter_id', how='inner')

    streak_rows = []
    for fighter in stats.fighter_id.unique():
        history = stats.loc[stats['fighter_id'] == fighter,
                            ['bout_id', 'fight_date', 'length', 'won']]
        history = history.sort_values('fight_date')
        streak_rows.append({
            'len_avg': history['length'].mean(),
            'win_avg': history['won'].mean(),
            'win_streak': _signed_streak(history['won'].values),
            'fighter_id': fighter,
            # The summary is attached to the fighter's first bout in ``data``.
            'bout_id': data.loc[data['fighter_id'] == fighter, 'bout_id'].values[0],
        })
    streak_avg_data = pd.DataFrame(
        streak_rows,
        columns=['len_avg', 'win_avg', 'win_streak', 'fighter_id', 'bout_id'])
    data = pd.merge(data, streak_avg_data, on=['bout_id', 'fighter_id'])

    pred_data = {}
    hold_cols = ['bout_id', 'fighter_id', 'fight_date', 'opponent_id']
    for bout in data['bout_id'].unique():
        # sample(frac=1) shuffles the rows, so which fighter ends up as
        # row 0 (the reference side of every difference) is random.
        bout_data = data.loc[data['bout_id'] == bout].sample(frac=1)
        if len(bout_data) != 2:
            continue
        bout_data = bout_data.reset_index(drop=True)
        pred_data[bout] = _matchup_features(bout_data, hold_cols)

    pred_data = pd.DataFrame.from_dict(pred_data).T
    pred_data.index.name = 'bout_id'

    bouts = pg_query(PSQL.client, "select b.bout_id, weight_desc from ufc.bouts b join ufc.fights f on f.fight_id = b.fight_id join ufc.weights w on b.weight_id = w.weight_id")
    bouts.columns = ['bout_id', 'weight_id']
    # One-hot encode the weight description, indexed by bout id for the join.
    weights = pd.get_dummies(bouts['weight_id'])
    weights['index'] = bouts['bout_id']
    weights.drop_duplicates(inplace=True)
    weights.set_index('index', inplace=True)
    META_X = pred_data.join(weights)

    return pred_data, META_X
예제 #7
0
# Script fragment: builds predictions for the upcoming bouts and then prompts
# for betting odds. Relies on pull_pred_data, comb_preds, pg_query, PSQL and
# odds_converter being defined elsewhere.
PRED_X, META_X= pull_pred_data()

# Keep the fighter ids so the predictions can be labelled afterwards.
meta = PRED_X[['fighter_id', 'opponent_id']]
win_predictions, win_predicted_errors = comb_preds('winner', PRED_X, META_X)
len_predictions, len_predicted_errors = comb_preds('length', PRED_X, META_X)

# The names become the column names in the join below.
win_predictions.name = 'win'
win_predicted_errors.name = 'win_error'

len_predictions.name = 'length'
len_predicted_errors.name = 'length_error'


predictions = meta.join(win_predictions).join(win_predicted_errors).join(len_predictions).join(len_predicted_errors)

# fighter_id -> name
fighters = pg_query(PSQL.client, "select fighter_id, name from ufc.fighters")
fighters = {k:v for k,v in fighters.values}

# Replace the ids with fighter names; the column names are left unchanged.
predictions['fighter_id'] = predictions['fighter_id'].apply(lambda x: fighters[x])
predictions['opponent_id'] = predictions['opponent_id'].apply(lambda x: fighters[x])

# name -> fighter_id. NOTE(review): this repeats the query issued just above.
f_to_code = pg_query(PSQL.client, "select fighter_id, name from ufc.fighters")
f_to_code = {v:k for k,v in f_to_code.values}

odds = {}

for bout in predictions.index:
    print('Fight: %s VS %s' % (predictions.loc[bout]['fighter_id'], predictions.loc[bout]['opponent_id']))
    win_odds = input('Vegas odds for %s:   ' % (predictions.loc[bout]['fighter_id']))
    win_prob = odds_converter(win_odds)
    # NOTE(review): unlike win_odds, this raw input is stored as lose_prob
    # without going through odds_converter -- confirm against the rest of
    # the script, which is not visible here.
    lose_prob = input('Vegas odds for %s:   ' % (predictions.loc[bout]['opponent_id']))
예제 #8
0
def _finite_or_zero(value):
    """Return ``value`` unless it is NaN or +/-infinity, in which case 0."""
    if value == value and value not in [np.inf, -np.inf]:
        return value
    return 0


def _add_ratio_features(frame, prefix, acc_stats, share_stats):
    """Add accuracy and share ratio columns to ``frame`` in place.

    For every ``name: [attempted, landed]`` pair in ``acc_stats`` the column
    ``<prefix>_<side>_<name>`` is landed / attempted.  For every pair in
    ``share_stats`` the ``..._a`` column is attempted / ``ssa`` and the
    ``..._s`` column is landed / ``sss``.  ``side`` is ``o`` then ``d``.
    NaN and infinite ratios are replaced by 0.
    """
    for name, (attempted, landed) in acc_stats.items():
        for side in ('o', 'd'):
            stem = '%s_%s_' % (prefix, side)
            ratio = frame[stem + landed] / frame[stem + attempted]
            frame[stem + name] = ratio.apply(_finite_or_zero)

    for name, (attempted, landed) in share_stats.items():
        for suffix, part, total in (('_a', attempted, 'ssa'),
                                    ('_s', landed, 'sss')):
            for side in ('o', 'd'):
                stem = '%s_%s_' % (prefix, side)
                ratio = frame[stem + part] / frame[stem + total]
                frame[stem + name + suffix] = ratio.apply(_finite_or_zero)


def _add_fighter_attributes(data):
    """Return ``data`` with ``age``, ``reach`` and ``height`` columns added.

    Age is the number of days between ``fight_date`` and the fighter's date
    of birth divided by 365.  A fighter missing from ``ufc.fighters`` raises
    ``KeyError``.
    """
    fighters = pg_query(
        PSQL.client,
        'select fighter_id, height, reach, stance, dob from ufc.fighters')
    fighters.columns = ['fighter_id', 'height', 'reach', 'stance', 'dob']

    born = {fid: dob for fid, dob in fighters[['fighter_id', 'dob']].values}
    age_rows = []
    for bout, fighter, date in data[['bout_id', 'fighter_id',
                                     'fight_date']].values:
        # Only the date part is parsed; any time-of-day suffix is dropped.
        fight_day = datetime.strptime(str(date).split(' ')[0], '%Y-%m-%d')
        age_rows.append({'bout_id': bout,
                         'fighter_id': fighter,
                         'age': (fight_day - born[fighter]).days / 365})
    ages = pd.DataFrame(age_rows, columns=['bout_id', 'fighter_id', 'age'])
    data = pd.merge(data, ages, on=['bout_id', 'fighter_id'])

    for attr in ('reach', 'height'):
        lookup = {
            fid: val
            for fid, val in fighters[['fighter_id', attr]].values
        }
        data[attr] = data['fighter_id'].apply(lambda fid: lookup[fid])
    return data


def _label_winners(pairs, results):
    """Return a frame of (bout_id, fighter_id, won) for ``pairs``.

    ``results`` maps bout id to ``{'winner': ..., 'loser': ...}``.  Pairs
    whose bout has no result are skipped.

    Raises:
        ValueError: if the fighter is neither the winner nor the loser.
    """
    rows = []
    for bout, fighter in pairs:
        outcome = results.get(bout)
        if outcome is None:
            continue
        if outcome['winner'] == fighter:
            won = 1
        elif outcome['loser'] == fighter:
            won = 0
        else:
            raise ValueError(
                'fighter %s is neither winner nor loser of bout %s'
                % (fighter, bout))
        rows.append({'bout_id': bout, 'fighter_id': fighter, 'won': won})
    return pd.DataFrame(rows, columns=['bout_id', 'fighter_id', 'won'])


def _signed_streak(results):
    """Return the length of the trailing run of equal results.

    ``results`` is a non-empty chronological sequence of 1 (win) / 0 (loss).
    The count is negated when the most recent result is a loss.
    """
    last = results[-1]
    run = 0
    for res in reversed(results):
        if res != last:
            break
        run += 1
    return -run if last == 0 else run


def _matchup_features(pair, hold_cols, drop=()):
    """Turn a two-row frame (index 0 and 1) into one feature dict.

    Every non-``hold_cols`` column yields ``<col>_diff`` (row 0 - row 1) and
    ``<col>_avg``; keys listed in ``drop`` are then removed.  Columns
    containing ``_o_`` / ``_d_`` also yield ``<col>_xdif``: row 0's value
    minus row 1's value of the mirrored ``_d_`` / ``_o_`` column.  Row 0's
    ``hold_cols`` values are copied over, except ``bout_id``.
    """
    meta = pair[hold_cols]
    feats = pair[[col for col in list(pair) if col not in hold_cols]]
    first, second = feats.loc[0], feats.loc[1]

    row = {}
    for col, val in (first - second).to_dict().items():
        row[col + '_diff'] = val
    for col, val in ((first + second) / 2).to_dict().items():
        row[col + '_avg'] = val
    for key in drop:
        row.pop(key)

    for col in list(feats):
        if '_o_' in col:
            mirror = col.replace('_o_', '_d_')
        elif '_d_' in col:
            mirror = col.replace('_d_', '_o_')
        else:
            continue
        row[col + '_xdif'] = first[col] - second[mirror]

    for col, val in meta.loc[0].to_dict().items():
        row[col] = val
    row.pop('bout_id')
    return row


def _oriented_matchups(data, hold_cols, order):
    """Build one training row per two-fighter bout with rows taken in ``order``.

    ``order`` is ``[0, 1]`` or ``[1, 0]`` and selects which fighter is the
    reference side.  ``won_diff`` is exposed as ``winner`` and
    ``length_avg`` as ``length``; ``bout_id`` is a regular column.
    """
    rows = {}
    for bout in data['bout_id'].unique():
        pair = data.loc[data['bout_id'] == bout]
        if len(pair) != 2:
            continue
        pair = pair.iloc[order].reset_index(drop=True)
        rows[bout] = _matchup_features(
            pair, hold_cols, drop=('won_avg', 'length_diff'))

    frame = pd.DataFrame.from_dict(rows).T
    frame.reset_index(inplace=True)
    frame.rename(columns={'index': 'bout_id',
                          'won_diff': 'winner',
                          'length_avg': 'length'},
                 inplace=True)
    return frame


def pull_pred_data(only_avg=False):
    """Build and save the training set for the winner model.

    Average stats (and, unless ``only_avg`` is set, adjusted-average stats)
    are extended with ratio features, fighter age/reach/height, the bout
    outcome and length, and each fighter's pre-bout length/win averages and
    streak.  Every two-fighter bout then contributes two rows, one per
    fighter ordering.  The result, indexed by ``bout_id`` and without the
    ``length`` column, is written to ``<cur_path>/data/only_avg/winner_data.csv``
    when ``only_avg`` is true and ``<cur_path>/data/winner_data.csv``
    otherwise.

    Args:
        only_avg: when true, skip the adjusted-average stats.  The argument
            is now honoured; previously it was overwritten with ``True`` at
            the top of the function.

    Returns:
        None.  The result is only written to disk.

    Raises:
        ValueError: if a fighter is neither winner nor loser of a bout.
    """
    keys = ['bout_id', 'fighter_id', 'fight_date', 'opponent_id']
    avg_data = pull_avg_data()

    # name -> [attempted column, landed column]
    acc_stat_dict = {
        'acc_ss': ['ssa', 'sss'],
        'acc_headss': ['headssa', 'headsss'],
        'acc_bodyss': ['bodyssa', 'bodysss'],
        'acc_legss': ['legssa', 'legsss'],
        'acc_disss': ['disssa', 'dissss'],
        'acc_clinss': ['clinssa', 'clinsss'],
        'acc_gndss': ['gndssa', 'gndsss'],
        'acc_td': ['tda', 'tds']
    }
    share_ss_dict = {
        'share_headss': ['headssa', 'headsss'],
        'share_bodyss': ['bodyssa', 'bodysss'],
        'share_legss': ['legssa', 'legsss'],
        'share_disss': ['disssa', 'dissss'],
        'share_clinss': ['clinssa', 'clinsss'],
        'share_gndss': ['gndssa', 'gndsss']
    }

    _add_ratio_features(avg_data, 'avg', acc_stat_dict, share_ss_dict)
    if only_avg:
        data = avg_data
    else:
        adj_avg_data = pull_adj_avg_data()
        _add_ratio_features(adj_avg_data, 'adj_avg', acc_stat_dict,
                            share_ss_dict)
        data = pd.merge(avg_data, adj_avg_data, on=keys)

    data.dropna(inplace=True)
    data = _add_fighter_attributes(data)

    bouts = pg_query(
        PSQL.client,
        "select b.bout_id, weight_id, winner, loser from ufc.bouts b join ufc.bout_results br on br.bout_id = b.bout_id join ufc.fights f on f.fight_id = b.fight_id"
    )
    bouts.columns = ['bout_id', 'weight_id', 'winner', 'loser']
    results = {
        bout: {'winner': winner, 'loser': loser}
        for bout, winner, loser in bouts[['bout_id', 'winner', 'loser']].values
    }

    # Attach the outcome and length of each bout; the inner merges drop
    # bouts that have no recorded result.
    data = pd.merge(
        data,
        _label_winners(data[['bout_id', 'fighter_id']].values, results),
        on=['bout_id', 'fighter_id'])

    bout_len = pg_query(PSQL.client,
                        "SELECT bout_id, length from ufc.bout_results")
    bout_len.columns = ['bout_id', 'length']
    data = pd.merge(data, bout_len, on='bout_id')

    stats = pull_stats()
    stats = pd.merge(stats, bout_len, on='bout_id')
    stats = pd.merge(
        stats,
        _label_winners(stats[['bout_id', 'fighter_id']].values, results),
        on=['bout_id', 'fighter_id'])

    # For each fighter's second and later bouts, summarise only the bouts
    # that came before it, so no information from the bout itself leaks in.
    streak_rows = []
    for fighter in stats.fighter_id.unique():
        history = stats.loc[stats['fighter_id'] == fighter,
                            ['bout_id', 'fight_date', 'length', 'won']]
        history = history.sort_values('fight_date')
        bout_ids = history['bout_id'].values
        won = history['won'].values

        # Keyed by bout id so a repeated bout id keeps only its last entry.
        per_bout = {}
        for pos in range(1, len(history)):
            per_bout[bout_ids[pos]] = {
                'len_avg': history['length'].iloc[:pos].mean(),
                'win_avg': history['won'].iloc[:pos].mean(),
                'win_streak': _signed_streak(won[:pos]),
                'fighter_id': fighter,
                'bout_id': bout_ids[pos],
            }
        streak_rows.extend(per_bout.values())

    streak_avg_data = pd.DataFrame(
        streak_rows,
        columns=['len_avg', 'win_avg', 'win_streak', 'fighter_id', 'bout_id'])
    data = pd.merge(data, streak_avg_data, on=['bout_id', 'fighter_id'])

    hold_cols = ['bout_id', 'fighter_id', 'fight_date', 'opponent_id']
    # Each bout appears twice, once from each fighter's point of view.
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    pred_data = pd.concat([
        _oriented_matchups(data, hold_cols, [0, 1]),
        _oriented_matchups(data, hold_cols, [1, 0]),
    ])

    winner_cols = [col for col in list(pred_data) if col != 'length']
    pred_data_winner = pred_data[winner_cols].set_index('bout_id')

    if only_avg:
        pred_data_winner.to_csv(
            os.path.join(cur_path, 'data', 'only_avg', 'winner_data.csv'))
    else:
        pred_data_winner.to_csv(
            os.path.join(cur_path, 'data', 'winner_data.csv'))
예제 #9
0
파일: gen_preds.py 프로젝트: eghensley/ufc
from _connections import db_connection
from pop_psql import pg_query

# Load the pre-computed prediction features and index them by bout.
pred_data = (
    pd.read_csv(os.path.join(cur_path, 'test_data', 'pred_data_TEST.csv'))
    .drop('Unnamed: 0', axis=1)  # stale index column written by to_csv
    .set_index('bout_id')
)

# Identifier columns that are carried along but never fed to a model.
hold_cols = ['fighter_id', 'fight_date', 'opponent_id']

meta = pred_data[hold_cols]
X = pred_data[[col for col in pred_data.columns if col not in hold_cols]]

PSQL = db_connection('psql')
bouts = pg_query(
    PSQL.client,
    "select b.bout_id, weight_desc from ufc.bouts b join ufc.fights f on f.fight_id = b.fight_id join ufc.weights w on b.weight_id = w.weight_id"
)
# The second column holds weight_desc but is labelled 'weight_id' here.
bouts.columns = ['bout_id', 'weight_id']

# One indicator column per weight label, keyed by bout so it can be joined
# onto the feature matrix.
weights = pd.get_dummies(bouts['weight_id'])
weights['index'] = bouts['bout_id']
weights = weights.drop_duplicates().set_index('index')
meta_X = X.join(weights)

length_preds = {}
length_error_preds = {}
for mod in os.listdir(os.path.join(cur_path, 'fit_models', 'length')):
    length_model = load(os.path.join(cur_path, 'fit_models', 'length', mod))
    feats_folder = os.path.join(cur_path, 'modelling', 'length', 'final',
                                'features')
    with open(os.path.join(feats_folder, '%s.json' % (mod.split('.')[0])),
예제 #10
0
def pull_pred_data(avg_data, adj_avg_data):
    """Build the per-bout feature table and write it to ``pred_data_TEST.csv``.

    Both arguments are ignored: they are immediately replaced by the frames
    read from ``test_data/avg_data_TEST.csv`` and
    ``test_data/adj_avg_data_TEST.csv`` under ``cur_path``.

    Steps:
      1. Add accuracy (``acc_*``) and share (``share_*_a`` / ``share_*_s``)
         ratio columns to both frames, for the ``_o_`` and ``_d_`` sides.
      2. Merge the two frames on the bout/fighter key columns and drop rows
         containing NaN.
      3. Attach each fighter's age at the fight date, reach and height
         (queried from ``ufc.fighters``).
      4. For every bout with exactly two rows, emit one record holding the
         row difference (``*_diff``), the row mean (``*_avg``), the
         cross-side differences (``*_xdif``) and the first row's identifiers.

    Returns nothing; the result is written to
    ``test_data/pred_data_TEST.csv``.
    """
    test_dir = os.path.join(cur_path, 'test_data')
    key_cols = ['bout_id', 'fighter_id', 'fight_date', 'opponent_id']

    avg_data = pd.read_csv(os.path.join(test_dir, 'avg_data_TEST.csv'))
    avg_data = avg_data.drop('Unnamed: 0', axis=1)

    adj_avg_data = pd.read_csv(
        os.path.join(test_dir, 'adj_avg_data_TEST.csv'))
    adj_avg_data = adj_avg_data.drop('Unnamed: 0', axis=1)

    # name -> [column used for the '_a' share, column used for the '_s' share]
    acc_stat_dict = {
        'acc_ss': ['ssa', 'sss'],
        'acc_headss': ['headssa', 'headsss'],
        'acc_bodyss': ['bodyssa', 'bodysss'],
        'acc_legss': ['legssa', 'legsss'],
        'acc_disss': ['disssa', 'dissss'],
        'acc_clinss': ['clinssa', 'clinsss'],
        'acc_gndss': ['gndssa', 'gndsss'],
        'acc_td': ['tda', 'tds']
    }
    share_ss_dict = {
        'share_headss': ['headssa', 'headsss'],
        'share_bodyss': ['bodyssa', 'bodysss'],
        'share_legss': ['legssa', 'legsss'],
        'share_disss': ['disssa', 'dissss'],
        'share_clinss': ['clinssa', 'clinsss'],
        'share_gndss': ['gndssa', 'gndsss']
    }

    def _finite_or_zero(value):
        # NaN is the only value that is not equal to itself.
        if value == value and value not in [np.inf, -np.inf]:
            return value
        return 0

    def _add_ratio_columns(frame, prefix):
        # Columns are appended in a fixed order because that order carries
        # through to the column order of the written CSV.
        for name, (a_col, s_col) in acc_stat_dict.items():
            for side in ('o', 'd'):
                stem = '%s_%s_' % (prefix, side)
                ratio = frame[stem + s_col] / frame[stem + a_col]
                frame[stem + name] = ratio.apply(_finite_or_zero)

        for name, (a_col, s_col) in share_ss_dict.items():
            for suffix, part, total in (('_a', a_col, 'ssa'),
                                        ('_s', s_col, 'sss')):
                for side in ('o', 'd'):
                    stem = '%s_%s_' % (prefix, side)
                    ratio = frame[stem + part] / frame[stem + total]
                    frame[stem + name + suffix] = ratio.apply(_finite_or_zero)

    _add_ratio_columns(avg_data, 'avg')
    _add_ratio_columns(adj_avg_data, 'adj_avg')

    data = avg_data.merge(adj_avg_data, left_on=key_cols, right_on=key_cols)
    data = data.dropna()

    fighters = pg_query(
        PSQL.client,
        'select fighter_id, height, reach, stance, dob from ufc.fighters')
    fighters.columns = ['fighter_id', 'height', 'reach', 'stance', 'dob']
    fighters = fighters.set_index('fighter_id')
    fighters = fighters.join(pd.get_dummies(fighters['stance']))
    fighters = (fighters.drop('stance', axis=1)
                .rename(columns={'': 'Missing Stance'})
                .reset_index())

    dob_by_fighter = {
        fid: dob
        for fid, dob in fighters[['fighter_id', 'dob']].values
    }

    # One {bout_id, fighter_id, age} record per row of ``data``; age is the
    # whole-day gap between fight date and date of birth, divided by 365.
    age_rows = {}
    age_inputs = data[['bout_id', 'fighter_id', 'fight_date']].values
    for row_no, (bout, fighter, date) in enumerate(age_inputs):
        fight_day = datetime.strptime(date.split(' ')[0], '%Y-%m-%d')
        age_rows[row_no] = {
            'bout_id': bout,
            'fighter_id': fighter,
            'age': (fight_day - dob_by_fighter[fighter]).days / 365
        }
    data = data.merge(pd.DataFrame.from_dict(age_rows).T,
                      left_on=['bout_id', 'fighter_id'],
                      right_on=['bout_id', 'fighter_id'])

    reach_by_fighter = {
        fid: reach
        for fid, reach in fighters[['fighter_id', 'reach']].values
    }
    height_by_fighter = {
        fid: height
        for fid, height in fighters[['fighter_id', 'height']].values
    }
    data['reach'] = data['fighter_id'].apply(lambda fid: reach_by_fighter[fid])
    data['height'] = data['fighter_id'].apply(
        lambda fid: height_by_fighter[fid])

    pred_data = {}
    for bout in data['bout_id'].unique():
        # sample(frac=1) returns the bout's rows in random order, so which
        # fighter ends up as row 0 is random.
        bout_data = data.loc[data['bout_id'] == bout].sample(frac=1)
        if len(bout_data) != 2:
            continue
        bout_data = bout_data.reset_index(drop=True)
        bout_meta = bout_data[key_cols]
        bout_data = bout_data[[
            col for col in list(bout_data) if col not in key_cols
        ]]

        flipped = bout_data.T
        first, second = flipped[0], flipped[1]
        bout_preds = {
            name + '_diff': value
            for name, value in (first - second).to_dict().items()
        }
        bout_preds.update({
            name + '_avg': value
            for name, value in ((first + second) / 2).to_dict().items()
        })

        # Cross differences: row 0's '_o_' column against row 1's matching
        # '_d_' column, and vice versa.
        row0, row1 = bout_data.loc[0], bout_data.loc[1]
        for col in list(bout_data):
            if '_o_' in col:
                mirror = col.replace('_o_', '_d_')
            elif '_d_' in col:
                mirror = col.replace('_d_', '_o_')
            else:
                continue
            bout_preds[col + '_xdif'] = row0[col] - row1[mirror]

        # Carry row 0's identifiers; bout_id becomes the record key instead.
        bout_preds.update(bout_meta.T[0].to_dict())
        bout_preds.pop('bout_id')
        pred_data[bout] = bout_preds

    pred_data = (pd.DataFrame.from_dict(pred_data).T
                 .reset_index()
                 .rename(columns={'index': 'bout_id'}))
    pred_data.to_csv(os.path.join(test_dir, 'pred_data_TEST.csv'))
예제 #11
0
파일: proc.py 프로젝트: eghensley/ufc
def pull_pred_data(avg_data, adj_avg_data):
    """Return the per-bout winner and length prediction tables.

    If both ``test_data/pred_data_winner.csv`` and
    ``test_data/pred_data_length.csv`` exist under ``cur_path`` they are
    loaded and returned as-is. Otherwise the tables are rebuilt from the
    inputs and the database, written to those two files, and returned.

    Args:
        avg_data: DataFrame with the key columns ``bout_id``, ``fighter_id``,
            ``fight_date``, ``opponent_id`` and ``avg_o_*`` / ``avg_d_*``
            stat columns. Ratio columns are added to it IN PLACE. Unused
            when the cached files exist.
        adj_avg_data: Same layout with ``adj_avg_o_*`` / ``adj_avg_d_*``
            columns; also modified in place. Unused when the cached files
            exist.

    Returns:
        Tuple ``(pred_data_winner, pred_data_length)``. Both hold one row per
        bout; the first omits the ``length`` column and the second omits the
        ``winner`` column. ``winner`` is the difference of the two fighters'
        ``won`` flags (row 0 minus row 1 after a random shuffle).

    Raises:
        ValueError: if a fighter row belongs to a known bout but is neither
            its recorded winner nor its recorded loser.
    """
    cache_dir = os.path.join(cur_path, 'test_data')
    winner_csv = os.path.join(cache_dir, 'pred_data_winner.csv')
    length_csv = os.path.join(cache_dir, 'pred_data_length.csv')

    # Both cache files live in test_data/. The existence check must look at
    # the same paths that are read here and written at the end of the
    # function, otherwise the cache is never used.
    if os.path.exists(winner_csv) and os.path.exists(length_csv):
        pred_data_winner = pd.read_csv(winner_csv)
        pred_data_winner.drop('Unnamed: 0', inplace=True, axis=1)

        pred_data_length = pd.read_csv(length_csv)
        pred_data_length.drop('Unnamed: 0', inplace=True, axis=1)
        return (pred_data_winner, pred_data_length)

    key_cols = ['bout_id', 'fighter_id', 'fight_date', 'opponent_id']

    # name -> [column used for the '_a' share, column used for the '_s' share]
    acc_stat_dict = {'acc_ss': ['ssa', 'sss'],
                     'acc_headss': ['headssa', 'headsss'],
                     'acc_bodyss': ['bodyssa', 'bodysss'],
                     'acc_legss': ['legssa', 'legsss'],
                     'acc_disss': ['disssa', 'dissss'],
                     'acc_clinss': ['clinssa', 'clinsss'],
                     'acc_gndss': ['gndssa', 'gndsss'],
                     'acc_td': ['tda', 'tds']}
    share_ss_dict = {'share_headss': ['headssa', 'headsss'],
                     'share_bodyss': ['bodyssa', 'bodysss'],
                     'share_legss': ['legssa', 'legsss'],
                     'share_disss': ['disssa', 'dissss'],
                     'share_clinss': ['clinssa', 'clinsss'],
                     'share_gndss': ['gndssa', 'gndsss']}

    def _finite_or_zero(value):
        # Division by zero yields NaN or +/-inf; map those to 0.
        # NaN is the only value that is not equal to itself.
        if value == value and value not in [np.inf, -np.inf]:
            return value
        return 0

    def _add_ratio_columns(frame, prefix):
        # Adds '<prefix>_<side>_<name>' columns to ``frame`` in place. The
        # insertion order is kept stable because it determines the column
        # order of the final tables.
        for name, (a_col, s_col) in acc_stat_dict.items():
            for side in ('o', 'd'):
                stem = '%s_%s_' % (prefix, side)
                ratio = frame[stem + s_col] / frame[stem + a_col]
                frame[stem + name] = ratio.apply(_finite_or_zero)

        for name, (a_col, s_col) in share_ss_dict.items():
            for suffix, part, total in (('_a', a_col, 'ssa'),
                                        ('_s', s_col, 'sss')):
                for side in ('o', 'd'):
                    stem = '%s_%s_' % (prefix, side)
                    ratio = frame[stem + part] / frame[stem + total]
                    frame[stem + name + suffix] = ratio.apply(_finite_or_zero)

    _add_ratio_columns(avg_data, 'avg')
    _add_ratio_columns(adj_avg_data, 'adj_avg')

    data = pd.merge(avg_data, adj_avg_data, on=key_cols)
    data.dropna(inplace=True)

    fighters = pg_query(PSQL.client, 'select fighter_id, height, reach, stance, dob from ufc.fighters')
    fighters.columns = ['fighter_id', 'height', 'reach', 'stance', 'dob']
    fighters.set_index('fighter_id', inplace=True)
    # NOTE: the stance indicator columns built here are not read again in
    # this function; only dob, reach and height are used below.
    fighters = fighters.join(pd.get_dummies(fighters['stance']))
    fighters.drop('stance', axis=1, inplace=True)
    fighters.rename(columns={'': 'Missing Stance'}, inplace=True)
    fighters.reset_index(inplace=True)
    fighter_dob = {fid: dob for fid, dob in fighters[['fighter_id', 'dob']].values}

    # Age = whole days between fight date and date of birth, divided by 365.
    fighter_ages = {}
    age_inputs = data[['bout_id', 'fighter_id', 'fight_date']].values
    for row_no, (bout, fighter, date) in enumerate(age_inputs):
        fight_day = datetime.strptime(date, '%Y-%m-%d')
        fighter_ages[row_no] = {'bout_id': bout,
                                'fighter_id': fighter,
                                'age': (fight_day - fighter_dob[fighter]).days / 365}
    data = pd.merge(data, pd.DataFrame.from_dict(fighter_ages).T,
                    on=['bout_id', 'fighter_id'])

    fighter_reach = {fid: reach for fid, reach in fighters[['fighter_id', 'reach']].values}
    fighter_height = {fid: height for fid, height in fighters[['fighter_id', 'height']].values}
    data['reach'] = data['fighter_id'].apply(lambda fid: fighter_reach[fid])
    data['height'] = data['fighter_id'].apply(lambda fid: fighter_height[fid])

    bouts = pg_query(PSQL.client, "select b.bout_id, weight_id, winner, loser from ufc.bouts b join ufc.bout_results br on br.bout_id = b.bout_id join ufc.fights f on f.fight_id = b.fight_id")
    bouts.columns = ['bout_id', 'weight_id', 'winner', 'loser']
    bouts = {b: {'winner': w, 'loser': l}
             for b, w, l in bouts[['bout_id', 'winner', 'loser']].values}

    # Label every fighter row with won = 1/0. Rows whose bout has no
    # recorded result are left out and disappear in the merge below.
    winner = {}
    for b, f in data[['bout_id', 'fighter_id']].values:
        if b not in bouts:
            continue
        if bouts[b]['winner'] == f:
            won = 1
        elif bouts[b]['loser'] == f:
            won = 0
        else:
            raise ValueError(
                'fighter %s is neither winner nor loser of bout %s' % (f, b))
        winner[len(winner)] = {'bout_id': b, 'fighter_id': f, 'won': won}
    winner = pd.DataFrame.from_dict(winner).T
    data = pd.merge(data, winner, on=['bout_id', 'fighter_id'])

    bout_len = pg_query(PSQL.client, "SELECT bout_id, length from ufc.bout_results")
    bout_len.columns = ['bout_id', 'length']

    pred_data = {}
    for bout in data['bout_id'].unique():
        # sample(frac=1) shuffles the bout's rows, so which fighter is row 0
        # (and therefore the sign of every *_diff value) is random.
        bout_data = data.loc[data['bout_id'] == bout].sample(frac=1)
        if len(bout_data) != 2:
            continue
        bout_data.reset_index(inplace=True, drop=True)
        bout_meta = bout_data[key_cols]
        bout_data = bout_data[[col for col in list(bout_data) if col not in key_cols]]

        flipped = bout_data.T
        first, second = flipped[0], flipped[1]
        bout_preds = {}
        for name, value in (first - second).to_dict().items():
            bout_preds[name + '_diff'] = value
        for name, value in ((first + second) / 2).to_dict().items():
            bout_preds[name + '_avg'] = value

        # Cross differences: row 0's '_o_' column against row 1's matching
        # '_d_' column, and vice versa.
        row0, row1 = bout_data.loc[0], bout_data.loc[1]
        for col in list(bout_data):
            if '_o_' in col:
                mirror = col.replace('_o_', '_d_')
            elif '_d_' in col:
                mirror = col.replace('_d_', '_o_')
            else:
                continue
            bout_preds[col + '_xdif'] = row0[col] - row1[mirror]

        # Carry row 0's identifiers; bout_id becomes the record key instead.
        bout_preds.update(bout_meta.T[0].to_dict())
        bout_preds.pop('bout_id')
        pred_data[bout] = bout_preds

    pred_data = pd.DataFrame.from_dict(pred_data).T
    pred_data.reset_index(inplace=True)
    pred_data.rename(columns={'index': 'bout_id'}, inplace=True)
    pred_data = pd.merge(pred_data, bout_len, on='bout_id')
    pred_data.rename(columns={'won_diff': 'winner'}, inplace=True)
    pred_data.drop('won_avg', axis=1, inplace=True)

    pred_data_length = pred_data[[col for col in list(pred_data) if col != 'winner']]
    pred_data_winner = pred_data[[col for col in list(pred_data) if col != 'length']]

    pred_data_winner.to_csv(winner_csv)
    pred_data_length.to_csv(length_csv)

    return (pred_data_winner, pred_data_length)