Exemplo n.º 1
0
#We are interested in looking at the confusion matrices

import header as h
import pandas as pd
import numpy as np
import fs_definitions as fsd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

# Build the master DataFrame once; every feature set below is prepped from it.
df = h.create_master_df()  #Create the master df

# For each feature set named in h.FEATURES, derive the feature matrix X and
# the target vector y from the prepped DataFrame.
for fs in h.FEATURES:

    temp_df = fsd.create_prepped_df(fs, df)  #create the prepped df
    y = temp_df['label']  #Target vector: the 'label' column

    #Remove the winner and label from the features
    temp_prepped_df = temp_df.drop(['Winner', 'label'], axis=1)

    X = temp_prepped_df.values  #Feature matrix as a NumPy array

    #class_names = temp_df.Winner #This should match to the winners?
    # NOTE(review): hard-coded display names; presumably their order matches
    # the encoding of 'label' -- confirm before using them as axis labels.
    class_names = ['Blue', 'Red']

    print(class_names)

    # Debug output: the row counts of X and y should agree.
    print(X.shape)
    print(y.shape)
Exemplo n.º 2
0
# Let pandas print up to 500 rows before truncating its output.
pd.set_option('display.max_rows', 500)  #Used for debugging

# Earlier debugging experiments, kept for reference:
#df = h.create_fight_df(h.MASTER_CSV_FILE)
#print(df.head)
#print(df.describe())
#print(len(df))
#print(df.head)
#print(df.dtypes)

# Build the master DataFrame that the prepped DataFrame is derived from.
df = h.create_master_df()
#print(df.head)
#print(len(df))

# Prep the data for the 'c1' feature set.
temp_df = fsd.create_prepped_df('c1', df)

#print(temp_df)

# X: every column except the last two. y: the last column only, kept
# two-dimensional (n_rows, 1) because it is selected with the slice -1:.
# NOTE(review): presumably the last two columns are 'Winner' and 'label' --
# confirm against fsd.create_prepped_df.
X = temp_df.iloc[:, :-2].values
y = temp_df.iloc[:, -1:].values

#print(X)

#print(y)

# Debug output: the row counts of X and y should agree.
print(X.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
Exemplo n.º 3
0
def get_recent_probs(df: pd.DataFrame, fs: str) -> pd.DataFrame:
    """
    Returns a probability DataFrame for the 50 most recent event dates.

    Each of the (up to) 50 latest distinct ``date_final`` values is held out
    in turn: a fresh classifier is fitted on the rows of every *other* date
    (earlier and later alike) and then used to score the held-out rows. The
    scored rows of all dates are stacked into one DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Master DataFrame
    fs : str
        The feature set

    Returns
    -------
    pd.DataFrame
        The probability DataFrame: the feature columns, followed by
        'B_prob', 'R_prob', 'preds', 'date_final', 'B_ev_final',
        'R_ev_final', 'Winner', 'label' and 'country_final'. Row indices
        restart at 0 for every date. ``None`` is returned when the prepped
        data contains no dates at all.

    Notes
    -----
    The 'B_prob' / 'R_prob' names assume ``predict_proba`` yields exactly two
    columns in that order -- TODO confirm against the classifier's classes.

    """

    prepped_df = fsd.create_prepped_df(fs, df)

    # Distinct dates, newest first, capped at 50.
    list_of_dates = np.flip(np.sort(prepped_df['date_final'].unique()))[:50]

    print(list_of_dates)

    y = prepped_df[['label', 'date_final']]
    ev_df = prepped_df[[
        'date_final', 'B_ev_final', 'R_ev_final', 'Winner', 'label',
        'country_final'
    ]]

    prepped_df = prepped_df.drop(
        ['Winner', 'label', 'R_ev_final', 'B_ev_final', 'country_final'],
        axis=1)

    # The output column names do not depend on the date, so build them once.
    col_names = np.append(
        prepped_df.drop(['date_final'], axis=1).columns.values, [
            'B_prob', 'R_prob', 'preds', 'date_final', 'B_ev_final',
            'R_ev_final', 'Winner', 'label', 'country_final'
        ])

    per_date_frames = []

    for d in list_of_dates:
        # y and ev_df are column subsets of the same prepped frame, so one
        # positional mask selects the matching rows in all three.
        is_test = (prepped_df['date_final'] == d).values
        is_train = ~is_test

        # The date is only the split key; it must not reach the classifier.
        X_test = prepped_df.loc[is_test].drop('date_final', axis=1)
        X_train = prepped_df.loc[is_train].drop('date_final', axis=1)
        y_test = y.loc[is_test].drop('date_final', axis=1)
        y_train = y.loc[is_train].drop('date_final', axis=1)
        X_test_ev = ev_df.loc[is_test]
        X_train_ev = ev_df.loc[is_train]

        print(X_test.shape)
        print(X_train.shape)
        print(y_test.shape)
        print(y_train.shape)
        print(X_test_ev.shape)
        print(X_train_ev.shape)
        print()
        print()

        classifier = get_classifier(fs)
        classifier.fit(X_train, y_train.values.ravel())
        probs = classifier.predict_proba(X_test)
        preds = classifier.predict(X_test).reshape((-1, 1))

        # Append step by step (not in one concatenate) so the numeric columns
        # are promoted together before the mixed-type odds columns are added.
        scored = np.append(X_test, probs, 1)
        scored = np.append(scored, preds, 1)
        scored = np.append(scored, X_test_ev, 1)

        per_date_frames.append(pd.DataFrame(scored, columns=col_names))

    if not per_date_frames:
        return None

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated rows on every iteration.
    return pd.concat(per_date_frames)
Exemplo n.º 4
0
import pandas as pd
import numpy as np
import fs_definitions as fsd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt
# Plot styling. NOTE(review): the second call appears to supersede the style
# chosen by the first -- confirm whether both are needed.
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# NOTE(review): `h` is not imported in the import block visible with this
# snippet -- confirm the module is imported elsewhere in the file.
df = h.create_fight_df('../data/ufc-master.csv')
fs = 'c1'
temp_df = fsd.create_prepped_df(fs, df)

# Debug output: shapes before and after prepping.
print(df.shape)
print(temp_df.shape)
y = temp_df['label']  # Target vector: the 'label' column

# The odds columns (R_ev_final, B_ev_final) must not be used as features, so
# they are copied into their own frame and dropped from the feature frame
# before any split is made.

ev_df = temp_df[['B_ev_final', 'R_ev_final']]

temp_prepped_df = temp_df.drop(['Winner', 'label', 'R_ev_final', 'B_ev_final'],
                               axis=1)

# Feature matrix and odds matrix; both come from temp_df, so row i of X and
# row i of X_ev describe the same row of temp_df.
X = temp_prepped_df.values
X_ev = ev_df.values
Exemplo n.º 5
0
def get_test_probs(df: pd.DataFrame,
                   fs: str,
                   seed: int,
                   split: float,
                   count_split=False) -> pd.DataFrame:
    """
    Returns a DataFrame that includes classification results

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame we will be analyzing (Probably the master dataframe)

    fs: feature set
        The feature set

    seed : int
        A seed for the split. Ignored when ``count_split`` is truthy.

    split : float
        the split amount. With ``count_split`` falsy it is passed to
        ``train_test_split`` as ``test_size``. With ``count_split`` truthy it
        must be an integer row count.

    count_split : bool
        When truthy, no shuffling is done: the first ``split`` rows of the
        prepped data form the test set and the remaining rows the training
        set.

    Returns
    -------
    Returns a dataframe that has been put through a classifier function.
    The dataframe holds the test rows only: the feature columns, followed by
    'B_prob', 'R_prob', 'preds', 'date_final', 'B_ev_final', 'R_ev_final',
    'Winner', 'label' and 'country_final'.

    Notes
    -----
    The 'B_prob' / 'R_prob' names assume ``predict_proba`` yields exactly two
    columns in that order -- TODO confirm against the classifier's classes.

    """
    print("\n\n******************************************************\n\n")
    print("STARTING get_test_probs")

    print("\n\n******************************************************")

    # Outline:
    #   1. Create Prepped DF
    #   2. Create alternate DF that only includes odds
    #   3. Remove certain characteristics from Prepped DF
    #   4. Split both DFs
    #   4.5: Create classifier
    #   5. Run classification
    #   6. Append odds and probs and preds and winner and label to df
    #   7. Return it.

    #1. create prepped df
    prepped_df = fsd.create_prepped_df(fs, df)

    #2. Create alternate DF that only includes odds
    ev_df = prepped_df[[
        'date_final', 'B_ev_final', 'R_ev_final', 'Winner', 'label',
        'country_final'
    ]]

    #3. Remove certain characteristics from Prepped DF
    y = prepped_df['label']
    prepped_df = prepped_df.drop([
        'Winner', 'label', 'R_ev_final', 'B_ev_final', 'date_final',
        'country_final'
    ],
                                 axis=1)
    X = prepped_df.values
    X_ev = ev_df.values

    #4. Split both DFs
    if count_split:
        # Deterministic head/tail split; .iloc makes the positional slicing
        # of the label Series explicit.
        X_test, X_train = X[:split], X[split:]
        X_test_ev = X_ev[:split]
        y_train = y.iloc[split:]
    else:
        # One call splits features, odds and labels with the same shuffle,
        # so the rows stay aligned by construction.
        X_train, X_test, _, X_test_ev, y_train, _ = train_test_split(
            X, X_ev, y, test_size=split, random_state=seed)

    #4.5 Create classifier
    classifier = get_classifier(fs)

    #5. Run classification

    print(f"x-train shape is {X_train.shape}")
    classifier.fit(X_train, y_train)

    probs = classifier.predict_proba(X_test)
    preds = classifier.predict(X_test).reshape((-1, 1))

    #6. Append odds and probs and preds and winner and label to df
    # Append step by step (not in one concatenate) so the numeric columns are
    # promoted together before the mixed-type odds columns are added.
    X_test = np.append(X_test, probs, 1)
    X_test = np.append(X_test, preds, 1)
    X_test = np.append(X_test, X_test_ev, 1)
    colNamesArr = np.append(prepped_df.columns.values, [
        'B_prob', 'R_prob', 'preds', 'date_final', 'B_ev_final', 'R_ev_final',
        'Winner', 'label', 'country_final'
    ])
    print(X_test.shape)
    print(colNamesArr.shape)

    #7. Return it.
    return pd.DataFrame(X_test, columns=colNamesArr)