Example #1 — prep_us_nfl_pred (data preparation for K-Means clustering)
import logging

import numpy as np

import shared_functions as sf  # assumed import path for the repo's shared helper module

# Module-level logger, mirroring the getLogger('debug') call in prep_reg_nfl_pred below
log = logging.getLogger('debug')


def prep_us_nfl_pred(input_df,
                     use_csv=False,
                     data_csv='data/output_dont_commit/reg_output.csv'):

    if use_csv:
        input_df = sf.load_model_data(data_csv)

    # Model Run - K-Means Clustering - Data Preparation
    # Adding a per-row Squared Error column (vectorized; a row-wise apply is not needed)
    input_df['Squared Error'] = (input_df['Total points'] -
                                 input_df['Total points predicted']) ** 2

    # Dropping Unwanted Columns
    us_columns_to_drop = [
        'name', 'pos', 'year', 'home', 'against', 'week', 'score', 'opp_score',
        'month', 'team', 'rush_att', 'rush_td', 'rush_yd', 'rec_td', 'rec_yd',
        'fumble', 'pass_yd', 'pass_td', 'pass_int', 'rush_att_av',
        'rush_td_av', 'rush_yd_av', 'rec_td_av', 'rec_yd_av', 'fumble_av',
        'pass_yd_av', 'pass_td_av', 'pass_int_av', 'rush_att_av^2',
        'rush_td_av^2', 'rush_yd_av^2', 'rec_td_av^2', 'rec_yd_av^2',
        'fumble_av^2', 'pass_yd_av^2', 'pass_td_av^2', 'pass_int_av^2',
        'points', 'Total points predicted'
    ]
    us_input_df = input_df.drop(us_columns_to_drop, axis=1)

    log.info(sf.Color.BOLD + sf.Color.GREEN + "Sample Clustering Input Data:" +
             sf.Color.END)
    log.info(us_input_df.head(3))

    # Converting to NumPy Array
    # (DataFrame.as_matrix() was removed in pandas 1.0; np.float was removed in NumPy 1.24)
    input_npa = us_input_df.to_numpy().astype(np.float64)

    return input_df, input_npa
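
The function returns the annotated frame plus a float matrix ready for clustering. A minimal usage sketch follows; calling with use_csv=True, the downstream KMeans call, and the choice of n_clusters=4 are illustrative assumptions, not part of the original listing:

from sklearn.cluster import KMeans

# Load the regression output CSV and prepare it for clustering
# (use_csv=True makes the function ignore the first argument).
scored_df, cluster_input = prep_us_nfl_pred(None, use_csv=True)

# Cluster the prepared feature matrix; n_clusters=4 is an arbitrary illustrative choice.
labels = KMeans(n_clusters=4, random_state=42).fit_predict(cluster_input)
scored_df['cluster'] = labels
print(scored_df[['Total points', 'Squared Error', 'cluster']].head())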
Example #2 — prep_reg_nfl_pred (data preparation for the points regression)
import logging

import numpy as np
import pandas as pd
from sklearn import preprocessing

import shared_functions as sf  # assumed import path for the repo's shared helper module
from multi_column_label_encoder import MultiColumnLabelEncoder  # assumed import path; see the sketch below


def prep_reg_nfl_pred(feature_scaling=False,
                      data_csv='dataproc/nfl_pred_data.csv'):

    log = logging.getLogger('debug')

    nfl_df = sf.load_model_data(data_csv)

    # Previewing Column Names
    col_names = nfl_df.columns.tolist()
    log.info(sf.Color.BOLD + sf.Color.GREEN + "Column Names:" + sf.Color.END)
    log.info(col_names)

    # Previewing Source Data
    to_show = col_names[:]
    log.info(sf.Color.BOLD + sf.Color.GREEN + "Sample Loaded Data:" +
             sf.Color.END)
    log.info(nfl_df[to_show].head(3))

    # Isolate Output Data
    y = np.array(nfl_df['points'])

    # Columns to drop (From Features Data Frame)
    to_drop = [
        'points', 'rush_att', 'rush_td', 'rush_yd', 'rec_td', 'rec_yd',
        'fumble', 'pass_yd', 'pass_td', 'pass_int'
    ]
    base_feat_space = nfl_df.drop(to_drop, axis=1)

    # Previewing Feature Names
    feature_names = base_feat_space.columns.tolist()
    log.info(sf.Color.BOLD + sf.Color.GREEN + "Feature Names:" + sf.Color.END)
    log.debug(feature_names)

    # Using Label Encoding to Rebase the Values in these Columns
    base_feat_space = MultiColumnLabelEncoder(
        columns=['name', 'pos', 'year', 'against', 'team']).fit_transform(
            base_feat_space)
    log.info(sf.Color.BOLD + sf.Color.GREEN +
             "Sample Data Post Label Encoding:" + sf.Color.END)
    log.info(base_feat_space.head(3))

    # Make NumPy Array For Base Features
    # base_x = base_feat_space.to_numpy().astype(np.float64)

    # Handling Polynomials based on Base Features
    features_with_polynomials = [
        'rush_att_av', 'rush_td_av', 'rush_yd_av', 'rec_td_av', 'rec_yd_av',
        'fumble_av', 'pass_yd_av', 'pass_td_av', 'pass_int_av'
    ]
    polynomial_feat_space = base_feat_space[features_with_polynomials].copy(
        deep=True)
    for feature in features_with_polynomials:
        new_feature = feature + '^2'
        log.debug('New Feature %s Added Based on %s', new_feature, feature)
        polynomial_feat_space[new_feature] = polynomial_feat_space[feature]**2

    # Removing original features from polynomial feature space
    polynomial_feat_space = polynomial_feat_space.drop(
        features_with_polynomials, axis=1)
    log.info(sf.Color.BOLD + sf.Color.GREEN +
             "Sample Data Polynomial Features:" + sf.Color.END)
    log.info(polynomial_feat_space.head(3))

    # Make NumPy Array for Polynomial Features
    # polynomial_x = polynomial_feat_space.to_numpy().astype(np.float64)

    # Scaling & Normalization

    if feature_scaling:

        # Base Features - Scaling & Normalization
        base_feat_space = pd.DataFrame(
            preprocessing.StandardScaler().fit_transform(base_feat_space),
            columns=base_feat_space.columns)
        log.info(sf.Color.BOLD + sf.Color.GREEN +
                 "Sample Data Post Base Features Scaling:" + sf.Color.END)
        log.info(base_feat_space.head(3))

        # Polynomial Features - Scaling & Normalization
        polynomial_feat_space = pd.DataFrame(
            preprocessing.StandardScaler().fit_transform(
                polynomial_feat_space),
            columns=polynomial_feat_space.columns)
        log.info(sf.Color.BOLD + sf.Color.GREEN +
                 "Sample Data Post Polynomial Features Scaling:" +
                 sf.Color.END)
        log.info(polynomial_feat_space.head(3))

    # Merge Base & Polynomial Features; Convert to NumPy Array
    x_df = pd.concat([base_feat_space, polynomial_feat_space], axis=1)
    x = x_df.to_numpy().astype(np.float64)

    # Handle Feature Scaling and Normalization
    # if feature_scaling:
    #    scaler = StandardScaler()
    #    x = scaler.fit_transform(x)

    log.info(sf.Color.BOLD + sf.Color.GREEN + "Sample Transformed Data:" +
             sf.Color.END)
    log.info(x[0:3])

    log.info("Feature Space holds %d Observations and %d Features" % x.shape)

    return [x, y, nfl_df]
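
MultiColumnLabelEncoder is a repo-specific helper that is not part of this listing. A minimal sketch of the interface the function above relies on, assuming it simply applies scikit-learn's LabelEncoder column by column (this is an illustration, not the repo's actual code):

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder


class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    """Label-encode several DataFrame columns (sketch of the assumed interface)."""

    def __init__(self, columns=None):
        self.columns = columns  # column names to encode; None means all object columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        out = X.copy()
        cols = self.columns if self.columns is not None else out.select_dtypes('object').columns
        for col in cols:
            # LabelEncoder maps each distinct value to an integer 0..n-1
            out[col] = LabelEncoder().fit_transform(out[col].astype(str))
        return out

And a hedged end-to-end sketch of how the returned [x, y, nfl_df] triple might be consumed; the LinearRegression model and the train/test split are illustrative choices, not taken from the original repo:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

x, y, nfl_df = prep_reg_nfl_pred(feature_scaling=True)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(x_train, y_train)
print('R^2 on held-out data:', model.score(x_test, y_test))

# The predictions could then feed the clustering step in Example #1, e.g.:
# nfl_df['Total points predicted'] = model.predict(x)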