Exemplo n.º 1
0
def make():
    """Generate every ordered pairing of distinct test teams and persist it.

    Loads the test data via ``make_dataset.load_test_data()``, cross-joins it
    with itself, discards rows where both sides are the same team, derives the
    ``GameID``, ``TeamA_ID`` and ``TeamB_ID`` key columns, and writes the
    result to 'transformed/2019GeneratedMatchups.csv'.

    Returns:
        The generated matchup DataFrame (the same frame that is written out).
    """
    fr = IntermediateFilePersistence('transformed/2019GeneratedMatchups.csv')

    print("Generating test matchups...")
    df_test = make_dataset.load_test_data()

    df_a = df_test
    # 'Season' is dropped from the right-hand side so the merged frame keeps a
    # single, un-suffixed 'Season' column coming from the left-hand side.
    df_b = df_test.drop('Season', axis=1)

    # Cross join through a constant helper key. The axis is passed by keyword:
    # the positional form ``drop('key', 1)`` is rejected by pandas >= 2.0.
    df = df_a.assign(key=1).merge(df_b.assign(key=1), on='key').drop(
        'key', axis=1)
    # Copy the filtered rows so the column assignments below operate on an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    df = df[df['TeamID_x'] != df['TeamID_y']].copy()

    df['GameID'] = feature_utils.create_game_key(df['Season'], df['TeamID_x'],
                                                 df['TeamID_y'])
    df['TeamA_ID'] = feature_utils.create_key_from_season_team(
        df['Season'], df['TeamID_x'])
    df['TeamB_ID'] = feature_utils.create_key_from_season_team(
        df['Season'], df['TeamID_y'])

    # The raw team ids are no longer needed once the keys exist.
    df = df.drop(labels=['TeamID_x', 'TeamID_y'], axis=1)

    print("Writing test values to disk...")

    fr.write_to_csv(df)

    return df
Exemplo n.º 2
0
def make(df=None):
    """Groom and then normalize the feature data.

    Args:
        df: Frame to process. When ``None``, the frame is read from
            'CanonicalFeatureData.csv' through ``IntermediateFilePersistence``.

    Returns:
        The result of applying ``__groom`` followed by ``__normalize``.
    """
    if df is None:
        source = IntermediateFilePersistence('CanonicalFeatureData.csv')
        df = source.read_from_csv()
    groomed = __groom(df)
    return __normalize(groomed)
Exemplo n.º 3
0
def load_seed_data():
    """Load the tournament seeds CSV, key it by season/team, and persist it.

    Reads ``tourney_seeds_csv``, adds a ``TeamSeasonId`` column built from the
    ``Season`` and ``TeamID`` columns, removes ``TeamID``, and writes the frame
    to 'transformed/SeedData.csv'.

    Returns:
        The transformed seed DataFrame.
    """
    seeds = pd.read_csv(tourney_seeds_csv)
    team_season_key = feature_utils.create_key_from_season_team(
        seeds['Season'], seeds['TeamID'])
    seeds['TeamSeasonId'] = team_season_key
    seeds = seeds.drop(columns='TeamID')
    IntermediateFilePersistence('transformed/SeedData.csv').write_to_csv(seeds)
    return seeds
Exemplo n.º 4
0
def load_advanced_team_data(start=START_YEAR, end=END_YEAR):
    """Aggregate the per-year advanced school CSVs, parse, and persist them.

    For each year in ``[start, end]`` the file
    ``<base_dir>external/bball_reference/advanced/<year>SchoolAdvanced.csv`` is
    read (column names taken from the second line of the file), tagged with a
    ``Year`` column, and stacked with the other years. The stacked frame is
    passed through ``helper.parse_advanced``, rows containing missing values
    are dropped, and the result is written to 'transformed/SeasonRatings.csv'.

    Args:
        start: First year to load (inclusive).
        end: Last year to load (inclusive).

    Returns:
        The parsed DataFrame with incomplete rows removed.
    """
    bball_ref_dir = base_dir + 'external/bball_reference/advanced/'
    # Collect the yearly frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated data on every call.
    yearly_frames = []
    for year in range(start, end + 1):
        advanced_csv = bball_ref_dir + str(year) + 'SchoolAdvanced.csv'
        df_advanced = pd.read_csv(advanced_csv, header=1)
        df_advanced['Year'] = year
        yearly_frames.append(df_advanced)
    # pd.concat raises on an empty list; keep the original behavior of handing
    # an empty frame to the parser when the year range is empty.
    if yearly_frames:
        df_regular_season_aggregated_advanced = pd.concat(yearly_frames)
    else:
        df_regular_season_aggregated_advanced = pd.DataFrame()
    df = helper.parse_advanced(df_regular_season_aggregated_advanced)
    # Axis passed by keyword; the positional form is rejected by pandas >= 2.0.
    df = df.dropna(axis=0)
    fr = IntermediateFilePersistence('transformed/SeasonRatings.csv')
    fr.write_to_csv(df)
    return df
Exemplo n.º 5
0
def load_season_team_data(start=START_YEAR, end=END_YEAR):
    """Aggregate the per-year regular-season CSVs and persist the parsed form.

    For each year in ``[start, end]`` the file
    ``<base_dir>/external/bball_reference/<year>_season.csv`` is read (column
    names taken from the second line of the file), tagged with a ``Year``
    column, and stacked with the other years. The stacked frame is parsed with
    ``helper.parse_single_season_team_data`` and the parsed result is written
    to 'transformed/SeasonRawStats.csv'.

    Args:
        start: First year to load (inclusive).
        end: Last year to load (inclusive).

    Returns:
        The raw aggregated DataFrame (not the parsed one that is persisted).
    """
    bball_ref_dir = base_dir + '/external/bball_reference/'
    # Collect the yearly frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated data on every call.
    yearly_frames = []
    for year in range(start, end + 1):
        regular_season_csv = bball_ref_dir + str(year) + '_season.csv'
        df_regular_season = pd.read_csv(regular_season_csv, header=1)
        df_regular_season['Year'] = year
        yearly_frames.append(df_regular_season)
    # pd.concat raises on an empty list; fall back to an empty frame so an
    # empty year range behaves as it did before.
    if yearly_frames:
        df_regular_season_aggregated = pd.concat(yearly_frames)
    else:
        df_regular_season_aggregated = pd.DataFrame()

    df = helper.parse_single_season_team_data(df_regular_season_aggregated)
    fr = IntermediateFilePersistence('transformed/SeasonRawStats.csv')
    fr.write_to_csv(df)
    # NOTE(review): the unparsed aggregate is returned while the parsed frame
    # is what gets written; kept as-is for callers -- confirm this is intended.
    return df_regular_season_aggregated
Exemplo n.º 6
0
def make(game_data=None):
    """Merge game rows with team data and attach a ``Round`` column.

    Team data is first rebuilt and persisted through ``build_team_data`` and
    then read back from 'TeamData.csv'.

    Args:
        game_data: Game frame to merge. When ``None``, labeled game data is
            created via ``build_game_data.create_labeled_game_data()`` and the
            merged result additionally has ``ScoreDiff_y`` removed,
            ``ScoreDiff_x`` renamed to ``ScoreDiff`` and ``LTeamID`` renamed
            to ``TeamB_ID``.

    Returns:
        The merged DataFrame including the ``Round`` column.
    """
    build_team_data.persist(build_team_data.make())
    df_team = IntermediateFilePersistence('TeamData.csv').read_from_csv()

    built_labeled_games = game_data is None
    if built_labeled_games:
        game_data = build_game_data.create_labeled_game_data()

    merged = __merge_game_with_team_data(game_data, df_team)

    if built_labeled_games:
        # Only the self-built labeled data needs the column clean-up below.
        merged.drop(labels=['ScoreDiff_y'], inplace=True, axis=1)
        column_renames = {"ScoreDiff_x": "ScoreDiff", "LTeamID": "TeamB_ID"}
        merged = merged.rename(columns=column_renames)

    merged['Round'] = merged.apply(feature_utils.__get_round, axis=1)
    return merged
Exemplo n.º 7
0
def make(mode, train_dataset=None, train_labels=None):
    """Build and train a model, optionally evaluating it on a dev split.

    Args:
        mode: When equal to ``"dev"``, the training data is read from
            'NormalizedFeatureData.csv' and split into train/test sets
            (overriding ``train_dataset`` / ``train_labels``), and the trained
            model is evaluated on the test split. For any other value the
            caller must supply ``train_dataset`` and ``train_labels``.
        train_dataset: Training features; ignored when ``mode == "dev"``.
        train_labels: Training labels; ignored when ``mode == "dev"``.

    Returns:
        The trained model.
    """
    # Compare by value: ``is`` tests object identity, which is not guaranteed
    # for equal strings and emits a SyntaxWarning on Python >= 3.8.
    is_dev = mode == "dev"

    if is_dev:
        fp = IntermediateFilePersistence('NormalizedFeatureData.csv')
        df = fp.read_from_csv()
        (train_dataset, train_labels), (
            test_dataset, test_labels
        ) = split_dataset.split_training_data_randomly_with_seed(df)

    model = build_model(train_dataset)
    model, hist = train_model(model, train_dataset, train_labels)

    if is_dev:
        print("\nEvaluating dev set...\n")
        evaluate_model(model, test_dataset, test_labels)

    plot_training(hist)

    return model
Exemplo n.º 8
0
    # Body of a function whose header is not visible here; it expects a
    # DataFrame named ``df`` to be in scope.
    # Split df into train/test feature and label sets.
    (train_dataset, train_labels), (
        test_dataset,
        test_labels) = split_dataset.split_training_data_randomly_with_seed(df)

    # Depth-limited random forest with a fixed seed.
    model = RandomForestRegressor(random_state=1, max_depth=10)
    # NOTE(review): df is one-hot encoded only after the split above, so
    # df.columns (used as feature names below) may not line up with the
    # columns of train_dataset -- confirm.
    df = pd.get_dummies(df)
    model.fit(train_dataset, train_labels)

    features = df.columns
    importances = model.feature_importances_
    imp = pd.DataFrame(importances)
    # Label each importance row with the column name at the same position.
    imp['Feat'] = features[imp.index]
    # NOTE(review): hard-coded absolute, user-specific output path.
    fp = BaseFilePersistence(
        '/Users/kluteytk/development/projects/MarchMadness2019/data/experiment/FeatureImportance.csv'
    )
    fp.write_to_csv(imp, index=False)
    indices = np.argsort(importances)[-20:]  # positions of the 20 largest importances, ascending
    # Horizontal bar chart of the selected importances.
    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             importances[indices],
             color='b',
             align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importance')
    plt.show()


if __name__ == '__main__':
    # Script entry point: read the normalized feature data and hand it to
    # test_corr.
    df = IntermediateFilePersistence(
        'NormalizedFeatureData.csv').read_from_csv()
    test_corr(df)
Exemplo n.º 9
0
def __read_from_csv_and_split():
    """Read 'NormalizedFeatureData.csv' and split it into train/test sets.

    Returns:
        Whatever ``split_dataset.split_training_data_randomly_with_seed``
        returns for the loaded frame with ``0`` as its second argument.
    """
    features = IntermediateFilePersistence(
        'NormalizedFeatureData.csv').read_from_csv()
    return split_dataset.split_training_data_randomly_with_seed(features, 0)
Exemplo n.º 10
0
def persist(df):
    """Write ``df`` to 'NormalizedFeatureData.csv'.

    Args:
        df: The frame to write through ``IntermediateFilePersistence``.
    """
    IntermediateFilePersistence('NormalizedFeatureData.csv').write_to_csv(df)
Exemplo n.º 11
0
def persist(df):
    """Write ``df`` to 'CanonicalFeatureData.csv'.

    Args:
        df: The frame to write through ``IntermediateFilePersistence``.
    """
    IntermediateFilePersistence('CanonicalFeatureData.csv').write_to_csv(df)
Exemplo n.º 12
0
def persist(df):
    """Write ``df`` to 'TeamData.csv'.

    Args:
        df: The frame to write through ``IntermediateFilePersistence``.
    """
    IntermediateFilePersistence('TeamData.csv').write_to_csv(df)