/
Data_extraction_and_sanitization.py
227 lines (178 loc) · 8.75 KB
/
Data_extraction_and_sanitization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import pandas as pd
import csv
import numpy as np
from pandas import DataFrame as df
#the base file is taken from the team.csv file in the Lahmen baseball database
#The data is used underThis work is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.
# For details see: http://creativecommons.org/licenses/by-sa/3.0/
# Database file can be found at http://www.seanlahman.com/baseball-archive/
#converts the team stats to dataframe
team_csv_file_dataframe = pd.read_csv(r'/PycharmProjects/Machine_Learning_Baseball_Algorithim/csv_files/team.csv')
print(team_csv_file_dataframe.head(10))
#list of stats that we want to keep
relevant_Stats = ['yearID','G','W','L','R','AB','H','2B','3B','HR','BB','SO','SB','CS','HBP','SF','RA','ER','ERA','CG','SHO','SV','IPouts','HA','HRA','BBA','SOA','E','DP','FP']
#make file with the headers of the relevant stats
with open(r'ave_stats_by_year.csv', 'a') as f:
writer = csv.writer(f)
writer.writerow(relevant_Stats)
#gets all stats for a year and returns it as a dataframe
def get_stats_for_year(currentYear):
df = pd.read_csv('/PycharmProjects/Machine_Learning_Baseball_Algorithim/csv_files/relevant_stats.csv')
print('get_stats_for_year(currentYear)')
print( df.head(10))
current_year_stats = df.loc[df['yearID'] == currentYear]
print (current_year_stats)
return current_year_stats
#for each year get the total for each stat and write the average
def get_yearly_averages( startYear, endYear):
currentYear = startYear
while currentYear <= endYear:
get_stats_for_year(currentYear)
currentYear += 1
#get averages for each stat and add them to the ave_stat_by_year.csv file
def calculate_average(dataFrame):
average_stats = []
print("calculate average")
print(dataFrame.describe)
print(dataFrame.mean(axis=0))
# mean(axis = 0) returns the average in each column of dataframe
average_stats = dataFrame.mean(axis=0)
print(average_stats)
return average_stats
#writes the average stats to the csv
def add_average_stats_to_csv(dataframe):
dataframe.to_csv('/PycharmProjects/Machine_Learning_Baseball_Algorithim/csv_files/ave_stats_per_year.csv')
with open(r'ave_stats_by_year.csv', 'a') as f:
writer = csv.writer(f)
writer.writerow(dataframe)
#writes the average stats to the csv for multiple years
def add_average_stats_for_range_of_years_to_csv( start, end):
count = 0
while count + start <= end:
current_year = start+count
add_average_stats_to_csv(calculate_average(get_stats_for_year(current_year)))
count += 1
#gets the average stats for any given year
def recall_average_stats_for_a_given_year(currentYear):
df = pd.read_csv('/PycharmProjects/Machine_Learning_Baseball_Algorithim/csv_files/ave_stats_per_year.csv')
print('recall_average_stats_for_a_given_year')
current_year_stats = df.loc[df['yearID'] == currentYear]
print (current_year_stats)
return current_year_stats
#returns individual team stats for a given year
def recall_team_stats_for_a_given_year(currentYear):
df = pd.read_csv('/PycharmProjects/Machine_Learning_Baseball_Algorithim/csv_files/relevant_stats.csv')
print('recall_team_stats_for_a_given_year(currentYear)')
current_year_team_stats= df.loc[df['yearID'] == currentYear]
print (current_year_team_stats)
return current_year_team_stats
#normalizes by dividing the individual team stats to the average that year
#if a stat for a given team is average it will be == 1
#below average < 1 and above average > 1
def compare_teamStats_to_leauge_Ave( currentYear):
average = recall_average_stats_for_a_given_year(currentYear)
teams = recall_team_stats_for_a_given_year(currentYear)
print('compare_teamStats_to_leauge_Ave( currentYear)')
average = teams.values / average.values
print(average)
averageDataframe = pd.DataFrame(average)
print(averageDataframe)
return averageDataframe
#writes the team yearly comparisions to the CSV file
def write_teamStats_to_leauge_ave_to_csv(dataframe):
dataframe.to_csv('/PycharmProjects/Machine_Learning_Baseball_Algorithim/csv_files/ave_stats_by_team.csv')
with open(r'ave_stats_by_team.csv', 'a') as f:
writer = csv.writer(f)
writer.writerow(dataframe)
#isolates team stats for a given range appends to same dataframe
#adds headers to the dataframe and writes results to csv
def add_teamStats_to_leauge_ave_for_range (start , end):
count = 0
team_stats = compare_teamStats_to_leauge_Ave(start)
team_stats.columns = relevant_Stats
count +=1
while count + start <= end:
current_year = start+count
team = compare_teamStats_to_leauge_Ave(current_year)
team.columns = relevant_Stats
#dataframe_with_all_data.append(team, ignore_index=True)
#team.columns = relevant_Stats
team_stats = team_stats.append(team, ignore_index=True)
count += 1
write_teamStats_to_leauge_ave_to_csv(team_stats)
print(team_stats.head(40))
return team_stats
#appends file with the classification: bad team, good team, average team or playoff team
def add_classification_column_to_data():
df = pd.read_csv('/PycharmProjects/Machine_Learning_Baseball_Algorithim/csv_files/ave_stats_by_team.csv')
determine_classification(df)
#df["classification"] = np.nan
print(df.head(10))
return df
'''
breakdown for team wins
Leauge ave = 81 wins per season = 50% winning percentage = 1 on ave_stat_by_team.csv
Bad team: < 77 | 77/162 = .47530864 | 77/81 = .95061728
Average Team :77 < x < 85 | 84/162 = .51851852 | 84/81 = 1.03703704
Good Team: 85-92 | 92/162 = .56790123 | 92/81 = 1.13580247
Playoff team: > 92 wins |
'''
#assigns classification values
def calculate_classification( row ):
value = row['W']
value = float(value)
if value < float(0.95061728):
return 0 #'Bad_Team'
if float(0.95061728) < value < float(1.03703704):
return 1 #'Average_Team'
if float(1.03703704) < value < float(1.13580247):
return 2 #'Good_Team'
if float(1.13580247) < value:
return 3 #'Playoff_Team'
return -1
def determine_classification(dataFrame):
df = dataFrame
df['Classification'] = df.apply (lambda row: calculate_classification( row ),axis=1)
df.head(15)
return df
#writes the team yearly comparisions to the CSV file
def write_teamStats_to_leauge_ave_to_csv(dataframe):
dataframe.to_csv('/PycharmProjects/Machine_Learning_Baseball_Algorithim/csv_files/stats_with_classification.csv')
with open(r'stats_with_classification.csv', 'a') as f:
writer = csv.writer(f)
writer.writerow(dataframe)
#transformations added additional unneeded columns to the data this gets rid of them
#games wins and losses are unneeded because of the year average
#year ID is also unneeded because it will be 1 for all teams
def remove_excess_columns_classification_column_to_data():
df = pd.read_csv('/PycharmProjects/Machine_Learning_Baseball_Algorithim/csv_files/stats_with_classification.csv')
#adding inplace allows us to drop the excess columns without reassigning dataframe
df.drop(columns=['added', 'added2', 'yearID','G','W','L'], axis=1, inplace=True)
print(df.head(10))
return df
#fixes some overhead that was added in other methods
def write__excess_columns_removed_teamStats_to_leauge_ave_to_csv(dataframe):
#adding index=false removes the issue with additional column from earlier
dataframe.to_csv('/PycharmProjects/Machine_Learning_Baseball_Algorithim/csv_files/team_stats_ave_no_excess_stats.csv', index=False)
with open(r'stats_with_classification.csv', 'a') as f:
writer = csv.writer(f)
writer.writerow(dataframe)
relevant_Stats_file_lahmenCSV = team_csv_file_dataframe[relevant_Stats]
#print(relevant_Stats_file_lahmenCSV.head(10))
relevant_Stats_file_lahmenCSV.to_csv("relevant_stats.csv", index=False)
relevant_Stats_csv = pd.read_csv('/PycharmProjects/Machine_Learning_Baseball_Algorithim/csv_files/relevant_stats.csv')
#test of each method for given year
#get_stats_for_year(1888)
#print("average stats")
calculate_average(get_stats_for_year(1888))
add_average_stats_to_csv(calculate_average(get_stats_for_year(1888)))
add_average_stats_for_range_of_years_to_csv( 1888, 2015)
#test of each method for given year
#recall_average_stats_for_a_given_year(2000)
#recall_team_stats_for_a_given_year(2000)
#compare_teamStats_to_leauge_Ave(2000)
#range starts at 1888 because before then the stats are pretty sparse
add_teamStats_to_leauge_ave_for_range (1888, 2015)
write_teamStats_to_leauge_ave_to_csv(add_classification_column_to_data())
#transformations added additional unneeded columns to the data this gets rid of them
write__excess_columns_removed_teamStats_to_leauge_ave_to_csv(remove_excess_columns_classification_column_to_data())