def get_opponent_vector(self):
    """Return the averaged input vector of the hitters this entry faces.

    Looks up the pregame hitter entries that share this entry's game date,
    game time and home team but whose ``is_home_team`` flag differs from this
    entry's, sums their raw input vectors, and hands the sum to
    ``PregameHitterGameEntry.avg_input_vector``.

    Returns:
        The result of ``PregameHitterGameEntry.avg_input_vector`` applied to
        the summed raw vectors (an all-zero sum if no hitters were found).
    """
    database_session = MlbDatabase().open_session()
    try:
        opposing_hitters = database_session.query(
            PregameHitterGameEntry).filter(
                PregameHitterGameEntry.game_date == self.game_date,
                PregameHitterGameEntry.game_time == self.game_time,
                PregameHitterGameEntry.home_team == self.home_team,
                PregameHitterGameEntry.is_home_team != self.is_home_team)

        # Accumulator length is fixed at 31; presumably this matches the
        # length of to_input_vector_raw() -- confirm against that method.
        hitter_array = np.zeros(31)
        for hitter_entry in opposing_hitters:
            hitter_array += hitter_entry.to_input_vector_raw()
    finally:
        # Release the session even if the query or the summation fails.
        database_session.close()

    return PregameHitterGameEntry.avg_input_vector(hitter_array)
class PitcherRegressionForestTrainer(RegressionForest):
    """Random forest that regresses a pitcher's actual DraftKings points.

    Each training sample is the concatenation of the pitcher's pregame input
    vector, the opposing hitters' vector, the park vector and the umpire
    vector; the target is the postgame ``actual_draftkings_points``.
    """

    SIZE_TRAINING_BATCH = 900

    # Season key used for every ParkEntry lookup.
    _PARK_FACTOR_SEASON = "2016"
    # Park vector substituted when no ParkEntry row exists.
    _DEFAULT_PARK_VECTOR = (100, 100)
    # File the fitted forest is dumped to and loaded from.
    _MODEL_FILE = 'pitcher_regression_forest.pkl'

    def __init__(self):
        self._database_session = MlbDatabase().open_session()
        self._decision_tree = None

    def get_stochastic_batch(self, input_query, num_samples=None):
        """Build a randomly ordered batch of (features, target) samples.

        Args:
            input_query: iterable of postgame pitcher entries.
            num_samples: number of samples to draw without replacement.
                ``None`` (the default) draws every usable sample. A value
                larger than the number of usable samples is clamped to that
                number instead of raising.

        Returns:
            ``(x, y)`` where ``x`` is a list of feature lists and ``y`` is a
            list of one-element lists holding ``actual_draftkings_points``.
            Samples whose pregame entry has no game entry are dropped, so the
            batch may be shorter than ``num_samples``.
        """
        potential_samples = []
        for postgame_entry in input_query:
            pregame_entry = self._database_session.query(
                PregamePitcherGameEntry).get(
                    (postgame_entry.rotowire_id, postgame_entry.game_date,
                     postgame_entry.game_time))
            if pregame_entry is None:
                print("Can't find %s %s %s %s" % (
                    postgame_entry.rotowire_id, postgame_entry.home_team,
                    postgame_entry.game_date, postgame_entry.game_time))
                continue
            potential_samples.append((pregame_entry, postgame_entry))

        if num_samples is None:
            num_samples = len(potential_samples)
        # random.sample raises if asked for more items than exist.
        num_samples = min(num_samples, len(potential_samples))
        player_samples = random.sample(potential_samples, num_samples)

        x = []
        y = []
        for pregame_entry, postgame_entry in player_samples:
            if pregame_entry.game_entry is None:
                print("NoneType game entry for %s %s %s %s" % (
                    pregame_entry.rotowire_id, pregame_entry.home_team,
                    pregame_entry.game_date, pregame_entry.game_time))
                continue

            final_pitcher_array = np.concatenate([
                pregame_entry.to_input_vector(),
                pregame_entry.get_opponent_vector(),
                self._get_park_vector(pregame_entry.home_team),
                self._get_umpire_vector(pregame_entry.game_entry),
            ])
            x.append(final_pitcher_array.tolist())
            y.append([postgame_entry.actual_draftkings_points])

        return x, y

    def _get_umpire_vector(self, game_entry):
        """Return the umpire's input vector, or nominal data if unavailable."""
        if game_entry.umpire is not None:
            ump_entry = self._database_session.query(
                UmpireCareerEntry).get(game_entry.umpire)
            if ump_entry is not None:
                return ump_entry.to_input_vector()
        return UmpireCareerEntry.get_nominal_data(self._database_session)

    def _get_park_vector(self, home_team):
        """Return the park vector for home_team, or the default if missing."""
        park_factors = self._database_session.query(ParkEntry).get(
            (home_team, self._PARK_FACTOR_SEASON))
        if park_factors is None:
            print("Pitcher regression forest: Could not find %s from %s" % (
                home_team, self._PARK_FACTOR_SEASON))
            return np.array(self._DEFAULT_PARK_VECTOR)
        return park_factors.to_input_vector()

    def train_network(self):
        """Fit the forest on stored pitcher games and print evaluation stats.

        Loads a previously saved model if one can be read, otherwise creates
        a new 1000-tree ``RandomForestRegressor``. The postgame pitcher
        entries are split by ``get_train_eval_data`` (not defined in this
        class; presumably inherited) with a 0.8 ratio, the model is fitted on
        the training part, saved, and its median absolute error on the
        evaluation part is printed.

        The database session is closed when this method finishes, whether it
        succeeds or raises, so the instance cannot build batches afterwards.
        """
        try:
            self.load_model()
            if self._decision_tree is None:
                self._decision_tree = RandomForestRegressor(n_estimators=1000)

            db_query = self._database_session.query(PostgamePitcherGameEntry)
            mlb_training_data, mlb_evaluation_data = self.get_train_eval_data(
                db_query, 0.8)

            x_train, y_train = self.get_stochastic_batch(mlb_training_data)
            self._decision_tree.fit(x_train, np.ravel(y_train))
            self.save_model()

            x_eval, y_eval = self.get_stochastic_batch(mlb_evaluation_data)
            y_eval_predictions = np.array(self._decision_tree.predict(x_eval))
            y_eval = np.array(y_eval)

            print("Pitcher Training Size: %i | Pitcher Evaluation Size: %i" % (
                len(x_train), len(x_eval)))
            print("Pitcher median absolute error: %f" % median_absolute_error(
                y_eval, y_eval_predictions))
        finally:
            self._database_session.close()

    def get_prediction(self, input_data):
        """Return the forest's prediction for a single feature vector."""
        return self._decision_tree.predict([input_data])

    def _per_tree_predictions(self, input_data):
        """Return each individual tree's prediction for one feature vector."""
        # asarray lets plain sequences work here as they do in get_prediction.
        features = np.asarray(input_data)
        sample = features.reshape(1, len(features))
        return [tree.predict(sample)
                for tree in self._decision_tree.estimators_]

    def get_prediction_interval(self, input_data, percentile=95):
        """Return ``(low, high)`` bounds from the spread of per-tree outputs.

        Args:
            input_data: one feature vector.
            percentile: central coverage of the interval, in percent.

        Returns:
            The ``(100 - percentile) / 2`` and ``100 - (100 - percentile) / 2``
            percentiles of the individual trees' predictions.
        """
        preds = self._per_tree_predictions(input_data)
        tail = (100 - percentile) / 2.
        err_down = np.percentile(preds, tail)
        err_up = np.percentile(preds, 100 - tail)
        return err_down, err_up

    def get_std_dev(self, input_data):
        """Return the standard deviation of the per-tree predictions."""
        return np.std(self._per_tree_predictions(input_data))

    def save_model(self):
        """Dump the current forest to disk; report, but do not raise, on failure."""
        try:
            joblib.dump(self._decision_tree, self._MODEL_FILE)
        except Exception as error:
            # Best effort: training results remain usable in memory.
            print("Could not save pitcher regression forest: %s" % error)

    def load_model(self):
        """Load a saved forest from disk; leave the model untouched on failure."""
        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code -- only load model files from a trusted location.
        try:
            self._decision_tree = joblib.load(self._MODEL_FILE)
        except Exception as error:
            # Best effort: a missing or unreadable file just means no model.
            print("Could not load pitcher regression forest: %s" % error)
from sql.mlb_database import MlbDatabase from sql.lineup import LineupEntry from sql.postgame_hitter import PostgameHitterGameEntry from sql.pregame_hitter import PregameHitterGameEntry from sql.postgame_pitcher import PostgamePitcherGameEntry from sql.pregame_pitcher import PregamePitcherGameEntry from datetime import date, timedelta from numpy import array, std, mean database_session = MlbDatabase().open_session() query_results = database_session.query(LineupEntry).filter( LineupEntry.game_date != date.today()) lineup_predicted_salary = 0 lineup_actual_salary = 0 lineup_actual_vector = list() lineup_predicted_vector = list() for query_result in query_results: try: #TODO: fix these gets by using the GameEntry to get the game time and such # TODO: this needs to be altered to accommodate double headers, but is not a big priority lineup_actual_salary += database_session.query( PostgameHitterGameEntry).get( (query_result.catcher, query_result.game_date)).actual_draftkings_points lineup_actual_salary += database_session.query( PostgamePitcherGameEntry).get( (query_result.starting_pitcher_1, query_result.game_date)).actual_draftkings_points lineup_actual_salary += database_session.query( PostgamePitcherGameEntry).get(