def main(session, team_stats_tbl, sched_tbl, graph=False):
    """Create a regression data frame, run a regression through the LinearRegression class, and return the class.

    Args:
        session: An instantiated Session object from sqlalchemy
        team_stats_tbl: A mapped team stats table class
        sched_tbl: A mapped schedule table class
        graph: A boolean that creates graphs if True

    Returns:
        A LinearRegression class
    """
    league_year = Config.get_property("league_year")
    graph_dir = Config.get_property("graph_dir")
    if graph and not os.path.exists(graph_dir):
        os.mkdir(graph_dir)

    # Import and specify a list of factors to extract from the database
    ff_list = four_factors_list()
    # regression_df = create_ff_regression_df(session, team_stats_tbl, sched_tbl, ff_list)
    regression_df = alt_regression_df(session, team_stats_tbl, sched_tbl, ff_list)
    print('using alternative/old regression_df')

    # Split the data frame into X (predictors) and y (target)
    predictors = regression_df[regression_df.columns.drop(list(regression_df.filter(regex='sched')))]
    target = regression_df["sched_MOV"]
    ff_reg = LinearRegression(target, predictors)

    # Note: On Windows, graphs may not appear to update.
    # To change that, go to folder properties -> customize -> optimize for: Documents
    if graph:
        ff_reg.predicted_vs_actual(out_path=os.path.join(graph_dir, "pred_vs_actual_{}.png".format(league_year)))
        ff_reg.residuals_vs_fitted(out_path=os.path.join(graph_dir, "residuals_vs_fitted_{}.png".format(league_year)))
        ff_reg.qqplot(out_path=os.path.join(graph_dir, "qqplot_{}.png".format(league_year)))
        ff_reg.influence_plot(out_path=os.path.join(graph_dir, "influence_{}.png".format(league_year)))
        ff_reg.cooks_distance(out_path=os.path.join(graph_dir, "cooks_distance_{}.png".format(league_year)))
        ff_reg.residual_independence(out_path=os.path.join(graph_dir, "resid_independence_{}.png".format(league_year)))

    # Multicollinearity
    # vif_df = ff_reg.vif()
    # ff_reg.residual_distribution()
    return ff_reg
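# The regex-based predictor/target split above can be unintuitive. A minimal, self-contained
# sketch of the same pattern on a toy frame (the column names here are hypothetical, not
# taken from the project): filter(regex='sched') selects the schedule-derived columns, and
# columns.drop removes them to leave only the four-factor predictors.
import pandas as pd

toy = pd.DataFrame({"home_efg": [0.52], "away_efg": [0.49], "sched_MOV": [3.5]})
toy_predictors = toy[toy.columns.drop(list(toy.filter(regex='sched')))]  # keeps home_efg, away_efg
toy_target = toy["sched_MOV"]  # the margin-of-victory target column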
def scrape():
    """Scrape betting line information from Bovada and return it.

    Returns:
        A dictionary of betting lines, or False if Bovada has no lines for today.
    """
    lines = odds_for_today()
    if not lines:
        return False
    return lines
    # Superseded database-writing version, unreachable after the return above and kept for reference:
    # line_data = DataOperator(lines)
    # tbl_name = "odds_{}".format(Config.get_property("league_year"))
    # if not database.table_exists(tbl_name):
    #     create_odds_table(database, line_data, tbl_name, schedule)
    # if line_data.validate_data_length() and database.table_exists(tbl_name):
    #     # All values in line_data are expected to be unique from values in the database;
    #     # a possible place for errors to occur
    #     odds_table = database.get_table_mappings([tbl_name])
    #     # Reconcile ensures the odds_table has appropriate start_times; add logic so it's
    #     # not called every run
    #     reconcile(schedule, odds_table, "start_time", "id", "game_id", session)
    #     update_odds_table(odds_table, schedule, line_data.dict_to_rows(), session)
    # else:
    #     raise Exception("Something is wrong here (this point shouldn't be hit).")
    # return True
def __init__(self, url=None):
    """Initialize macro-level SQLAlchemy objects as class attributes (engine, metadata, base).

    A session bound to self.engine allows interaction with the DB."""
    if not url:
        self.path = Config.get_property("database")
    else:
        self.path = url
    self.engine = create_engine(self.path, pool_pre_ping=True)
    self.metadata = MetaData(self.engine)
    self.Base = declarative_base()
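# Typical usage, assuming the initializer above belongs to DBInterface (the class used in
# run_all below): construct the interface, then bind a session to its engine. The sqlite
# URL here is a hypothetical example, not a project path.
from sqlalchemy.orm import Session

db = DBInterface(url="sqlite:///nba_test.db")
session = Session(bind=db.engine)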
def main():
    db = dbinterface.DBInterface()
    session = Session(bind=db.engine)
    league_year = Config.get_property("league_year")
    regression = lm.main(db, session)
    sched_tbl = db.get_table_mappings("sched_{}".format(league_year))
    pred_tbl_name = "game_pred_{}".format(league_year)
    if not db.table_exists(pred_tbl_name):
        # get.sample_prediction returns a data manipulator class
        sample = get.sample_prediction(db, session, ref_tbl=sched_tbl, model=regression)
        create_prediction_table(db, sample, pred_tbl_name)
def run_all():
    """Run the entire NBA_bet project."""
    db = DBInterface()
    year = Config.get_property("league_year")
    session = Session(bind=db.engine)

    scraper.scrape_all(db, session, year)
    session.commit()

    bets.predict_all(db, session)
    session.commit()
    session.close()
def scrape():
    """Scrape basketball reference for games in a season and return the parsed output.

    If the specified year has been completed, every game in the season is returned. If the
    season is ongoing, every game up to the day before the module is run is returned. This
    ensures only completed games are returned.

    Returns:
        Parsed season schedule data for the league year set in Config
    """
    league_year = Config.get_property("league_year")
    season_data = client.season_schedule(league_year)
    season_data = br_enum_to_string(season_data)
    return season_data
def create_table(db, team_stats_data, tbl_name):
    """Create a table of team stats in a database with appropriate foreign keys and constraints.

    Args:
        db: A datatotable.database.Database object connected to a database
        team_stats_data: A datatotable.data.DataOperator object with data on NBA team stats
        tbl_name: The desired table name

    ToDo:
        Currently allows duplicate rows if those values are on different days. Solve with a constraint
    """
    columns = team_stats_data.columns
    columns['team_id'].append(ForeignKey("teams_{}.id".format(Config.get_property('league_year'))))
    constraints = [UniqueConstraint("team_id", "scrape_time")]
    db.map_table(tbl_name=tbl_name, columns=columns, constraints=constraints)
    db.create_tables()
    db.clear_mappers()
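# For readers unfamiliar with datatotable, the mapping above is roughly equivalent to this
# plain SQLAlchemy Core definition. This is a sketch: the table year and column types are
# assumed for illustration, not taken from the project.
from sqlalchemy import Table, Column, Integer, DateTime, MetaData, ForeignKey, UniqueConstraint

metadata = MetaData()
team_stats_sketch = Table(
    "team_stats_2020", metadata,
    Column("id", Integer, primary_key=True),
    Column("team_id", Integer, ForeignKey("teams_2020.id")),  # the appended foreign key
    Column("scrape_time", DateTime),
    UniqueConstraint("team_id", "scrape_time"),  # one row per team per scrape
)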
def predict_all(db):
    """Generate and store predictions for all games available in the odds table.

    Checks if the predictions table exists. If it doesn't, generates the table in the database.
    """
    session = Session(bind=db.engine)
    league_year = Config.get_property("league_year")
    sched_tbl = db.table_mappings["schedule_{}".format(league_year)]
    team_stats_tbl = db.table_mappings['team_stats_{}'.format(league_year)]
    odds_tbl = db.table_mappings['odds_{}'.format(league_year)]
    regression = ff_reg.main(session, team_stats_tbl, sched_tbl)

    pred_tbl_name = "predictions_{}".format(league_year)
    if not db.table_exists(pred_tbl_name):
        sample = get_sample_prediction(session, regression, sched_tbl)
        pred_data = predictions.format_data(sample)  # assumed argument; the original call was incomplete
        predictions.create_table(db, pred_data, pred_tbl_name)  # assumed arguments, following the other create_table helpers
        pred_tbl = db.table_mappings[pred_tbl_name]
        session.add_all([pred_tbl(**row) for row in pred_data.rows])
        session.commit()
    else:
        pred_tbl = db.table_mappings[pred_tbl_name]
        results = predict_games_in_odds(session, regression, odds_tbl)
        update_rows = predictions.insert(session, results)  # results is an assumed argument; the original call was incomplete
        session.add_all(update_rows)
        session.commit()
        insert_new_predictions(results, session, pred_tbl, sched_tbl, odds_tbl)
        # Commit here b/c update_prediction_table() needs the inserted values
        session.commit()
        update_prediction_table(session, pred_tbl, sched_tbl, odds_tbl)
def team_statistics(tbl_name):
    """Build a URL for the configured league year and return team statistics for the specified
    table on that page.

    Performance is not guaranteed for tables other than "misc_stats".

    Args:
        tbl_name: The name of the table to be returned

    Returns:
        A dictionary version of the specified table. Keys are column titles that return lists
        ordered by team.
    """
    url = '{BASE_URL}/leagues/NBA_{year}.html'.format(
        BASE_URL=BASE_URL,  # imported from br_references.py
        year=Config.get_property("league_year"))
    response = requests.get(url=url, allow_redirects=False)
    if 200 <= response.status_code < 300:
        scrape_time = datetime.now()
        # Note that parse_table takes the raw .content attribute rather than .text
        return parse_table(response.content, tbl_name, scrape_time)
    raise Exception("Could not connect to URL")
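# Example call using the "misc_stats" table the docstring names as the supported case.
# The shape of the returned dictionary is illustrative, not taken from the project:
misc_stats = team_statistics("misc_stats")
# e.g. {"team_name": ["Boston Celtics", ...], "wins": [57, ...], "scrape_time": [...]}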
def main(db):
    year = Config.get_property("league_year")
    session = nbapredict.management.Session(bind=db.engine)

    # ~~~~~~~~~~~~~
    # Teams
    # ~~~~~~~~~~~~~
    team_dict = team_scraper.scrape()
    teams_data = DataOperator({"team_name": team_dict["team_name"]})
    teams_tbl_name = "teams_{}".format(year)
    if not db.table_exists(teams_tbl_name):
        teams.create_team_table(db=db, teams_data=teams_data, tbl_name=teams_tbl_name)
        teams_tbl = db.table_mappings[teams_tbl_name]
        session.add_all([teams_tbl(**row) for row in teams_data.rows])
        session.commit()
        del teams_tbl

    # ~~~~~~~~~~~~~
    # Team Stats
    # ~~~~~~~~~~~~~
    team_stats_tbl_name = "team_stats_{}".format(year)
    teams_tbl = db.table_mappings[teams_tbl_name]
    team_dict['team_id'] = team_dict.pop('team_name')
    team_dict['team_id'] = convert.values_to_foreign_key(
        session=session, foreign_tbl=teams_tbl, foreign_key="id",
        foreign_value="team_name", child_data=team_dict['team_id'])
    # When team_stats_tbl is created, the teams_tbl automap object changes and no longer follows
    # the expected behavior of an automapped table, likely because a relationship is established.
    # If reloaded, teams_tbl works fine; therefore, delete the variable here for now.
    del teams_tbl
    team_dict['scrape_date'] = [datetime.date(s_time) for s_time in team_dict['scrape_time']]
    team_stats_data = DataOperator(team_dict)
    if not db.table_exists(team_stats_tbl_name):
        team_stats.create_table(db=db, team_stats_data=team_stats_data, tbl_name=team_stats_tbl_name)
        team_stats_tbl = db.table_mappings[team_stats_tbl_name]
        session.add_all([team_stats_tbl(**row) for row in team_stats_data.rows])
        session.commit()
    else:
        team_stats_tbl = db.table_mappings[team_stats_tbl_name]
        team_stats.insert(session, team_stats_tbl, team_stats_data)

    # ~~~~~~~~~~~~~
    # Schedule
    # ~~~~~~~~~~~~~
    schedule_dict = season_scraper.scrape()
    schedule_data = DataOperator(schedule_dict)
    teams_tbl = db.table_mappings['teams_{}'.format(year)]
    schedule_data = schedule.format_data(session=session, schedule_data=schedule_data,
                                         team_tbl=teams_tbl, team_stats_tbl=team_stats_tbl)
    schedule_tbl_name = "schedule_{}".format(year)
    if not db.table_exists(schedule_tbl_name):
        schedule.create_table(db, schedule_data, schedule_tbl_name, teams_tbl, team_stats_tbl)
        schedule_tbl = db.table_mappings[schedule_tbl_name]
        session.add_all([schedule_tbl(**row) for row in schedule_data.rows])
        session.commit()
    else:
        schedule_tbl = db.table_mappings[schedule_tbl_name]
        update_rows = schedule.update_table(session, schedule_data, schedule_tbl, team_stats_tbl)
        session.add_all(update_rows)
        session.commit()

    # ~~~~~~~~~~~~~
    # Odds
    # ~~~~~~~~~~~~~
    odds_dict = line_scraper.scrape()
    odds_data = None
    if odds_dict:
        odds_dict = odds.format_data(session, odds_dict, teams_tbl, schedule_tbl)
        odds_data = DataOperator(odds_dict)
        # ToDo: Evaluate whether odds_data has the correct columns (i.e. home/away team ids)
    odds_tbl_name = "odds_{}".format(year)
    if not db.table_exists(odds_tbl_name) and odds_data:
        odds.create_table(db, odds_tbl_name, odds_data, schedule_tbl)
        odds_tbl = db.table_mappings[odds_tbl_name]
        session.add_all(odds_tbl(**row) for row in odds_data.rows)
        session.commit()
    elif odds_data:
        odds_tbl = db.table_mappings[odds_tbl_name]
        session.add_all(odds_tbl(**row) for row in odds_data.rows)
        session.commit()
        odds.update_table(session, odds_tbl, odds_data)
        session.commit()
        odds.delete(session, odds_tbl)

    session.close()
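# convert.values_to_foreign_key (used in the Team Stats section above) replaces each value in
# child_data with the id of the matching row in the foreign table. A rough, self-contained
# illustration of that assumed behavior; this is not the project's implementation:
def values_to_foreign_key_sketch(lookup, child_values):
    """Map child values (e.g. team names) to ids via a lookup built from the foreign table."""
    return [lookup[value] for value in child_values]

team_lookup = {"BOS": 1, "LAL": 2}  # hypothetical team_name -> id mapping
assert values_to_foreign_key_sketch(team_lookup, ["LAL", "BOS"]) == [2, 1]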
if __name__ == "__main__":
    db = Database("test", Config.get_property("outputs"))
    main(db)
def odds_for_today():
    """Match betting odds from Bovada to today's games and return the odds.

    Returns:
        A dictionary where column keys map to lists of values, or None if no games are
        available on Bovada.
    """
    scrape_time = datetime.now()

    # Check for a response from Bovada, falling back to the playoff URL
    url = Config.get_property("regularURL")
    response = bovada_json_request(url)
    if not response:
        url = Config.get_property("playoffURL")
        response = bovada_json_request(url)
        if not response:
            return None

    # Move down the tree towards games
    events = response[0]["events"]
    # Strip games from the 'events' object (which holds a bunch of unrelated information)
    bovada_games = [e for e in events
                    if e['description'].count('@') > 0 and e['type'] == 'GAMEEVENT']
    if not bovada_games:
        return None

    # Set up the line dictionary, which stores data in the correct table format
    lines = {"home_team": [], "away_team": [], "start_time": [], "spread": [],
             "home_spread_price": [], "away_spread_price": [], "home_moneyline": [],
             "away_moneyline": [], "scrape_time": []}

    # Iterate through each game returned by Bovada and store its information
    for game in bovada_games:
        # The game's start time is embedded in the digits of the last hyphen-separated
        # segment of its link
        link = game['link'].split('-')[-1]
        start_time = datetime.strptime(''.join(re.findall('[0-9]', link)), "%Y%m%d%H%M")
        if datetime.now() > start_time:
            # An ongoing game will not have the correct betting data; don't store it
            print("This game ({}) is either ongoing or completed. Not scraping.".format(
                game['description']))
            continue
        home_team, away_team = parse_teams(game["competitors"])

        # Get only the full-match betting information from the game object
        betting_info = game["displayGroups"][0]["markets"]
        full_match_bets = [bet for bet in betting_info
                           if bet["period"]["description"] == "Match"]

        # Extract the betting data associated with the game. Default every value to None so a
        # game missing a moneyline or point-spread market still produces a complete row.
        spread = home_spread_price = away_spread_price = None
        home_moneyline = away_moneyline = None
        for bet in full_match_bets:
            if bet["description"] == "Moneyline":
                home_moneyline, away_moneyline = parse_moneyline(bet)
                if home_moneyline == "":
                    home_moneyline = None
                if away_moneyline == "":
                    away_moneyline = None
            elif bet["description"] == "Point Spread":
                spread, home_spread_price, away_spread_price = parse_spread(bet)
                if spread == "":
                    spread = None
                if home_spread_price == "":
                    home_spread_price = None
                if away_spread_price == "":
                    away_spread_price = None

        game_lines = [home_team, away_team, start_time, spread, home_spread_price,
                      away_spread_price, home_moneyline, away_moneyline, scrape_time]
        # Relies on Python 3.7+ preserving dict key insertion order so game_lines aligns with lines
        for key, value in zip(lines, game_lines):
            lines[key].append(value)
    return lines
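# A worked example of the start-time parsing above. The link format is assumed from the
# parsing logic (the digits of the last hyphen-separated segment form a %Y%m%d%H%M timestamp):
import re
from datetime import datetime

example_link = "/basketball/nba/boston-celtics-los-angeles-lakers-202001232200"
digits = ''.join(re.findall('[0-9]', example_link.split('-')[-1]))
example_start = datetime.strptime(digits, "%Y%m%d%H%M")  # datetime(2020, 1, 23, 22, 0)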
if __name__ == "__main__":
    db = Database('test', "../management")
    session = Session(db.engine)
    year = Config.get_property('league_year')
    sched_tbl = db.table_mappings['schedule_{}'.format(year)]
    team_stats_tbl = db.table_mappings['team_stats_{}'.format(year)]
    test = main(session, team_stats_tbl, sched_tbl, graph=True)