def check_current_status(self, game_date):
    """Decide whether the Brooks daily dashboard should be scraped for *game_date*.

    Returns Result.Ok() when scraping should proceed (scrape condition is
    ALWAYS, or the dashboard has not been scraped yet for this date), and
    Result.Fail("skip") when the data already exists.
    """
    if self.scrape_condition == ScrapeCondition.ALWAYS:
        return Result.Ok()
    already_scraped = db.DateScrapeStatus.verify_brooks_daily_dashboard_scraped_for_date(
        self.db_session, game_date
    )
    return Result.Fail("skip") if already_scraped else Result.Ok()
def check_current_status(self, game_date):
    """Decide whether bbref boxscores should be scraped for *game_date*.

    Returns Result.Ok() when scraping should proceed (scrape condition is
    ALWAYS, or not all boxscores have been scraped for this date), and
    Result.Fail("skip") when the data already exists.
    """
    if self.scrape_condition == ScrapeCondition.ALWAYS:
        return Result.Ok()
    already_scraped = db.DateScrapeStatus.verify_all_bbref_boxscores_scraped_for_date(
        self.db_session, game_date
    )
    return Result.Fail("skip") if already_scraped else Result.Ok()
def launch(self):
    """Interactively edit a single environment-variable setting.

    Shows the current value, prompts for a replacement until the user
    confirms it, writes the change to the dotenv file, and — when the
    setting requires a restart — warns the user and terminates the process.
    """
    subprocess.run(["clear"])
    print_message(f"Variable Name: {self.setting_name}\n", fg="bright_magenta", bold=True)
    print_message(f"Current Value: {self.current_setting}\n", fg="bright_yellow", bold=True)
    if not yes_no_prompt(prompt="\nChange current setting?"):
        return Result.Ok(self.exit_menu)
    new_value = None
    while True:
        subprocess.run(["clear"])
        entry_prompt = f"Enter a new value for {self.setting_name}:\n"
        new_value = Input(entry_prompt, word_color=colors.foreground["default"]).launch()
        confirm_result = self.confirm_new_value(new_value)
        if confirm_result.failure:
            # Validation/confirmation aborted -> leave without changing anything.
            return Result.Ok(self.exit_menu)
        if confirm_result.value:
            break
    change_result = self.dotenv.change_value(self.setting_name, new_value)
    if not self.restart_required:
        return change_result
    print_message(RESTART_WARNING, fg="bright_magenta", bold=True)
    pause(message="Press any key to continue...")
    exit(0)
def view_boxscore(self, team_id, player_type):
    """Let the user repeatedly pick a player from a boxscore and view their stats.

    Batters go straight to the at-bat view; pitchers get a second prompt to
    choose which stat breakdown to display. Loops until a prompt is dismissed.
    """
    boxscore = self.get_boxscore(team_id, player_type)
    while True:
        subprocess.run(["clear"])
        player_choice = self.select_player_prompt(player_type, boxscore)
        if player_choice.failure:
            return Result.Ok()
        mlb_id = player_choice.value
        if player_type == "BAT":
            self.view_at_bats_for_player(player_type, mlb_id)
            continue
        subprocess.run(["clear"])
        data_choice = self.select_pitcher_data_prompt(mlb_id)
        if data_choice.failure:
            return Result.Ok()
        pitcher_data = data_choice.value
        if pitcher_data == "AT_BATS":
            self.view_at_bats_for_player(player_type, mlb_id)
        elif pitcher_data == "BAT_STATS":
            self.view_bat_stats_by_pitch_type_for_player(mlb_id)
        elif pitcher_data == "PITCH_MIX_BY_STANCE":
            self.view_pitch_mix_batter_stance_splits(mlb_id)
        elif pitcher_data == "PITCH_MIX_BY_SEASON":
            self.view_pitch_mix_season_splits(mlb_id)
        elif pitcher_data == "PLATE_DISCIPLINE":
            self.view_pd_pitch_type_splits_for_pitcher(mlb_id)
        elif pitcher_data == "BATTED_BALL":
            self.view_bb_pitch_type_splits_for_pitcher(mlb_id)
def patch_invalid_pfx_single_game(self):
    """Interactively repair invalid PitchFX data for self.game_id.

    Flow: analyze the game's invalid pfx, match it against game events that
    are missing pfx, let the user build a patch list from the matches, apply
    the patch list, summarize the outcome, and optionally display the
    patched-data diff tables. Returns Result.Ok(True) whenever the user
    declines or nothing can be fixed, Result.Ok() after a full run, or a
    failed Result when a sub-task errors.
    """
    result = self.patch_invalid_pfx.execute(self.game_id)
    if result.failure:
        # Analysis itself failed -> show the error and bail out gracefully.
        header = f"Invalid PitchFX Data for {self.game_id}\n"
        subprocess.run(["clear"])
        print_message(header, wrap=False, bold=True, underline=True)
        print_message(result.error, fg="bright_yellow")
        pause(message="Press any key to continue...")
        return Result.Ok(True)
    if not self.prompt_user_create_patch_list():
        return Result.Ok(True)
    result = self.patch_invalid_pfx.match_missing_pfx_data()
    if result.failure:
        return result
    # NOTE: the loop variable below is named `result` in the original code;
    # here it is a match-category key ("success"/"no_matches"/"many_matches"),
    # NOT a Result object. It is rebound again after the loop.
    for result, matches in self.patch_invalid_pfx.match_results.items():
        if result == "success":
            # Ask the user to confirm a patch for each successful match.
            for num, match_dict in enumerate(matches, start=1):
                match_dict["patch"] = self.prompt_user_create_patch(num, len(matches), match_dict)
        if result == "no_matches":
            self.display_no_match_found(matches)
        if result == "many_matches":
            self.display_many_matches_found(matches)
    if "success" not in self.patch_invalid_pfx.match_results:
        # Nothing matched at all -> tell the user to investigate manually.
        header = f"Invalid PitchFX Data for {self.game_id}\n"
        message = (
            "Unable to identify missing data that matches the invalid PitchFX data for this "
            "game. You should inspect the combined data JSON file for this game and "
            "investigate the invalid data manually.\n")
        subprocess.run(["clear"])
        print_message(header, wrap=False, bold=True, underline=True)
        print_message(message, fg="bright_yellow")
        pause(message="Press any key to continue...")
        return Result.Ok(True)
    result = self.patch_invalid_pfx.create_patch_list()
    if result.failure:
        return result
    if not self.patch_invalid_pfx.patch_list or not self.prompt_user_apply_patch_list():
        return Result.Ok(True)
    result = self.patch_invalid_pfx.apply_patch_list()
    if result.failure:
        return result
    self.patch_results = result.value
    print()
    # Summarize the three possible post-patch states (not mutually exclusive).
    if self.patch_results["fixed_all_errors"]:
        patch_result = f"PitchFX data for {self.game_id} is now completely reconciled (no errors of any type)!\n"
        print_success(patch_result)
    if self.patch_results["invalid_pfx"]:
        patch_result = f"{self.game_id} still contains invalid PitchFX data after applying the patch list.\n"
        print_error(patch_result)
    if self.patch_results["pfx_errors"]:
        patch_result = f"{self.game_id} still contains PitchFX data errors associated with valid at bats.\n"
        print_error(patch_result)
    pause(message="Press any key to continue...")
    subprocess.run(["clear"])
    if self.prompt_user_view_patched_data():
        self.display_patched_data_tables(**self.patch_results["patch_diff_report"])
    return Result.Ok()
def match_missing_pfx_data(self):
    """Pair each invalid pfx entry with the game event it most likely belongs to.

    Results are accumulated in self.match_results under three keys:
    "success" (exactly one candidate, or an exact match among several),
    "no_matches" (no candidate events), and "many_matches" (ambiguous).
    """
    if not self.game_contains_invalid_pfx():
        return Result.Ok()
    self.events.match_missing_pfx_data_start()
    for pfx in self.get_invalid_pfx_dict_list():
        candidates = self.find_game_events_matching_this_pfx(pfx)
        if not candidates:
            self.match_results["no_matches"].append(
                {"success": False, "invalid_pfx": pfx, "missing_pfx": {}}
            )
            continue
        if len(candidates) == 1:
            exact_match = candidates[0]
        else:
            # Multiple candidates -> only accept a provable exact match.
            exact = self.check_for_exact_match(pfx)
            if exact.failure:
                self.match_results["many_matches"].append(
                    {"success": False, "invalid_pfx": pfx, "missing_pfx": candidates}
                )
                continue
            exact_match = exact.value
        self.match_results["success"].append(self.found_successful_match(pfx, exact_match))
    self.events.match_missing_pfx_data_complete(self.match_results)
    return Result.Ok()
def update_player_data(self, task, data_set, no_prompts):
    """Run *task* to refresh player data, with spinner feedback and results table.

    When *no_prompts* is true, returns immediately after the task succeeds;
    otherwise shows a table of changed players (if any) before returning.
    """
    subprocess.run(["clear"])
    print_heading(f"Update {data_set}", fg="bright_yellow")
    spinner = Halo(spinner=get_random_dots_spinner(), color=get_random_cli_color())
    spinner.text = "Updating player data..."
    spinner.start()
    task_result = task.execute()
    if task_result.failure:
        spinner.stop()
        return task_result
    spinner.succeed(f"{data_set} was successfully updated!")
    if no_prompts:
        return Result.Ok()
    updated_players = task_result.value or []
    if not updated_players:
        pause(message="Press any key to continue...")
        return Result.Ok(updated_players)
    table_viewer = DictListTableViewer(
        dict_list=updated_players,
        prompt="Press Enter to continue",
        confirm_only=True,
        table_color="bright_yellow",
        heading=f"Updated {data_set}: Results",
        heading_color="bright_yellow",
        message=f"{len(updated_players)} changes total:",
        message_color="blue",
    )
    table_viewer.launch()
    return Result.Ok(updated_players)
def launch(self):
    """Install or update the Nightmare JS scraper's npm dependencies.

    First-time installs use a temporary npm cache folder which is cleaned up
    after a successful run; subsequent runs perform `npm update` instead.
    """
    subprocess.run(["clear"])
    print_heading(self.menu_heading, fg="bright_yellow")
    if not node_is_installed():
        print_message(INSTALL_ERROR, fg="bright_red", bold=True)
        pause(message="Press any key to continue...")
        # NOTE(review): returns None here (not a Result) — preserved from original.
        return
    if not NODEJS_INBOX.exists():
        NODEJS_INBOX.mkdir(parents=True, exist_ok=True)
    temp_folder = None
    if node_modules_folder_exists():
        message, prompt = UPDATE_MESSAGE, UPDATE_PROMPT
        command = "npm update --timeout=9999999"
    else:
        message, prompt = INSTALL_MESSAGE, INSTALL_PROMPT
        temp_folder = TemporaryDirectory(dir=NIGHTMAREJS_FOLDER)
        command = f"npm install --timeout=9999999 --cache={temp_folder.name}"
    print_message(message, fg="bright_yellow")
    if not yes_no_prompt(prompt, wrap=False):
        return Result.Ok(self.exit_menu)
    subprocess.run(["clear"])
    print_heading(self.menu_heading, fg="bright_yellow")
    run_result = run_command(command, cwd=str(NIGHTMAREJS_FOLDER))
    if run_result.failure:
        return run_result
    if temp_folder:
        temp_folder.cleanup()
    pause(message="\nPress any key to continue...")
    return Result.Ok(self.exit_menu)
def synchronize_files(self):
    """Review and apply pending file-sync changes for each data set.

    For every (file_type, data_set) pair that is out of sync, shows a table
    of missing/outdated files and applies the changes if the user confirms.
    """
    if self.all_files_are_in_sync:
        print_message(
            "All files for selected data sets are in sync!", fg="bright_green", bold=True
        )
        pause(message="Press any key to continue...")
        return Result.Ok()
    for file_type, file_type_dict in self.sync_files.items():
        for data_set, sync_details in file_type_dict.items():
            out_of_sync, missing_files, outdated_files = sync_details
            if not out_of_sync:
                continue
            missing = list(missing_files) if missing_files else []
            outdated = list(outdated_files) if outdated_files else []
            table_viewer = self.create_table_viewer(
                missing + outdated, data_set, file_type, len(missing), len(outdated)
            )
            if table_viewer.launch():
                self.apply_pending_changes(file_type, data_set, missing_files, outdated_files)
    return Result.Ok()
def parse_pitch_log(scraped_html, game, pitcher_id, url):
    """Parse a Brooks pitch-log HTML page into a pitch_log object.

    Section parse failures are non-fatal: the partially populated pitch_log
    is returned inside Result.Ok with parsed_all_info left unset. Only when
    every section parses is parsed_all_info flagged True.
    """
    page_content = html.fromstring(scraped_html, base_url=url)
    pitch_log = _initialize_pitch_log(game, pitcher_id, url)
    details = _parse_pitcher_details(page_content, game, pitcher_id)
    if details.failure:
        return Result.Ok(pitch_log)
    pitcher_dict = details.value
    pitch_log.pitcher_name = pitcher_dict["name"]
    pitch_log.pitcher_team_id_bb = pitcher_dict["team_id"]
    pitch_log.opponent_team_id_bb = pitcher_dict["opponent_id"]
    url_matches = page_content.xpath(PITCHFX_URL_XPATH)
    if not url_matches:
        return Result.Ok(pitch_log)
    pitch_log.pitchfx_url = Template(T_PITCHFX_URL).substitute(rel_url=url_matches[0])
    counts = _parse_pitch_counts(page_content)
    if counts.failure:
        return Result.Ok(pitch_log)
    pitch_log.pitch_count_by_inning = counts.value
    pitch_log.total_pitch_count = int(sum(pitch_log.pitch_count_by_inning.values()))
    pitch_log.parsed_all_info = True
    return Result.Ok(pitch_log)
def _get_summary_report_for_date_range(start_date, end_date, status_date_range):
    """Build a viewer summarizing scrape status for each date in the range.

    Returns Result.Ok wrapping either a simple "all scraped" report (when
    status_date_range is empty) or a table with one row per date.
    """
    start_str = start_date.strftime(DATE_MONTH_NAME)
    end_str = end_date.strftime(DATE_MONTH_NAME)
    heading = f"### STATUS REPORT FOR {start_str} - {end_str} ###"
    if not status_date_range:
        page = DisplayPage(
            ["All data has been scraped for all dates in the requested range"],
            heading,
            wrap=False,
        )
        return Result.Ok(_create_report_viewer([page], text_color="bright_magenta"))
    rows = [
        {"game_date": ds.game_date_str, "status": ds.scrape_status_description}
        for ds in status_date_range
    ]
    report = DictListTableViewer(
        rows,
        prompt="Press Enter to dismiss report",
        confirm_only=True,
        heading=heading,
        heading_color="bright_magenta",
        message=None,
        table_color="bright_magenta",
    )
    return Result.Ok(report)
def update_player_id_map(self):
    """Refresh the player id map, but only when the db is ready and in-season.

    A failed season check (presumably meaning today falls outside any season
    — TODO confirm) is treated as "nothing to do", not an error.
    """
    if not self.db_initialized:
        return Result.Ok()
    season_check = db.Season.is_date_in_season(self.db_session, datetime.now())
    if season_check.failure:
        return Result.Ok()
    subprocess.run(["clear"])
    return self.update_id_map_task.launch(no_prompts=True)
def delete_html(self, data_set, url_id):
    """Delete cached HTML for *url_id* from every storage backend in use.

    Skipped backends contribute a successful no-op Result so the combined
    outcome reflects only the deletions that were actually attempted.
    """
    local_result = (
        self.delete_html_local(data_set, url_id)
        if self.html_stored_local(data_set)
        else Result.Ok()
    )
    s3_result = (
        self.delete_html_s3(data_set, url_id)  # pragma: no cover
        if self.html_stored_s3(data_set)
        else Result.Ok()
    )
    return Result.Combine([local_result, s3_result])
def execute(self, sync_direction, file_type, data_set, year):
    """Find out-of-sync files for the given scope and sync them if any exist."""
    self.get_all_s3_objects()
    analysis = self.find_out_of_sync_files(sync_direction, file_type, data_set, year)
    if analysis.failure:
        self.events.error_occurred("Error occurred analyzing which files need to be synced.")
        return analysis
    out_of_sync, missing_files, outdated_files = analysis.value
    if not out_of_sync:
        return Result.Ok()
    self.sync_files(sync_direction, missing_files, outdated_files, file_type, data_set, year)
    return Result.Ok()
def get_object(self, setting_name, new_value):
    """Construct the config object for *setting_name* from the *new_value* tuple.

    Dispatches on the setting's CLASS_NAME and returns Result.Ok(dict) with
    the serialized object, or a failed Result when URL-delay validation fails.

    Fix: the original fell off the end and implicitly returned None for an
    unrecognized CLASS_NAME, which breaks callers that check `.failure`;
    now returns an explicit Result.Fail instead.
    """
    setting = self.config_json.get(setting_name)
    class_name = setting.get("CLASS_NAME")
    if class_name == "UrlScrapeDelay":
        result = self.validate_new_url_delay_setting(new_value)
        if result.failure:
            return result
        return Result.Ok(UrlScrapeDelay(*new_value).to_dict())
    if class_name == "BatchJobSettings":
        return Result.Ok(BatchJobSettings(*new_value).to_dict())
    if class_name == "BatchScrapeDelay":
        return Result.Ok(BatchScrapeDelay(*new_value).to_dict())
    return Result.Fail(f"Unknown config object type: {class_name}")
def get_game_status_record(db_session, bbref_game_id, bb_game_id):
    """Look up the GameScrapeStatus row by bbref id, falling back to the bb id.

    When the record is only found via bb_game_id, its bbref_game_id field is
    backfilled before returning. Fails when neither id locates a record.
    """
    by_bbref_id = db.GameScrapeStatus.find_by_bbref_game_id(db_session, bbref_game_id)
    if by_bbref_id:
        return Result.Ok(by_bbref_id)
    by_bb_id = db.GameScrapeStatus.find_by_bb_game_id(db_session, bb_game_id)
    if by_bb_id:
        by_bb_id.bbref_game_id = bbref_game_id
        return Result.Ok(by_bb_id)
    return Result.Fail(
        f"scrape_status_game does not contain an entry for game_id: {bbref_game_id}"
    )
def find_out_of_sync_files(self, sync_direction, file_type, data_set, year):
    """Determine which files need syncing between local storage and S3.

    The source side depends on the direction (local files when uploading,
    S3 objects when downloading). An empty source yields (False, [], []).
    """
    self.events.find_out_of_sync_files_start()
    s3_objects, local_files = self.get_all_files_in_src_and_dest(file_type, data_set, year)
    uploading = sync_direction == SyncDirection.UP_TO_S3
    src_files = local_files if uploading else s3_objects
    dest_files = s3_objects if uploading else local_files
    # No source files -> nothing can be synced in this direction.
    sync_results = get_files_to_sync(src_files, dest_files) if src_files else (False, [], [])
    self.events.find_out_of_sync_files_complete(sync_results)
    return Result.Ok(sync_results)
def launch(self):
    """Prompt the user, run the database backup task, and show the zip details."""
    if not self.prompt_user_run_task():
        return Result.Ok(True)
    self.subscribe_to_events()
    backup_result = self.backup_db.execute()
    self.unsubscribe_from_events()
    self.spinner.stop()
    if backup_result.failure:
        return backup_result
    subprocess.run(["clear"])
    self.display_zip_file_details(backup_result.value)
    pause(message="\nPress any key to continue...")
    return Result.Ok(True)
def upload_to_s3(self, file_type, data_set, scraped_data, s3_key, filepath):  # pragma: no cover
    """Upload *filepath* to S3 under *s3_key*, writing parsed JSON to disk first.

    When the file is not also stored locally, the local copy is removed after
    a successful upload and Result.Ok() is returned; otherwise the local path
    is returned inside Result.Ok. S3 client errors become failed Results.
    """
    delete_file = not self.check_file_stored_local(file_type, data_set)
    if file_type == VigFile.PARSED_JSON:
        write_result = self.write_to_file(file_type, scraped_data, filepath)
        if write_result.failure:
            return write_result
    try:
        self.get_s3_bucket().upload_file(str(filepath), s3_key)
    except botocore.exceptions.ClientError as ex:
        error_code = ex.response["Error"]["Code"]
        return Result.Fail(f"{repr(ex)} (Error Code: {error_code})")
    if delete_file:
        filepath.unlink()
        return Result.Ok()
    return Result.Ok(filepath)
def execute(self):
    """Sync the bbref player-team map file with the current database contents.

    Returns Result.Ok(list of dicts) describing newly added entries, or
    Result.Ok([]) when nothing changed.

    Fixes: the original returned bare None after first-time initialization,
    which crashes callers that check `.failure`; the comprehension variable
    shadowed `team_map`; and the trailing `if new_team_map else None` was
    dead code (we already returned when the list was empty).
    """
    if not self.team_map_filepath.exists():
        self.initialize_bbref_player_team_map()
        return Result.Ok()
    team_map = self.read_bbref_player_team_map_from_file()
    current_team_map = self.get_current_team_id_map()
    new_entries = list(set(current_team_map) - set(team_map))
    if not new_entries:
        return Result.Ok([])
    team_map = team_map + new_entries
    team_map.sort(key=lambda x: (x.player_ID, x.year_ID, x.stint_ID))
    self.write_bbref_player_team_map_to_file(team_map)
    return Result.Ok([asdict(entry) for entry in new_entries])
def launch(self):
    """Prompt, calculate time-between-pitches metrics, persist, and display them."""
    if not self.prompt_user_run_task():
        return Result.Ok(True)
    self.subscribe_to_events()
    calc_result = self.calc_pitch_times.execute()
    self.spinner.stop()
    self.unsubscribe_from_events()
    if calc_result.failure:
        return calc_result
    subprocess.run(["clear"])
    metrics = db.TimeBetweenPitches.from_calc_results(self.db_session, calc_result.value)
    self.display_pitch_metrics(metrics.as_dict())
    pause(message="\nPress any key to continue...")
    return Result.Ok()
def execute(self):
    """Sync the bbref player-id map file with the current database contents.

    Returns Result.Ok(list of dicts) describing newly added ids, or
    Result.Ok([]) when nothing changed.

    Fixes: the original returned bare None after first-time initialization,
    which crashes callers that check `.failure`; the comprehension variable
    shadowed `id_map`; and the trailing `if new_id_map else None` was dead
    code (we already returned when the list was empty).
    """
    if not self.id_map_filepath.exists():
        self.initialize_bbref_player_id_map()
        return Result.Ok()
    id_map = self.read_bbref_player_id_map_from_file()
    current_id_map = self.get_current_player_id_map()
    new_entries = list(set(current_id_map) - set(id_map))
    if not new_entries:
        return Result.Ok([])
    id_map = id_map + new_entries
    id_map.sort(key=lambda x: x.player_ID)
    self.write_bbref_player_id_map_to_file(id_map)
    return Result.Ok([asdict(entry) for entry in new_entries])
def download_file(url: str, local_folder: Path):
    """Download *url* into *local_folder*, resuming a partial download if present.

    A HEAD request determines the expected size; if a partial local file
    exists, the download resumes via an HTTP Range header. Progress is shown
    with tqdm. Returns Result.Ok(local_file_path) on success, Result.Fail
    when the remote size is unknown or the final byte count mismatches.

    Fix: corrected "Recieved" -> "Received" in the size-mismatch error message.
    """
    file_name = get_file_name_from_url(url)
    local_file_path = local_folder.joinpath(file_name)
    r = requests.head(url)
    remote_file_size = int(r.headers.get("content-length", 0))
    if not remote_file_size:
        return Result.Fail(
            f'Request for "{file_name}" did not return a response containing the file size.'
        )
    local_file_size = 0
    resume_header = None
    fopen_mode = "wb"
    if not local_file_path.exists():
        print(f'"{file_name}" does not exist. Downloading...')
    else:
        local_file_size = local_file_path.stat().st_size
        if local_file_size == remote_file_size:
            print(f'"{file_name}" is complete. Skipping...')
            return Result.Ok(local_file_path)
        # Partial file -> request only the remaining bytes and append.
        print(f'"{file_name}" is incomplete. Resuming...')
        resume_header = {"Range": f"bytes={local_file_size}-"}
        fopen_mode = "ab"
    r = requests.get(url, stream=True, headers=resume_header)
    with open(local_file_path, fopen_mode) as f:
        with tqdm(
            total=remote_file_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            desc=local_file_path.name,
            initial=local_file_size,
            ascii=True,
            miniters=1,
        ) as pbar:
            for chunk in r.iter_content(32 * CHUNK_SIZE):
                f.write(chunk)
                pbar.update(len(chunk))
    local_file_size = local_file_path.stat().st_size
    if local_file_size == remote_file_size:
        return Result.Ok(local_file_path)
    more_or_fewer = "more" if local_file_size > remote_file_size else "fewer"
    error = (
        f'Received {more_or_fewer} bytes than expected for "{file_name}"!\n'
        f"Expected File Size: {remote_file_size:,} bytes\n"
        f"Received File Size: {local_file_size:,} bytes")
    return Result.Fail(error)
def verify_bbref_boxscore_ATL201803290(bbref_boxscore):
    """Verify every parsed field of the ATL201803290 bbref boxscore fixture.

    Checks game identity/url, both teams' line-score totals and pre-game
    records, and the game meta info block, then returns Result.Ok().
    """
    boxscore_url = get_bbref_boxscore_url(BBREF_GAME_ID)
    # Game identity
    assert bbref_boxscore.bbref_game_id == BBREF_GAME_ID
    assert bbref_boxscore.boxscore_url == boxscore_url
    # Away team (PHI) totals and pre-game record
    assert bbref_boxscore.away_team_data.team_id_br == "PHI"
    assert bbref_boxscore.away_team_data.total_runs_scored_by_team == 5
    assert bbref_boxscore.away_team_data.total_runs_scored_by_opponent == 8
    assert bbref_boxscore.away_team_data.total_wins_before_game == 0
    assert bbref_boxscore.away_team_data.total_losses_before_game == 1
    assert bbref_boxscore.away_team_data.total_hits_by_team == 6
    assert bbref_boxscore.away_team_data.total_hits_by_opponent == 9
    assert bbref_boxscore.away_team_data.total_errors_by_team == 1
    assert bbref_boxscore.away_team_data.total_errors_by_opponent == 0
    # Home team (ATL) totals and pre-game record (mirror image of away data)
    assert bbref_boxscore.home_team_data.team_id_br == "ATL"
    assert bbref_boxscore.home_team_data.total_runs_scored_by_team == 8
    assert bbref_boxscore.home_team_data.total_runs_scored_by_opponent == 5
    assert bbref_boxscore.home_team_data.total_wins_before_game == 1
    assert bbref_boxscore.home_team_data.total_losses_before_game == 0
    assert bbref_boxscore.home_team_data.total_hits_by_team == 9
    assert bbref_boxscore.home_team_data.total_hits_by_opponent == 6
    assert bbref_boxscore.home_team_data.total_errors_by_team == 0
    assert bbref_boxscore.home_team_data.total_errors_by_opponent == 1
    # Game meta info (venue, duration, weather)
    assert bbref_boxscore.game_meta_info.attendance == 40208
    assert bbref_boxscore.game_meta_info.park_name == "SunTrust Park"
    assert bbref_boxscore.game_meta_info.game_duration == "3:28"
    assert bbref_boxscore.game_meta_info.day_night == "Day Game"
    assert bbref_boxscore.game_meta_info.field_type == "On Grass"
    assert bbref_boxscore.game_meta_info.first_pitch_temperature == 74
    assert bbref_boxscore.game_meta_info.first_pitch_wind == "Wind 16mph from Left to Right"
    assert bbref_boxscore.game_meta_info.first_pitch_clouds == "Cloudy"
    assert bbref_boxscore.game_meta_info.first_pitch_precipitation == "No Precipitation"
    return Result.Ok()
def check_for_exact_match(self, pfx):
    """Return the single game event that exactly matches the invalid *pfx* data.

    Returns Result.Ok(event_dict) only when exactly one game event satisfies
    every matching criterion below; otherwise Result.Fail("") (no match or
    ambiguous match — callers only inspect .failure, hence the empty message).
    """
    # given the invalid pfx data passed as an argument, the process to find an exact match is:
    exact_match = [
        self.get_event_dict(event)  # iterate through all game events, for each game event:
        for event in self.game_events
        # if game event is missing pitchfx data
        if event["at_bat_pitchfx_audit"]["missing_pitchfx_count"] > 0 and (
            # AND game_event and invalid pfx took place in the same inning
            event["inning_id"][-5:] == pfx["inning_id"] and (
                # AND game event and invalid pfx have the same batter OR the same pitcher
                event["batter_id_mlb"] == pfx["batter_id"]
                or event["pitcher_id_mlb"] == pfx["pitcher_id"])
            # AND number of pitches missing is the same as the number of invalid pfx
            and event["at_bat_pitchfx_audit"]["missing_pitchfx_count"] == pfx["pitch_count"]
            # AND invalid pfx pitch seq. numbers are the same as the missing pitches
            and all(p_num in pfx["invalid_pfx"]
                    for p_num in event["at_bat_pitchfx_audit"]["missing_pitch_numbers"]))
    ]
    if not exact_match:
        # zero game events matched all the criteria -> NO EXACT MATCH
        return Result.Fail("")
    if len(exact_match) != 1:
        # more than one game event matched all the criteria -> NO EXACT MATCH
        return Result.Fail("")
    # one game event matched all the criteria -> EXACT MATCH
    return Result.Ok(exact_match[0])
def _parse_pitcher_details(page_content, game, pitcher_id):
    """Parse the pitcher's name and team ids from the game-log page.

    The scraped string ends with two dash-separated segments (team info);
    the pitcher's name is everything before the second-to-last "-" — names
    themselves may contain dashes, hence searching from the right.

    Fixes: replaced the manual dash-index scan + reverse with two str.rfind
    calls; the original's `not indices or len(indices) < 2` guard was
    redundant (the length check already covers the empty list).
    """
    query = Template(T_PITCHER_NAME_XPATH).substitute(id=pitcher_id)
    parsed = page_content.xpath(query)
    if not parsed:
        return Result.Fail("Failed to parse pitcher name from game log page.")
    selected_pitcher = parsed[0]
    last_dash = selected_pitcher.rfind("-")
    # rfind with end=last_dash finds the second-to-last dash (or -1 if < 2 dashes).
    second_to_last_dash = selected_pitcher.rfind("-", 0, last_dash) if last_dash > -1 else -1
    if second_to_last_dash == -1:
        return Result.Fail("Failed to parse pitcher name from game log page.")
    name = selected_pitcher[:second_to_last_dash].strip()
    result = _parse_team_ids(game, selected_pitcher)
    if result.failure:
        return result
    id_dict = result.value
    pitcher_dict = {
        "name": name,
        "team_id": id_dict["team_id"],
        "opponent_id": id_dict["opponent_id"],
    }
    return Result.Ok(pitcher_dict)
def apply_patch_list(self):
    """Apply the prepared patch list, re-combine the data, and report the diff.

    Snapshots the boxscore before and after re-combining so the patch results
    can summarize exactly what changed. No-ops when there is nothing to patch.
    """
    if not self.game_contains_invalid_pfx() or not self.patch_list:
        return Result.Ok({"created_patch_list": False})
    self.events.combine_scraped_data_start()
    boxscore_before = deepcopy(self.boxscore)
    combine_result = self.verify_scraped_data_can_be_combined(apply_patch_list=True)
    if combine_result.failure:
        return combine_result
    boxscore_after = deepcopy(self.boxscore)
    patch_results = self.summarize_patch_results(boxscore_before, boxscore_after)
    update_result = self.combine_data_and_update_db()
    if update_result.failure:
        return update_result
    self.events.combine_scraped_data_complete(patch_results)
    return Result.Ok(patch_results)
def initialize_attributes(self):
    """Reset all per-game working state before processing a new game."""
    self.patch_list = None
    self.match_results = defaultdict(list)
    # Per-game lookup structures, cleared for each run.
    self.boxscore = {}
    self.game_events = {}
    self.invalid_pfx_map = {}
    return Result.Ok()
def apply(self, data):
    """Apply every patch in patch_list in order, threading the result forward.

    Stops and returns the failed Result of the first patch that errors;
    otherwise returns Result.Ok with the fully patched data.
    """
    patched = data
    for current_patch in self.patch_list:
        outcome = current_patch.apply(patched)
        if outcome.failure:
            return outcome
        patched = outcome.value
    return Result.Ok(patched)
def parse_scraped_html(self):
    """Parse every tracked URL's HTML, persist the JSON, and update scrape status.

    Known-benign parse failures (no game data / game to be completed later)
    are skipped; any other failure aborts immediately. The spinner text is
    refreshed after each successfully parsed URL.
    """
    parsed_count = 0
    self.spinner.text = self.url_tracker.parse_html_report(parsed_count)
    for urls_for_date in self.url_tracker.all_urls.values():
        for url_details in urls_for_date:
            if url_details.url_id not in self.url_tracker.parse_url_ids:
                continue
            parse_result = self.parse_html(url_details)
            if parse_result.failure:
                benign = (
                    "Unable to parse any game data" in parse_result.error
                    or "will be completed at a later date" in parse_result.error
                )
                if benign:
                    continue
                return parse_result
            parsed_data = parse_result.value
            save_result = self.scraped_data.save_json(self.data_set, parsed_data)
            if save_result.failure:
                return save_result
            status_result = self.update_status(parsed_data)
            if status_result.failure:
                return status_result
            self.db_session.commit()
            parsed_count += 1
            self.spinner.text = self.url_tracker.parse_html_report(parsed_count)
    return Result.Ok()