def test_all():
    """
    Generator test: yields one (check_refs, personnel, summary) case for every
    game number from 750 up to and including 1149 of the 20142015 season.

    Each game is loaded through GameHeader, Roster and GameSummary before the
    case is handed back to the test runner, which is expected to call
    check_refs(personnel, summary).
    """
    season, report_kind, game_kind = '20142015', 'PL', '02'

    for raw_number in range(750, 1150):
        padded_number = Operations.pad_game_num(raw_number)

        # The header is only needed as an input to the summary harvest.
        header = GameHeader.harvest(season, padded_number, report_kind, game_kind)
        personnel = Roster.harvest(season, padded_number)
        summary = GameSummary.harvest(season, padded_number, header, personnel)

        yield check_refs, personnel, summary
def harvest(year, game_num, game_type="02"):
    """
    Extract roster information from the roster ("RO") report of one game and
    bundle it into a GamePersonnel object.

    The original note on this function says the report is an html file on the
    local machine and that database entries are created; neither is visible
    here - presumably both happen inside Operations / the chop_* helpers
    (TODO confirm).

    year      - season string, e.g. "20142015"
    game_num  - padded game number string (see Operations.pad_game_num)
    game_type - game type code forwarded to the report loaders; defaults to
                "02", the value that used to be hard-coded, so existing
                two-argument callers behave exactly as before

    Returns GamePersonnel(away_roster, home_roster, away_coach, home_coach,
    referees, linesmen).
    """
    report_type = "RO"

    game_info = GameHeader.harvest(year, game_num, report_type, game_type)
    # NOTE(review): germinate_report_seed is assumed to take the same
    # (year, game_num, report_type, game_type) order as GameHeader.harvest;
    # it was previously called with the same two literals.
    tree = Operations.germinate_report_seed(year, game_num, report_type, game_type)
    tables = tree.xpath("//table//table//table//table")

    away_roster = chop_ind_roster_branch(tables, "away", game_info, year)
    home_roster = chop_ind_roster_branch(tables, "home", game_info, year)

    # Coaches come back without a team; attach it from the game header.
    away_coach, home_coach = chop_coach_branch(tables)
    away_coach.team = game_info.away_team
    home_coach.team = game_info.home_team

    referees, linesmen = chop_officials_branch(tables)

    return GamePersonnel(
        away_roster, home_roster, away_coach, home_coach, referees, linesmen
    )


if __name__ == "__main__":
    # Manual smoke run: print the personnel of games 1-99 of 2014-2015.
    for x in range(1, 100):
        game_num = Operations.pad_game_num(x)
        print(harvest("20142015", game_num))
def harvest(season, start_game, finish_game, game_type,
            destination_path="C:/Users/Ruben/Projects/HockeyScraper/Reports/"):
    """
    Grabs all reports for games of game_type in season from nhl.com and stores
    them in a local folder named after the season.

    season           - season string used in both the url and the folder name
    start_game       - first game number to grab (int)
    finish_game      - game number to stop at; NOT grabbed itself, because the
                       numbers are walked with range(start_game, finish_game)
    game_type        - game type code that is embedded in each file name
    destination_path - folder that holds one sub-folder per season; defaults
                       to the location that used to be hard-coded

    Reports that are already present in the season folder are skipped. Each
    successful download is followed by a short random pause. A summary of the
    run and the list of urls that were not found is printed at the end.
    Nothing is returned.
    """
    report_types = ["GS", "ES", "FC", "FS", "PL", "TV", "TH", "RO", "SS"]
    not_found_urls = []
    start_time = time.time()
    total_delay = 0.0
    saved_counter = 0
    imported_counter = 0

    # Seeing if season being grabbed has been instantiated and if not, doing so
    files_path = os.path.join(destination_path, season)
    if not os.path.isdir(files_path):
        os.mkdir(files_path)

    # Seeing if game reports have been downloaded, and doing so if they have not
    # (a set makes the per-report membership test constant time)
    already_saved_files = set(os.listdir(files_path))

    for game_num in range(start_game, finish_game):
        game_padded = Operations.pad_game_num(game_num)

        for report_type in report_types:
            file_name = report_type + game_type + game_padded + ".HTM"
            url = "http://www.nhl.com/scores/htmlreports/" + season + "/" + file_name

            if file_name in already_saved_files:
                print(file_name + " - Already Saved")
                saved_counter += 1
                continue

            report = requests.get(url)

            # Trust the HTTP status first so an error body is never parsed;
            # fall back to the page title for servers that answer 200 with a
            # "404 Not Found" page.
            not_found = report.status_code == 404
            if not not_found:
                title = html.fromstring(report.text).xpath("//head/title/text()")
                not_found = title == ["404 Not Found"]

            if not_found:
                not_found_urls.append(url)
                print(file_name + " - 404 ERROR, NOT FOUND")
                continue

            # Encoded bytes belong in a binary-mode file: text mode would
            # rewrite line endings on Windows. "with" closes the handle even
            # if the write fails.
            with open(os.path.join(files_path, file_name), "wb") as report_file:
                report_file.write(report.text.encode("utf-8"))

            # Pause between 1/60 and 15/60 of a second after each download.
            delay = randint(1, 15) / 60.0
            total_delay += delay
            time.sleep(delay)
            print(file_name + " - Imported - %0.2fs Delay" % delay)
            imported_counter += 1

    total_time = time.time() - start_time
    total_files = saved_counter + imported_counter
    reports_per_game = len(report_types)

    # A run that imports nothing (everything already saved or missing) used to
    # crash here with ZeroDivisionError; report 0.00 instead.
    time_per_file = total_time / imported_counter if imported_counter else 0.0
    delay_percent = total_delay / total_time * 100 if total_time else 0.0

    # The explicit " " reproduces the separator the former
    # "print a, b" statements inserted between their two items.
    print(str(imported_counter) + " " + " - files imported")
    print(str(saved_counter) + " " + " - files already saved")
    print(str(total_files) + " - total files - %0.1f games" % (
        total_files / float(reports_per_game)
    ))
    print("%0.2fs - total time taken" % total_time)
    print("%0.2fs - time taken per file imported" % time_per_file)
    print("%0.2fs - time taken per game imported" % (time_per_file * reports_per_game))
    print("%0.2fs - time spend in delays - %0.2f percent of total time" % (
        total_delay, delay_percent
    ))

    print("The following reports were not found: ")
    for item in not_found_urls:
        print(item)
game_personnel.home_roster) pruned_events = [] for index, event in enumerate(raw_events): pruned_event = prune_event(index, raw_events, game_personnel) pruned_events.append(pruned_event) #if pruned_event.event_type == 'HIT': # print pruned_event return PlayByPlay(raw_events, pruned_events) if __name__ == '__main__': year = '20142015' report_type = 'PL' game_type = '02' for game_num_raw in range (7,8): game_num = Operations.pad_game_num (game_num_raw) game_info = GameHeader.harvest(year, game_num, report_type, game_type) game_personnel = Roster.harvest (year, game_num) temp_pbp = harvest(year, game_num, report_type, game_type, game_info, game_personnel) print game_info for item in temp_pbp.pruned_events: if item.event_type == 'PENL': print item