# --- Example 1 ---
def test_all():
	"""Generator-style test: yield one check_refs case per regular-season
	game numbered 750-1149 of the 2014-2015 season."""
	season = '20142015'
	report = 'PL'
	kind = '02'

	for raw_num in range(750, 1150):
		padded = Operations.pad_game_num(raw_num)
		header = GameHeader.harvest(season, padded, report, kind)
		personnel = Roster.harvest(season, padded)
		summary = GameSummary.harvest(season, padded, header, personnel)
		yield check_refs, personnel, summary
# --- Example 2 ---

def harvest(year, game_num):
    """
    Build a GamePersonnel record for one game.

    Parses the local roster ("RO") HTML report for the given season and
    game number, then assembles both rosters, both coaches (tagged with
    their team from the game header), and the officiating crew.
    """
    header = GameHeader.harvest(year, game_num, "RO", "02")
    tree = Operations.germinate_report_seed(year, game_num, "RO", "02")
    roster_tables = tree.xpath("//table//table//table//table")

    visitors = chop_ind_roster_branch(roster_tables, "away", header, year)
    hosts = chop_ind_roster_branch(roster_tables, "home", header, year)

    visiting_coach, host_coach = chop_coach_branch(roster_tables)
    visiting_coach.team = header.away_team
    host_coach.team = header.home_team

    refs, lines = chop_officials_branch(roster_tables)

    return GamePersonnel(visitors, hosts, visiting_coach, host_coach, refs, lines)


if __name__ == "__main__":
    for x in range(1, 100):
        game_num = Operations.pad_game_num(x)
        print harvest("20142015", game_num)
def harvest(season, start_game, finish_game, game_type):
    """
	Grabs all resports for games with game numbers between start_game
	and finish_game of game_type in season from nhl.com and stores them in a
	local file
	"""

    destination_path = "C:/Users/Ruben/Projects/HockeyScraper/Reports/"
    report_types = ["GS", "ES", "FC", "FS", "PL", "TV", "TH", "RO", "SS"]
    not_found_urls = []
    start_time = time.time()
    total_delay = 0.0
    saved_counter = 0
    imported_counter = 0

    # Seeing if season being grabbed has been instantiated and if not, doing so
    season_folders = os.listdir(destination_path)

    if season not in season_folders:
        new_season_folder_path = destination_path + season + "/"
        os.mkdir(new_season_folder_path)

        # Seeing if game reports have been dowloaded, and doing so if they have not
    files_path = destination_path + season + "/"
    alreadY_saved_files = os.listdir(files_path)

    for game_num in range(start_game, finish_game):
        game_padded = Operations.pad_game_num(game_num)

        for report_type in report_types:
            file_name = report_type + game_type + game_padded + ".HTM"
            url = "http://www.nhl.com/scores/htmlreports/" + season + "/" + file_name

            if file_name in alreadY_saved_files:
                print file_name + " - Already Saved"
                saved_counter += 1
            else:
                report = requests.get(url)
                tree = html.fromstring(report.text)
                check = tree.xpath("//head/title/text()")

                if check != ["404 Not Found"]:
                    temp_file = open(files_path + file_name, "w")
                    temp_file.write(report.text.encode("utf-8"))
                    temp_file.close()
                    delay = randint(1, 15) / 60.0
                    total_delay += delay
                    time.sleep(delay)
                    print file_name + " - Imported - %0.2fs Delay" % delay
                    imported_counter += 1
                else:
                    not_found_urls.append(url)
                    print file_name + " - 404 ERROR, NOT FOUND"

    total_time = time.time() - start_time
    print str(imported_counter), " - files imported"
    print str(saved_counter), " - files already saved"
    print str(saved_counter + imported_counter) + " - total files - %0.1f games" % (
        (saved_counter + imported_counter) / 9.0
    )
    print "%0.2fs - total time taken" % total_time
    print "%0.2fs - time taken per file imported" % (total_time / imported_counter)
    print "%0.2fs - time taken per game imported" % (total_time / imported_counter * 9)
    print "%0.2fs - time spend in delays - %0.2f percent of total time" % (total_delay, total_delay / total_time * 100)
    print "The following reports were not found: "
    for item in not_found_urls:
        print item
		game_personnel.home_roster)
	pruned_events = []

	for index, event in enumerate(raw_events):

		pruned_event = prune_event(index, raw_events, game_personnel)
		pruned_events.append(pruned_event)
		#if pruned_event.event_type == 'HIT':
		#	print pruned_event
			
		
	return PlayByPlay(raw_events, pruned_events)

if __name__ == '__main__':
	
	year = '20142015'
	report_type = 'PL'
	game_type = '02'

	for game_num_raw in range (7,8):
		game_num = Operations.pad_game_num (game_num_raw)
		
		game_info = GameHeader.harvest(year, game_num, report_type, game_type)
		game_personnel = Roster.harvest (year, game_num)
		temp_pbp = harvest(year, game_num, report_type, game_type, game_info, game_personnel)

		print game_info
		for item in temp_pbp.pruned_events:
			if item.event_type == 'PENL':
				print item