def _test_hash_eq(self, other_name: str):
    """Check the hash/equality contract for links and pages.

    Links and pages built from the same name must compare equal with
    matching hashes; ones built from *other_name* must differ in both.
    """
    same_a = BBRefLink(self.name)
    same_b = BBRefLink(self.name)
    page_a = Page.from_link(same_a)
    page_b = Page.from_link(same_b)
    # identical names -> equal objects with equal hashes
    assert hash(same_a) == hash(same_b)
    assert same_a == same_b
    assert hash(page_a) == hash(page_b)
    assert page_a == page_b
    other_link = BBRefLink(other_name)
    other_page = Page.from_link(other_link)
    # differing names -> unequal objects with differing hashes
    assert hash(same_a) != hash(other_link)
    assert same_a != other_link
    assert hash(page_a) != hash(other_page)
    assert page_a != other_page
def test_can_parse(self, url: str):
    """Smoke-test that the page at *url* can be fetched and fully scraped."""
    utils.clean_db()
    page = Page.from_link(BBRefLink(url))
    # game pages need their players present in the DB before scraping
    if isinstance(page, GamePage):
        utils.insert_mock_players(page)
    ScrapeNode.from_page(page).scrape()
def test_from_page(self):
    """ScrapeNode.from_page must memoize: same page -> identical node."""
    page = Page.from_link(BBRefLink("WAS201710120.shtml"))
    first = ScrapeNode.from_page(page)
    second = ScrapeNode.from_page(page)
    # both calls must return the very same (cached) object
    assert first is second
    assert first.__class__ == InsertableScrapeNode
def scrape_year(year: int) -> None:
    """Scrape the full MLB schedule (and every page it links to) for a season.

    Args:
        year: four-digit season to scrape.
    """
    sched_url = f"https://www.baseball-reference.com/leagues/MLB/{year}-schedule.shtml"
    sched_link = BBRefLink(sched_url)
    # since the current year's schedule will be continually updated as new games
    # are played, do not want to use cached version of its page
    use_cache = year != CUR_YEAR  # idiomatic form of `False if year == CUR_YEAR else True`
    sched = Page.from_link(sched_link, use_cache)
    ScrapeNode.from_page(sched).scrape()
def test_cannot_parse(self):
    """A game with no play-by-play data must raise MissingPlayDataError."""
    utils.clean_db()
    box_url = "https://www.baseball-reference.com/boxes/PIT/PIT196507020.shtml"
    page = Page.from_link(BBRefLink(box_url))
    assert isinstance(page, GamePage)
    utils.insert_mock_players(page)
    # scraping should fail because the box score lacks play data
    with pytest.raises(MissingPlayDataError):
        ScrapeNode.from_page(page).scrape()
def test_no_visit_twice(self):
    """Pages shared between games must only be scraped once.

    The first game scrapes its full tree (39 pages); the second game
    shares the same lineups, so only the game page itself is new (1).
    """
    # these games share the same lineups
    utils.clean_db()
    games = ["WAS201710120.shtml", "CHN201710110.shtml"]
    for game, expected_scrape_num in zip(games, [39, 1]):
        # BUG FIX: use the loop variable `game`; the original hard-coded
        # the first game's name, so the second game was never scraped.
        link = BBRefLink(game)
        page = Page.from_link(link)
        node = ScrapeNode.from_page(page)
        assert node.scrape() == expected_scrape_num
def test_dup_team_names(self):
    """Games where both clubs resolve to duplicate team names still scrape."""
    urls = (
        "https://www.baseball-reference.com/boxes/WS2/WS2197109300.shtml",
        "https://www.baseball-reference.com/boxes/CLE/CLE196007171.shtml",
    )
    for url in urls:
        page = Page.from_link(BBRefLink(url))
        assert isinstance(page, GamePage)
        utils.insert_mock_players(page)
        ScrapeNode.from_page(page).scrape()
def test_malformed_html(self):
    """Constructing a Page from a malformed cached player file must not crash."""
    # the web handler will download the correct html, so need to copy
    # malformed html to cached file beforehand to test properly
    resource_dir = os.path.join("tests", "scraping", "resources", "PlayerPage")
    copyfile(
        src=os.path.join(resource_dir, "malformed_arod.shtml"),
        dst=os.path.join(resource_dir, "rodrial01.shtml"),
    )
    player_url = "https://www.baseball-reference.com/players/r/rodrial01.shtml"
    link = BBRefLink(player_url)
    page = Page.from_link(link)
def insert_game(url: str) -> None:
    """Fetch the game at *url*, seed mock players for it, then scrape it."""
    page = Page.from_link(BBRefLink(url))
    insert_mock_players(page)  # type: ignore
    ScrapeNode.from_page(page).scrape()
def test_page_types(self):
    """Each resource URL must parse into its expected Page subclass."""
    expected_types = [GamePage, SchedulePage, PlayerPage]
    for url, page_cls in zip(RES_URLS, expected_types):
        page = Page.from_link(BBRefLink(url))
        # exact-type check (not isinstance): subclasses would be a mismatch
        assert type(page) == page_cls