Example #1
 def _test_hash_eq(self, other_name: str):
     link = BBRefLink(self.name)
     link2 = BBRefLink(self.name)
     p1 = Page.from_link(link)
     p2 = Page.from_link(link)
     assert hash(link) == hash(link2)
     assert link == link2
     assert hash(p1) == hash(p2)
     assert p1 == p2
     game_link = BBRefLink(other_name)
     p3 = Page.from_link(game_link)
     assert hash(link) != hash(game_link)
     assert link != game_link
     assert hash(p1) != hash(p3)
     assert p1 != p3
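
This test passes as long as BBRefLink hashes and compares by its page name alone. A minimal sketch of what that could look like, assuming (purely illustratively, this is not the project's actual code) that the constructor normalizes a full URL down to its trailing file name:

class BBRefLink:
    def __init__(self, name: str):
        # accept either a bare file name ("WAS201710120.shtml") or a full
        # URL, and keep only the trailing file name as the identity key
        self.name = name.rsplit("/", 1)[-1]

    def __eq__(self, other: object) -> bool:
        return isinstance(other, BBRefLink) and self.name == other.name

    def __hash__(self) -> int:
        return hash(self.name)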
Example #2
 def test_can_parse(self, url: str):
     utils.clean_db()
     link = BBRefLink(url)
     page = Page.from_link(link)
     if isinstance(page, GamePage):
         utils.insert_mock_players(page)
     ScrapeNode.from_page(page).scrape()
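
Page.from_link evidently inspects the link to decide which concrete page class to build (see Example #10). One plausible way to route on the file name alone, sketched with an invented helper name and the page classes from these examples:

import re

def page_type_for(name: str) -> type:
    # hypothetical routing: schedule pages carry "-schedule" in the file
    # name, box scores look like a 3-character team code plus a 9-digit
    # date/game suffix (e.g. "WAS201710120.shtml", "WS2197109300.shtml"),
    # and everything else is treated as a player page
    if "-schedule" in name:
        return SchedulePage
    if re.fullmatch(r"[A-Z0-9]{3}\d{9}\.shtml", name):
        return GamePage
    return PlayerPage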
Example #3
 def test_from_page(self):
     link = BBRefLink("WAS201710120.shtml")
     page = Page.from_link(link)
     new_node1 = ScrapeNode.from_page(page)
     new_node2 = ScrapeNode.from_page(page)
     assert new_node1 is new_node2
     assert new_node1.__class__ == InsertableScrapeNode
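
The identity assertion (new_node1 is new_node2) implies that from_page memoizes one node per page, leaning on the Page hashing tested in Example #1. A sketch of that pattern under assumed names; the real factory also picks a subclass such as InsertableScrapeNode, which is elided here:

class ScrapeNode:
    _nodes: dict = {}  # Page -> ScrapeNode, keyed via Page.__eq__/__hash__

    def __init__(self, page):
        self.page = page

    @classmethod
    def from_page(cls, page):
        # hand back the node already built for this page, if any, so a
        # page is never wrapped (or later scraped) twice
        if page not in cls._nodes:
            cls._nodes[page] = cls(page)
        return cls._nodes[page]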
Example #4
def scrape_year(year: int) -> None:
    sched_url = f"https://www.baseball-reference.com/leagues/MLB/{year}-schedule.shtml"
    sched_link = BBRefLink(sched_url)
    # since the current year's schedule is continually updated as new games
    # are played, we don't want to use a cached version of its page
    use_cache = year != CUR_YEAR
    sched = Page.from_link(sched_link, use_cache)
    ScrapeNode.from_page(sched).scrape()
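
CUR_YEAR is defined outside this snippet; presumably it is just the current calendar year. Under that assumption, a top-level driver might look like the following (START_YEAR is an invented placeholder):

from datetime import date

CUR_YEAR = date.today().year
START_YEAR = 2015  # arbitrary example value

for year in range(START_YEAR, CUR_YEAR + 1):
    # only the in-progress season bypasses the page cache (see above)
    scrape_year(year)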
Example #5
 def test_cannot_parse(self):
     url = "https://www.baseball-reference.com/boxes/PIT/PIT196507020.shtml"
     utils.clean_db()
     link = BBRefLink(url)
     page = Page.from_link(link)
     assert isinstance(page, GamePage)
     utils.insert_mock_players(page)
     with pytest.raises(MissingPlayDataError):
         ScrapeNode.from_page(page).scrape()
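
MissingPlayDataError is presumably a project-specific exception for old box scores that lack play-by-play coverage; a minimal sketch, assuming nothing beyond what the test exercises:

class MissingPlayDataError(Exception):
    """Raised when a game page has no play-by-play data to scrape."""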
Example #6
 def test_no_visit_twice(self):
     # these games share the same lineups, so after the first game is fully
     # scraped (39 pages), the second should only need one new page: its own
     utils.clean_db()
     games = ["WAS201710120.shtml", "CHN201710110.shtml"]
     for game, expected_scrape_num in zip(games, [39, 1]):
         link = BBRefLink("WAS201710120.shtml")
         page = Page.from_link(link)
         node = ScrapeNode.from_page(page)
         assert node.scrape() == expected_scrape_num
Example #7
 def test_dup_team_names(self):
     for url in [
             "https://www.baseball-reference.com/boxes/WS2/WS2197109300.shtml",
             "https://www.baseball-reference.com/boxes/CLE/CLE196007171.shtml"
     ]:
         link = BBRefLink(url)
         page = Page.from_link(link)
         assert isinstance(page, GamePage)
         utils.insert_mock_players(page)
         ScrapeNode.from_page(page).scrape()
Example #8
 def test_malformed_html(self):
     # the web handler would download the correct html, so we need to copy
     # the malformed html into the cached file beforehand to test properly
     player_pages = os.path.join("tests", "scraping", "resources",
                                 "PlayerPage")
     copyfile(src=os.path.join(player_pages, "malformed_arod.shtml"),
              dst=os.path.join(player_pages, "rodrial01.shtml"))
     url = "https://www.baseball-reference.com/players/r/rodrial01.shtml"
     link = BBRefLink(url)
     page = Page.from_link(link)
Example #9
def insert_game(url: str) -> None:
    link = BBRefLink(url)
    page = Page.from_link(link)
    insert_mock_players(page)  # type: ignore
    ScrapeNode.from_page(page).scrape()
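
Given the other examples, the helper would be invoked with a full box-score URL, e.g.:

insert_game("https://www.baseball-reference.com/boxes/CLE/CLE196007171.shtml")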
Example #10
 def test_page_types(self):
     for url, page_type in zip(RES_URLS, [GamePage, SchedulePage, PlayerPage]):
         link = BBRefLink(url)
         assert type(Page.from_link(link)) == page_type
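
RES_URLS is defined elsewhere; for the zip to line up it must hold one URL per page type, in the order GamePage, SchedulePage, PlayerPage. A hypothetical value assembled from URLs seen in the other examples:

RES_URLS = [
    "https://www.baseball-reference.com/boxes/CLE/CLE196007171.shtml",     # GamePage
    "https://www.baseball-reference.com/leagues/MLB/2017-schedule.shtml",  # SchedulePage
    "https://www.baseball-reference.com/players/r/rodrial01.shtml",        # PlayerPage
]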