def get_fangraphs_tabular_data_from_html(
    html: Union[str, bytes],
    column_name_mapper: Union[Callable, None] = None,
    known_percentages: Union[List[str], None] = None,
) -> pd.DataFrame:
    """Extract the ``rgMasterTable`` HTML table into a DataFrame.

    Column names come from the text nodes of the ``rgHeader`` header cells
    and cell values from the text nodes of each body row; the first header
    cell and the first ``td`` of every row are skipped (presumably a
    row-number column -- confirm against the page markup).

    Args:
        html: Raw page markup, passed straight to ``lxml.etree.HTML``.
        column_name_mapper: Optional callable applied to every heading text
            to produce the final column name.
        known_percentages: Column names forwarded to
            ``postprocessing.try_parse`` as ``known_percentages``. ``None``
            (the default) is treated as an empty list.

    Returns:
        A DataFrame with one row per ``tr`` found in the table body and the
        (optionally mapped) headings as columns.

    Raises:
        IndexError: If a row yields more text nodes than there are headings,
            since each value is paired with ``headings[index]``.
    """
    # Build a fresh list per call instead of sharing one mutable default
    # object across every invocation.
    if known_percentages is None:
        known_percentages = []

    xpath = '//table[@class="rgMasterTable"]'
    html_dom = lxml.etree.HTML(html)

    headings_xpath = f"({xpath}/thead//th[contains(@class, 'rgHeader')])[position()>1]/descendant-or-self::*/text()"
    headings = html_dom.xpath(headings_xpath)
    if column_name_mapper:
        headings = [column_name_mapper(h) for h in headings]

    data_rows_xpath = f"({xpath}/tbody//tr)"
    data_rows_dom = html_dom.xpath(data_rows_xpath)

    # Each text node is parsed with the heading at the same position so that
    # try_parse can apply column-specific handling.
    data_rows = [
        [
            postprocessing.try_parse(
                cell_text,
                headings[index],
                known_percentages=known_percentages,
            )
            for index, cell_text in enumerate(
                row.xpath('td[position()>1]/descendant-or-self::*/text()')
            )
        ]
        for row in data_rows_dom
    ]

    return pd.DataFrame(data_rows, columns=headings)
def test_try_parse_long_date() -> None:
    """A full timestamp string with milliseconds and a trailing ``Z`` parses to the matching datetime."""
    parsed = postprocessing.try_parse('2020-09-03T05:40:30.210Z', 'game_dt')
    # .210 seconds in the input corresponds to 210000 microseconds.
    wanted = datetime(
        year=2020,
        month=9,
        day=3,
        hour=5,
        minute=40,
        second=30,
        microsecond=210000,
    )
    assert parsed == wanted
def test_try_parse_percentage_column(self):
    """A plain number in a column whose name ends in ``%`` is expected back as a fraction."""
    parsed = postprocessing.try_parse('50', 'CS%')
    assert parsed == 0.5
def test_try_parse_percentage_value(self):
    """A value carrying its own ``%`` sign is expected back as a fraction, whatever the column name."""
    parsed = postprocessing.try_parse('10%', 'avg')
    assert parsed == 0.1
def test_try_parse_float(self):
    """The string ``'1.0'`` is expected to parse to a value equal to 1.0."""
    parsed = postprocessing.try_parse('1.0', 'runs')
    assert parsed == 1.0
def test_try_parse_int(self):
    """The string ``'1'`` is expected to parse to a value equal to 1."""
    parsed = postprocessing.try_parse('1', 'runs')
    assert parsed == 1
def test_try_parse_null() -> None:
    """Passing ``None`` must produce something pandas recognises as missing."""
    parsed = postprocessing.try_parse(None, 'runs')
    assert pd.isna(parsed)
def test_try_parse_percentage_column_known() -> None:
    """A column listed in ``known_percentages`` is treated as a percentage even without ``%`` in its name."""
    parsed = postprocessing.try_parse('50', 'CS', known_percentages=['CS'])
    assert parsed == 0.5
def test_try_parse_float_nonstr() -> None:
    """A float passed in directly (not as a string) comes back equal to itself."""
    parsed = postprocessing.try_parse(1.0, 'runs')
    assert parsed == 1.0
def test_try_parse_float() -> None:
    """The string ``'1.0'`` is expected to parse to a value equal to 1.0."""
    parsed = postprocessing.try_parse('1.0', 'runs')
    assert parsed == 1.0
def test_try_parse_int_nonstr() -> None:
    """An int passed in directly (not as a string) comes back equal to itself."""
    parsed = postprocessing.try_parse(1, 'runs')
    assert parsed == 1
def test_try_parse_int() -> None:
    """The string ``'1'`` is expected to parse to a value equal to 1."""
    parsed = postprocessing.try_parse('1', 'runs')
    assert parsed == 1
def test_try_parse_date_nonstr() -> None:
    """A datetime object passed in directly comes back equal to itself."""
    original = datetime(year=2020, month=9, day=4)
    parsed = postprocessing.try_parse(original, 'game_dt')
    assert parsed == original
def test_try_parse_short_date() -> None:
    """A date-only ``YYYY-MM-DD`` string parses to a datetime for that day."""
    wanted = datetime(year=2020, month=9, day=4)
    parsed = postprocessing.try_parse('2020-09-04', 'game_dt')
    assert parsed == wanted
def test_try_parse_int_nonstr(self):
    """An int passed in directly (not as a string) comes back equal to itself."""
    parsed = postprocessing.try_parse(1, 'runs')
    assert parsed == 1
def test_try_parse_date_nonstr(self):
    """A datetime object passed in directly comes back equal to itself."""
    original = datetime(year=2020, month=9, day=4)
    parsed = postprocessing.try_parse(original, 'game_dt')
    assert parsed == original