def test_load_when_txt_file_specified():
    """A ``.txt`` path loads as unstructured text and is named after the path."""
    source_path = "src/tests/test_data/sample/email.txt"
    loader = DataLoader(source_path, client=None)

    content, is_structured, source_name = loader.load()

    # Text sources come back as a plain string, not a DataFrame.
    assert isinstance(content, str)
    assert content.startswith("Dear Mr. Connell")
    assert not is_structured
    # The reported name is expected to be the path that was passed in.
    assert source_name == source_path
def test_load_when_xlsx_file_specified():
    """A single-sheet ``.xlsx`` path loads as a structured 4-row, 3-column DataFrame."""
    source_path = "src/tests/test_data/sample/dummy.xlsx"

    frame, is_structured, source_name = DataLoader(source_path, client=None).load()

    assert isinstance(frame, pd.DataFrame)
    # Expected dimensions of the sample workbook.
    assert len(frame) == 4
    assert len(frame.columns) == 3
    assert is_structured
    # The reported name is expected to be the path that was passed in.
    assert source_name == source_path
def test_load_when_xml_file_specified():
    """An ``.xml`` path loads as a structured 6-row, 9-column DataFrame."""
    source_path = "src/tests/test_data/sample/employees.xml"
    loader = DataLoader(source_path, client=None)

    frame, is_structured, source_name = loader.load()

    assert isinstance(frame, pd.DataFrame)
    # Expected dimensions of the sample document.
    row_count, column_count = frame.shape
    assert row_count == 6
    assert column_count == 9
    assert is_structured
    # The reported name is expected to be the path that was passed in.
    assert source_name == source_path
def test_load_when_pdf_file_specified():
    """A ``.pdf`` path loads as unstructured text and is named after the path."""
    source_path = "src/tests/test_data/sample/academic_paper.pdf"

    content, is_structured, source_name = DataLoader(source_path, client=None).load()

    # PDF sources come back as extracted text, not a DataFrame.
    assert isinstance(content, str)
    assert content.startswith("Enriching Word Vectors")
    assert not is_structured
    # The reported name is expected to be the path that was passed in.
    assert source_name == source_path
def test_load_when_csv_file_specified():
    """A ``.csv`` path loads as a structured DataFrame with one ``name`` column.

    Checks the loaded object's type, its column labels, its row count (250),
    the structured flag, and that the reported name equals the input path.
    """
    dl = DataLoader("src/tests/test_data/sample/names.csv", client=None)
    data, structured, name = dl.load()

    assert isinstance(data, pd.DataFrame)
    # Compare as a plain list: ``Index == list`` is element-wise and yields an
    # array, which only works in ``assert`` for a single element and raises
    # ValueError (not AssertionError) when the column count differs.
    assert list(data.columns) == ["name"]
    # ``len(df)`` is the row count; no need to materialise ``.values``.
    assert len(data) == 250
    assert structured
    assert name == "src/tests/test_data/sample/names.csv"
def test_load_when_dataframe_specified():
    """An in-memory DataFrame source is returned as structured data.

    The frame is read from the sample CSV and handed to ``DataLoader``
    directly. The test checks the type, the column labels, the row count
    (250), the structured flag, and the hash-based name that is reported.
    """
    test_df = pd.read_csv("src/tests/test_data/sample/names.csv")
    dl = DataLoader(test_df, client=None)
    data, structured, name = dl.load()

    assert isinstance(data, pd.DataFrame)
    # Compare as a plain list: ``Index == list`` is element-wise and yields an
    # array, which only works in ``assert`` for a single element and raises
    # ValueError (not AssertionError) when the column count differs.
    assert list(data.columns) == ["name"]
    # ``len(df)`` is the row count; no need to materialise ``.values``.
    assert len(data) == 250
    assert structured
    # NOTE(review): the hard-coded hash assumes DataLoader's hashing of the
    # frame is stable across runs, platforms and pandas versions - confirm.
    assert name == "pandas DataFrame (hash 5214317343533855748)"
def test_init():
    """The constructor exposes its two arguments as ``data_source`` and ``client``."""
    source, client = "test_source", "test_client"

    loader = DataLoader(source, client)

    assert loader.data_source == "test_source"
    assert loader.client == "test_client"
def test_load_when_multi_sheet_xlsx():
    """Loading the two-sheet sample workbook raises ``NotImplementedError``."""
    source_path = "src/tests/test_data/sample/dummy_two_sheets.xlsx"
    # Build the loader outside the ``raises`` block so that only ``load()``
    # is allowed to raise; a constructor failure must not satisfy the test.
    loader = DataLoader(source_path, client=None)

    with pytest.raises(NotImplementedError):
        loader.load()
def _load(cls, data_source, client):
    """Create a ``DataLoader`` for the given source and return its ``load()`` result.

    Args:
        cls: Not used by this function.
        data_source: Passed through as the first ``DataLoader`` argument.
        client: Passed through as the second ``DataLoader`` argument.

    Returns:
        Whatever ``DataLoader.load()`` returns for this source.
    """
    loader = DataLoader(data_source, client)
    return loader.load()