def test_load_dataset_local(dataset_loading_script_dir, data_dir, keep_in_memory, caplog):
    """Exercise ``load_dataset`` with a local script, the module cache, and a missing dataset.

    Three scenarios are covered in order:

    1. Loading from ``dataset_loading_script_dir`` yields a two-split
       ``DatasetDict`` of ``Dataset`` objects whose rows are dicts. The load
       runs under the Arrow memory assertion selected by ``keep_in_memory``.
    2. Under every ``OfflineSimulationMode``, loading by script name still
       returns both splits and logs that the cached module version was used.
    3. Loading a nonexistent dataset raises ``FileNotFoundError`` whose message
       mentions both the remote script URL and the absolute local path.
    """
    # Choose the memory assertion first, then enter it only around the load itself.
    if keep_in_memory:
        memory_assertion = assert_arrow_memory_increases
    else:
        memory_assertion = assert_arrow_memory_doesnt_increase
    with memory_assertion():
        dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, keep_in_memory=keep_in_memory)

    assert isinstance(dataset, DatasetDict)
    assert all(isinstance(split, Dataset) for split in dataset.values())
    assert len(dataset) == 2
    first_example = next(iter(dataset["train"]))
    assert isinstance(first_example, dict)

    for mode in list(OfflineSimulationMode):
        with offline(mode):
            # Start from an empty log so the message check only sees this reload.
            caplog.clear()
            # The script module is resolved by name here rather than by directory.
            dataset = datasets.load_dataset(DATASET_LOADING_SCRIPT_NAME, data_dir=data_dir)
            assert len(dataset) == 2
            assert "Using the latest cached version of the module" in caplog.text

    with pytest.raises(FileNotFoundError) as raised:
        datasets.load_dataset(SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST)
    error_message = str(raised.value)

    # The message must contain a remote URL ending in "<name>/<name>.py" ...
    missing_script = SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST + "/" + SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST + ".py"
    url_match = re.search(fr"http\S*{re.escape(missing_script)}\b", error_message)
    assert url_match is not None
    assert is_remote_url(url_match.group())
    # ... and the absolute local path that was looked up.
    assert os.path.abspath(SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST) in error_message
def check_if_url_is_valid(url):
    """Reject remote URLs that contain a backslash.

    Values for which ``is_remote_url`` is false are not checked and pass
    through silently.

    Args:
        url: The URL or path string to check.

    Raises:
        ValueError: If ``url`` is a remote URL and contains a backslash.
    """
    if is_remote_url(url) and "\\" in url:
        # The closing quote after the URL was missing in the original message.
        raise ValueError(f"Bad remote url '{url}' since it contains a backslash")