Exemplo n.º 1
0
    def test_dataset_download(self, tmp_path, gmb_schema):
        """Check that a ``Dataset`` built in DOWNLOAD_ONLY mode places the expected files in ``data_dir``.

        Also checks that a subsequent ``download(check=True)`` raises ``RuntimeError`` with the expected message.
        """
        data_dir = tmp_path / 'gmb'
        gmb_dataset = Dataset(
            gmb_schema,
            data_dir=data_dir,
            mode=Dataset.InitializationMode.DOWNLOAD_ONLY,
        )

        # Two entries are expected: 'groningen_meaning_bank_modified' and '.nourish.dataset'
        top_level_entries = list(data_dir.iterdir())
        assert len(top_level_entries) == 2

        extracted_dir = data_dir / 'groningen_meaning_bank_modified'
        expected_names = ['gmb_subset_full.txt', 'LICENSE.txt', 'README.txt']
        assert extracted_dir.is_dir()

        # Same number of entries, and each of them is one of the expected names
        extracted_entries = list(extracted_dir.iterdir())
        assert len(extracted_entries) == len(expected_names)
        for entry in extracted_dir.iterdir():
            assert entry.name in expected_names

        # Force check previously downloaded dataset should error
        expected_message = (
            'Dataset.download() was previously called. To overwrite existing data files, rerun '
            'Dataset.download() with ``check`` set to ``False``.')
        with pytest.raises(RuntimeError) as exc_info:
            gmb_dataset.download(check=True)
        assert str(exc_info.value) == expected_message
Exemplo n.º 2
0
    def test_nourish_dir(self, tmp_path, gmb_schema):
        """Exercise ``Dataset._nourish_dir``: normal access, a regular file in its place, and a non-directory parent."""
        lazy_mode = Dataset.InitializationMode.LAZY

        # Automatic creation
        nourish_dir = tmp_path / 'data_dir' / '.nourish.dataset'
        dataset = Dataset(gmb_schema, data_dir=tmp_path / 'data_dir', mode=lazy_mode)
        assert dataset._nourish_dir == nourish_dir

        # Non-directory present: swap the directory for a regular file at the same path
        nourish_dir.rmdir()
        assert nourish_dir.exists() is False
        nourish_dir.touch()
        with pytest.raises(NotADirectoryError) as exc_info:
            dataset._nourish_dir
        actual_message = str(exc_info.value)
        assert actual_message == f'"{nourish_dir}" exists and is not a directory.'

        # Non-directory parent present
        dataset = Dataset(gmb_schema, data_dir='setup.py', mode=lazy_mode)
        # These are raised by pathlib.Path.mkdir
        # Also see https://bugs.python.org/issue42872
        if os.name == 'nt':
            ExceptionClass = FileExistsError
        else:
            ExceptionClass = NotADirectoryError
        with pytest.raises(ExceptionClass) as exc_info:
            dataset._nourish_dir
        # This error message may be generated by pathlib.Path.mkdir() (as in DirectoryLock.lock()). We only make sure
        # the path is in the string.
        # On Windows, backslashes in the error message are doubled:
        # [WinError 183] Cannot create a file when that file already exists: 'D:\\\\a\\\\nourish\\\\nourish\\\\setup.py'
        expected_path_text = str(pathlib.Path.cwd() / "setup.py").replace(
            '\\', '\\\\')
        assert expected_path_text in str(exc_info.value)
Exemplo n.º 3
0
    def test_zip_extractor(self, dataset_base_url, dataset_dir, gmb_schema,
                           tmp_path):
        """Download a zip archive through ``Dataset`` and check that tampered file lists fail verification.

        The GMB schema is repointed at ``extractables/test.zip``. After a successful download the recorded file list
        is rewritten with individual inconsistencies, each of which must make ``is_downloaded()`` return ``False``.
        """
        # Repoint the schema at the zip fixture and record its real checksum
        fake_schema = gmb_schema
        fake_schema['download_url'] = dataset_base_url + '/extractables/test.zip'
        archive_bytes = (dataset_dir / 'extractables/test.zip').read_bytes()
        fake_schema['sha512sum'] = hashlib.sha512(archive_bytes).hexdigest()

        zip_dataset = Dataset(
            fake_schema,
            data_dir=tmp_path,
            mode=Dataset.InitializationMode.DOWNLOAD_ONLY,
        )
        assert zip_dataset.is_downloaded() is True

        # Content of the file list
        with open(zip_dataset._file_list_file, mode='r') as list_fp:
            file_list = json.load(list_fp)

        def assert_rejected(change: dict):
            "Overwrite the file list with ``change`` merged into its contents and expect verification to fail."
            tampered = copy.deepcopy(file_list)
            tampered['contents'].update(change)
            with open(zip_dataset._file_list_file, mode='w') as out_fp:
                json.dump(tampered, out_fp)
            assert zip_dataset.is_downloaded() is False

        # Can't find a file
        assert_rejected({'non-existing-file': {'isdir': False}})
        # File type incorrect
        assert_rejected({'test-dir/test.csv': {'isdir': True}})
        # Size incorrect
        wrong_size_entry = copy.deepcopy(
            file_list['contents']['test-dir/test.txt'])
        wrong_size_entry['size'] += 100
        assert_rejected({'test-dir/test.txt': wrong_size_entry})
Exemplo n.º 4
0
    def test_compression_extractors(self, compressed_file, dataset_base_url,
                                    dataset_dir, gmb_schema, tmp_path):
        """Download a single compressed file (gzip, bzip2, or lzma) and check file-list verification.

        The GMB schema is repointed at ``extractables/<compressed_file>``. After a successful download the recorded
        file list is rewritten with a wrong filename and then a wrong size; both must make ``is_downloaded()``
        return ``False``.
        """
        # Repoint the schema at the compressed fixture and record its real checksum
        fake_schema = gmb_schema
        fake_schema['download_url'] = (
            dataset_base_url + '/extractables/' + compressed_file)
        compressed_fp = dataset_dir / ('extractables/' + compressed_file)
        payload = compressed_fp.read_bytes()
        fake_schema['sha512sum'] = hashlib.sha512(payload).hexdigest()

        dataset = Dataset(
            fake_schema,
            data_dir=tmp_path,
            mode=Dataset.InitializationMode.DOWNLOAD_ONLY,
        )
        assert dataset.is_downloaded() is True

        # Content of the file list
        with open(dataset._file_list_file, mode='r') as list_fp:
            file_list = json.load(list_fp)

        def assert_rejected(change: dict):
            "Overwrite the file list with ``change`` merged into its contents and expect verification to fail."
            tampered = copy.deepcopy(file_list)
            tampered['contents'].update(change)
            with open(dataset._file_list_file, mode='w') as out_fp:
                json.dump(tampered, out_fp)
            assert dataset.is_downloaded() is False

        # Can't find the file
        assert_rejected({'filename': 'non-existing-file'})
        # Size incorrect
        wrong_size_contents = copy.deepcopy(file_list['contents'])
        wrong_size_contents['size'] += 100
        assert_rejected(wrong_size_contents)
Exemplo n.º 5
0
    def test_csv_pandas_loader(self, tmp_path, noaa_jfk_schema):
        """Download and load the NOAA JFK dataset and check the ``jfk_weather_cleaned`` table.

        The loaded object must be a ``pandas.DataFrame`` of shape ``(75119, 16)``. The dataset is deleted afterwards.
        """
        expected_shape = (75119, 16)
        noaa_dataset = Dataset(
            noaa_jfk_schema,
            tmp_path,
            mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD,
        )
        frame = noaa_dataset.data['jfk_weather_cleaned']
        assert isinstance(frame, pd.DataFrame)
        assert frame.shape == expected_shape
        noaa_dataset.delete()
Exemplo n.º 6
0
    def test_symlink_data_dir(self, tmp_symlink_dir, gmb_schema):
        """Check that a symlink passed as ``data_dir`` is kept as is and not resolved."""
        lazy_dataset = Dataset(
            gmb_schema,
            data_dir=tmp_symlink_dir,
            mode=Dataset.InitializationMode.LAZY,
        )
        # The stored directory must compare equal to the path that was passed in
        assert lazy_dataset._data_dir == tmp_symlink_dir
Exemplo n.º 7
0
    def test_csv_pandas_column_unsupported_data_types(self, tmp_path,
                                                      noaa_jfk_schema,
                                                      err_column,
                                                      other_columns):
        """Check that an unsupported column dtype makes loading fail with a ``ValueError``.

        The schema's column mapping is rebuilt from ``other_columns`` (skipping entries whose dtype is ``None``) plus
        ``err_column``, and the resulting error message is checked for a few keywords.
        """
        # Clear columns: install a fresh, empty mapping in the schema and keep a handle on it
        column_dict = {}
        noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format'][
            'options']['columns'] = column_dict

        # Update column dictionary as specified
        column_dict.update({
            col.name: col.dtype
            for col in other_columns if col.dtype is not None
        })
        column_dict[err_column.name] = err_column.dtype

        with pytest.raises(ValueError) as exc_info:
            Dataset(noaa_jfk_schema,
                    tmp_path,
                    mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)
        # Pandas is a 3rd-party library. We don't check for the exact wording but only some keywords
        # Examples:
        #   ValueError: cannot safely convert passed user dtype of int64 for float64 dtyped data in column 1
        #   ValueError: could not convert string to float: '2010-01-01 01:00:00'
        message = str(exc_info.value)
        assert 'convert' in message
        for t in (err_column.dtype, err_column.check):
            # "ing" is for "str'ing'"
            assert re.search(rf"{t}(\d*|ing)\b", message)
Exemplo n.º 8
0
    def test_data_dir(self, tmp_path, gmb_schema):
        """Exercise ``Dataset._data_dir`` for a normal directory path and for a path occupied by a regular file."""
        lazy_mode = Dataset.InitializationMode.LAZY

        # Automatic creation
        dataset = Dataset(gmb_schema, data_dir=tmp_path / 'data_dir', mode=lazy_mode)
        assert dataset._data_dir == tmp_path / 'data_dir'

        # Non-directory present ('setup.py' is given relative to the current working directory)
        dataset = Dataset(gmb_schema, data_dir='setup.py', mode=lazy_mode)
        with pytest.raises(NotADirectoryError) as exc_info:
            dataset._data_dir
        expected_message = f'"{pathlib.Path.cwd()/"setup.py"}" exists and is not a directory.'
        assert str(exc_info.value) == expected_message
Exemplo n.º 9
0
    def test_relative_data_dir(self, gmb_schema, chdir_tmp_path, tmp_sub_dir,
                               tmp_relative_sub_dir):
        """Check that a relative ``data_dir`` is exposed as the matching absolute path."""
        lazy_dataset = Dataset(
            gmb_schema,
            data_dir=tmp_relative_sub_dir,
            mode=Dataset.InitializationMode.LAZY,
        )
        # The stored path must equal ``tmp_sub_dir`` and be absolute
        assert lazy_dataset._data_dir == tmp_sub_dir
        assert lazy_dataset._data_dir.is_absolute()
Exemplo n.º 10
0
    def test_supported_file_extensions(self, dataset_base_url, dataset_dir,
                                       extractable, extractable_type,
                                       gmb_schema, tmp_path):
        """Check that the file list written after download records the expected extractor type.

        The GMB schema is repointed at ``extractables/<extractable>``; after a DOWNLOAD_ONLY construction the
        ``type`` field of the file list must equal ``extractable_type``.
        """
        # Repoint the schema at the fixture archive and record its real checksum
        fake_schema = gmb_schema
        fake_schema['download_url'] = (
            dataset_base_url + '/extractables/' + extractable)
        archive_path = dataset_dir / 'extractables' / extractable
        fake_schema['sha512sum'] = hashlib.sha512(
            archive_path.read_bytes()).hexdigest()

        dataset = Dataset(
            fake_schema,
            data_dir=tmp_path,
            mode=Dataset.InitializationMode.DOWNLOAD_ONLY,
        )
        assert dataset.is_downloaded() is True

        with open(dataset._file_list_file, mode='r') as list_fp:
            recorded = json.load(list_fp)
        assert recorded['type'] == extractable_type
Exemplo n.º 11
0
    def test_unloaded_access_to_data(self, tmp_path, gmb_schema):
        """Check that reading ``Dataset.data`` before loading raises ``RuntimeError``, before and after download."""
        expected_message = (
            'Data has not been downloaded and/or loaded yet. Call Dataset.download() to download '
            'data, call Dataset.load() to load data.')

        dataset = Dataset(
            gmb_schema,
            data_dir=tmp_path,
            mode=Dataset.InitializationMode.LAZY,
        )
        with pytest.raises(RuntimeError) as exc_info:
            dataset.data
        assert str(exc_info.value) == expected_message

        # Same after downloading
        dataset.download()
        with pytest.raises(RuntimeError) as exc_info:
            dataset.data
        assert str(exc_info.value) == expected_message
Exemplo n.º 12
0
    def test_invalid_sha512(self, tmp_path, gmb_schema):
        """Check that a schema with a bogus ``sha512sum`` makes the download fail with ``IOError``."""
        gmb_schema['sha512sum'] = 'invalid hash example'

        with pytest.raises(IOError) as exc_info:
            Dataset(
                gmb_schema,
                data_dir=tmp_path,
                mode=Dataset.InitializationMode.DOWNLOAD_ONLY,
            )
        # Only a fragment of the error message is matched
        message = str(exc_info.value)
        assert 'the file may by corrupted' in message
Exemplo n.º 13
0
    def test_download_false(self, tmp_path, gmb_schema):
        """Check ``load_dataset(..., download=False)`` against data that was downloaded beforehand.

        The dataset is first downloaded and loaded into ``<DATADIR>/dax/gmb/1.0.2``; ``load_dataset`` must then
        return data equal to it.
        """
        init(DATADIR=tmp_path)
        data_dir = tmp_path.joinpath('dax', 'gmb', '1.0.2')
        reference = Dataset(
            gmb_schema,
            data_dir=data_dir,
            mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD,
        )
        loaded = load_dataset('gmb', version='1.0.2', download=False)
        assert reference.data == loaded
Exemplo n.º 14
0
 def test_csv_pandas_no_delimiter(self, tmp_path, noaa_jfk_schema):
     """Check that the table still loads with 16 columns when the ``delimiter`` option is removed."""
     # Remove the delimiter option
     loader_options = noaa_jfk_schema['subdatasets']['jfk_weather_cleaned'][
         'format']['options']
     del loader_options['delimiter']

     dataset = Dataset(noaa_jfk_schema,
                       tmp_path,
                       mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)
     frame = dataset.data['jfk_weather_cleaned']
     # number of columns remain the same
     assert len(frame.columns) == 16
Exemplo n.º 15
0
 def test_cache_dir_is_not_a_dir(self, tmp_path, gmb_schema):
     """Check the error raised when ``data_dir/.nourish.dataset`` already exists as a regular file."""
     # Occupy this path with a regular file
     blocker = tmp_path / '.nourish.dataset'
     blocker.touch()

     with pytest.raises(NotADirectoryError) as exc_info:
         Dataset(gmb_schema,
                 data_dir=tmp_path,
                 mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
     expected_message = f"\"{tmp_path/'.nourish.dataset'}\" exists and is not a directory."
     assert str(exc_info.value) == expected_message
Exemplo n.º 16
0
    def test_csv_pandas_loader_non_option(self, tmp_path, noaa_jfk_schema):
        """Check that the table still loads as a 75119-row ``DataFrame`` when ``options`` is removed entirely."""
        # Drop the whole 'options' mapping from the subdataset's format section
        format_spec = noaa_jfk_schema['subdatasets']['jfk_weather_cleaned'][
            'format']
        del format_spec['options']

        noaa_dataset = Dataset(
            noaa_jfk_schema,
            tmp_path,
            mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD,
        )
        frame = noaa_dataset.data['jfk_weather_cleaned']
        assert isinstance(frame, pd.DataFrame)
        assert len(frame) == 75119
Exemplo n.º 17
0
    def test_custom_data_dir(self, tmp_path, wikitext103_schema):
        """Check ``DATADIR`` after ``init`` and the ``_data_dir`` of a ``Dataset`` created with the same path.

        Both must equal ``tmp_path`` and be ``pathlib.Path`` instances.
        """
        init(DATADIR=tmp_path)

        # The global configuration reflects the directory that was just set
        assert get_config().DATADIR == tmp_path
        assert isinstance(get_config().DATADIR, pathlib.Path)

        # A lazily initialized dataset pointed at the same directory reports it unchanged
        wikitext = Dataset(
            wikitext103_schema,
            data_dir=tmp_path,
            mode=Dataset.InitializationMode.LAZY,
        )
        assert wikitext._data_dir == tmp_path
        assert isinstance(wikitext._data_dir, pathlib.Path)
Exemplo n.º 18
0
    def test_default_dataset_schema_name(self, tmp_path, gmb_schema):
        """Check ``load_dataset`` when the loaded dataset schemata have no ``name`` key.

        The dataset is first downloaded and loaded into ``<DATADIR>/default/gmb/1.0.2``; after the ``name`` key is
        removed, ``load_dataset(..., download=False)`` must return data equal to it.
        """
        init(DATADIR=tmp_path)
        data_dir = tmp_path.joinpath('default', 'gmb', '1.0.2')
        reference = Dataset(
            gmb_schema,
            data_dir=data_dir,
            mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD,
        )

        # Remove the "name" key
        schemata = _get_schemata_manager().dataset_schemata._schemata
        schemata.pop('name')

        loaded = load_dataset('gmb', version='1.0.2', download=False)
        assert reference.data == loaded
Exemplo n.º 19
0
    def test_loading_undownloaded(self, tmp_path, gmb_schema):
        """Check the errors raised by ``Dataset.load()`` when ``Dataset.download()`` was never called.

        Covers ``load(check=False)``, the state of ``_data`` and ``data`` afterwards, and ``load(check=True)``.
        """
        dataset = Dataset(
            gmb_schema,
            data_dir=tmp_path,
            mode=Dataset.InitializationMode.LAZY,
        )

        # Unchecked load: the missing files surface as FileNotFoundError
        expected_fragment = (
            'Failed to load subdataset "gmb_subset_full" because some files are not found. '
            'Did you forget to call Dataset.download()?\nCaused by:\n')
        with pytest.raises(FileNotFoundError) as exc_info:
            dataset.load(check=False)
        assert expected_fragment in str(exc_info.value)

        # Half-loaded data objects should get reset to None
        assert dataset._data is None
        with pytest.raises(RuntimeError) as exc_info:
            dataset.data
        not_loaded_message = (
            'Data has not been downloaded and/or loaded yet. Call Dataset.download() to download '
            'data, call Dataset.load() to load data.')
        assert str(exc_info.value) == not_loaded_message

        # Force check undownloaded dataset should error
        with pytest.raises(RuntimeError) as exc_info:
            dataset.load(check=True)
        missing_files_message = (
            f'Downloaded data files are not present in {dataset._data_dir_} or are corrupted.'
        )
        assert str(exc_info.value) == missing_files_message
Exemplo n.º 20
0
    def test_csv_pandas_header(self, tmp_path, noaa_jfk_schema):
        """Exercise the ``no_header`` option of the CSV loader.

        With ``no_header`` set to ``True`` loading must fail with a ``ValueError``; with ``False``, ``''``, ``None``,
        or the option removed, ``test_csv_pandas_loader`` is rerun and must pass.
        """

        def loader_options():
            "Look up the loader options of the ``jfk_weather_cleaned`` subdataset in the schema."
            return noaa_jfk_schema['subdatasets']['jfk_weather_cleaned'][
                'format']['options']

        loader_options()['no_header'] = True
        noaa_dataset = Dataset(
            noaa_jfk_schema,
            tmp_path,
            mode=Dataset.InitializationMode.DOWNLOAD_ONLY,
        )
        # Pandas should error from trying to read string as another dtype
        with pytest.raises(ValueError) as exc_info:
            noaa_dataset.load()
        assert 'could not convert string to float' in str(exc_info.value)
        noaa_dataset.delete()

        # These should all be treated as False
        for falsy_value in (False, '', None):
            loader_options()['no_header'] = falsy_value
            self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema)

        # Finally, run the basic loader test with the option removed altogether
        del loader_options()['no_header']
        self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema)
Exemplo n.º 21
0
    def test_csv_pandas_delimiter(self, tmp_path, noaa_jfk_schema, delimiter):
        """Check that loading with the given ``delimiter`` yields a single-column table.

        Note that the case of comma has been tested in ``test_csv_pandas_loader``.
        """
        loader_options = noaa_jfk_schema['subdatasets'][
            'jfk_weather_cleaned']['format']['options']
        del loader_options['columns']
        # Change delimiter to tab, |, ;, space
        loader_options['delimiter'] = delimiter

        dataset = Dataset(
            noaa_jfk_schema,
            tmp_path,
            mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD,
        )
        frame = dataset.data['jfk_weather_cleaned']
        # None of these delimiters exist in the file, number of columns should be 1.
        assert len(frame.columns) == 1
Exemplo n.º 22
0
    def test_deleting_data_dir(self, tmp_path, gmb_schema):
        """Check ``Dataset.delete()`` both before anything was downloaded and after a download."""
        # Note we don't use tmp_sub_dir fixture because we want data_dir to be non-existing at the beginning of the
        # test.
        data_dir = tmp_path / 'data-dir'
        dataset = Dataset(
            gmb_schema,
            data_dir=data_dir,
            mode=Dataset.InitializationMode.LAZY,
        )

        # Deleting when nothing exists: no exception, and still nothing on disk afterwards
        assert not data_dir.exists()  # sanity check: data_dir doesn't exist
        dataset.delete()
        assert not data_dir.exists()  # sanity check: data_dir doesn't exist

        dataset.download()
        # Sanity check: Files are in place
        assert dataset.is_downloaded()
        assert os.listdir(data_dir)

        # Delete the dir
        dataset.delete()
        assert not data_dir.exists()
Exemplo n.º 23
0
    def test_zerobyte_files(self, dataset_base_url, dataset_dir, gmb_schema,
                            tmp_path, zerobyte_file):
        """Check that downloading ``extractables/<zerobyte_file>`` fails with ``OSError`` and the expected message."""
        # Repoint the schema at the zero-byte fixture and record its real checksum
        fake_schema = gmb_schema
        fake_schema['download_url'] = (
            dataset_base_url + '/extractables/' + zerobyte_file)
        zerobyte_fp = dataset_dir / ('extractables/' + zerobyte_file)
        payload = zerobyte_fp.read_bytes()
        fake_schema['sha512sum'] = hashlib.sha512(payload).hexdigest()

        with pytest.raises(OSError) as exc_info:
            Dataset(
                fake_schema,
                data_dir=tmp_path,
                mode=Dataset.InitializationMode.DOWNLOAD_ONLY,
            )
        expected_message = 'The extracted file test-zerobyte.csv is empty.'
        assert str(exc_info.value) == expected_message
Exemplo n.º 24
0
    def test_unsupported_file_extensions(self, tmp_path, gmb_schema,
                                         schemata_file_https_url,
                                         schemata_file_relative_dir):
        """Check that an unsupported file type is rejected (flat files like ``.yaml`` currently unsupported).

        The GMB schema is repointed at ``datasets.yaml``; constructing the ``Dataset`` in DOWNLOAD_ONLY mode must
        raise ``RuntimeError``.
        """
        # Repoint the schema at the YAML file and record its real checksum
        fake_schema = gmb_schema
        fake_schema['download_url'] = schemata_file_https_url + '/datasets.yaml'
        yaml_bytes = (schemata_file_relative_dir / 'datasets.yaml').read_bytes()
        fake_schema['sha512sum'] = hashlib.sha512(yaml_bytes).hexdigest()

        with pytest.raises(RuntimeError) as exc_info:
            Dataset(
                fake_schema,
                data_dir=tmp_path,
                mode=Dataset.InitializationMode.DOWNLOAD_ONLY,
            )
        assert str(exc_info.value) == 'Filetype not (yet) supported'
Exemplo n.º 25
0
    def test_download_data_dir_is_not_a_dir(self, gmb_schema):
        """Check the error raised when downloading into a ``data_dir`` that exists and is not a directory."""
        # These are raised by pathlib.Path.mkdir
        # Also see https://bugs.python.org/issue42872
        if os.name == 'nt':
            ExceptionClass = FileExistsError
        else:
            ExceptionClass = NotADirectoryError

        with pytest.raises(ExceptionClass) as exc_info:
            Dataset(
                gmb_schema,
                data_dir='./setup.py',
                mode=Dataset.InitializationMode.DOWNLOAD_ONLY,
            )
        # This error message may be generated by pathlib.Path.mkdir() (as in DirectoryLock.lock()). We only make sure
        # the path is in the string.
        # On Windows, backslashes in the error message are doubled:
        # [WinError 183] Cannot create a file when that file already exists: 'D:\\\\a\\\\nourish\\\\nourish\\\\setup.py'
        expected_path_text = str(pathlib.Path.cwd() / "setup.py").replace(
            '\\', '\\\\')
        assert expected_path_text in str(exc_info.value)
Exemplo n.º 26
0
    def test_is_downloaded(self, tmp_path, gmb_schema):
        """Check ``is_downloaded()`` before and after a download, and with an undecodable file list.

        Uses the GMB dataset, a ``.tar.gz`` archive.
        """
        data_dir = tmp_path / 'non-existing-dir'
        assert not data_dir.exists()  # Sanity check: data_dir must not exist

        gmb = Dataset(
            gmb_schema,
            data_dir=data_dir,
            mode=Dataset.InitializationMode.LAZY,
        )
        assert gmb.is_downloaded() is False

        gmb.download()
        assert gmb.is_downloaded() is True

        # JSON decoding error
        gmb._file_list_file.write_text("nonsense\n", encoding='utf-8')
        # We don't check the value of the exception because we are only interested in ensuring that the file isn't
        # decodable
        with pytest.raises(JSONDecodeError):
            gmb.is_downloaded()
Exemplo n.º 27
0
    def test_constructor_download_and_load(self, tmp_path, wikitext103_schema):
        """Test the full power of Dataset.__init__() (mode being ``InitializationMode.DOWNLOAD_AND_LOAD``).

        The ``train``, ``valid``, and ``test`` entries of the loaded data are each compared against a known SHA-512
        digest of their encoded text.
        """
        dataset = Dataset(
            wikitext103_schema,
            data_dir=tmp_path,
            mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD,
        )

        # Expected SHA-512 hex digest for each split, checked in this order
        expected_digests = {
            'train': (
                'df7615f77cb9dd19975881f271e3e3525bee38c08a67fea36a51c96be69a3ecabc9e05c02cbaf'
                '6fc63a0082efb44156f61c81061d3b0272bbccd7657c682e791'),
            'valid': (
                'e4834d365d5f8313503895fd8304d29a566ff4a2df77efb32457fdc353304fb61460511f89bb9'
                '0f14a47132c1539aaa324d3e71f5f56045a61a7292ad25a3c02'),
            'test': (
                '6fe665d33c0f788eba76da50539f0ca02432c70c94b788a493da491215e86043fc732dbeef9bb'
                '49a72341c7283ea55f59d10941ac41f7ac58aea3bdcd72f5cd8'),
        }
        for split, expected in expected_digests.items():
            actual = hashlib.sha512(dataset.data[split].encode()).hexdigest()
            assert actual == expected
Exemplo n.º 28
0
    def test_csv_pandas_column_data_types(self, tmp_path, noaa_jfk_schema,
                                          columns):
        """Check that the loaded table's column dtypes satisfy each column's ``check`` callable.

        The schema's column mapping is rebuilt from ``columns``, skipping entries whose dtype is ``None``.
        """
        assert len(columns) > 0  # Sanity check, make sure columns are there

        # Clear columns: install a fresh, empty mapping in the schema and keep a handle on it
        column_dict = {}
        noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format'][
            'options']['columns'] = column_dict

        # Update column dictionary as specified
        column_dict.update({
            col.name: col.dtype
            for col in columns if col.dtype is not None
        })

        dataset = Dataset(
            noaa_jfk_schema,
            tmp_path,
            mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD,
        )
        frame = dataset.data['jfk_weather_cleaned']
        for col in columns:
            actual_dtype = frame.dtypes[col.name]
            assert col.check(actual_dtype)
Exemplo n.º 29
0
    def test_mode(self, tmp_path, gmb_schema):
        """Check that passing the string ``'DOWNLOAD_ONLY'`` as ``mode`` raises ``ValueError``."""
        # A plain string is passed instead of a ``Dataset.InitializationMode`` member
        invalid_mode = 'DOWNLOAD_ONLY'
        with pytest.raises(ValueError) as exc_info:
            Dataset(gmb_schema, data_dir=tmp_path, mode=invalid_mode)
        assert str(exc_info.value) == 'DOWNLOAD_ONLY not a valid mode'
Exemplo n.º 30
0
def downloaded_noaa_jfk_dataset(noaa_jfk_schema) -> Dataset:
    """Yield a NOAA JFK ``Dataset`` constructed in DOWNLOAD_ONLY mode inside a temporary directory.

    The function is a generator: the temporary directory stays open while the consumer holds the yielded dataset
    and is cleaned up once the generator is resumed or closed.

    NOTE(review): the return annotation names ``Dataset`` although the function yields one; presumably this is
    used as a pytest fixture -- confirm against the decorator, which is not visible here.
    """
    with TemporaryDirectory() as tmp_data_dir:
        noaa_dataset = Dataset(
            noaa_jfk_schema,
            data_dir=tmp_data_dir,
            mode=Dataset.InitializationMode.DOWNLOAD_ONLY,
        )
        yield noaa_dataset