Exemplo n.º 1
0
    def test_deleted(self, tmpdir, datadir):
        """Fetching info for a record file that was deleted after info generation must raise."""
        record_name = 'train-00000-of-00001'
        shutil.copy(datadir / 'tfrecords' / 'simple' / record_name, tmpdir)
        record_path = Path(tmpdir / record_name)

        # Generate the info while the file still exists; the returned object is not needed.
        generate_fileinfo(record_path)

        # Delete the record file itself.
        record_path.unlink()

        with pytest.raises(FileNotFoundError):
            get_fileinfo(record_path, shallow_check=False)
Exemplo n.º 2
0
    def test_generate(self, tmpdir, datadir):
        """Run the ``generate-metadata`` CLI command on two copied files and check their info."""
        # (file name, expected md5) for each fixture file, in the order they are checked.
        expected = (
            ('train-00000-of-00001', '3c8c216b7293fdef623b04e01bb5878a'),
            ('validation-00000-of-00001', 'fdebe01f545d90f127a15ea2f28d3d1d'),
        )
        for record_name, _ in expected:
            shutil.copy(datadir / 'tfrecords' / 'simple' / record_name, tmpdir)

        cli_result = self._invoke_cli('generate-metadata', str(tmpdir / '*'))
        assert cli_result.exit_code == 0
        assert "2 files matched" in cli_result.output
        assert "Finished" in cli_result.output

        # Both fixture files are expected to have the same size and record count.
        for record_name, md5_hash in expected:
            info = get_fileinfo(Path(tmpdir / record_name))
            assert info.file_size == 350
            assert info.total_records == 10
            assert info.md5_hash == md5_hash
Exemplo n.º 3
0
    def test_invalid_hash(self, tmpdir, datadir):
        """
        Overwriting the first bytes of a record file must be detected.

        The info is generated for the untouched file, then the first four bytes are
        overwritten in place. A non-shallow ``get_fileinfo`` is expected to raise
        ``TFRecordValidationError``, and regenerating the info is expected to raise
        ``tf.errors.DataLossError``.
        """
        shutil.copy(datadir / 'tfrecords' / 'simple' / 'train-00000-of-00001', tmpdir)
        fpath = Path(tmpdir / 'train-00000-of-00001')

        # Generate info for the untouched file; the returned object is not needed here.
        generate_fileinfo(fpath)

        # Change the content a bit by overwriting the beginning of the file.
        # 'r+b' keeps the existing bytes (no truncation), so only the first four change.
        with open(fpath, 'r+b') as f:
            f.seek(0, io.SEEK_SET)
            f.write(b'junk')

        with pytest.raises(TFRecordValidationError):
            get_fileinfo(fpath, shallow_check=False)

        # Try to regenerate, should fail because the file no longer holds valid tf records
        with pytest.raises(tf.errors.DataLossError):
            generate_fileinfo(fpath)
Exemplo n.º 4
0
    def test_invalid_size(self, tmpdir, datadir):
        """Appending bytes to a record file must fail the full check, yet info can be regenerated."""
        record_name = 'train-00000-of-00001'
        shutil.copy(datadir / 'tfrecords' / 'simple' / record_name, tmpdir)
        record_path = Path(tmpdir / record_name)

        # Info for the untouched file.
        info_before = generate_fileinfo(record_path)

        # Grow the file by four trailing bytes.
        with open(record_path, 'ab') as handle:
            handle.write(b'junk')

        with pytest.raises(TFRecordValidationError):
            get_fileinfo(record_path, shallow_check=False)

        # Regeneration is expected to succeed because tf can handle trailing rubbish.
        info_after = generate_fileinfo(record_path)
        assert info_before is not info_after
        assert info_before != info_after
        assert info_after.file_size == 354
        assert info_after.md5_hash == '76a086a01e382560309ccfc232711dec'
Exemplo n.º 5
0
    def test_non_existing_meta_file_generate(self, tmpdir, datadir):
        """Info lookup fails before generation and equals the generated info afterwards."""
        record_name = 'train-00000-of-00001'
        shutil.copy(datadir / 'tfrecords' / 'simple' / record_name, tmpdir)
        record_path = Path(tmpdir / record_name)

        # Nothing has been generated yet, so the lookup must fail.
        with pytest.raises(TFRecordInfoMissingError):
            get_fileinfo(record_path)

        # Generate the info and check its fields.
        generated = generate_fileinfo(record_path)
        assert generated.md5_hash == '3c8c216b7293fdef623b04e01bb5878a'
        assert generated.file_size == 350
        assert generated.name == record_name
        assert generated.full_path == record_path

        # A second lookup must return an equal but distinct object.
        fetched = get_fileinfo(record_path)
        assert generated is not fetched
        assert generated == fetched
Exemplo n.º 6
0
    def test_generation_multiple_file_info(self, tmpdir, datadir):
        """Generate info for two files in one folder and check each can be fetched back."""
        # (file name, expected md5) for each fixture file.
        expected = (
            ('train-00000-of-00001', '3c8c216b7293fdef623b04e01bb5878a'),
            ('validation-00000-of-00001', 'fdebe01f545d90f127a15ea2f28d3d1d'),
        )
        for record_name, _ in expected:
            shutil.copy(datadir / 'tfrecords' / 'simple' / record_name, tmpdir)

        record_paths = [Path(tmpdir / record_name) for record_name, _ in expected]

        # Generate for every file first, then verify the hashes.
        generated = [generate_fileinfo(record_path) for record_path in record_paths]
        for info, (_, md5_hash) in zip(generated, expected):
            assert info.md5_hash == md5_hash

        # Fetch again: each result must be a distinct object equal to the generated one.
        fetched = [get_fileinfo(record_path) for record_path in record_paths]
        for stored, original in zip(fetched, generated):
            assert stored is not original
            assert stored == original
Exemplo n.º 7
0
def validate(pattern: str, shallow_check: bool):
    """
    Check every file matched by ``pattern`` and print the total elapsed time.

    ``TFRecordValidationError`` and ``TFRecordInfoMissingError`` propagate to the
    caller; any other exception is reported with ``click.echo`` and the loop
    continues with the next file.
    """
    started_at = time.time()

    matched = resolve_glob_pattern(pattern)
    click.echo(f"{len(matched)} files matched with the pattern.")

    with click.progressbar(matched) as progress:
        for path in progress:
            try:
                # The validation step happens inside get_fileinfo.
                get_fileinfo(path, shallow_check)
            except (TFRecordValidationError, TFRecordInfoMissingError):
                raise
            except Exception as e:  # Probably not a valid tfrecords file
                click.echo(f'Probably not a valid tf_record file {e}')

    elapsed = time.time() - started_at

    click.echo(f"Total execution time: {elapsed}")
Exemplo n.º 8
0
def total_examples(pattern) -> int:
    """
    Get total examples for all the files matched with the given input file pattern.

    Files for which ``get_fileinfo`` raises are skipped (best effort) and add
    nothing to the total; the number of skipped files is reported on stderr so a
    partial total is not mistaken for a complete one.

    Returns the summed ``total_records`` of the files that could be read, which
    is also echoed to the user.
    """

    files = resolve_glob_pattern(pattern)
    click.echo(f"{len(files)} files matched with the pattern.")

    total_rows = 0
    skipped = 0
    for file in files:
        try:
            total_rows += get_fileinfo(file).total_records
        except Exception:
            # Deliberate best effort: one unreadable file must not abort the count.
            skipped += 1

    if skipped:
        click.echo(f"{skipped} files skipped (info could not be read).", err=True)

    click.echo(f"Total number of examples: {total_rows}")
    return total_rows
Exemplo n.º 9
0
 def test_non_existing_meta(self, tmpdir, datadir):
     """Fetching info for a freshly copied file with no generated info must raise."""
     record_name = 'train-00000-of-00001'
     shutil.copy(datadir / 'tfrecords' / 'simple' / record_name, tmpdir)
     record_path = Path(tmpdir / record_name)
     with pytest.raises(TFRecordInfoMissingError):
         get_fileinfo(record_path)
Exemplo n.º 10
0
    def test_get_info_non_existing_file(self, tmpdir):
        """Both fetching and generating info must raise ``FileNotFoundError`` for a missing file."""
        for info_func in (get_fileinfo, generate_fileinfo):
            missing_path = Path(tmpdir / 'nonexisting.tfrecords')
            with pytest.raises(FileNotFoundError):
                info_func(missing_path)