def test_HashCache_GetHash_unmodified_directory(database_path, hash_fn):
    """Test that an unmodified directory returns the same hash.

    The same temporary directory is hashed twice in a row with nothing
    changed in between, so both lookups must agree.
    """
    hash_cache = hashcache.HashCache(database_path, hash_fn)
    with tempfile.TemporaryDirectory() as d:
        directory = pathlib.Path(d)
        first_hash = hash_cache.GetHash(directory)
        second_hash = hash_cache.GetHash(directory)
        assert first_hash == second_hash
def test_HashCache_GetHash_empty_directory(database_path, hash_fn):
    """Test the hash of an empty directory.

    The expected value is the EMPTY_FILE_HASHES entry for the hash function
    under test.
    """
    hash_cache = hashcache.HashCache(database_path, hash_fn)
    with tempfile.TemporaryDirectory() as d:
        empty_dir = pathlib.Path(d)
        expected = EMPTY_FILE_HASHES[hash_fn]
        assert expected == hash_cache.GetHash(empty_dir)
        # Ask again for the same path to exercise the cache-hit path.
        assert expected == hash_cache.GetHash(empty_dir)
def test_HashCache_GetHash_non_existent_path(database_path, hash_fn):
    """Test that hashing a non-existent path raises FileNotFoundError.

    Also checks the error message, which is expected to name the missing
    path.
    """
    hash_cache = hashcache.HashCache(database_path, hash_fn)
    with tempfile.TemporaryDirectory() as d:
        missing_path = pathlib.Path(d) / 'a'
        with pytest.raises(FileNotFoundError) as e_info:
            hash_cache.GetHash(missing_path)
        # Checked after the raises block: statements placed inside it, after
        # the raising call, would never run.
        assert f"File not found: '{d}/a'" == str(e_info.value)
def test_HashCache_GetHash_modified_directory(database_path, hash_fn):
    """Test that modifying a directory changes the hash.

    The directory is hashed while empty, then a file is created inside it
    and the directory is hashed again.
    """
    hash_cache = hashcache.HashCache(database_path, hash_fn)
    with tempfile.TemporaryDirectory() as d:
        directory = pathlib.Path(d)
        hash_before = hash_cache.GetHash(directory)
        # Wait a second before modifying the directory.
        # NOTE(review): presumably so the change is visible in a coarse
        # mtime timestamp -- confirm against the HashCache implementation.
        time.sleep(1)
        (directory / 'a').touch()
        hash_after = hash_cache.GetHash(directory)
        assert hash_before != hash_after
def test_HashCache_GetHash_unmodified_file(database_path, hash_fn):
    """Test that a file with unchanged contents returns the same hash."""
    hash_cache = hashcache.HashCache(database_path, hash_fn)
    with tempfile.TemporaryDirectory() as d:
        target = pathlib.Path(d) / 'a'
        target.touch()
        first_hash = hash_cache.GetHash(target)
        # A second touch leaves the contents alone but updates the mtime
        # timestamp, which forces a cache miss on the next lookup.
        target.touch()
        second_hash = hash_cache.GetHash(target)
        assert first_hash == second_hash
def test_HashCache_GetHash_modified_file(database_path, hash_fn):
    """Test that modifying a file changes the hash.

    An empty file is hashed, then overwritten with new contents and hashed
    again.
    """
    hash_cache = hashcache.HashCache(database_path, hash_fn)
    with tempfile.TemporaryDirectory() as d:
        target = pathlib.Path(d) / 'a'
        target.touch()
        hash_before = hash_cache.GetHash(target)
        # Wait a second before rewriting the file.
        # NOTE(review): presumably so the write is visible in a coarse
        # mtime timestamp -- confirm against the HashCache implementation.
        time.sleep(1)
        target.write_text('Hello')
        hash_after = hash_cache.GetHash(target)
        assert hash_before != hash_after
def test_HashCache_GetHash_in_memory_modified_file(database_path, hash_fn):
    """Test that modifying a file does not change the hash if in memory.

    This test emphasizes the danger of the in-memory hash: the validity of
    the cache is tied to the lifecycle of the process.
    """
    hash_cache = hashcache.HashCache(
        database_path, hash_fn, keep_in_memory=True)
    with tempfile.TemporaryDirectory() as d:
        target = pathlib.Path(d) / 'a'
        target.touch()
        original_hash = hash_cache.GetHash(target)
        time.sleep(1)
        target.write_text('Hello')
        stale_hash = hash_cache.GetHash(target)
        # The in-memory entry is returned even though the file has changed.
        assert original_hash == stale_hash
        # Clear the in-memory cache and look the file up again. Now it is a
        # cache miss and the correct hash is returned.
        hash_cache.Clear()
        fresh_hash = hash_cache.GetHash(target)
        assert original_hash != fresh_hash
def __init__(self, config: corpus_pb2.Corpus):
    """Instantiate a corpus from a proto config.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Args:
      config: A Corpus message.

    Raises:
      TypeError: If the config argument is not a Corpus proto.
      UserError: In case the corpus is not found, or config contains invalid
        options.
      EmptyCorpusException: In case the corpus contains no data. Not raised
        directly in this method; presumably comes from a helper called
        here -- TODO confirm.
    """
    if not isinstance(config, corpus_pb2.Corpus):
        t = type(config).__name__
        raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")

    # Work on a private copy of the validated configuration.
    self.config = corpus_pb2.Corpus()
    self.config.CopyFrom(AssertConfigIsValid(config))
    self._atomizer = None
    self._created = False

    cache.cachepath('corpus').mkdir(parents=True, exist_ok=True)
    hc = hashcache.HashCache(cache.cachepath('hashcache.db'), 'sha1')
    self.content_id = ResolveContentId(self.config, hc)

    # Database of pre-processed files.
    preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
    cache.cachepath('corpus', 'preprocessed', preprocessed_id).mkdir(
        exist_ok=True, parents=True)
    preprocessed_db_path = cache.cachepath(
        'corpus', 'preprocessed', preprocessed_id, 'preprocessed.db')
    # A config that names a content_id explicitly must refer to a
    # preprocessed database that already exists on disk.
    if self.config.HasField('content_id'):
        if not preprocessed_db_path.is_file():
            raise errors.UserError(
                f"Content ID not found: '{self.content_id}'")
    self.preprocessed = preprocessed.PreprocessedContentFiles(
        preprocessed_db_path)

    # Create symlink to contentfiles. Only a local directory or a local tar
    # archive gets a link; any other config leaves the link absent.
    symlink = self.preprocessed.database_path.parent / 'contentfiles'
    if not symlink.is_symlink():
        if config.HasField('local_directory'):
            contentfiles_source = config.local_directory
        elif config.HasField('local_tar_archive'):
            contentfiles_source = config.local_tar_archive
        else:
            contentfiles_source = None
        if contentfiles_source is not None:
            expanded = ExpandConfigPath(
                contentfiles_source,
                path_prefix=FLAGS.clgen_local_path_prefix)
            os.symlink(str(expanded), symlink)

    # Database of encoded pre-processed files.
    encoded_id = ResolveEncodedId(self.content_id, self.config)
    cache.cachepath('corpus', 'encoded', encoded_id).mkdir(
        exist_ok=True, parents=True)
    self.encoded = encoded.EncodedContentFiles(
        cache.cachepath('corpus', 'encoded', encoded_id, 'encoded.db'))
    self.atomizer_path = cache.cachepath(
        'corpus', 'encoded', encoded_id, 'atomizer.pkl')

    # Create symlink to preprocessed files, expressed relative to the
    # encoded directory.
    symlink = self.encoded.database_path.parent / 'preprocessed'
    if not symlink.is_symlink():
        relative_target = os.path.relpath(
            self.preprocessed.database_path.parent,
            self.encoded.database_path.parent)
        os.symlink(relative_target, symlink)

    self.hash = encoded_id
    self.cache = cache.mkcache('corpus', 'encoded', encoded_id)
def test_HashCache_unrecognized_hash_fn(database_path, hash_fn):
    """Test that an unrecognized hash function name raises a ValueError.

    The hash_fn argument is requested but not used by the test body; the
    cache is always constructed with the bogus name 'null'.
    """
    with pytest.raises(ValueError) as e_info:
        hashcache.HashCache(database_path, 'null')
    # Checked after the raises block so that the assertion actually runs.
    error_message = str(e_info.value)
    assert "Hash function not recognized: 'null'" == error_message