Exemplo n.º 1
0
def test_HashCache_GetHash_empty_directory(database_path, hash_fn):
    """Check that an empty directory hashes to the expected empty hash.

    The lookup is performed twice on the same path so that the second
    request exercises the cache-hit path.
    """
    hash_cache = hashcache.HashCache(database_path, hash_fn)
    with tempfile.TemporaryDirectory() as d:
        directory = pathlib.Path(d)
        # Two identical lookups: the first populates the cache, the second
        # is expected to be answered from it.
        for _ in range(2):
            assert EMPTY_FILE_HASHES[hash_fn] == hash_cache.GetHash(directory)
Exemplo n.º 2
0
def test_HashCache_GetHash_unmodified_directory(database_path, hash_fn):
    """Check that hashing an untouched directory twice gives equal hashes."""
    hash_cache = hashcache.HashCache(database_path, hash_fn)
    with tempfile.TemporaryDirectory() as d:
        directory = pathlib.Path(d)
        first_hash = hash_cache.GetHash(directory)
        second_hash = hash_cache.GetHash(directory)
        assert first_hash == second_hash
Exemplo n.º 3
0
def test_HashCache_GetHash_non_existent_path(database_path, hash_fn):
    """Check that hashing a missing path raises FileNotFoundError.

    The error message must name the path that could not be found.
    """
    hash_cache = hashcache.HashCache(database_path, hash_fn)
    with tempfile.TemporaryDirectory() as d:
        # 'a' is never created inside the temporary directory.
        missing_path = pathlib.Path(d) / 'a'
        with pytest.raises(FileNotFoundError) as e_info:
            hash_cache.GetHash(missing_path)
        assert str(e_info.value) == f"File not found: '{d}/a'"
Exemplo n.º 4
0
def test_HashCache_GetHash_modified_directory(database_path, hash_fn):
    """Check that adding a file to a directory yields a different hash."""
    hash_cache = hashcache.HashCache(database_path, hash_fn)
    with tempfile.TemporaryDirectory() as d:
        directory = pathlib.Path(d)
        hash_before = hash_cache.GetHash(directory)
        # Pause before modifying the directory -- presumably so that the
        # change lands on a later timestamp than the cached entry
        # (TODO confirm against the HashCache implementation).
        time.sleep(1)
        (directory / 'a').touch()
        hash_after = hash_cache.GetHash(directory)
        assert hash_before != hash_after
Exemplo n.º 5
0
def test_HashCache_GetHash_unmodified_file(database_path, hash_fn):
    """Check that a file with unchanged contents keeps the same hash."""
    hash_cache = hashcache.HashCache(database_path, hash_fn)
    with tempfile.TemporaryDirectory() as d:
        target = pathlib.Path(d) / 'a'
        target.touch()
        first_hash = hash_cache.GetHash(target)
        # A second touch leaves the contents alone but updates the mtime
        # timestamp, which forces a cache miss on the next lookup.
        target.touch()
        second_hash = hash_cache.GetHash(target)
        assert first_hash == second_hash
Exemplo n.º 6
0
def test_HashCache_GetHash_modified_file(database_path, hash_fn):
    """Check that rewriting a file's contents yields a different hash."""
    hash_cache = hashcache.HashCache(database_path, hash_fn)
    with tempfile.TemporaryDirectory() as d:
        target = pathlib.Path(d) / 'a'
        target.touch()
        hash_before = hash_cache.GetHash(target)
        # Pause before rewriting -- presumably so the write gets a later
        # timestamp than the cached entry (TODO confirm).
        time.sleep(1)
        target.write_text('Hello')
        hash_after = hash_cache.GetHash(target)
        assert hash_before != hash_after
Exemplo n.º 7
0
def test_HashCache_GetHash_in_memory_modified_file(database_path, hash_fn):
    """Check that the in-memory cache hides file modifications until cleared.

    This highlights the hazard of keep_in_memory=True: a stale hash keeps
    being returned after the file changes, so the validity of the cache is
    tied to the lifecycle of the process (or to an explicit Clear()).
    """
    hash_cache = hashcache.HashCache(database_path, hash_fn,
                                     keep_in_memory=True)
    with tempfile.TemporaryDirectory() as d:
        target = pathlib.Path(d) / 'a'
        target.touch()
        original_hash = hash_cache.GetHash(target)
        time.sleep(1)
        target.write_text('Hello')
        # The contents changed, yet the in-memory entry is still returned.
        stale_hash = hash_cache.GetHash(target)
        assert original_hash == stale_hash
        # Dropping the in-memory cache turns the next lookup into a cache
        # miss, so the hash of the new contents is returned.
        hash_cache.Clear()
        fresh_hash = hash_cache.GetHash(target)
        assert original_hash != fresh_hash
Exemplo n.º 8
0
    def __init__(self, config: corpus_pb2.Corpus):
        """Instantiate a corpus from a proto config.

        If this is a new corpus, a number of files will be created, which may
        take some time.

        Args:
            config: A Corpus message.

        Raises:
            TypeError: If the config argument is not a Corpus proto.
            UserError: In case the corpus is not found, or config contains
                invalid options.
            EmptyCorpusException: In case the corpus contains no data.
        """
        if not isinstance(config, corpus_pb2.Corpus):
            t = type(config).__name__
            raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")

        # Make a local copy of the configuration.
        self.config = corpus_pb2.Corpus()
        self.config.CopyFrom(AssertConfigIsValid(config))
        self._atomizer = None
        self._created = False

        cache.cachepath('corpus').mkdir(parents=True, exist_ok=True)
        hc = hashcache.HashCache(cache.cachepath('hashcache.db'), 'sha1')
        self.content_id = ResolveContentId(self.config, hc)

        # Database of pre-processed files.
        preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
        cache.cachepath('corpus', 'preprocessed',
                        preprocessed_id).mkdir(exist_ok=True, parents=True)
        preprocessed_db_path = cache.cachepath('corpus', 'preprocessed',
                                               preprocessed_id,
                                               'preprocessed.db')
        # A corpus that is referenced by content ID must already have been
        # pre-processed; there is nothing to build it from otherwise.
        if (self.config.HasField('content_id')
                and not preprocessed_db_path.is_file()):
            raise errors.UserError(
                f"Content ID not found: '{self.content_id}'")
        self.preprocessed = preprocessed.PreprocessedContentFiles(
            preprocessed_db_path)

        # Create symlink to contentfiles. The database directory is recovered
        # by dropping the leading 'sqlite:///' from the database URL (assumes
        # the URL always carries that prefix).
        symlink = pathlib.Path(
            self.preprocessed.url[len('sqlite:///'):]).parent / 'contentfiles'
        if not symlink.is_symlink():
            if config.HasField('local_directory'):
                contentfiles_source = config.local_directory
            elif config.HasField('local_tar_archive'):
                contentfiles_source = config.local_tar_archive
            else:
                # Neither local source is set: no symlink is created.
                contentfiles_source = None
            if contentfiles_source is not None:
                os.symlink(
                    str(
                        ExpandConfigPath(
                            contentfiles_source,
                            path_prefix=FLAGS.clgen_local_path_prefix)),
                    symlink)

        # Data of encoded pre-preprocessed files.
        encoded_id = ResolveEncodedId(self.content_id, self.config)
        cache.cachepath('corpus', 'encoded', encoded_id).mkdir(exist_ok=True,
                                                               parents=True)
        self.encoded = encoded.EncodedContentFiles(
            cache.cachepath('corpus', 'encoded', encoded_id, 'encoded.db'))
        self.atomizer_path = cache.cachepath('corpus', 'encoded', encoded_id,
                                             'atomizer.pkl')

        # Create symlink to preprocessed files. The link target is relative
        # to the encoded directory.
        encoded_dir = pathlib.Path(
            self.encoded.url[len('sqlite:///'):]).parent
        symlink = encoded_dir / 'preprocessed'
        if not symlink.is_symlink():
            preprocessed_dir = pathlib.Path(
                self.preprocessed.url[len('sqlite:///'):]).parent
            os.symlink(os.path.relpath(preprocessed_dir, encoded_dir),
                       symlink)

        self.hash = encoded_id
        self.cache = cache.mkcache('corpus', 'encoded', encoded_id)
Exemplo n.º 9
0
def test_HashCache_unrecognized_hash_fn(database_path, hash_fn):
    """Check that an unknown hash function name raises ValueError.

    The hash_fn fixture is accepted but not used; the unrecognized name
    'null' is passed to the constructor instead.
    """
    with pytest.raises(ValueError) as e_info:
        hashcache.HashCache(database_path, 'null')
    assert str(e_info.value) == "Hash function not recognized: 'null'"
Exemplo n.º 10
0
  def __init__(self, config: corpus_pb2.Corpus):
    """Build a corpus from its proto configuration.

    Constructing a new corpus creates a number of cache directories and
    database files, which may take some time.

    Args:
      config: A Corpus message.

    Raises:
      TypeError: If the config argument is not a Corpus proto.
      UserError: In case the corpus is not found, or config contains invalid
        options.
      EmptyCorpusException: In case the corpus contains no data.
    """
    if not isinstance(config, corpus_pb2.Corpus):
      t = type(config).__name__
      raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")

    # Keep a validated private copy of the configuration.
    self.config = corpus_pb2.Corpus()
    self.config.CopyFrom(AssertConfigIsValid(config))
    self._atomizer = None
    self._created = False

    # In-memory cache of the encoded contentfiles indices arrays; set and
    # used in GetTrainingData().
    self._indices_arrays: typing.Optional[typing.List[np.array]] = None

    # Number of leading characters to drop from a database URL in order to
    # obtain a filesystem path.
    scheme_len = len('sqlite:///')

    cache.cachepath('corpus').mkdir(parents=True, exist_ok=True)
    hash_cache = hashcache.HashCache(cache.cachepath('hashcache.db'), 'sha1')
    self.content_id = ResolveContentId(self.config, hash_cache)

    # --- Pre-processed files database. ---
    preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
    preprocessed_parts = ('corpus', 'preprocessed', preprocessed_id)
    cache.cachepath(*preprocessed_parts).mkdir(exist_ok=True, parents=True)
    preprocessed_db_path = cache.cachepath(*preprocessed_parts,
                                           'preprocessed.db')
    if self.config.HasField('content_id'):
      # A corpus named by content ID must already have a database on disk.
      if not preprocessed_db_path.is_file():
        raise errors.UserError(f"Content ID not found: '{self.content_id}'")
    self.preprocessed = preprocessed.PreprocessedContentFiles(
        f'sqlite:///{preprocessed_db_path}')

    # Link the original contentfiles next to the pre-processed database.
    contentfiles_link = pathlib.Path(
        self.preprocessed.url[scheme_len:]).parent / 'contentfiles'
    if not contentfiles_link.is_symlink():
      if config.HasField('local_directory'):
        contentfiles_source = config.local_directory
      elif config.HasField('local_tar_archive'):
        contentfiles_source = config.local_tar_archive
      else:
        # Neither local source is set, so no link is made.
        contentfiles_source = None
      if contentfiles_source is not None:
        expanded_source = ExpandConfigPath(
            contentfiles_source, path_prefix=FLAGS.clgen_local_path_prefix)
        os.symlink(str(expanded_source), contentfiles_link)

    # --- Encoded pre-processed files. ---
    encoded_id = ResolveEncodedId(self.content_id, self.config)
    encoded_parts = ('corpus', 'encoded', encoded_id)
    cache.cachepath(*encoded_parts).mkdir(exist_ok=True, parents=True)
    encoded_db_path = cache.cachepath(*encoded_parts, 'encoded.db')
    # TODO(github.com/ChrisCummins/phd/issues/46): Refactor this conditional
    # logic by making Corpus an abstract class and creating concrete
    # subclasses for the different types of corpus.
    uses_pre_encoded_corpus = self.config.HasField('pre_encoded_corpus_url')
    if uses_pre_encoded_corpus:
      encoded_url = config.pre_encoded_corpus_url
    else:
      encoded_url = f'sqlite:///{encoded_db_path}'
    self.encoded = encoded.EncodedContentFiles(encoded_url)
    self.atomizer_path = cache.cachepath(*encoded_parts, 'atomizer.pkl')

    # Link the pre-processed directory next to the encoded database. This is
    # skipped when the encoded corpus comes from a pre-encoded URL.
    if not uses_pre_encoded_corpus:
      encoded_dir = pathlib.Path(self.encoded.url[scheme_len:]).parent
      preprocessed_link = encoded_dir / 'preprocessed'
      if not preprocessed_link.is_symlink():
        preprocessed_dir = pathlib.Path(
            self.preprocessed.url[scheme_len:]).parent
        os.symlink(
            os.path.relpath(preprocessed_dir, encoded_dir), preprocessed_link)

    self.hash = encoded_id
    self.cache = cache.mkcache(*encoded_parts)