예제 #1
0
def test_Corpus_inequality(clgen_cache_dir, abc_corpus_config):
    """Check that corpuses built from differing options compare unequal."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    original = corpuses.Corpus(abc_corpus_config)
    # Change the atomizer tokens in place before building the second corpus.
    abc_corpus_config.greedy_multichar_atomizer.tokens[:] = ['a']
    modified = corpuses.Corpus(abc_corpus_config)
    assert original != modified
예제 #2
0
def test_Corpus_config_hash_different_options(clgen_cache_dir,
                                              abc_corpus_config):
    """Check that changing an option value yields a different corpus hash."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    baseline = corpuses.Corpus(abc_corpus_config)
    # Change the atomizer tokens in place before building the second corpus.
    abc_corpus_config.greedy_multichar_atomizer.tokens[:] = ['a']
    changed = corpuses.Corpus(abc_corpus_config)
    assert baseline.hash != changed.hash
예제 #3
0
def test_Corpus_content_id(clgen_cache_dir, abc_corpus_config):
    """Check that a corpus addressed by content_id hashes like the original."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    from_contentfiles = corpuses.Corpus(abc_corpus_config)
    resolved_id = from_contentfiles.content_id
    # Swap the 'contentfiles' field for the content_id of the first corpus, so
    # that the second corpus is configured by ID only.
    abc_corpus_config.ClearField('contentfiles')
    abc_corpus_config.content_id = resolved_id
    from_content_id = corpuses.Corpus(abc_corpus_config)
    assert from_contentfiles.hash == from_content_id.hash
예제 #4
0
def test_Corpus_pre_encoded_corpus_url_GetTrainingData(abc_pre_encoded):
    """Check the length of the training data of a pre-encoded corpus."""
    config = corpus_pb2.Corpus(pre_encoded_corpus_url=abc_pre_encoded)
    corpus = corpuses.Corpus(config)
    corpus.Create()
    training_data = corpus.GetTrainingData(shuffle=True)
    # The abc_pre_encoded fixture holds two contentfiles with 8 tokens in total.
    assert len(training_data) == 8
예제 #5
0
def test_Corpus_Create_num_contentfiles(clgen_cache_dir, abc_corpus_config):
    """Check the contentfile count of a known corpus before and after Create()."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    corpus = corpuses.Corpus(abc_corpus_config)
    count_before_create = corpus.GetNumContentFiles()
    assert count_before_create == 0
    corpus.Create()
    count_after_create = corpus.GetNumContentFiles()
    assert count_after_create == 3
예제 #6
0
파일: run.py 프로젝트: SpringRi/phd
def PostprocessSampleCorpus(instance: clgen.Instance):
    """Create a corpus from the model samples and pre-process it.

    The sample protos in the sampler cache are written out as plain-text
    content files in a sibling '<sample_dir>.contentfiles' directory. A new
    corpus is then configured as a copy of the model's corpus config, pointed
    at that directory, with the post-processors registered for the language.

    Args:
      instance: The CLgen instance whose model and sampler identify the
        samples to post-process.

    Returns:
      The output Corpus. If the corpus turned out to be empty, it is returned
      anyway without having been fully created.
    """
    sample_dir = instance.model.SamplerCache(instance.sampler)

    # Read the sample protos and write them to a directory of content files.
    contentfiles_dir = pathlib.Path(str(sample_dir) + '.contentfiles')
    contentfiles_dir.mkdir(exist_ok=True)
    logging.info('Writing output contentfiles to %s', contentfiles_dir)
    # Only (re)export when the number of entries differs between the two
    # directories; equal counts are treated as "already exported".
    if len(list(contentfiles_dir.iterdir())) != len(list(
            sample_dir.iterdir())):
        for proto_path in sample_dir.iterdir():
            sample = pbutil.FromFile(proto_path, model_pb2.Sample())
            with open(contentfiles_dir / proto_path.name, 'w') as f:
                f.write(sample.text)

    logging.info('Creating output corpus')
    output_corpus_config = corpus_pb2.Corpus()
    output_corpus_config.CopyFrom(instance.model.corpus.config)
    output_corpus_config.local_directory = str(contentfiles_dir)
    # We derive the programming language name from the input corpus directory.
    # This depends on corpuses being in directories named after their language,
    # e.g. ~/corpuses/opencl, or ~/corpuses/java.
    #
    # The database URL is a string, so the 'sqlite:///' prefix is stripped and
    # the remainder wrapped in a Path before asking for its parent directory.
    preprocessed_dir = pathlib.Path(
        instance.model.corpus.preprocessed.url[len('sqlite:///'):]).parent
    language = (preprocessed_dir / 'contentfiles').resolve().name
    output_corpus_config.preprocessor[:] = POSTPROCESSORS[language]
    output_corpus = corpuses.Corpus(output_corpus_config)
    try:
        output_corpus.Create()
    except errors.EmptyCorpusException:
        # Deliberate best-effort: an empty output corpus is not an error here,
        # the (empty) corpus object is still returned to the caller.
        pass
    return output_corpus
예제 #7
0
def test_Corpus_Create_preprocess_outcomes(clgen_cache_dir, abc_corpus_config):
    """Check contentfile and preprocessed-file counts for a known corpus."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    # Drop one "good" OpenCL contentfile into the corpus directory.
    kernel_source = """
// A good kernel.
kernel void foo(global int* a) {
  a[get_global_id(0)] *= 2;
}
"""
    kernel_path = abc_corpus_config.local_directory + '/cl_good.cl'
    with open(kernel_path, 'w') as kernel_file:
        kernel_file.write(kernel_source)
    pipeline = [
        'deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim',
        'deeplearning.clgen.preprocessors.opencl:Compile',
        'deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers',
        'deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes',
        'deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines',
        'deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype',
        'deeplearning.clgen.preprocessors.common:StripTrailingWhitespace',
        'deeplearning.clgen.preprocessors.cxx:ClangFormat',
        'deeplearning.clgen.preprocessors.common:MinimumLineCount3',
    ]
    abc_corpus_config.preprocessor[:] = pipeline
    corpus = corpuses.Corpus(abc_corpus_config)
    # Nothing is counted until the corpus has been created.
    assert corpus.GetNumContentFiles() == 0
    assert corpus.GetNumPreprocessedFiles() == 0
    corpus.Create()
    # Four contentfiles in total, one of which is counted as preprocessed.
    assert corpus.GetNumContentFiles() == 4
    assert corpus.GetNumPreprocessedFiles() == 1
예제 #8
0
def test_Corpus_archive_hash(clgen_cache_dir, abc_corpus_config,
                             abc_corpus_archive):
    """Check that a known archive corpus produces the expected hash."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    abc_corpus_config.local_tar_archive = abc_corpus_archive
    archive_corpus = corpuses.Corpus(abc_corpus_config)
    assert ABC_CORPUS_HASH == archive_corpus.hash
예제 #9
0
def test_Corpus_greedy_multichar_atomizer_no_atoms(clgen_cache_dir,
                                                   abc_corpus_config):
    """Check that an empty GreedyMulticharAtomizer token list is rejected."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    abc_corpus_config.greedy_multichar_atomizer.tokens[:] = []
    with pytest.raises(errors.UserError) as raised:
        corpuses.Corpus(abc_corpus_config)
    message = str(raised.value)
    assert message == 'GreedyMulticharAtomizer.tokens is empty'
예제 #10
0
def test_Corpus_invalid_content_id(clgen_cache_dir, abc_corpus_config):
    """Check that an unknown content_id is reported as a UserError."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    # Configure the corpus by content ID only, using an ID that is not cached.
    abc_corpus_config.ClearField('contentfiles')
    abc_corpus_config.content_id = '1234invalid'
    with pytest.raises(errors.UserError) as raised:
        corpuses.Corpus(abc_corpus_config)
    message = str(raised.value)
    assert message == "Content ID not found: '1234invalid'"
예제 #11
0
def test_Corpus_preprocessed_symlink(clgen_cache_dir, abc_corpus_config):
    """Check the 'preprocessed' symlink next to the encoded database."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    corpus = corpuses.Corpus(abc_corpus_config)
    corpus.Create()
    link = corpus.encoded.database_path.parent / 'preprocessed'
    assert link.is_symlink()
    resolved = str(link.resolve())
    expected_suffix = str(corpus.preprocessed.database_path.parent)
    # We can't do a literal comparison because of bazel sandboxing.
    assert resolved.endswith(expected_suffix)
예제 #12
0
def test_Corpus_archive_not_found(clgen_cache_dir, abc_corpus_config):
    """Check that a missing local_tar_archive is reported as a UserError."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    with tempfile.TemporaryDirectory() as d:
        # A path inside a fresh temporary directory; the file is never created.
        missing_path = f'{d}/missing_archive.tar.bz2'
        abc_corpus_config.local_tar_archive = missing_path
        with pytest.raises(errors.UserError) as e_ctx:
            corpuses.Corpus(abc_corpus_config)
    message = str(e_ctx.value)
    assert message == f"Archive not found: '{missing_path}'"
예제 #13
0
def test_Corpus_Create_empty_directory_raises_error(clgen_cache_dir,
                                                    abc_corpus_config):
    """Check that creating a corpus from an empty directory raises an error."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    with tempfile.TemporaryDirectory() as empty_dir:
        abc_corpus_config.local_directory = empty_dir
        with pytest.raises(errors.EmptyCorpusException) as raised:
            empty_corpus = corpuses.Corpus(abc_corpus_config)
            empty_corpus.Create()
        message = str(raised.value)
        assert message == f"Empty content files directory: '{empty_dir}'"
예제 #14
0
def test_Corpus_stale_hash_file(clgen_cache_dir, abc_corpus_config):
    """Check that content_id is unchanged after the directory is modified.

    This documents a limitation of the checksum caching methodology rather
    than a desirable property: an ideal system would break this test, because
    the directory contents have changed. The directory checksum is written to
    a file, however, so the content id stays the same.
    """
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    before = corpuses.Corpus(abc_corpus_config)
    before.Create()
    # Modify the content files directory by adding a new file to it.
    extra_file = pathlib.Path(abc_corpus_config.local_directory) / 'z'
    with open(extra_file, 'w') as handle:
        handle.write('this directory has been modified\n')
    after = corpuses.Corpus(abc_corpus_config)
    # The modification is not reflected in the content id, since the file
    # created during the instantiation of the first corpus is reused.
    assert before.content_id == after.content_id
예제 #15
0
def test_Corpus_greedy_multichar_atomizer_empty_atoms(clgen_cache_dir,
                                                      abc_corpus_config):
    """Test that a GreedyMulticharAtomizer raises error for zero-length string."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    # Set up the invalid config outside of the raises() block, so that only the
    # Corpus constructor is allowed to satisfy the expected exception.
    abc_corpus_config.greedy_multichar_atomizer.tokens[:] = ['']
    with pytest.raises(errors.UserError) as e_info:
        corpuses.Corpus(abc_corpus_config)
    assert 'Empty string found in GreedyMulticharAtomizer.tokens is empty' == str(
        e_info.value)
예제 #16
0
def test_Corpus_badpath(clgen_cache_dir, abc_corpus_config):
    """Check that a non-existent local_directory is reported as a UserError."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    abc_corpus_config.local_directory = "notarealpath"
    with pytest.raises(errors.UserError) as raised:
        corpuses.Corpus(abc_corpus_config)
    message = str(raised.value)
    # The path is resolved to an absolute one, so only the two ends of the
    # message can be matched.
    assert message.startswith("File not found: '")
    assert message.endswith("notarealpath'")
예제 #17
0
def test_Corpus_archive_cannot_be_unpacked(clgen_cache_dir, abc_corpus_config):
    """Check that an archive which cannot be unpacked raises a UserError."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    with tempfile.TemporaryDirectory() as d:
        # A zero-byte file with an archive name.
        bogus_archive = pathlib.Path(d) / 'empty.tar.bz2'
        bogus_archive.touch()
        abc_corpus_config.local_tar_archive = str(
            pathlib.Path(d) / 'empty.tar.bz2')
        with pytest.raises(errors.UserError) as e_ctx:
            corpuses.Corpus(abc_corpus_config)
    message = str(e_ctx.value)
    assert message == f"Archive unpack failed: '{d}/empty.tar.bz2'"
예제 #18
0
def test_Corpus_atomizer_before_Create(clgen_cache_dir, abc_corpus_config):
    """Check that the atomizer property is only usable after Create()."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    corpus = corpuses.Corpus(abc_corpus_config)
    with pytest.raises(ValueError) as raised:
        corpus.atomizer
    message = str(raised.value)
    assert message == 'Must call Create() before accessing atomizer property.'
    corpus.Create()
    # After Create(), the same access is expected to go through without raising.
    corpus.atomizer
예제 #19
0
def test_Corpus_GetTextCorpus_no_shuffle(clgen_cache_dir, abc_corpus_config):
    """Check the unshuffled concatenation of the abc corpus."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    corpus = corpuses.Corpus(abc_corpus_config)
    # Before Create(), the text corpus is the empty string.
    assert corpus.GetTextCorpus(shuffle=False) == ''
    corpus.Create()
    # We don't know the ordering of the text corpus, so check membership only.
    expected_texts = (
        'The cat sat on the mat.',
        'Such corpus.\nVery wow.',
        'Hello, world!',
    )
    for expected_text in expected_texts:
        assert expected_text in corpus.GetTextCorpus(shuffle=False)
    separator_count = corpus.GetTextCorpus(shuffle=False).count('\n\n')
    assert separator_count == 2
예제 #20
0
def test_Corpus_hash_file(clgen_cache_dir, abc_corpus_config):
    """Check that instantiating a corpus writes its content id to a file."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    # The hash file sits next to the content directory: '<directory>.sha1.txt'.
    directory = pathlib.Path(abc_corpus_config.local_directory)
    hash_file_path = pathlib.Path(str(directory) + '.sha1.txt')
    assert not hash_file_path.is_file()
    corpus = corpuses.Corpus(abc_corpus_config)
    assert hash_file_path.is_file()
    with open(hash_file_path) as hash_file:
        stored_id = hash_file.read().strip()
    assert corpus.content_id == stored_id
예제 #21
0
def test_Corpus_preprocessed_symlink(clgen_cache_dir, abc_corpus_config):
    """Check the 'preprocessed' symlink next to the encoded database."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    corpus = corpuses.Corpus(abc_corpus_config)
    corpus.Create()
    # Database locations are derived from the URLs by dropping the
    # 'sqlite:///' prefix.
    prefix_length = len("sqlite:///")
    encoded_dir = pathlib.Path(corpus.encoded.url[prefix_length:]).parent
    link = encoded_dir / "preprocessed"
    assert link.is_symlink()
    resolved = link.resolve()
    preprocessed_dir = pathlib.Path(
        corpus.preprocessed.url[prefix_length:]).parent
    # We can't do a literal comparison because of bazel sandboxing.
    assert str(resolved).endswith(str(preprocessed_dir))
예제 #22
0
def test_Corpus_Create_empty_preprocessed_raises_error(clgen_cache_dir,
                                                       abc_corpus_config):
    """Check that Create() fails when the pre-processed corpus has no data."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    corpus = corpuses.Corpus(abc_corpus_config)
    # Populate the pre-processed database, then delete every row from it.
    corpus.preprocessed.Create(abc_corpus_config)
    with corpus.preprocessed.Session(commit=True) as session:
        rows = session.query(preprocessed.PreprocessedContentFile)
        rows.delete()
    with pytest.raises(errors.EmptyCorpusException) as raised:
        corpus.Create()
    expected_message = ("Pre-processed corpus contains no files: "
                        f"'{corpus.preprocessed.database_path}'")
    assert expected_message == str(raised.value)
예제 #23
0
def test_Corpus_GetTextCorpus_separator(clgen_cache_dir, abc_corpus):
    """Check the concatenation of the abc corpus with a custom separator."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    config = corpus_pb2.Corpus(local_directory=abc_corpus,
                               ascii_character_atomizer=True,
                               contentfile_separator='\n!!\n')
    corpus = corpuses.Corpus(config)
    corpus.Create()
    # We don't know the ordering of the text corpus, so check membership only.
    expected_texts = (
        'The cat sat on the mat.',
        'Such corpus.\nVery wow.',
        'Hello, world!',
    )
    for expected_text in expected_texts:
        assert expected_text in corpus.GetTextCorpus(shuffle=False)
    separator_count = corpus.GetTextCorpus(shuffle=False).count('!!')
    assert separator_count == 2
예제 #24
0
def test_Corpus_Create_empty_preprocessed_raises_error(clgen_cache_dir,
                                                       abc_corpus_config):
    """Check that Create() fails when the pre-processed corpus has no data."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    corpus = corpuses.Corpus(abc_corpus_config)
    # Populate the pre-processed database, then delete every row from it.
    corpus.preprocessed.Create(abc_corpus_config)
    with corpus.preprocessed.Session(commit=True) as session:
        rows = session.query(preprocessed.PreprocessedContentFile)
        rows.delete()
    with test.Raises(errors.EmptyCorpusException) as raised:
        corpus.Create()
    assert isinstance(raised.value, errors.EmptyCorpusException)
    # Only the start of the message is matched; the rest is a database URL.
    message = str(raised.value)
    assert message.startswith(
        "Pre-processed corpus contains no files: 'sqlite:////")
예제 #25
0
def test_Corpus_GetTextCorpus_random_order(clgen_cache_dir, abc_corpus_config):
    """Check that shuffling the contentfiles changes the text corpus."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    corpus = corpuses.Corpus(abc_corpus_config)
    # Before Create(), the text corpus is the empty string.
    assert corpus.GetTextCorpus(shuffle=True) == ''
    corpus.Create()
    # Draw five shuffled concatenations. It is extremely unlikely, though not
    # impossible, that all five come out in the same order, so this test should
    # be considered flaky.
    distinct_orderings = {corpus.GetTextCorpus(shuffle=True) for _ in range(5)}
    assert len(distinct_orderings) > 1
예제 #26
0
def test_Corpus_GetTrainingData_decode(clgen_cache_dir, abc_corpus):
    """Check the decoded text of the unshuffled training data."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    config = corpus_pb2.Corpus(local_directory=abc_corpus,
                               ascii_character_atomizer=True,
                               contentfile_separator='\n!!\n')
    corpus = corpuses.Corpus(config)
    corpus.Create()
    encoded = corpus.GetTrainingData(shuffle=False)
    decoded = corpus.atomizer.DeatomizeIndices(encoded)
    # Each content file, followed by the contentfile separator.
    expected_chunks = [
        '\nSuch corpus.\nVery wow.\n!!\n',
        'Hello, world!\n!!\n',
        'The cat sat on the mat.\n!!\n',
    ]
    for chunk in expected_chunks:
        assert chunk in decoded
    # The decoded corpus is exactly as long as the three chunks together.
    assert len(''.join(expected_chunks)) == len(decoded)
예제 #27
0
def main(argv: typing.List[str]):
    """Main entry point.

    Builds an OpenCL corpus from FLAGS.github_kernels_dir, then drives the
    successfully preprocessed kernels through a cldrive harness in fixed-size
    batches. The outcomes of every batch are cached as a pickle file below
    FLAGS.result_cache_dir, and a running summary table of all outcomes seen
    so far is printed after each processed batch.

    Args:
      argv: The command line arguments. Anything beyond the program name is
        rejected.

    Raises:
      app.UsageError: If unknown command line arguments are given.
    """
    if len(argv) > 1:
        raise app.UsageError("Unknown arguments: '{}'.".format(" ".join(
            argv[1:])))

    # Set the CLGEN_CACHE environment variable to a directory below the result
    # cache before the corpus is instantiated.
    os.environ["CLGEN_CACHE"] = f"{FLAGS.result_cache_dir}/clgen"
    # An OpenCL corpus, configured as described in CGO'17.
    corpus = corpuses.Corpus(
        corpus_pb2.Corpus(
            local_directory=FLAGS.github_kernels_dir,
            ascii_character_atomizer=True,
            contentfile_separator="\n\n",
            preprocessor=[
                "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
                "deeplearning.clgen.preprocessors.opencl:Compile",
                "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
                "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
                "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
                "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
                "deeplearning.clgen.preprocessors.opencl:ClangFormat",
                "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
                "deeplearning.clgen.preprocessors.opencl:Compile",
            ],
        ))
    corpus.Create()

    # Results are cached per corpus: the cache directory is named after the
    # corpus hash.
    cache_dir = pathlib.Path(FLAGS.result_cache_dir) / corpus.hash
    cache_dir.mkdir(parents=True, exist_ok=True)

    driver = cldrive.CldriveHarness(
        harness_pb2.CldriveHarness(
            opencl_env=[FLAGS.opencl_env],
            opencl_opt=[FLAGS.opencl_opt],
        ))

    with corpus.preprocessed.Session() as session:
        # Query to return all successfully preprocessed OpenCL kernels in a stable
        # order.
        q = (session.query(preprocessed.PreprocessedContentFile.text).filter(
            preprocessed.PreprocessedContentFile.preprocessing_succeeded ==
            True).order_by(preprocessed.PreprocessedContentFile.id))

        num_good_files = q.count()
        num_files = session.query(preprocessed.PreprocessedContentFile).count()
        # NOTE(review): the ratio below divides by num_files; presumably
        # corpus.Create() guarantees a non-empty database - confirm.
        app.Log(
            1,
            "Corpus of %s files (%.1f%% of %s)",
            humanize.Commas(num_good_files),
            (num_good_files / num_files) * 100,
            humanize.Commas(num_files),
        )

        # Each query row is a one-element tuple holding the kernel text.
        srcs = [x[0] for x in q]
        batch_size = 8
        max_batch = math.ceil(len(srcs) / batch_size)

        all_outcomes = []
        for i, start_idx in enumerate(range(0, len(srcs), batch_size)):
            # One cache file per batch, named after the batch index.
            cached_results_path = cache_dir / f"{i}.pkl"

            if cached_results_path.is_file():
                app.Log(1, "batch %d of %d", i + 1, max_batch)
                # Read cached results.
                # NOTE(review): pickle.load() can execute arbitrary code, so
                # the result cache directory must only hold trusted files.
                with open(cached_results_path, "rb") as f:
                    outcomes = pickle.load(f)
            elif FLAGS.summarize_only:
                # In summarize-only mode, batches without cached results are
                # skipped instead of being evaluated.
                continue
            else:
                app.Log(1, "batch %d of %d", i + 1, max_batch)
                # Evaluate OpenCL kernels and cache results.
                batch = srcs[start_idx:start_idx + batch_size]
                testcases = labtypes.flatten(
                    [OpenClSourceToTestCases(src) for src in batch])
                results = RunTestCasesOrDie(driver, testcases)

                outcomes = [
                    GetOutcomeWithDynamicChecks(result, driver)
                    for result in results
                ]
                with open(cached_results_path, "wb") as f:
                    pickle.dump(outcomes, f)

            # Rebuild and print the running summary over all outcomes so far:
            # one row per outcome with a count of one each, plus a "Total" row.
            all_outcomes += outcomes
            df = pd.DataFrame(
                list(zip(all_outcomes, np.ones(len(all_outcomes)))) +
                [("Total", len(all_outcomes))],
                columns=["outcome", "count"],
            )
            summary = df.groupby("outcome").sum().reset_index()
            summary["ratio"] = [
                f"{x:.2%}" for x in
                # Double the "ratio" values because the 'count' column contains a
                # grand total row.
                2 * summary["count"].values / summary["count"].sum()
            ]
            summary["count"] = [
                humanize.Commas(int(x)) for x in summary["count"]
            ]
            print(summary)
            del df
            del summary
예제 #28
0
    def __init__(self, config: model_pb2.Model):
        """Build a model from its configuration proto.

        Validates the config, instantiates the corpus, sets up the model cache
        directory with its symlinks, reconciles the config with any cached
        metadata, and finally instantiates the backend.

        Args:
          config: A Model message.

        Raises:
          TypeError: If the config argument is not a Model proto.
          UserError: In case of an invalid config.
          InternalError: If cached metadata does not match the config.
        """
        # Reject bad input up front, so that no cache is created for it.
        if not isinstance(config, model_pb2.Model):
            received = type(config).__name__
            raise TypeError(
                f"Config must be a Model proto. Received: '{received}'")
        # Validate config options.
        if config.training.sequence_length < 1:
            raise errors.UserError(
                'TrainingOptions.sequence_length must be >= 1')

        self.config = model_pb2.Model()
        self.config.CopyFrom(builders.AssertIsBuildable(config))
        self.corpus = corpuses.Corpus(config.corpus)
        self.hash = self._ComputeHash(self.corpus, self.config)
        self.cache = cache.mkcache('model', self.hash)
        # Create the necessary cache directories.
        for subdirectory in ('checkpoints', 'samples', 'logs'):
            (self.cache.path / subdirectory).mkdir(exist_ok=True)

        # Link the cache to the directory of the encoded corpus database, using
        # a path relative to the cache directory.
        corpus_link = self.cache.path / 'corpus'
        if not corpus_link.is_symlink():
            encoded_database = pathlib.Path(
                self.corpus.encoded.url[len('sqlite:///'):])
            corpus_target = os.path.relpath(encoded_database.parent,
                                            self.cache.path)
            os.symlink(corpus_target, corpus_link)

        # Link the cache to the atomizer in the same manner.
        atomizer_link = self.cache.path / 'atomizer'
        if not atomizer_link.is_symlink():
            atomizer_target = os.path.relpath(self.corpus.atomizer_path,
                                              self.cache.path)
            os.symlink(atomizer_target, atomizer_link)

        # Validate metadata against cache.
        if self.cache.get('META.pbtxt'):
            cached_meta = pbutil.FromFile(
                pathlib.Path(self.cache['META.pbtxt']),
                internal_pb2.ModelMeta())
            # Compare copies of both configs with num_epochs and the corpus
            # location removed. The cached copy should already lack these
            # fields, but they are cleared again so that the comparison does
            # not fail when the cached meta schema is updated.
            comparable = []
            for source_config in (self.config, cached_meta.config):
                stripped = model_pb2.Model()
                stripped.CopyFrom(source_config)
                stripped.corpus.ClearField('contentfiles')
                stripped.training.ClearField('num_epochs')
                comparable.append(stripped)
            if comparable[0] != comparable[1]:
                raise errors.InternalError('Metadata mismatch')
            self.meta = cached_meta
        else:
            # No cached metadata yet: record the current config and write it.
            self.meta = internal_pb2.ModelMeta()
            self.meta.config.CopyFrom(self.config)
            self._WriteMetafile()

        # Pick the backend class from the configured architecture backend.
        backend_classes = {
            model_pb2.NetworkArchitecture.TENSORFLOW:
            tensorflow_backend.TensorFlowBackend,
            model_pb2.NetworkArchitecture.KERAS: keras_backend.KerasBackend,
        }
        backend_class = backend_classes[config.architecture.backend]
        self.backend = backend_class(self.config, self.cache, self.corpus)
예제 #29
0
def test_Corpus_hash(clgen_cache_dir, abc_corpus_config):
    """Check that a known corpus produces the expected hash."""
    # The cache fixture is requested but its value is not used here.
    del clgen_cache_dir
    known_corpus = corpuses.Corpus(abc_corpus_config)
    assert ABC_CORPUS_HASH == known_corpus.hash
예제 #30
0
def test_Corpus_config_type_error():
    """Check that a TypeError is raised if config is not a Corpus proto."""
    with pytest.raises(TypeError) as raised:
        corpuses.Corpus(1)
    message = str(raised.value)
    assert message == "Config must be a Corpus proto. Received: 'int'"