Example #1
def test_ResolveContentId_pre_encoded_corpus_url_mismatch():
    """Test that corpuses with different pre-trained URLs have different IDs."""
    config_1 = corpus_pb2.Corpus(
        pre_encoded_corpus_url="mysql://*****:*****@foo:3306/clgen?charset=utf-8"
    )
    config_2 = corpus_pb2.Corpus(
        pre_encoded_corpus_url="sqlite:////tmp/encoded.db")
    assert corpuses.ResolveContentId(config_1) != corpuses.ResolveContentId(
        config_2)
Example #2
def EnumerateLanguageInstanceConfigs(
    language: typing.Dict[str, typing.List[str]]
) -> typing.List[clgen_pb2.Instance]:
    """Enumerate the options for a language."""
    configs = []
    for corpus, model, sampler in itertools.product(language["corpuses"],
                                                    EnumerateModels(),
                                                    language["samplers"]):
        instance_config = clgen_pb2.Instance()
        instance_config.working_dir = FLAGS.working_dir
        instance_config.model.CopyFrom(model)
        instance_config.model.corpus.CopyFrom(
            pbutil.FromFile(
                bazelutil.DataPath(
                    f"phd/experimental/deeplearning/polyglot/corpuses/{corpus}.pbtxt"
                ),
                corpus_pb2.Corpus(),
            ))
        instance_config.sampler.CopyFrom(
            pbutil.FromFile(
                bazelutil.DataPath(
                    f"phd/experimental/deeplearning/polyglot/samplers/{sampler}.pbtxt"
                ),
                sampler_pb2.Sampler(),
            ))
        configs.append(instance_config)
    return configs
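For context, a hypothetical invocation of the function above. The corpus and sampler names are illustrative placeholders for .pbtxt files under the corpuses/ and samplers/ data directories, and FLAGS.working_dir is assumed to be set:

# Hypothetical usage; "opencl" and "opencl_1024" are placeholder names.
language = {
    "corpuses": ["opencl"],
    "samplers": ["opencl_1024"],
}
configs = EnumerateLanguageInstanceConfigs(language)
print(f"{len(configs)} instance configs enumerated")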
Example #3
def test_ResolveContentId_pre_encoded_corpus_url():
    """Test that pre_encoded_corpus_url field returns checksum of URL."""
    config = corpus_pb2.Corpus(
        pre_encoded_corpus_url="mysql://*****:*****@foo:3306/clgen?charset=utf-8"
    )
    assert corpuses.ResolveContentId(config) == (
        "1fb56a3a74a939ee5be79172b3510a498abe7f3c")
Example #4
def test_Corpus_pre_encoded_corpus_url_GetTrainingData(abc_pre_encoded):
    """Test the training data accessor of a pre-encoded corpus."""
    c = corpuses.Corpus(
        corpus_pb2.Corpus(pre_encoded_corpus_url=abc_pre_encoded))
    c.Create()
    # abc_pre_encoded contains two contentfiles, totalling 8 tokens.
    assert len(c.GetTrainingData(shuffle=True)) == 8
Example #5
def abc_corpus_config(abc_corpus):
    """The proto config for a simple Corpus."""
    return corpus_pb2.Corpus(
        local_directory=abc_corpus,
        ascii_character_atomizer=True,
        contentfile_separator="\n\n",
    )
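Functions shaped like this are typically registered as pytest fixtures; the extract omits the decorator. A hedged sketch of a test consuming it, assuming @pytest.fixture is applied to abc_corpus_config:

# Hypothetical test; assumes abc_corpus_config is a registered pytest fixture.
def test_abc_corpus_config_atomizer(abc_corpus_config):
    assert abc_corpus_config.ascii_character_atomizer
    assert abc_corpus_config.contentfile_separator == "\n\n"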
Example #6
File: run.py  Project: SpringRi/phd
def PostprocessSampleCorpus(instance: clgen.Instance):
    """Create a corpus from the model samples and pre-process."""
    sample_dir = instance.model.SamplerCache(instance.sampler)

    # Read the sample protos and write them to a directory of content files.
    contentfiles_dir = pathlib.Path(str(sample_dir) + '.contentfiles')
    contentfiles_dir.mkdir(exist_ok=True)
    logging.info('Writing output contentfiles to %s', contentfiles_dir)
    if len(list(contentfiles_dir.iterdir())) != len(list(
            sample_dir.iterdir())):
        for proto_path in sample_dir.iterdir():
            sample = pbutil.FromFile(proto_path, model_pb2.Sample())
            with open(contentfiles_dir / proto_path.name, 'w') as f:
                f.write(sample.text)

    logging.info('Creating output corpus')
    output_corpus_config = corpus_pb2.Corpus()
    output_corpus_config.CopyFrom(instance.model.corpus.config)
    output_corpus_config.local_directory = str(contentfiles_dir)
    # We derive the programming language name from the input corpus directory.
    # This depends on corpuses being in directories named after their language,
    # e.g. ~/corpuses/opencl, or ~/corpuses/java.
    preprocessed_dir = pathlib.Path(
        instance.model.corpus.preprocessed.url[len('sqlite:///'):]).parent
    language = (preprocessed_dir / 'contentfiles').resolve().name
    output_corpus_config.preprocessor[:] = POSTPROCESSORS[language]
    output_corpus = corpuses.Corpus(output_corpus_config)
    try:
        output_corpus.Create()
    except errors.EmptyCorpusException:
        pass
    return output_corpus
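To make the language derivation concrete, a toy walk-through with a made-up URL; the "contentfiles" entry is a symlink back to the input corpus directory, as the Corpus constructor later on this page shows:

import pathlib

url = "sqlite:////home/me/cache/preprocessed.db"  # made-up URL
preprocessed_dir = pathlib.Path(url[len("sqlite:///"):]).parent
# preprocessed_dir == /home/me/cache; resolving its "contentfiles" symlink
# would land in, e.g., ~/corpuses/opencl, whose basename is the language.
print(preprocessed_dir)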
Example #7
def MakeClgenInstanceConfig(
    working_dir: pathlib.Path,
    encoded_db: encoded.EncodedContentFiles,
    num_training_epochs: int,
    seed_text: str,
    neurons_per_layer: int,
    num_layers: int,
) -> clgen_pb2.Instance:
    """Construct a CLgen instance.

  Args:
    working_dir: The directory to cache CLgen working files in.
    encoded_db: The database of encoded content files.
    num_training_epochs: The number of epochs to train for.
    seed_text: The text to initiate sampling with.
    neurons_per_layer: The number of neurons per layer.
    num_layers: The number of layers in the network.
  """
    return clgen_pb2.Instance(
        working_dir=str(working_dir),
        model=model_pb2.Model(
            corpus=corpus_pb2.Corpus(pre_encoded_corpus_url=encoded_db.url, ),
            architecture=model_pb2.NetworkArchitecture(
                backend=model_pb2.NetworkArchitecture.TENSORFLOW,
                neuron_type=model_pb2.NetworkArchitecture.LSTM,
                neurons_per_layer=neurons_per_layer,
                num_layers=num_layers,
                post_layer_dropout_micros=0,
            ),
            training=model_pb2.TrainingOptions(
                num_epochs=num_training_epochs,
                sequence_length=64,
                batch_size=64,
                shuffle_corpus_contentfiles_between_epochs=True,
                adam_optimizer=model_pb2.AdamOptimizer(
                    initial_learning_rate_micros=2000,
                    learning_rate_decay_per_epoch_micros=50000,
                    beta_1_micros=900000,
                    beta_2_micros=999000,
                    normalized_gradient_clip_micros=5000000,
                ),
            ),
        ),
        sampler=sampler_pb2.Sampler(
            start_text=seed_text,
            batch_size=64,
            sequence_length=1024,
            temperature_micros=1000000,  # = 1.0 real value
            termination_criteria=[
                sampler_pb2.SampleTerminationCriterion(
                    symtok=sampler_pb2.SymmetricalTokenDepth(
                        depth_increase_token="{",
                        depth_decrease_token="}",
                    )),
                sampler_pb2.SampleTerminationCriterion(
                    maxlen=sampler_pb2.MaxTokenLength(
                        maximum_tokens_in_sample=20000, )),
            ],
        ),
    )
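A hypothetical call to the builder above; the path and database URL are placeholders, the hyperparameter values mirror those seen elsewhere on this page, and the pathlib and encoded imports are assumed to be in scope:

config = MakeClgenInstanceConfig(
    working_dir=pathlib.Path("/tmp/clgen"),  # placeholder path
    encoded_db=encoded.EncodedContentFiles("sqlite:////tmp/encoded.db"),
    num_training_epochs=50,
    seed_text="kernel void ",
    neurons_per_layer=512,
    num_layers=2,
)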
Example #8
def test_Corpus_GetTextCorpus_separator(clgen_cache_dir, abc_corpus):
    """Test the concatenation of the abc corpus with a custom separator."""
    del clgen_cache_dir
    c = corpuses.Corpus(
        corpus_pb2.Corpus(local_directory=abc_corpus,
                          ascii_character_atomizer=True,
                          contentfile_separator='\n!!\n'))
    c.Create()
    # We don't know the ordering of the text corpus.
    assert 'The cat sat on the mat.' in c.GetTextCorpus(shuffle=False)
    assert 'Such corpus.\nVery wow.' in c.GetTextCorpus(shuffle=False)
    assert 'Hello, world!' in c.GetTextCorpus(shuffle=False)
    assert c.GetTextCorpus(shuffle=False).count('!!') == 2
Example #9
def CreateCorpusProtoFromFlags() -> corpus_pb2.Corpus:
  corpus = corpus_pb2.Corpus(
    local_directory=FLAGS.clgen_corpus_dir,
    preprocessor=FLAGS.clgen_preprocessor,
    contentfile_separator="\n\n",
  )
  if FLAGS.clgen_multichar_tokenizer:
    corpus.greedy_multichar_atomizer.CopyFrom(
      corpus_pb2.GreedyMulticharAtomizer(tokens=TOKEN_LISTS["opencl"]["tokens"])
    )
  else:
    corpus.ascii_character_atomizer = True

  return corpus
Example #10
def ResolveEncodedId(content_id: str, config: corpus_pb2.Corpus) -> str:
    """Compute the hash of a corpus of preprocessed and encoded contentfiles.

  The hash is computed from the ID of the input files and the serialized
  representation of the config proto.
  """
    config_without_contentfiles = corpus_pb2.Corpus()
    config_without_contentfiles.CopyFrom(config)
    # Clear the contentfiles field, since we use the content_id to uniquely
    # identify the input files. This means that corpuses with the same content
    # files delivered through different means (e.g. two separate but identical
    # directories) have the same hash.
    config_without_contentfiles.ClearField("contentfiles")
    return crypto.sha1_list(content_id,
                            config_without_contentfiles.SerializeToString())
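crypto.sha1_list is presumably a helper that folds all of its arguments into a single SHA-1 digest. A rough hashlib stand-in under that assumption; the real helper's encoding rules may differ:

import hashlib

def sha1_list(*parts) -> str:
    # Assumption: parts are hashed in order; bytes pass through unchanged,
    # everything else is stringified and UTF-8 encoded.
    h = hashlib.sha1()
    for part in parts:
        h.update(part if isinstance(part, bytes) else str(part).encode("utf-8"))
    return h.hexdigest()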
Example #11
def test_Corpus_GetTrainingData_decode(clgen_cache_dir, abc_corpus):
    """Test the decoded output of GetTrainingData()."""
    del clgen_cache_dir
    c = corpuses.Corpus(
        corpus_pb2.Corpus(local_directory=abc_corpus,
                          ascii_character_atomizer=True,
                          contentfile_separator='\n!!\n'))
    c.Create()
    decoded = c.atomizer.DeatomizeIndices(c.GetTrainingData(shuffle=False))
    # Test that each content file (plus contentfile separator) is in corpus.
    assert '\nSuch corpus.\nVery wow.\n!!\n' in decoded
    assert 'Hello, world!\n!!\n' in decoded
    assert 'The cat sat on the mat.\n!!\n' in decoded
    # Test the total length of the corpus.
    assert len('\nSuch corpus.\nVery wow.\n!!\n' + 'Hello, world!\n!!\n' +
               'The cat sat on the mat.\n!!\n') == len(decoded)
Example #12
    def Create(self) -> None:
        """Create the corpus files.

    Raises:
      EmptyCorpusException: If there are no content files, or no successfully
        pre-processed files.
    """
        self._created = True
        app.Log(1, "Content ID: %s", self.content_id)

        # Nothing to do for already-encoded databases.
        # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this after
        # splitting out Corpus class.
        if self.config.HasField("pre_encoded_corpus_url"):
            with self.dashboard_db.Session(commit=True) as session:
                config_to_store = corpus_pb2.Corpus()
                config_to_store.CopyFrom(self.config)
                # Clear the contentfiles field, since we use the content_id to uniquely
                # identify the input files. This means that corpuses with the same content
                # files delivered through different means (e.g. two separate but identical
                # directories) have the same hash.
                config_to_store.ClearField("contentfiles")
                corpus = session.GetOrAdd(
                    dashboard_db.Corpus,
                    config_proto_sha1=crypto.sha1(
                        config_to_store.SerializeToString()),
                    config_proto=str(config_to_store),
                    preprocessed_url="",
                    encoded_url=self.encoded.url,
                    summary=self.GetShortSummary(),
                )
                session.flush()
                self._dashboard_db_id = corpus.id
            return

        preprocessed_lock_path = (
            pathlib.Path(self.preprocessed.url[len("sqlite:///"):]).parent /
            "LOCK")
        with lockfile.LockFile(preprocessed_lock_path):
            self.preprocessed.Create(self.config)
        if not self.preprocessed.size:
            raise errors.EmptyCorpusException(
                f"Pre-processed corpus contains no files: '{self.preprocessed.url}'"
            )
        encoded_lock_path = (
            pathlib.Path(self.encoded.url[len("sqlite:///"):]).parent / "LOCK")
        with lockfile.LockFile(encoded_lock_path):
            start_time = time.time()
            atomizer = self.atomizer
            app.Log(
                1,
                "%s: %s tokens in %s ms",
                type(atomizer).__name__,
                humanize.Commas(atomizer.vocab_size),
                humanize.Commas(int((time.time() - start_time) * 1000)),
            )
            self.encoded.Create(self.preprocessed, atomizer,
                                self.config.contentfile_separator)

        # Add entry to dashboard database
        with self.dashboard_db.Session(commit=True) as session:
            config_to_store = corpus_pb2.Corpus()
            config_to_store.CopyFrom(self.config)
            # Clear the contentfiles field, since we use the content_id to uniquely
            # identify the input files. This means that corpuses with the same content
            # files delivered through different means (e.g. two separate but identical
            # directories) have the same hash.
            config_to_store.ClearField("contentfiles")
            corpus = session.GetOrAdd(
                dashboard_db.Corpus,
                config_proto_sha1=crypto.sha1(
                    config_to_store.SerializeToString()),
                config_proto=str(config_to_store),
                preprocessed_url=self.preprocessed.url,
                encoded_url=self.encoded.url,
                summary=self.GetShortSummary(),
            )
            session.flush()
            self._dashboard_db_id = corpus.id
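The "sqlite:///" prefix-stripping idiom above recurs throughout these snippets whenever a filesystem path is needed next to a database URL. A tiny hypothetical helper capturing it (not part of the original code):

import pathlib

def SqliteUrlToPath(url: str) -> pathlib.Path:
    # Hypothetical helper: strip the sqlite URL scheme to get the db path.
    assert url.startswith("sqlite:///")
    return pathlib.Path(url[len("sqlite:///"):])

lock_path = SqliteUrlToPath("sqlite:////tmp/preprocessed.db").parent / "LOCK"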
Example #13
    def __init__(self, config: corpus_pb2.Corpus):
        """Instantiate a corpus from a proto config.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Args:
      config: A Corpus message.

    Raises:
      TypeError: If the config argument is not a Corpus proto.
      UserError: In case the corpus is not found, or config contains invalid
        options.
      EmptyCorpusException: In case the corpus contains no data.
    """
        if not isinstance(config, corpus_pb2.Corpus):
            t = type(config).__name__
            raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")

        # Make a local copy of the configuration.
        self.config = corpus_pb2.Corpus()
        self.config.CopyFrom(AssertConfigIsValid(config))
        self._atomizer = None
        self._created = False
        self.dashboard_db = dashboard_db.GetDatabase()
        self._dashboard_db_id: typing.Optional[int] = None  # Set in Create()

        # An in-memory cache of the encoded contentfiles indices arrays.
        # Set and used in GetTrainingData().
        self._indices_arrays: typing.Optional[typing.List[np.array]] = None

        cache.cachepath("corpus").mkdir(parents=True, exist_ok=True)
        hc = hashcache.HashCache(cache.cachepath("hashcache.db"), "sha1")
        self.content_id = ResolveContentId(self.config, hc)
        # Database of pre-processed files.
        preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
        cache.cachepath("corpus", "preprocessed",
                        preprocessed_id).mkdir(exist_ok=True, parents=True)
        preprocessed_db_path = cache.cachepath("corpus", "preprocessed",
                                               preprocessed_id,
                                               "preprocessed.db")
        if (self.config.HasField("content_id")
                and not preprocessed_db_path.is_file()):
            raise errors.UserError(
                f"Content ID not found: '{self.content_id}'")
        self.preprocessed = preprocessed.PreprocessedContentFiles(
            f"sqlite:///{preprocessed_db_path}")
        # Create symlink to contentfiles.
        symlink = (
            pathlib.Path(self.preprocessed.url[len("sqlite:///"):]).parent /
            "contentfiles")
        if not symlink.is_symlink():
            if config.HasField("local_directory"):
                os.symlink(
                    str(
                        ExpandConfigPath(
                            config.local_directory,
                            path_prefix=FLAGS.clgen_local_path_prefix)),
                    symlink,
                )
            elif config.HasField("local_tar_archive"):
                os.symlink(
                    str(
                        ExpandConfigPath(
                            config.local_tar_archive,
                            path_prefix=FLAGS.clgen_local_path_prefix,
                        )),
                    symlink,
                )
        # Database of encoded pre-processed files.
        encoded_id = ResolveEncodedId(self.content_id, self.config)
        cache.cachepath("corpus", "encoded", encoded_id).mkdir(exist_ok=True,
                                                               parents=True)
        db_path = cache.cachepath("corpus", "encoded", encoded_id,
                                  "encoded.db")
        # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this conditional
        # logic by making Corpus an abstract class and creating concrete subclasses
        # for the different types of corpus.
        if self.config.HasField("pre_encoded_corpus_url"):
            self.encoded = encoded.EncodedContentFiles(
                config.pre_encoded_corpus_url)
        else:
            self.encoded = encoded.EncodedContentFiles(f"sqlite:///{db_path}")
        self.atomizer_path = cache.cachepath("corpus", "encoded", encoded_id,
                                             "atomizer.pkl")
        # Create symlink to preprocessed files.
        # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this conditional
        # logic after splitting Corpus class.
        if not self.config.HasField("pre_encoded_corpus_url"):
            symlink = (
                pathlib.Path(self.encoded.url[len("sqlite:///"):]).parent /
                "preprocessed")
            if not symlink.is_symlink():
                os.symlink(
                    os.path.relpath(
                        pathlib.Path(
                            self.preprocessed.url[len("sqlite:///"):]).parent,
                        pathlib.Path(
                            self.encoded.url[len("sqlite:///"):]).parent,
                    ),
                    symlink,
                )
        self.hash = encoded_id
        self.cache = cache.mkcache("corpus", "encoded", encoded_id)
Example #14
    def __init__(self, config: corpus_pb2.Corpus):
        """Instantiate a corpus from a proto config.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Args:
      config: A Corpus message.

    Raises:
      TypeError: If the config argument is not a Corpus proto.
      UserError: In case the corpus is not found, or config contains invalid
        options.
      EmptyCorpusException: In case the corpus contains no data.
    """
        if not isinstance(config, corpus_pb2.Corpus):
            t = type(config).__name__
            raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")

        # Make a local copy of the configuration.
        self.config = corpus_pb2.Corpus()
        self.config.CopyFrom(AssertConfigIsValid(config))
        self._atomizer = None
        self._created = False

        cache.cachepath('corpus').mkdir(parents=True, exist_ok=True)
        hc = hashcache.HashCache(cache.cachepath('hashcache.db'), 'sha1')
        self.content_id = ResolveContentId(self.config, hc)
        # Database of pre-processed files.
        preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
        cache.cachepath('corpus', 'preprocessed',
                        preprocessed_id).mkdir(exist_ok=True, parents=True)
        preprocessed_db_path = cache.cachepath('corpus', 'preprocessed',
                                               preprocessed_id,
                                               'preprocessed.db')
        if (self.config.HasField('content_id')
                and not preprocessed_db_path.is_file()):
            raise errors.UserError(
                f"Content ID not found: '{self.content_id}'")
        self.preprocessed = preprocessed.PreprocessedContentFiles(
            preprocessed_db_path)
        # Create symlink to contentfiles.
        symlink = self.preprocessed.database_path.parent / 'contentfiles'
        if not symlink.is_symlink():
            if config.HasField('local_directory'):
                os.symlink(
                    str(
                        ExpandConfigPath(
                            config.local_directory,
                            path_prefix=FLAGS.clgen_local_path_prefix)),
                    symlink)
            elif config.HasField('local_tar_archive'):
                os.symlink(
                    str(
                        ExpandConfigPath(
                            config.local_tar_archive,
                            path_prefix=FLAGS.clgen_local_path_prefix)),
                    symlink)
        # Database of encoded pre-processed files.
        encoded_id = ResolveEncodedId(self.content_id, self.config)
        cache.cachepath('corpus', 'encoded', encoded_id).mkdir(exist_ok=True,
                                                               parents=True)
        self.encoded = encoded.EncodedContentFiles(
            cache.cachepath('corpus', 'encoded', encoded_id, 'encoded.db'))
        self.atomizer_path = cache.cachepath('corpus', 'encoded', encoded_id,
                                             'atomizer.pkl')
        # Create symlink to preprocessed files.
        symlink = self.encoded.database_path.parent / 'preprocessed'
        if not symlink.is_symlink():
            os.symlink(
                os.path.relpath(self.preprocessed.database_path.parent,
                                self.encoded.database_path.parent), symlink)
        self.hash = encoded_id
        self.cache = cache.mkcache('corpus', 'encoded', encoded_id)
Example #15
def CreateCorpusProtoFromFlags() -> corpus_pb2.Corpus:
  corpus = corpus_pb2.Corpus(
      local_directory=FLAGS.clgen_corpus_dir,
      preprocessor=[
          "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
          "deeplearning.clgen.preprocessors.opencl:Compile",
          "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
          "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
          "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
          "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
          "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
          "deeplearning.clgen.preprocessors.opencl:ClangFormat",
          "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
          "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
          "deeplearning.clgen.preprocessors.opencl:Compile",
      ],
      contentfile_separator='\n\n',
  )
  if FLAGS.clgen_multichar_tokenizer:
    corpus.greedy_multichar_atomizer.CopyFrom(
        corpus_pb2.GreedyMulticharAtomizer(tokens=[
            "  ",
            "__assert",
            "__attribute",
            "__builtin_astype",
            "__clc_fabs",
            "__clc_fma",
            "__inline",
            "abs",
            "alignas",
            "alignof",
            "atomic_add",
            "auto",
            "barrier",
            "bool",
            "break",
            "case",
            "char",
            "clamp",
            "complex",
            "const",
            "constant",
            "continue",
            "default",
            "defined",
            "do",
            "double",
            "else",
            "enum",
            "error",
            "event_t",
            "extern",
            "fabs",
            "false",
            "float",
            "for",
            "get_global_id",
            "get_global_size",
            "get_local_id",
            "get_local_size",
            "get_num_groups",
            "global",
            "goto",
            "half",
            "if",
            "image1d_array_t",
            "image1d_buffer_t",
            "image1d_t",
            "image2d_array_t",
            "image2d_t",
            "image3d_t",
            "imaginary",
            "include",
            "inline",
            "int",
            "into",
            "kernel",
            "line",
            "local",
            "long",
            "noreturn",
            "pragma",
            "private",
            "quad",
            "read_only",
            "read_write",
            "register",
            "restrict",
            "return",
            "sampler_t",
            "short",
            "shuffle",
            "signed",
            "size_t",
            "sizeof",
            "sqrt",
            "static",
            "struct",
            "switch",
            "true",
            "typedef",
            "u32",
            "uchar",
            "uint",
            "ulong",
            "undef",
            "union",
            "unsigned",
            "void",
            "volatile",
            "while",
            "wide",
            "write_only",
        ]))
  else:
    corpus.ascii_character_atomizer = True

  return corpus
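For intuition about what a greedy multi-character atomizer does with a token list like the one above: at each position it consumes the longest vocabulary token that matches, falling back to single characters. A toy sketch of that strategy, not the project's implementation:

def GreedyAtomize(text: str, vocab: list) -> list:
    # Toy greedy tokenizer: longest matching vocab token wins; unknown
    # characters become single-character tokens.
    tokens, i = [], 0
    by_length = sorted(vocab, key=len, reverse=True)
    while i < len(text):
        for tok in by_length:
            if text.startswith(tok, i):
                tokens.append(tok)
                i += len(tok)
                break
        else:
            tokens.append(text[i])
            i += 1
    return tokens

# ['kernel', ' ', 'void', ' ', 'f', '(', 'global', ' ', 'int', '*', ' ', 'a', ')']
print(GreedyAtomize("kernel void f(global int* a)", ["kernel", "void", "global", "int"]))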
Example #16
def main(argv: typing.List[str]):
    """Main entry point."""
    if len(argv) > 1:
        raise app.UsageError("Unknown arguments: '{}'.".format(" ".join(
            argv[1:])))

    instance = clgen.Instance(
        clgen_pb2.Instance(
            working_dir=FLAGS.clgen_dir,
            model=model_pb2.Model(
                corpus=corpus_pb2.Corpus(
                    local_directory=FLAGS.clgen_corpus_dir,
                    ascii_character_atomizer=True,
                    preprocessor=[
                        "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
                        "deeplearning.clgen.preprocessors.opencl:Compile",
                        "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                        "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
                        "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
                        "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
                        "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
                        "deeplearning.clgen.preprocessors.opencl:ClangFormat",
                        "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
                        "deeplearning.clgen.preprocessors.opencl:Compile",
                    ],
                    contentfile_separator="\n\n",
                ),
                architecture=model_pb2.NetworkArchitecture(
                    backend=model_pb2.NetworkArchitecture.TENSORFLOW,
                    neuron_type=model_pb2.NetworkArchitecture.LSTM,
                    neurons_per_layer=512,
                    num_layers=2,
                    post_layer_dropout_micros=0,
                ),
                training=model_pb2.TrainingOptions(
                    num_epochs=50,
                    sequence_length=64,
                    batch_size=64,
                    shuffle_corpus_contentfiles_between_epochs=True,
                    adam_optimizer=model_pb2.AdamOptimizer(
                        initial_learning_rate_micros=2000,
                        learning_rate_decay_per_epoch_micros=50000,
                        beta_1_micros=900000,
                        beta_2_micros=999000,
                        normalized_gradient_clip_micros=5000000,
                    ),
                ),
            ),
            sampler=sampler_pb2.Sampler(
                start_text="kernel void ",
                batch_size=64,
                sequence_length=1024,
                temperature_micros=1000000,  # = 1.0 real value
                termination_criteria=[
                    sampler_pb2.SampleTerminationCriterion(
                        symtok=sampler_pb2.SymmetricalTokenDepth(
                            depth_increase_token="{",
                            depth_decrease_token="}",
                        )),
                    sampler_pb2.SampleTerminationCriterion(
                        maxlen=sampler_pb2.MaxTokenLength(
                            maximum_tokens_in_sample=20000, )),
                ],
            ),
        ), )
    db = grewe_features_db.Database(FLAGS.db)
    profile_dir = pathlib.Path(FLAGS.profile_dir)
    profile_dir.mkdir(parents=True, exist_ok=True)
    profiler = prof.AutoCsvProfiler(profile_dir)

    with instance.Session(), multiprocessing.Pool() as pool:
        while True:
            Sample(instance, db, profiler, pool)
Example #17
def main(argv: typing.List[str]):
    """Main entry point."""
    if len(argv) > 1:
        raise app.UsageError("Unknown arguments: '{}'.".format(" ".join(
            argv[1:])))

    os.environ["CLGEN_CACHE"] = f"{FLAGS.result_cache_dir}/clgen"
    # An OpenCL corpus, configured as described in CGO'17.
    corpus = corpuses.Corpus(
        corpus_pb2.Corpus(
            local_directory=FLAGS.github_kernels_dir,
            ascii_character_atomizer=True,
            contentfile_separator="\n\n",
            preprocessor=[
                "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
                "deeplearning.clgen.preprocessors.opencl:Compile",
                "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
                "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
                "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
                "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
                "deeplearning.clgen.preprocessors.opencl:ClangFormat",
                "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
                "deeplearning.clgen.preprocessors.opencl:Compile",
            ],
        ))
    corpus.Create()

    cache_dir = pathlib.Path(FLAGS.result_cache_dir) / corpus.hash
    cache_dir.mkdir(parents=True, exist_ok=True)

    driver = cldrive.CldriveHarness(
        harness_pb2.CldriveHarness(
            opencl_env=[FLAGS.opencl_env],
            opencl_opt=[FLAGS.opencl_opt],
        ))

    with corpus.preprocessed.Session() as session:
        # Query to return all successfully preprocessed OpenCL kernels in a stable
        # order.
        q = (session.query(preprocessed.PreprocessedContentFile.text).filter(
            preprocessed.PreprocessedContentFile.preprocessing_succeeded ==
            True).order_by(preprocessed.PreprocessedContentFile.id))

        num_good_files = q.count()
        num_files = session.query(preprocessed.PreprocessedContentFile).count()
        app.Log(
            1,
            "Corpus of %s files (%.1f%% of %s)",
            humanize.Commas(num_good_files),
            (num_good_files / num_files) * 100,
            humanize.Commas(num_files),
        )

        srcs = [x[0] for x in q]
        batch_size = 8
        max_batch = math.ceil(len(srcs) / batch_size)

        all_outcomes = []
        for i, start_idx in enumerate(range(0, len(srcs), batch_size)):
            cached_results_path = cache_dir / f"{i}.pkl"

            if cached_results_path.is_file():
                app.Log(1, "batch %d of %d", i + 1, max_batch)
                # Read cached results.
                with open(cached_results_path, "rb") as f:
                    outcomes = pickle.load(f)
            elif FLAGS.summarize_only:
                continue
            else:
                app.Log(1, "batch %d of %d", i + 1, max_batch)
                # Evaluate OpenCL kernels and cache results.
                batch = srcs[start_idx:start_idx + batch_size]
                testcases = labtypes.flatten(
                    [OpenClSourceToTestCases(src) for src in batch])
                results = RunTestCasesOrDie(driver, testcases)

                outcomes = [
                    GetOutcomeWithDynamicChecks(result, driver)
                    for result in results
                ]
                with open(cached_results_path, "wb") as f:
                    pickle.dump(outcomes, f)

            all_outcomes += outcomes
            df = pd.DataFrame(
                list(zip(all_outcomes, np.ones(len(all_outcomes)))) +
                [("Total", len(all_outcomes))],
                columns=["outcome", "count"],
            )
            summary = df.groupby("outcome").sum().reset_index()
            summary["ratio"] = [
                f"{x:.2%}" for x in
                # Double the "ratio" values because the 'count' column contains a
                # grand total row.
                2 * summary["count"].values / summary["count"].sum()
            ]
            summary["count"] = [
                humanize.Commas(int(x)) for x in summary["count"]
            ]
            print(summary)
            del df
            del summary
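The doubling in the ratio computation above follows from the "Total" row: its count equals the sum of the per-outcome counts, so summary["count"].sum() is twice the number of samples. A minimal demonstration with made-up outcomes:

import numpy as np
import pandas as pd

all_outcomes = ["Pass", "Pass", "Fail"]  # made-up data
df = pd.DataFrame(
    list(zip(all_outcomes, np.ones(len(all_outcomes)))) +
    [("Total", len(all_outcomes))],
    columns=["outcome", "count"],
)
summary = df.groupby("outcome").sum().reset_index()
# counts: Fail=1, Pass=2, Total=3; their sum is 6 == 2 * 3 samples, so
# doubling restores the true ratios: Fail 33%, Pass 67%, Total 100%.
print(2 * summary["count"].values / summary["count"].sum())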
Example #18
File: corpuses.py  Project: fivosts/clgen
  def __init__(self, config: typing.Union[corpus_pb2.Corpus, corpus_pb2.PreTrainCorpus]):
    """Instantiate a corpus from a proto config.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Args:
      config: A Corpus message.

    Raises:
      TypeError: If the config argument is not a Corpus or PreTrainCorpus proto.
      UserError: In case the corpus is not found, or config contains invalid
        options.
      EmptyCorpusException: In case the corpus contains no data.
    """
    if not isinstance(config, corpus_pb2.Corpus) and not isinstance(config, corpus_pb2.PreTrainCorpus):
      raise TypeError(f"Config must be a Corpus proto. Received: '{type(config).__name__}'")

    # Make a local copy of the configuration.
    if isinstance(config, corpus_pb2.Corpus):
      self.config    = corpus_pb2.Corpus()
      self.pre_train = False
    else:
      self.config    = corpus_pb2.PreTrainCorpus()
      self.pre_train = True

    self.config.CopyFrom(AssertConfigIsValid(config))
    self._tokenizer = None
    self._created = False

    # An in-memory cache of the encoded contentfiles indices arrays.
    # Set and used in GetTrainingData().
    self._indices_arrays: typing.Optional[typing.List[np.array]] = None

    if environment.WORLD_RANK == 0:
      cache.cachepath("corpus").mkdir(parents=True, exist_ok=True)
    distrib.barrier()
    self.content_id = ResolveContentId(self.config)
    # Database of pre-processed files.
    preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
    if environment.WORLD_RANK == 0:
      cache.cachepath("corpus", "preprocessed", preprocessed_id).mkdir(exist_ok=True, parents=True)
    distrib.barrier()
    preprocessed_db_path = cache.cachepath("corpus", "preprocessed",
                                           preprocessed_id, "preprocessed.db")

    if self.config.HasField("content_id") and not preprocessed_db_path.is_file():
      raise ValueError(f"Content ID not found: '{self.content_id}'")
    self.preprocessed = preprocessed.PreprocessedContentFiles(
      f"sqlite:///{preprocessed_db_path}"
    )
    # Create symlink to contentfiles.
    if environment.WORLD_RANK == 0:
      symlink = (pathlib.Path(self.preprocessed.url[len("sqlite:///") :]).parent / "contentfiles")
      if not symlink.is_symlink():
        if config.HasField("local_directory"):
          os.symlink(
            str(ExpandConfigPath(config.local_directory,   path_prefix=FLAGS.clgen_local_path_prefix)),
            symlink,
          )
        elif config.HasField("local_tar_archive"):
          os.symlink(
            str(ExpandConfigPath(config.local_tar_archive, path_prefix=FLAGS.clgen_local_path_prefix)),
            symlink,
          )
        elif config.HasField("bq_database"):
          os.symlink(
            str(ExpandConfigPath(config.bq_database, path_prefix=FLAGS.clgen_local_path_prefix)),
            symlink,
          )  
        # elif config.HasField("fetch_github"):
        #   os.symlink(
        #     str(ExpandConfigPath(config.fetch_github, path_prefix=FLAGS.clgen_local_path_prefix)),
        #     symlink,
        #   )
    distrib.barrier()
    # Database of encoded pre-processed files.
    encoded_id = ResolveEncodedId(self.content_id, self.config)
    if environment.WORLD_RANK == 0:
      cache.cachepath("corpus", "encoded", encoded_id).mkdir(exist_ok=True, parents=True)
    distrib.barrier()
    db_path = cache.cachepath("corpus", "encoded", encoded_id, "encoded.db")
    if self.config.HasField("pre_encoded_corpus_url"):
      self.encoded = encoded.EncodedContentFiles(config.pre_encoded_corpus_url, self.pre_train)
    else:
      self.encoded = encoded.EncodedContentFiles(f"sqlite:///{db_path}", self.pre_train)
    self.tokenizer_path = cache.cachepath(
      "corpus", "encoded", encoded_id, "tokenizer.pkl"
    )
    if environment.WORLD_RANK == 0 and not self.config.HasField("pre_encoded_corpus_url"):
      symlink = (pathlib.Path(self.encoded.url[len("sqlite:///") :]).parent / "preprocessed")
      if not symlink.is_symlink():
        os.symlink(
          os.path.relpath(
            pathlib.Path(self.preprocessed.url[len("sqlite:///") :]).parent,
            pathlib.Path(self.encoded.url[len("sqlite:///") :]).parent,
            ),
          symlink,
        )
    self.hash = encoded_id
    self.cache = cache.mkcache("corpus", "encoded", encoded_id)
    if environment.WORLD_RANK == 0:
      commit.saveCommit(self.cache.path)
      commit.saveCommit(self.cache.path.parent.parent / "preprocessed" / preprocessed_id)
    distrib.barrier()
    l.logger().info("Initialized {}train corpus in {}".format("pre_" if self.pre_train else "", self.cache.path))
    return
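A note on the recurring guard in this constructor: in multi-process runs only rank 0 mutates shared files, and distrib.barrier() stops the other ranks from reading state that does not exist yet. The shape of the idiom in isolation, using the project's own environment, distrib, and cache modules:

if environment.WORLD_RANK == 0:
    # Exactly one process performs the filesystem mutation...
    cache.cachepath("corpus").mkdir(parents=True, exist_ok=True)
distrib.barrier()  # ...and every rank waits here until it is done.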