def EnumerateModels() -> typing.List[model_pb2.Model]:
  """Enumerate the model configurations."""
  models = []
  base_model = pbutil.FromString(BASE_MODEL, model_pb2.Model())
  for num_neurons, num_layers in itertools.product(NUM_NEURONS, NUM_LAYERS):
    model = model_pb2.Model()
    model.CopyFrom(base_model)
    model.architecture.neurons_per_layer = num_neurons
    model.architecture.num_layers = num_layers
    models.append(model)
  return models
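
# A runnable sketch of the grid expansion above. The values here are
# hypothetical: in the real module, NUM_NEURONS, NUM_LAYERS, and the
# BASE_MODEL text proto are module-level constants defined elsewhere.
import itertools

NUM_NEURONS = (256, 512)  # hypothetical grid values
NUM_LAYERS = (1, 2)       # hypothetical grid values

# itertools.product() enumerates one (neurons_per_layer, num_layers) pair
# per configuration, so a 2 x 2 grid yields four Model protos.
assert list(itertools.product(NUM_NEURONS, NUM_LAYERS)) == [
    (256, 1), (256, 2), (512, 1), (512, 2),
]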
def Create(self) -> bool:
  if self._created:
    return False
  self._created = True
  self.corpus.Create()

  # Add entry to dashboard database
  with self.dashboard_db.Session(commit=True) as session:
    config_to_store = model_pb2.Model()
    config_to_store.CopyFrom(self.config)
    config_to_store.ClearField("corpus")
    config_to_store.training.ClearField("num_epochs")
    corpus = session.GetOrAdd(
        dashboard_db.Model,
        corpus_id=self.corpus.dashboard_db_id,
        config_proto_sha1=crypto.sha1(config_to_store.SerializeToString()),
        config_proto=str(config_to_store),
        cache_path=(f"ssh://{system.USERNAME}@{system.HOSTNAME}"
                    f"/{self.cache.path}"),
        summary=self.GetShortSummary(),
    )
    session.flush()
    self._dashboard_db_id = corpus.id

  self.backend.dashboard_model_id = self.dashboard_db_id
  self.backend.dashboard_db = self.dashboard_db
def abc_model_config(abc_corpus_config):
  """The proto config for a simple Model."""
  architecture = model_pb2.NetworkArchitecture(
      backend=model_pb2.NetworkArchitecture.TENSORFLOW,
      embedding_size=2,
      neuron_type=model_pb2.NetworkArchitecture.LSTM,
      neurons_per_layer=4,
      num_layers=1,
      post_layer_dropout_micros=2000,
  )
  optimizer = model_pb2.AdamOptimizer(
      initial_learning_rate_micros=2000,
      learning_rate_decay_per_epoch_micros=5000,
      beta_1_micros=900000,
      beta_2_micros=999000,
      normalized_gradient_clip_micros=5000000,
  )
  training = model_pb2.TrainingOptions(
      num_epochs=1,
      sequence_length=10,
      batch_size=5,
      shuffle_corpus_contentfiles_between_epochs=False,
      adam_optimizer=optimizer,
  )
  return model_pb2.Model(
      corpus=abc_corpus_config,
      architecture=architecture,
      training=training,
  )
def MakeClgenInstanceConfig(
    working_dir: pathlib.Path,
    encoded_db: encoded.EncodedContentFiles,
    num_training_epochs: int,
    seed_text: str,
    neurons_per_layer: int,
    num_layers: int,
) -> clgen_pb2.Instance:
  """Construct a CLgen instance.

  Args:
    working_dir: The directory to cache CLgen working files in.
    encoded_db: The database of encoded content files.
    num_training_epochs: The number of epochs to train for.
    seed_text: The text to initiate sampling with.
    neurons_per_layer: The number of neurons in each layer.
    num_layers: The number of layers in the network.
  """
  return clgen_pb2.Instance(
      working_dir=str(working_dir),
      model=model_pb2.Model(
          corpus=corpus_pb2.Corpus(pre_encoded_corpus_url=encoded_db.url),
          architecture=model_pb2.NetworkArchitecture(
              backend=model_pb2.NetworkArchitecture.TENSORFLOW,
              neuron_type=model_pb2.NetworkArchitecture.LSTM,
              neurons_per_layer=neurons_per_layer,
              num_layers=num_layers,
              post_layer_dropout_micros=0,
          ),
          training=model_pb2.TrainingOptions(
              num_epochs=num_training_epochs,
              sequence_length=64,
              batch_size=64,
              shuffle_corpus_contentfiles_between_epochs=True,
              adam_optimizer=model_pb2.AdamOptimizer(
                  initial_learning_rate_micros=2000,
                  learning_rate_decay_per_epoch_micros=50000,
                  beta_1_micros=900000,
                  beta_2_micros=999000,
                  normalized_gradient_clip_micros=5000000,
              ),
          ),
      ),
      sampler=sampler_pb2.Sampler(
          start_text=seed_text,
          batch_size=64,
          sequence_length=1024,
          temperature_micros=1000000,  # = 1.0 real value
          termination_criteria=[
              sampler_pb2.SampleTerminationCriterion(
                  symtok=sampler_pb2.SymmetricalTokenDepth(
                      depth_increase_token="{",
                      depth_decrease_token="}",
                  )),
              sampler_pb2.SampleTerminationCriterion(
                  maxlen=sampler_pb2.MaxTokenLength(
                      maximum_tokens_in_sample=20000,
                  )),
          ],
      ),
  )
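
# The SymmetricalTokenDepth criterion above ends sampling once every
# depth_increase_token has a matching depth_decrease_token. A self-contained
# sketch of that rule (an illustration, not the library's implementation):
def ReachedSymmetricalTokenDepth(text: str,
                                 depth_increase_token: str = "{",
                                 depth_decrease_token: str = "}") -> bool:
  """Return True once token depth returns to zero after opening."""
  depth = 0
  opened = False
  for token in text:
    if token == depth_increase_token:
      depth += 1
      opened = True
    elif token == depth_decrease_token:
      depth -= 1
  return opened and depth == 0

assert not ReachedSymmetricalTokenDepth("kernel void A(global int* a) {")
assert ReachedSymmetricalTokenDepth("kernel void A(global int* a) { a[0] = 0; }")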
def test_BuildOptimizer_rmsprop():
  """Test RMSProp optimizer proto value conversion to Keras config."""
  config = model_pb2.Model()
  config.training.ClearField("optimizer")
  config.training.rmsprop_optimizer.initial_learning_rate_micros = 1000
  config.training.rmsprop_optimizer.learning_rate_decay_per_epoch_micros = 1000
  optimizer = builders.BuildOptimizer(config)
  optimizer_config = optimizer.get_config()
  assert pytest.approx(optimizer_config["decay"]) == 0.001
  assert pytest.approx(optimizer_config["rho"]) == 0.9
def test_BuildOptimizer_adam():
  """Test AdamOptimizer proto value conversion to Keras config."""
  config = model_pb2.Model()
  config.training.ClearField("optimizer")
  config.training.adam_optimizer.initial_learning_rate_micros = 2000
  config.training.adam_optimizer.learning_rate_decay_per_epoch_micros = 5000
  config.training.adam_optimizer.beta_1_micros = 900000
  config.training.adam_optimizer.beta_2_micros = 999000
  config.training.adam_optimizer.normalized_gradient_clip_micros = 5000000
  optimizer = builders.BuildOptimizer(config)
  optimizer_config = optimizer.get_config()
  assert pytest.approx(optimizer_config["decay"]) == 0.005
  assert pytest.approx(optimizer_config["beta_1"]) == 0.9
  assert pytest.approx(optimizer_config["beta_2"]) == 0.999
  assert pytest.approx(optimizer_config["clipnorm"]) == 5.0
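
# Both tests rely on the fixed-point "micros" convention used throughout
# these protos: an integer field stores the real value multiplied by one
# million. A standalone helper (a sketch of the convention, not the
# library's own API) reproduces the conversions asserted above:
def MicrosToFloat(micros: int) -> float:
  """Convert a fixed-point micros field to its real value."""
  return micros / 1e6

assert MicrosToFloat(5000) == 0.005    # learning_rate_decay_per_epoch_micros -> decay
assert MicrosToFloat(900000) == 0.9    # beta_1_micros -> beta_1
assert MicrosToFloat(999000) == 0.999  # beta_2_micros -> beta_2
assert MicrosToFloat(5000000) == 5.0   # normalized_gradient_clip_micros -> clipnorm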
def _ComputeHash(
    pre_train_corpus_: corpuses.Corpus,
    corpus_: corpuses.Corpus,
    config: model_pb2.Model,
) -> str:
  """Compute model hash.

  The hash is computed from the ID of the corpus (and the pre-training
  corpus, if set) and the serialized representation of the config proto. The
  number of epochs that the model is trained for does not affect the hash,
  since we can share checkpoints between different models if the only
  variable is the epoch count. E.g. if we have a model trained for 10
  epochs, we can use its checkpoint as the starting point for training a
  model for 20 epochs.

  Args:
    pre_train_corpus_: An optional pre-training corpus instance.
    corpus_: A corpus instance.
    config: A Model config proto.

  Returns:
    The unique model ID.
  """
  config_to_hash = model_pb2.Model()
  config_to_hash.CopyFrom(config)
  config_to_hash.ClearField("pre_train_corpus")
  config_to_hash.ClearField("corpus")
  config_to_hash.training.ClearField("num_epochs")
  config_to_hash.training.ClearField("num_train_steps")
  config_to_hash.training.ClearField("batch_size")
  if config_to_hash.training.HasField("data_generator"):
    config_to_hash.training.data_generator.ClearField("steps_per_epoch")
    config_to_hash.training.data_generator.ClearField("validation_set")
  if pre_train_corpus_:
    hash_list = [
        pre_train_corpus_.hash,
        corpus_.hash,
        config_to_hash.SerializeToString(),
    ]
  else:
    hash_list = [corpus_.hash, config_to_hash.SerializeToString()]
  return crypto.sha1_list(hash_list)
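
# A runnable sketch of the checkpoint-sharing property documented above.
# Two configs that differ only in their epoch count serialize identically
# once num_epochs is cleared, so they hash to the same model ID and can
# share checkpoints. (The import path is an assumption based on the
# surrounding code.)
from deeplearning.clgen.proto import model_pb2

config_a = model_pb2.Model()
config_a.training.num_epochs = 10
config_b = model_pb2.Model()
config_b.training.num_epochs = 20
for config in (config_a, config_b):
  config.training.ClearField("num_epochs")
assert config_a.SerializeToString() == config_b.SerializeToString()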
def _ComputeHash(corpus_: corpuses.Corpus, config: model_pb2.Model) -> str:
  """Compute model hash.

  The hash is computed from the ID of the corpus and the serialized
  representation of the config proto. The number of epochs that the model is
  trained for does not affect the hash, since we can share checkpoints
  between different models if the only variable is the epoch count. E.g. if
  we have a model trained for 10 epochs, we can use its checkpoint as the
  starting point for training a model for 20 epochs.

  Args:
    corpus_: A corpus instance.
    config: A Model config proto.

  Returns:
    The unique model ID.
  """
  config_to_hash = model_pb2.Model()
  config_to_hash.CopyFrom(config)
  config_to_hash.ClearField("corpus")
  config_to_hash.training.ClearField("num_epochs")
  return crypto.sha1_list(corpus_.hash, config_to_hash.SerializeToString())
def CreateModelProtoFromFlags() -> model_pb2.Model:
  """Assemble a Model proto from command-line flag values."""
  return model_pb2.Model(
      corpus=CreateCorpusProtoFromFlags(),
      architecture=model_pb2.NetworkArchitecture(
          backend=model_pb2.NetworkArchitecture.TENSORFLOW,
          neuron_type=model_pb2.NetworkArchitecture.LSTM,
          neurons_per_layer=FLAGS.clgen_layer_size,
          num_layers=FLAGS.clgen_num_layers,
          post_layer_dropout_micros=0,
      ),
      training=model_pb2.TrainingOptions(
          num_epochs=FLAGS.clgen_num_epochs,
          sequence_length=FLAGS.clgen_training_sequence_length,
          batch_size=FLAGS.clgen_training_batch_size,
          shuffle_corpus_contentfiles_between_epochs=True,
          adam_optimizer=model_pb2.AdamOptimizer(
              initial_learning_rate_micros=2000,
              learning_rate_decay_per_epoch_micros=50000,
              beta_1_micros=900000,
              beta_2_micros=999000,
              normalized_gradient_clip_micros=5000000,
          ),
      ),
  )
def __init__(self, config: model_pb2.Model):
  """Instantiate a model.

  Args:
    config: A Model message.

  Raises:
    TypeError: If the config argument is not a Model proto.
    UserError: In case of an invalid config.
  """
  # Error early, so that a cache isn't created.
  if not isinstance(config, model_pb2.Model):
    t = type(config).__name__
    raise TypeError(f"Config must be a Model proto. Received: '{t}'")
  # Validate config options.
  if config.training.sequence_length < 1:
    raise errors.UserError('TrainingOptions.sequence_length must be >= 1')

  self.config = model_pb2.Model()
  self.config.CopyFrom(builders.AssertIsBuildable(config))
  self.corpus = corpuses.Corpus(config.corpus)
  self.hash = self._ComputeHash(self.corpus, self.config)
  self.cache = cache.mkcache('model', self.hash)

  # Create the necessary cache directories.
  (self.cache.path / 'checkpoints').mkdir(exist_ok=True)
  (self.cache.path / 'samples').mkdir(exist_ok=True)
  (self.cache.path / 'logs').mkdir(exist_ok=True)

  # Create symlink to encoded corpus.
  symlink = self.cache.path / 'corpus'
  if not symlink.is_symlink():
    os.symlink(
        os.path.relpath(
            pathlib.Path(self.corpus.encoded.url[len('sqlite:///'):]).parent,
            self.cache.path),
        symlink)

  # Create symlink to the atomizer.
  symlink = self.cache.path / 'atomizer'
  if not symlink.is_symlink():
    os.symlink(
        os.path.relpath(self.corpus.atomizer_path, self.cache.path), symlink)

  # Validate metadata against cache.
  if self.cache.get('META.pbtxt'):
    cached_meta = pbutil.FromFile(
        pathlib.Path(self.cache['META.pbtxt']), internal_pb2.ModelMeta())
    # Exclude num_epochs and corpus location from metadata comparison.
    config_to_compare = model_pb2.Model()
    config_to_compare.CopyFrom(self.config)
    config_to_compare.corpus.ClearField('contentfiles')
    config_to_compare.training.ClearField('num_epochs')
    # These fields should have already been cleared, but we'll do it again
    # so that metadata comparisons don't fail when the cached meta schema
    # is updated.
    cached_to_compare = model_pb2.Model()
    cached_to_compare.CopyFrom(cached_meta.config)
    cached_to_compare.corpus.ClearField('contentfiles')
    cached_to_compare.training.ClearField('num_epochs')
    if config_to_compare != cached_to_compare:
      raise errors.InternalError('Metadata mismatch')
    self.meta = cached_meta
  else:
    self.meta = internal_pb2.ModelMeta()
    self.meta.config.CopyFrom(self.config)
    self._WriteMetafile()

  self.backend = {
      model_pb2.NetworkArchitecture.TENSORFLOW:
          tensorflow_backend.TensorFlowBackend,
      model_pb2.NetworkArchitecture.KERAS:
          keras_backend.KerasBackend,
  }[config.architecture.backend](self.config, self.cache, self.corpus)
def __init__(self, config: model_pb2.Model):
  """Instantiate a model.

  Args:
    config: A Model message.

  Raises:
    TypeError: If the config argument is not a Model proto.
    UserError: In case of an invalid config.
  """
  # Error early, so that a cache isn't created.
  if not isinstance(config, model_pb2.Model):
    t = type(config).__name__
    raise TypeError(f"Config must be a Model proto. Received: '{t}'")

  self.config = model_pb2.Model()
  # Validate config options.
  self.config.CopyFrom(builders.AssertIsBuildable(config))
  if FLAGS.num_train_steps:
    self.config.training.num_train_steps = FLAGS.num_train_steps
  if FLAGS.num_pretrain_steps:
    self.config.training.num_pretrain_steps = FLAGS.num_pretrain_steps
  if FLAGS.num_epochs:
    self.config.training.num_epochs = FLAGS.num_epochs

  # Initialize distrib lock path.
  if environment.WORLD_SIZE > 1:
    if environment.WORLD_RANK == 0:
      lock_cache = cache.mkcache("locks")
      lock_cache.path.mkdir(exist_ok=True)
    else:
      while not cache.cachepath("locks").exists():
        time.sleep(0.5)
      lock_cache = cache.mkcache("locks")
    distrib.init(lock_cache.path)

  # Initialize corpuses
  self.corpus = corpuses.Corpus(config.corpus)
  self.pre_train_corpus = None
  if config.HasField("pre_train_corpus"):
    self.pre_train_corpus = corpuses.Corpus(config.pre_train_corpus)

  self.hash = self._ComputeHash(self.pre_train_corpus, self.corpus,
                                self.config)
  self._created = False

  distrib.lock()
  self.cache = cache.mkcache("model", self.hash)
  distrib.unlock()

  if environment.WORLD_RANK == 0:
    # Create the necessary cache directories.
    (self.cache.path / "checkpoints").mkdir(exist_ok=True)
    (self.cache.path / "samples").mkdir(exist_ok=True)

    # Create symlink to encoded corpus.
    symlink = self.cache.path / "corpus"
    if not symlink.is_symlink():
      os.symlink(
          os.path.relpath(
              pathlib.Path(
                  self.corpus.encoded.url[len("sqlite:///"):]).parent,
              self.cache.path,
          ),
          symlink,
      )
    if self.pre_train_corpus:
      symlink = self.cache.path / "pre_train_corpus"
      if not symlink.is_symlink():
        os.symlink(
            os.path.relpath(
                pathlib.Path(
                    self.pre_train_corpus.encoded.url[len("sqlite:///"):]
                ).parent,
                self.cache.path,
            ),
            symlink,
        )

    # Create symlink to the tokenizer and create a backup inside checkpoints.
    symlink = self.cache.path / "tokenizer"
    if not symlink.is_symlink():
      os.symlink(
          os.path.relpath(self.corpus.tokenizer_path, self.cache.path),
          symlink)
    if (self.cache.path / "checkpoints" / "backup_tokenizer.pkl").exists():
      shutil.copyfile(
          self.cache.path / "checkpoints" / "backup_tokenizer.pkl",
          self.corpus.tokenizer_path)

  # Validate metadata against cache.
  if self.cache.get("META.pbtxt"):
    cached_meta = pbutil.FromFile(
        pathlib.Path(self.cache["META.pbtxt"]), internal_pb2.ModelMeta())
    # Exclude num_epochs and corpus location from metadata comparison.
    config_to_compare = model_pb2.Model()
    config_to_compare.CopyFrom(self.config)
    config_to_compare.corpus.ClearField("contentfiles")
    if config_to_compare.HasField("pre_train_corpus"):
      config_to_compare.pre_train_corpus.ClearField("contentfiles")
    config_to_compare.training.ClearField("num_epochs")
    config_to_compare.training.ClearField("num_train_steps")
    if config_to_compare.HasField("pre_train_corpus"):
      config_to_compare.training.ClearField("num_pretrain_steps")
    config_to_compare.training.ClearField("batch_size")
    if config_to_compare.training.HasField("data_generator"):
      config_to_compare.training.data_generator.ClearField("steps_per_epoch")
      config_to_compare.training.data_generator.ClearField("validation_set")
    # These fields should have already been cleared, but we'll do it again
    # so that metadata comparisons don't fail when the cached meta schema
    # is updated.
    cached_to_compare = model_pb2.Model()
    cached_to_compare.CopyFrom(cached_meta.config)
    cached_to_compare.corpus.ClearField("contentfiles")
    if cached_to_compare.HasField("pre_train_corpus"):
      cached_to_compare.pre_train_corpus.ClearField("contentfiles")
    cached_to_compare.training.ClearField("num_epochs")
    cached_to_compare.training.ClearField("num_train_steps")
    if cached_to_compare.HasField("pre_train_corpus"):
      cached_to_compare.training.ClearField("num_pretrain_steps")
    cached_to_compare.training.ClearField("batch_size")
    if cached_to_compare.training.HasField("data_generator"):
      cached_to_compare.training.data_generator.ClearField("steps_per_epoch")
      cached_to_compare.training.data_generator.ClearField("validation_set")
    if (cached_to_compare.training.sequence_length !=
        config_to_compare.training.sequence_length):
      l.logger().warning(
          "Mismatch between pre-trained and current config sequence_length! "
          "This can only be intended in BERT models!")
      cached_to_compare.training.ClearField("sequence_length")
      config_to_compare.training.ClearField("sequence_length")
    if config_to_compare != cached_to_compare:
      raise SystemError("Metadata mismatch: {} \n\n {}".format(
          config_to_compare, cached_to_compare))
    self.meta = cached_meta
  else:
    self.meta = internal_pb2.ModelMeta()
    self.meta.config.CopyFrom(self.config)
    self._WriteMetafile()

  ## Store current commit
  commit.saveCommit(self.cache.path)

  self.backend = {
      model_pb2.NetworkArchitecture.TENSORFLOW_SEQ:
          tf_sequential.tfSequential,
      model_pb2.NetworkArchitecture.KERAS_SEQ:
          keras_sequential.kerasSequential,
      model_pb2.NetworkArchitecture.TENSORFLOW_BERT:
          tf_bert.tfBert,
      model_pb2.NetworkArchitecture.TORCH_BERT:
          torch_bert.torchBert,
  }[config.architecture.backend](self.config, self.cache, self.hash)
  l.logger().info("Initialized {} in {}".format(self.backend,
                                                self.cache.path))
def main(argv: typing.List[str]):
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError("Unknown arguments: '{}'.".format(" ".join(
        argv[1:])))

  instance = clgen.Instance(
      clgen_pb2.Instance(
          working_dir=FLAGS.clgen_dir,
          model=model_pb2.Model(
              corpus=corpus_pb2.Corpus(
                  local_directory=FLAGS.clgen_corpus_dir,
                  ascii_character_atomizer=True,
                  preprocessor=[
                      "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
                      "deeplearning.clgen.preprocessors.opencl:Compile",
                      "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                      "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
                      "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
                      "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
                      "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
                      "deeplearning.clgen.preprocessors.opencl:ClangFormat",
                      "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
                      "deeplearning.clgen.preprocessors.opencl:Compile",
                  ],
                  contentfile_separator="\n\n",
              ),
              architecture=model_pb2.NetworkArchitecture(
                  backend=model_pb2.NetworkArchitecture.TENSORFLOW,
                  neuron_type=model_pb2.NetworkArchitecture.LSTM,
                  neurons_per_layer=512,
                  num_layers=2,
                  post_layer_dropout_micros=0,
              ),
              training=model_pb2.TrainingOptions(
                  num_epochs=50,
                  sequence_length=64,
                  batch_size=64,
                  shuffle_corpus_contentfiles_between_epochs=True,
                  adam_optimizer=model_pb2.AdamOptimizer(
                      initial_learning_rate_micros=2000,
                      learning_rate_decay_per_epoch_micros=50000,
                      beta_1_micros=900000,
                      beta_2_micros=999000,
                      normalized_gradient_clip_micros=5000000,
                  ),
              ),
          ),
          sampler=sampler_pb2.Sampler(
              start_text="kernel void ",
              batch_size=64,
              sequence_length=1024,
              temperature_micros=1000000,  # = 1.0 real value
              termination_criteria=[
                  sampler_pb2.SampleTerminationCriterion(
                      symtok=sampler_pb2.SymmetricalTokenDepth(
                          depth_increase_token="{",
                          depth_decrease_token="}",
                      )),
                  sampler_pb2.SampleTerminationCriterion(
                      maxlen=sampler_pb2.MaxTokenLength(
                          maximum_tokens_in_sample=20000,
                      )),
              ],
          ),
      ),
  )

  db = grewe_features_db.Database(FLAGS.db)
  profile_dir = pathlib.Path(FLAGS.profile_dir)
  profile_dir.mkdir(parents=True, exist_ok=True)
  profiler = prof.AutoCsvProfiler(profile_dir)

  with instance.Session(), multiprocessing.Pool() as pool:
    while True:
      Sample(instance, db, profiler, pool)
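
# The preprocessor entries above are "module:function" strings. A minimal
# sketch of resolving such a name to a callable (an illustration of the
# string format, not the library's own loader):
import importlib

def ResolvePreprocessor(name: str):
  """Resolve a 'package.module:Function' string to the named callable."""
  module_name, _, function_name = name.partition(":")
  return getattr(importlib.import_module(module_name), function_name)

# E.g. ResolvePreprocessor(
#     "deeplearning.clgen.preprocessors.opencl:Compile")
# returns the Compile preprocessor, assuming the package is importable.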