def _EndOfEpochTestSample(
    self, corpus, sampler: samplers.Sampler, step: int, epoch_num: int
):
    """Generate test samples at the end of an epoch and record them.

    Produces FLAGS.clgen_per_epoch_test_samples samples using a batch size of
    one and a fixed seed of 0. Each sample is logged, stored as a
    dashboard_db.TrainingSample row, and written to the summary writer as a
    "samples" text summary.

    Args:
      corpus: The corpus whose atomizer decodes the sampled indices.
      sampler: The sampler to use. It is specialized to the corpus atomizer
        and its batch_size attribute is overwritten with 1.
      step: The training step, stored with each sample and used as the
        summary step.
      epoch_num: The epoch number stored with each sample.
    """
    import tensorflow as tf

    atomizer = corpus.atomizer
    sampler.Specialize(atomizer)
    sampler.batch_size = 1
    seed = 0

    self.InitSampling(sampler, seed)
    self.InitSampleBatch(sampler)

    samples, stats = [], []
    for i in range(FLAGS.clgen_per_epoch_test_samples):
        # Use the builtin bool: the np.bool alias is not available in every
        # NumPy release.
        done = np.zeros(1, dtype=bool)
        while not done[0]:
            # NOTE(review): the timer and the in-progress token list are
            # re-created on every SampleNextIndices() call, so tokens from a
            # call that did not complete the sample are discarded. Confirm
            # that this is intended.
            start_time = time.time()
            sample_in_progress = sampler.tokenized_start_text.copy()
            indices = self.SampleNextIndices(sampler, done)
            # The batch size is 1, so only the first row of indices is used.
            for index in indices[0]:
                sample_in_progress.append(atomizer.decoder[index])
                if not sampler.SampleIsComplete(sample_in_progress):
                    continue
                # Record (token count, elapsed milliseconds) for the sample.
                stats.append(
                    (
                        len(sample_in_progress),
                        int((time.time() - start_time) * 1000),
                    )
                )
                sample = "".join(sample_in_progress)
                samples.append(sample)
                app.Log(1, "End-of-epoch sample %d:\n%s", i + 1, sample)
                done[0] = True
                break

    # Write samples to the dashboard database.
    with self.dashboard_db.Session(commit=True) as dbs:
        dbs.add_all(
            [
                dashboard_db.TrainingSample(
                    model_id=self.dashboard_model_id,
                    epoch=epoch_num,
                    step=step,
                    sample=sample_text,
                    token_count=token_count,
                    sample_time=sample_time_ms,
                )
                for sample_text, (token_count, sample_time_ms) in zip(
                    samples, stats
                )
            ]
        )

    # Publish the samples as a text summary.
    samples_as_markdown = [
        self.FormatCodeAsMarkdown(sample) for sample in samples
    ]
    samples_tensor = tf.convert_to_tensor(samples_as_markdown, dtype=tf.string)
    summary_op = tf.summary.text("samples", samples_tensor)
    summary = self.inference_sess.run(summary_op)
    self.summary_writer.add_summary(summary, step)
def Sample(
    self,
    sampler: samplers.Sampler,
    sample_observers: typing.List[sample_observers_lib.SampleObserver],
    seed: int = None,
) -> None:
    """Sample the model, reporting samples through observers.

    Nothing is returned. To access the generated samples, implement a
    SampleObserver and pass it in. Train() is always called first, so calling
    Sample() is equivalent to calling Train() followed by Sample().

    Sample batches are generated for as long as self._SampleBatch() returns a
    truthy value; the decision to stop is delegated to that method, which
    receives the sample observers.

    Args:
      sampler: The sampler to sample using.
      sample_observers: A non-empty list of SampleObserver objects that are
        notified of new samples.
      seed: A numeric value passed to the backend's InitSampling(). The
        original documentation states that the RNG is seeded randomly when
        this is not given.

    Raises:
      UserError: If called with no sample observers.

    The original documentation also lists UnableToAcquireLockError,
    InvalidStartText and InvalidSymtokTokens. None is raised directly here;
    presumably they propagate from Train() and sampler.Specialize().
    """
    if not sample_observers:
        raise errors.UserError("Cannot sample without any observers")

    # The clock starts before Train(), so the reported rate includes any
    # time spent training.
    sample_start_time = labdate.MillisecondsTimestamp()

    self.Train()

    log_name = f"sampler_{sampler.hash}"
    log_dir = self.cache.path / "logs"
    with logutil.TeeLogsToFile(log_name, log_dir):
        app.Log(1, "Sampling: '%s'", sampler.start_text)

        atomizer = self.corpus.atomizer
        sampler.Specialize(atomizer)
        self.backend.InitSampling(sampler, seed)
        for observer in sample_observers:
            observer.Specialize(self, sampler)

        # Count every _SampleBatch() call, including the final one that
        # returns a falsy value.
        batch_count = 1
        while self._SampleBatch(sampler, atomizer, sample_observers):
            batch_count += 1

        time_now = labdate.MillisecondsTimestamp()
        elapsed_ms = time_now - sample_start_time
        ms_per_batch = int(elapsed_ms / max(batch_count, 1))
        app.Log(
            1,
            "Produced %s sample batches at a rate of %s ms / batch.",
            humanize.Commas(batch_count),
            humanize.Commas(ms_per_batch),
        )
def _SampleBatch(
    self,
    sampler: samplers.Sampler,
    atomizer: atomizers.AtomizerBase,
    sample_observers: typing.List[sample_observers_lib.SampleObserver],
) -> bool:
    """Run a single iteration of the batched sample inner-loop.

    Generates sampler.batch_size samples. Each completed sample is wrapped in
    a model_pb2.Sample message and passed to every observer's OnSample().

    Args:
      sampler: The sampler; provides the start text, the batch size and the
        completion test.
      atomizer: The atomizer whose decoder maps sampled indices to tokens.
      sample_observers: The observers notified of each completed sample.

    Returns:
      True if every OnSample() call made during this batch returned a falsy
      value, else False.
    """
    samples_in_progress = [
        sampler.tokenized_start_text.copy() for _ in range(sampler.batch_size)
    ]
    # Use the builtin bool: the np.bool alias is not available in every
    # NumPy release.
    done = np.zeros(sampler.batch_size, dtype=bool)
    start_time = labdate.MillisecondsTimestamp()
    wall_time_start = start_time

    self.backend.InitSampleBatch(sampler)

    # The return value of this method. It becomes False as soon as any
    # observer's OnSample() returns a truthy value.
    # NOTE(review): an earlier comment here described the opposite polarity
    # (an observer returning False stops sampling). The code is unchanged;
    # confirm against the SampleObserver implementations.
    continue_sampling = True

    # Sampling loop. Continues until all samples in the batch are done.
    while not done.all():
        indices = self.backend.SampleNextIndices(sampler, done)

        # Iterate over all samples in the batch to determine whether
        # they're done.
        for i in range(sampler.batch_size):
            if done[i]:
                continue

            for index in indices[i]:
                samples_in_progress[i].append(atomizer.decoder[index])
                if not sampler.SampleIsComplete(samples_in_progress[i]):
                    continue

                end_time = labdate.MillisecondsTimestamp()
                done[i] = True
                sample = model_pb2.Sample(
                    text="".join(samples_in_progress[i]),
                    sample_start_epoch_ms_utc=start_time,
                    sample_time_ms=end_time - start_time,
                    wall_time_ms=end_time - wall_time_start,
                    num_tokens=len(samples_in_progress[i]),
                )
                # Notify sample observers. A list, not a generator, is
                # passed to all() so that every observer is notified even
                # after one has asked to stop.
                continue_sampling &= all(
                    [not obs.OnSample(sample) for obs in sample_observers]
                )

                # Wall sample time is the difference between the end of the
                # previous sample and the end of the current sample.
                wall_time_start = labdate.MillisecondsTimestamp()
                # Remaining indices for this sample are discarded.
                break

    return continue_sampling
def _EndOfEpochTestSample(
    self, corpus, sampler: samplers.Sampler, step: int, *, num_samples: int = 12
):
    """Generate test samples at the end of an epoch and write a text summary.

    Produces num_samples samples using a batch size of one and a fixed seed
    of 0. Each sample is logged, then all are written to the summary writer
    as a "samples" text summary, each wrapped in a markdown code fence.

    Args:
      corpus: The corpus whose atomizer decodes the sampled indices.
      sampler: The sampler to use. It is specialized to the corpus atomizer
        and its batch_size attribute is overwritten with 1.
      step: The training step that the summary is recorded against.
      num_samples: Keyword-only. The number of samples to generate. Defaults
        to 12, the previously hard-coded value.
    """
    import tensorflow as tf

    atomizer = corpus.atomizer
    sampler.Specialize(atomizer)
    sampler.batch_size = 1
    seed = 0

    self.InitSampling(sampler, seed)
    self.InitSampleBatch(sampler)

    samples = []
    for i in range(num_samples):
        # Use the builtin bool: the np.bool alias is not available in every
        # NumPy release.
        done = np.zeros(1, dtype=bool)
        while not done[0]:
            # NOTE(review): the in-progress token list is re-created on
            # every SampleNextIndices() call, so tokens from a call that did
            # not complete the sample are discarded. Confirm that this is
            # intended.
            sample_in_progress = sampler.tokenized_start_text.copy()
            indices = self.SampleNextIndices(sampler, done)
            # The batch size is 1, so only the first row of indices is used.
            for index in indices[0]:
                sample_in_progress.append(atomizer.decoder[index])
                if not sampler.SampleIsComplete(sample_in_progress):
                    continue
                sample = ''.join(sample_in_progress)
                samples.append(sample)
                app.Log(1, 'End-of-epoch sample %d:\n%s', i + 1, sample)
                done[0] = True
                break

    # Publish the samples as a text summary.
    samples_as_markdown = [
        f'```\n{sample.strip()}\n```' for sample in samples
    ]
    samples_tensor = tf.convert_to_tensor(samples_as_markdown, dtype=tf.string)
    summary_op = tf.summary.text('samples', samples_tensor)
    summary = self.inference_sess.run(summary_op)
    self.summary_writer.add_summary(summary, step)
def _SampleBatch(
    self,
    sampler: samplers.Sampler,
    atomizer: atomizers.AtomizerBase,
    sample_observers: typing.List[sample_observers_lib.SampleObserver],
) -> bool:
    """Run a single iteration of the batched sample inner-loop.

    Produces one sample with SampleOneWithBacktracking(), runs the joined
    tokens through a fixed list of preprocessors, and passes the resulting
    model_pb2.Sample to every observer.

    Args:
      sampler: The sampler to use. Its encoded_start_text attribute is saved
        on entry and restored before the observers are notified, including
        when sampling or preprocessing raises.
      atomizer: The atomizer passed to the backtracking helper and sampler.
      sample_observers: The observers notified of the sample.

    Returns:
      True if every observer's OnSample() returned a falsy value, else False.
    """
    start_time = labdate.MillisecondsTimestamp()

    # The sampler.encoded_start_text attribute is used as a way to re-seed
    # the model state during rollback, so save the original value here and
    # restore it at the end of the sample batch.
    original_sampler_encoded_start_text = sampler.encoded_start_text.copy()

    try:
        self.backend.InitSampleBatch(sampler)
        backtracker = OpenClBacktrackingHelper(atomizer, self._target_features)
        self._logger.OnSampleStart(backtracker)
        sampled_tokens = self.SampleOneWithBacktracking(
            sampler, atomizer, backtracker
        )
        self._logger.OnSampleEnd(backtracker)

        end_time = labdate.MillisecondsTimestamp()

        # Format text. With no sampled tokens there is nothing to preprocess.
        if len(sampled_tokens):
            text = preprocessors.Preprocess(
                "".join(sampled_tokens),
                [
                    "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                    "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
                    "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
                    "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                    "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
                    "deeplearning.clgen.preprocessors.cxx:ClangFormat",
                ],
            )
        else:
            text = ""
    finally:
        # Restore the sampler's start text even when sampling or
        # preprocessing fails, so the caller's sampler is not left re-seeded.
        sampler.encoded_start_text = original_sampler_encoded_start_text

    # Notify sample observers. Sample time and wall time are the same here.
    sample = model_pb2.Sample(
        text=text,
        sample_start_epoch_ms_utc=start_time,
        sample_time_ms=end_time - start_time,
        wall_time_ms=end_time - start_time,
        num_tokens=len(sampled_tokens),
    )
    # A list, not a generator, is passed to all() so that every observer is
    # notified even after one has returned a truthy value.
    return all([not obs.OnSample(sample) for obs in sample_observers])
def SampleFast(self,
               sampler: samplers.Sampler,
               min_num_samples: int,
               seed: typing.Optional[int] = None
              ) -> typing.List[model_pb2.Sample]:
    """Sample a model.

    Same as Sample(), but without printing or caching samples. Because
    samples are not cached, infinite sampling loops are not supported, since
    the sample protos must be returned at some point.

    Args:
      sampler: The sampler to sample using.
      min_num_samples: The minimum number of samples to return. Sampling
        occurs in whole batches, so more samples than this may be returned:
        batches are produced until at least min_num_samples samples exist.
        E.g. if min_num_samples is 7 and the batch size is 10, 10 samples
        are returned.
      seed: A numeric value passed to the backend's InitSampling(). The
        original documentation states that the RNG is seeded randomly when
        this is not given.

    Returns:
      A list of Sample protos.

    The original documentation lists UnableToAcquireLockError,
    InvalidStartText and InvalidSymtokTokens as possible errors. None is
    raised directly here; presumably they propagate from Train() and
    sampler.Specialize().
    """
    self.Train()

    sample_count = 1
    with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                               self.cache.path / 'logs'):
        logging.info("Sampling: '%s'", sampler.start_text)
        sample_start_time = labdate.MillisecondsTimestamp()
        atomizer = self.corpus.atomizer
        sampler.Specialize(atomizer)
        batch_size = self.backend.InitSampling(sampler, seed)
        samples = []

        # Per-sample batch outer loop. Continues until we have as many
        # samples as we want.
        while True:
            samples_in_progress = [
                sampler.tokenized_start_text.copy() for _ in range(batch_size)
            ]
            # Use the builtin bool: the np.bool alias is not available in
            # every NumPy release.
            done = np.zeros(batch_size, dtype=bool)
            start_time = labdate.MillisecondsTimestamp()
            wall_time_start = start_time

            self.backend.InitSampleBatch(sampler, batch_size)

            # Sampling loop. Continues until all samples in the batch are
            # done.
            while True:
                indices = self.backend.SampleNextIndices(sampler, batch_size)

                # Iterate over all samples in the batch to determine whether
                # they're done.
                for i in range(batch_size):
                    if done[i]:
                        continue

                    token = atomizer.decoder[indices[i]]
                    samples_in_progress[i].append(token)
                    if sampler.SampleIsComplete(samples_in_progress[i]):
                        end_time = labdate.MillisecondsTimestamp()
                        done[i] = True
                        sample = model_pb2.Sample(
                            text=''.join(samples_in_progress[i]),
                            sample_start_epoch_ms_utc=start_time,
                            sample_time_ms=end_time - start_time,
                            wall_time_ms=end_time - wall_time_start,
                            num_tokens=len(samples_in_progress[i]))
                        sample_count += 1
                        samples.append(sample)
                        # Wall time of the next sample is measured from the
                        # end of this one.
                        wall_time_start = labdate.MillisecondsTimestamp()

                # Complete the batch.
                if done.all():
                    break

            # Complete sampling. Note that sample_count starts at 1.
            if sample_count > min_num_samples:
                now = labdate.MillisecondsTimestamp()
                # Guard the division so an empty result cannot raise
                # ZeroDivisionError.
                logging.info(
                    'Produced %s samples at a rate of %s ms / sample.',
                    humanize.intcomma(len(samples)),
                    humanize.intcomma(
                        int((now - sample_start_time) / max(len(samples), 1))))
                break

    return samples
def Sample(self,
           sampler: samplers.Sampler,
           min_num_samples: int,
           seed: typing.Optional[int] = None
          ) -> typing.List[model_pb2.Sample]:
    """Sample a model, printing each sample and caching it to disk.

    Train() is always called first, so calling Sample() is equivalent to
    calling Train() followed by Sample(). Every sample is printed to stdout
    and written to the sampler cache directory as '<sha256 of text>.pbtxt'.

    Args:
      sampler: The sampler to sample using.
      min_num_samples: The minimum number of samples to return. Sampling
        occurs in whole batches, so more samples than this may be returned:
        batches are produced until at least min_num_samples samples exist.
        E.g. if min_num_samples is 7 and the batch size is 10, 10 samples
        are returned. If zero, a single batch is produced and an empty list
        is returned. If negative, sampling never terminates and samples are
        not collected in memory.
      seed: A numeric value passed to the backend's InitSampling(). The
        original documentation states that the RNG is seeded randomly when
        this is not given.

    Returns:
      A list of Sample protos. Empty unless min_num_samples is positive.

    The original documentation lists UnableToAcquireLockError,
    InvalidStartText and InvalidSymtokTokens as possible errors. None is
    raised directly here; presumably they propagate from Train() and
    sampler.Specialize().
    """
    self.Train()

    sample_count = 1
    self.SamplerCache(sampler).mkdir(exist_ok=True)
    with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                               self.cache.path / 'logs'):
        logging.info("Sampling: '%s'", sampler.start_text)
        if min_num_samples < 0:
            logging.warning(
                'Entering an infinite sample loop, this process will never end!'
            )
        sample_start_time = labdate.MillisecondsTimestamp()
        atomizer = self.corpus.atomizer
        sampler.Specialize(atomizer)
        batch_size = self.backend.InitSampling(sampler, seed)
        samples = []
        sample_dir = self.SamplerCache(sampler)

        # Per-sample batch outer loop. Continues until we have as many
        # samples as we want.
        while True:
            samples_in_progress = [
                sampler.tokenized_start_text.copy() for _ in range(batch_size)
            ]
            # Use the builtin bool: the np.bool alias is not available in
            # every NumPy release.
            done = np.zeros(batch_size, dtype=bool)
            start_time = labdate.MillisecondsTimestamp()
            wall_time_start = start_time

            self.backend.InitSampleBatch(sampler, batch_size)

            # Sampling loop. Continues until all samples in the batch are
            # done.
            while True:
                indices = self.backend.SampleNextIndices(sampler, batch_size)

                # Iterate over all samples in the batch to determine whether
                # they're done.
                for i in range(batch_size):
                    if done[i]:
                        continue

                    token = atomizer.decoder[indices[i]]
                    samples_in_progress[i].append(token)
                    if sampler.SampleIsComplete(samples_in_progress[i]):
                        end_time = labdate.MillisecondsTimestamp()
                        done[i] = True
                        sample = model_pb2.Sample(
                            text=''.join(samples_in_progress[i]),
                            sample_start_epoch_ms_utc=start_time,
                            sample_time_ms=end_time - start_time,
                            wall_time_ms=end_time - wall_time_start,
                            num_tokens=len(samples_in_progress[i]))
                        print(f'=== BEGIN CLGEN SAMPLE {sample_count} '
                              f'===\n\n{sample.text}\n')
                        sample_count += 1
                        # Cache the sample, keyed by the hash of its text.
                        sample_id = crypto.sha256_str(sample.text)
                        sample_path = sample_dir / f'{sample_id}.pbtxt'
                        pbutil.ToFile(sample, sample_path)
                        # Only collect samples when a finite, positive
                        # number was requested.
                        if min_num_samples > 0:
                            samples.append(sample)
                        # Wall time of the next sample is measured from the
                        # end of this one.
                        wall_time_start = labdate.MillisecondsTimestamp()

                # Complete the batch.
                if done.all():
                    break

            # Complete sampling. Note that sample_count starts at 1. A
            # negative min_num_samples requests an infinite loop (see the
            # warning above), so it must never satisfy this condition.
            if 0 <= min_num_samples < sample_count:
                now = labdate.MillisecondsTimestamp()
                # Report the number actually produced; `samples` stays empty
                # when min_num_samples is zero.
                num_produced = sample_count - 1
                logging.info(
                    'Produced %s samples at a rate of %s ms / sample.',
                    humanize.intcomma(num_produced),
                    humanize.intcomma(
                        int((now - sample_start_time) / max(num_produced, 1))))
                break

    return samples
def Sample(
    self,
    sampler: samplers.Sampler,
    min_num_samples: int,
    seed: typing.Optional[int] = None,
) -> typing.Iterable[model_pb2.Sample]:
    """Sample a model, yielding samples as they are completed.

    This is a generator: nothing runs until it is iterated, and the final
    summary log line is emitted only once the generator is exhausted.

    Args:
      sampler: The sampler to sample using. It is specialized to
        self.atomizer.
      min_num_samples: The minimum number of samples to yield. Sampling
        occurs in whole batches, so more samples than this may be yielded:
        batches are produced until at least min_num_samples samples exist.
        E.g. if min_num_samples is 7 and the batch size is 10, 10 samples
        are yielded.
      seed: A numeric value passed to the backend's InitSampling(). The
        original documentation states that the RNG is seeded randomly when
        this is not given.

    Returns:
      An iterator over Sample protos.

    The original documentation lists UnableToAcquireLockError,
    InvalidStartText and InvalidSymtokTokens as possible errors. None is
    raised directly here; presumably they propagate from
    sampler.Specialize() and the backend.
    """
    sample_count = 1
    atomizer = self.atomizer
    sampler.Specialize(atomizer)
    batch_size = self.backend.InitSampling(sampler, seed)
    sample_start_time = labdate.MillisecondsTimestamp()

    # Per-sample batch outer loop. Continues until we have as many samples
    # as we want.
    while True:
        samples_in_progress = [
            sampler.tokenized_start_text.copy() for _ in range(batch_size)
        ]
        # Use the builtin bool: the np.bool alias is not available in every
        # NumPy release.
        done = np.zeros(batch_size, dtype=bool)
        start_time = labdate.MillisecondsTimestamp()
        wall_time_start = start_time

        self.backend.InitSampleBatch(sampler, batch_size)

        # Sampling loop. Continues until all samples in the batch are done.
        while True:
            indices = self.backend.SampleNextIndices(sampler, batch_size)

            # Iterate over all samples in the batch to determine whether
            # they're done.
            for i in range(batch_size):
                if done[i]:
                    continue

                token = atomizer.decoder[indices[i]]
                samples_in_progress[i].append(token)
                if sampler.SampleIsComplete(samples_in_progress[i]):
                    end_time = labdate.MillisecondsTimestamp()
                    done[i] = True
                    sample = model_pb2.Sample(
                        text=''.join(samples_in_progress[i]),
                        sample_start_epoch_ms_utc=start_time,
                        sample_time_ms=end_time - start_time,
                        wall_time_ms=end_time - wall_time_start,
                        num_tokens=len(samples_in_progress[i]))
                    sample_count += 1
                    yield sample
                    # Taken after the consumer resumes the generator, so the
                    # next sample's wall time excludes consumer time.
                    wall_time_start = labdate.MillisecondsTimestamp()

            # Complete the batch.
            if done.all():
                break

        # Complete sampling. Note that sample_count starts at 1.
        if sample_count > min_num_samples:
            now = labdate.MillisecondsTimestamp()
            num_produced = sample_count - 1
            logging.info(
                'Produced %s samples at a rate of %s ms / sample.',
                humanize.intcomma(num_produced),
                humanize.intcomma(
                    int((now - sample_start_time) / max(num_produced, 1))))
            break