def test_TensorFlowBackend_Sample_return_value_matches_cached_sample(
    clgen_cache_dir, abc_tensorflow_model_config):
  """Test that Sample() returns Sample protos."""
  del clgen_cache_dir
  abc_tensorflow_model_config.training.batch_size = 1
  m = models.Model(abc_tensorflow_model_config)
  sample_observer = sample_observers.InMemorySampleSaver()
  m.Sample(
      MockSampler(hash="hash"),
      [
          sample_observers.MaxSampleCountObserver(1),
          sample_observer,
          sample_observers.LegacySampleCacheObserver(),
      ],
  )
  samples = sample_observer.samples
  # Samples are produced in batches of sampler.batch_size elements.
  assert len(samples) == 1
  assert len(list((m.cache.path / "samples" / "hash").iterdir())) == 1
  cached_sample_path = (m.cache.path / "samples" / "hash" /
                        list((m.cache.path / "samples" / "hash").iterdir())[0])
  assert cached_sample_path.is_file()
  cached_sample = pbutil.FromFile(cached_sample_path, model_pb2.Sample())
  assert samples[0].text == cached_sample.text
  assert samples[0].sample_time_ms == cached_sample.sample_time_ms
  assert (samples[0].sample_start_epoch_ms_utc ==
          cached_sample.sample_start_epoch_ms_utc)
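# The MockSampler fixture used by the sampling tests in this file is not
# shown. A minimal sketch of what such a fixture could look like, inferred
# from the attributes the sampling code in this file actually touches
# (sampler.hash, batch_size, start_text, tokenized_start_text, Specialize(),
# SampleIsComplete()). This class is hypothetical, not the project's real
# test fixture:
class MockSampler(object):
  """A barebones stand-in for samplers.Sampler."""

  def __init__(self, hash: str = "hash", batch_size: int = 1):
    self.hash = hash              # Names the per-sampler cache directory.
    self.batch_size = batch_size
    self.start_text = "a"
    self.tokenized_start_text = ["a"]
    self.encoded_start_text = []

  def Specialize(self, atomizer) -> None:
    """A real sampler encodes its start text against the atomizer here."""
    pass

  def SampleIsComplete(self, sample_in_progress) -> bool:
    """End every sample after a single generated token so tests run fast."""
    return len(sample_in_progress) > 1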
def test_PrintSampleObserver(capsys):
  observer = sample_observers.PrintSampleObserver()
  sample = model_pb2.Sample(text="Hello, world!")
  assert observer.OnSample(sample)
  captured = capsys.readouterr()
  assert captured.out == """\
def PostprocessSampleCorpus(instance: clgen.Instance):
  """Create a corpus from the model samples and pre-process."""
  sample_dir = instance.model.SamplerCache(instance.sampler)
  # Read the sample protos and write them to a directory of content files.
  contentfiles_dir = pathlib.Path(str(sample_dir) + '.contentfiles')
  contentfiles_dir.mkdir(exist_ok=True)
  logging.info('Writing output contentfiles to %s', contentfiles_dir)
  if len(list(contentfiles_dir.iterdir())) != len(list(sample_dir.iterdir())):
    for proto_path in sample_dir.iterdir():
      sample = pbutil.FromFile(proto_path, model_pb2.Sample())
      with open(contentfiles_dir / proto_path.name, 'w') as f:
        f.write(sample.text)
  logging.info('Creating output corpus')
  output_corpus_config = corpus_pb2.Corpus()
  output_corpus_config.CopyFrom(instance.model.corpus.config)
  output_corpus_config.local_directory = str(contentfiles_dir)
  # We derive the programming language name from the input corpus directory.
  # This depends on corpuses being in directories named after their language,
  # e.g. ~/corpuses/opencl, or ~/corpuses/java.
  preprocessed_dir = pathlib.Path(
      instance.model.corpus.preprocessed.url[len('sqlite:///'):]).parent
  language = (preprocessed_dir / 'contentfiles').resolve().name
  output_corpus_config.preprocessor[:] = POSTPROCESSORS[language]
  output_corpus = corpuses.Corpus(output_corpus_config)
  try:
    output_corpus.Create()
  except errors.EmptyCorpusException:
    pass
  return output_corpus
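# The language derivation in PostprocessSampleCorpus() above is subtle: it
# relies on the corpus cache's 'contentfiles' entry being a symlink back to
# the input corpus directory, which resolve() follows, so .name yields the
# directory (and hence language) name. A self-contained illustration of that
# path arithmetic; the directory layout and URL below are invented for the
# example, and the symlink mirrors the assumption the code above makes:
import pathlib
import tempfile

root = pathlib.Path(tempfile.mkdtemp())
(root / 'corpuses' / 'opencl').mkdir(parents=True)
(root / 'cache' / 'preprocessed').mkdir(parents=True)
# The cache keeps a 'contentfiles' symlink to the original corpus directory.
(root / 'cache' / 'preprocessed' / 'contentfiles').symlink_to(
    root / 'corpuses' / 'opencl')

url = 'sqlite:///' + str(root / 'cache' / 'preprocessed' / 'preprocessed.db')
preprocessed_dir = pathlib.Path(url[len('sqlite:///'):]).parent
language = (preprocessed_dir / 'contentfiles').resolve().name
assert language == 'opencl'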
def write_samples_cache(db_sample_obs: sample_observers.SamplesDatabaseObserver,
                        tokenizer: "tokenizers.TokenizerBase",
                        samples: typing.List[ActiveSample],
                        ) -> None:
  for sample in samples:
    try:
      s = model_pb2.Sample(
        train_step = -1,
        text = tokenizer.ArrayToCode(sample.sample, with_formatting = True),
        sample_indices = "",
        encoded_sample_indices = "",
        original_input = "",
        sample_feed = tokenizer.ArrayToCode(sample.sample_feed.input_feed,
                                            with_formatting = True),
        encoded_text = "",
        sample_start_epoch_ms_utc = 0,
        sample_time_ms = 0,
        wall_time_ms = 0,
        feature_vector = '\n'.join(
          ["{}:{}".format(k, v) for k, v in sample.features.items()]
        ) if sample.features else "None",
        num_tokens = (np.where(sample.sample == tokenizer.padToken)[0][0]
                      if tokenizer.padToken in sample.sample
                      else len(sample.sample)),
        compile_status = True,
        categorical_sampling = FLAGS.categorical_sampling,
        date_added = datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
      )
      db_sample_obs.OnSample(s)
    except Exception:
      # Malformed samples are skipped rather than aborting the cache write.
      pass
  return
def test_SaveSampleTextObserver(tempdir: pathlib.Path):
  observer = sample_observers.SaveSampleTextObserver(tempdir)
  contents = "Hello, world!"
  sample = model_pb2.Sample(text=contents)
  assert observer.OnSample(sample)
  path = tempdir / f"{crypto.sha256_str(contents)}.txt"
  assert path.is_file()
  assert fs.Read(path) == contents
def _SampleBatch(
    self,
    sampler: samplers.Sampler,
    atomizer: atomizers.AtomizerBase,
    sample_observers: typing.List[sample_observers_lib.SampleObserver],
) -> bool:
  """Run a single iteration of the batched sample inner-loop."""
  samples_in_progress = [
      sampler.tokenized_start_text.copy() for _ in range(sampler.batch_size)
  ]
  done = np.zeros(sampler.batch_size, dtype=bool)
  start_time = labdate.MillisecondsTimestamp()
  wall_time_start = start_time
  self.backend.InitSampleBatch(sampler)

  # The return value of this method. If any of the sample_observers return
  # False, this value is set to False.
  continue_sampling = True

  # Sampling loop. Continues until all samples in the batch are done.
  while not done.all():
    indices = self.backend.SampleNextIndices(sampler, done)
    # Iterate over all samples in batch to determine whether they're done.
    for i in range(sampler.batch_size):
      if done[i]:
        continue
      for index in indices[i]:
        samples_in_progress[i].append(atomizer.decoder[index])
        if sampler.SampleIsComplete(samples_in_progress[i]):
          end_time = labdate.MillisecondsTimestamp()
          done[i] = 1
          sample = model_pb2.Sample(
              text="".join(samples_in_progress[i]),
              sample_start_epoch_ms_utc=start_time,
              sample_time_ms=end_time - start_time,
              wall_time_ms=end_time - wall_time_start,
              num_tokens=len(samples_in_progress[i]),
          )
          # Notify sample observers.
          continue_sampling &= all(
              [not obs.OnSample(sample) for obs in sample_observers])
          # Wall sample time is the difference between the end of the
          # previous sample and the end of the current sample.
          wall_time_start = labdate.MillisecondsTimestamp()
          break

  return continue_sampling
def test_InMemorySampleSaver():
  observer = sample_observers.InMemorySampleSaver()
  sample = model_pb2.Sample(text="Hello, world!")
  assert observer.OnSample(sample)
  assert len(observer.samples) == 1
  assert observer.samples[-1].text == "Hello, world!"
  assert observer.OnSample(sample)
  assert len(observer.samples) == 2
  assert observer.samples[-1].text == "Hello, world!"
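# All of the sampling loops in this file drive observers through the same
# callback: OnSample(sample) returns a bool that the loop folds into its
# "keep sampling" flag, and the tests above simply assert the built-in
# observers return a truthy value. A minimal custom observer sketched
# against that contract; the class is hypothetical. Note the two
# _SampleBatch variants in this file negate the return value while
# _SampleLMBatch and _SampleSeqBatch do not, so this sketch follows the
# latter convention: True means keep sampling, False means stop.
class StopOnSubstringObserver(sample_observers.SampleObserver):
  """Stops sampling once a sample containing `needle` is observed."""

  def __init__(self, needle: str):
    self.needle = needle
    self.match = None  # First matching Sample proto, if any.

  def OnSample(self, sample: model_pb2.Sample) -> bool:
    if self.needle in sample.text:
      self.match = sample
      return False  # Stop sampling.
    return True  # Keep sampling.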
def _SampleBatch(
    self,
    sampler: samplers.Sampler,
    atomizer: atomizers.AtomizerBase,
    sample_observers: typing.List[sample_observers_lib.SampleObserver],
) -> bool:
  """Run a single iteration of the batched sample inner-loop."""
  start_time = labdate.MillisecondsTimestamp()
  # We use the sampler.encoded_start_text attribute as a way to re-seed the
  # model state during rollback, so save the original value here so that we
  # can restore it at the end of the sample batch.
  original_sampler_encoded_start_text = sampler.encoded_start_text.copy()
  self.backend.InitSampleBatch(sampler)
  backtracker = OpenClBacktrackingHelper(atomizer, self._target_features)
  self._logger.OnSampleStart(backtracker)
  sampled_tokens = self.SampleOneWithBacktracking(sampler, atomizer,
                                                  backtracker)
  self._logger.OnSampleEnd(backtracker)
  end_time = labdate.MillisecondsTimestamp()

  # Format text.
  if len(sampled_tokens):
    text = preprocessors.Preprocess(
        "".join(sampled_tokens),
        [
            "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
            "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
            "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
            "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
            "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
            "deeplearning.clgen.preprocessors.cxx:ClangFormat",
        ],
    )
  else:
    text = ""

  # Restore the sampler's start text.
  sampler.encoded_start_text = original_sampler_encoded_start_text

  # Notify sample observers.
  sample = model_pb2.Sample(
      text=text,
      sample_start_epoch_ms_utc=start_time,
      sample_time_ms=end_time - start_time,
      wall_time_ms=end_time - start_time,
      num_tokens=len(sampled_tokens),
  )
  return all([not obs.OnSample(sample) for obs in sample_observers])
def test_SamplesDatabaseObserver_add_one(db: samples_database.SamplesDatabase):
  sample_proto = model_pb2.Sample(
      text='Hello, observer',
      num_tokens=10,
      wall_time_ms=5,
      sample_start_epoch_ms_utc=1000,
  )
  with db.Observer() as obs:
    obs.OnSample(sample_proto)
  with db.Session() as s:
    assert s.query(samples_database.Sample).count() == 1
    assert s.query(samples_database.Sample).one().ToProto() == sample_proto
def run_extractors(sample: Sample) -> Sample:
  # The two branches differ only in the feature vector: features are only
  # extracted for samples that compile.
  if sample.compile_status:
    feature_vector = extractor.ExtractRawFeatures(sample.text)
  else:
    feature_vector = ""
  return Sample(**Sample.FromProto(
      0,
      model_pb2.Sample(
          train_step=sample.train_step,
          text=sample.text,
          sample_indices=sample.sample_indices,
          encoded_sample_indices=sample.encoded_sample_indices,
          original_input=sample.original_input,
          sample_feed=sample.sample_feed,
          encoded_text=sample.encoded_text,
          sample_time_ms=sample.sample_time_ms,
          feature_vector=feature_vector,
          num_tokens=sample.num_tokens,
          compile_status=sample.compile_status,
          categorical_sampling=int(sample.categorical_sampling),
          date_added=sample.date_added.strftime("%m/%d/%Y, %H:%M:%S"),
      )))
def test_KerasBackend_Sample_return_value_matches_cached_sample(
    clgen_cache_dir, abc_keras_model_config):
  """Test that Sample() returns Sample protos."""
  del clgen_cache_dir
  m = models.Model(abc_keras_model_config)
  samples = m.Sample(MockSampler(hash='hash'), 1)
  assert len(samples) == 1
  assert len(list((m.cache.path / 'samples' / 'hash').iterdir())) == 1
  cached_sample_path = (m.cache.path / 'samples' / 'hash' /
                        list((m.cache.path / 'samples' / 'hash').iterdir())[0])
  assert cached_sample_path.is_file()
  cached_sample = pbutil.FromFile(cached_sample_path, model_pb2.Sample())
  assert samples[0].text == cached_sample.text
  assert samples[0].sample_time_ms == cached_sample.sample_time_ms
  assert (samples[0].sample_start_epoch_ms_utc ==
          cached_sample.sample_start_epoch_ms_utc)
def ToProto(dp: ActiveFeed) -> samples_database.Sample:
  return samples_database.Sample(**samples_database.Sample.FromProto(
      0,
      model_pb2.Sample(
          train_step=-1,
          text=dp.sample,
          sample_indices="",
          encoded_sample_indices="",
          original_input="",
          sample_feed=dp.input_feed,
          encoded_text="",
          sample_time_ms=0,
          feature_vector=extractor.ExtractRawFeatures(dp.sample),
          num_tokens=dp.num_tokens,
          compile_status=dp.compile_status,
          categorical_sampling=1,
          date_added=dp.date_added.strftime("%m/%d/%Y, %H:%M:%S"),
      )))
def update_tokenizer(sample: Sample, tokenizer) -> Sample:
  encoded = tokenizer.TokenizeString(sample.text)
  return Sample(**Sample.FromProto(
      0,
      model_pb2.Sample(
          train_step=sample.train_step,
          text=sample.text,
          sample_indices=sample.sample_indices,
          encoded_sample_indices=sample.sample_indices,
          original_input=sample.original_input,
          sample_feed=sample.sample_feed,
          encoded_text=','.join([str(x) for x in encoded]),
          sample_time_ms=sample.sample_time_ms,
          feature_vector=sample.feature_vector,
          num_tokens=len(encoded),
          compile_status=sample.compile_status,
          categorical_sampling=int(sample.categorical_sampling),
          date_added=sample.date_added.strftime("%m/%d/%Y, %H:%M:%S"),
      )))
def test_KerasBackend_Sample_return_value_matches_cached_sample(
    clgen_cache_dir, abc_keras_model_config):
  """Test that Sample() returns Sample protos."""
  del clgen_cache_dir
  m = models.Model(abc_keras_model_config)
  sample_observer = sample_observers.InMemorySampleSaver()
  m.Sample(
      MockSampler(hash="hash"),
      [sample_observers.MaxSampleCountObserver(1), sample_observer],
  )
  samples = sample_observer.samples
  assert len(samples) == 1
  assert len(list((m.cache.path / "samples" / "hash").iterdir())) == 1
  cached_sample_path = (m.cache.path / "samples" / "hash" /
                        list((m.cache.path / "samples" / "hash").iterdir())[0])
  assert cached_sample_path.is_file()
  cached_sample = pbutil.FromFile(cached_sample_path, model_pb2.Sample())
  assert samples[0].text == cached_sample.text
  assert samples[0].sample_time_ms == cached_sample.sample_time_ms
  assert (samples[0].sample_start_epoch_ms_utc ==
          cached_sample.sample_start_epoch_ms_utc)
def _SampleLMBatch(
    self,
    sampler: 'samplers.Sampler',
    tokenizer: tokenizers.TokenizerBase,
    sample_observers: typing.List[sample_observers_lib.SampleObserver],
    epoch: int,
) -> typing.Tuple[bool, int]:
  """
  Run a sampling iteration over BERT models.
  """
  start_time = datetime.datetime.utcnow()
  seq_count = 0
  self.backend.InitSampleBatch(sampler,
                               workload_size=FLAGS.sample_workload_size)
  try:
    org_inputs, input_ids, samples, indices = self.backend.SampleNextIndices(
        sampler)
  except StopIteration:
    return False, seq_count

  if not samples:
    # Return empty means model has not produced something that can be stored.
    # This 'if' accommodates active sampling, which is very selective.
    return True, seq_count

  continue_sampling = True

  if environment.WORLD_RANK == 0:
    assert len(org_inputs) == len(input_ids) == len(samples) == len(indices), (
        "Length mismatch, {}-{}-{}-{}".format(
            len(org_inputs), len(input_ids), len(samples), len(indices)))
    for org, inp, sample, idxs in zip(org_inputs, input_ids, samples, indices):
      src = self.tokenizer.ArrayToCode(sample, with_formatting=True)
      try:
        stdout = opencl.Compile(src)
        compile_flag = True
        features = extractor.ExtractRawFeatures(src)
      except ValueError:
        compile_flag = False
        features = ""

      end_time = datetime.datetime.utcnow()
      sample = model_pb2.Sample(
          train_step=epoch,
          text=src,
          sample_indices=','.join([
              self.tokenizer.decoder[idx].replace('\n', '\\n') for idx in idxs
          ]).replace('\n', '\\n'),
          encoded_sample_indices=','.join([str(idx) for idx in idxs]),
          original_input=self.tokenizer.tokensToString(
              org, with_formatting=False,
              ignore_token=self.tokenizer.padToken),
          sample_feed=self.tokenizer.tokensToString(
              inp, with_formatting=False,
              ignore_token=self.tokenizer.padToken),
          encoded_text=",".join([str(x) for x in sample]),
          sample_start_epoch_ms_utc=int(start_time.strftime("%s%f")),
          sample_time_ms=int(round(
              1000 *
              ((end_time - start_time) / len(samples)).total_seconds())),
          wall_time_ms=int(round(
              1000 *
              ((end_time - start_time) / len(samples)).total_seconds())),
          feature_vector=features,
          num_tokens=(np.where(sample == self.tokenizer.padToken)[0][0]
                      if self.tokenizer.padToken in sample else len(sample)),
          compile_status=compile_flag,
          categorical_sampling=self.backend.samplesWithCategorical(),
          date_added=datetime.datetime.utcnow().strftime(
              "%m/%d/%Y, %H:%M:%S"),
      )
      # Notify sample observers.
      continue_sampling &= all(
          [obs.OnSample(sample) for obs in sample_observers])
      seq_count += 1
    # Rank 0 broadcasts its decision; the other ranks read it back.
    if environment.WORLD_SIZE > 1:
      distrib.write(str(continue_sampling))
  else:
    status = distrib.read()
    if status == "True":
      continue_sampling = True
    elif status == "False":
      continue_sampling = False
    else:
      raise OSError("Broken distributed message: '{}'".format(status))
  return continue_sampling, seq_count
def _Train(
    self,
    corpus,
    test_sampler: typing.Optional[samplers.Sampler],
) -> None:
  """Core training function"""
  if not self.is_trained:
    train_input_fn = self.train.data_generator.generateTfDataset(
        sequence_length=self.config.training.sequence_length,
        num_cpu_threads=os.cpu_count(),
        use_tpu=FLAGS.use_tpu,
        is_training=True)

    l.logger().info(
        "Splitting {} steps into {} equivalent epochs, {} steps each. "
        "Rejected {} redundant step(s)".format(
            self.num_train_steps, self.num_epochs, self.steps_per_epoch,
            self.config.training.num_train_steps - self.num_train_steps))
    try:
      if FLAGS.sample_per_epoch == 0:
        self.train.estimator.train(input_fn=train_input_fn,
                                   max_steps=self.num_train_steps)
      else:
        sampler, observers = self._getTestSampler(
            test_sampler, self.config.training.sequence_length)
        self.InitSampling(sampler, self.config.training.random_seed)
        for ep in range(self.num_epochs):
          self.train.estimator.train(input_fn=train_input_fn,
                                     steps=self.steps_per_epoch)
          for _ in range(FLAGS.sample_per_epoch):
            start_time = datetime.datetime.utcnow()
            self.InitSampleBatch()
            sample_batch, sample_indices = self.SampleNextIndices()
            end_time = datetime.datetime.utcnow()
            for sample, sind in zip(sample_batch, sample_indices):
              try:
                stdout = opencl.Compile(self.tokenizer.ArrayToCode(sample))
                compile_flag = 1
              except ValueError:
                compile_flag = 0

              feature_vector = extractor.ExtractFeatures(
                  self.tokenizer.ArrayToCode(sample))
              sample_proto = model_pb2.Sample(
                  train_step=(ep + 1) * self.steps_per_epoch,
                  sample_feed=sampler.start_text,
                  text=self.tokenizer.tokensToString(
                      sample,
                      ignore_token=self.tokenizer.padToken).replace(
                          "\\n", "\n"),
                  encoded_text=",".join([str(t) for t in sample]),
                  sample_indices='\n'.join([
                      self.tokenizer.tokensToString(mind).replace('\n', '\\n')
                      for mind in sind
                  ]),
                  encoded_sample_indices='\n'.join([
                      ','.join([str(x) for x in mind]) for mind in sind
                  ]),
                  sample_time_ms=int(round(
                      1000 * ((end_time - start_time) /
                              sampler.batch_size).total_seconds())),
                  feature_vector="\n".join([
                      "{}:{}".format(k, v)
                      for (k, v) in feature_vector.items()
                  ]),
                  num_tokens=len(sample),
                  compile_status=compile_flag,
                  categorical_sampling=self.samplesWithCategorical(),
                  date_added=datetime.datetime.utcnow().strftime(
                      "%m/%d/%Y, %H:%M:%S"),
              )
              for obs in observers:
                obs.OnSample(sample_proto)
    except KeyboardInterrupt:
      pass

    if not FLAGS.force_eval:
      self.Validate()

  if FLAGS.force_eval and not self.is_validated:
    self.Validate()
  # self.telemetry.TfRecordEpochs()
  return
def _SampleSeqBatch(
    self,
    sampler: 'samplers.Sampler',
    tokenizer: tokenizers.TokenizerBase,
    sample_observers: typing.List[sample_observers_lib.SampleObserver],
    epoch: int,
) -> typing.Tuple[bool, int]:
  """
  Run a single iteration of the batched sample inner-loop for sequential
  models.
  """
  start_time = datetime.datetime.utcnow()
  self.backend.InitSampleBatch(sampler)
  samples_in_progress = [
      sampler.tokenized_start_text.copy() for _ in range(sampler.batch_size)
  ]
  done = np.zeros(sampler.batch_size, dtype=bool)
  wall_time_start = start_time
  seq_count = 0

  # The return value of this method. If any of the sample_observers return
  # False, this value is set to False.
  continue_sampling = True

  # Sampling loop. Continues until all samples in the batch are done.
  while not done.all():
    indices = self.backend.SampleNextIndices(sampler, done)
    # Iterate over all samples in batch to determine whether they're done.
    for i in range(len(indices)):
      if done[i]:
        continue
      for index in indices[i]:
        samples_in_progress[i].append(tokenizer.decoder[index])
        step_ind = ""
        encoded_step_indices = ""

        if sampler.SampleIsComplete(samples_in_progress[i]):
          end_time = datetime.datetime.utcnow()
          sample_kernel = [x for x in samples_in_progress[i]]
          features = extractor.ExtractRawFeatures(
              ''.join(samples_in_progress[i]))
          done[i] = 1
          try:
            stdout = opencl.Compile(''.join(samples_in_progress[i]))
            compile_flag = True
          except ValueError:
            compile_flag = False

          sample = model_pb2.Sample(
              train_step=epoch,
              text=''.join(samples_in_progress[i]),
              sample_indices="",
              encoded_sample_indices="",
              sample_feed=sampler.start_text,
              encoded_text=",".join(
                  [str(tokenizer.vocab[x]) for x in sample_kernel]),
              sample_start_epoch_ms_utc=int(start_time.strftime("%s%f")),
              sample_time_ms=int(round(
                  1000 * ((end_time - start_time) /
                          sampler.batch_size).total_seconds())),
              wall_time_ms=int(round(
                  1000 * ((end_time - start_time) /
                          sampler.batch_size).total_seconds())),
              feature_vector=features,
              num_tokens=len(samples_in_progress[i]),
              compile_status=compile_flag,
              categorical_sampling=self.backend.samplesWithCategorical(),
              date_added=datetime.datetime.utcnow().strftime(
                  "%m/%d/%Y, %H:%M:%S"),
          )
          # Notify sample observers.
          continue_sampling &= all(
              [obs.OnSample(sample) for obs in sample_observers])
          seq_count += 1
          # Wall sample time is the difference between the end of the
          # previous sample and the end of the current sample.
          wall_time_start = datetime.datetime.utcnow()
          break

  return continue_sampling, seq_count
def Train(self,
          corpus,
          test_sampler : typing.Optional[samplers.Sampler] = None,
          pre_train : bool = False,
          **unused_kwargs
          ) -> None:
  """
  Main training entry point.
  """
  self._ConfigTrainParams(
    torchLMDataGenerator.TrainMaskLMBatchGenerator(
      corpus, self.config.training,
      self.cache.path,
      self.config.training.num_pretrain_steps if pre_train else None,
      pre_train,
      self.feature_encoder,
      self.feature_tokenizer,
      self.feature_sequence_length,
    ), pre_train
  )
  if FLAGS.only_sample:
    return

  self.current_step = self.loadCheckpoint(self.train, pre_train = pre_train)
  if self.pytorch.num_gpus > 0:
    self.torch.cuda.empty_cache()
  if self.current_step >= 0:
    l.logger().info("Loaded checkpoint step {}".format(self.current_step))
  self.current_step = max(0, self.current_step)

  if self.current_step < self.num_train_steps:
    self.train.model.zero_grad()

    ## Set batch size in case of TPU training or distributed training.
    if self.torch_tpu_available:
      total_train_batch_size = self.train_batch_size * self.pytorch.torch_xla.xrt_world_size()
    else:
      total_train_batch_size = (
        self.train_batch_size
        * (self.torch.distributed.get_world_size() if self.pytorch.num_nodes > 1 else 1)
      )

    # Set dataloader in case of TPU training.
    if self.torch_tpu_available:
      loader = self.pytorch.torch_ploader.ParallelLoader(
        self.train.data_generator.dataloader, [self.pytorch.device]
      ).per_device_loader(self.pytorch.device)
    else:
      loader = self.train.data_generator.dataloader

    # Get dataloader iterator and setup hooks.
    batch_iterator = iter(loader)
    if self.is_world_process_zero():
      train_hook = hooks.tensorMonitorHook(
        self.logfile_path if not pre_train else self.pre_logfile_path,
        self.current_step,
        min(self.steps_per_epoch, FLAGS.monitor_frequency)
      )
    if FLAGS.reward_compilation >= 0 and not pre_train:
      correct_sample_obs = sample_observers.SamplesDatabaseObserver(
        self.logfile_path / "correct_samples.db"
      )
    else:
      correct_sample_obs = None

    total_steps = self.config.training.num_pretrain_steps if pre_train else self.config.training.num_train_steps
    l.logger().info(
      "Splitting {} steps into {} equivalent epochs, {} steps each. Rejected {} redundant step(s)".format(
        self.num_train_steps, self.num_epochs,
        self.steps_per_epoch, total_steps - self.num_train_steps
      )
    )
    try:
      self.train.model.train()
      epoch_iter = tqdm.auto.trange(self.num_epochs, desc="Epoch", leave = False) if self.is_world_process_zero() else range(self.num_epochs)
      for epoch in epoch_iter:
        # In distributed mode, calling the set_epoch() method at
        # the beginning of each epoch before creating the DataLoader iterator
        # is necessary to make shuffling work properly across multiple epochs.
        # Otherwise, the same ordering will be always used.
        if self.pytorch.num_nodes > 1:
          loader.sampler.set_epoch(epoch)

        if epoch < self.current_step // self.steps_per_epoch:
          continue  # Stupid bar won't resume.

        batch_iter = tqdm.auto.trange(self.steps_per_epoch, desc="Batch", leave = False) if self.is_world_process_zero() else range(self.steps_per_epoch)
        for step in batch_iter:
          if self.is_world_process_zero():
            start = datetime.datetime.utcnow()
          try:
            inputs = next(batch_iterator)
          except StopIteration:
            # dataloader has different len() than steps_per_epoch.
            # This is the easiest way to infinite-loop dataloaders in pytorch.
            batch_iterator = iter(loader)
            inputs = next(batch_iterator)

          self.current_step += 1
          # Move inputs to torch device.
          inputs = self.to_device(inputs)
          # Run model step on batch
          step_out = self.model_step(self.train.model, inputs, step = epoch * self.steps_per_epoch + step)
          # Collect losses and backpropagate
          total_loss = step_out['total_loss'].mean()
          total_loss.backward()

          self.torch.nn.utils.clip_grad_norm_(self.train.model.parameters(), self.max_grad_norm)
          if self.torch_tpu_available:
            self.pytorch.torch_xla.optimizer_step(self.train.optimizer)
          else:
            self.train.optimizer.step()
          self.train.scheduler.step()

          ## Collect tensors for logging.
          if self.pytorch.num_nodes > 1:
            total_loss = [
              self.torch.zeros(tuple(step_out['total_loss'].shape), dtype = self.torch.float32).to(self.pytorch.device)
              for _ in range(self.torch.distributed.get_world_size())
            ]
            masked_lm_loss = [
              self.torch.zeros(tuple(step_out['masked_lm_loss'].shape), dtype = self.torch.float32).to(self.pytorch.device)
              for _ in range(self.torch.distributed.get_world_size())
            ]
            # next_sentence_loss = [self.torch.zeros(tuple(step_out['next_sentence_loss'].shape), dtype = self.torch.float32).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]
            masked_lm_lengths = [
              self.torch.zeros(tuple(inputs['masked_lm_lengths'].shape), dtype = self.torch.int64).to(self.pytorch.device)
              for _ in range(self.torch.distributed.get_world_size())
            ]
            self.torch.distributed.all_gather(masked_lm_loss, step_out["masked_lm_loss"])
            # self.torch.distributed.all_gather(next_sentence_loss, step_out["next_sentence_loss"])
            self.torch.distributed.all_gather(masked_lm_lengths, inputs['masked_lm_lengths'].to(self.pytorch.device))
            self.torch.distributed.all_gather(total_loss, step_out['total_loss'])
          else:
            total_loss = step_out['total_loss'].unsqueeze(0).cpu()
            masked_lm_loss = step_out['masked_lm_loss'].unsqueeze(0).cpu()
            # next_sentence_loss = step_out['next_sentence_loss'].unsqueeze(0).cpu()
            masked_lm_lengths = inputs['masked_lm_lengths'].cpu()

          if self.is_world_process_zero():
            exec_time_ms = int(round((datetime.datetime.utcnow() - start).total_seconds() * 1000))
            if FLAGS.reward_compilation >= 0 and FLAGS.reward_compilation <= epoch * self.steps_per_epoch + step and not pre_train:
              ## Logging when compiler reward is enabled in training.
              ## This is not compatible with using DDP, and basically
              ## compiler-rewarded training is deprecated and proven to be
              ## wrong and inefficient.
              correct_samples = [
                (x, y) for en, (x, y) in enumerate(zip(inputs['input_ids'].cpu().numpy(), step_out['generated_samples'].cpu().numpy()))
                if step_out['compile_status'][en] == 1
              ]
              for s in correct_samples:
                feature_vector = extractor.ExtractFeatures(self.tokenizer.ArrayToCode(s[1]))
                correct_sample_obs.OnSample(model_pb2.Sample(
                  train_step = self.current_step,
                  sample_feed = self.tokenizer.tokensToString(s[0], ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                  text = self.tokenizer.tokensToString(s[1], ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                  encoded_text = ",".join([str(t) for t in s[1]]),
                  sample_indices = '',
                  encoded_sample_indices = '',
                  sample_time_ms = int(round(exec_time_ms / self.train_batch_size)),
                  feature_vector = "\n".join(["{}:{}".format(k, v) for (k, v) in feature_vector.items()]),
                  num_tokens = len([x for x in s[1] if x != self.tokenizer.padToken]),
                  categorical_sampling = False,
                  compile_status = True,
                  date_added = datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
                ))
            if not pre_train:
              ## Fine-tuning logging.
              train_hook.step(
                masked_lm_loss = sum([ml.mean().item() for ml in masked_lm_loss]) / len(masked_lm_loss),
                # next_sentence_loss = sum([nsl.mean().item() for nsl in next_sentence_loss]) / len(next_sentence_loss),
                total_loss = sum([tl.mean().item() for tl in total_loss]) / len(total_loss),
                learning_rate = self.train.scheduler.get_last_lr()[0],
                num_correct_samples = (correct_sample_obs.sample_id if correct_sample_obs is not None else None),
                batch_avg_hole_len = sum([
                  sum([int(l) for l in b if l != -1]) / len([int(l) for l in b if l != -1])
                  for b in masked_lm_lengths
                ]) / len(masked_lm_lengths),
                batch_execution_time_ms = exec_time_ms,
                time_per_sample_ms = exec_time_ms / self.train_batch_size,
              )
            else:
              ## Pre-training logging.
              train_hook.step(
                masked_lm_loss = sum([ml.mean().item() for ml in masked_lm_loss]) / len(masked_lm_loss),
                # next_sentence_loss = sum([nsl.mean().item() for nsl in next_sentence_loss]) / len(next_sentence_loss),
                total_loss = sum([tl.mean().item() for tl in total_loss]) / len(total_loss),
                learning_rate = self.train.scheduler.get_last_lr()[0],
                batch_avg_hole_len = sum([
                  sum([int(l) for l in b if l != -1]) / len([int(l) for l in b if l != -1])
                  for b in masked_lm_lengths
                ]) / len(masked_lm_lengths),
                batch_execution_time_ms = exec_time_ms,
                time_per_sample_ms = exec_time_ms / self.train_batch_size,
              )
          self.train.model.zero_grad()
          if self.current_step == 0:
            l.logger().info("Starting Loss: {}".format(sum([tl.mean().item() for tl in total_loss]) / len(total_loss)))

        # End of Epoch
        self.saveCheckpoint(self.train, pre_train)
        if self.is_world_process_zero():
          set_mail = "Epoch {} Loss: {}\n".format(self.current_step // self.steps_per_epoch, train_hook.epoch_loss)
          l.logger().info("Epoch {} Loss: {}".format(self.current_step // self.steps_per_epoch, train_hook.epoch_loss))

        if self.pytorch.num_nodes > 1:
          loader.sampler.set_epoch(epoch)

        if FLAGS.validate_per_epoch and self.train.data_generator.config.validation_split > 0:
          val_ml_loss = self.Validate(per_epoch = True, pre_train = pre_train)
          if self.is_world_process_zero():
            train_hook.end_epoch(
              val_masked_lm_loss = val_ml_loss,
              # val_next_sentence_loss = val_nsp_loss,
              val_total_loss = val_ml_loss  # + val_nsp_loss,
            )
            set_mail += "Validation Loss: {}\n".format(val_ml_loss)
        elif self.is_world_process_zero():
          train_hook.end_epoch()

        if FLAGS.notify_me:
          client.getClient().send_message("clgen:torch_bert", set_mail)

        if self.torch_tpu_available:
          self.pytorch.torch_xla.master_print(self.pytorch.torch_xla_met.metrics_report())

        if FLAGS.sample_per_epoch > 0:
          sampler, observers = self._getTestSampler(test_sampler, self.config.training.sequence_length)
          self.InitSampling(sampler, self.config.training.random_seed)
          for _ in range(FLAGS.sample_per_epoch):
            start_time = datetime.datetime.utcnow()
            self.InitSampleBatch(sampler)
            org_inputs, input_ids, samples, indices = self.SampleNextIndices()
            end_time = datetime.datetime.utcnow()
            for org, inp, sample, idxs in zip(org_inputs, input_ids, samples, indices):
              try:
                stdout = opencl.Compile(self.tokenizer.ArrayToCode(sample))
                compile_flag = 1
              except ValueError:
                compile_flag = 0

              feature_vector = extractor.ExtractFeatures(self.tokenizer.ArrayToCode(sample))
              sample_proto = model_pb2.Sample(
                train_step = self.current_step,
                sample_feed = sampler.start_text,
                original_input = self.tokenizer.tokensToString(org, with_formatting = True, ignore_token = self.tokenizer.padToken),
                text = self.tokenizer.tokensToString(sample, with_formatting = True, ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                encoded_text = ",".join([str(t) for t in sample]),
                sample_indices = ','.join([self.tokenizer.decoder[idx].replace('\n', '\\n') for idx in idxs]).replace('\n', '\\n'),
                encoded_sample_indices = ','.join([str(idx) for idx in idxs]),
                sample_time_ms = int(round(1000 * ((end_time - start_time) / sampler.batch_size).total_seconds())),
                feature_vector = "\n".join(["{}:{}".format(k, v) for (k, v) in feature_vector.items()]),
                num_tokens = len(sample),
                compile_status = compile_flag,
                categorical_sampling = self.samplesWithCategorical(),
                date_added = datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
              )
              for obs in observers:
                obs.OnSample(sample_proto)
    except KeyboardInterrupt:
      pass

    if not FLAGS.force_eval:
      _ = self.Validate(pre_train = pre_train)

  if FLAGS.force_eval and not self.is_validated:
    _ = self.Validate(pre_train = pre_train)
  return
def Sample(
    self,
    sampler: samplers.Sampler,
    min_num_samples: int,
    seed: int = None) -> typing.Iterable[model_pb2.Sample]:
  """Sample a model.

  If the model is not already trained, calling Sample() first trains the
  model. Thus a call to Sample() is equivalent to calling Train() then
  Sample().

  Args:
    sampler: The sampler to sample using.
    min_num_samples: The minimum number of samples to return. Note that the
      true number of samples returned may be higher than this value, as
      sampling occurs in batches. The model will continue producing samples
      until the lowest multiple of the sampler batch size property that is
      larger than this value. E.g. if min_num_samples is 7 and the Sampler
      batch size is 10, 10 samples will be returned.
    seed: A numeric value to seed the RNG with. If not present, the RNG is
      seeded randomly.

  Returns:
    An iterator over samples.

  Raises:
    UnableToAcquireLockError: If the model is locked (i.e. there is another
      process currently modifying the model).
    InvalidStartText: If the sampler start text cannot be encoded.
    InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
      encoded.
  """
  sample_count = 1
  atomizer = self.atomizer
  sampler.Specialize(atomizer)
  batch_size = self.backend.InitSampling(sampler, seed)
  sample_start_time = labdate.MillisecondsTimestamp()

  # Per-sample batch outer loop. Continues until we have as many samples
  # as we want.
  while True:
    samples_in_progress = [
        sampler.tokenized_start_text.copy() for _ in range(batch_size)
    ]
    done = np.zeros(batch_size, dtype=bool)
    start_time = labdate.MillisecondsTimestamp()
    wall_time_start = start_time
    self.backend.InitSampleBatch(sampler, batch_size)

    # Sampling loop. Continues until all samples in the batch are done.
    while True:
      indices = self.backend.SampleNextIndices(sampler, batch_size)
      # Iterate over all samples in batch to determine whether they're done.
      for i in range(batch_size):
        if done[i]:
          continue
        token = atomizer.decoder[indices[i]]
        samples_in_progress[i].append(token)
        if sampler.SampleIsComplete(samples_in_progress[i]):
          end_time = labdate.MillisecondsTimestamp()
          done[i] = 1
          sample = model_pb2.Sample(
              text=''.join(samples_in_progress[i]),
              sample_start_epoch_ms_utc=start_time,
              sample_time_ms=end_time - start_time,
              wall_time_ms=end_time - wall_time_start,
              num_tokens=len(samples_in_progress[i]))
          sample_count += 1
          yield sample
          wall_time_start = labdate.MillisecondsTimestamp()
      # Complete the batch.
      if done.all():
        break

    # Complete sampling. Note that sample_count starts at 1.
    if sample_count > min_num_samples:
      now = labdate.MillisecondsTimestamp()
      logging.info(
          'Produced %s samples at a rate of %s ms / sample.',
          humanize.intcomma(sample_count - 1),
          humanize.intcomma(
              int((now - sample_start_time) / max(sample_count - 1, 1))))
      break
def SampleFast(self,
               sampler: samplers.Sampler,
               min_num_samples: int,
               seed: int = None) -> typing.List[model_pb2.Sample]:
  """Sample a model.

  Same as Sample(), but without printing or caching samples. Because samples
  are not cached, infinite sampling loops are not supported, since we must
  return the sample protos at some point.

  Args:
    sampler: The sampler to sample using.
    min_num_samples: The minimum number of samples to return. Note that the
      true number of samples returned may be higher than this value, as
      sampling occurs in batches. The model will continue producing samples
      until the lowest multiple of the sampler batch size property that is
      larger than this value. E.g. if min_num_samples is 7 and the Sampler
      batch size is 10, 10 samples will be returned.
    seed: A numeric value to seed the RNG with. If not present, the RNG is
      seeded randomly.

  Returns:
    A list of Sample protos.

  Raises:
    UnableToAcquireLockError: If the model is locked (i.e. there is another
      process currently modifying the model).
    InvalidStartText: If the sampler start text cannot be encoded.
    InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
      encoded.
  """
  self.Train()

  sample_count = 1
  with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                             self.cache.path / 'logs'):
    logging.info("Sampling: '%s'", sampler.start_text)
    sample_start_time = labdate.MillisecondsTimestamp()
    atomizer = self.corpus.atomizer
    sampler.Specialize(atomizer)
    batch_size = self.backend.InitSampling(sampler, seed)
    samples = []

    # Per-sample batch outer loop. Continues until we have as many samples
    # as we want.
    while True:
      samples_in_progress = [
          sampler.tokenized_start_text.copy() for _ in range(batch_size)
      ]
      done = np.zeros(batch_size, dtype=bool)
      start_time = labdate.MillisecondsTimestamp()
      wall_time_start = start_time
      self.backend.InitSampleBatch(sampler, batch_size)

      # Sampling loop. Continues until all samples in the batch are done.
      while True:
        indices = self.backend.SampleNextIndices(sampler, batch_size)
        # Iterate over all samples in batch to determine whether they're
        # done.
        for i in range(batch_size):
          if done[i]:
            continue
          token = atomizer.decoder[indices[i]]
          samples_in_progress[i].append(token)
          if sampler.SampleIsComplete(samples_in_progress[i]):
            end_time = labdate.MillisecondsTimestamp()
            done[i] = 1
            sample = model_pb2.Sample(
                text=''.join(samples_in_progress[i]),
                sample_start_epoch_ms_utc=start_time,
                sample_time_ms=end_time - start_time,
                wall_time_ms=end_time - wall_time_start,
                num_tokens=len(samples_in_progress[i]))
            sample_count += 1
            samples.append(sample)
            wall_time_start = labdate.MillisecondsTimestamp()
        # Complete the batch.
        if done.all():
          break

      # Complete sampling. Note that sample_count starts at 1.
      if sample_count > min_num_samples:
        now = labdate.MillisecondsTimestamp()
        logging.info(
            'Produced %s samples at a rate of %s ms / sample.',
            humanize.intcomma(len(samples)),
            humanize.intcomma(int((now - sample_start_time) / len(samples))))
        break

  return samples
def Sample(self,
           sampler: samplers.Sampler,
           min_num_samples: int,
           seed: int = None) -> typing.List[model_pb2.Sample]:
  """Sample a model.

  If the model is not already trained, calling Sample() first trains the
  model. Thus a call to Sample() is equivalent to calling Train() then
  Sample().

  Args:
    sampler: The sampler to sample using.
    min_num_samples: The minimum number of samples to return. Note that the
      true number of samples returned may be higher than this value, as
      sampling occurs in batches. The model will continue producing samples
      until the lowest multiple of the sampler batch size property that is
      larger than this value. E.g. if min_num_samples is 7 and the Sampler
      batch size is 10, 10 samples will be returned.
    seed: A numeric value to seed the RNG with. If not present, the RNG is
      seeded randomly.

  Returns:
    A list of Sample protos.

  Raises:
    UnableToAcquireLockError: If the model is locked (i.e. there is another
      process currently modifying the model).
    InvalidStartText: If the sampler start text cannot be encoded.
    InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
      encoded.
  """
  self.Train()

  sample_count = 1
  self.SamplerCache(sampler).mkdir(exist_ok=True)
  with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                             self.cache.path / 'logs'):
    logging.info("Sampling: '%s'", sampler.start_text)
    if min_num_samples < 0:
      logging.warning(
          'Entering an infinite sample loop, this process will never end!')
    sample_start_time = labdate.MillisecondsTimestamp()
    atomizer = self.corpus.atomizer
    sampler.Specialize(atomizer)
    batch_size = self.backend.InitSampling(sampler, seed)
    samples = []
    sample_dir = self.SamplerCache(sampler)

    # Per-sample batch outer loop. Continues until we have as many samples
    # as we want.
    while True:
      samples_in_progress = [
          sampler.tokenized_start_text.copy() for _ in range(batch_size)
      ]
      done = np.zeros(batch_size, dtype=bool)
      start_time = labdate.MillisecondsTimestamp()
      wall_time_start = start_time
      self.backend.InitSampleBatch(sampler, batch_size)

      # Sampling loop. Continues until all samples in the batch are done.
      while True:
        indices = self.backend.SampleNextIndices(sampler, batch_size)
        # Iterate over all samples in batch to determine whether they're
        # done.
        for i in range(batch_size):
          if done[i]:
            continue
          token = atomizer.decoder[indices[i]]
          samples_in_progress[i].append(token)
          if sampler.SampleIsComplete(samples_in_progress[i]):
            end_time = labdate.MillisecondsTimestamp()
            done[i] = 1
            sample = model_pb2.Sample(
                text=''.join(samples_in_progress[i]),
                sample_start_epoch_ms_utc=start_time,
                sample_time_ms=end_time - start_time,
                wall_time_ms=end_time - wall_time_start,
                num_tokens=len(samples_in_progress[i]))
            print(f'=== BEGIN CLGEN SAMPLE {sample_count} '
                  f'===\n\n{sample.text}\n')
            sample_count += 1
            sample_id = crypto.sha256_str(sample.text)
            sample_path = sample_dir / f'{sample_id}.pbtxt'
            pbutil.ToFile(sample, sample_path)
            if min_num_samples > 0:
              samples.append(sample)
            wall_time_start = labdate.MillisecondsTimestamp()
        # Complete the batch.
        if done.all():
          break

      # Complete sampling. Note that sample_count starts at 1.
      if sample_count > min_num_samples:
        now = labdate.MillisecondsTimestamp()
        logging.info(
            'Produced %s samples at a rate of %s ms / sample.',
            humanize.intcomma(len(samples)),
            humanize.intcomma(
                int((now - sample_start_time) / max(len(samples), 1))))
        break

  return samples
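# All three Sample() variants above document the same rounding rule for
# min_num_samples: sampling runs in whole batches, so the number of samples
# produced is min_num_samples rounded up to a multiple of the batch size.
# A standalone check of that arithmetic (note the loops above stop as soon
# as the running count reaches min_num_samples, so an exact multiple is not
# rounded up further, despite the docstrings' "larger than" wording):
import math

def expected_sample_count(min_num_samples: int, batch_size: int) -> int:
  """Smallest multiple of batch_size that is >= min_num_samples."""
  return math.ceil(min_num_samples / batch_size) * batch_size

assert expected_sample_count(7, 10) == 10   # The docstrings' own example.
assert expected_sample_count(10, 10) == 10  # Exact multiple: no extra batch.
assert expected_sample_count(11, 10) == 20  # One over forces a whole batch.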