def test_TeeLogsToFile_contextmanager(capsys):
  """Test that contextmanager temporarily also logs to file."""
  with tempfile.TemporaryDirectory() as d:
    FLAGS.logtostderr = True
    logging.info('This is not going in a file')
    with logutil.TeeLogsToFile('test', d):
      logging.info('Hello, file!')
    logging.info('This is not going in a file')
    # Test file contents.
    with open(pathlib.Path(d) / 'test.INFO') as f:
      lines = f.read().rstrip().split('\n')
    assert len(lines) == 1
    assert lines[0].endswith('Hello, file!')
    out, err = capsys.readouterr()
    assert not out
    # Test stderr contents.
    lines = err.rstrip().split('\n')
    assert len(lines) == 3
    assert lines[0].endswith('This is not going in a file')
    assert lines[1].endswith('Hello, file!')
    assert lines[2].endswith('This is not going in a file')

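# Illustrative sketch (not part of the test suite): how TeeLogsToFile can be
# used outside of tests, based only on the behaviour exercised above. The
# 'preprocess' label and the log directory are hypothetical; the context
# manager is assumed to create '<label>.INFO' in that directory while still
# logging to stderr, as the assertions above expect.
def _example_tee_usage(log_dir: pathlib.Path) -> None:
  """Log a message both to stderr and to <log_dir>/preprocess.INFO."""
  log_dir.mkdir(parents=True, exist_ok=True)
  with logutil.TeeLogsToFile('preprocess', log_dir):
    logging.info('This message is written to stderr and to the log file.')
  logging.info('This message goes to stderr only.')
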
def Sample(self,
           sampler: samplers.Sampler,
           min_num_samples: int,
           seed: int = None) -> typing.List[model_pb2.Sample]:
  """Sample a model.

  If the model is not already trained, calling Sample() first trains the
  model. Thus a call to Sample() is equivalent to calling Train() then
  Sample().

  Args:
    sampler: The sampler to sample using.
    min_num_samples: The minimum number of samples to return. Note that the
      true number of samples returned may be higher than this value, as
      sampling occurs in batches. The model will continue producing samples
      until it reaches the lowest multiple of the sampler batch size that is
      greater than this value. E.g. if min_num_samples is 7 and the sampler
      batch size is 10, 10 samples will be returned.
    seed: A numeric value to seed the RNG with. If not present, the RNG is
      seeded randomly.

  Returns:
    A list of Sample protos.

  Raises:
    UnableToAcquireLockError: If the model is locked (i.e. there is another
      process currently modifying the model).
    InvalidStartText: If the sampler start text cannot be encoded.
    InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
      encoded.
  """
  self.Train()
  # Counter used for log messages and sample file names.
  sample_count = 1
  self.SamplerCache(sampler).mkdir(exist_ok=True)
  with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                             self.cache.path / 'logs'):
    logging.info("Sampling: '%s'", sampler.start_text)
    if min_num_samples < 0:
      logging.warning(
          'Entering an infinite sample loop, this process will never end!')
    sample_start_time = labdate.MillisecondsTimestamp()
    atomizer = self.corpus.atomizer
    sampler.Specialize(atomizer)
    batch_size = self.backend.InitSampling(sampler, seed)
    logging.info('Sampling batch size: %d', batch_size)
    samples = []
    sample_dir = self.SamplerCache(sampler)
    # Per-sample batch outer loop. Continues until we have as many samples
    # as we want.
    while True:
      samples_in_progress = [
          sampler.tokenized_start_text.copy() for _ in range(batch_size)
      ]
      done = np.zeros(batch_size, dtype=bool)
      start_time = labdate.MillisecondsTimestamp()
      wall_time_start = start_time
      self.backend.InitSampleBatch(sampler, batch_size)
      # Sampling loop. Continues until all samples in the batch are done.
      while True:
        indices = self.backend.SampleNextIndices(sampler, batch_size)
        # Iterate over all samples in batch to determine whether they're
        # done.
        for i in range(batch_size):
          if done[i]:
            continue
          token = atomizer.decoder[indices[i]]
          samples_in_progress[i].append(token)
          if sampler.SampleIsComplete(samples_in_progress[i]):
            end_time = labdate.MillisecondsTimestamp()
            done[i] = 1
            sample = model_pb2.Sample(
                text=''.join(samples_in_progress[i]),
                sample_start_epoch_ms_utc=start_time,
                sample_time_ms=end_time - start_time,
                wall_time_ms=end_time - wall_time_start,
                num_tokens=len(samples_in_progress[i]))
            print(f'=== BEGIN CLGEN SAMPLE {sample_count} '
                  f'===\n\n{sample.text}\n')
            # Write the raw sample text to a .mdl file (previously .txt).
            # Note that plain text is written here rather than the
            # serialized Sample proto (pbutil.ToFile).
            sample_path = sample_dir / f'Sample{sample_count}.mdl'
            with open(sample_path, 'w') as samplefile:
              samplefile.write(sample.text)
            sample_count += 1
            if min_num_samples > 0:
              samples.append(sample)
            wall_time_start = labdate.MillisecondsTimestamp()
        # Complete the batch.
        if done.all():
          break
      # Complete sampling. Note that sample_count starts at 1.
      if sample_count > min_num_samples:
        now = labdate.MillisecondsTimestamp()
        logging.info(
            'Produced %s samples at a rate of %s ms / sample.',
            humanize.intcomma(len(samples)),
            humanize.intcomma(
                int((now - sample_start_time) / max(len(samples), 1))))
        break

  return samples

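# Illustrative sketch (assumptions noted): calling Sample() with a
# min_num_samples that is not a multiple of the sampler batch size. Per the
# docstring above, with a batch size of 10 and min_num_samples=7, 10 samples
# are returned. The construction of `model` and `sampler` is hypothetical;
# only the Sample() signature and Sample proto fields above are relied on.
def _example_sample(model, sampler: samplers.Sampler) -> None:
  samples = model.Sample(sampler, min_num_samples=7, seed=42)
  # Sampling proceeds in whole batches, so len(samples) is rounded up to the
  # next batch boundary (e.g. 10 when the batch size is 10).
  for sample in samples:
    logging.info('Sample with %d tokens took %d ms.',
                 sample.num_tokens, sample.sample_time_ms)
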
def SampleFast(self,
               sampler: samplers.Sampler,
               min_num_samples: int,
               seed: int = None) -> typing.List[model_pb2.Sample]:
  """Sample a model.

  Same as Sample(), but without printing or caching samples. Because samples
  are not cached, infinite sampling loops are not supported, since we must
  return the sample protos at some point.

  Args:
    sampler: The sampler to sample using.
    min_num_samples: The minimum number of samples to return. Note that the
      true number of samples returned may be higher than this value, as
      sampling occurs in batches. The model will continue producing samples
      until it reaches the lowest multiple of the sampler batch size that is
      greater than this value. E.g. if min_num_samples is 7 and the sampler
      batch size is 10, 10 samples will be returned.
    seed: A numeric value to seed the RNG with. If not present, the RNG is
      seeded randomly.

  Returns:
    A list of Sample protos.

  Raises:
    UnableToAcquireLockError: If the model is locked (i.e. there is another
      process currently modifying the model).
    InvalidStartText: If the sampler start text cannot be encoded.
    InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
      encoded.
  """
  self.Train()
  sample_count = 1
  with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                             self.cache.path / 'logs'):
    logging.info("Sampling: '%s'", sampler.start_text)
    sample_start_time = labdate.MillisecondsTimestamp()
    atomizer = self.corpus.atomizer
    sampler.Specialize(atomizer)
    batch_size = self.backend.InitSampling(sampler, seed)
    samples = []
    # Per-sample batch outer loop. Continues until we have as many samples
    # as we want.
    while True:
      samples_in_progress = [
          sampler.tokenized_start_text.copy() for _ in range(batch_size)
      ]
      done = np.zeros(batch_size, dtype=bool)
      start_time = labdate.MillisecondsTimestamp()
      wall_time_start = start_time
      self.backend.InitSampleBatch(sampler, batch_size)
      # Sampling loop. Continues until all samples in the batch are done.
      while True:
        indices = self.backend.SampleNextIndices(sampler, batch_size)
        # Iterate over all samples in batch to determine whether they're
        # done.
        for i in range(batch_size):
          if done[i]:
            continue
          token = atomizer.decoder[indices[i]]
          samples_in_progress[i].append(token)
          if sampler.SampleIsComplete(samples_in_progress[i]):
            end_time = labdate.MillisecondsTimestamp()
            done[i] = 1
            sample = model_pb2.Sample(
                text=''.join(samples_in_progress[i]),
                sample_start_epoch_ms_utc=start_time,
                sample_time_ms=end_time - start_time,
                wall_time_ms=end_time - wall_time_start,
                num_tokens=len(samples_in_progress[i]))
            sample_count += 1
            samples.append(sample)
            wall_time_start = labdate.MillisecondsTimestamp()
        # Complete the batch.
        if done.all():
          break
      # Complete sampling. Note that sample_count starts at 1.
      if sample_count > min_num_samples:
        now = labdate.MillisecondsTimestamp()
        logging.info(
            'Produced %s samples at a rate of %s ms / sample.',
            humanize.intcomma(len(samples)),
            humanize.intcomma(
                int((now - sample_start_time) / len(samples))))
        break

  return samples

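# Illustrative sketch: SampleFast() for quick measurements. Unlike Sample(),
# nothing is printed or written to the sampler cache, so it suits
# benchmarking. The `model` and `sampler` objects are assumed to be
# constructed elsewhere; only fields set on the Sample proto above are used.
def _example_sample_fast(model, sampler: samplers.Sampler) -> None:
  samples = model.SampleFast(sampler, min_num_samples=100)
  total_ms = sum(s.sample_time_ms for s in samples)
  logging.info('Generated %d samples, mean %d ms / sample.',
               len(samples), total_ms // max(len(samples), 1))
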
def Train(self, corpus) -> 'keras.models.Sequential':
  """Locked training.

  If there are cached epoch checkpoints, the one closest to the target number
  of epochs will be loaded, and the model will be trained for only the
  remaining number of epochs, if any. This means that calling this function
  twice will only actually train the model the first time, and all subsequent
  calls will be no-ops.

  This method must only be called when the model is locked.

  Args:
    corpus: The corpus to train on.

  Returns:
    The trained Keras model.
  """
  model = builders.BuildKerasModel(self.config, self.atomizer.vocab_size)
  with open(self.cache.keypath('model.yaml'), 'w') as f:
    f.write(model.to_yaml())
  model.compile(loss='categorical_crossentropy',
                optimizer=builders.BuildOptimizer(self.config))

  # Print a model summary.
  buf = io.StringIO()
  model.summary(print_fn=lambda x: buf.write(x + '\n'))
  logging.info('Model summary:\n%s', buf.getvalue())

  # TODO(cec): Add an atomizer.CreateVocabularyFile() method, with frequency
  # counts for a given corpus.
  def Escape(token: str) -> str:
    """Make a token visible and printable."""
    if token == '\t':
      return '\\t'
    elif token == '\n':
      return '\\n'
    elif not token.strip():
      return f"'{token}'"
    else:
      return token

  if not (self.cache.path / 'embeddings' / 'metadata.tsv').is_file():
    with open(self.cache.path / 'embeddings' / 'metadata.tsv', 'w') as f:
      for _, token in sorted(self.atomizer.decoder.items(),
                             key=lambda x: x[0]):
        f.write(Escape(token) + '\n')

  target_num_epochs = self.config.training.num_epochs
  starting_epoch = 0
  epoch_checkpoints = self.epoch_checkpoints
  if len(epoch_checkpoints) >= target_num_epochs:
    # We have already trained a model to at least this number of epochs, so
    # simply load the weights from that epoch and call it a day.
    logging.info('Loading weights from %s',
                 epoch_checkpoints[target_num_epochs - 1])
    model.load_weights(epoch_checkpoints[target_num_epochs - 1])
    return model

  # Now entering the point at which training is inevitable.
  with logutil.TeeLogsToFile('train', self.cache.path / 'logs'):
    # Deferred importing of Keras so that we don't have to activate the
    # TensorFlow backend every time we import this module.
    import keras
    if epoch_checkpoints:
      # We have already trained a model at least part of the way to our
      # target number of epochs, so load the most recent one.
      starting_epoch = len(epoch_checkpoints)
      logging.info('Resuming training from epoch %d.', starting_epoch)
      model.load_weights(epoch_checkpoints[-1])

    callbacks = [
        keras.callbacks.ModelCheckpoint(
            str(self.cache.path / 'checkpoints' / '{epoch:03d}.hdf5'),
            verbose=1,
            mode='min',
            save_best_only=False),
        keras.callbacks.TensorBoard(
            str(self.cache.path / 'embeddings'),
            write_graph=True,
            embeddings_freq=1,
            embeddings_metadata={
                'embedding_1':
                    str(self.cache.path / 'embeddings' / 'metadata.tsv'),
            }),
        telemetry.TrainingLogger(
            self.cache.path / 'logs').KerasCallback(keras),
    ]

    generator = data_generators.AutoGenerator(corpus, self.config.training)
    steps_per_epoch = (corpus.encoded.token_count - 1) // (
        self.config.training.batch_size *
        self.config.training.sequence_length)
    logging.info(
        'Step counts: %s per epoch, %s left to do, %s total',
        humanize.intcomma(steps_per_epoch),
        humanize.intcomma(
            (target_num_epochs - starting_epoch) * steps_per_epoch),
        humanize.intcomma(target_num_epochs * steps_per_epoch))
    model.fit_generator(generator,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        initial_epoch=starting_epoch,
                        epochs=target_num_epochs)
  return model

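# Illustrative sketch of the steps_per_epoch arithmetic used in Train(). The
# corpus size and training config values below are hypothetical examples; the
# formula itself is taken from the code above.
def _example_steps_per_epoch() -> int:
  token_count = 1_000_000   # hypothetical corpus.encoded.token_count
  batch_size = 64           # hypothetical config.training.batch_size
  sequence_length = 50      # hypothetical config.training.sequence_length
  # Each training step consumes batch_size * sequence_length tokens; the
  # `- 1` leaves room for the next-token prediction targets, which are
  # offset by one position from the inputs.
  steps_per_epoch = (token_count - 1) // (batch_size * sequence_length)
  assert steps_per_epoch == 312
  return steps_per_epoch
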