def Create(self) -> None:
  """Build the pre-processed and the encoded corpus databases.

  Each database is created while holding a 'LOCK' lockfile located in the
  directory that contains that database. The atomizer is fetched and its
  vocabulary logged before the encoded database is created.

  Raises:
    EmptyCorpusException: If the pre-processed corpus reports a size of
      zero once it has been created.
  """
  self._created = True
  logging.info('Content ID: %s', self.content_id)

  # Stage one: the pre-processed database.
  preprocessed_lock = lockfile.LockFile(
      self.preprocessed.database_path.parent / 'LOCK')
  with preprocessed_lock.acquire(replace_stale=True, block=True):
    self.preprocessed.Create(self.config)

  # Nothing can be encoded from an empty pre-processed corpus.
  if not self.preprocessed.size:
    raise errors.EmptyCorpusException(
        "Pre-processed corpus contains no files: "
        f"'{self.preprocessed.database_path}'")

  # Stage two: the encoded database.
  encoded_lock = lockfile.LockFile(
      self.encoded.database_path.parent / 'LOCK')
  with encoded_lock.acquire(replace_stale=True, block=True):
    # Time how long it takes to obtain the atomizer.
    timer_start = time.time()
    atomizer = self.atomizer
    atomizer_name = type(atomizer).__name__
    vocab_size_str = humanize.intcomma(atomizer.vocab_size)
    elapsed_ms_str = humanize.intcomma(
        int((time.time() - timer_start) * 1000))
    logging.info('%s: %s tokens in %s ms', atomizer_name, vocab_size_str,
                 elapsed_ms_str)
    for token, index in atomizer.vocab.items():
      logging.info('atomizer.vocab %s : %s', token, index)
    self.encoded.Create(self.preprocessed, atomizer,
                        self.config.contentfile_separator)
def is_locked(self) -> bool:
  """Report whether either corpus database is currently locked.

  Returns:
    True if the 'LOCK' file beside the pre-processed database or the one
    beside the encoded database is locked, else False.
  """
  preprocessed_lock = lockfile.LockFile(
      self.preprocessed.database_path.parent / 'LOCK')
  if not preprocessed_lock.islocked:
    # Only consult the encoded database when the first lock is free.
    encoded_lock = lockfile.LockFile(
        self.encoded.database_path.parent / 'LOCK')
    if not encoded_lock.islocked:
      return False
  return True
def test_LockFile_release_fail(dummy_lockfile_path): """Test that releasing a lock owned by a different host fails.""" lock = lockfile.LockFile(dummy_lockfile_path) assert lock.islocked with pytest.raises(lockfile.UnableToReleaseLockError) as e_ctx: lock.release() assert str(e_ctx.value) == f"""\
def test_LockFile_release_deletes_file():
  """Check that release() removes the lockfile that acquire() created."""
  with tempfile.TemporaryDirectory() as tmpdir:
    lock = lockfile.LockFile(pathlib.Path(tmpdir) / 'LOCK')
    lock.acquire()
    lock.release()
    lockfile_remains = lock.path.is_file()
    assert not lockfile_remains
def test_LockFile_owned_by_self():
  """Check that owned_by_self is False before acquire() and True after."""
  with tempfile.TemporaryDirectory() as tmpdir:
    lock = lockfile.LockFile(pathlib.Path(tmpdir) / 'LOCK')
    # A lock that has never been acquired is not owned by this process.
    assert not lock.owned_by_self
    lock.acquire()
    assert lock.owned_by_self
def test_LockFile_islocked():
  """Check that islocked is False before acquire() and True after."""
  with tempfile.TemporaryDirectory() as tmpdir:
    lock = lockfile.LockFile(pathlib.Path(tmpdir) / 'LOCK')
    # A lock that has never been acquired reports itself as unlocked.
    assert not lock.islocked
    lock.acquire()
    assert lock.islocked
def test_LockFile_file_exists():
  """Check that acquire() creates the lockfile on disk."""
  with tempfile.TemporaryDirectory() as tmpdir:
    lock = lockfile.LockFile(pathlib.Path(tmpdir) / 'LOCK')
    # Constructing the LockFile must not create the file by itself.
    assert not lock.path.is_file()
    lock.acquire()
    assert lock.path.is_file()
def IsEligible(instance: clgen.Instance) -> bool:
  """Decide whether an instance may be trained or sampled right now.

  The corpus lock, the model training lock, and the sampler cache lock are
  checked in that order; the first one found locked is logged.

  Args:
    instance: The CLgen instance to check.

  Returns:
    True if none of the three locks is held, else False.
  """
  if instance.model.corpus.is_locked:
    logging.info('Corpus is locked')
    eligible = False
  elif instance.model.training_lock.islocked:
    logging.info('Model is locked')
    eligible = False
  elif lockfile.LockFile(
      instance.model.SamplerCache(instance.sampler) / 'LOCK').islocked:
    logging.info('Sampler is locked')
    eligible = False
  else:
    eligible = True
  return eligible
def test_LockFile_force_replace_stale():
  """Check that acquire(force=True) takes over a lock held by another PID."""
  MAX_PROCESSES = 4194303  # OS-dependent. This value is for Linux
  foreign_pid = MAX_PROCESSES + 1
  with tempfile.TemporaryDirectory() as tmpdir:
    lock = lockfile.LockFile(pathlib.Path(tmpdir) / 'LOCK')

    # Acquire the lock under a PID that is not this process's.
    lock.acquire(pid=foreign_pid)
    assert lock.islocked
    assert not lock.owned_by_self

    # A plain acquire must be refused while the foreign lock stands.
    with pytest.raises(lockfile.UnableToAcquireLockError):
      lock.acquire()

    # Forcing the acquire hands ownership to this process.
    lock.acquire(force=True)
    assert lock.islocked
    assert lock.owned_by_self

    lock.release()
    assert not fs.exists(lock.path)
def SampleModel(instance: clgen.Instance) -> None:
  """Take --output_corpus_size samples from model.

  Samples are written into the sampler cache directory as
  '<sha256 of sample text>.pbtxt' files. Sampling repeats until the
  directory holds at least --output_corpus_size such files. Only '*.pbtxt'
  files are counted: the 'LOCK' file lives in the same directory while the
  lock is held, and counting it would end sampling one sample short.

  Args:
    instance: The CLgen instance whose model and sampler are used.
  """
  logging.info('Training and sampling the CLgen model ...')
  target_samples = FLAGS.output_corpus_size
  sample_dir = instance.model.SamplerCache(instance.sampler)
  sample_dir.mkdir(exist_ok=True)

  def _CountSamples() -> int:
    """Return the number of sample files currently in sample_dir."""
    return sum(1 for _ in sample_dir.glob('*.pbtxt'))

  num_samples = _CountSamples()
  logging.info('Need to generate %d samples in %s',
               max(target_samples - num_samples, 0), sample_dir)
  if num_samples >= target_samples:
    return

  sample_lock = lockfile.LockFile(sample_dir / 'LOCK')
  with sample_lock.acquire(replace_stale=True, block=True):
    # Recount under the lock: the directory may have changed while this
    # process was blocked waiting to acquire it.
    num_samples = _CountSamples()
    while num_samples < target_samples:
      samples = instance.model.SampleFast(instance.sampler,
                                          target_samples - num_samples)
      for sample in samples:
        sample_id = crypto.sha256_str(sample.text)
        pbutil.ToFile(sample, sample_dir / f'{sample_id}.pbtxt')
      # Files are named by content hash, so identical samples map to the
      # same file; recount from disk instead of adding len(samples).
      num_samples = _CountSamples()
def training_lock(self) -> lockfile.LockFile:
  """Return the lockfile used to make training exclusive.

  Returns:
    A LockFile at the 'LOCK' key path of this object's cache.
  """
  lock_path = self.cache.keypath('LOCK')
  return lockfile.LockFile(lock_path)