def test_cp_dir(self):
    fs.rm("/tmp/labm8")
    fs.rm("/tmp/labm8.copy")
    fs.mkdir("/tmp/labm8/foo/bar")
    self._test(False, fs.exists("/tmp/labm8.copy"))
    fs.cp("/tmp/labm8/", "/tmp/labm8.copy")

    self._test(True, fs.isdir("/tmp/labm8.copy"))
    self._test(True, fs.isdir("/tmp/labm8.copy/foo"))
    self._test(True, fs.isdir("/tmp/labm8.copy/foo/bar"))
def test_cp_over_dir(self):
    fs.mkdir("/tmp/labm8.tmp.src")
    system.echo("Hello, world!", "/tmp/labm8.tmp.src/foo")
    fs.rm("/tmp/labm8.tmp.copy")
    fs.mkdir("/tmp/labm8.tmp.copy")

    self._test(True, fs.isdir("/tmp/labm8.tmp.src"))
    self._test(True, fs.isfile("/tmp/labm8.tmp.src/foo"))
    self._test(True, fs.isdir("/tmp/labm8.tmp.copy"))
    self._test(False, fs.isfile("/tmp/labm8.tmp.copy/foo"))

    fs.cp("/tmp/labm8.tmp.src", "/tmp/labm8.tmp.copy/")

    self._test(True, fs.isdir("/tmp/labm8.tmp.src"))
    self._test(True, fs.isfile("/tmp/labm8.tmp.src/foo"))
    self._test(True, fs.isdir("/tmp/labm8.tmp.copy"))
    self._test(True, fs.isfile("/tmp/labm8.tmp.copy/foo"))
    self._test(fs.read("/tmp/labm8.tmp.src/foo"),
               fs.read("/tmp/labm8.tmp.copy/foo"))
def __init__(self, *args, **kwargs):
    """ Construct a SkelCL server. """
    # Fail if we can't find the path
    if not fs.isdir(self.LLVM_PATH):
        io.fatal("Could not find llvm path '{0}'".format(self.LLVM_PATH))

    super(Server, self).__init__(*args, **kwargs)
    io.info("Registered server %s/SkelCLServer ..." % SESSION_NAME)

    # Setup persistent database.
    self.db = migrate(Database())
    self.db.status_report()

    # Create an in-memory sample strategy cache.
    self.strategies = cache.TransientCache()
def test_mkopen(self):
    fs.rm("/tmp/labm8.dir")
    self._test(False, fs.isdir("/tmp/labm8.dir/"))
    f = fs.mkopen("/tmp/labm8.dir/foo", "w")
    self._test(True, fs.isdir("/tmp/labm8.dir/"))
    f.close()
def test_mkdir_exists(self):
    fs.mkdir("/tmp/labm8.dir/")
    self._test(True, fs.isdir("/tmp/labm8.dir/"))
    fs.mkdir("/tmp/labm8.dir/")
    fs.mkdir("/tmp/labm8.dir/")
    self._test(True, fs.isdir("/tmp/labm8.dir/"))
def test_mkdir_parents(self):
    self._test(False, fs.isdir("/tmp/labm8.dir/foo/bar"))
    fs.mkdir("/tmp/labm8.dir/foo/bar")
    self._test(True, fs.isdir("/tmp/labm8.dir/foo/bar"))
def test_mkdir(self):
    fs.rm("/tmp/labm8.dir")
    self._test(False, fs.isdir("/tmp/labm8.dir"))
    fs.mkdir("/tmp/labm8.dir")
    self._test(True, fs.isdir("/tmp/labm8.dir"))
def test_isdir(self):
    self._test(False, fs.isdir(__file__))
    self._test(True, fs.isdir("/"))
    self._test(False, fs.isdir("/not/a/real/path (I hope!)"))
def test_init_and_empty(self):
    c = cache.Cache("__test__")
    self.assertTrue(fs.isdir(fs.path(cache.ROOT, "__test__")))
    c.empty()
    self.assertFalse(fs.isdir(fs.path(cache.ROOT, "__test__")))
def test_mkdir_exists():
    fs.mkdir("/tmp/labm8.dir/")
    assert fs.isdir("/tmp/labm8.dir/")
    fs.mkdir("/tmp/labm8.dir/")
    fs.mkdir("/tmp/labm8.dir/")
    assert fs.isdir("/tmp/labm8.dir/")
def test_mkdir_parents():
    assert not fs.isdir("/tmp/labm8.dir/foo/bar")
    fs.mkdir("/tmp/labm8.dir/foo/bar")
    assert fs.isdir("/tmp/labm8.dir/foo/bar")
def test_mkdir():
    fs.rm("/tmp/labm8.dir")
    assert not fs.isdir("/tmp/labm8.dir")
    fs.mkdir("/tmp/labm8.dir")
    assert fs.isdir("/tmp/labm8.dir")
def test_isdir():
    assert not fs.isdir(__file__)
    assert fs.isdir("/")
    assert not fs.isdir("/not/a/real/path (I hope!)")
def test_init_and_empty(self):
    c = cache.FSCache("/tmp/labm8-cache-init-and-empty")
    self.assertTrue(fs.isdir("/tmp/labm8-cache-init-and-empty"))
    c.clear()
    self.assertFalse(fs.isdir("/tmp/labm8-cache-init-and-empty"))
def _locked_train(self) -> 'Model':
    tf = self._init_tensorflow(infer=False)

    # training options
    learning_rate = self.train_opts["learning_rate"]
    decay_rate = self.train_opts["lr_decay_rate"]

    # resume from prior checkpoint
    ckpt_path, ckpt_paths = None, None
    if self.checkpoint_path:
        # check that all necessary files exist
        assert fs.isdir(self.checkpoint_path)
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
        assert ckpt
        assert ckpt.model_checkpoint_path
        ckpt_path, ckpt_paths = self._get_params_path(ckpt)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # keep all checkpoints
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

        # restore model from closest checkpoint
        if ckpt_path:
            log.debug("restoring", ckpt_path)
            saver.restore(sess, ckpt_path)
            log.verbose("restored checkpoint {}".format(ckpt_path))

        # make sure we don't lose track of other checkpoints
        if ckpt_paths:
            saver.recover_last_checkpoints(ckpt_paths)

        coord = tf.train.Coordinator()
        self.corpus.create_batches()
        threading.Thread(target=self.enqueue_x, args=(coord, sess)).start()

        max_batch = self.epochs * self.corpus.num_batches

        # progress bar
        bar = progressbar.ProgressBar(max_value=max_batch)

        if sess.run(self.epoch) != self.epochs:
            log.info("training", self)

        for e in range(sess.run(self.epoch) + 1, self.epochs + 1):
            epoch_start = time()

            # decay and set learning rate
            new_learning_rate = learning_rate * (
                (float(100 - decay_rate) / 100.0) ** (e - 1))
            sess.run(tf.assign(self.learning_rate, new_learning_rate))
            sess.run(tf.assign(self.epoch, e))

            for b in range(self.corpus.num_batches):
                train_cost, _, state, _ = sess.run(
                    [self.cost, self.KL_cost, self.final_state, self.train_op])
                # update progress bar
                batch_num = (e - 1) * self.corpus.num_batches + b
                bar.update(batch_num)

            save = self.opts["train_opts"]["intermediate_checkpoints"]
            save |= e == self.epochs  # always save on last epoch
            if save:
                saver.save(sess, self.cache.keypath("model.ckpt"),
                           global_step=batch_num)

                next_checkpoint = e * self.corpus.num_batches + b
                max_epoch = self.epochs
                log.verbose("\n{self} epoch {e} / {max_epoch}. "
                            "next checkpoint at batch {next_checkpoint}"
                            .format(**vars()))

            # update training time
            epoch_duration = time() - epoch_start
            self.stats["epoch_costs"].append(float(train_cost))
            self.stats["epoch_times"].append(epoch_duration)
            self.stats["epoch_batches"].append(batch_num + 1)
            self._flush_meta()

        coord.request_stop()

    return self
#!/usr/bin/env python3.6
import sys

from progressbar import ProgressBar

from labm8 import crypto
from labm8 import fs

if __name__ == "__main__":
    inpath = sys.argv[1]
    outdir = sys.argv[2]
    print(f"reading from {inpath} into {outdir}")

    assert fs.isfile(inpath)
    assert not fs.exists(outdir) or fs.isdir(outdir)
    fs.mkdir(outdir)

    with open(inpath) as infile:
        text = infile.read()

    kernels = text.split("// ==== START SAMPLE ====")
    kernels = [kernel.strip() for kernel in kernels if kernel.strip()]
    print(len(kernels), "kernels")

    sha1s = [crypto.sha1_str(kernel) for kernel in kernels]
    for kernel, sha1 in ProgressBar()(list(zip(kernels, sha1s))):
        with open(f"{outdir}/{sha1}.txt", "w") as outfile:
            print(kernel, file=outfile)
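# Usage note: the script above takes two positional arguments, an input file of
# concatenated samples and an output directory that receives one file per
# kernel, e.g. `python3.6 split_samples.py samples.txt kernels/`. The file name
# "split_samples.py" is a hypothetical name for the script, not taken from the
# source.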
def test_mkopen():
    fs.rm("/tmp/labm8.dir")
    assert not fs.isdir("/tmp/labm8.dir/")
    f = fs.mkopen("/tmp/labm8.dir/foo", "w")
    assert fs.isdir("/tmp/labm8.dir/")
    f.close()
def __init__(self, contentid: str, path: str = None, **opts):
    """
    Instantiate a corpus.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Arguments:
        contentid (str): ID of corpus content.
        path (str, optional): Path to corpus.
        **opts: Keyword options.
    """
    def _init_error(err: Exception) -> None:
        """ tidy up in case of error """
        log.error("corpus creation failed. Deleting corpus files")
        paths = [
            fs.path(self.contentcache.path, "kernels.db"),
            fs.path(self.cache.path, "corpus.txt"),
            fs.path(self.cache.path, "tensor.npy"),
            fs.path(self.cache.path, "atomizer.pkl")
        ]
        for path in paths:
            if fs.exists(path):
                log.info("removing", path)
                fs.rm(path)
        raise err

    # Validate options
    for key in opts.keys():
        if key not in DEFAULT_CORPUS_OPTS:
            raise clgen.UserError(
                "Unsupported corpus option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

    self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
    clgen.update(self.opts, opts)
    self.contentid = contentid
    self.hash = self._hash(contentid, self.opts)

    self.cache = Cache(fs.path("corpus", self.hash))
    self.contentcache = Cache(fs.path("contentfiles", contentid))
    self.kernels_db = self.contentcache['kernels.db']

    log.debug("corpus {hash}".format(hash=self.hash))

    try:
        if path is not None:
            if not fs.isdir(path):
                raise clgen.UserError(
                    "Corpus path '{}' is not a directory".format(path))
            # create kernels database if necessary
            if not self.contentcache["kernels.db"]:
                self._create_kernels_db(path, self.opts["encoding"])
            assert self.contentcache["kernels.db"]

        # create corpus text if not exists
        if not self.cache["corpus.txt"]:
            self._create_txt()
        assert self.cache["corpus.txt"]

        # create atomizer if needed
        if self.cache["atomizer.pkl"]:
            self._load_atomizer()
            assert self.cache["atomizer.pkl"]
        else:
            self._create_atomizer(self.opts["vocabulary"])
    except Exception as e:
        _init_error(e)
def train(self, quiet: bool = False) -> None:
    """
    Train model.
    """
    tf = self._init_tensorflow(infer=False)

    # training options
    learning_rate = self.train_opts["learning_rate"]
    decay_rate = self.train_opts["lr_decary_rate"]
    checkpoint_path = fs.path(self.cache.path, "model.ckpt")

    # resume from prior checkpoint
    ckpt_path, ckpt_paths = None, None
    if self.checkpoint_path:
        # check if all necessary files exist
        assert fs.isdir(self.checkpoint_path)
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
        assert ckpt
        assert ckpt.model_checkpoint_path
        ckpt_path, ckpt_paths = self._get_params_path(ckpt)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # keep all checkpoints
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

        # restore model from closest checkpoint
        if ckpt_path:
            log.debug("restoring", ckpt_path)
            saver.restore(sess, ckpt_path)
            log.info("restored checkpoint {}".format(ckpt_path))

        # make sure we don't lose track of other checkpoints
        if ckpt_paths:
            saver.recover_last_checkpoints(ckpt_paths)

        start_batch = sess.run(self.epoch) * self.corpus.num_batches
        batch_count = 0
        total_elapsed = 0
        total_atomize = 0
        total_checkpoint, avg_checkpoint = 0, 0
        eta_d, eta_h, eta_m = 0, 0, 0

        for e in range(sess.run(self.epoch) + 1, self.epochs + 1):
            if quiet:
                log.info("epoch", e, "of", self.epochs + 1)

            # decay and set learning rate
            new_learning_rate = learning_rate * (
                (float(100 - decay_rate) / 100.0) ** (e - 1))
            sess.run(tf.assign(self.learning_rate, new_learning_rate))
            sess.run(tf.assign(self.epoch, e))

            time_start = time.time()
            self.corpus.create_batches()
            total_atomize += time.time() - time_start
            avg_atomize = total_atomize / e

            state = sess.run(self.initial_state)

            for b in range(self.corpus.num_batches):
                time_start = time.time()
                batch_count += 1
                x, y = self.corpus.next_batch()
                feed = {self.input_data: x, self.targets: y}
                for i, (c, h) in enumerate(self.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h
                train_loss, state, _ = sess.run(
                    [self.cost, self.final_state, self.train_op], feed)

                batch_num = (e - 1) * self.corpus.num_batches + b
                max_batch = self.epochs * self.corpus.num_batches
                progress = float((batch_num + 1 - start_batch) /
                                 (max_batch - start_batch))

                time_end = time.time()
                elapsed = time_end - time_start

                if not quiet:
                    total_elapsed += elapsed
                    avg_elapsed = total_elapsed / batch_count
                    remaining_time = (
                        (max_batch - batch_count) * avg_elapsed +  # batches
                        (e - self.epochs) * avg_atomize +          # atomizings
                        (e - self.epochs) * avg_checkpoint)        # checkpoints
                    eta_h, eta_m = divmod(remaining_time / 60, 60)
                    eta_d, eta_h = divmod(eta_h, 24)

                    print("\r\033[K"
                          "{progress:3.1f}% | "
                          "{size}x{layers}x{max_epoch} {model} | "
                          "epoch={epoch_num}/{max_epoch} | "
                          "batch={batch_num}/{max_batch} | "
                          "lr={lr:.5f} | "
                          "loss={tloss:.3f} | "
                          "t1={time_atomize:.3f}s "
                          "t2={time_batch:.3f}s "
                          "t3={time_checkpoint:.3f}s | "
                          "eta={eta_d}d{eta_h}h{eta_m:02d}m".format(
                              size=self.rnn_size,
                              layers=self.num_layers,
                              model=self.model_type.upper(),
                              progress=progress * 100,
                              epoch_num=e,
                              max_epoch=self.epochs,
                              batch_num=b + 1,
                              max_batch=self.corpus.num_batches,
                              lr=new_learning_rate,
                              tloss=train_loss,
                              time_atomize=avg_atomize,
                              time_batch=avg_elapsed,
                              time_checkpoint=avg_checkpoint,
                              eta_d=int(eta_d),
                              eta_h=int(eta_h),
                              eta_m=int(eta_m)),
                          end="")

            save = self.opts["train_opts"]["intermediate_checkpoints"]
            save |= e == self.epochs  # always save on last epoch
            if save:
                if not quiet:
                    print()
                time_start = time.time()
                saver.save(sess, checkpoint_path, global_step=batch_num)
                total_checkpoint += time.time() - time_start
                avg_checkpoint = total_checkpoint / e

                log.info("model saved to {}".format(checkpoint_path))
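# For clarity, a minimal standalone sketch of the per-epoch learning-rate decay
# applied in both training loops above. The function name and its standalone
# form are illustrative, not part of the original code; it assumes decay_rate
# is a percentage, matching how the train_opts values are used above.
def decayed_learning_rate(learning_rate: float, decay_rate: float,
                          epoch: int) -> float:
    """Exponential decay: shrink the rate by `decay_rate` percent per epoch."""
    return learning_rate * ((float(100 - decay_rate) / 100.0) ** (epoch - 1))

# e.g. learning_rate=0.002, decay_rate=5 yields 0.002, 0.0019, 0.001805, ...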
def __init__(self, contentid: str, path: str = None, **opts):
    """
    Instantiate a corpus.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Parameters
    ----------
    contentid : str
        ID of corpus content.
    path : str, optional
        Path to corpus.
    **opts
        Keyword options.
    """
    # Validate options
    for key in opts.keys():
        if key not in DEFAULT_CORPUS_OPTS:
            raise clgen.UserError(
                "Unsupported corpus option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

    self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
    types.update(self.opts, opts)
    self.opts["id"] = contentid

    # check that contentid exists
    self.language = clgen.Language.from_str(opts.get("language"))
    if (path is None and
        not fs.isdir(clgen.cachepath("contentfiles",
                                     f"{self.language}-{contentid}"))):
        raise clgen.UserError(
            "corpus {self.language}-{contentid} not found".format(**vars()))

    self.contentid = contentid
    self.contentcache = clgen.mkcache("contentfiles",
                                      f"{self.language}-{contentid}")
    self.kernels_db = self.contentcache.keypath('kernels.db')

    self.hash = self._hash(contentid, self.opts)
    self.cache = clgen.mkcache("corpus", f"{self.language}-{self.hash}")

    log.debug("contentfiles {self.contentid}".format(**vars()))
    log.debug("corpus {hash}".format(hash=self.hash))

    # validate metadata against cache
    self.stats = {"preprocess_time": 0}
    meta = deepcopy(self.to_json())
    if self.cache.get("META"):
        cached_meta = jsonutil.read_file(self.cache["META"])
        self.stats = cached_meta["stats"]  # restore stats

        if "created" in cached_meta:
            del cached_meta["created"]
        del meta["created"]

        if "stats" in cached_meta:
            del cached_meta["stats"]
        del meta["stats"]

        if meta != cached_meta:
            raise clgen.InternalError("corpus metadata mismatch")
    else:
        self._flush_meta()

    with self.lock.acquire(replace_stale=True):
        self._create_files(path)
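# A hypothetical construction sketch for the corpus constructor above, assuming
# the enclosing class is named Corpus. The content ID, the path, and the
# "language" option value are illustrative assumptions, not values taken from
# the original code; only the "language" key itself is read by the constructor
# (via opts.get("language")).
corpus = Corpus("a1b2c3d4",                    # contentid: ID of corpus content
                path="/path/to/contentfiles",  # directory of source files
                language="opencl")             # parsed by clgen.Language.from_str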