def _create_kernels_db(self, path: str, encoding: str = "default") -> None:
    """creates and caches kernels.db"""
    log.debug("creating database")

    # create a database and put it in the cache
    tmppath = fs.path(self.contentcache.path, "kernels.db.tmp")
    dbutil.create_db(tmppath)
    self.contentcache["kernels.db"] = tmppath

    # get a list of files in the corpus
    filelist = [f for f in fs.ls(path, abspaths=True, recursive=True)
                if fs.isfile(f)]

    # import files into database
    fetch.fetch_fs(self.contentcache["kernels.db"], filelist)

    # preprocess files
    preprocess.preprocess_db(self.contentcache["kernels.db"])

    # encode kernel db
    encode(self.contentcache["kernels.db"], encoding)

    # print database stats
    explore.explore(self.contentcache["kernels.db"])
def evaluate(model, sampler):
    """ evaluate sampling efficiency """
    print("starting sampling")
    sampler.sample(model)

    # preprocess the sampled kernels
    print("preprocessing sample")
    sample_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(sample_db)

    # count sampled kernels by preprocessing outcome
    num_kernels = dbutil.num_rows_in(sample_db, "ContentFiles")
    num_good_kernels = dbutil.num_good_kernels(sample_db)
    num_ugly_kernels = dbutil.num_rows_in(sample_db, "PreprocessedFiles",
                                          "WHERE status=2")
    discard_rate = 1 - (num_good_kernels / num_kernels)
    ugly_rate = 1 - (num_ugly_kernels / num_kernels)

    # character counts before and after preprocessing
    total_charcount = dbutil.cc(sample_db, "ContentFiles")
    good_charcount = dbutil.cc(sample_db, "PreprocessedFiles",
                               condition="WHERE status=0")

    return {
        "argspec": sampler.kernel_opts["args"],
        "host": system.HOSTNAME,
        "date": time.nowstr(),
        "num_kernels": num_kernels,
        "num_good_kernels": num_good_kernels,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": total_charcount,
        "good_charcount": good_charcount,
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }
def evaluate(model, sampler):
    """ evaluate sampling efficiency """
    model.cache.empty()  # clear checkpoint cache
    print("starting training")
    tstart = time()  # start timer
    model.train()  # train model
    training_time = time() - tstart

    # clear the sample cache
    sampler.cache(model).empty()

    # sample kernels and time
    print("starting sampling")
    tstart = time()
    sampler.sample(model)
    tend = time()
    elapsed = tend - tstart

    # preprocess sample
    sample_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(sample_db)

    num_kernels = dbutil.num_rows_in(sample_db, "ContentFiles")
    num_good_kernels = dbutil.num_good_kernels(sample_db)
    num_ugly_kernels = dbutil.num_rows_in(sample_db, "PreprocessedFiles",
                                          "WHERE status=2")
    discard_rate = 1 - (num_good_kernels / num_kernels)
    ugly_rate = 1 - (num_ugly_kernels / num_kernels)

    total_charcount = dbutil.cc(sample_db, "ContentFiles")
    good_charcount = dbutil.cc(sample_db, "PreprocessedFiles",
                               condition="WHERE status=0")
    efficiency = good_charcount / total_charcount
    throughput = good_charcount / elapsed

    return {
        "training_time": training_time,
        "sampling_time": elapsed,
        "num_kernels": num_kernels,
        "num_good_kernels": num_good_kernels,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": total_charcount,
        "good_charcount": good_charcount,
        "efficiency": efficiency,    # good_chars / total_chars
        "throughput": throughput,    # good_chars / second
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }
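# A minimal usage sketch (not part of the original code): one way the dictionary
# returned by evaluate() above might be persisted for later analysis. The helper
# name, the output path, and the use of json.dump() are assumptions here;
# constructing `model` and `sampler` is left to the surrounding CLgen code.
import json

def run_and_log_evaluation(model, sampler, outpath="evaluation.json"):
    """Hypothetical helper: run evaluate() and write its results to JSON."""
    results = evaluate(model, sampler)
    with open(outpath, "w") as outfile:
        json.dump(results, outfile, indent=2, sort_keys=True)
    return results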
def sample_iteration(self, model: Model, quiet: bool = False) -> None:
    """
    Run one sample iteration.

    Arguments:
        model (Model): CLgen model.
    """
    assert isinstance(model, Model)

    cache = self.cache(model)

    # seed sampling with the kernel argument spec, if one was given
    if self.kernel_opts.get("args", None):
        start_text = serialize_argspec(self.kernel_opts["args"])
    else:
        start_text = "__kernel void A("

    tmppath = fs.path(cache.path,
                      "sampler-{pid}.tmp.cl".format(pid=system.PID))

    # sample into a temporary file
    with open(tmppath, "w") as outfile:
        opts = {
            "output": outfile,
            "num_samples": self.batch_size,
            "temperature": self.kernel_opts.get("temperature", 1),
            "max_length": self.kernel_opts.get("max_length", 10000),
            "seed_text": start_text,
            "quiet": quiet
        }
        model.sample(**opts)

    sys.stdout.flush()
    sys.stderr.flush()

    # import the sampled kernels into the sample database
    fetch.process_sample_file(cache["kernels.db"], tmppath,
                              max_kernel_len=opts["max_length"], quiet=True)

    if self.static_checker:
        # TODO: Parse dynamic checker requirement
        preprocess.preprocess_db(cache["kernels.db"])

    fs.rm(tmppath)
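# A hypothetical driver loop, sketched for illustration only: it repeatedly calls
# sample_iteration() above until the sample database holds at least `min_kernels`
# kernels. The method name, the `min_kernels` parameter, and the loop itself are
# assumptions, not part of the original sampler.
def sample_to_min_kernels(self, model: Model, min_kernels: int = 1000) -> None:
    """Hypothetical helper: keep sampling until `min_kernels` kernels exist."""
    cache = self.cache(model)
    while dbutil.num_rows_in(cache["kernels.db"], "ContentFiles") < min_kernels:
        self.sample_iteration(model)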