def sample(self, model: clgen.Model) -> None:
    """
    Draw samples from a CLgen model into the cached kernels database.

    A SampleProducer and a SampleConsumer are started, linked by a bounded
    queue, and joined before clgen.explore() is run on the database.

    Parameters
    ----------
    model : clgen.Model
        CLgen model.
    """
    sample_cache = self.cache(model)

    # first use: create an empty database and register it in the cache
    if not sample_cache.get("kernels.db"):
        fresh_db = sample_cache.keypath("kernels.tmp.db")
        dbutil.create_db(fresh_db)
        sample_cache["kernels.db"] = fresh_db

    # bounded queue linking the producer to the consumer
    sample_queue = Queue(maxsize=128)

    log.info("sampling", self)

    producer = SampleProducer(model, self.start_text, sample_queue,
                              **self.kernel_opts)
    producer.start()

    consumer = SampleConsumer(sample_cache["kernels.db"], producer, self,
                              sample_cache, sample_queue,
                              **self.sampler_opts)
    consumer.start()

    # block until both sides have finished
    for worker in (producer, consumer):
        worker.join()

    clgen.explore(sample_cache["kernels.db"])
def _create_kernels_db(self, path: str, encoding: str = "default") -> None:
    """
    Build kernels.db from a corpus directory and store it in the cache.

    The database is created at a temporary path inside the content cache,
    registered under the "kernels.db" key, then filled, preprocessed,
    encoded and explored in that order.

    Arguments:
        path (str): Directory that is listed recursively for corpus files.
        encoding (str, optional): Passed through to encode().
    """
    log.debug("creating database")

    # make an empty database and hand it over to the cache
    staging_path = fs.path(self.contentcache.path, "kernels.db.tmp")
    dbutil.create_db(staging_path)
    self.contentcache["kernels.db"] = staging_path

    # every regular file below the corpus directory
    corpus_entries = fs.ls(path, abspaths=True, recursive=True)
    corpus_files = list(filter(fs.isfile, corpus_entries))

    # import -> preprocess -> encode -> report
    fetch.fetch_fs(self.contentcache["kernels.db"], corpus_files)
    preprocess.preprocess_db(self.contentcache["kernels.db"])
    encode(self.contentcache["kernels.db"], encoding)
    explore.explore(self.contentcache["kernels.db"])
def test_create_db_gh():
    """create_db(github=True) makes the file and refuses to overwrite it."""
    target = tests.data_path("db", "tmp.db", exists=False)
    fs.rm(target)

    dbutil.create_db(target, github=True)
    assert fs.exists(target)

    # creating the same database a second time must raise
    with pytest.raises(clgen.UserError):
        dbutil.create_db(target, github=True)
def test_remove_bad_preprocessed(self):
    """
    Check remove_bad_preprocessed() on a three-row PreprocessedFiles table.

    After the call the row count must be unchanged, and every row selected
    by "status=1 OR status=2" must carry the "[DELETED]" placeholder in its
    contents column.
    """
    fs.rm("tmp.db")
    dbutil.create_db("tmp.db")

    db = sqlite3.connect("tmp.db")
    try:
        # Create some data to test with:
        c = db.cursor()
        c.execute("DELETE FROM PreprocessedFiles")
        c.executemany("INSERT INTO PreprocessedFiles VALUES(?,?,?)", [
            ("id1", 0, "good output"),
            ("id2", 1, "bad output"),
            ("id3", 2, "ugly output"),
        ])
        db.commit()
        c.close()

        # Check that data was written properly:
        c = db.cursor()
        c.execute("SELECT Count(*) FROM PreprocessedFiles")
        self.assertEqual(3, c.fetchone()[0])
        c.close()
    finally:
        # release the file before another connection modifies it
        db.close()

    preprocess.remove_bad_preprocessed("tmp.db")

    db = sqlite3.connect("tmp.db")
    try:
        # Check that clean worked:
        c = db.cursor()
        c.execute("SELECT Count(*) FROM PreprocessedFiles")
        self.assertEqual(3, c.fetchone()[0])

        c.execute("SELECT contents FROM PreprocessedFiles WHERE status=1 "
                  "OR status=2")
        rows = c.fetchall()
        # fetchall() returns one tuple per row, so the column value has to
        # be unpacked before comparing; the length check keeps all() from
        # passing on an empty result.
        self.assertEqual(2, len(rows))
        self.assertTrue(all(row[0] == "[DELETED]" for row in rows))

        # Clean up:
        c.execute("DELETE FROM PreprocessedFiles")
        db.commit()
        c.close()

        # Check that clean-up worked:
        c = db.cursor()
        c.execute("SELECT Count(*) FROM PreprocessedFiles")
        self.assertEqual(0, c.fetchone()[0])
        c.close()
    finally:
        # the connection must be closed before the file is removed
        db.close()

    fs.rm("tmp.db")
def _create_kernels_db(self, path: str) -> None:
    """
    Build kernels.db from a corpus directory and store it in the cache.

    Arguments:
        path (str): Directory that is listed recursively for corpus files.
    """
    log.debug("creating database")

    # make an empty database and hand it over to the cache
    staging_path = self.contentcache.keypath("kernels.db.tmp")
    dbutil.create_db(staging_path)
    self.contentcache["kernels.db"] = staging_path

    # every regular file below the corpus directory
    corpus_entries = fs.ls(path, abspaths=True, recursive=True)
    corpus_files = list(filter(fs.isfile, corpus_entries))

    # load the corpus files into the cached database
    clgen.fetch(self.contentcache["kernels.db"], corpus_files)
def test_insert():
    """
    Insert rows with sql_insert_dict() and check the dbutil counters.

    Covers num_rows_in(), cc(), lc(), remove_bad_preprocessed() and
    remove_preprocessed() against a freshly created database.
    """
    db_path = tests.data_path("db", "tmp.db", exists=False)
    fs.rm(db_path)
    dbutil.create_db(db_path)

    db = dbutil.connect(db_path)
    try:
        c = db.cursor()

        assert dbutil.num_rows_in(db_path, "ContentFiles") == 0

        dbutil.sql_insert_dict(c, "ContentFiles",
                               {"id": "a", "contents": "foo"})
        dbutil.sql_insert_dict(c, "PreprocessedFiles", {
            "id": "a", "status": 0, "contents": "bar"
        })
        dbutil.sql_insert_dict(c, "PreprocessedFiles", {
            "id": "b", "status": 1, "contents": "car"
        })

        db.commit()
        c.close()
    finally:
        # the helpers below take the path, not this connection, so it can
        # be released as soon as the inserts are committed
        db.close()

    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

    assert dbutil.cc(db_path, "ContentFiles", "contents") == 3
    assert dbutil.cc(db_path, "ContentFiles", "id") == 1
    assert dbutil.lc(db_path, "ContentFiles", "contents") == 1

    dbutil.remove_bad_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    # remove_bad_preprocessed doesn't actually delete any rows, just
    # replaces contents
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

    dbutil.remove_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 0
def merge(outpath, inpaths=None):
    """
    Merge the tables of several kernel databases into one database.

    Rows of ContentFiles and PreprocessedFiles are copied with
    "INSERT OR IGNORE", then explore.explore() is run on the result.

    Parameters
    ----------
    outpath
        Path of the destination database. It is created with
        dbutil.create_db() if no file exists at that path.
    inpaths
        Paths of the databases to merge from. When empty or omitted, the
        result of get_all_sampler_datasets() is used instead.
    """
    if not fs.isfile(outpath):
        dbutil.create_db(outpath)
        log.info("created", outpath)

    if not inpaths:
        inpaths = get_all_sampler_datasets()

    db = dbutil.connect(outpath)
    try:
        for inpath in inpaths:
            log.info("merging from", inpath)
            c = db.cursor()
            # bind the filename instead of formatting it into the SQL so
            # that a quote character in the path cannot break the statement
            c.execute("ATTACH ? AS rhs", (str(inpath),))
            c.execute("INSERT OR IGNORE INTO ContentFiles "
                      "SELECT * FROM rhs.ContentFiles")
            c.execute("INSERT OR IGNORE INTO PreprocessedFiles "
                      "SELECT * FROM rhs.PreprocessedFiles")
            # commit first: SQLite will not detach a database that still
            # takes part in an open transaction
            db.commit()
            c.execute("DETACH rhs")
            c.close()
    finally:
        db.close()

    explore.explore(outpath)
def sample(self, model: Model, quiet: bool = False) -> None:
    """
    Sample CLgen model.

    Batches are sampled until the database holds at least max_kernels good
    kernels or max_batches batches have been run. A negative limit disables
    that check; if both limits are negative the loop does not terminate.

    Arguments:
        model (Model): CLgen model.
        quiet (bool, optional): Forwarded to sample_iteration().
    """
    cache = self.cache(model)

    # create samples database if it doesn't exist
    if not cache["kernels.db"]:
        tmp_kernels_db = fs.path(cache.path, "kernels.tmp.db")
        dbutil.create_db(tmp_kernels_db)
        cache["kernels.db"] = tmp_kernels_db

    batch_i = 0
    while True:
        # stop if we have enough kernels
        has_max_kernels = self.max_kernels >= 0
        num_good_kernels = dbutil.num_good_kernels(cache["kernels.db"])
        if has_max_kernels and num_good_kernels >= self.max_kernels:
            break

        # stop if we've done enough batches
        has_max_batches = self.max_batches >= 0
        if has_max_batches and batch_i >= self.max_batches:
            break

        batch_i += 1
        print("sample batch", batch_i, "...")

        self.sample_iteration(model, quiet=quiet)

        print()
        explore(self.cache(model)["kernels.db"])

    # the loop is left with break rather than return so that this final
    # message is actually emitted
    log.info("samples database:", cache["kernels.db"])
def _main(db_file: Path, github: bool) -> None:
    """
    Create an empty OpenCL kernel database and print its absolute path.

    Arguments:
        db_file (Path): Location of the database to create.
        github (bool): Passed through to dbutil.create_db().
    """
    dbutil.create_db(db_file, github)
    absolute_path = fs.abspath(db_file)
    print(absolute_path)