def evaluate(model, sampler):
    """
    Evaluate sampling efficiency.

    Draws samples from `model` using `sampler`, preprocesses the resulting
    sample database, and returns summary statistics describing how many of
    the sampled kernels survive preprocessing.

    Parameters
    ----------
    model : project model object — assumes `.corpus.cache.path` and
        `.cache.path` attributes (TODO: confirm against model class).
    sampler : project sampler — `sampler.cache(model)` yields a cache with a
        "kernels.db" entry, and `sampler.kernel_opts["args"]` the argspec.

    Returns
    -------
    dict
        Sample-quality statistics plus the corpus/model/sampler cache paths.
    """
    print("starting sampling")
    sampler.sample(model)

    print("preprocessing sample")
    sample_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(sample_db)

    num_kernels = dbutil.num_rows_in(sample_db, "ContentFiles")
    num_good_kernels = dbutil.num_good_kernels(sample_db)
    num_ugly_kernels = dbutil.num_rows_in(sample_db, "PreprocessedFiles",
                                          "WHERE status=2")

    # Guard against an empty sample: without it an empty ContentFiles table
    # raises ZeroDivisionError. Rates are reported as 0 in that case.
    if num_kernels:
        discard_rate = 1 - (num_good_kernels / num_kernels)
        ugly_rate = 1 - (num_ugly_kernels / num_kernels)
    else:
        discard_rate = 0
        ugly_rate = 0

    total_charcount = dbutil.cc(sample_db, "ContentFiles")
    good_charcount = dbutil.cc(sample_db, "PreprocessedFiles",
                               condition="WHERE status=0")

    return {
        "argspec": sampler.kernel_opts["args"],
        "host": system.HOSTNAME,
        "date": time.nowstr(),
        "num_kernels": num_kernels,
        "num_good_kernels": num_good_kernels,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": total_charcount,
        "good_charcount": good_charcount,
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }
def evaluate(model, sampler):
    """
    Evaluate sampling efficiency, including training and sampling times.

    Clears the model and sample caches, trains the model from scratch, draws
    samples, preprocesses the resulting sample database, and returns summary
    statistics including throughput (good characters generated per second).

    Parameters
    ----------
    model : project model object — assumes `.cache`, `.train()`, and
        `.corpus.cache.path` (TODO: confirm against model class).
    sampler : project sampler — `sampler.cache(model)` yields a cache with a
        "kernels.db" entry.

    Returns
    -------
    dict
        Timing and sample-quality statistics plus cache paths.
    """
    model.cache.empty()  # clear checkpoint cache so training starts fresh

    print("starting training")
    tstart = time()
    model.train()
    training_time = time() - tstart

    # clear the sample cache so timing covers only newly generated kernels
    sampler.cache(model).empty()

    print("starting sampling")
    tstart = time()
    sampler.sample(model)
    elapsed = time() - tstart

    # preprocess sample
    sample_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(sample_db)

    num_kernels = dbutil.num_rows_in(sample_db, "ContentFiles")
    num_good_kernels = dbutil.num_good_kernels(sample_db)
    num_ugly_kernels = dbutil.num_rows_in(sample_db, "PreprocessedFiles",
                                          "WHERE status=2")

    # Guard all ratios against zero denominators: an empty sample, an empty
    # character count, or a sampling run faster than the clock resolution
    # would otherwise raise ZeroDivisionError.
    if num_kernels:
        discard_rate = 1 - (num_good_kernels / num_kernels)
        ugly_rate = 1 - (num_ugly_kernels / num_kernels)
    else:
        discard_rate = 0
        ugly_rate = 0

    total_charcount = dbutil.cc(sample_db, "ContentFiles")
    good_charcount = dbutil.cc(sample_db, "PreprocessedFiles",
                               condition="WHERE status=0")
    efficiency = good_charcount / total_charcount if total_charcount else 0
    throughput = good_charcount / elapsed if elapsed else 0

    return {
        "training_time": training_time,
        "sampling_time": elapsed,
        "num_kernels": num_kernels,
        "num_good_kernels": num_good_kernels,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": total_charcount,
        "good_charcount": good_charcount,
        "efficiency": efficiency,  # good_chars / total_chars
        "throughput": throughput,  # good_chars / second
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }
def test_insert():
    """Exercise row insertion and the count/char-count query helpers."""
    # Start from a guaranteed-fresh database file.
    db_path = tests.data_path("db", "tmp.db", exists=False)
    fs.rm(db_path)
    dbutil.create_db(db_path)

    db = dbutil.connect(db_path)
    cursor = db.cursor()

    # A newly created database contains no content files.
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 0

    # One raw kernel, plus two preprocessed variants with different statuses.
    dbutil.sql_insert_dict(cursor, "ContentFiles",
                           {"id": "a", "contents": "foo"})
    dbutil.sql_insert_dict(cursor, "PreprocessedFiles",
                           {"id": "a", "status": 0, "contents": "bar"})
    dbutil.sql_insert_dict(cursor, "PreprocessedFiles",
                           {"id": "b", "status": 1, "contents": "car"})
    db.commit()
    cursor = db.cursor()

    # Row counts reflect the three inserts.
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

    # Character and line counts over selected columns.
    assert dbutil.cc(db_path, "ContentFiles", "contents") == 3
    assert dbutil.cc(db_path, "ContentFiles", "id") == 1
    assert dbutil.lc(db_path, "ContentFiles", "contents") == 1

    dbutil.remove_bad_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    # remove_bad_preprocessed doesn't actually delete any rows, just
    # replaces contents
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

    dbutil.remove_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 0