def content_db(db_path: str, in_db_path: str,
               table: str='PreprocessedFiles') -> None:
    """
    Fetch kernels from a content database.

    Arguments:
        db_path (str): Output path.
        in_db_path (str): Input path.
        table (str, optional): Table to fetch from.
    """
    odb = dbutil.connect(db_path)
    idb = dbutil.connect(in_db_path)
    ic = idb.cursor()

    ic.execute('SELECT id,contents FROM {}'.format(table))
    rows = ic.fetchall()

    for id, contents in rows:
        kernels = clutil.get_cl_kernels(contents)
        ids = [clgen.checksum_str(kernel) for kernel in kernels]
        # print("{} kernels in {}".format(len(kernels), id))
        for kid, kernel in zip(ids, kernels):
            oc = odb.cursor()
            oc.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                       (kid, kernel))

    odb.commit()
def graph_bc_lc(db_path: str) -> None:
    """
    Plot distribution of bytecode line counts.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set(color_codes=True)

    out_path = fs.path(IMG_DIR, 'bc_lcs.png')
    print('graph', out_path, '...')
    db = dbutil.connect(db_path)
    c = db.cursor()

    c.execute("SELECT contents FROM Bytecodes")
    ocl = c.fetchall()
    ocl_lcs = [len(decode(x[0]).split('\n')) for x in ocl]

    # Filter range
    data = [x for x in ocl_lcs if x < 500]

    sns.distplot(data, bins=20, kde=False)
    plt.xlabel('Line count')
    plt.ylabel('Number of Bytecode files')
    plt.title('Distribution of Bytecode lengths')
    plt.savefig(out_path)
def process_cl_file(db_path: str, path: str) -> None:
    """
    Process OpenCL file.

    Arguments:
        db_path (str): Path to output database.
        path (str): Path to input file.

    Raises:
        FetchError: In case of IO error.
    """
    db = dbutil.connect(db_path)
    c = db.cursor()

    log.debug("fetch {path}".format(path=fs.abspath(path)))
    try:
        contents = inline_fs_headers(path, [])
    except IOError:
        raise FetchError(
            "cannot read file '{path}'".format(path=fs.abspath(path)))
    c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
              (path, contents))

    db.commit()
    c.close()
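# Usage sketch. The database and file names below are hypothetical; the only
# assumption is that dbutil.create_db() behaves as exercised in test_insert()
# further down in this listing.
def _example_fetch_one_file() -> None:
    db_path = "kernels.db"                      # hypothetical output database
    if not fs.isfile(db_path):
        dbutil.create_db(db_path)               # create an empty content database
    process_cl_file(db_path, "vector_add.cl")   # hypothetical OpenCL input file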
def fetch_fs(db_path: str, paths: list=[]) -> None:
    """
    Fetch from a list of files.

    Arguments:
        db_path (str): Output dataset.
        paths (str[]): List of file paths.
    """
    paths = clgen.files_from_list(paths)  # expand directories

    db = dbutil.connect(db_path)
    c = db.cursor()

    for path in paths:
        log.debug("fetch", path)
        try:
            contents = inline_fs_headers(path, [])
        except IOError:
            db.commit()
            raise FetchError(
                "cannot read file '{path}'".format(path=fs.abspath(path)))
        c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                  (path, contents))

    db.commit()
def graph_ocl_stars(db_path: str) -> None:
    """
    Plot distribution of stargazers per file.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set(color_codes=True)

    out_path = fs.path(IMG_DIR, 'ocl_stars.png')
    print('graph', out_path, '...')
    db = dbutil.connect(db_path)
    c = db.cursor()

    c.execute('SELECT stars FROM ContentMeta LEFT JOIN Repositories '
              'ON ContentMeta.repo_url=Repositories.url')
    stars = [x[0] for x in c.fetchall()]

    # Filter range
    data = [x for x in stars if x < 50]

    sns.distplot(data, bins=20, kde=False)
    plt.xlabel('GitHub Stargazer count')
    plt.ylabel('Number of files')
    plt.title('Stargazers per file')
    plt.savefig(out_path)
def print_bytecode_features(db_path: str) -> None:
    """
    Print Bytecode features.

    Arguments:
        db_path: Path to dataset.
    """
    db = dbutil.connect(db_path)
    c = db.cursor()

    c.execute('SELECT sha,contents FROM Bytecodes')
    query = c.fetchall()

    uniq_features = set()
    for row in query:
        sha, contents = row
        features = bytecode_features(contents)
        # Add the table key
        features['sha'] = sha
        for key in features.keys():
            uniq_features.add(key)

    log.info('Features:')
    for feature in uniq_features:
        log.info(' ', feature)
def _static_features(kernels_db: str) -> None:
    log.verbose("Static feature encoding")
    db = dbutil.connect(kernels_db)
    c = db.cursor()
    c.execute("SELECT id,contents FROM PreprocessedFiles WHERE status=0")
    for row in list(c.fetchall()):
        id, contents = row
        c.execute("DELETE FROM PreprocessedFiles WHERE id=?", (id,))
        for i, kernel in enumerate(get_cl_kernels(contents)):
            features = get_kernel_features(kernel)
            kid = "{}-{}".format(id, i)
            if len(features) == 8:
                log.verbose("features", kid)
                feature_str = ("/* {:10} {:10} {:10} {:10} {:10} {:10}"
                               "{:10.3f} {:10.3f} */".format(
                                   int(features[0]), int(features[1]),
                                   int(features[2]), int(features[3]),
                                   int(features[4]), int(features[5]),
                                   features[6], features[7]))
                newsource = feature_str + '\n' + kernel
                c.execute("""
                    INSERT INTO PreprocessedFiles (id,contents,status)
                    VALUES (?,?,?)
                """, (kid, newsource, 0))
            else:
                log.verbose("ignored", kid)
    c.close()
    db.commit()
def run(self) -> None:
    i = dbutil.num_rows_in(self.db_path, "ContentFiles")

    if not log.is_verbose():
        bar = progressbar.ProgressBar(max_value=self.max_i)
        bar.update(self.progress())

    try:
        while True:
            sample_time = time()
            sample = self.queue.get(timeout=60)

            kernels = clutil.get_cl_kernels(sample)
            ids = [crypto.sha1_str(k) for k in kernels]

            if self.sampler_opts["static_checker"]:
                preprocess_opts = {
                    "use_shim": False,
                    "use_gpuverify": self.sampler_opts["gpuverify"]
                }
                pp = [clgen.preprocess_for_db(k, **preprocess_opts)
                      for k in kernels]

            db = dbutil.connect(self.db_path)
            c = db.cursor()

            # insert raw samples
            for kid, src in zip(ids, kernels):
                dbutil.sql_insert_dict(c, "ContentFiles",
                                       {"id": kid, "contents": src},
                                       ignore_existing=True)

            # insert preprocessed samples
            if self.sampler_opts["static_checker"]:
                for kid, (status, src) in zip(ids, pp):
                    dbutil.sql_insert_dict(c, "PreprocessedFiles", {
                        "id": kid,
                        "status": status,
                        "contents": src
                    }, ignore_existing=True)

            c.close()
            db.commit()
            db.close()

            # update progress bar
            progress = self.progress()
            if not log.is_verbose():
                bar.update(progress)

            sample_time = time() - sample_time
            self.sampler.stats["progress"] = progress
            self.sampler.stats["time"] += sample_time
            self.sampler._flush_meta(self.cache)

            # determine if we are done sampling
            if self.term_condition():
                self.producer.stop()
                return
    finally:
        # always kill the sampler thread
        print()
        self.producer.stop()
def remove_bad_preprocessed(db_path: str) -> None:
    """
    Remove all ugly and bad contents from PreprocessedFiles table.

    Arguments:
        db_path (str): Dataset.
    """
    original_size = fs.du(db_path, human_readable=False)
    original_size_human_readable = fs.du(db_path, human_readable=True)
    log.info("vacuuming", original_size_human_readable, "database")
    sys.stdout.flush()

    # Remove contents from bad or ugly preprocessed files.
    db = dbutil.connect(db_path)
    c = db.cursor()
    c.execute("UPDATE PreprocessedFiles SET contents='[DELETED]' "
              "WHERE status=1 OR status=2")
    db.commit()
    c.close()

    c = db.cursor()
    c.execute("VACUUM")
    db.commit()
    c.close()

    new_size = fs.du(db_path, human_readable=False)
    new_size_human_readable = fs.du(db_path, human_readable=True)
    reduction_ratio = (1 - (new_size / original_size)) * 100
    log.info("done. new size {}. ({:.0f}% reduction)".format(
        new_size_human_readable, reduction_ratio), sep=".")
def test_remove_preprocessed(self):
    tmpdb = 'test_remove_preprocessed.db'
    fs.cp(tests.db_path('10-kernels-preprocessed'), tmpdb)

    self.assertEqual(8, dbutil.num_good_kernels(tmpdb))
    db = dbutil.connect(tmpdb)
    self.assertFalse(dbutil.is_modified(db))
    db.close()

    dbutil.remove_preprocessed(tmpdb)

    self.assertEqual(0, dbutil.num_good_kernels(tmpdb))
    db = dbutil.connect(tmpdb)
    self.assertTrue(dbutil.is_modified(db))
    db.close()

    fs.rm(tmpdb)
def process_sample_file(db_path: str, sample_path: str, first_only: bool=False,
                        max_kernel_len: int=5000, quiet: bool=False) -> None:
    """
    Fetch from a CLgen sample file.

    Arguments:
        db_path (str): Output path.
        sample_path (str): Sample path.
        first_only (bool, optional): If True, only fetch the first kernel in
            the sample.
        max_kernel_len (int, optional): Maximum kernel length.
        quiet (bool, optional): If True, suppress per-kernel progress output.
    """
    db = dbutil.connect(db_path)
    c = db.cursor()

    with open(sample_path) as infile:
        sample = infile.read()

    i = 0
    tail = 0
    offset = len('__kernel void ')
    while True:
        if not quiet:
            print('\r\033[Kkernel', i, end='')
            sys.stdout.flush()

        # Find the starting index of the next kernel.
        tail = sample.find('__kernel void ', tail)

        # If we didn't find another kernel, stop.
        if tail == -1:
            break

        # Find the end index of this kernel.
        head = clutil.get_cl_kernel_end_idx(sample, start_idx=tail,
                                            max_len=max_kernel_len)

        # Look for other ends
        end = sample.find('__kernel void ', tail + offset,
                          tail + offset + max_kernel_len)
        head = min(end, head) if end != -1 else head

        kernel = sample[tail:head]
        id = clgen.checksum_str(kernel)
        c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                  (id, kernel))
        tail = head
        i += 1
        if first_only:
            break
    if not quiet:
        print()
    db.commit()
    c.close()
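# Usage sketch (hypothetical paths): split a CLgen sample dump into individual
# kernels and insert them into a content database. The sample file name is a
# placeholder; only helpers already used elsewhere in this listing are assumed.
def _example_import_sample() -> None:
    db_path = "samples.db"                       # hypothetical output database
    if not fs.isfile(db_path):
        dbutil.create_db(db_path)
    process_sample_file(db_path, "sample.txt", quiet=True)  # hypothetical dump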
def _generate_kernel_corpus(self) -> list:
    """ dump all kernels into a list in a random order """
    db = dbutil.connect(self.contentcache["kernels.db"])
    c = db.cursor()

    # if preserving order, order by line count. Else, order randomly
    orderby = "LC(contents)" if self.opts["preserve_order"] else "RANDOM()"

    c.execute("SELECT PreprocessedFiles.Contents FROM PreprocessedFiles "
              "WHERE status=0 ORDER BY {orderby}".format(orderby=orderby))

    return [row[0] for row in c.fetchall()]
def contentfiles(self) -> Iterable[str]:
    """
    Return an iterator over all un-processed samples.

    Returns
    -------
    Iterable[str]
        Samples.
    """
    db = dbutil.connect(self.contentcache["kernels.db"])
    c = db.cursor()
    query = c.execute("SELECT Contents FROM ContentFiles")
    for row in query.fetchall():
        yield row[0]
def _generate_kernel_corpus(self) -> str:
    """ dump all kernels into a string in a random order """
    db = dbutil.connect(self.contentcache["kernels.db"])
    c = db.cursor()

    # if preserving order, order by line count. Else, order randomly
    orderby = "LC(contents)" if self.opts["preserve_order"] else "RANDOM()"

    c.execute("SELECT PreprocessedFiles.Contents FROM PreprocessedFiles "
              "WHERE status=0 ORDER BY {orderby}".format(orderby=orderby))

    # If file separators are requested, insert EOF markers between files
    sep = '\n\n// EOF\n\n' if self.opts["eof"] else '\n\n'
    return sep.join(row[0] for row in c.fetchall())
def preprocessed_kernels(corpus: Corpus) -> list:
    """
    Return an iterator over all preprocessed kernels.

    Arguments:
        corpus (Corpus): Corpus.

    Returns:
        sequence of str: Kernel sources.
    """
    assert isinstance(corpus, Corpus)
    db = dbutil.connect(corpus.contentcache["kernels.db"])
    c = db.cursor()
    query = c.execute("SELECT Contents FROM PreprocessedFiles WHERE status=0")
    for row in query.fetchall():
        yield row[0]
def run(self) -> None:
    i = dbutil.num_rows_in(self.db_path, "ContentFiles")

    if not log.is_verbose():
        bar = progressbar.ProgressBar(max_value=self.max_i)
        bar.update(self.progress())

    try:
        while True:
            sample_time = time()

            # Block while waiting for a new sample to come in:
            sample = self.queue.get(timeout=120).strip()

            # Compute the sample ID:
            kid = crypto.sha1_str(sample)

            # Add the new sample to the database:
            db = dbutil.connect(self.db_path)
            c = db.cursor()
            dbutil.sql_insert_dict(c, "ContentFiles", {
                "id": kid,
                "contents": sample
            }, ignore_existing=True)
            c.close()
            db.commit()
            db.close()

            # update progress bar
            progress = self.progress()
            if not log.is_verbose():
                bar.update(progress)

            sample_time = time() - sample_time
            self.sampler.stats["progress"] = progress
            self.sampler.stats["time"] += sample_time
            self.sampler._flush_meta(self.cache)

            # determine if we are done sampling
            if self.term_condition():
                self.producer.stop()
                return
    finally:
        # always kill the sampler thread
        print()
        self.producer.stop()
def _preprocess_db_worker(job: dict) -> None:
    """Database worker thread"""
    db_path = job["db_in"]
    db_index_range = job["db_index_range"]
    outpath = job["json_out"]
    log.debug("worker", os.getpid(), outpath)

    db = dbutil.connect(db_path)
    c = db.cursor()
    split_start, split_end = db_index_range
    split_size = split_end - split_start

    # get the files to preprocess
    c.execute('SELECT id,contents FROM ContentFiles LIMIT {} OFFSET {}'.format(
        split_size, split_start))

    with open(outpath, 'wb') as outfile:
        for row in c.fetchall():
            id, contents = row

            # Get checksum of cached file:
            c.execute('SELECT id FROM PreprocessedFiles WHERE id=?', (id,))
            result = c.fetchone()
            cached_id = result[0] if result else None

            # Check that file is modified:
            if id != cached_id:
                try:
                    # Try and preprocess it:
                    contents = preprocess(contents, id)
                    status = 0
                except BadCodeException as e:
                    contents = str(e)
                    status = 1
                except UglyCodeException as e:
                    contents = str(e)
                    status = 2

                # write result to json
                line = json.dumps([id, status, contents]).encode('utf-8')
                outfile.write(line)
                outfile.write('\n'.encode('utf-8'))

    c.close()
    db.close()
def stats_worker(db_path: str) -> list:
    """
    Generate dataset stats.
    """
    log.debug("stats worker ...")
    db = dbutil.connect(db_path)
    c = db.cursor()
    stats = []

    # ContentFiles
    c.execute("SELECT Count(DISTINCT id) from ContentFiles")
    nb_uniq_ocl_files = c.fetchone()[0]
    stats.append(('Number of content files', bigint(nb_uniq_ocl_files)))

    c.execute("SELECT contents FROM ContentFiles")
    code = c.fetchall()
    code_lcs = [len(x[0].split('\n')) for x in code]
    code_lcs.sort()
    code_lc = sum(code_lcs)
    stats.append(('Total content line count', bigint(code_lc)))

    stats.append(('Content file line counts', seq_stats(code_lcs)))
    stats.append(('', ''))

    # Preprocessed
    c.execute("SELECT Count(*) FROM PreprocessedFiles WHERE status=0")
    nb_pp_files = c.fetchone()[0]
    ratio_pp_files = div(nb_pp_files, nb_uniq_ocl_files)
    stats.append(('Number of good preprocessed files',
                  bigint(nb_pp_files) +
                  ' ({:.0f}%)'.format(ratio_pp_files * 100)))

    c.execute('SELECT contents FROM PreprocessedFiles WHERE status=0')
    bc = c.fetchall()
    pp_lcs = [len(x[0].split('\n')) for x in bc]
    pp_lcs.sort()
    pp_lc = sum(pp_lcs)
    ratio_pp_lcs = div(pp_lc, code_lc)
    stats.append(('Lines of good preprocessed code',
                  bigint(pp_lc) +
                  ' ({:.0f}%)'.format(ratio_pp_lcs * 100)))

    stats.append(('Good preprocessed line counts', seq_stats(pp_lcs)))
    stats.append(('', ''))

    return stats
def _scrape_github_for_files(db_path: str, github_username: str,
                             github_pw: str, github_token: str,
                             query_terms: List[str],
                             file_is_interesting, download_file_cb):
    global errors_counter

    g = Github(github_username, github_pw)
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    # fetch the repositories to iterate over
    for query in query_terms:
        # forks are okay - we use checksums to ensure uniqueness in
        # final dataset
        repos = g.search_repositories(query + ' fork:true sort:stars')

        for repo in repos:
            # do nothing unless the repo is new or modified
            if not _process_repo(g, db, repo):
                continue

            # iterate over the entire git tree of the repo's default branch
            # (usually 'master'). If a file ends with the .cl extension, check
            # to see if we already have it, else download it
            try:
                branch = repo.default_branch
                tree_iterator = repo.get_git_tree(branch, recursive=True).tree
                for f in tree_iterator:
                    if file_is_interesting(f):
                        try:
                            _process_file(g, github_token, db, repo, f,
                                          download_file_cb)
                        except Exception as e:
                            print(e)
                            sys.exit(1)
                            errors_counter += 1
            except GithubException:
                # do nothing in case of error (such as an empty repo)
                pass

    _print_counters()
    print("\n\ndone.")
    db.close()
def train(db_path: str, out_path: str, **kwargs) -> None:
    """
    Generate corpus.

    Arguments:
        db_path (str): Dataset.
        out_path (str): Corpus path.
        **kwargs (dict): Additional arguments to create_corpus().
    """
    db = dbutil.connect(db_path)
    db.create_function("LC", 1, linecount)

    # auto-detect whether it's a GitHub repo
    kwargs['gh'] = dbutil.is_github(db)

    ret = create_corpus(db, out_path, **kwargs)
    if ret:
        sys.exit(ret)
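# A minimal sketch of the one-argument helper registered above as the SQL
# function "LC". The real linecount() is defined elsewhere in the codebase;
# this assumed version simply counts newline-delimited lines, matching how
# line counts are computed in stats_worker().
def linecount(text: str) -> int:
    """Return the number of lines in a string (assumed implementation)."""
    return len(text.split('\n'))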
def explore(db_path: str, graph: bool = False) -> None:
    """
    Run exploratory analysis on dataset.

    Arguments:
        db_path (str): Path to dataset.
        graph (bool, optional): Render graphs.
    """
    locale.setlocale(locale.LC_ALL, 'en_GB.utf-8')

    db = dbutil.connect(db_path)

    if dbutil.is_github(db):
        db.close()
        explore_gh(db_path)
        return

    if graph and not os.path.exists(IMG_DIR):
        os.makedirs(IMG_DIR)

    # Worker process pool
    pool, jobs = Pool(processes=4), []
    if graph:
        jobs.append(pool.apply_async(graph_ocl_lc, (db_path,)))
        # TODO: If GH dataset:
        # jobs.append(pool.apply_async(graph_ocl_stars, (db_path,)))
    future_stats = pool.apply_async(stats_worker, (db_path,))

    # Wait for jobs to finish
    [job.wait() for job in jobs]

    # Print stats
    print()
    stats = future_stats.get()
    maxlen = max([len(x[0]) for x in stats])
    for stat in stats:
        k, v = stat
        if k:
            print(k, ':', ' ' * (maxlen - len(k) + 2), v, sep='')
        elif v == '':
            print(k)
        else:
            print()
def _finalize(db_path, cache):
    """Tidy up after worker threads finish"""
    log.debug("worker finalize")

    db = dbutil.connect(db_path)
    c = db.cursor()

    # import results from worker threads
    for outpath in fs.ls(cache.path, abspaths=True):
        with open(outpath) as infile:
            for line in infile:
                c.execute('INSERT OR REPLACE INTO PreprocessedFiles '
                          'VALUES(?,?,?)', json.loads(line))

    # write changes to database and remove cache
    db.commit()
    db.close()
    cache.empty()
def preprocess_db(db_path: str) -> bool:
    """
    Preprocess database contents.

    Arguments:
        db_path (str): Path to database.

    Returns:
        bool: True if modified, false if no work needed.
    """
    db = dbutil.connect(db_path)

    modified = dbutil.is_modified(db)
    if modified:
        preprocess_contentfiles(db_path)
        dbutil.set_modified_status(db, modified)
        return True
    else:
        return False
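# Usage sketch (hypothetical path): run the preprocessor only when the
# database has changed since the last run, relying on the modified flag that
# preprocess_db() checks via dbutil.is_modified().
def _example_preprocess() -> None:
    db_path = "kernels.db"  # hypothetical database
    if preprocess_db(db_path):
        log.info("preprocessed", db_path)
    else:
        log.info("nothing to do for", db_path)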
def test_insert():
    db_path = tests.data_path("db", "tmp.db", exists=False)
    fs.rm(db_path)

    dbutil.create_db(db_path)
    db = dbutil.connect(db_path)
    c = db.cursor()

    assert dbutil.num_rows_in(db_path, "ContentFiles") == 0

    dbutil.sql_insert_dict(c, "ContentFiles", {"id": "a", "contents": "foo"})
    dbutil.sql_insert_dict(c, "PreprocessedFiles", {
        "id": "a",
        "status": 0,
        "contents": "bar"
    })
    dbutil.sql_insert_dict(c, "PreprocessedFiles", {
        "id": "b",
        "status": 1,
        "contents": "car"
    })
    db.commit()
    c = db.cursor()

    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

    assert dbutil.cc(db_path, "ContentFiles", "contents") == 3
    assert dbutil.cc(db_path, "ContentFiles", "id") == 1
    assert dbutil.lc(db_path, "ContentFiles", "contents") == 1

    dbutil.remove_bad_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    # remove_bad_preprocessed doesn't actually delete any rows, just
    # replaces contents
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

    dbutil.remove_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 0
def preprocessed(self, status: int=0) -> Iterable[str]:
    """
    Return an iterator over all preprocessed kernels.

    Parameters
    ----------
    status : int, optional
        Pre-processed status, {0, 1, 2} for {good, bad, ugly}.

    Returns
    -------
    Iterable[str]
        Sources.
    """
    db = dbutil.connect(self.contentcache["kernels.db"])
    c = db.cursor()
    query = c.execute(
        "SELECT Contents FROM PreprocessedFiles WHERE status={status}"
        .format(**vars()))
    for row in query.fetchall():
        yield row[0]
def merge(outpath, inpaths=[]):
    if not fs.isfile(outpath):
        dbutil.create_db(outpath)
        log.info("created", outpath)

    db = dbutil.connect(outpath)

    if not inpaths:
        inpaths = get_all_sampler_datasets()

    for inpath in inpaths:
        log.info("merging from", inpath)
        c = db.cursor()
        c.execute("ATTACH '{}' AS rhs".format(inpath))
        c.execute("INSERT OR IGNORE INTO ContentFiles "
                  "SELECT * FROM rhs.ContentFiles")
        c.execute("INSERT OR IGNORE INTO PreprocessedFiles "
                  "SELECT * FROM rhs.PreprocessedFiles")
        c.execute("DETACH rhs")
        db.commit()

    explore.explore(outpath)
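# Usage sketch (hypothetical paths): merge two sampler databases into a single
# combined dataset. merge() creates the output database if needed and relies
# on INSERT OR IGNORE for de-duplication by content ID.
def _example_merge() -> None:
    merge("combined.db", ["sampler-a.db", "sampler-b.db"])  # hypothetical paths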
def get_clsmith_program(db_path: str,
                        header_paths: list=["~/clsmith/runtime",
                                            "~/clsmith/build"]) -> None:
    """
    Generate a program using CLSmith and add to dataset.

    Arguments:
        db_path (str): Path to output dataset.
        header_paths (str[]): Directories containing CLSmith headers.
    """
    global files_new_counter

    outputpath = 'CLProg.c'

    db = dbutil.connect(db_path)
    c = db.cursor()

    # TODO: CLSmith might not be in path
    cmd = ["CLSmith"]

    process = Popen(cmd)
    process.communicate()

    if process.returncode != 0:
        raise CLSmithException()

    with open(outputpath) as infile:
        contents = infile.read()

    contents = inline_clsmith_headers(contents, header_paths)

    sha = sha1(contents.encode('utf-8')).hexdigest()

    c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
              (sha, contents))
    db.commit()
    db.close()

    files_new_counter += 1
    print_clsmith_counters()
def preprocess_db(db_path: str, **preprocess_opts) -> bool:
    """
    Preprocess database contents.

    Parameters
    ----------
    db_path : str
        Path to database.
    **preprocess_opts
        Keyword arguments forwarded to _preprocess_db().

    Returns
    -------
    bool
        True if modified, false if no work needed.
    """
    db = dbutil.connect(db_path)

    modified = dbutil.is_modified(db)
    if modified:
        _preprocess_db(db_path, **preprocess_opts)
        dbutil.set_modified_status(db, modified)
        return True
    else:
        return False
def clsmith(db_path: str, target_num_kernels: int) -> None:
    """
    Generate kernels using CLSmith.

    Arguments:
        db_path (str): Path to dataset.
        target_num_kernels (int): Number of kernels to generate.
    """
    global errors_counter

    print('generating', target_num_kernels, 'kernels to', db_path)

    db = dbutil.connect(db_path)
    c = db.cursor()

    c.execute('SELECT Count(*) FROM ContentFiles')
    num_kernels = c.fetchone()[0]
    while num_kernels < target_num_kernels:
        get_clsmith_program(db_path)
        c.execute('SELECT Count(*) FROM ContentFiles')
        num_kernels = c.fetchone()[0]

    print_counters()
    print("\n\ndone.")
    db.close()
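# Usage sketch (hypothetical path and target count): keep generating CLSmith
# programs until the ContentFiles table holds at least 1000 entries. Assumes
# the CLSmith binary is on PATH, as noted in get_clsmith_program().
def _example_generate_clsmith_corpus() -> None:
    db_path = "clsmith.db"          # hypothetical database path
    if not fs.isfile(db_path):
        dbutil.create_db(db_path)
    clsmith(db_path, 1000)          # hypothetical target kernel count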
def github(db_path: str, github_username: str, github_pw: str,
           github_token: str) -> None:
    """
    Download all of the OpenCL on GitHub (!)

    Shortcomings of this approach:
        * Only includes exclusively OpenCL files, no inline strings.
        * Occasionally (< 1%) can't find headers to include.

    Arguments:
        db_path (str): Dataset path.
        github_username (str): Authorization.
        github_pw (str): Authorization.
        github_token (str): Authorization.
    """
    global errors_counter

    g = Github(github_username, github_pw)
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    handle_repo = partial(process_repo, g, db)

    # fetch the repositories to iterate over. Since OpenCL isn't treated as a
    # first-class language by GitHub, we can't use the 'language=' keyword for
    # queries, so instead we cast a much wider net and filter the results
    # afterwards.
    query_terms = [
        'opencl',
        'cl',
        'khronos',
        'gpu',
        'gpgpu',
        'cuda',
        'amd',
        'nvidia',
        'heterogeneous'
    ]
    for query in query_terms:
        # forks are okay - we use checksums to ensure uniqueness in
        # final dataset
        repos = g.search_repositories(query + ' fork:true sort:stars')

        for repo in repos:
            repo_modified = handle_repo(repo)

            # do nothing unless the repo is new or modified
            if not repo_modified:
                continue

            handle_file = partial(process_file, g, github_token, db, repo)

            # iterate over the entire git tree of the repo's default branch
            # (usually 'master'). If a file ends with the .cl extension, check
            # to see if we already have it, else download it
            try:
                branch = repo.default_branch
                tree_iterator = repo.get_git_tree(branch, recursive=True).tree
                for f in tree_iterator:
                    try:
                        handle_file(f)
                    except Exception:
                        errors_counter += 1
            except GithubException:
                # do nothing in case of error (such as an empty repo)
                pass

    print_counters()
    print("\n\ndone.")
    db.close()