def train(db_path: str, out_path: str, **kwargs) -> None:
    """
    Generate corpus.

    Arguments:
        db_path (str): Dataset.
        out_path (str): Corpus path.
        **kwargs (dict): Additional arguments to create_corpus().
    """
    db = dbutil.connect(db_path)
    db.create_function("LC", 1, linecount)

    # auto-detect whether it's a GitHub repo
    kwargs['gh'] = dbutil.is_github(db)

    ret = create_corpus(db, out_path, **kwargs)
    if ret:
        sys.exit(ret)
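
# NOTE: `linecount` is registered above as the SQL function "LC" but is not
# defined in this file. A minimal sketch of the assumed behaviour, i.e.
# counting the number of lines in a blob of text; the real helper may differ:
def linecount(t: str) -> int:
    """Number of lines in text (assumed implementation)."""
    return len(t.split('\n'))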
def explore(db_path: str, graph: bool = False) -> None:
    """
    Run exploratory analysis on dataset.

    Arguments:
        db_path (str): Path to dataset.
        graph (bool, optional): Render graphs.
    """
    locale.setlocale(locale.LC_ALL, 'en_GB.utf-8')

    db = dbutil.connect(db_path)
    if dbutil.is_github(db):
        db.close()
        explore_gh(db_path)
        return

    if graph and not os.path.exists(IMG_DIR):
        os.makedirs(IMG_DIR)

    # Worker process pool
    pool, jobs = Pool(processes=4), []

    if graph:
        jobs.append(pool.apply_async(graph_ocl_lc, (db_path,)))
        # TODO: If GH dataset:
        # jobs.append(pool.apply_async(graph_ocl_stars, (db_path,)))

    future_stats = pool.apply_async(stats_worker, (db_path,))

    # Wait for jobs to finish
    for job in jobs:
        job.wait()

    # Print stats
    print()
    stats = future_stats.get()
    maxlen = max(len(x[0]) for x in stats)
    for k, v in stats:
        if k:
            print(k, ':', ' ' * (maxlen - len(k) + 2), v, sep='')
        else:
            # empty-key rows are blank separator lines
            print()
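
# `stats_worker` is dispatched to the process pool above but is not defined
# in this file. A minimal sketch, assuming only that it returns the same
# (key, value) tuple list that the printing loop consumes, with ('', '')
# entries acting as blank separators:
def stats_worker(db_path: str) -> list:
    """Compute dataset statistics (hypothetical sketch of the worker's shape)."""
    db = dbutil.connect(db_path)
    c = db.cursor()
    c.execute("SELECT Count(DISTINCT id) FROM ContentFiles")
    nb_files = c.fetchone()[0]
    db.close()
    return [('Number of content files', '{:,}'.format(nb_files)), ('', '')]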
def fetch_repos(db_path: Path, indir: Path, lang: clgen.Language) -> None:
    """
    Import repositories checked out under `indir` into a GitHub dataset.

    Arguments:
        db_path (Path): Path to GitHub dataset.
        indir (Path): Directory containing the cloned repositories.
        lang (clgen.Language): Programming language of files to import.
    """
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    c = db.cursor()

    for directory in fs.ls(indir, abspaths=True):
        # hacky hardcoded interpretation of `git remote -v`
        gitdir = fs.path(directory, ".git")
        output = subprocess.check_output(
            ["git", "--git-dir", gitdir, "remote", "-v"],
            universal_newlines=True)
        url = output.split("\n")[0].split("\t")[1].split(" ")[0]
        name = fs.basename(directory)

        # timestamp of the HEAD commit
        output = subprocess.check_output(
            f"git --git-dir {gitdir} rev-list --format=format:'%ai' "
            f"--max-count=1 $(git --git-dir {gitdir} rev-parse HEAD) | tail -n1",
            shell=True, universal_newlines=True)
        try:
            updated_at = dateutil.parser.parse(output)
        except ValueError:
            log.error(f"failed to process {name} {url}")
            continue

        c.execute("SELECT updated_at FROM Repositories WHERE url=?", (url,))
        cached_updated_at = c.fetchone()

        # Do nothing unless updated timestamps don't match
        # if cached_updated_at and cached_updated_at[0] >= updated_at:
        #     log.verbose(name, "already in database")
        #     continue

        c.execute("DELETE FROM Repositories WHERE url=?", (url,))
        c.execute("INSERT INTO Repositories VALUES(?,?,?,?,?,?,?,?,?)",
                  (url, "<unknown>", name, 0, 0, 0, 0, updated_at, updated_at))

        name_str = " -o ".join(
            f"-name '*{ext}'" for ext in clgen.file_extensions(lang))
        output = subprocess.check_output(
            f"find {directory} -type f {name_str} | grep -v '.git/' || true",
            shell=True, universal_newlines=True)
        files = [x.strip() for x in output.split("\n") if x.strip()]

        # nothing to import
        if not files:
            # log.verbose("no files in", name)
            continue

        log.verbose("processing", len(files), "files in", name)
        for path in files:
            relpath = path[len(directory) + 1:]
            try:
                contents = inline_fs_headers(path, [], lang=lang)
                sha = crypto.sha1_str(contents)
                c.execute("INSERT OR IGNORE INTO ContentFiles VALUES(?,?)",
                          (sha, contents))
                c.execute("INSERT OR IGNORE INTO ContentMeta VALUES(?,?,?,?,?)",
                          (sha, relpath, url, sha, len(contents)))
            except UnicodeDecodeError:
                log.warning("non UTF-8 file", path)

        db.commit()
        c = db.cursor()
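
# The nine-column INSERT above implies a Repositories schema along the
# following lines. This is an assumption reconstructed from the values bound
# here (url, owner, name, four numeric GitHub counters, created/updated
# timestamps); the authoritative schema lives in dbutil.
_ASSUMED_REPOSITORIES_SCHEMA = """
CREATE TABLE IF NOT EXISTS Repositories (
    url          TEXT NOT NULL UNIQUE,  -- parsed from `git remote -v`
    owner        TEXT,                  -- "<unknown>" for locally fetched repos
    name         TEXT,
    fork         INTEGER,
    stars        INTEGER,
    contributors INTEGER,
    forks        INTEGER,
    created_at   DATETIME,
    updated_at   DATETIME
)
"""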
def explore(db_path: str) -> None:
    """
    Run exploratory analysis on dataset.

    Parameters
    ----------
    db_path : str
        Path to dataset.
    """
    locale.setlocale(locale.LC_ALL, 'en_GB.utf-8')

    db = dbutil.connect(db_path)
    if dbutil.is_github(db):
        db.close()
        explore_gh(db_path)
        return

    c = db.cursor()
    stats = []

    # ContentFiles
    c.execute("SELECT Count(DISTINCT id) FROM ContentFiles")
    nb_uniq_ocl_files = c.fetchone()[0]
    stats.append(('Number of content files', _bigint(nb_uniq_ocl_files)))

    c.execute("SELECT contents FROM ContentFiles")
    code = c.fetchall()
    code_lcs = [len(x[0].split('\n')) for x in code]
    code_lcs.sort()
    code_lc = sum(code_lcs)
    stats.append(('Total content line count', _bigint(code_lc)))
    stats.append(('Content file line counts', _seq_stats(code_lcs)))
    stats.append(('', ''))

    # Preprocessed
    c.execute("SELECT Count(*) FROM PreprocessedFiles")
    nb_pp_files = c.fetchone()[0]
    ratio_pp_files = _safe_div(nb_pp_files, nb_uniq_ocl_files)
    stats.append(('Number of preprocessed files',
                  _bigint(nb_pp_files) +
                  ' ({:.0f}%)'.format(ratio_pp_files * 100)))

    c.execute("SELECT Count(*) FROM PreprocessedFiles WHERE status=0")
    nb_pp_files = c.fetchone()[0]
    ratio_pp_files = _safe_div(nb_pp_files, nb_uniq_ocl_files)
    stats.append(('Number of good preprocessed files',
                  _bigint(nb_pp_files) +
                  ' ({:.0f}%)'.format(ratio_pp_files * 100)))

    c.execute("SELECT contents FROM PreprocessedFiles WHERE status=0")
    bc = c.fetchall()
    pp_lcs = [len(x[0].split('\n')) for x in bc]
    pp_lcs.sort()
    pp_lc = sum(pp_lcs)
    ratio_pp_lcs = _safe_div(pp_lc, code_lc)
    stats.append(('Lines of good preprocessed code',
                  _bigint(pp_lc) +
                  ' ({:.0f}%)'.format(ratio_pp_lcs * 100)))
    stats.append(('Good preprocessed line counts', _seq_stats(pp_lcs)))
    stats.append(('', ''))

    # Print stats
    print()
    maxlen = max(len(x[0]) for x in stats)
    for k, v in stats:
        if k:
            print(k, ':', ' ' * (maxlen - len(k) + 2), v, sep='')
        else:
            # empty-key rows are blank separator lines
            print()
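
# The three formatting helpers used above are not defined in this file.
# Minimal sketches, assuming: _bigint adds thousands separators via the
# locale set in explore(), _safe_div guards against a zero denominator, and
# _seq_stats summarises a pre-sorted sequence of line counts. The exact
# output formats are assumptions.
def _bigint(n) -> str:
    """Format an integer with grouped thousands (assumed behaviour)."""
    return locale.format_string('%d', n, grouping=True)


def _safe_div(x, y):
    """Division that returns 0 instead of raising on a zero denominator."""
    return x / y if y else 0


def _seq_stats(seq: list) -> str:
    """Summarise a pre-sorted numeric sequence (assumed output format)."""
    if not seq:
        return "(empty)"
    mean = sum(seq) / len(seq)
    median = seq[len(seq) // 2]
    return (f"min: {seq[0]}, mean: {mean:.1f}, "
            f"median: {median}, max: {seq[-1]}")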
def test_is_github(self):
    self.assertFalse(dbutil.is_github(tests.db('empty')))
    self.assertTrue(dbutil.is_github(tests.db('empty-gh')))
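
# `tests.db` is a fixture helper from the test package, not defined here. A
# plausible sketch, assuming it copies a named, pristine database fixture to
# a scratch path and returns a connection to the copy so tests cannot mutate
# the original. The fixture location and return type are assumptions.
def db(name: str):
    """Connection to a disposable copy of a test database (hypothetical sketch)."""
    import os
    import shutil
    import tempfile
    src = os.path.join(os.path.dirname(__file__), 'data', 'db', name + '.db')
    dst = os.path.join(tempfile.mkdtemp(), name + '.db')
    shutil.copyfile(src, dst)
    return dbutil.connect(dst)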