def analyze(args):
    """Collect and analyze a single repository as specified by CLI ``args``.

    Builds a WST_ArangoTreeCollector from the parsed arguments, runs setup
    (clone + DB connect), then performs the full collection.

    Raises:
        RepoExistsError: when the repo document already exists and
            ``--skip-exists`` was not given.
        Exception: anything raised by ``collect_all`` is logged and re-raised.
    """
    collector = WST_ArangoTreeCollector(
        args.repo_url,
        workers=args.workers,
        database_conn=args.db,
        commit_sha=args.target_commit,
    )
    collector.setup()
    log.debug(f"Set up collector: {collector}")

    if args.interactive_debug:
        # drop into the debugger after setup so the collector state can be inspected
        log.warn("Starting debugging:")
        bpdb.set_trace()

    try:
        collector.collect_all(overwrite_incomplete=args.overwrite_incomplete)
    except RepoExistsError:  # binding removed: the exception object was unused
        if args.skip_exists:
            log.warn(
                f"Skipping collection since repo document already present for commit {collector._current_commit_hash}"
            )
            return
        else:
            raise
    except Exception:
        log.crit(f"{collector} run failed.")
        # bare `raise` re-raises the active exception with its original
        # traceback intact (was `raise e`, which adds a redundant frame)
        raise
def setup(self): """Clone the repo, connect to the DB, create working directories, etc.""" self._connect_db() repo = self._get_git_repo() if self._current_commit is None: log.warn( f"Deleting and re-cloning repo in {self._local_repo_path}") try: shutil.rmtree(self._local_repo_path) repo = self._get_git_repo() except Exception as e: log.error(f"Failed to repair repository: {type(e)}: {e}") raise e # to _target_commit if set if self._target_commit and self._target_commit != self._current_commit_hash: log.info(f"Checking out commit {self._target_commit}...") try: commit = repo.get(self._target_commit) log.debug(f"target commit {commit}") # commit might not exist for a variety of reasons (need to fetch, DNE, corrupt, etc) repo.checkout_tree(commit.tree) repo.head.set_target(commit.id) except Exception as e: raise e log.info( f"Repo at {self._local_repo_path} now at {self._current_commit_hash}" ) elif self._target_commit and self._target_commit == self._current_commit_hash: log.debug( f"Repo in {self._local_repo_path} is already at {self._target_commit}" )
def __main__():
    """CLI entry point: run an S-expression query and log every matched node."""
    parser = argparse.ArgumentParser()
    parser.add_argument("query", type=str, help="S-exp query to execute")
    parser.add_argument("-v",
                        "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    parser.add_argument("--node-text",
                        help="Show the text content of the matched nodes",
                        action="store_true")
    args = parser.parse_args()

    if args.verbose:
        log.setLevel(log.DEBUG)
        log.debug("Verbose logging enabled.")

    parsed_query = sexp.parseString(args.query)
    log.debug(parsed_query)

    # collect matches so we can report a total at the end
    matched = []
    for node in find_nodes_by_query(parsed_query):
        log.info(f"{node} in {node.file.fetch()}")
        rendered = node_as_sexp(node,
                                maxdepth=3,
                                indent=2,
                                show_start_coords=True)
        log.info(f"{rendered}")
        if args.node_text:
            log.info(f"{node.text.fetch()}")
        matched.append(node)
    log.info(f"{len(matched)} results returned")
def database_init(args):
    """Create (or optionally wipe and recreate) the WST collections and graph.

    Connects to the database named in the ``args.db`` URI. With
    ``args.delete`` set, the graph and all document/edge collections are
    dropped first via async jobs, polled until completion.
    """
    client = ArangoClient(hosts=strip_url(args.db))
    p = urlparse(args.db)
    odb = client.db(p.path[1:], username=p.username, password=p.password)
    if args.delete:
        log.warn(f"deleting all data ...")
        # deleting old stuff could take awhile: queue the drops as async jobs
        jobs = []
        db = odb.begin_async_execution()
        jobs.append(
            db.delete_graph(tree_models._graph_name, ignore_missing=True))
        for c in _db_collections:
            jobs.append(db.delete_collection(c, ignore_missing=True))
        for c in _db_edgecollections:
            jobs.append(db.delete_collection(c, ignore_missing=True))
        jt_wait = len(jobs)
        while len(jobs) > 0:
            time.sleep(1)
            # BUGFIX: the original removed items from `jobs` while iterating
            # over it, which skips the element following each removal and can
            # delay loop exit; rebuilding the list avoids that.
            jobs = [j for j in jobs if j.status() != 'done']
            if jt_wait != len(jobs):
                log.debug(f"delete: waiting on {len(jobs)} jobs to finish ...")
                jt_wait = len(jobs)
    # back to non-async (also the default connection when --delete not given)
    db = odb
    log.info(f"Creating collections ...")
    colls = {}
    for cn in _db_collections:
        if db.has_collection(cn):
            colls[cn] = db.collection(cn)
        else:
            colls[cn] = db.create_collection(cn, user_keys=True)
    for cn in _db_edgecollections:
        if db.has_collection(cn):
            colls[cn] = db.collection(cn)
        else:
            colls[cn] = db.create_collection(cn, user_keys=True, edge=True)
    # ensure the named graph exists
    graph = None
    if not db.has_graph(tree_models._graph_name):
        graph = db.create_graph(tree_models._graph_name)
    else:
        graph = db.graph(tree_models._graph_name)
    # add any missing edge definitions to the graph
    edgedefs = {}
    for gk, gv in _graph_edge_definitions.items():
        if not graph.has_edge_definition(gv['edge_collection']):
            log.debug(f"Added graph edges {gv}")
            edgedefs[gk] = graph.create_edge_definition(**gv)
def collect_all(self, existing_node_q=None, overwrite_incomplete: bool = False): """Creates every node down the tree for this repo""" # create the main Repos self._tree_repo = WSTRepository( type='git', url=self.repo_url, path=self._url_path, analyzed_time=int(time.time()), wst_status="started", ) # self._coll['wstrepos'].insert(nr.__dict__) try: self._tree_repo.insert_in_db(self._db) except arango.exceptions.DocumentInsertError as e: if e.http_code == 409: existing_repo = WSTRepository.get(self._db, self._tree_repo._key) if overwrite_incomplete and existing_repo.wst_status != "completed": log.warn( f"Overwriting prior WSTRespository, status was '{existing_repo.wst_status}'" ) self._tree_repo.update_in_db(self._db) else: raise RepoExistsError(f"Already present: {existing_repo}") else: raise e # attempt to find an existing commit in the db: if not (commit := WSTCommit.get(self._db, self._current_commit_hash)): _cc = self._current_commit self._wst_commit = WSTCommit( _key=_cc.hex, commit_time=_cc.commit_time, commit_time_offset=_cc.commit_time_offset, parent_ids=[str(i) for i in _cc.parent_ids], tree_id=str(_cc.tree_id), ) log.debug(f"Inserting {self._wst_commit}") self._wst_commit.insert_in_db(self._db)
def _tqdm_node_receiver(q, en_manager):
    """This is the cross-process aggregator for non-required data

    Even without this process the collection and analysis should run normally.
    It's mostly just used for debugging and informational output.

    Consumes from queue ``q`` until a ``None`` sentinel arrives. Items are
    either plain ints (document-insert counts) or tagged tuples:
    ``("cache_stats", {...})`` or ``("dedup_stats", key, count)``.
    """
    # NOTE(review): the matching except/finally for this `try` lies outside
    # the visible chunk
    try:
        log.debug(f"start counting db inserts...")
        n = 0  # running total of inserted documents
        cache_stats = {
            "text_lfu_hit": 0,
            "text_lfu_miss": 0,
        }
        dedup_stats = {}
        cntr = en_manager.counter(desc="writing to db",
                                  position=1,
                                  unit='docs',
                                  autorefresh=True)
        # with tqdm(desc="writing documents to db", position=1, unit='docs', unit_scale=True) as tbar:
        # loop until the None sentinel is queued by the producer side
        while (nc := q.get()) is not None:
            if type(nc) == int:
                n += nc
                cntr.update(nc)
            elif nc[0] == "cache_stats":
                # merge per-worker cache counters into the totals
                for k, v in nc[1].items():
                    cache_stats[k] += v
            elif nc[0] == "dedup_stats":
                if nc[1] not in dedup_stats:
                    dedup_stats[nc[1]] = 0
                dedup_stats[nc[1]] += nc[2]
            else:
                log.error(
                    f"node receiver process got invalid data sent of type {type(nc)}"
                )
        log.info(f"stopped counting nodes, total documents inserted: {n}")
        # `or 1` guards against division by zero when there were no misses
        cache_text_lfu_ratio = cache_stats["text_lfu_hit"] / (
            cache_stats["text_lfu_miss"] or 1)
        log.debug(
            f"text_lfu cache stats: ratio {cache_text_lfu_ratio}, hit {cache_stats['text_lfu_hit']}"
        )
        return True
def build_dask_dataframe_for_file(lang: TreeSitterAutoBuiltLanguage, file: str):
    """Parse ``file`` with ``lang`` and return its syntax nodes as a
    single-partition dask DataFrame.

    Columns: repo (always -1 here), file path, start/end coordinates,
    node type, and raw node text bytes.
    """
    tree = lang.parse_file(file)
    walker = TreeSitterCursorIterator(tree.walk())
    log.debug(f"{walker}")

    columns = ["repo", "file", "x1", "y1", "x2", "y2", "type", "text"]
    rows = []
    for node in walker:
        row = [-1, file]
        row.extend(node.start_point)
        row.extend(node.end_point)
        row.append(node.type)
        row.append(node.text.tobytes())
        rows.append(row)

    bag = db.from_sequence(rows)
    frame = bag.to_dataframe(columns=columns)
    return frame.persist().repartition(1)
def __main__():
    """Demo driver: fan ``slow_worker`` jobs out to a process pool while
    tracking overall progress with multiprogress counters."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--nworkers",
                        type=int,
                        help="number of workers",
                        default=os.cpu_count())
    parser.add_argument("--jobitems",
                        type=int,
                        help="items in a single job",
                        default=200)
    parser.add_argument("--njobs",
                        type=int,
                        help="number of total jobs to complete",
                        default=200)
    parser.add_argument("--itemtime",
                        type=float,
                        help="time taken per item in a job",
                        default=0.1)
    parser.add_argument("-v",
                        "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    args = parser.parse_args()

    if args.verbose:
        log.setLevel(log.DEBUG)
        log.debug("Verbose logging enabled.")

    with ProcessPool(max_workers=args.nworkers) as pool:
        # progress server must be running before workers try to report
        multiprogress.main_proc_setup()
        multiprogress.start_server_thread()
        manager_proxy = multiprogress.get_manager_proxy()
        manager = multiprogress.get_manager()

        log.info(f"Starting jobs...")
        pending = []
        for job_index in range(args.njobs):
            pending.append(
                pool.schedule(
                    slow_worker,
                    (job_index, args.jobitems, args.itemtime, manager_proxy)))

        log.info(f"Waiting for jobs to complete...")
        jobs_counter = manager.counter(desc="all jobs",
                                       total=args.njobs,
                                       color='blue')
        log.debug(f"cntr_all_jobs: {repr(jobs_counter)}")
        for finished in futures.as_completed(pending):
            finished.result()  # surface any worker exception
            log.debug(f"finished a job!")
            jobs_counter.update()
        log.info(f"All jobs completed!")
def __main__():
    """Top-level CLI for the collector: parses global options, dispatches to
    the subcommand handler stored in ``args.func``, and on Ctrl-C tears down
    any child processes before re-raising."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--db",
                        "--database",
                        type=str,
                        help="Database connection string",
                        default=os.environ.get(
                            'WST_DB_URI',
                            "http://*****:*****@localhost:8529/wst"))
    parser.add_argument("-v",
                        "--verbose",
                        help="Increase output verbosity",
                        action="store_true")
    # shared enlighten manager available to all subcommands via args
    parser.set_defaults(en_manager=enlighten.get_manager())
    subcmds = parser.add_subparsers(title="Collector commands")
    # analysis
    cmd_analyze = subcmds.add_parser('analyze',
                                     aliases=['add', 'a'],
                                     help="Analyze repositories")
    cmd_analyze.set_defaults(func=analyze)
    cmd_analyze.add_argument("repo_url",
                             type=str,
                             help="URI for cloning the repository")
    cmd_analyze.add_argument(
        "-w",
        "--workers",
        type=int,
        help=
        "Number of workers to use for processing files, default: os.cpu_count()",
        default=None)
    cmd_analyze.add_argument(
        "--skip-exists",
        "--skip-existing",
        action="store_true",
        help=
        "Skip the analysis if the repo document already exists in the database"
    )
    cmd_analyze.add_argument(
        "--interactive-debug",
        action="store_true",
        help="Start the interactive debugger after repo setup")
    cmd_analyze.add_argument(
        "--overwrite-incomplete",
        action="store_true",
        help="Overwrite existing but incomplete / unfinished data in the DB")
    cmd_analyze.add_argument(
        "-t",
        "--target-commit",
        type=str,
        help="Checkout and analyze a specific commit from the repo",
        default=None)
    # batch analysis
    cmd_batch = subcmds.add_parser(
        'batch',
        aliases=['addbatch', 'addmulti'],
        help="Analyze multiple repos from a JSON specification list")
    set_batch_analyze_args(cmd_batch)
    # delete data selectively
    cmd_delete = subcmds.add_parser('delete',
                                    aliases=['del'],
                                    help="Delete tree data selectively")
    cmd_delete.set_defaults(func=delete)
    cmd_delete.add_argument(
        "which_repo",
        type=str,
        help="URI or commit SHA for which repo's data to delete")
    # db setup
    cmd_db = subcmds.add_parser('db',
                                aliases=['database'],
                                help="Manage the database")
    subcmds_db = cmd_db.add_subparsers(title="Manage the database")
    cmd_db_init = subcmds_db.add_parser('initialize',
                                        aliases=['init', 'setup'],
                                        help="Set up the database")
    cmd_db_init.set_defaults(func=database_init)
    cmd_db_init.add_argument(
        "-d",
        "--delete",
        help="Delete any existing data in the database",
        action="store_true",
    )
    args = parser.parse_args()
    if args.verbose:
        log.setLevel(log.DEBUG)
        log.debug("Verbose logging enabled.")
    log.info(f"DB connection: {desensitize_url(args.db)}")
    # argparse.Namespace supports `in`; no subcommand means no func default
    if 'func' not in args:
        log.warn(f"Please supply a valid subcommand!")
        return
    try:
        args.func(args)
    except KeyboardInterrupt as e:
        # interrupt children first, then terminate any survivors
        log.warn(f"Stopping all child processes...")
        cur_proc = psutil.Process()
        children = cur_proc.children(recursive=True)
        for c in children:
            os.kill(c.pid, signal.SIGINT)
        psutil.wait_procs(children, timeout=5)
        # re-scan: some children may have exited during the wait
        children = cur_proc.children(recursive=True)
        for c in children:
            c.terminate()
        raise e
# Script fragment: parse a file with tree-sitter and insert test repo/file
# documents into the DB. `parser` is defined earlier, outside this chunk.
args = parser.parse_args()
if args.verbose:
    log.setLevel(log.DEBUG)
client = ArangoClient(hosts=strip_url(args.db))
p = urlparse(args.db)
db = client.db(p.path[1:], username=p.username, password=p.password)
lang = TreeSitterAutoBuiltLanguage(args.language)
tree = lang.parse_file(args.file_path)
cur = tree.walk()
# nodefilter accepts every node (no filtering)
cur = TreeSitterCursorIterator(cur, nodefilter=lambda x: True)
log.debug(cur)
root = cur.peek()
# fixed _key with a random test_id in url/path so repeated runs collide on key
test_id = str(uuid.uuid4().hex)
repo = WSTRepository(
    _key="wst0test0461b1c841f897cbd952354370471a64",
    type='test',
    url=f"wst.tests.insertion/{test_id}",
    commit="wst0test0461b1c841f897cbd952354370471a64",
    path=f"wst/tests/{test_id}",
)
repo.insert_in_db(db)
# NOTE(review): constructor continues beyond this visible chunk
file = WSTFile(
    _key="wst0test0461b1c841f897cbd952354370471a64-0",
def _local_repo_path(self): cachedir = LocalCache.get_local_cache_dir() / 'collector_repos' if not cachedir.exists(): cachedir.mkdir(mode=0o770, exist_ok=True) log.debug(f"created dir {cachedir}") return cachedir.joinpath(self._url_path)
def batch_analyze(args):
    """Analyze every repo listed in a JSON spec file, fanning work out to a
    process pool. Each completed repo's original spec dict is stored back on
    its WSTRepository document under ``wst_extra`` with the batch id."""
    repo_list_file = Path(args.repo_list_file)
    if not repo_list_file.exists():
        # NOTE(review): no return/raise here — execution falls through and
        # open() below raises FileNotFoundError anyway; an early return was
        # probably intended
        log.err(f"Input file not found: {args.repo_list_file}")
    try:
        with repo_list_file.open('r') as f:
            repolist = json.load(f)
    except Exception as e:
        log.err(f"Failed to read repo list file")
        raise
    client = ArangoClient(hosts=strip_url(args.db))
    p = urlparse(args.db)
    db = client.db(p.path[1:], username=p.username, password=p.password)
    # batch id ties all repos from this run together in the DB
    batch_id = uuid.uuid4().hex
    log.info(f"Batch ID {batch_id}")
    _mp_manager = Manager()
    node_q = _mp_manager.Queue()
    log.debug(f"checking {len(repolist)} items in repo list")
    try:
        multiprogress.main_proc_setup()
        multiprogress.start_server_thread()
        en_manager_proxy = multiprogress.get_manager_proxy()
        en_manager = multiprogress.get_manager()
        # aggregator consuming insert counts from workers via node_q
        node_receiver = _tqdm_node_receiver(node_q, en_manager_proxy)
        with ProcessPool(max_workers=args.jobs) as executor:
            ret_futures = []
            all_repos_sched_cntr = en_manager.counter(desc="adding repo jobs",
                                                      total=len(repolist),
                                                      unit='repos')
            for repo in repolist:
                ret_futures.append(
                    executor.schedule(repo_worker, (repo, node_q), {
                        'workers': args.workers,
                        'database_conn': args.db
                    }))
                all_repos_sched_cntr.update()
            all_repos_sched_cntr.close()
            all_repos_cntr = en_manager.counter(desc="repos in batch",
                                                total=len(repolist),
                                                unit='repos',
                                                autorefresh=True)
            try:
                for r in futures.as_completed(ret_futures):
                    try:
                        repo_dict, tr = r.result()
                    except RepoExistsError as e:
                        # tolerated when --skip-exists; fatal otherwise
                        if args.skip_exists:
                            log.debug(f"{e}")
                            all_repos_cntr.update()
                            continue
                        else:
                            log.err(f"{e}")
                            raise e
                    # save the original repo data to the db as well:
                    tr.wst_extra = {"wst_batch": batch_id, **repo_dict}
                    tr.update_in_db(db)
                    all_repos_cntr.update()
            except KeyboardInterrupt as e:
                # cancel unstarted work, then wait out jobs already running
                log.warn(f"stopping batch worker pool...")
                executor.stop()
                for rf in ret_futures:
                    rf.cancel()
                log.warn(f"waiting for already started jobs to finish...")
                executor.join()
    finally:
        # always signal the receiver to stop via the None sentinel
        try:
            node_q.put(None)
            receiver_exit = node_receiver.result(timeout=1)
        except (BrokenPipeError, KeyboardInterrupt) as e:
            pass
# link target probably not within our repo dir file.symlink['relative'] = None file_shake_256.update(str(target).encode()) file.content_hash = file_shake_256.hexdigest(64) else: raise UnhandledGitFileMode(f"{file.path} mode is {oct(file.mode)}") try: file.insert_in_db(db) (wst_commit / file).insert_in_db(db) # commit -> file except arango.exceptions.DocumentInsertError as e: if e.http_code == 409: # already exists: get it preexisting_file = WSTFile.get(db, file._key) if preexisting_file != file: log.debug(f"existing file: {preexisting_file}") log.debug(f"new file: {file}") if overwrite_errored_docs and preexisting_file.error: log.warn( f"Overwriting errored WSTFile, prior error: {preexisting_file.error}, new error: {file.error}" ) file.update_in_db(db) (wst_commit / file).insert_in_db( db, overwrite=True) # commit -> file else: raise PrerequisiteStateInvalid( f"WSTFile {file._key} already exists but has mismatched data" ) else: # WSTFiles are equivalent, dedup (wst_commit / preexisting_file).insert_in_db( db, overwrite=overwrite_errored_docs)