def suggest_naming(self, file_path: Path, prj_root: Optional[Path] = None):
    """
    Processes a file to get its lemmas and runs the model to get predictions.

    :param file_path: the file to parse and to report predictions for.
    :param prj_root: the root of the project the file belongs to; when None it is
        inferred from file_path via RoosterizeDirUtils.auto_infer_project_root.
    """
    # Figure out which project we're at, and then load configs
    if prj_root is None:
        prj_root = RoosterizeDirUtils.auto_infer_project_root(file_path)
    self.load_configs(prj_root)

    # Infer SerAPI options
    serapi_options = self.infer_serapi_options(prj_root)

    # If user provided compile_cmd, first compile the project
    if self.compile_cmd is not None:
        with IOUtils.cd(prj_root):
            BashUtils.run(self.compile_cmd, expected_return_code=0)

    # Parse file
    data = self.parse_file(file_path, prj_root, serapi_options)

    # Load model
    self.load_local_model(prj_root)
    model = self.get_model()

    # Use the model to make predictions
    # Temp dirs for processed data and results
    temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))
    try:
        # Dump lemmas & definitions
        temp_raw_data_dir = temp_data_dir / "raw"
        temp_raw_data_dir.mkdir()
        IOUtils.dump(
            temp_raw_data_dir / "lemmas.json",
            IOUtils.jsonfy(data.lemmas),
            IOUtils.Format.json,
        )
        IOUtils.dump(
            temp_raw_data_dir / "definitions.json",
            IOUtils.jsonfy(data.definitions),
            IOUtils.Format.json,
        )

        # Model-specific process
        temp_processed_data_dir = temp_data_dir / "processed"
        temp_processed_data_dir.mkdir()
        model.process_data_impl(temp_raw_data_dir, temp_processed_data_dir)

        # Invoke eval
        candidates_logprobs = model.eval_impl(
            temp_processed_data_dir,
            beam_search_size=self.beam_search_size,
            k=self.k,
        )
    finally:
        # mkdtemp leaves deletion to the caller, so remove the scratch
        # directory even when dumping, processing or evaluation fails.
        IOUtils.rm_dir(temp_data_dir)

    # Report predictions
    self.report_predictions(data, candidates_logprobs)
    return
def download_global_model(self, force_yes: bool = False):
    """
    Downloads a global Roosterize model.

    The archive at self.model_url is downloaded and unpacked in a temporary
    directory, and its "model" directory is moved to the global model
    directory. An existing global model is only removed after the new one has
    been unpacked successfully.

    :param force_yes: when True, an existing global model is replaced without
        asking for confirmation.
    """
    global_model_dir = RoosterizeDirUtils.get_global_model_dir()
    replace_existing = global_model_dir.exists()
    if replace_existing and not force_yes:
        # Only prompt when the caller has not already approved the overwrite
        ans = self.ask_for_confirmation(
            f"A Roosterize model already exists at {global_model_dir}. "
            f"Do you want to delete it and download again?")
        # Strict comparison kept on purpose: any answer that does not compare
        # equal to True leaves the existing model untouched.
        if ans != True:  # noqa: E712
            return

    self.show_message("Downloading Roosterize model...")

    # Download and unpack
    temp_model_dir = Path(tempfile.mkdtemp(prefix="roosterize"))
    try:
        urllib.request.urlretrieve(self.model_url, str(temp_model_dir / "model.tgz"))
        with IOUtils.cd(temp_model_dir):
            BashUtils.run("tar xzf model.tgz", expected_return_code=0)

        # Discard the old model only now that the replacement is ready, so a
        # failed download/unpack does not leave the user without any model.
        if replace_existing:
            IOUtils.rm_dir(global_model_dir)

        # Move the stuff to global model place; the source is addressed via
        # temp_model_dir so it does not depend on the current working directory.
        shutil.move(str(temp_model_dir / "model"), str(global_model_dir))
    finally:
        # Delete temp dir, also when download/unpack/move failed
        IOUtils.rm_dir(temp_model_dir)

    self.show_message("Finish downloading Roosterize model.")
def prepare_data(self):
    """
    Recreates <work_dir>/data and copies the train/valid/test directories and
    the vocab files for the current eval setting and year into it.
    """
    data_dir = self.work_dir / "data"
    data_prefix = f"{self.eval_setting}-{self.year}"
    self.logger.info(
        f"Preparing the data for {self.eval_setting} {self.year} at {data_dir}")

    # Start from an empty data directory
    IOUtils.rm_dir(data_dir)
    IOUtils.mk_dir(data_dir)

    copy_commands = [
        # train/val/test_common/test_standard data
        f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.train}/train {data_dir}/train",
        f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.val}/valid {data_dir}/valid",
        f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.test_common}/test {data_dir}/{Macros.test_common}",
        f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/test {data_dir}/{Macros.test_standard}",
        # vocab
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/vocab* {data_dir}/",
    ]
    for command in copy_commands:
        BashUtils.run(command, expected_return_code=0)
    return
def parse_file(self, file_path: Path, prj_root: Path, serapi_options: str):
    """
    Parses one file with sercomp/sertok and collects its lemmas and definitions.

    :param file_path: the file to parse; must be located under prj_root.
    :param prj_root: the project root; the SerAPI commands are run from here.
    :param serapi_options: options passed verbatim to sercomp and sertok.
    :return: a ProcessedFile bundling the source, parsed document, s-expressions,
        unicode offsets, lemmas and definitions.
    """
    source_code = IOUtils.load(file_path, IOUtils.Format.txt)
    unicode_offsets = ParserUtils.get_unicode_offsets(source_code)

    with IOUtils.cd(prj_root):
        rel_path = file_path.relative_to(prj_root)

        # Run SerAPI on the file, relative to the project root
        sercomp_result = BashUtils.run(
            f"sercomp {serapi_options} --mode=sexp -- {rel_path}",
            expected_return_code=0)
        ast_sexp_str = sercomp_result.stdout
        sertok_result = BashUtils.run(
            f"sertok {serapi_options} -- {rel_path}",
            expected_return_code=0)
        tok_sexp_str = sertok_result.stdout

        ast_sexp_list: List[SexpNode] = SexpParser.parse_list(ast_sexp_str)
        tok_sexp_list: List[SexpNode] = SexpParser.parse_list(tok_sexp_str)

        doc = CoqParser.parse_document(
            source_code,
            ast_sexp_list,
            tok_sexp_list,
            unicode_offsets=unicode_offsets,
        )
        doc.file_name = str(rel_path)

        # Collect lemmas & definitions
        lemmas: List[Lemma] = DataMiner.collect_lemmas_doc(
            doc, ast_sexp_list, serapi_options)
        definitions: List[Definition] = DataMiner.collect_definitions_doc(
            doc, ast_sexp_list)

    return ProcessedFile(
        file_path,
        source_code,
        doc,
        ast_sexp_list,
        tok_sexp_list,
        unicode_offsets,
        lemmas,
        definitions,
    )
def prepare_data(self):
    """
    Recreates <work_dir>/data and copies the code2seq train/val/test files and
    the dictionary file for the current eval setting and year into it.
    """
    data_dir = self.work_dir / "data"
    data_prefix = f"{self.eval_setting}-{self.year}"
    self.logger.info(
        f"Preparing the data for {self.eval_setting} {self.year} at {self.work_dir}")

    # Start from an empty data directory
    IOUtils.rm_dir(data_dir)
    IOUtils.mk_dir(data_dir)

    copy_commands = (
        # train/val/test_common/test_standard data
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/code2seq.train.c2s {self.work_dir}/data/",
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.val}/code2seq.val.c2s {self.work_dir}/data/",
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/code2seq.test.c2s {self.work_dir}/data/code2seq.{Macros.test_common}.c2s",
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/code2seq.test.c2s {self.work_dir}/data/code2seq.{Macros.test_standard}.c2s",
        # vocab
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/code2seq.dict.c2s {self.work_dir}/data/",
    )
    for command in copy_commands:
        BashUtils.run(command, expected_return_code=0)
    return
def prepare_code(self):
    """
    Removes self.code_dir, clones REPO_URL into it again and checks out REPO_SHA.
    """
    checkout_dir = self.code_dir
    parent_dir = checkout_dir.parent

    # Always start from a fresh clone
    IOUtils.rm_dir(checkout_dir)
    IOUtils.mk_dir(parent_dir)

    with IOUtils.cd(parent_dir):
        clone_cmd = f"git clone {self.REPO_URL} {checkout_dir.name}"
        BashUtils.run(clone_cmd, expected_return_code=0)

    with IOUtils.cd(checkout_dir):
        checkout_cmd = f"git checkout {self.REPO_SHA}"
        BashUtils.run(checkout_cmd, expected_return_code=0)
    return
def install_coq_project(cls, project: Project, names_projects: Dict[str, Project]) -> None:
    """
    Builds and installs a project after recursively installing its dependencies.

    The project is cloned and force-checked-out to project.data["sha"] if it is
    not cloned yet. A confirmation file in the checkout directory records the
    project revision and the output of "opam list coq -s"; when it already
    matches, the installation is skipped.

    :param project: the project to install.
    :param names_projects: lookup from dependency name to project.
    :raises Exception: if a dependency is unknown, build_cmd/install_cmd is
        missing, or the build/install command returns a non-zero code.
    """
    if not project.is_cloned:
        project.clone()
        project.checkout(project.data["sha"], is_forced=True)

    # Check if the project is already compiled
    confirmation_file = "lpc-installed.txt"
    revision = project.revision
    coq_listing = BashUtils.run("opam list coq -s", expected_return_code=0).stdout.strip()
    confirmation_content = revision + " " + coq_listing
    already_installed = (
        (project.checkout_dir / confirmation_file).is_file()
        and IOUtils.load(project.checkout_dir / confirmation_file, "txt") == confirmation_content
    )
    if already_installed:
        cls.logger.debug(f"Project {project.full_name} already installed")
        return

    project.clean()

    # Install dependencies
    for dependency in project.data.get("dependencies", []):
        dependency_project = names_projects.get(dependency)
        if dependency_project is None:
            raise Exception(f"Cannot find dependency {dependency}")
        cls.logger.info(f"For Project {project.full_name}, installing dependency {dependency}")
        cls.install_coq_project(dependency_project, names_projects)

    # Both commands must be configured before anything is run
    for required_key in ("build_cmd", "install_cmd"):
        if required_key not in project.data:
            raise Exception(f"Project {project.full_name} does not have {required_key}")

    # (config key, verb for the info log, noun for the result logs)
    steps = (
        ("build_cmd", "Building", "Compilation"),
        ("install_cmd", "Installing", "Installation"),
    )
    with IOUtils.cd(project.checkout_dir):
        for cmd_key, verb, noun in steps:
            cls.logger.info(f"Project {project.full_name}: {verb} with {project.data[cmd_key]}")
            r = BashUtils.run(project.data[cmd_key])
            if r.return_code != 0:
                raise Exception(f"{noun} failed! Return code is {r.return_code}! stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
            cls.logger.debug(f"{noun} finished. Return code is {r.return_code}. stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")

        # Record what was installed so the next call can skip the work
        IOUtils.dump(project.checkout_dir / confirmation_file, confirmation_content, "txt")
    return
def require_collector(cls):
    """
    Installs the collector with Maven once per process.

    Does nothing when cls.is_parallel is set; otherwise runs
    "mvn clean install -DskipTests" in Macros.collector_dir unless
    cls.collector_installed is already True.
    """
    if cls.is_parallel:
        return

    if cls.collector_installed:
        cls.logger.debug("Require collector, and already installed")
        return

    cls.logger.info("Require collector, installing ...")
    with IOUtils.cd(Macros.collector_dir):
        BashUtils.run("mvn clean install -DskipTests", expected_return_code=0)
    cls.collector_installed = True
    return
def prepare_configs_and_scripts(self, trials: List[int]):
    """
    Writes per-trial config files and executable train/test scripts.

    For each trial a directory <work_dir>/trial-<trial> is created containing
    config.json, a train script, and one config plus script per test type.

    :param trials: the trial numbers to prepare.
    """

    def script_preamble() -> str:
        # Lines shared by every generated script: shell, conda env, CUDA modules, code dir
        return "#!/bin/bash\n" \
               f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
               f"conda activate {self.CONDA_ENV}\n" \
               "module load cuda/10.0 cudnn/7.6.2\n" \
               f"cd {self.code_dir}/translate\n"

    data_dir = self.work_dir / "data"
    base_config = IOUtils.load(self.base_config_file, IOUtils.Format.jsonPretty)

    for trial in trials:
        trial_dir = self.work_dir / f"trial-{trial}"
        IOUtils.mk_dir(trial_dir)

        # Training config
        config = copy.copy(base_config)
        config["data_dir"] = str(data_dir)
        config["model_dir"] = str(trial_dir / "model")
        config["output"] = str(trial_dir / "output.txt")
        config_file = trial_dir / "config.json"
        IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

        training_trace_file = trial_dir / "training-trace.json"
        train_script_file = trial_dir / f"{Macros.train}.sh"
        # The gpu-id argument is necessary for tensorflow, even if we are using CUDA_VISIBLE_DEVICES
        train_script = script_preamble() + \
            f"python3 __main__.py {config_file} --train -v --train-log {training_trace_file} --gpu-id $1 &> {trial_dir}/log-{Macros.train}.txt\n"
        IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
        BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

        for test_type in (Macros.test_common, Macros.test_standard):
            # Same config, but with a test-type-specific output file
            config["output"] = str(trial_dir / f"output_{test_type}.txt")
            test_config_file = trial_dir / f"config_{test_type}.json"
            IOUtils.dump(test_config_file, config, IOUtils.Format.jsonPretty)

            test_script_file = trial_dir / f"{test_type}.sh"
            test_script = script_preamble() + \
                f"python3 __main__.py {test_config_file} --eval {data_dir}/{test_type}/test.token.code {data_dir}/{test_type}/test.token.sbt {data_dir}/{test_type}/test.token.nl --gpu-id $1 &> {trial_dir}/log-{test_type}.txt\n" \
                f"python3 Bleu.py {data_dir}/{test_type}/test.token.nl {trial_dir}/output_{test_type}.txt {trial_dir}/results_{test_type}.json\n"
            IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)
    return
def prepare_configs_and_scripts(self, trials: List[int]):
    """
    Writes executable train and test scripts for each trial.

    For each trial a directory <work_dir>/trial-<trial> is created containing
    train.sh and one script per test type.

    :param trials: the trial numbers to prepare.
    """

    def script_preamble() -> str:
        # Lines shared by every generated script: shell, CUDA modules, conda env, code dir
        return "#!/bin/bash\n" \
               f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
               "module load cuda/10.1 cudnn/7.6.2\n" \
               f"conda activate {self.CONDA_ENV}\n" \
               f"cd {self.code_dir}\n" \
               "export MKL_SERVICE_FORCE_INTEL=1\n"

    for trial in trials:
        trial_dir = self.work_dir / f"trial-{trial}"
        IOUtils.mk_dir(trial_dir)

        # Training script
        train_script_file = trial_dir / "train.sh"
        train_command = (
            "python3 train.py "
            f"-data {self.data_dir}/transformer -save_model {trial_dir}/bestTransformer "
            "-layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8 "
            "-encoder_type transformer -decoder_type transformer -position_encoding "
            "-train_steps 50000 -max_generator_batches 2 -dropout 0.1 "
            "-batch_size 4096 -batch_type tokens -normalization tokens -accum_count 2 "
            "-optim adam -adam_beta2 0.998 -decay_method noam -warmup_steps 8000 -learning_rate 2 "
            "-max_grad_norm 0 -param_init 0 -param_init_glorot -early_stopping 10 -keep_checkpoint 1 "
            "-label_smoothing 0.1 -valid_steps 500 -save_checkpoint_steps 500 -report_every 500 "
            "--world_size 1 --gpu_ranks 0 "
            f"&> {trial_dir}/train-log.txt\n"
        )
        train_script = script_preamble() + train_command
        IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
        BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

        # One translate + evaluate script per test type
        for test_type in (Macros.test_common, Macros.test_standard):
            test_script_file = trial_dir / f"{test_type}.sh"
            output_file = trial_dir / f"output_{test_type}.txt"
            test_commands = (
                "python3 translate.py "
                f"--model {trial_dir}/*.pt --output {output_file} --src {self.data_dir}/src-{test_type}.txt "
                f"&> {trial_dir}/{test_type}-log.txt\n"
                "python3 eval_utils.py "
                f"{self.data_dir}/tgt-{test_type}.txt {output_file} {trial_dir}/results_{test_type}.json\n"
            )
            test_script = script_preamble() + test_commands
            IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)
    return
def prepare_data(self):
    """
    Recreates self.data_dir and copies the transformer.* training files and the
    src/tgt test files of both test types into it.
    """
    data_prefix = f"{self.eval_setting}-{self.year}"

    # Start from an empty data directory
    IOUtils.rm_dir(self.data_dir)
    IOUtils.mk_dir(self.data_dir)

    copy_commands = [
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/transformer.* {self.data_dir}/",
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/src-test.txt {self.data_dir}/src-{Macros.test_common}.txt",
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/tgt-test.txt {self.data_dir}/tgt-{Macros.test_common}.txt",
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/src-test.txt {self.data_dir}/src-{Macros.test_standard}.txt",
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/tgt-test.txt {self.data_dir}/tgt-{Macros.test_standard}.txt",
    ]
    for command in copy_commands:
        BashUtils.run(command, expected_return_code=0)
    return
def test_for_each_revision(self):
    """
    Runs Project.for_each_revision over the last 10 revisions of a test project:
    first dumping a per-revision file count, then checking each can be loaded back.
    Skips when the project has fewer than 10 revisions.
    """
    project = self.get_a_test_project()

    def dump_file_count(p, r):
        # Store the output of "git ls-files | wc -l" as this revision's result
        return p.results.dump_revision_result(
            r, "count_files.json",
            BashUtils.run("git ls-files | wc -l").stdout)

    def check_file_count(p, r):
        # The result dumped for this revision must be loadable
        return self.assertIsNotNone(
            p.results.load_revision_result(r, "count_files.json"))

    with TestSupport.get_playground_path():
        Project.set_downloads_dir(Path.cwd() / "_downloads")
        Project.set_results_dir(Path.cwd() / "_results")

        # Clone
        project.clone()

        # Set up results
        project.init_results()

        # Get all revisions, compare with dumped version
        all_revisions = project.get_all_revisions()
        if len(all_revisions) < 10:
            print(
                "Too few revisions (<10) to do testing on for_each_revision. Will skip that."
            )
            return

        # For each revision, count number of files
        project.for_each_revision(dump_file_count, all_revisions[-10:])
        project.for_each_revision(check_file_count,
                                  all_revisions[-10:],
                                  is_auto_checkout=False)
    return
def prepare_configs_and_scripts(self, trials: List[int]):
    """
    Writes a per-trial config.yaml plus executable train and test scripts.

    For each trial a directory <work_dir>/trial-<trial> is created. The base
    config is copied with "data" and "save_model" pointed at this experiment,
    and each train script receives a random seed in [0, 9].

    :param trials: the trial numbers to prepare.
    """
    # safe_load instead of yaml.load(f): calling yaml.load without an explicit
    # Loader is rejected by PyYAML 6 and can construct arbitrary objects on
    # older versions. Assumes the base config only uses plain YAML types.
    with open(self.base_config_file, "r") as f:
        base_config = yaml.safe_load(f)

    exp_dir = self.work_dir
    for trial in trials:
        seed = random.randint(0, 9)
        trial_dir = exp_dir / f"trial-{trial}"
        IOUtils.mk_dir(trial_dir)

        # Shallow copy is enough: only top-level keys are overridden
        config = copy.copy(base_config)
        config["data"] = str(self.data_dir / "biLSTM")
        config["save_model"] = str(trial_dir / "bestLSTM")
        config_file = trial_dir / "config.yaml"
        with open(config_file, "w+") as f:
            yaml.dump(config, f)

        train_script_file = trial_dir / "train.sh"
        train_script = f"#!/bin/bash\n" \
                       f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                       f"module load cuda/10.1 cudnn/7.6.2\n" \
                       f"conda activate {self.CONDA_ENV}\n" \
                       f"cd {self.code_dir}\n" \
                       f"export MKL_SERVICE_FORCE_INTEL=1\n" \
                       f"python3 train.py --config {config_file} --world_size 1 --gpu_ranks 0 -keep_checkpoint 1 " \
                       f"--seed {seed} &> {trial_dir}/train-log.txt\n"
        IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
        BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

        for test_type in [Macros.test_common, Macros.test_standard]:
            test_script_file = trial_dir / f"{test_type}.sh"
            output_file = trial_dir / f"output_{test_type}.txt"
            test_script = f"#!/bin/bash\n" \
                          f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                          f"module load cuda/10.1 cudnn/7.6.2\n" \
                          f"conda activate {self.CONDA_ENV}\n" \
                          f"cd {self.code_dir}\n" \
                          f"export MKL_SERVICE_FORCE_INTEL=1\n" \
                          f"python3 translate.py " \
                          f"--model {trial_dir}/*.pt --output {output_file} --src {self.data_dir}/src-{test_type}.txt " \
                          f"&> {trial_dir}/{test_type}-log.txt\n" \
                          f"python3 eval_utils.py " \
                          f"{self.data_dir}/tgt-{test_type}.txt {output_file} {trial_dir}/results_{test_type}.json\n"
            IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)
    return
def process_data(self, model_data_dir: Path, data_prefix: str):
    """
    Runs onmt_preprocess on the raw data files generated by the Bi-LSTM model
    processor: src-train.txt, tgt-train.txt, src-val.txt, tgt-val.txt.

    :param model_data_dir: the dir for storing the data for transformer.
    :param data_prefix: e.g. evo-2020, mixedproj-2020
    :return: None
    """
    self.logger.info("Start processing")

    # Directories holding the train and validation splits
    train_dir = f"{model_data_dir}/{data_prefix}-{Macros.train}"
    val_dir = f"{model_data_dir}/{data_prefix}-{Macros.val}"

    preprocess_cmd = (
        f"onmt_preprocess -train_src {train_dir}/src-train.txt "
        f"-train_tgt {train_dir}/tgt-train.txt "
        f"-valid_src {val_dir}/src-val.txt "
        f"-valid_tgt {val_dir}/tgt-val.txt "
        f"-save_data {train_dir}/transformer --src_seq_length 200"
        " --src_seq_length_trunc 200 --shard_size 0"
    )
    BashUtils.run(preprocess_cmd, expected_return_code=0)
def get_cur_cluster(cls) -> str:
    """
    Determines the current cluster from the output of the "hostname" command.

    :return: cls.maverick2 or cls.stampede2 according to the hostname suffix;
        cls.maverick2 (after logging a warning) for any other host.
    """
    hostname = BashUtils.run("hostname").stdout.strip()

    if hostname.endswith("maverick2.tacc.utexas.edu"):
        return cls.maverick2
    if hostname.endswith("stampede2.tacc.utexas.edu"):
        return cls.stampede2

    # Unknown host: fall back to maverick2
    cls.logger.warning("Currently not on TACC")
    return cls.maverick2
def test_propagate_env(self):
    """
    Checks that a variable exported inside BashUtils.run with
    is_update_env=True shows up in os.environ afterwards.
    """
    key = self.TEST_ENV_A_KEY
    value = self.TEST_ENV_A_VALUE

    # Make sure the variable is gone before the command runs
    del os.environ[key]
    self.assertTrue(key not in os.environ)

    result = BashUtils.run(
        f"export {key}={value}; echo -n ${key}",
        is_update_env=True)
    self.assertEqual(value, result.stdout)
    self.assertEqual(value, os.environ[key])
def require_special_repo(cls, directory: Path, branch: str):
    """
    Makes sure directory holds an up-to-date git checkout.

    If directory does not exist, it is created and the given branch of
    cls.get_git_url() is cloned into it; otherwise "git pull" is run in it.

    :param directory: where the repository lives or should be cloned to.
    :param branch: the branch to clone when the directory does not exist yet.
    :raises Exception: if directory exists but is not a directory containing .git.
    """
    cls.logger.info(f"Updating {directory} to {branch} branch")

    if not directory.exists():
        # Fresh clone into a newly created directory
        IOUtils.mk_dir(directory)
        with IOUtils.cd(directory):
            BashUtils.run(
                f"git clone --single-branch -b {branch} -- {cls.get_git_url()} .",
                expected_return_code=0)
        return

    is_git_checkout = directory.is_dir() and (directory / ".git").is_dir()
    if not is_git_checkout:
        LoggingUtils.log_and_raise(
            cls.logger,
            f"Path {directory} already exists but is not a proper git repository!",
            Exception)

    with IOUtils.cd(directory):
        BashUtils.run("git pull", expected_return_code=0)
def tacc_submit_jobs(cls,
                     submit_script: Path,
                     titles: List[str],
                     scripts: List[Path],
                     timeouts: List[str],
                     output_dir: Path,
                     submit_cd: int = 600,
                     max_jobs: int = 4):
    """
    Submits the scripts one by one, waiting whenever the job limit is reached.

    Job i is submitted by running submit_script with titles[i], output_dir,
    scripts[i] and timeouts[i] as arguments. A failed submission is retried
    after submit_cd seconds; a keyboard interrupt stops submitting.

    :param submit_script: the executable used to submit a single job.
    :param titles: title of each job, parallel to scripts.
    :param scripts: the scripts to submit.
    :param timeouts: timeout of each job, parallel to scripts.
    :param output_dir: passed to submit_script for every job.
    :param submit_cd: seconds to wait before retrying.
    :param max_jobs: no job is submitted while cls.tacc_get_num_jobs() is at
        or above this number.
    """

    def retry_time() -> str:
        # The time is local, so print the local UTC offset (%z) rather than a
        # hard-coded "+0000" that would mislabel it as UTC.
        return time.strftime("%a, %d %b %Y %H:%M:%S %z",
                             time.localtime(time.time() + submit_cd))

    job_i = 0
    while job_i < len(scripts):
        if cls.tacc_get_num_jobs() >= max_jobs:
            cls.logger.warning(
                f"Number of running jobs reach limit {max_jobs}, will retry after {submit_cd} seconds at {retry_time()}"
            )
            time.sleep(submit_cd)
            continue

        title = titles[job_i]
        script = scripts[job_i]
        timeout = timeouts[job_i]
        cls.logger.info(f"Submitting script {script}")

        try:
            BashUtils.run(
                f"{submit_script} \"{title}\" \"{output_dir}\" \"{script}\" \"{timeout}\"",
                expected_return_code=0)
        except KeyboardInterrupt:
            cls.logger.warning(f"Keyboard interrupt!")
            break
        except Exception as e:
            # Not a bare except: SystemExit and similar must still propagate.
            # The error is logged so repeated failures can be diagnosed.
            cls.logger.warning(
                f"Failed to submit ({e!r}), will retry after {submit_cd} seconds at {retry_time()}"
            )
            time.sleep(submit_cd)
            continue

        # Submit successfully
        job_i += 1
    return
def prepare_configs_and_scripts(self, trials: List[int]):
    """
    Writes per-trial directories, a copy of the base config, and executable
    train/test scripts that invoke code2seq.py.

    :param trials: the trial numbers to prepare.
    """

    def script_preamble() -> str:
        # Lines shared by every generated script: shell, conda env, CUDA modules, code dir
        return "#!/bin/bash\n" \
               f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
               f"conda activate {self.CONDA_ENV}\n" \
               "module load cuda/10.0 cudnn/7.6.2\n" \
               f"cd {self.code_dir}\n"

    exp_dir = self.work_dir
    for trial in trials:
        trial_dir = exp_dir / f"trial-{trial}"
        model_dir = trial_dir / "models"
        log_dir = trial_dir / "logs"
        for new_dir in (trial_dir, model_dir, log_dir):
            IOUtils.mk_dir(new_dir)

        data = str(exp_dir / "data/code2seq")
        val_data = data + ".val.c2s"
        train_log = trial_dir / "training-trace.json"
        train_script_file = trial_dir / f"{Macros.train}.sh"

        # Copy config file
        BashUtils.run(
            f"cp {self.base_config_file} {trial_dir}/config.yaml",
            expected_return_code=0)
        config_file = trial_dir / "config.yaml"

        # Training writes its predictions/references to temporary files
        tmp_output_file = trial_dir / "output_tmp.txt"
        tmp_reference_file = trial_dir / "ref_tmp.txt"
        train_script = script_preamble() + \
            f"python -u code2seq.py --data {data} --test {val_data} --log {train_log} --config {config_file} " \
            f"--pred_file {tmp_output_file} --ref_file {tmp_reference_file} " \
            f"--save_prefix {model_dir}/model --gpu_id $1 &> {trial_dir}/train-log.txt"
        IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
        BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

        for test_type in (Macros.test_common, Macros.test_standard):
            test_data = exp_dir / "data" / f"code2seq.{test_type}.c2s"
            output_file = trial_dir / f"output_{test_type}.txt"
            reference_file = trial_dir / f"ref_{test_type}.txt"
            test_script_file = trial_dir / f"{test_type}.sh"
            test_script = script_preamble() + \
                f"python3 code2seq.py --load {model_dir}/model_best --test {test_data} --config {config_file} " \
                f"--pred_file {output_file} --ref_file {reference_file} " \
                f"--gpu_id $1 &> {trial_dir}/{test_type}-log.txt\n" \
                f"python3 eval_utils.py {reference_file} {output_file} {trial_dir}/results_{test_type}.json\n"
            IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)
    return
def process_data(self,
                 method_data_list: List[MethodData],
                 data_type: str,
                 output_dir: Path,
                 split: bool = True):
    """
    Runs the Java collector in transform mode on the given method data, then
    builds the raw dataset from its <data_type>.raw.txt output.

    :param method_data_list: the method data, dumped as method-data.json for the collector.
    :param data_type: the data type, used in the collector config and the raw file name.
    :param output_dir: where inputs, config, log and outputs are written.
    :param split: if True the raw file is passed to self.tokenizeFile, otherwise to self.noSplit.
    :return: the content loaded from "<output_dir>-error-ids.json".
    """
    Environment.require_collector()

    # Input for the collector
    data_file = output_dir / "method-data.json"
    log_file = output_dir / "collector-log.txt"
    IOUtils.dump(data_file, IOUtils.jsonfy(method_data_list), IOUtils.Format.json)

    config_file = output_dir / "collector-config.json"
    config = {
        "transform": True,
        "model": "BiLSTM",
        "dataType": data_type,
        "dataFile": str(data_file),
        "logFile": str(log_file),
        "outputDir": str(output_dir),
    }
    IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

    self.logger.info(
        f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}")
    rr = BashUtils.run(
        f"java -jar {Environment.collector_jar} {config_file}",
        expected_return_code=0)

    # Surface anything the collector printed
    for stream_name, text in (("Stdout", rr.stdout), ("Stderr", rr.stderr)):
        if text:
            self.logger.warning(f"{stream_name} of collector:\n{text}")

    # build raw dataset
    build_raw_dataset = self.tokenizeFile if split else self.noSplit
    build_raw_dataset(output_dir / f"{data_type}.raw.txt", data_type)

    error_ids = IOUtils.load(str(output_dir) + "-error-ids.json")
    print(f"Number of error id is: {len(error_ids)}")
    return error_ids
def prepare_code(self):
    """
    Removes self.code_dir, clones REPO_URL into it again, checks out REPO_SHA,
    and copies eval/eval_utils.py from Macros.this_dir into the checkout.

    Every shell command must succeed (return code 0).
    """
    # Always start from a fresh clone
    IOUtils.rm_dir(self.code_dir)
    IOUtils.mk_dir(self.code_dir.parent)

    with IOUtils.cd(self.code_dir.parent):
        BashUtils.run(f"git clone {self.REPO_URL} {self.code_dir.name}", expected_return_code=0)

    with IOUtils.cd(self.code_dir):
        BashUtils.run(f"git checkout {self.REPO_SHA}", expected_return_code=0)

    # copy eval code; checked like the commands above so a failed copy is
    # reported here instead of being silently ignored
    BashUtils.run(f"cp {Macros.this_dir}/eval/eval_utils.py {self.code_dir}/", expected_return_code=0)
    return
def collect_project(self, project_name: str, project_url: str):
    """
    Clones a project (if not yet downloaded), records its revisions, runs the
    Java collector on it and finally removes the downloaded repository.

    :param project_name: the name of the project; used as directory name under
        the downloads and results directories.
    :param project_url: the git url the project is cloned from.
    """
    Environment.require_collector()

    # 0. Download repo
    downloads_dir = self.repos_downloads_dir / project_name
    results_dir = self.repos_results_dir / project_name

    # Remove previous results if any
    IOUtils.rm_dir(results_dir)
    IOUtils.mk_dir(results_dir)

    # Clone the repo if not exists
    if not downloads_dir.exists():
        with IOUtils.cd(self.repos_downloads_dir), TimeUtils.time_limit(300):
            BashUtils.run(f"git clone {project_url} {project_name}", expected_return_code=0)

    project_data = ProjectData.create()
    project_data.name = project_name
    project_data.url = project_url

    # 1. Get list of revisions
    with IOUtils.cd(downloads_dir):
        git_log_out = BashUtils.run("git log --pretty=format:'%H %P'", expected_return_code=0).stdout
        for line in git_log_out.splitlines()[:self.MAX_REVISIONS]:
            # Each line is the revision sha followed by its parents' shas
            revision, *parents = line.split()
            project_data.revisions.append(revision)
            project_data.parent_revisions[revision] = parents

    # 2. Get revisions in different year
    with IOUtils.cd(downloads_dir):
        for year in self.YEARS:
            rev_list_out = BashUtils.run(
                f"git rev-list -1 --before=\"Jan 1 {year}\" origin",
                expected_return_code=0).stdout
            project_data.year_revisions[str(year) + "_Jan_1"] = rev_list_out.rstrip()

    project_data_file = results_dir / "project.json"
    IOUtils.dump(project_data_file, IOUtils.jsonfy(project_data), IOUtils.Format.jsonPretty)

    # 2. Start java collector
    # Prepare config
    log_file = results_dir / "collector-log.txt"
    output_dir = results_dir / "collector"
    config_file = results_dir / "collector-config.json"
    config = {
        "collect": True,
        "projectDir": str(downloads_dir),
        "projectDataFile": str(project_data_file),
        "logFile": str(log_file),
        "outputDir": str(output_dir),
        "year": True  # To indicate whether to collect all evo data or yearly data
    }
    IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

    self.logger.info(
        f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}")
    rr = BashUtils.run(
        f"java -jar {Environment.collector_jar} {config_file}",
        expected_return_code=0)
    if rr.stderr:
        self.logger.warning(f"Stderr of collector:\n{rr.stderr}")

    # 3. In some cases, save collected data to appropriate location or database
    # TODO private info
    # On luzhou server for user pynie, move it to a dedicated location at /user/disk2
    if (BashUtils.run("hostname").stdout.strip() == "luzhou"
            and BashUtils.run("echo $USER").stdout.strip() == "pynie"):
        alter_results_dir = Path("/home/disk2/pynie/csevo-results") / project_name
        IOUtils.rm_dir(alter_results_dir)
        IOUtils.mk_dir(alter_results_dir.parent)
        BashUtils.run(f"mv {results_dir} {alter_results_dir}")
        self.logger.info(f"Results moved to {alter_results_dir}")

    # -1. Remove repo
    IOUtils.rm_dir(downloads_dir)
    return
def process(self, model: str, output_dir: Path, task: str, year: int, eval_setting: str):
    """
    Main entry for processors of different models.

    :param model: the model name, one of "DeepCom", "DeepCom-Preorder", "Bi-LSTM",
        "no-split-Bi-LSTM", "Transformer", "ASTAttendGRU", "Code2Seq"
    :param output_dir: the output directory (usually data/models)
    :param task: the task name; must be a key of self.TASKS
    :param year: the year that the testing data should be on; must be the last of self.EVO_YEARS
    :param eval_setting: the evaluation setting, e.g. "evo", "crossproj", "mixedproj"
    :raises ValueError: if the model name is not recognized.
    """
    assert year == self.EVO_YEARS[-1]  # TODO: Only support the latest year for now
    assert task in self.TASKS.keys()

    model_data_dir = output_dir / model

    # Pick the processor; imports are done lazily, per model
    if model in ("DeepCom", "DeepCom-Preorder"):
        from csevo.processor.DeepComProcessor import DeepComProcessor
        processor = DeepComProcessor()
    elif model in ("Bi-LSTM", "no-split-Bi-LSTM"):
        from csevo.processor.BiLSTMProcessor import BiLSTMProcessor
        processor = BiLSTMProcessor()
    elif model == "Transformer":
        # The transformer processor works directly on the model data dir and is done after that
        from csevo.processor.TransformerProcessor import TransformerProcessor
        processor = TransformerProcessor()
        processor.process_data(model_data_dir, f"{eval_setting}-{year}")
        return
    elif model == "ASTAttendGRU":
        from csevo.processor.ASTAttendGRUProcessor import ASTAttendGRUProcessor
        processor = ASTAttendGRUProcessor()
    elif model == "Code2Seq":
        from csevo.processor.Code2SeqProcessor import Code2SeqProcessor
        processor = Code2SeqProcessor()
    else:
        raise ValueError(f"Illegal model {model}")

    error_ids = None

    # Load dataset after split (from shared directory)
    shared_data_dir = output_dir / f"{task}-shared"
    self.logger.info(f"Loading dataset from {shared_data_dir}")
    data_type_2_data_list: Dict[str, List] = {
        # test_common is stored per year, without the eval setting in its name
        Macros.test_common: IOUtils.load(
            shared_data_dir / f"{year}-{Macros.test_common}.json",
            IOUtils.Format.json),
    }
    for dt in (Macros.train, Macros.val, Macros.test_standard):
        data_type_2_data_list[dt] = IOUtils.load(
            shared_data_dir / f"{eval_setting}-{year}-{dt}.json",
            IOUtils.Format.json)

    # Extra positional argument passed to the DeepCom processor variants
    deepcom_traversal = {"DeepCom": "sbt", "DeepCom-Preorder": "Preorder"}

    # Process each set
    for data_type, data_list in data_type_2_data_list.items():
        sub_dir_name = f"{eval_setting}-{year}-{data_type}"

        # Both test sets are processed as plain "test" data
        is_test_set = data_type in (Macros.test_common, Macros.test_standard)
        data_type_tvt = Macros.test if is_test_set else data_type

        model_dt_output_dir = model_data_dir / sub_dir_name
        IOUtils.mk_dir(model_dt_output_dir)

        if model in deepcom_traversal:
            error_ids = processor.process_data(data_list, data_type_tvt, model_dt_output_dir, deepcom_traversal[model])
        elif model == "Code2Seq":
            error_ids = processor.process_data(data_list, data_type_tvt, model_dt_output_dir)
        elif model == "Bi-LSTM":
            processor.process_data(data_list, data_type_tvt, model_dt_output_dir)
        elif model == "no-split-Bi-LSTM":
            processor.process_data(data_list, data_type_tvt, model_dt_output_dir, split=False)

        if error_ids is not None:
            self.logger.warning(f"Error data count: {len(error_ids)}")
            IOUtils.dump(model_data_dir / f"error-ids-{sub_dir_name}.json", error_ids, IOUtils.Format.json)

    # extra step for Open-NMT data
    if model in ("Bi-LSTM", "no-split-Bi-LSTM"):
        # build dataset used by Open-NMT
        train_dir = f"{model_data_dir}/{eval_setting}-{year}-{Macros.train}"
        val_dir = f"{model_data_dir}/{eval_setting}-{year}-{Macros.val}"
        BashUtils.run(
            f"onmt_preprocess -train_src {train_dir}/src-train.txt "
            f"-train_tgt {train_dir}/tgt-train.txt "
            f"-valid_src {val_dir}/src-val.txt "
            f"-valid_tgt {val_dir}/tgt-val.txt "
            f"-save_data {train_dir}/biLSTM --src_seq_length 200 --src_seq_length_trunc 200",
            expected_return_code=0)
    return
def extract_data_project(
        cls,
        project_path: Path,
        files: Optional[List[str]],
        exclude_files: Optional[List[str]],
        exclude_pattern: Optional[str],
        serapi_options: str,
        output_path: Path,
):
    """
    Extracts lemmas and definitions from the Coq files of one project.

    Every ``*.v`` file found under ``project_path`` (after filtering) is read,
    passed through the SerAPI tools ``sercomp`` and ``sertok``, and parsed into
    a document.  Lemmas and definitions collected from all successfully parsed
    documents are dumped to ``lemmas.json`` and ``definitions.json`` under
    ``output_path``.  A file that fails at any step is logged and skipped.

    Everything from file discovery to the final dump runs with the working
    directory changed to ``project_path``.

    :param project_path: root directory of the project.
    :param files: if not None, only these project-relative paths are processed.
    :param exclude_files: if not None, these project-relative paths are skipped.
    :param exclude_pattern: if not None, paths fully matching this regex are skipped.
    :param serapi_options: options string passed verbatim to sercomp / sertok.
    :param output_path: directory receiving the json files; created if missing.
    :raises Exception: (via LoggingUtils.log_and_raise) if output_path is a file.
    """
    # Local import: file names are quoted before being put into shell commands
    import shlex

    # 1. Prepare output path
    if output_path.is_dir():
        cls.logger.warning(
            f"{output_path} already exists, will overwrite the files.")
    elif output_path.is_file():
        LoggingUtils.log_and_raise(
            cls.logger,
            f"{output_path} already exists as a file. Aborting.",
            Exception)
    else:
        IOUtils.mk_dir(output_path)
    # end if

    # 2. Extract documents and ast sexps
    coq_documents: Dict[str, CoqDocument] = collections.OrderedDict()
    ast_sexp_lists: Dict[str, List[SexpNode]] = dict()

    with IOUtils.cd(project_path):
        found: List[str] = BashUtils.run(
            "find -name '*.v' -type f").stdout.split("\n")[:-1]
        # [2:] removes the leading "./" that find prints
        coq_files: List[str] = [path[2:] for path in found]

        if files is not None:
            wanted = set(files)
            coq_files = [f for f in coq_files if f in wanted]
        # end if

        if exclude_files is not None:
            unwanted = set(exclude_files)
            coq_files = [f for f in coq_files if f not in unwanted]
        # end if

        if exclude_pattern is not None:
            re_exclude_pattern = re.compile(exclude_pattern)
            coq_files = [
                f for f in coq_files if not re_exclude_pattern.fullmatch(f)
            ]
        # end if

        for coq_file in tqdm(coq_files):
            try:
                # Read file; newline="" keeps the original line endings
                with open(coq_file, "r", newline="") as f:
                    source_code = f.read()
                # end with

                # Get unicode offsets
                unicode_offsets = ParserUtils.get_unicode_offsets(source_code)

                # Call SerAPI; the file name is quoted so that spaces or shell
                # metacharacters in it cannot break the command
                quoted_file = shlex.quote(coq_file)
                ast_sexp_str: str = BashUtils.run(
                    f"sercomp {serapi_options} --mode=sexp -- {quoted_file}",
                    expected_return_code=0).stdout
                tok_sexp_str: str = BashUtils.run(
                    f"sertok {serapi_options} -- {quoted_file}",
                    expected_return_code=0).stdout

                # Parse ast sexp
                ast_sexp_list: List[SexpNode] = SexpParser.parse_list(ast_sexp_str)
                tok_sexp_list: List[SexpNode] = SexpParser.parse_list(tok_sexp_str)

                # Parse the document
                coq_document = CoqParser.parse_document(
                    source_code,
                    ast_sexp_list,
                    tok_sexp_list,
                    unicode_offsets=unicode_offsets)

                # Set meta data
                coq_document.file_name = coq_file
                coq_document.project_name = project_path.name

                coq_documents[coq_file] = coq_document
                ast_sexp_lists[coq_file] = ast_sexp_list
            except KeyboardInterrupt:
                cls.logger.warning("Keyboard interrupt!")
                raise
            except Exception:
                # Best effort: skip the file.  Exception (not a bare except)
                # so that SystemExit and friends still propagate.
                cls.logger.warning(
                    f"File {coq_file} failed! Exception was: {traceback.format_exc()}"
                )
                continue
            # end try
        # end for

        # 3. Extract and save lemmas and definitions
        lemmas: List[Lemma] = list()
        definitions: List[Definition] = list()

        # Increase recursion limit because the backend sexps are CRAZZZZY deep;
        # never lower a limit that is already higher
        sys.setrecursionlimit(max(sys.getrecursionlimit(), 10000))

        for file_path, doc in tqdm(coq_documents.items()):
            ast_sexp_list = ast_sexp_lists[file_path]
            lemmas.extend(
                cls.collect_lemmas_doc(doc, ast_sexp_list, serapi_options))
            definitions.extend(
                cls.collect_definitions_doc(doc, ast_sexp_list))
        # end for

        IOUtils.dump(output_path / "lemmas.json", IOUtils.jsonfy(lemmas),
                     IOUtils.Format.json)
        IOUtils.dump(output_path / "definitions.json",
                     IOUtils.jsonfy(definitions), IOUtils.Format.json)
    # end with
    return
def tacc_get_num_jobs(cls) -> int:
    """
    Counts the current user's jobs as reported by ``squeue``.

    Runs ``squeue -u $USER | wc -l`` (user name taken from the USER environment
    variable) and returns the line count minus one -- presumably to discount
    the header line that squeue prints; confirm against the cluster's squeue.

    :return: number of output lines of squeue, minus one.
    """
    user = os.getenv('USER')
    listing = BashUtils.run(f"squeue -u {user} | wc -l")
    line_count = int(listing.stdout)
    return line_count - 1
def collect_coq_documents_project(
        cls,
        data_mgr: FilesManager,
        project: Project,
        names_projects: Dict[str, Project],
        files: List[str] = None,
        is_verifying_tokenizer: bool = False,
) -> List[CoqDocument]:
    """
    Clones, builds and parses one project, returning its parsed Coq documents.

    The project is cloned, checked out at ``project.data["sha"]`` and installed
    through ``cls.install_coq_project``.  Each ``*.v`` file in the checkout is
    then read, run through the SerAPI tools ``sercomp`` and ``sertok``, and
    parsed.  The original source, both sexp outputs and the printed parsed
    document are saved through ``data_mgr``.  A file that fails at any step
    (including a tokenizer verification mismatch) is logged and skipped.

    :param data_mgr: files manager used to save the intermediate artifacts.
    :param project: the project to process; ``project.data`` may provide
        "ignore_path_regex" and "serapi_options".
    :param names_projects: passed through to ``cls.install_coq_project``.
    :param files: if not None, only these checkout-relative paths are processed.
    :param is_verifying_tokenizer: if True, check the token sexps against the
        source with ``cls.verify_tokenizer``.
    :return: the documents of all files that were parsed successfully.
    """
    # Local import: file names are quoted before being put into shell commands
    import shlex

    coq_documents: List[CoqDocument] = list()

    # Clone and checkout repo
    project.clone()
    project.checkout(project.data["sha"], is_forced=True)

    # Build the project
    cls.install_coq_project(project, names_projects)

    # For each file, parse code to tokens
    with IOUtils.cd(project.checkout_dir):
        coq_files: List[str] = BashUtils.run(
            "find -name '*.v' -type f").stdout.split("\n")[:-1]
        if files is not None:
            # [2:] is to remove the ./
            coq_files = [f for f in coq_files if f[2:] in files]
        # end if

        re_ignore_path = re.compile(
            project.data["ignore_path_regex"]
        ) if "ignore_path_regex" in project.data else None

        # Same for every file, so looked up once
        serapi_options = project.data.get("serapi_options", "")

        for i, found_path in enumerate(coq_files):
            # Strip the leading "./" printed by find
            coq_file = found_path[2:]
            try:
                cls.logger.debug(
                    f"File {i + 1}/{len(coq_files)}: {coq_file}")

                # Check if file is ignored
                if re_ignore_path is not None and re_ignore_path.fullmatch(coq_file):
                    cls.logger.info(f"Ignoring file {coq_file}")
                    continue
                # end if

                # Read file; newline="" keeps the original line endings
                with open(coq_file, "r", newline="") as f:
                    source_code = f.read()
                # end with

                # Get unicode offsets
                unicode_offsets = ParserUtils.get_unicode_offsets(source_code)

                # Save original file to original_files
                data_mgr.dump_data(
                    [FilesManager.ORIGINAL_FILES, project.full_name, coq_file],
                    source_code, IOUtils.Format.txt)

                # Call SerAPI; the file name is quoted so that spaces or shell
                # metacharacters in it cannot break the command
                quoted_file = shlex.quote(coq_file)
                ast_sexp_str: str = BashUtils.run(
                    f"sercomp {serapi_options} --mode=sexp -- {quoted_file}",
                    expected_return_code=0).stdout
                tok_sexp_str: str = BashUtils.run(
                    f"sertok {serapi_options} -- {quoted_file}",
                    expected_return_code=0).stdout

                # Save ast sexp to dataset (.ast.sexp); [:-2] drops the ".v"
                data_mgr.dump_data(
                    [FilesManager.RAW_FILES, project.full_name,
                     coq_file[:-2] + ".ast.sexp"],
                    ast_sexp_str, IOUtils.Format.txt)

                # Save tok sexp to dataset (.tok.sexp)
                data_mgr.dump_data(
                    [FilesManager.RAW_FILES, project.full_name,
                     coq_file[:-2] + ".tok.sexp"],
                    tok_sexp_str, IOUtils.Format.txt)

                # Parse ast sexp
                ast_sexp_list: List[SexpNode] = SexpParser.parse_list(ast_sexp_str)
                tok_sexp_list: List[SexpNode] = SexpParser.parse_list(tok_sexp_str)

                # Verify the tokenizer if requested
                if is_verifying_tokenizer:
                    if not cls.verify_tokenizer(tok_sexp_list, source_code,
                                                unicode_offsets):
                        LoggingUtils.log_and_raise(
                            cls.logger,
                            "Tokenized content doesn't match original file!",
                            Exception)
                    # end if
                # end if

                # Parse the document
                coq_document = CoqParser.parse_document(
                    source_code,
                    ast_sexp_list,
                    tok_sexp_list,
                    unicode_offsets=unicode_offsets)

                # Save the parsed document (printed format) to raw_files
                data_mgr.dump_data(
                    [FilesManager.RAW_FILES, project.full_name, coq_file],
                    coq_document.str_with_space(), IOUtils.Format.txt)

                # Set meta data
                coq_document.file_name = coq_file
                coq_document.project_name = project.full_name
                coq_document.revision = project.revision

                coq_documents.append(coq_document)
            except KeyboardInterrupt:
                cls.logger.warning("Keyboard interrupt!")
                raise
            except Exception:
                # Best effort: skip the file.  Exception (not a bare except)
                # so that SystemExit and friends still propagate.
                cls.logger.warning(
                    f"File {coq_file} failed! Exception was: {traceback.format_exc()}"
                )
                continue
            # end try
        # end for
    # end with

    return coq_documents
def prepare_data(self):
    """
    Populates the experiment directories under ``self.work_dir`` with data.

    With ``self.use_latest`` set, a single ``latest`` directory is recreated
    and filled with the train / valid / test directories and the vocab files
    from ``{self.model_data_dir}/latest``.  Otherwise one ``{t}{t+1}-train``
    directory is recreated for each t in 13..17, taking train data and vocab
    from the 20t-20(t+1) set, validation data from the following year pair and
    test data from the one after that.

    Every copy is a ``cp`` shell command that must exit with code 0.
    """
    if self.use_latest:
        exp_dir = self.work_dir / "latest"
        IOUtils.rm_dir(exp_dir)
        IOUtils.mk_dir(exp_dir)

        latest_dir = f"{self.model_data_dir}/latest"
        # Copy train, valid and test data
        for split in ("train", "valid", "test"):
            BashUtils.run(
                f"cp -r {latest_dir}/{split} {exp_dir}/",
                expected_return_code=0)
        # Copy vocab
        BashUtils.run(f"cp {latest_dir}/vocab* {exp_dir}/",
                      expected_return_code=0)
        return

    for t in range(13, 18):
        exp_dir = self.work_dir / f"{t}{t+1}-train"
        self.logger.info(
            f"Preparing the data for {t}-{t+1} at {exp_dir}")
        IOUtils.rm_dir(exp_dir)
        IOUtils.mk_dir(exp_dir)

        train_set = f"{self.model_data_dir}/20{t}-20{t+1}-train"
        copy_commands = [
            # train data
            f"cp -r {train_set}/train {exp_dir}/",
            # val and test data come from the two following year pairs
            f"cp -r {self.model_data_dir}/20{t+1}-20{t+2}-val/valid {exp_dir}/",
            f"cp -r {self.model_data_dir}/20{t+2}-20{t+3}-test/test {exp_dir}/",
            # vocab
            f"cp {train_set}/vocab* {exp_dir}/",
        ]
        for command in copy_commands:
            BashUtils.run(command, expected_return_code=0)
    return
def prepare_configs_and_scripts(self, trials: List[int]):
    """
    Writes the config file and the train / test / val scripts for every trial.

    For each experiment directory (only ``latest`` when ``self.use_latest`` is
    set, otherwise ``{t}{t+1}-train`` for t in 13..17) and each trial, a
    ``trial-{trial}`` directory is created holding:

    - ``config.json``: a shallow copy of the base config with "data_dir",
      "model_dir" and "output" pointing into the experiment / trial directory;
    - ``train.sh``, ``test.sh`` and ``val.sh``: executable bash scripts.

    :param trials: the trial numbers to prepare.
    """
    base_config = IOUtils.load(self.base_config_file,
                               IOUtils.Format.jsonPretty)

    if self.use_latest:
        exp_names = ["latest"]
    else:
        exp_names = [f"{t}{t+1}-train" for t in range(13, 18)]

    def script_header() -> str:
        # Lines shared by all three scripts: environment setup + cd to the code
        return f"#!/bin/bash\n" \
               f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
               f"conda activate {self.CONDA_ENV}\n" \
               f"module load cuda/10.0 cudnn/7.6.2\n" \
               f"cd {self.code_dir}/translate\n"

    def write_executable(script_file, content: str) -> None:
        # Save the script and set its executable bit
        IOUtils.dump(script_file, content, IOUtils.Format.txt)
        BashUtils.run(f"chmod +x {script_file}", expected_return_code=0)

    for exp_name in exp_names:
        exp_dir = self.work_dir / exp_name
        for trial in trials:
            trial_dir = exp_dir / f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            # Per-trial config
            config = copy.copy(base_config)
            config["data_dir"] = str(exp_dir)
            config["model_dir"] = str(trial_dir / "model")
            config["output"] = str(trial_dir / "output.txt")
            config_file = trial_dir / "config.json"
            IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

            # Training script; takes the gpu id as its first argument
            write_executable(
                trial_dir / "train.sh",
                script_header()
                + f"python3 __main__.py {config_file} --train -v --gpu-id $1 &> {trial_dir}/log-train.txt\n",
            )

            # Testing script
            write_executable(
                trial_dir / "test.sh",
                script_header()
                + f"python3 __main__.py {config_file} --eval {exp_dir}/test/test.token.code {exp_dir}/test/test.token.sbt {exp_dir}/test/test.token.nl &> {trial_dir}/log-test.txt",
            )

            # Scoring script (runs Bleu.py on the test output)
            write_executable(
                trial_dir / "val.sh",
                script_header()
                + f"python3 Bleu.py {exp_dir}/test/test.token.nl {trial_dir}/output.txt {trial_dir}\n",
            )
    return
def collect_lemmas_doc(
        cls,
        doc: CoqDocument,
        ast_sexp_list: List[SexpNode],
        serapi_options: str,
) -> List[Lemma]:
    """
    Collects the lemmas of one parsed document, with their backend sexps.

    Walks the sentences of ``doc`` together with their ast sexps, tracking the
    stack of open modules, and creates a Lemma for every sentence whose vernac
    type is in ``cls.VTYPES_LEMMA``.  The qualified names of all lemmas are
    then passed to ``sername`` to obtain the backend representations; lemmas
    for which sername reports nothing are dropped.

    :param doc: the parsed document; ``doc.file_name`` must end with ".v".
    :param ast_sexp_list: the ast sexps, one per sentence of ``doc``.
    :param serapi_options: SerAPI options; also searched with
        ``cls.RE_PATH_TO_QUALIFIED_PREFIX`` to map the path to a qualified prefix.
    :return: the lemmas whose ``backend_sexp`` is set.
    :raises Exception: (via LoggingUtils.log_and_raise) if a lemma's name does
        not appear among the tokens of its sentence.
    """
    lemmas_doc: List[Lemma] = list()
    data_index = doc.get_data_index()

    # Maintain a stack of module
    modules: List[str] = list()

    # Prepare qualified name prefix
    qprefix_this_doc = "./" + doc.file_name[:-2]  # Remove .v
    for m in cls.RE_PATH_TO_QUALIFIED_PREFIX.finditer(serapi_options):
        path = m.group("path")
        if path != ".":
            path = "./" + path
        qprefix = m.group("qprefix")

        if qprefix_this_doc.startswith(path):
            # The first matching mapping wins
            qprefix_this_doc = qprefix + qprefix_this_doc[len(path):]
            break
        # end if
    # end for
    if qprefix_this_doc.startswith("./"):
        qprefix_this_doc = qprefix_this_doc[len("./"):]
    qprefix_this_doc = qprefix_this_doc.replace("/", ".")

    for sent_i, sent in enumerate(doc.sentences):
        ast_sexp = ast_sexp_list[sent_i]
        vernac = SexpAnalyzer.analyze_vernac(ast_sexp)

        if vernac.vernac_type in cls.VTYPES_MODULE_BEG:
            # (VernacExpr()(VernacDefineModule()  (   (   v   (   Id <module name>)) ...
            # 0         1 2 20               21   22  220 2201    22011
            module_name = vernac.vernac_sexp[2][2][0][1][1].content_no_quote
            modules.append(module_name)
        elif vernac.vernac_type in cls.VTYPES_MODULE_END:
            # (VernacExpr()(VernacEndSegment  (   (   v   (   Id <module name>)) ...
            # 0         1 2 20                21  210 2101    21011
            try:
                module_name = vernac.vernac_sexp[2][1][0][1][1].content_no_quote
            except Exception:
                # Show the unexpected sexp before propagating the error
                print(vernac.vernac_sexp.pretty_format())
                raise
            # end try
            # EndModule and EndSection share the same vernac type, so only pop
            # when the name matches the innermost open module
            if len(modules) > 0 and module_name == modules[-1]:
                modules.pop()
        elif vernac.vernac_type in cls.VTYPES_LEMMA:
            # (VernacExpr()(VernacStartTheoremProof Lemma ( ( ( ( ( v ( Id <lemma name>))
            # 0         1 2 20                      21    22  2200000 2200001 22000011
            lemma = Lemma()
            lemma.data_index = data_index

            lemma.name = vernac.vernac_sexp[2][2][0][0][0][0][1][1].content_no_quote
            lemma.qname = qprefix_this_doc + "." + ".".join(modules + [lemma.name])

            # Find lemma content, after the first token matching the lemma name
            tok_i = 0
            for tok in sent.tokens:
                if tok.content == lemma.name:
                    break
                tok_i += 1
            # end for
            if tok_i == len(sent.tokens):
                LoggingUtils.log_and_raise(
                    cls.logger,
                    f"Lemma name {lemma.name} didn't appear in the source code {sent.str_with_space()}",
                    Exception)

            lemma.vernac_command = sent.tokens[:tok_i]
            lemma.statement = sent.tokens[tok_i + 1:]
            lemma.ast_sexp = vernac.vernac_sexp

            lemmas_doc.append(lemma)
        # end if
    # end for

    # Use sername to get the backend representations
    lemma_qnames: str = "".join([lemma.qname + "\n" for lemma in lemmas_doc])
    lemma_qnames_file = BashUtils.get_temp_file()
    try:
        IOUtils.dump(lemma_qnames_file, lemma_qnames, IOUtils.Format.txt)
        lemma_qnames_backend_sexps_str: str = BashUtils.run(
            f"sername {serapi_options} --require-lib={qprefix_this_doc} {lemma_qnames_file}",
            expected_return_code=0).stdout
    finally:
        # Remove the temp file even when sername fails
        IOUtils.rm(lemma_qnames_file)
    # end try

    # Index lemmas by qualified name; on duplicates the first lemma wins
    first_lemma_by_qname = dict()
    for lemma in lemmas_doc:
        first_lemma_by_qname.setdefault(lemma.qname, lemma)
    # end for

    for qname_backend_sexp_str in lemma_qnames_backend_sexps_str.splitlines():
        # Each output line has the form "<qname>:<backend sexp>"
        qname, backend_sexp_str = qname_backend_sexp_str.split(":", 1)
        backend_sexp = SexpParser.parse(backend_sexp_str)

        matched = first_lemma_by_qname.get(qname)
        if matched is not None:
            matched.backend_sexp = backend_sexp
        # end if
    # end for

    lemmas_doc = [lemma for lemma in lemmas_doc if lemma.backend_sexp is not None]
    return lemmas_doc
def test_inherit_env(self):
    """
    Checks that a command run by BashUtils.run sees this process' environment.

    Sets ``TEST_ENV_A_KEY`` to ``TEST_ENV_A_VALUE`` in ``os.environ``, echoes
    the variable through the shell and compares the output.  The variable's
    previous state is restored afterwards so the test does not leak into
    other tests.
    """
    key = self.TEST_ENV_A_KEY
    previous_value = os.environ.get(key)
    os.environ[key] = self.TEST_ENV_A_VALUE
    try:
        self.assertEqual(
            self.TEST_ENV_A_VALUE,
            BashUtils.run(f"echo -n ${key}").stdout)
    finally:
        # Put the environment back exactly as it was before the test
        if previous_value is None:
            os.environ.pop(key, None)
        else:
            os.environ[key] = previous_value
    return