def suggest_naming(self, file_path: Path, prj_root: Optional[Path] = None):
        """
        Processes a file to get its lemmas and runs the model to get predictions.
        """
        # Figure out which project we're in, then load configs
        if prj_root is None:
            prj_root = RoosterizeDirUtils.auto_infer_project_root(file_path)
        self.load_configs(prj_root)

        # Infer SerAPI options
        serapi_options = self.infer_serapi_options(prj_root)

        # If the user provided compile_cmd, compile the project first
        if self.compile_cmd is not None:
            with IOUtils.cd(prj_root):
                BashUtils.run(self.compile_cmd, expected_return_code=0)

        # Parse file
        data = self.parse_file(file_path, prj_root, serapi_options)

        # Load model
        self.load_local_model(prj_root)
        model = self.get_model()

        # Use the model to make predictions
        # Temp dirs for processed data and results
        temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))

        # Dump lemmas & definitions
        temp_raw_data_dir = temp_data_dir / "raw"
        temp_raw_data_dir.mkdir()
        IOUtils.dump(
            temp_raw_data_dir / "lemmas.json",
            IOUtils.jsonfy(data.lemmas),
            IOUtils.Format.json,
        )
        IOUtils.dump(
            temp_raw_data_dir / "definitions.json",
            IOUtils.jsonfy(data.definitions),
            IOUtils.Format.json,
        )

        # Model-specific process
        temp_processed_data_dir = temp_data_dir / "processed"
        temp_processed_data_dir.mkdir()
        model.process_data_impl(temp_raw_data_dir, temp_processed_data_dir)

        # Invoke eval
        candidates_logprobs = model.eval_impl(
            temp_processed_data_dir,
            beam_search_size=self.beam_search_size,
            k=self.k,
        )

        # Clean up temp dirs
        IOUtils.rm_dir(temp_data_dir)

        # Report predictions
        self.report_predictions(data, candidates_logprobs)
        return
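
As an aside, the mkdtemp/rm_dir pair above leaks the temp directory if the model call raises. A minimal, self-contained sketch of the same lifecycle using tempfile.TemporaryDirectory (function and parameter names below are illustrative, not from this codebase) cleans up on any exit path:

import json
import tempfile
from pathlib import Path

def run_in_temp_dir(lemmas: list, definitions: list) -> None:
    # Same flow as above: make a temp dir, dump the JSON inputs, let the
    # model consume them; the context manager removes the directory even
    # when the model call raises.
    with tempfile.TemporaryDirectory(prefix="roosterize") as tmp:
        raw_dir = Path(tmp) / "raw"
        raw_dir.mkdir()
        (raw_dir / "lemmas.json").write_text(json.dumps(lemmas))
        (raw_dir / "definitions.json").write_text(json.dumps(definitions))
        # ... process raw_dir and run the model here ...
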
    def download_global_model(self, force_yes: bool = False):
        """
        Downloads a global Roosterize model.
        """
        global_model_dir = RoosterizeDirUtils.get_global_model_dir()
        if global_model_dir.exists():
            if force_yes:
                ans = True
            else:
                ans = self.ask_for_confirmation(
                    f"A Roosterize model already exists at {global_model_dir}. "
                    f"Do you want to delete it and download again?")
            if not ans:
                return
            IOUtils.rm_dir(global_model_dir)

        self.show_message("Downloading Roosterize model...")

        # Download and unpack
        temp_model_dir = Path(tempfile.mkdtemp(prefix="roosterize"))

        urllib.request.urlretrieve(self.model_url,
                                   str(temp_model_dir / "model.tgz"))
        with IOUtils.cd(temp_model_dir):
            BashUtils.run("tar xzf model.tgz", expected_return_code=0)

            # Move the model to the global model location
            shutil.move(str(Path.cwd() / "model"), global_model_dir)

        # Delete temp dir
        IOUtils.rm_dir(temp_model_dir)

        self.show_message("Finish downloading Roosterize model.")
Example #3
    def prepare_data(self):
        data_prefix = f"{self.eval_setting}-{self.year}"
        data_dir = self.work_dir / "data"

        self.logger.info(
            f"Preparing the data for {self.eval_setting} {self.year} at {data_dir}"
        )
        IOUtils.rm_dir(data_dir)
        IOUtils.mk_dir(data_dir)

        # Copy train/val/test_common/test_standard data
        BashUtils.run(
            f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.train}/train {data_dir}/train",
            expected_return_code=0)
        BashUtils.run(
            f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.val}/valid {data_dir}/valid",
            expected_return_code=0)
        BashUtils.run(
            f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.test_common}/test {data_dir}/{Macros.test_common}",
            expected_return_code=0)
        BashUtils.run(
            f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/test {data_dir}/{Macros.test_standard}",
            expected_return_code=0)

        # Copy vocab
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/vocab* {data_dir}/",
            expected_return_code=0)

        return
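
The cp -r and vocab* copies above could also be done portably with shutil; a sketch under the same directory layout, with the Macros constants replaced by their literal train/val names:

import shutil
from pathlib import Path

def copy_split(model_data_dir: Path, data_dir: Path, data_prefix: str) -> None:
    # Mirrors the "cp -r" calls above for the train/valid splits
    shutil.copytree(model_data_dir / f"{data_prefix}-train" / "train",
                    data_dir / "train")
    shutil.copytree(model_data_dir / f"{data_prefix}-val" / "valid",
                    data_dir / "valid")
    # Mirrors the "cp vocab*" glob
    for vocab_file in (model_data_dir / f"{data_prefix}-train").glob("vocab*"):
        shutil.copy(vocab_file, data_dir)
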
    def parse_file(self, file_path: Path, prj_root: Path, serapi_options: str):
        source_code = IOUtils.load(file_path, IOUtils.Format.txt)
        unicode_offsets = ParserUtils.get_unicode_offsets(source_code)

        with IOUtils.cd(prj_root):
            rel_path = file_path.relative_to(prj_root)
            ast_sexp_str = BashUtils.run(
                f"sercomp {serapi_options} --mode=sexp -- {rel_path}",
                expected_return_code=0).stdout
            tok_sexp_str = BashUtils.run(
                f"sertok {serapi_options} -- {rel_path}",
                expected_return_code=0).stdout

            ast_sexp_list: List[SexpNode] = SexpParser.parse_list(ast_sexp_str)
            tok_sexp_list: List[SexpNode] = SexpParser.parse_list(tok_sexp_str)

            doc = CoqParser.parse_document(
                source_code,
                ast_sexp_list,
                tok_sexp_list,
                unicode_offsets=unicode_offsets,
            )
            doc.file_name = str(rel_path)

            # Collect lemmas & definitions
            lemmas: List[Lemma] = DataMiner.collect_lemmas_doc(
                doc, ast_sexp_list, serapi_options)
            definitions: List[Definition] = DataMiner.collect_definitions_doc(
                doc, ast_sexp_list)

        return ProcessedFile(file_path, source_code, doc, ast_sexp_list,
                             tok_sexp_list, unicode_offsets, lemmas,
                             definitions)
Example #5
    def prepare_data(self):
        data_prefix = f"{self.eval_setting}-{self.year}"
        data_dir = self.work_dir / "data"

        self.logger.info(
            f"Preparing the data for {self.eval_setting} {self.year} at {self.work_dir}"
        )
        IOUtils.rm_dir(data_dir)
        IOUtils.mk_dir(data_dir)

        # Copy train/val/test_common/test_standard data
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/code2seq.train.c2s {self.work_dir}/data/",
            expected_return_code=0)
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.val}/code2seq.val.c2s {self.work_dir}/data/",
            expected_return_code=0)
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/code2seq.test.c2s {self.work_dir}/data/code2seq.{Macros.test_common}.c2s",
            expected_return_code=0)
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/code2seq.test.c2s {self.work_dir}/data/code2seq.{Macros.test_standard}.c2s",
            expected_return_code=0)

        # Copy vocab
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/code2seq.dict.c2s {self.work_dir}/data/",
            expected_return_code=0)
        return
Example #6
    def prepare_code(self):
        IOUtils.rm_dir(self.code_dir)
        IOUtils.mk_dir(self.code_dir.parent)
        with IOUtils.cd(self.code_dir.parent):
            BashUtils.run(f"git clone {self.REPO_URL} {self.code_dir.name}", expected_return_code=0)
        # end with

        with IOUtils.cd(self.code_dir):
            BashUtils.run(f"git checkout {self.REPO_SHA}", expected_return_code=0)
        # end with
        return
Example #7
    def install_coq_project(cls, project: Project, names_projects: Dict[str, Project]) -> None:
        """
        :requires: the project is cloned and checked-out to the desired version.
        """
        if not project.is_cloned:
            project.clone()
            project.checkout(project.data["sha"], is_forced=True)
        # end if

        # Check if the project is already compiled
        confirmation_file = "lpc-installed.txt"
        confirmation_content = project.revision + " " + BashUtils.run("opam list coq -s", expected_return_code=0).stdout.strip()
        if (project.checkout_dir/confirmation_file).is_file() and IOUtils.load(project.checkout_dir/confirmation_file, "txt") == confirmation_content:
            cls.logger.debug(f"Project {project.full_name} already installed")
            return
        # end if

        project.clean()

        # Install dependencies
        for dependency in project.data.get("dependencies", []):
            dependency_project = names_projects.get(dependency)
            if dependency_project is None:  raise Exception(f"Cannot find dependency {dependency}")
            cls.logger.info(f"For Project {project.full_name}, installing dependency {dependency}")
            cls.install_coq_project(dependency_project, names_projects)
        # end for

        if "build_cmd" not in project.data:  raise Exception(f"Project {project.full_name} does not have build_cmd")
        if "install_cmd" not in project.data:  raise Exception(f"Project {project.full_name} does not have install_cmd")

        with IOUtils.cd(project.checkout_dir):
            # Build
            cls.logger.info(f"Project {project.full_name}: Building with {project.data['build_cmd']}")
            r = BashUtils.run(project.data["build_cmd"])
            if r.return_code != 0:
                raise Exception(f"Compilation failed! Return code is {r.return_code}! stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
            else:
                cls.logger.debug(f"Compilation finished. Return code is {r.return_code}. stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
            # end if

            # Install
            cls.logger.info(f"Project {project.full_name}: Installing with {project.data['install_cmd']}")
            r = BashUtils.run(project.data["install_cmd"])
            if r.return_code != 0:
                raise Exception(f"Installation failed! Return code is {r.return_code}! stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
            else:
                cls.logger.debug(f"Installation finished. Return code is {r.return_code}. stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
            # end if

            IOUtils.dump(project.checkout_dir / confirmation_file, confirmation_content, "txt")
        # end with
        return
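
The confirmation-file check above is a stamp-file idempotence pattern; distilled into a self-contained sketch (function names are illustrative):

from pathlib import Path

def is_up_to_date(stamp_file: Path, fingerprint: str) -> bool:
    # Skip expensive work when the stamp written after the last successful
    # run still matches the current fingerprint (e.g., revision + opam state)
    return stamp_file.is_file() and stamp_file.read_text() == fingerprint

def mark_done(stamp_file: Path, fingerprint: str) -> None:
    stamp_file.write_text(fingerprint)
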
Example #8
    def require_collector(cls):
        if cls.is_parallel: return
        if not cls.collector_installed:
            cls.logger.info("Require collector, installing ...")
            with IOUtils.cd(Macros.collector_dir):
                BashUtils.run(f"mvn clean install -DskipTests",
                              expected_return_code=0)
            # end with
            cls.collector_installed = True
        else:
            cls.logger.debug("Require collector, and already installed")
        # end if
        return
Example #9
    def prepare_configs_and_scripts(self, trials: List[int]):
        data_dir = self.work_dir / "data"
        base_config = IOUtils.load(self.base_config_file,
                                   IOUtils.Format.jsonPretty)

        for trial in trials:
            trial_dir = self.work_dir / f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            config = copy.copy(base_config)
            config["data_dir"] = str(data_dir)
            config["model_dir"] = str(trial_dir / "model")
            config["output"] = str(trial_dir / "output.txt")

            config_file = trial_dir / "config.json"
            IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

            training_trace_file = trial_dir / "training-trace.json"

            train_script_file = trial_dir / f"{Macros.train}.sh"
            # The gpu-id argument is necessary for tensorflow, even if we are using CUDA_VISIBLE_DEVICES
            train_script = f"#!/bin/bash\n" \
                           f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                           f"conda activate {self.CONDA_ENV}\n" \
                           f"module load cuda/10.0 cudnn/7.6.2\n" \
                           f"cd {self.code_dir}/translate\n" \
                           f"python3 __main__.py {config_file} --train -v --train-log {training_trace_file} --gpu-id $1 &> {trial_dir}/log-{Macros.train}.txt\n"
            IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {train_script_file}",
                          expected_return_code=0)

            for test_type in [Macros.test_common, Macros.test_standard]:
                output_file = trial_dir / f"output_{test_type}.txt"
                config["output"] = str(output_file)
                test_config_file = trial_dir / f"config_{test_type}.json"
                IOUtils.dump(test_config_file, config,
                             IOUtils.Format.jsonPretty)

                test_script_file = trial_dir / f"{test_type}.sh"
                test_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"module load cuda/10.0 cudnn/7.6.2\n" \
                              f"cd {self.code_dir}/translate\n" \
                              f"python3 __main__.py {test_config_file} --eval {data_dir}/{test_type}/test.token.code {data_dir}/{test_type}/test.token.sbt {data_dir}/{test_type}/test.token.nl --gpu-id $1 &> {trial_dir}/log-{test_type}.txt\n" \
                              f"python3 Bleu.py {data_dir}/{test_type}/test.token.nl {trial_dir}/output_{test_type}.txt {trial_dir}/results_{test_type}.json\n"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}",
                              expected_return_code=0)

        return
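
The chmod +x subprocesses above can be avoided with os-level permission bits; a small sketch:

import stat
from pathlib import Path

def make_executable(script_file: Path) -> None:
    # Equivalent to "chmod +x": add the execute bits to the current mode
    mode = script_file.stat().st_mode
    script_file.chmod(mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
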
Example #10
    def prepare_configs_and_scripts(self, trials: List[int]):

        exp_dir = self.work_dir
        for trial in trials:
            trial_dir = exp_dir/f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            train_script_file = trial_dir/"train.sh"
            train_script = f"#!/bin/bash\n" \
                           f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                           f"module load cuda/10.1 cudnn/7.6.2\n" \
                           f"conda activate {self.CONDA_ENV}\n" \
                           f"cd {self.code_dir}\n" \
                           f"export MKL_SERVICE_FORCE_INTEL=1\n"\
                           f"python3 train.py " \
                           f"-data {self.data_dir}/transformer -save_model {trial_dir}/bestTransformer "\
                           f"-layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8 "\
                           f"-encoder_type transformer -decoder_type transformer -position_encoding "\
                           f"-train_steps 50000  -max_generator_batches 2 -dropout 0.1 "\
                           f"-batch_size 4096 -batch_type tokens -normalization tokens  -accum_count 2 "\
                           f"-optim adam -adam_beta2 0.998 -decay_method noam -warmup_steps 8000 -learning_rate 2 " \
                           f"-max_grad_norm 0 -param_init 0 -param_init_glorot -early_stopping 10 -keep_checkpoint 1 " \
                           f"-label_smoothing 0.1 -valid_steps 500 -save_checkpoint_steps 500 -report_every 500 " \
                           f"--world_size 1 --gpu_ranks 0 " \
                           f"&> {trial_dir}/train-log.txt\n"
            IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

            for test_type in [Macros.test_common, Macros.test_standard]:

                test_script_file = trial_dir/f"{test_type}.sh"
                output_file = trial_dir / f"output_{test_type}.txt"
                test_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"module load cuda/10.1 cudnn/7.6.2\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"cd {self.code_dir}\n" \
                              f"export MKL_SERVICE_FORCE_INTEL=1\n"\
                              f"python3 translate.py "\
                              f"--model {trial_dir}/*.pt --output {output_file} --src {self.data_dir}/src-{test_type}.txt "\
                              f"&> {trial_dir}/{test_type}-log.txt\n" \
                              f"python3 eval_utils.py " \
                              f"{self.data_dir}/tgt-{test_type}.txt {output_file} {trial_dir}/results_{test_type}.json\n"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)

            # end for

        return
Example #11
    def prepare_data(self):
        data_prefix = f"{self.eval_setting}-{self.year}"
        IOUtils.rm_dir(self.data_dir)
        IOUtils.mk_dir(self.data_dir)

        BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/transformer.* {self.data_dir}/",
                      expected_return_code=0)

        BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/src-test.txt {self.data_dir}/src-{Macros.test_common}.txt", expected_return_code=0)
        BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/tgt-test.txt {self.data_dir}/tgt-{Macros.test_common}.txt", expected_return_code=0)

        BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/src-test.txt {self.data_dir}/src-{Macros.test_standard}.txt", expected_return_code=0)
        BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/tgt-test.txt {self.data_dir}/tgt-{Macros.test_standard}.txt", expected_return_code=0)

        return
Example #12
    def test_for_each_revision(self):
        project = self.get_a_test_project()
        with TestSupport.get_playground_path():
            Project.set_downloads_dir(Path.cwd() / "_downloads")
            Project.set_results_dir(Path.cwd() / "_results")

            # Clone
            project.clone()

            # Set up results
            project.init_results()

            # Get all revisions, compare with dumped version
            all_revisions = project.get_all_revisions()

            if len(all_revisions) < 10:
                print(
                    "Too few revisions (<10) to test for_each_revision. Skipping."
                )
                return

            # For each revision, count number of files
            project.for_each_revision(
                lambda p, r: p.results.dump_revision_result(
                    r, "count_files.json",
                    BashUtils.run("git ls-files | wc -l").stdout),
                all_revisions[-10:])
            project.for_each_revision(lambda p, r: self.assertIsNotNone(
                p.results.load_revision_result(r, "count_files.json")),
                                      all_revisions[-10:],
                                      is_auto_checkout=False)
        # end with
        return
Example #13
    def prepare_configs_and_scripts(self, trials: List[int]):
        with open(self.base_config_file, "r") as f:
            base_config = yaml.load(f, Loader=yaml.FullLoader)
        exp_dir = self.work_dir
        for trial in trials:
            seed = random.randint(0, 9)
            trial_dir = exp_dir / f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            config = copy.copy(base_config)
            config["data"] = str(self.data_dir / "biLSTM")
            config["save_model"] = str(trial_dir / "bestLSTM")
            config_file = trial_dir / "config.yaml"
            with open(config_file, "w+") as f:
                yaml.dump(config, f)
            train_script_file = trial_dir/"train.sh"
            train_script = f"#!/bin/bash\n" \
                           f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                           f"module load cuda/10.1 cudnn/7.6.2\n" \
                           f"conda activate {self.CONDA_ENV}\n" \
                           f"cd {self.code_dir}\n" \
                           f"export MKL_SERVICE_FORCE_INTEL=1\n"\
                           f"python3 train.py --config {config_file} --world_size 1 --gpu_ranks 0 -keep_checkpoint 1 " \
                           f"--seed {seed} &> {trial_dir}/train-log.txt\n"
            IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

            for test_type in [Macros.test_common, Macros.test_standard]:
                test_script_file = trial_dir/f"{test_type}.sh"
                output_file = trial_dir / f"output_{test_type}.txt"
                test_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"module load cuda/10.1 cudnn/7.6.2\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"cd {self.code_dir}\n" \
                              f"export MKL_SERVICE_FORCE_INTEL=1\n"\
                              f"python3 translate.py "\
                              f"--model {trial_dir}/*.pt --output {output_file} --src {self.data_dir}/src-{test_type}.txt "\
                              f"&> {trial_dir}/{test_type}-log.txt\n" \
                              f"python3 eval_utils.py " \
                              f"{self.data_dir}/tgt-{test_type}.txt {output_file} {trial_dir}/results_{test_type}.json\n"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)

            # end for
        return
Example #14
    def process_data(self, model_data_dir: Path, data_prefix: str):
        """
        Assume we have the raw data files generated by the Bi-LSTM model processor: src-train.txt, tgt-train.txt, src-val.txt, tgt-val.txt
        :param model_data_dir: the dir for storing the data for the transformer.
        :param data_prefix: e.g. evo-2020, mixedproj-2020
        :return:
        """
        self.logger.info(f"Start processing")

        BashUtils.run(
            f"onmt_preprocess -train_src {model_data_dir}/{data_prefix}-{Macros.train}/src-train.txt "
            f"-train_tgt {model_data_dir}/{data_prefix}-{Macros.train}/tgt-train.txt "
            f"-valid_src {model_data_dir}/{data_prefix}-{Macros.val}/src-val.txt "
            f"-valid_tgt {model_data_dir}/{data_prefix}-{Macros.val}/tgt-val.txt "
            f"-save_data {model_data_dir}/{data_prefix}-{Macros.train}/transformer --src_seq_length 200"
            f" --src_seq_length_trunc 200 --shard_size 0",
            expected_return_code=0)
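
The same onmt_preprocess invocation, sketched with a subprocess argv list to avoid shell quoting (assumes OpenNMT-py is installed; the flags are copied from the command above):

import subprocess
from pathlib import Path

def run_onmt_preprocess(train_src: Path, train_tgt: Path,
                        valid_src: Path, valid_tgt: Path,
                        save_data: Path) -> None:
    subprocess.run(
        ["onmt_preprocess",
         "-train_src", str(train_src), "-train_tgt", str(train_tgt),
         "-valid_src", str(valid_src), "-valid_tgt", str(valid_tgt),
         "-save_data", str(save_data),
         "--src_seq_length", "200", "--src_seq_length_trunc", "200",
         "--shard_size", "0"],
        check=True)  # raises CalledProcessError on non-zero exit
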
Example #15
    def get_cur_cluster(cls) -> str:
        hostname = BashUtils.run(f"hostname").stdout.strip()
        if hostname.endswith("maverick2.tacc.utexas.edu"):
            return cls.maverick2
        elif hostname.endswith("stampede2.tacc.utexas.edu"):
            return cls.stampede2
        else:
            cls.logger.warning("Currently not on TACC")
            return cls.maverick2
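
The hostname lookup can also be done without a subprocess; a sketch using socket.getfqdn, with the suffixes and fallback taken from the method above:

import socket

def get_cur_cluster_local() -> str:
    hostname = socket.getfqdn()
    if hostname.endswith("maverick2.tacc.utexas.edu"):
        return "maverick2"
    if hostname.endswith("stampede2.tacc.utexas.edu"):
        return "stampede2"
    # Mirrors the fallback above (warn-and-default behavior elided)
    return "maverick2"
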
Example #16
    def test_propagate_env(self):
        del os.environ[self.TEST_ENV_A_KEY]
        self.assertTrue(self.TEST_ENV_A_KEY not in os.environ)
        self.assertEqual(
            self.TEST_ENV_A_VALUE,
            BashUtils.run(
                f"export {self.TEST_ENV_A_KEY}={self.TEST_ENV_A_VALUE}; echo -n ${self.TEST_ENV_A_KEY}",
                is_update_env=True).stdout)
        self.assertEqual(self.TEST_ENV_A_VALUE,
                         os.environ[self.TEST_ENV_A_KEY])
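
One plausible implementation of the is_update_env behavior exercised by this test (an assumption about BashUtils internals, not its actual code): run the command in a shell, dump the environment after it, and fold the dump back into os.environ.

import os
import subprocess

def run_and_propagate_env(command: str) -> str:
    marker = "__ENV_DUMP__"
    completed = subprocess.run(
        f"{command}; echo {marker}; env",
        shell=True, capture_output=True, text=True, check=True)
    stdout, _, env_dump = completed.stdout.partition(f"{marker}\n")
    for line in env_dump.splitlines():
        key, sep, value = line.partition("=")
        if sep:  # naive parse: assumes single-line values
            os.environ[key] = value
    return stdout
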
Example #17
    def require_special_repo(cls, directory: Path, branch: str):
        cls.logger.info(f"Updating {directory} to {branch} branch")
        if directory.exists():
            if not directory.is_dir() or not (directory / ".git").is_dir():
                LoggingUtils.log_and_raise(
                    cls.logger,
                    f"Path {directory} already exists but is not a proper git repository!",
                    Exception)
            # end if

            with IOUtils.cd(directory):
                BashUtils.run(f"git pull", expected_return_code=0)
            # end with
        else:
            IOUtils.mk_dir(directory)
            with IOUtils.cd(directory):
                BashUtils.run(
                    f"git clone --single-branch -b {branch} -- {cls.get_git_url()} .",
                    expected_return_code=0)
Example #18
    def tacc_submit_jobs(cls,
                         submit_script: Path,
                         titles: List[str],
                         scripts: List[Path],
                         timeouts: List[str],
                         output_dir: Path,
                         submit_cd: int = 600,
                         max_jobs: int = 4):
        job_i = 0
        while job_i < len(scripts):
            if cls.tacc_get_num_jobs() >= max_jobs:
                cls.logger.warning(
                    f"Number of running jobs reach limit {max_jobs}, will retry after {submit_cd} seconds at {time.strftime('%a, %d %b %Y %H:%M:%S +0000', time.localtime(time.time()+submit_cd))}"
                )
                time.sleep(submit_cd)
                continue
            # end if

            title = titles[job_i]
            script = scripts[job_i]
            timeout = timeouts[job_i]
            cls.logger.info(f"Submitting script {script}")

            try:
                BashUtils.run(
                    f"{submit_script} \"{title}\" \"{output_dir}\" \"{script}\" \"{timeout}\"",
                    expected_return_code=0)
            except KeyboardInterrupt:
                cls.logger.warning(f"Keyboard interrupt!")
                break
            except:
                cls.logger.warning(
                    f"Failed to submit, will retry after {submit_cd} seconds at {time.strftime('%a, %d %b %Y %H:%M:%S +0000', time.localtime(time.time()+submit_cd))}"
                )
                time.sleep(submit_cd)
                continue
            # end try

            # Submit successfully
            job_i += 1
        # end while
        return
Example #19
    def prepare_configs_and_scripts(self, trials: List[int]):
        exp_dir = self.work_dir

        for trial in trials:
            trial_dir = exp_dir / f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            model_dir = trial_dir / "models"
            IOUtils.mk_dir(model_dir)
            log_dir = trial_dir / "logs"
            IOUtils.mk_dir(log_dir)
            data = str(exp_dir / "data/code2seq")
            val_data = data + ".val.c2s"
            train_log = trial_dir / "training-trace.json"

            train_script_file = trial_dir / f"{Macros.train}.sh"
            # Copy config file
            BashUtils.run(
                f"cp {self.base_config_file} {trial_dir}/config.yaml",
                expected_return_code=0)
            output_file = trial_dir / "output_tmp.txt"
            reference_file = trial_dir / "ref_tmp.txt"
            config_file = trial_dir / "config.yaml"
            train_script = f"#!/bin/bash\n" \
                           f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                           f"conda activate {self.CONDA_ENV}\n" \
                           f"module load cuda/10.0 cudnn/7.6.2\n" \
                           f"cd {self.code_dir}\n" \
                           f"python -u code2seq.py --data {data} --test {val_data} --log {train_log} --config {config_file} " \
                           f"--pred_file {output_file} --ref_file {reference_file} "\
                           f"--save_prefix {model_dir}/model --gpu_id $1 &> {trial_dir}/train-log.txt"
            IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {train_script_file}",
                          expected_return_code=0)

            for test_type in [Macros.test_common, Macros.test_standard]:
                test_data = exp_dir / "data" / f"code2seq.{test_type}.c2s"
                output_file = trial_dir / f"output_{test_type}.txt"
                reference_file = trial_dir / f"ref_{test_type}.txt"
                test_script_file = trial_dir / f"{test_type}.sh"
                test_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"module load cuda/10.0 cudnn/7.6.2\n" \
                              f"cd {self.code_dir}\n" \
                              f"python3 code2seq.py --load {model_dir}/model_best --test {test_data} --config {config_file} " \
                              f"--pred_file {output_file} --ref_file {reference_file} "\
                              f"--gpu_id $1 &> {trial_dir}/{test_type}-log.txt\n" \
                              f"python3 eval_utils.py {reference_file} {output_file} {trial_dir}/results_{test_type}.json\n"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}",
                              expected_return_code=0)

        return
Example #20
    def process_data(self,
                     method_data_list: List[MethodData],
                     data_type: str,
                     output_dir: Path,
                     split: bool = True):
        Environment.require_collector()

        log_file = output_dir / "collector-log.txt"
        data_file = output_dir / "method-data.json"
        IOUtils.dump(data_file, IOUtils.jsonfy(method_data_list),
                     IOUtils.Format.json)

        config = {
            "transform": True,
            "model": "BiLSTM",
            "dataType": data_type,
            "dataFile": str(data_file),
            "logFile": str(log_file),
            "outputDir": str(output_dir),
        }
        config_file = output_dir / "collector-config.json"
        IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

        self.logger.info(
            f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}"
        )
        rr = BashUtils.run(
            f"java -jar {Environment.collector_jar} {config_file}",
            expected_return_code=0)
        if rr.stdout:
            self.logger.warning(f"Stdout of collector:\n{rr.stdout}")
        # end if
        if rr.stderr:
            self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
        # end if
        # build raw dataset
        if split:
            self.tokenizeFile(output_dir / f"{data_type}.raw.txt", data_type)
        else:
            self.noSplit(output_dir / f"{data_type}.raw.txt", data_type)

        error_ids = IOUtils.load(str(output_dir) + "-error-ids.json")
        print(f"Number of error id is: {len(error_ids)}")
        # BashUtils.run(f"rm {output_dir}-error-ids.json", expected_return_code=0)
        return error_ids
Example #21
    def prepare_code(self):
        IOUtils.rm_dir(self.code_dir)
        IOUtils.mk_dir(self.code_dir.parent)
        with IOUtils.cd(self.code_dir.parent):
            BashUtils.run(f"git clone {self.REPO_URL} {self.code_dir.name}", expected_return_code=0)
        # end with

        with IOUtils.cd(self.code_dir):
            BashUtils.run(f"git checkout {self.REPO_SHA}", expected_return_code=0)
        # end with

        # copy eval code
        BashUtils.run(f"cp {Macros.this_dir}/eval/eval_utils.py {self.code_dir}/")
        return
Example #22
    def collect_project(self, project_name: str, project_url: str):
        Environment.require_collector()

        # 0. Download repo
        downloads_dir = self.repos_downloads_dir / project_name
        results_dir = self.repos_results_dir / project_name

        # Remove previous results if any
        IOUtils.rm_dir(results_dir)
        IOUtils.mk_dir(results_dir)

        # Clone the repo if not exists
        if not downloads_dir.exists():
            with IOUtils.cd(self.repos_downloads_dir):
                with TimeUtils.time_limit(300):
                    BashUtils.run(f"git clone {project_url} {project_name}",
                                  expected_return_code=0)
                # end with
            # end with
        # end if

        project_data = ProjectData.create()
        project_data.name = project_name
        project_data.url = project_url

        # 1. Get list of revisions
        with IOUtils.cd(downloads_dir):
            git_log_out = BashUtils.run(f"git log --pretty=format:'%H %P'",
                                        expected_return_code=0).stdout
            for line in git_log_out.splitlines()[:self.MAX_REVISIONS]:
                shas = line.split()
                project_data.revisions.append(shas[0])
                project_data.parent_revisions[shas[0]] = shas[1:]
            # end for
        # end with

        # 2. Get revisions in different years
        with IOUtils.cd(downloads_dir):
            for year in self.YEARS:
                git_log_out = BashUtils.run(
                    f"git rev-list -1 --before=\"Jan 1 {year}\" origin",
                    expected_return_code=0).stdout
                project_data.year_revisions[str(year) +
                                            "_Jan_1"] = git_log_out.rstrip()
            # end for
        # end with

        project_data_file = results_dir / "project.json"
        IOUtils.dump(project_data_file, IOUtils.jsonfy(project_data),
                     IOUtils.Format.jsonPretty)

        # 3. Start the Java collector
        # Prepare config
        log_file = results_dir / "collector-log.txt"
        output_dir = results_dir / "collector"

        config = {
            "collect": True,
            "projectDir": str(downloads_dir),
            "projectDataFile": str(project_data_file),
            "logFile": str(log_file),
            "outputDir": str(output_dir),
            "year":
            True  # To indicate whether to collect all evo data or yearly data
        }
        config_file = results_dir / "collector-config.json"
        IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

        self.logger.info(
            f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}"
        )
        rr = BashUtils.run(
            f"java -jar {Environment.collector_jar} {config_file}",
            expected_return_code=0)
        if rr.stderr:
            self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
        # end if

        # 4. In some cases, save collected data to an appropriate location or database
        # TODO private info
        # On luzhou server for user pynie, move it to a dedicated location at /user/disk2
        if BashUtils.run(
                f"hostname").stdout.strip() == "luzhou" and BashUtils.run(
                    f"echo $USER").stdout.strip() == "pynie":
            alter_results_dir = Path(
                "/home/disk2/pynie/csevo-results") / project_name
            IOUtils.rm_dir(alter_results_dir)
            IOUtils.mk_dir(alter_results_dir.parent)
            BashUtils.run(f"mv {results_dir} {alter_results_dir}")
            self.logger.info(f"Results moved to {alter_results_dir}")
        # end if

        # 5. Remove repo
        IOUtils.rm_dir(downloads_dir)
        return
Example #23
    def process(self, model: str, output_dir: Path, task: str, year: int,
                eval_setting: str):
        """
        Main entry for processors of different models.
        :param model: the model name, one of {"DeepCom", "DeepCom-Preorder", "Bi-LSTM", "no-split-Bi-LSTM", "Transformer", "ASTAttendGRU", "Code2Seq"}
        :param output_dir: the output directory (usually data/models)
        :param task: the task name, either "CG" or "MN"
        :param year: the year that the testing data should be on
        :param eval_setting: the evaluation setting, one of {"evo", "crossproj", "mixedproj"}
        """
        # TODO: Only support the latest year for now
        assert year == self.EVO_YEARS[-1]
        assert task in self.TASKS.keys()

        model_data_dir = output_dir / model

        if model == "DeepCom":
            from csevo.processor.DeepComProcessor import DeepComProcessor
            processor = DeepComProcessor()
        elif model == "DeepCom-Preorder":
            from csevo.processor.DeepComProcessor import DeepComProcessor
            processor = DeepComProcessor()
        elif model == "Bi-LSTM":
            from csevo.processor.BiLSTMProcessor import BiLSTMProcessor
            processor = BiLSTMProcessor()
        elif model == "no-split-Bi-LSTM":
            from csevo.processor.BiLSTMProcessor import BiLSTMProcessor
            processor = BiLSTMProcessor()
        elif model == "Transformer":
            from csevo.processor.TransformerProcessor import TransformerProcessor
            processor = TransformerProcessor()
            data_prefix = f"{eval_setting}-{year}"
            processor.process_data(model_data_dir, data_prefix)
            return
        elif model == "ASTAttendGRU":
            from csevo.processor.ASTAttendGRUProcessor import ASTAttendGRUProcessor
            processor = ASTAttendGRUProcessor()
        elif model == "Code2Seq":
            from csevo.processor.Code2SeqProcessor import Code2SeqProcessor
            processor = Code2SeqProcessor()
        else:
            raise ValueError(f"Illegal model {model}")
        # end if
        error_ids = None

        # Load dataset after split (from shared directory)
        shared_data_dir = output_dir / f"{task}-shared"
        self.logger.info(f"Loading dataset from {shared_data_dir}")
        data_type_2_data_list: Dict[str, List] = dict()
        data_type_2_data_list[Macros.test_common] = IOUtils.load(
            shared_data_dir / f"{year}-{Macros.test_common}.json",
            IOUtils.Format.json)
        for dt in [Macros.train, Macros.val, Macros.test_standard]:
            data_type_2_data_list[dt] = IOUtils.load(
                shared_data_dir / f"{eval_setting}-{year}-{dt}.json",
                IOUtils.Format.json)

        # Process each set
        for data_type, data_list in data_type_2_data_list.items():
            sub_dir_name = f"{eval_setting}-{year}-{data_type}"

            if data_type in [Macros.test_common, Macros.test_standard]:
                data_type_tvt = Macros.test
            else:
                data_type_tvt = data_type

            model_dt_output_dir = model_data_dir / sub_dir_name
            IOUtils.mk_dir(model_dt_output_dir)
            if model == "DeepCom":
                error_ids = processor.process_data(data_list, data_type_tvt,
                                                   model_dt_output_dir, "sbt")
            elif model == "DeepCom-Preorder":
                error_ids = processor.process_data(data_list, data_type_tvt,
                                                   model_dt_output_dir,
                                                   "Preorder")
            elif model == "Code2Seq":
                error_ids = processor.process_data(data_list, data_type_tvt,
                                                   model_dt_output_dir)
            elif model == "Bi-LSTM":
                processor.process_data(data_list, data_type_tvt,
                                       model_dt_output_dir)
            elif model == "no-split-Bi-LSTM":
                processor.process_data(data_list,
                                       data_type_tvt,
                                       model_dt_output_dir,
                                       split=False)
            if error_ids is not None:
                self.logger.warning(f"Error data count: {len(error_ids)}")
                IOUtils.dump(model_data_dir / f"error-ids-{sub_dir_name}.json",
                             error_ids, IOUtils.Format.json)
        # extra step for Open-NMT data
        if model == "Bi-LSTM" or model == "no-split-Bi-LSTM":
            # build dataset used by Open-NMT
            BashUtils.run(
                f"onmt_preprocess -train_src {model_data_dir}/{eval_setting}-{year}-{Macros.train}/src-train.txt "
                f"-train_tgt {model_data_dir}/{eval_setting}-{year}-{Macros.train}/tgt-train.txt "
                f"-valid_src {model_data_dir}/{eval_setting}-{year}-{Macros.val}/src-val.txt "
                f"-valid_tgt {model_data_dir}/{eval_setting}-{year}-{Macros.val}/tgt-val.txt "
                f"-save_data {model_data_dir}/{eval_setting}-{year}-{Macros.train}/biLSTM --src_seq_length 200 --src_seq_"
                f"length_trunc 200",
                expected_return_code=0)

        return
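
The if/elif model dispatch above could be condensed into a lookup table; a sketch with lazy imports, reusing the same module paths as process() (the helper name is illustrative):

def make_processor(model: str):
    def deepcom():
        from csevo.processor.DeepComProcessor import DeepComProcessor
        return DeepComProcessor()

    def bilstm():
        from csevo.processor.BiLSTMProcessor import BiLSTMProcessor
        return BiLSTMProcessor()

    # Both DeepCom variants and both Bi-LSTM variants share a processor class
    factories = {
        "DeepCom": deepcom,
        "DeepCom-Preorder": deepcom,
        "Bi-LSTM": bilstm,
        "no-split-Bi-LSTM": bilstm,
    }
    try:
        return factories[model]()
    except KeyError:
        raise ValueError(f"Illegal model {model}") from None
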
Example #24
    def extract_data_project(
        cls,
        project_path: Path,
        files: Optional[List[str]],
        exclude_files: Optional[List[str]],
        exclude_pattern: Optional[str],
        serapi_options: str,
        output_path: Path,
    ):
        # 1. Prepare output path
        if output_path.is_dir():
            cls.logger.warning(
                f"{output_path} already exists, will overwrite the files.")
        elif output_path.is_file():
            LoggingUtils.log_and_raise(
                cls.logger,
                f"{output_path} already exists as a file. Aborting.",
                Exception)
        else:
            IOUtils.mk_dir(output_path)
        # end if

        # 2. Extract documents, tok.sexp and ast.sexp
        coq_documents: Dict[str, CoqDocument] = collections.OrderedDict()
        ast_sexp_lists: Dict[str, List[SexpNode]] = dict()
        tok_sexp_lists: Dict[str, List[SexpNode]] = dict()

        with IOUtils.cd(project_path):
            coq_files: List[str] = BashUtils.run(
                f"find -name '*.v' -type f").stdout.split("\n")[:-1]
            coq_files = [coq_file[2:] for coq_file in coq_files]

            if files is not None:
                coq_files = [f for f in coq_files if f in files]
            # end if

            if exclude_files is not None:
                coq_files = [f for f in coq_files if f not in exclude_files]
            # end if

            if exclude_pattern is not None:
                re_exclude_pattern = re.compile(exclude_pattern)
                coq_files = [
                    f for f in coq_files if not re_exclude_pattern.fullmatch(f)
                ]
            # end if

            for i, coq_file in enumerate(tqdm(coq_files)):
                try:
                    # Read file
                    with open(coq_file, "r", newline="") as f:
                        source_code = f.read()
                    # end with

                    # Get unicode offsets
                    unicode_offsets = ParserUtils.get_unicode_offsets(
                        source_code)

                    # Call SerAPI
                    ast_sexp_str: str = BashUtils.run(
                        f"sercomp {serapi_options} --mode=sexp -- {coq_file}",
                        expected_return_code=0).stdout
                    tok_sexp_str: str = BashUtils.run(
                        f"sertok {serapi_options} -- {coq_file}",
                        expected_return_code=0).stdout

                    # Parse ast sexp
                    ast_sexp_list: List[SexpNode] = SexpParser.parse_list(
                        ast_sexp_str)
                    tok_sexp_list: List[SexpNode] = SexpParser.parse_list(
                        tok_sexp_str)

                    # Parse the document
                    coq_document = CoqParser.parse_document(
                        source_code,
                        ast_sexp_list,
                        tok_sexp_list,
                        unicode_offsets=unicode_offsets)

                    # Set meta data
                    coq_document.file_name = coq_file
                    coq_document.project_name = project_path.name

                    coq_documents[coq_file] = coq_document
                    ast_sexp_lists[coq_file] = ast_sexp_list
                    tok_sexp_lists[coq_file] = tok_sexp_list
                except KeyboardInterrupt:
                    cls.logger.warning("Keyboard interrupt!")
                    raise
                except:
                    cls.logger.warning(
                        f"File {coq_file} failed! Exception was: {traceback.format_exc()}"
                    )
                    continue
                # end try
            # end for

            # 3. Extract and save lemmas and definitions
            lemmas: List[Lemma] = list()
            definitions: List[Definition] = list()

            # Increase recursion limit because the backend sexps are CRAZZZZY deep
            sys.setrecursionlimit(10000)

            for file_path, doc in tqdm(coq_documents.items()):
                ast_sexp_list = ast_sexp_lists[file_path]
                lemmas_doc = cls.collect_lemmas_doc(doc, ast_sexp_list,
                                                    serapi_options)
                lemmas.extend(lemmas_doc)
                definitions_doc = cls.collect_definitions_doc(
                    doc, ast_sexp_list)
                definitions.extend(definitions_doc)
            # end for

            IOUtils.dump(output_path / "lemmas.json", IOUtils.jsonfy(lemmas),
                         IOUtils.Format.json)
            IOUtils.dump(output_path / "definitions.json",
                         IOUtils.jsonfy(definitions), IOUtils.Format.json)
        # end with
        return
Example #25
    def tacc_get_num_jobs(cls) -> int:
        # Subtract 1 for the header line that squeue always prints
        return int(
            BashUtils.run(f"squeue -u {os.getenv('USER')} | wc -l").stdout) - 1
Example #26
    def collect_coq_documents_project(
        cls,
        data_mgr: FilesManager,
        project: Project,
        names_projects: Dict[str, Project],
        files: List[str] = None,
        is_verifying_tokenizer: bool = False,
    ) -> List[CoqDocument]:
        coq_documents: List[CoqDocument] = list()

        # Clone and checkout repo
        project.clone()
        project.checkout(project.data["sha"], is_forced=True)

        # Build the project
        cls.install_coq_project(project, names_projects)

        # For each file, parse code to tokens
        with IOUtils.cd(project.checkout_dir):
            coq_files: List[str] = BashUtils.run(
                f"find -name '*.v' -type f").stdout.split("\n")[:-1]
            if files is not None:
                coq_files = [f for f in coq_files
                             if f[2:] in files]  # [2:] is to remove the ./
            # end if
            re_ignore_path = re.compile(
                project.data["ignore_path_regex"]
            ) if "ignore_path_regex" in project.data else None
            for i, coq_file in enumerate(coq_files):
                try:
                    coq_file = coq_file[2:]
                    cls.logger.debug(
                        f"File {i + 1}/{len(coq_files)}: {coq_file}")

                    # Check if file is ignored
                    if re_ignore_path is not None and re_ignore_path.fullmatch(
                            coq_file):
                        cls.logger.info(f"Ignoring file {coq_file}")
                        continue
                    # end if

                    # Read file
                    with open(coq_file, "r", newline="") as f:
                        source_code = f.read()
                    # end with

                    # Get unicode offsets
                    unicode_offsets = ParserUtils.get_unicode_offsets(
                        source_code)

                    # Save original file to original_files
                    data_mgr.dump_data([
                        FilesManager.ORIGINAL_FILES, project.full_name,
                        coq_file
                    ], source_code, IOUtils.Format.txt)

                    # Call SerAPI
                    serapi_options = project.data.get("serapi_options", "")
                    ast_sexp_str: str = BashUtils.run(
                        f"sercomp {serapi_options} --mode=sexp -- {coq_file}",
                        expected_return_code=0).stdout
                    tok_sexp_str: str = BashUtils.run(
                        f"sertok {serapi_options} -- {coq_file}",
                        expected_return_code=0).stdout

                    # Save ast sexp to dataset (.ast.sexp)
                    data_mgr.dump_data([
                        FilesManager.RAW_FILES, project.full_name,
                        coq_file[:-2] + ".ast.sexp"
                    ], ast_sexp_str, IOUtils.Format.txt)

                    # Save tok sexp to dataset (.tok.sexp)
                    data_mgr.dump_data([
                        FilesManager.RAW_FILES, project.full_name,
                        coq_file[:-2] + ".tok.sexp"
                    ], tok_sexp_str, IOUtils.Format.txt)

                    # Parse ast sexp
                    ast_sexp_list: List[SexpNode] = SexpParser.parse_list(
                        ast_sexp_str)
                    tok_sexp_list: List[SexpNode] = SexpParser.parse_list(
                        tok_sexp_str)

                    # Verify the tokenizer if requested
                    if is_verifying_tokenizer:
                        if not cls.verify_tokenizer(tok_sexp_list, source_code,
                                                    unicode_offsets):
                            LoggingUtils.log_and_raise(
                                cls.logger,
                                "Tokenized content doesn't match original file!",
                                Exception)
                        # end if
                    # end if

                    # Parse the document
                    coq_document = CoqParser.parse_document(
                        source_code,
                        ast_sexp_list,
                        tok_sexp_list,
                        unicode_offsets=unicode_offsets)

                    # Save the parsed document (printed format) to raw_files
                    data_mgr.dump_data(
                        [FilesManager.RAW_FILES, project.full_name, coq_file],
                        coq_document.str_with_space(), IOUtils.Format.txt)

                    # Set meta data
                    coq_document.file_name = coq_file
                    coq_document.project_name = project.full_name
                    coq_document.revision = project.revision

                    coq_documents.append(coq_document)
                except KeyboardInterrupt:
                    cls.logger.warning("Keyboard interrupt!")
                    raise
                except:
                    cls.logger.warning(
                        f"File {coq_file} failed! Exception was: {traceback.format_exc()}"
                    )
                    continue
                # end try
            # end for
        # end with

        return coq_documents
Example #27
    def prepare_data(self):
        if not self.use_latest:
            for t in range(13, 18):
                exp_dir = self.work_dir / f"{t}{t+1}-train"
                self.logger.info(
                    f"Preparing the data for {t}-{t+1} at {exp_dir}")
                IOUtils.rm_dir(exp_dir)
                IOUtils.mk_dir(exp_dir)

                # Copy train data
                BashUtils.run(
                    f"cp -r {self.model_data_dir}/20{t}-20{t+1}-train/train {exp_dir}/",
                    expected_return_code=0)

                # Copy val test data
                BashUtils.run(
                    f"cp -r {self.model_data_dir}/20{t+1}-20{t+2}-val/valid {exp_dir}/",
                    expected_return_code=0)
                BashUtils.run(
                    f"cp -r {self.model_data_dir}/20{t+2}-20{t+3}-test/test {exp_dir}/",
                    expected_return_code=0)

                # Copy vocab
                BashUtils.run(
                    f"cp {self.model_data_dir}/20{t}-20{t+1}-train/vocab* {exp_dir}/",
                    expected_return_code=0)
            # end for
        else:
            exp_dir = self.work_dir / "latest"
            IOUtils.rm_dir(exp_dir)
            IOUtils.mk_dir(exp_dir)
            # Copy Train data
            BashUtils.run(
                f"cp -r {self.model_data_dir}/latest/train {exp_dir}/",
                expected_return_code=0)

            BashUtils.run(
                f"cp -r {self.model_data_dir}/latest/valid {exp_dir}/",
                expected_return_code=0)
            BashUtils.run(
                f"cp -r {self.model_data_dir}/latest/test {exp_dir}/",
                expected_return_code=0)

            # Copy vocab
            BashUtils.run(f"cp {self.model_data_dir}/latest/vocab* {exp_dir}/",
                          expected_return_code=0)
        return
Example #28
    def prepare_configs_and_scripts(self, trials: List[int]):
        base_config = IOUtils.load(self.base_config_file,
                                   IOUtils.Format.jsonPretty)

        # The per-trial setup is identical for the year-based experiments
        # and the "latest" experiment; only the experiment directories differ
        if not self.use_latest:
            exp_dirs = [self.work_dir / f"{t}{t+1}-train" for t in range(13, 18)]
        else:
            exp_dirs = [self.work_dir / "latest"]

        for exp_dir in exp_dirs:
            for trial in trials:
                trial_dir = exp_dir / f"trial-{trial}"
                IOUtils.mk_dir(trial_dir)

                output_file = trial_dir / "output.txt"

                config = copy.copy(base_config)
                config["data_dir"] = str(exp_dir)
                config["model_dir"] = str(trial_dir / "model")
                config["output"] = str(output_file)

                config_file = trial_dir / "config.json"
                IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

                # All scripts share the same TACC preamble: init conda,
                # activate the env, load CUDA modules, and cd into the code
                preamble = f"#!/bin/bash\n" \
                           f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                           f"conda activate {self.CONDA_ENV}\n" \
                           f"module load cuda/10.0 cudnn/7.6.2\n" \
                           f"cd {self.code_dir}/translate\n"

                train_script_file = trial_dir / "train.sh"
                train_script = preamble + \
                    f"python3 __main__.py {config_file} --train -v --gpu-id $1 &> {trial_dir}/log-train.txt\n"
                IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {train_script_file}",
                              expected_return_code=0)

                test_script_file = trial_dir / "test.sh"
                test_script = preamble + \
                    f"python3 __main__.py {config_file} --eval {exp_dir}/test/test.token.code {exp_dir}/test/test.token.sbt {exp_dir}/test/test.token.nl &> {trial_dir}/log-test.txt\n"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}",
                              expected_return_code=0)

                eval_script_file = trial_dir / "val.sh"
                eval_script = preamble + \
                    f"python3 Bleu.py {exp_dir}/test/test.token.nl {trial_dir}/output.txt {trial_dir}\n"
                IOUtils.dump(eval_script_file, eval_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {eval_script_file}",
                              expected_return_code=0)
            # end for
        # end for
        return
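
A minimal usage sketch for the runner above, assuming a constructor that wires up the fields the method relies on (work_dir, code_dir, base_config_file, use_latest, CONDA_ENV); the class name and constructor arguments here are hypothetical:

# Hypothetical driver; the real runner's construction may differ.
runner = TACCRunner(
    work_dir=Path("/scratch/exp"),
    code_dir=Path("/home/user/code"),
    base_config_file=Path("configs/base.json"),
    use_latest=True,
)
runner.prepare_configs_and_scripts(trials=[0, 1, 2])
# Each trial directory now holds config.json, train.sh, test.sh, and val.sh;
# training on GPU 0 would then be launched with:
#   /scratch/exp/latest/trial-0/train.sh 0
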
Example #29
    @classmethod
    def collect_lemmas_doc(
        cls,
        doc: CoqDocument,
        ast_sexp_list: List[SexpNode],
        serapi_options: str,
    ) -> List[Lemma]:
        lemmas_doc: List[Lemma] = list()
        data_index = doc.get_data_index()

        # Maintain a stack of enclosing modules
        modules: List[str] = list()

        # Prepare qualified name prefix
        qprefix_this_doc = "./" + doc.file_name[:-2]  # Remove .v
        for m in cls.RE_PATH_TO_QUALIFIED_PREFIX.finditer(serapi_options):
            path = m.group("path")
            if path != ".":
                path = "./" + path
            qprefix = m.group("qprefix")

            if qprefix_this_doc.startswith(path):
                qprefix_this_doc = qprefix + qprefix_this_doc[len(path):]
                break
            # end if
        # end for
        if qprefix_this_doc.startswith("./"):
            qprefix_this_doc = qprefix_this_doc[len("./"):]
        qprefix_this_doc = qprefix_this_doc.replace("/", ".")

        for sent_i, sent in enumerate(doc.sentences):
            ast_sexp = ast_sexp_list[sent_i]
            vernac = SexpAnalyzer.analyze_vernac(ast_sexp)

            if vernac.vernac_type in cls.VTYPES_MODULE_BEG:
                # (VernacExpr()(VernacDefineModule()  (  (   v   ( Id <module name>)) ...
                #  0         1 2 20               21  22 220  2201    22011
                module_name = vernac.vernac_sexp[2][2][0][1][1].content_no_quote
                modules.append(module_name)
            elif vernac.vernac_type in cls.VTYPES_MODULE_END:
                # (VernacExpr()(VernacEndSegment  (  (   v   ( Id <module name>)) ...
                #  0         1 2 20               21 210  2101    21011
                try:
                    module_name = vernac.vernac_sexp[2][1][0][1][1].content_no_quote
                except Exception:
                    print(vernac.vernac_sexp.pretty_format())
                    raise
                # end try
                if len(modules) > 0 and module_name == modules[-1]:
                    # EndModule and EndSection share the same vernac type
                    modules.pop()
            elif vernac.vernac_type in cls.VTYPES_LEMMA:
                # (VernacExpr()(VernacStartTheoremProof Lemma ( ( ( ( ( v (       Id <lemma name>))
                #  0         1 2 20                     21   22   2200000 2200001    22000011
                lemma = Lemma()
                lemma.data_index = data_index

                lemma.name = vernac.vernac_sexp[2][2][0][0][0][0][1][1].content_no_quote
                lemma.qname = qprefix_this_doc + "." + ".".join(modules + [lemma.name])

                # Find lemma content, after the first token matching the lemma name
                tok_i = 0
                for tok in sent.tokens:
                    if tok.content == lemma.name:
                        break
                    tok_i += 1
                # end for
                if tok_i == len(sent.tokens):
                    LoggingUtils.log_and_raise(
                        cls.logger,
                        f"Lemma name {lemma.name} didn't appear in the source code {sent.str_with_space()}",
                        Exception)

                lemma.vernac_command = sent.tokens[:tok_i]
                lemma.statement = sent.tokens[tok_i + 1:]
                lemma.ast_sexp = vernac.vernac_sexp

                lemmas_doc.append(lemma)
            # end if
        # end for

        # Use sername to get the backend representations
        lemma_qnames: str = "".join([l.qname + "\n" for l in lemmas_doc])
        lemma_qnames_file = BashUtils.get_temp_file()
        IOUtils.dump(lemma_qnames_file, lemma_qnames, IOUtils.Format.txt)

        lemma_qnames_backend_sexps_str: str = BashUtils.run(
            f"sername {serapi_options} --require-lib={qprefix_this_doc} {lemma_qnames_file}",
            expected_return_code=0).stdout
        IOUtils.rm(lemma_qnames_file)
        for qname_backend_sexp_str in lemma_qnames_backend_sexps_str.splitlines():
            qname, backend_sexp_str = qname_backend_sexp_str.split(":", 1)
            backend_sexp = SexpParser.parse(backend_sexp_str)

            for lemma in lemmas_doc:
                if lemma.qname == qname:
                    lemma.backend_sexp = backend_sexp
                    break
                # end if
            # end for
        # end for

        lemmas_doc = [l for l in lemmas_doc if l.backend_sexp is not None]
        return lemmas_doc
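
The qualified-prefix computation above is dense, so here is a self-contained sketch of the same rewriting. It assumes serapi_options carries mappings of the shape "-R <path>,<qprefix>"; the actual RE_PATH_TO_QUALIFIED_PREFIX pattern is not shown in this snippet, so that flag format is an assumption:

import re

# Assumed flag format; the real RE_PATH_TO_QUALIFIED_PREFIX may differ
RE_MAPPING = re.compile(r"-[RQ] (?P<path>\S+),(?P<qprefix>\S+)")

def qualified_prefix(file_name: str, serapi_options: str) -> str:
    qprefix = "./" + file_name[:-2]  # drop the trailing ".v"
    for m in RE_MAPPING.finditer(serapi_options):
        path = m.group("path")
        if path != ".":
            path = "./" + path
        if qprefix.startswith(path):
            qprefix = m.group("qprefix") + qprefix[len(path):]
            break
    if qprefix.startswith("./"):
        qprefix = qprefix[len("./"):]
    return qprefix.replace("/", ".")

# qualified_prefix("theories/Arith.v", "-R theories,MyLib") == "MyLib.Arith"
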
Example #30
    def test_inherit_env(self):
        os.environ[self.TEST_ENV_A_KEY] = self.TEST_ENV_A_VALUE
        self.assertEqual(
            self.TEST_ENV_A_VALUE,
            BashUtils.run(f"echo -n ${self.TEST_ENV_A_KEY}").stdout)
        return
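
For context, the inheritance this test exercises is standard subprocess behavior: a child process receives a copy of the parent's environment unless an explicit env= is passed. A minimal standalone sketch with plain subprocess (not the actual BashUtils implementation):

import os
import subprocess

os.environ["DEMO_VAR"] = "hello"  # hypothetical variable name
# No env= argument, so the child shell inherits os.environ
result = subprocess.run("echo -n $DEMO_VAR", shell=True,
                        capture_output=True, text=True)
assert result.stdout == "hello"
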