Example #1
    def extract_data_from_corpus(cls,
            corpus_path: Path,
            trainevals: List[str],
            groups: List[str],
            output_path: Path,
    ):
        # 1. Prepare output path
        if output_path.is_dir():
            cls.logger.warning(f"{output_path} already exists, will overwrite the files.")
        elif output_path.is_file():
            LoggingUtils.log_and_raise(cls.logger, f"{output_path} already exists as a file. Aborting.", Exception)
        else:
            IOUtils.mk_dir(output_path)
        # end if

        assert all([traineval in Macros.DS_TRAINEVALS for traineval in trainevals])
        assert all([group in Macros.DS_GROUPS+[Macros.DS_GROUP_TA] for group in groups])

        data_mgr = FilesManager(corpus_path)

        # 2. Load lemmas and definitions
        lemmas_filtered: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS_FILTERED], IOUtils.Format.json, is_batched=True, clz=Lemma)
        definitions: List[Definition] = data_mgr.load_data([FilesManager.DEFINITIONS, "definitions.json"], IOUtils.Format.json, clz=Definition)

        # 3. Output to output_path for each combination of traineval and group
        for traineval in trainevals:
            for group in groups:
                IOUtils.mk_dir(output_path/f"{group}-{traineval}")
                data_indexes = IOUtils.load(project_dir/"training"/f"{group}-{traineval}.json", IOUtils.Format.json, clz=str)
                IOUtils.dump(output_path/f"{group}-{traineval}/lemmas.json", IOUtils.jsonfy([l for l in lemmas_filtered if l.data_index in data_indexes]), IOUtils.Format.json)
                IOUtils.dump(output_path/f"{group}-{traineval}/definitions.json", IOUtils.jsonfy([d for d in definitions if d.data_index in data_indexes]), IOUtils.Format.json)
            # end for
        # end for
        return
Example #2
 def test_dejsonfy_basic(self):
     self.assertEqual("aaa", IOUtils.dejsonfy("aaa"))
     self.assertEqual(42, IOUtils.dejsonfy(42))
     self.assertEqual(1.111, IOUtils.dejsonfy(1.111))
     self.assertEqual([1, 2.0, "ccc"], IOUtils.dejsonfy([1, 2.0, "ccc"]))
     self.assertEqual({"f1": 1, "f2": 2.0, "f3": "ccc"}, IOUtils.dejsonfy({"f1": 1, "f2": 2.0, "f3": "ccc"}))
     return
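The test above only exercises primitive values, for which dejsonfy is the identity. Elsewhere in these examples the same helpers are combined with dump/load and a target class (e.g. dejsonfy(..., Project) in Example #3). A minimal round-trip sketch, assuming IOUtils is seutil's IOUtils as used throughout and with a made-up /tmp path:

from pathlib import Path
from seutil import IOUtils

obj = {"f1": 1, "f2": 2.0, "f3": "ccc"}
path = Path("/tmp/dejsonfy-demo.json")
IOUtils.dump(path, IOUtils.jsonfy(obj), IOUtils.Format.json)        # serialize, then write
loaded = IOUtils.dejsonfy(IOUtils.load(path, IOUtils.Format.json))  # read, then deserialize
assert loaded == obj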
Example #3
    def collect_data(cls, **options) -> NoReturn:
        data_mgr = FilesManager(cls.dataset_dir)

        task = options["task"]

        projects_path = Path(options.get("corpus", cls.dataset_dir / "projects-standalone-8.10.yml"))
        projects: List[Project] = IOUtils.dejsonfy(IOUtils.load(projects_path, "json"), Project)

        if task == cls.TASK_COQ_DOCUMENTS:
            files = Utils.get_option_as_list(options, "files", None)
            is_verifying_tokenizer = Utils.get_option_as_boolean(options, "verify-tokenizer")
            cls.collect_coq_documents_projects(data_mgr, projects, files, is_verifying_tokenizer)
        elif task == cls.TASK_DATA_INDEXES:
            cls.collect_data_indexes(data_mgr, projects)
        elif task == cls.TASK_DEFINITIONS:
            cls.collect_definitions(data_mgr)
        elif task == cls.TASK_INSTALL_COQ_PROJECTS:
            cls.install_coq_projects(projects)
        elif task == cls.TASK_LEMMA:
            files = Utils.get_option_as_list(options, "files", None)
            cls.collect_lemmas(data_mgr, projects, files)
        elif task == cls.TASK_LEMMA_BACKEND_SEXP_TRANSFORMATIONS:
            cls.collect_lemmas_backend_sexp_transformations(data_mgr)
        elif task == cls.TASK_LEMMA_FILTERED:
            cls.filter_lemmas(data_mgr)
        elif task == cls.TASK_LEMMA_FOREEND_SEXP_TRANSFORMATIONS:
            cls.collect_lemmas_foreend_sexp_transformations(data_mgr)
        else:
            LoggingUtils.log_and_raise(cls.logger, f"Unknown task {task}", ValueError)
        # end if
        return
Example #4
File: ex_ngram.py    Project: mfkiwl/hdlp
def write_seq_len_stat(num_pa, ref_modelname):
    stat_list = list()
    src_l = list()
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            f"data/vhdl/{ref_modelname}")
    for mode in ["train", "val", "test"]:
        src_l_file = os.path.join(data_dir, f"src.l.{mode}.txt")
        src_l += [
            l.split() for l in IOUtils.load(
                src_l_file, IOUtils.Format.txt).strip().splitlines()
        ]
    stat_list.append(get_seq_len_stat(src_l))

    for i in range(num_pa):
        src_pa = list()
        result_list = list()
        for mode in ["train", "val", "test"]:
            src_pa_file = os.path.join(data_dir,
                                       f"src.prevassign{i}.{mode}.txt")
            src_pa += [
                l.split() for l in IOUtils.load(
                    src_pa_file, IOUtils.Format.txt).strip().splitlines()
            ]
        for j, pa in enumerate(src_pa):
            if pa != ["<empty>"]:
                src_l[j] = pa + src_l[j]
                result_list.append(src_l[j])
        stat_list.append(get_seq_len_stat(result_list))
    results_file = os.path.join("../slpproject/_results/vhdl/ALL/metrics",
                                f"lhs-pa-len-stat.json")
    IOUtils.dump(results_file, stat_list, IOUtils.Format.json)
    return
Example #5
    def test_format_yaml(self):
        """
        Tests for IOUtils.Format.yaml
        """
        objs = [
            42.001,
            "aaa",
            [13, "24", 56.7],
            {
                "name": "K",
                "job": "Y"
            },
        ]
        exp_strs = [
            "42.001\n...\n",
            "aaa\n...\n",
            "- 13\n- '24'\n- 56.7\n",
            "job: Y\nname: K\n",  # dictionary are forced to be sorted
        ]

        for obj, exp_str in zip(objs, exp_strs):
            path = Path(tempfile.mktemp())

            # Test dump
            IOUtils.dump(path, obj, IOUtils.Format.yaml)
            self.assertEqual(exp_str, self.load_plain(path))

            # Test load
            loaded = IOUtils.load(path, IOUtils.Format.yaml)
            self.assertEqual(obj, loaded)

            self.rm(path)
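The expected strings above assume a YAML emitter that sorts mapping keys and writes the explicit document-end marker after plain scalars. A quick way to reproduce the same output, assuming IOUtils.Format.yaml is backed by a PyYAML-style dumper:

import yaml

print(yaml.safe_dump({"name": "K", "job": "Y"}, default_flow_style=False), end="")
# job: Y
# name: K      <- keys are sorted, matching the expected string above
print(yaml.safe_dump("aaa"), end="")
# aaa
# ...          <- explicit document-end marker for a plain scalar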
Example #6
    def split_project(self,
                      method_file: Path,
                      random_seed: int,
                      debug: bool = False):
        """
        Split projects into train, val, test according to the project names. Will get 2 new files:
        project-list.json, project-split.json.
        """
        proj_list = set()
        with open(method_file, "r") as f:
            objects = ijson.items(f, "item")
            for o in objects:
                proj_list.add(o["prj_name"])
        num_proj = len(proj_list)
        proj_list = list(proj_list)
        if debug:
            output_dir = Path("/tmp/nlpast-data-10")
        else:
            output_dir = Path("/tmp/nlpast-data-880")

        IOUtils.dump(output_dir / "project-list.json", proj_list)

        random.seed(random_seed)
        random.shuffle(proj_list)
        train_index = round(num_proj * 0.8)
        valid_index = train_index + round(num_proj * 0.1)
        train_projs = proj_list[:train_index]
        valid_projs = proj_list[train_index:valid_index]
        test_projs = proj_list[valid_index:]
        project_split = {
            "train": train_projs,
            "val": valid_projs,
            "test": test_projs
        }
        IOUtils.dump(output_dir / "project-split.json", project_split)
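As a quick check of the 80/10/10 boundaries, with a hypothetical project count of 880 (echoing the output directory name):

num_proj = 880
train_index = round(num_proj * 0.8)                 # 704
valid_index = train_index + round(num_proj * 0.1)   # 704 + 88 = 792
# proj_list[:704] -> train, proj_list[704:792] -> val, proj_list[792:] -> test (88 projects)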
Example #7
    def load_configs(self,
                     prj_root: Optional[Path] = None,
                     force_reload: bool = False):
        """
        Load configs (first project-local, then global) to this user interface.
        """
        # If the configs of the current project is already loaded, skip
        if not force_reload and prj_root is not None and prj_root == self.loaded_config_prj:
            return

        # Reset the project-local config indicator
        self.loaded_config_prj = None

        # First, load global config
        global_config_file = RoosterizeDirUtils.get_global_config_file()
        if global_config_file.exists():
            global_config = IOUtils.load(global_config_file,
                                         IOUtils.Format.yaml)
            self.set_configs_from_dict(global_config, self.GLOBAL_CONFIGS)

        # Then, load local config
        if prj_root is not None:
            local_config_file = RoosterizeDirUtils.get_local_config_file(
                prj_root)
            if local_config_file.exists():
                local_config = IOUtils.load(local_config_file,
                                            IOUtils.Format.yaml)
                self.set_configs_from_dict(local_config, self.LOCAL_CONFIGS)

            self.loaded_config_prj = prj_root
Example #8
File: ex_baseline.py    Project: mfkiwl/hdlp
def main_val(modelname, ref_modelname):
    bleus, accs, exact_accs = [], [], []
    target_list = get_targets(ref_modelname, "val")
    preds_list = get_baseline_preds(ref_modelname, "val")
    for pred, target in zip(preds_list, target_list):
        pred_split = [t for t in pred.split(" ") if t!='']
        target_split = [t for t in target.split(" ") if t!='']
        bleu = get_bleu(target=target_split, pred=pred_split)
        acc = get_accuracy(target=target_split, pred=pred_split)
        exact_acc = get_exact_match_accuracy(target=target_split, pred=pred_split)
        bleus.append(bleu)
        accs.append(acc)
        exact_accs.append(exact_acc)

    avg_bleu = np.mean(bleus)
    avg_acc = np.mean(accs)
    avg_exact_acc = np.mean(exact_accs)
    print(f"Average BLEU: {avg_bleu:.3f}, average accuracy: {avg_acc:.3f}, average exact match accuracy: {avg_exact_acc:.3f}")
    results_file = os.path.join(SAVE_DIR, modelname,"testlog.val.assignments.baseline.log")
    results = {
        "bleu-AVG": avg_bleu,
        "acc-AVG": avg_acc,
        "exact-acc-AVG": avg_exact_acc,
        "bleu": bleus,
        "acc": accs,
        "exact-acc": exact_accs,
    }
    IOUtils.dump(results_file, results, IOUtils.Format.jsonNoSort)

    IOUtils.dump(os.path.join(SAVE_DIR, modelname, "pred.val.assignments.baseline.log"), "".join([pred.strip()+"\n" for pred in preds_list]), IOUtils.Format.txt)
    return
Example #9
File: ex_ngram.py    Project: mfkiwl/hdlp
def load_data(num_pa, ref_modelname):
    src_dict = dict()
    stat_list = list()
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            f"data/vhdl/{ref_modelname}")
    for mode in ["train", "val", "test"]:
        src_l_file = os.path.join(data_dir, f"src.l.{mode}.txt")
        src_l = [
            l.split() for l in IOUtils.load(
                src_l_file, IOUtils.Format.txt).strip().splitlines()
        ]
        src_r_file = os.path.join(data_dir, f"tgt.{mode}.txt")
        src_r = [
            l.split() for l in IOUtils.load(
                src_r_file, IOUtils.Format.txt).strip().splitlines()
        ]
        src_seq = [l + ["<="] + r for l, r in zip(src_l, src_r)]
        for i in range(num_pa):
            src_pa_file = os.path.join(data_dir,
                                       f"src.prevassign{i}.{mode}.txt")
            src_pa = [
                l.split() for l in IOUtils.load(
                    src_pa_file, IOUtils.Format.txt).strip().splitlines()
            ]
            for j, pa in enumerate(src_pa):
                src_seq[j] = pa + src_seq[j]
        src_dict[f"{mode}"] = src_seq
    return src_dict
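Each assembled sequence is the previous assignments, then the left-hand side, a literal "<=", and the target tokens. A small illustration with made-up tokens:

l  = ["q"]                    # one line of src.l.*.txt, tokenized
r  = ["a", "and", "b", ";"]   # the corresponding line of tgt.*.txt
pa = ["q", "<=", "'0'", ";"]  # the corresponding line of src.prevassign0.*.txt
src_seq = l + ["<="] + r      # ['q', '<=', 'a', 'and', 'b', ';']
src_seq = pa + src_seq        # previous assignments are prepended in front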
Example #10
    def download_global_model(self, force_yes: bool = False):
        """
        Downloads a global Roosterize model.
        """
        global_model_dir = RoosterizeDirUtils.get_global_model_dir()
        if global_model_dir.exists():
            ans = self.ask_for_confirmation(
                f"A Roosterize model already exists at {global_model_dir}. "
                f"Do you want to delete it and download again?")
            if force_yes:
                ans = True
            if ans != True:
                return
            IOUtils.rm_dir(global_model_dir)

        self.show_message("Downloading Roosterize model...")

        # Download and unpack
        temp_model_dir = Path(tempfile.mkdtemp(prefix="roosterize"))

        urllib.request.urlretrieve(self.model_url,
                                   str(temp_model_dir / "model.tgz"))
        with IOUtils.cd(temp_model_dir):
            BashUtils.run("tar xzf model.tgz", expected_return_code=0)

            # Move the stuff to global model place
            shutil.move(str(Path.cwd() / "model"), global_model_dir)

        # Delete temp dir
        IOUtils.rm_dir(temp_model_dir)

        self.show_message("Finish downloading Roosterize model.")
Example #11
    def prepare_data(self):
        data_prefix = f"{self.eval_setting}-{self.year}"
        data_dir = self.work_dir / "data"

        self.logger.info(
            f"Preparing the data for {self.eval_setting} {self.year} at {data_dir}"
        )
        IOUtils.rm_dir(data_dir)
        IOUtils.mk_dir(data_dir)

        # Copy train/val/test_common/test_standard data
        BashUtils.run(
            f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.train}/train {data_dir}/train",
            expected_return_code=0)
        BashUtils.run(
            f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.val}/valid {data_dir}/valid",
            expected_return_code=0)
        BashUtils.run(
            f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.test_common}/test {data_dir}/{Macros.test_common}",
            expected_return_code=0)
        BashUtils.run(
            f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/test {data_dir}/{Macros.test_standard}",
            expected_return_code=0)

        # Copy vocab
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/vocab* {data_dir}/",
            expected_return_code=0)

        return
Example #12
    def process_data(self, project_dir):
        try:
            revision_data = IOUtils.load(project_dir / "collector" /
                                         "method-project-revision.json")
            method_data = IOUtils.load(project_dir / "collector" /
                                       "method-data.json")
            output_dir = project_dir / "collector"
            method_project_evo = []
            for year in BetaFilter.YEARS[:-1]:
                curr_time = f"{year}_Jan_1"
                curr_method_ids = \
                    [year_data["method_ids"] for year_data in revision_data if year_data["year"] == curr_time][0]
                next_time = f"{year + 1}_Jan_1"
                next_method_ids = \
                    [year_data["method_ids"] for year_data in revision_data if year_data["year"] == next_time][0]
                new_method_ids = list(
                    set(next_method_ids) - set(curr_method_ids))
                filtered_method_ids = BetaFilter.beta_filter(
                    new_method_ids, curr_method_ids, method_data)
                method_project_evo.append({
                    "prj_name":
                    revision_data[0]["prj_name"],
                    "time":
                    f"{curr_time}-{next_time}",
                    "method_ids":
                    filtered_method_ids
                })

            IOUtils.dump(output_dir / "method-project-beta-filtered.json",
                         IOUtils.jsonfy(method_project_evo),
                         IOUtils.Format.json)
            return
        except:
            self.logger.info(f"Unexpected error: {sys.exc_info()[0]}")
            return
Example #13
    def load_data(self,
            rel_path: Union[str, List[str]],
            fmt: IOUtils.Format,
            is_batched: bool = False,
            clz = None,
    ) -> Any:
        if self.is_json_format(fmt) and clz is None:
            self.logger.warning(f"Load data from {rel_path} with json format, but did not specify clz (at {traceback.format_stack()})")
        # end if

        abs_path = self.data_dir / self.assemble_rel_path(rel_path)
        if not abs_path.exists():
            LoggingUtils.log_and_raise(self.logger, f"Cannot find data at {abs_path}", IOError)
        # end if

        if not is_batched:
            data = IOUtils.load(abs_path, fmt)
            if self.is_json_format(fmt) and clz is not None:
                data = IOUtils.dejsonfy(data, clz)
            # end if
            return data
        else:
            data = list()
            batch_numbers = sorted([int(str(f.stem).split("-")[1]) for f in abs_path.iterdir()])
            for batch_number in tqdm(batch_numbers):
                batch_file = abs_path / f"batch-{batch_number}.{fmt.get_extension()}"
                data_batch = IOUtils.load(batch_file, fmt)
                if self.is_json_format(fmt) and clz is not None:
                    data_batch = IOUtils.dejsonfy(data_batch, clz)
                # end if
                data.extend(data_batch)
            # end for
            return data
Example #14
def suggest_lemmas(**options):
    from roosterize.data.DataMiner import DataMiner
    from roosterize.data.ModelSpec import ModelSpec
    from roosterize.ml.MLModels import MLModels

    project_path = Path(options["project"]).absolute()
    files = Utils.get_option_as_list(options, "files", None)
    exclude_files = Utils.get_option_as_list(options, "exclude-files", None)
    exclude_pattern = options.get("exclude-pattern", None)
    serapi_options = options.get("serapi-options", "")
    output_dir = Path(options["output"]).absolute()
    model_dir = Path(options["model-dir"]).absolute()

    # Extract data
    print(">>>>> Extracting lemmas ...")
    DataMiner.extract_data_project(project_path, files, exclude_files, exclude_pattern, serapi_options, output_dir/"raw-data")

    # Get the ML model
    print(">>>>> Initializing model ...")
    model_spec = IOUtils.dejsonfy(IOUtils.load(model_dir/"spec.json", IOUtils.Format.json), ModelSpec)
    model = MLModels.get_model(model_dir, model_spec, is_eval=True)

    # Process data
    print(">>>>> Processing data ...")
    model.process_data(output_dir/"raw-data", output_dir/"eval-processed-data")

    # Eval
    print(">>>>> Applying model ...")
    model.eval(output_dir/"eval-processed-data", output_dir/"eval-result")

    # Print suggestions
    print(">>>>> Suggestions:")
    print(IOUtils.load(output_dir/"eval-result"/"suggestions.txt", IOUtils.Format.txt))
    return
Example #15
 def data_cut(self, data_size: int):
     """cut down the dataset to data_size, then save the projects list to data_dir"""
     collected_projects_file = Macros.data_dir / "projects-github.txt"
     self.collected_projects_list = list()
     if collected_projects_file.exists():
         self.collected_projects_list += IOUtils.load(
             collected_projects_file, IOUtils.Format.txt).splitlines()
     # end if
     project_name_list = list()
     for project_url in self.collected_projects_list:
         user_repo = self.parse_github_url(project_url)
         project_name_list.append(f"{user_repo[0]}_{user_repo[1]}")
     all_used_projects = [
         str(x).split("/")[-1] for x in Macros.repos_results_dir.iterdir()
         if x.is_dir()
     ]
     # Find the overlapping projects and select the top data_size projects
     overall_project_num = 0
     reduced_project_list = list()
     for p in project_name_list:
         if p in all_used_projects and overall_project_num < data_size:
             # load the revision data
             filtered_methods = IOUtils.load(Macros.repos_results_dir / p /
                                             "collector" /
                                             "method-project-revision.json")
             new_method_ids = [
                 delta_data["method_ids"] for delta_data in filtered_methods
                 if delta_data["year"] == "2020_Jan_1"
             ][0]
             if len(new_method_ids) > 0:
                 reduced_project_list.append(p)
                 overall_project_num += 1
                 all_used_projects.remove(p)
     IOUtils.dump(Macros.data_dir / f"projects-github-{data_size}.json",
                  reduced_project_list, IOUtils.Format.jsonNoSort)
Example #16
    def parse_file(self, file_path: Path, prj_root: Path, serapi_options: str):
        source_code = IOUtils.load(file_path, IOUtils.Format.txt)
        unicode_offsets = ParserUtils.get_unicode_offsets(source_code)

        with IOUtils.cd(prj_root):
            rel_path = file_path.relative_to(prj_root)
            ast_sexp_str = BashUtils.run(
                f"sercomp {serapi_options} --mode=sexp -- {rel_path}",
                expected_return_code=0).stdout
            tok_sexp_str = BashUtils.run(
                f"sertok {serapi_options} -- {rel_path}",
                expected_return_code=0).stdout

            ast_sexp_list: List[SexpNode] = SexpParser.parse_list(ast_sexp_str)
            tok_sexp_list: List[SexpNode] = SexpParser.parse_list(tok_sexp_str)

            doc = CoqParser.parse_document(
                source_code,
                ast_sexp_list,
                tok_sexp_list,
                unicode_offsets=unicode_offsets,
            )
            doc.file_name = str(rel_path)

            # Collect lemmas & definitions
            lemmas: List[Lemma] = DataMiner.collect_lemmas_doc(
                doc, ast_sexp_list, serapi_options)
            definitions: List[Definition] = DataMiner.collect_definitions_doc(
                doc, ast_sexp_list)

        return ProcessedFile(file_path, source_code, doc, ast_sexp_list,
                             tok_sexp_list, unicode_offsets, lemmas,
                             definitions)
Example #17
    def prepare_data(self):
        data_prefix = f"{self.eval_setting}-{self.year}"
        data_dir = self.work_dir / "data"

        self.logger.info(
            f"Preparing the data for {self.eval_setting} {self.year} at {self.work_dir}"
        )
        IOUtils.rm_dir(data_dir)
        IOUtils.mk_dir(data_dir)

        # Copy train/val/test_common/test_standard data
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/code2seq.train.c2s {self.work_dir}/data/",
            expected_return_code=0)
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.val}/code2seq.val.c2s {self.work_dir}/data/",
            expected_return_code=0)
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/code2seq.test.c2s {self.work_dir}/data/code2seq.{Macros.test_common}.c2s",
            expected_return_code=0)
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/code2seq.test.c2s {self.work_dir}/data/code2seq.{Macros.test_standard}.c2s",
            expected_return_code=0)

        # Copy vocab
        BashUtils.run(
            f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/code2seq.dict.c2s {self.work_dir}/data/",
            expected_return_code=0)
        return
Example #18
File: Vocabulary.py    Project: mfkiwl/hdlp
 def dump(self, path: Path):
     d = dict()
     for f in ["word_to_index", "index_to_word", "next_index", "counter"]:
         d[f] = getattr(self, f)
     # end for
     IOUtils.dump(path, d, IOUtils.Format.jsonPretty)
     return
Example #19
 def clean_path(self, rel_path: Union[str, List[str]]):
     abs_path = self.data_dir / self.assemble_rel_path(rel_path)
     if abs_path.exists():
         self.logger.info(f"Removing existing things at {abs_path}")
         IOUtils.rm(abs_path)
     # end if
     return
Example #20
    def __init__(self, database: Database):
        self.database = database
        self.output_dir = Macros.data_dir / "split"
        IOUtils.rm_dir(self.output_dir)
        IOUtils.mk_dir(self.output_dir)

        self.statistics = dict()
        return
Example #21
    def eval_impl(self,
            processed_data_dir: Path,
            model_dir: Path,
            beam_search_size: int,
            k: int
    ) -> List[List[Tuple[str, float]]]:
        from roosterize.ml.onmt.CustomTranslator import CustomTranslator
        from onmt.utils.misc import split_corpus
        from onmt.utils.parse import ArgumentParser
        from translate import _get_parser as translate_get_parser

        src_path = processed_data_dir/"src.txt"
        tgt_path = processed_data_dir/"tgt.txt"

        best_step = IOUtils.load(model_dir/"best-step.json", IOUtils.Format.json)
        self.logger.info(f"Taking best step at {best_step}")

        candidates_logprobs: List[List[Tuple[List[str], float]]] = list()

        with IOUtils.cd(self.open_nmt_path):
            parser = translate_get_parser()
            opt = parser.parse_args(
                f" -model {model_dir}/models/ckpt_step_{best_step}.pt"
                f" -src {src_path}"
                f" -tgt {tgt_path}"
            )
            opt.output = f"{model_dir}/last-pred.txt"
            opt.beam_size = beam_search_size
            opt.gpu = 0 if torch.cuda.is_available() else -1
            opt.n_best = k
            opt.block_ngram_repeat = 1
            opt.ignore_when_blocking = ["_"]

            # translate.main
            ArgumentParser.validate_translate_opts(opt)

            translator = CustomTranslator.build_translator(opt, report_score=False)
            src_shards = split_corpus(opt.src, opt.shard_size)
            tgt_shards = split_corpus(opt.tgt, opt.shard_size) if opt.tgt is not None else repeat(None)
            shard_pairs = zip(src_shards, tgt_shards)

            for i, (src_shard, tgt_shard) in enumerate(shard_pairs):
                self.logger.info("Translating shard %d." % i)
                _, _, candidates_logprobs_shard = translator.translate(
                    src=src_shard,
                    tgt=tgt_shard,
                    src_dir=opt.src_dir,
                    batch_size=opt.batch_size,
                    attn_debug=opt.attn_debug
                )
                candidates_logprobs.extend(candidates_logprobs_shard)
            # end for
        # end with

        # Reformat candidates
        candidates_logprobs: List[List[Tuple[str, float]]] = [[("".join(c), l) for c, l in cl] for cl in candidates_logprobs]

        return candidates_logprobs
Example #22
    def suggest_naming(self, file_path: Path, prj_root: Optional[Path] = None):
        """
        Processes a file to get its lemmas and runs the model to get predictions.
        """
        # Figure out which project we're at, and then load configs
        if prj_root is None:
            prj_root = RoosterizeDirUtils.auto_infer_project_root(file_path)
        self.load_configs(prj_root)

        # Infer SerAPI options
        serapi_options = self.infer_serapi_options(prj_root)

        # If user provided compile_cmd, first compile the project
        if self.compile_cmd is not None:
            with IOUtils.cd(prj_root):
                BashUtils.run(self.compile_cmd, expected_return_code=0)

        # Parse file
        data = self.parse_file(file_path, prj_root, serapi_options)

        # Load model
        self.load_local_model(prj_root)
        model = self.get_model()

        # Use the model to make predictions
        # Temp dirs for processed data and results
        temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))

        # Dump lemmas & definitions
        temp_raw_data_dir = temp_data_dir / "raw"
        temp_raw_data_dir.mkdir()
        IOUtils.dump(
            temp_raw_data_dir / "lemmas.json",
            IOUtils.jsonfy(data.lemmas),
            IOUtils.Format.json,
        )
        IOUtils.dump(
            temp_raw_data_dir / "definitions.json",
            IOUtils.jsonfy(data.definitions),
            IOUtils.Format.json,
        )

        # Model-specific process
        temp_processed_data_dir = temp_data_dir / "processed"
        temp_processed_data_dir.mkdir()
        model.process_data_impl(temp_raw_data_dir, temp_processed_data_dir)

        # Invoke eval
        candidates_logprobs = model.eval_impl(
            temp_processed_data_dir,
            beam_search_size=self.beam_search_size,
            k=self.k,
        )

        # Save predictions
        IOUtils.rm_dir(temp_data_dir)

        # Report predictions
        self.report_predictions(data, candidates_logprobs)
        return
Example #23
File: ex_ms2.py    Project: mfkiwl/hdlp
def convert_json2txt(config_dict, data_types: List[str] = None):
    if data_types is None:
        data_types = ["train", "val", "test"]

    for data_type in data_types:
        data_list = IOUtils.load(
            os.path.join(DATADIR, config_dict["intermediate_data_dir"],
                         f"{data_type}.json"), IOUtils.Format.json)

        for src_type in config_dict["src_types"]:
            output_path = os.path.join(DATADIR, config_dict["save_dir"],
                                       f"src.{src_type}.{data_type}.txt")
            pa_i = int(config_dict["augment"])
            if src_type == "l":
                field = "l"
            elif src_type == "type":
                field = "l-type"
            elif src_type == "prevassign":
                field = f"pa{pa_i}"
            elif src_type == "patype":
                field = f"pa{pa_i}-type"
            else:
                raise ValueError(f"Unknown src_type {src_type}")
            # end if

            with open(output_path, "w") as f:
                for data in data_list:
                    if len(data[field]) == 0:
                        if field.endswith("-type"):
                            f.write("<pad>\n")
                        else:
                            f.write("<empty>\n")
                        # end if
                    else:
                        f.write(data[field] + "\n")
                    # end if
                # end for
            # end with
        # end for

        fn_output_path = os.path.join(DATADIR, config_dict["save_dir"],
                                      f"src.fn.{data_type}.txt")
        IOUtils.dump(fn_output_path,
                     "".join([data["file_sha"] + "\n" for data in data_list]),
                     IOUtils.Format.txt)

        tgt_output_path = os.path.join(DATADIR, config_dict["save_dir"],
                                       f"tgt.{data_type}.txt")
        # [3:-2]: remove prefix "<= " and suffix " ;"
        IOUtils.dump(tgt_output_path,
                     "".join([data["r"][3:-2] + "\n" for data in data_list]),
                     IOUtils.Format.txt)
    # end for
    print("Conversion into txt is done.")
    return
Example #24
    def prepare_configs_and_scripts(self, trials: List[int]):
        exp_dir = self.work_dir

        for trial in trials:
            trial_dir = exp_dir / f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            model_dir = trial_dir / "models"
            IOUtils.mk_dir(model_dir)
            log_dir = trial_dir / "logs"
            IOUtils.mk_dir(log_dir)
            data = str(exp_dir / "data/code2seq")
            val_data = data + ".val.c2s"
            train_log = trial_dir / "training-trace.json"

            train_script_file = trial_dir / f"{Macros.train}.sh"
            # Copy config file
            BashUtils.run(
                f"cp {self.base_config_file} {trial_dir}/config.yaml",
                expected_return_code=0)
            output_file = trial_dir / "output_tmp.txt"
            reference_file = trial_dir / "ref_tmp.txt"
            config_file = trial_dir / "config.yaml"
            train_script = f"#!/bin/bash\n" \
                           f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                           f"conda activate {self.CONDA_ENV}\n" \
                           f"module load cuda/10.0 cudnn/7.6.2\n" \
                           f"cd {self.code_dir}\n" \
                           f"python -u code2seq.py --data {data} --test {val_data} --log {train_log} --config {config_file} " \
                           f"--pred_file {output_file} --ref_file {reference_file} "\
                           f"--save_prefix {model_dir}/model --gpu_id $1 &> {trial_dir}/train-log.txt"
            IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {train_script_file}",
                          expected_return_code=0)

            for test_type in [Macros.test_common, Macros.test_standard]:
                test_data = exp_dir / "data" / f"code2seq.{test_type}.c2s"
                output_file = trial_dir / f"output_{test_type}.txt"
                reference_file = trial_dir / f"ref_{test_type}.txt"
                test_script_file = trial_dir / f"{test_type}.sh"
                test_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"module load cuda/10.0 cudnn/7.6.2\n" \
                              f"cd {self.code_dir}\n" \
                              f"python3 code2seq.py --load {model_dir}/model_best --test {test_data} --config {config_file} " \
                              f"--pred_file {output_file} --ref_file {reference_file} "\
                              f"--gpu_id $1 &> {trial_dir}/{test_type}-log.txt\n" \
                              f"python3 eval_utils.py {reference_file} {output_file} {trial_dir}/results_{test_type}.json\n"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}",
                              expected_return_code=0)

        return
Example #25
def get_eval_stats(pred_file: Path, ref_file: Path, result_dir: Path):
    true_positive, false_positive, false_negative = 0, 0, 0
    with open(pred_file, "r") as pf, open(ref_file, "r") as rf:
        pred_lines = pf.readlines()
        ref_lines = rf.readlines()
    true_positive, false_positive, false_negative = update_per_subtoken_statistics(
        zip(ref_lines, pred_lines), true_positive, false_positive,
        false_negative)
    precision, recall, f1 = calculate_results(true_positive, false_positive,
                                              false_negative)
    test_result = {"f1": f1, "precision": precision, "recall": recall}
    IOUtils.dump(result_dir, test_result, IOUtils.Format.jsonPretty)
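update_per_subtoken_statistics and calculate_results are not shown here; under the usual sub-token precision/recall definitions, calculate_results would amount to the following sketch (not the project's actual code):

def calculate_results(true_positive: int, false_positive: int, false_negative: int):
    precision = true_positive / (true_positive + false_positive) if true_positive + false_positive else 0.0
    recall = true_positive / (true_positive + false_negative) if true_positive + false_negative else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1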
Example #26
    def prepare_code(self):
        IOUtils.rm_dir(self.code_dir)
        with IOUtils.cd(self.code_dir.parent):
            BashUtils.run(f"git clone {self.REPO_URL} {self.code_dir.name}",
                          expected_return_code=0)
        # end with

        with IOUtils.cd(self.code_dir):
            BashUtils.run(f"git checkout {self.REPO_SHA}",
                          expected_return_code=0)
        # end with
        return
Example #27
    def install_coq_project(cls, project: Project, names_projects: Dict[str, Project]) -> None:
        """
        :requires: the project is cloned and checked-out to the desired version.
        """
        if not project.is_cloned:
            project.clone()
            project.checkout(project.data["sha"], is_forced=True)
        # end if

        # Check if the project is already compiled
        confirmation_file = "lpc-installed.txt"
        confirmation_content = project.revision + " " + BashUtils.run("opam list coq -s", expected_return_code=0).stdout.strip()
        if (project.checkout_dir/confirmation_file).is_file() and IOUtils.load(project.checkout_dir/confirmation_file, "txt") == confirmation_content:
            cls.logger.debug(f"Project {project.full_name} already installed")
            return
        # end if

        project.clean()

        # Install dependencies
        for dependency in project.data.get("dependencies", []):
            dependency_project = names_projects.get(dependency)
            if dependency_project is None:  raise Exception(f"Cannot find dependency {dependency}")
            cls.logger.info(f"For Project {project.full_name}, installing dependency {dependency}")
            cls.install_coq_project(dependency_project, names_projects)
        # end for

        if "build_cmd" not in project.data:  raise Exception(f"Project {project.full_name} does not have build_cmd")
        if "install_cmd" not in project.data:  raise Exception(f"Project {project.full_name} does not have install_cmd")

        with IOUtils.cd(project.checkout_dir):
            # Build
            cls.logger.info(f"Project {project.full_name}: Building with {project.data['build_cmd']}")
            r = BashUtils.run(project.data["build_cmd"])
            if r.return_code != 0:
                raise Exception(f"Compilation failed! Return code is {r.return_code}! stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
            else:
                cls.logger.debug(f"Compilation finished. Return code is {r.return_code}. stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
            # end if

            # Install
            cls.logger.info(f"Project {project.full_name}: Installing with {project.data['install_cmd']}")
            r = BashUtils.run(project.data["install_cmd"])
            if r.return_code != 0:
                raise Exception(f"Installation failed! Return code is {r.return_code}! stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
            else:
                cls.logger.debug(f"Installation finished. Return code is {r.return_code}. stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
            # end if

            IOUtils.dump(project.checkout_dir / confirmation_file, confirmation_content, "txt")
        # end with
        return
Example #28
 def parse_projects(cls, project_list_file):
     """
     Parse the project list file provided by DeepCom and return the github url file.
     """
     project_list = IOUtils.load(project_list_file,
                                 IOUtils.Format.txt).splitlines()
     git_urls = list()
     for project in project_list:
         project_name = project.split("_", 1)
         git_urls.append(
             f"https://github.com/{project_name[0]}/{project_name[1]}.git")
     IOUtils.dump(Macros.data_dir / "DeepCom-projects-github.txt",
                  "".join([url + "\n" for url in git_urls]),
                  IOUtils.Format.txt)
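Because of split("_", 1), underscores inside a repository name survive the conversion; with a made-up project name:

project_name = "someuser_some_repo".split("_", 1)   # ['someuser', 'some_repo']
url = f"https://github.com/{project_name[0]}/{project_name[1]}.git"
# -> https://github.com/someuser/some_repo.git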
Example #29
    def dump_data(self,
            rel_path: Union[str, List[str]],
            data: Any,
            fmt: IOUtils.Format,
            is_batched: bool = False,
            per_batch: int = 100,
            exist_ok: bool = False,
    ):
        abs_path = self.data_dir / self.assemble_rel_path(rel_path)
        if abs_path.exists() and not exist_ok:
            LoggingUtils.log_and_raise(self.logger, f"Cannot rewrite existing data at {abs_path}", IOError)
        # end if

        abs_path.parent.mkdir(parents=True, exist_ok=True)
        if not is_batched:
            if self.is_json_format(fmt):
                data = IOUtils.jsonfy(data)
            # end if
            IOUtils.dump(abs_path, data, fmt)
        else:
            # In batched mode, the data need to be slice-able and sizable
            IOUtils.rm(abs_path)
            abs_path.mkdir(parents=True)

            for batch_i in tqdm(range(math.ceil(len(data)/per_batch))):
                data_batch = data[per_batch*batch_i : per_batch*(batch_i+1)]
                if self.is_json_format(fmt):
                    data_batch = IOUtils.jsonfy(data_batch)
                # end if
                IOUtils.dump(abs_path/f"batch-{batch_i}.{fmt.get_extension()}", data_batch, fmt)
            # end for
        # end if
        return
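Together with load_data from Example #13, this gives a batched round trip. A hypothetical use through FilesManager (path and payload are made up; the on-disk layout is batch-0.json, batch-1.json, ... under the given relative path):

fm = FilesManager(Path("/tmp/demo-corpus"))
numbers = list(range(250))
fm.dump_data(["numbers"], numbers, IOUtils.Format.json, is_batched=True, per_batch=100)
# -> /tmp/demo-corpus/numbers/batch-0.json, batch-1.json, batch-2.json
assert fm.load_data(["numbers"], IOUtils.Format.json, is_batched=True) == numbers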
Example #30
    def process_data_impl(
        self,
        data_dir: Path,
        output_processed_data_dir: Path,
    ) -> NoReturn:
        lemmas: List[Lemma] = IOUtils.dejsonfy(
            IOUtils.load(data_dir / "lemmas.json", IOUtils.Format.json),
            List[Lemma])
        definitions: List[Definition] = IOUtils.dejsonfy(
            IOUtils.load(data_dir / "definitions.json", IOUtils.Format.json),
            List[Definition])

        docs_sub_tokenizers = SubTokenizer.get_docs_sub_tokenizers(
            lemmas, definitions)

        # Inputs
        all_inputs: Dict[str, List[List[str]]] = self.get_all_inputs(
            lemmas, docs_sub_tokenizers)
        for input_type, src_sentences in all_inputs.items():
            IOUtils.dump(
                output_processed_data_dir / f"src.{input_type}.txt",
                "".join([" ".join(sent) + "\n" for sent in src_sentences]),
                IOUtils.Format.txt)
        # end for

        # Outputs
        IOUtils.dump(
            output_processed_data_dir / f"tgt.txt", "".join([
                " ".join(self.get_output(lemma, docs_sub_tokenizers)) + "\n"
                for lemma in lemmas
            ]), IOUtils.Format.txt)

        super().process_data_impl(data_dir, output_processed_data_dir)
        return