示例#1
0
def suggest_lemmas(**options):
    """
    Runs the suggestion pipeline on a project and prints the suggestions.

    Steps: extract raw data from the project, load the model described by
    `spec.json` in the model directory, process the raw data, apply the model,
    and print the contents of the produced `suggestions.txt`.

    Options read:
      project         -- path of the project (required)
      files           -- optional list, forwarded to the extraction step
      exclude-files   -- optional list, forwarded to the extraction step
      exclude-pattern -- optional, forwarded to the extraction step
      serapi-options  -- forwarded to the extraction step (default "")
      output          -- directory receiving all intermediate/final outputs (required)
      model-dir       -- directory holding the model and its spec.json (required)
    """
    from roosterize.data.DataMiner import DataMiner
    from roosterize.data.ModelSpec import ModelSpec
    from roosterize.ml.MLModels import MLModels

    prj_path = Path(options["project"]).absolute()
    included = Utils.get_option_as_list(options, "files", None)
    excluded = Utils.get_option_as_list(options, "exclude-files", None)
    excluded_pattern = options.get("exclude-pattern", None)
    serapi_opts = options.get("serapi-options", "")
    out_dir = Path(options["output"]).absolute()
    mdl_dir = Path(options["model-dir"]).absolute()

    # Every stage works in its own sub-directory of the output directory
    raw_data_dir = out_dir / "raw-data"
    processed_data_dir = out_dir / "eval-processed-data"
    result_dir = out_dir / "eval-result"

    # Extract data
    print(">>>>> Extracting lemmas ...")
    DataMiner.extract_data_project(
        prj_path, included, excluded, excluded_pattern, serapi_opts, raw_data_dir)

    # Get the ML model
    print(">>>>> Initializing model ...")
    spec_json = IOUtils.load(mdl_dir / "spec.json", IOUtils.Format.json)
    model = MLModels.get_model(mdl_dir, IOUtils.dejsonfy(spec_json, ModelSpec), is_eval=True)

    # Process data
    print(">>>>> Processing data ...")
    model.process_data(raw_data_dir, processed_data_dir)

    # Eval
    print(">>>>> Applying model ...")
    model.eval(processed_data_dir, result_dir)

    # Print suggestions
    print(">>>>> Suggestions:")
    print(IOUtils.load(result_dir / "suggestions.txt", IOUtils.Format.txt))
示例#2
0
 def data_cut(self, data_size: int):
     """Cut down the dataset to data_size projects, then save the project list to data_dir.

     Loads the collected project URLs from `projects-github.txt` in
     `Macros.data_dir` (if the file exists) into `self.collected_projects_list`,
     converts each URL to a `{user}_{repo}` name, and selects, in that order, the
     first `data_size` projects that have a directory under
     `Macros.repos_results_dir` and a non-empty method id list in their
     "2020_Jan_1" revision snapshot. Each project is selected at most once, and a
     project without that snapshot is skipped. The selection is written to
     `projects-github-{data_size}.json`.

     :param data_size: maximum number of projects to keep.
     """
     collected_projects_file = Macros.data_dir / "projects-github.txt"
     self.collected_projects_list = list()
     if collected_projects_file.exists():
         self.collected_projects_list += IOUtils.load(
             collected_projects_file, IOUtils.Format.txt).splitlines()
     # end if
     project_name_list = list()
     for project_url in self.collected_projects_list:
         user_repo = self.parse_github_url(project_url)
         project_name_list.append(f"{user_repo[0]}_{user_repo[1]}")
     # A set gives O(1) membership tests; `.name` is the last path component
     # regardless of the platform's path separator
     available_projects = {
         x.name for x in Macros.repos_results_dir.iterdir() if x.is_dir()
     }
     # Find the overlapping projects and select the top data_size projects
     reduced_project_list = list()
     for p in project_name_list:
         if len(reduced_project_list) >= data_size:
             # Quota reached, no need to look at the remaining projects
             break
         if p not in available_projects:
             continue
         # load the revision data
         filtered_methods = IOUtils.load(Macros.repos_results_dir / p /
                                         "collector" /
                                         "method-project-revision.json")
         new_method_ids = next(
             (delta_data["method_ids"] for delta_data in filtered_methods
              if delta_data["year"] == "2020_Jan_1"), None)
         if not new_method_ids:
             # No 2020 snapshot, or an empty one: nothing usable in this project
             continue
         reduced_project_list.append(p)
         # Prevents a project listed twice from being selected twice
         available_projects.discard(p)
     IOUtils.dump(Macros.data_dir / f"projects-github-{data_size}.json",
                  reduced_project_list, IOUtils.Format.jsonNoSort)
示例#3
0
文件: ex_ngram.py 项目: mfkiwl/hdlp
def write_seq_len_stat(num_pa, ref_modelname):
    """
    Computes sequence length statistics of the lhs data and of the lhs data
    progressively prefixed with previous assignments, and dumps them as json.

    The first statistic covers the lhs sequences of the train/val/test files
    together. Then, for each previous-assignment index in `range(num_pa)`, every
    sequence whose previous assignment is not the single token "<empty>" gets
    that previous assignment prepended (cumulatively across indexes), and a
    statistic is computed over the sequences that were extended in that round.

    :param num_pa: number of previous-assignment files to use per mode.
    :param ref_modelname: name of the sub-directory of data/vhdl holding the data.
    """
    modes = ["train", "val", "test"]
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            f"data/vhdl/{ref_modelname}")

    def read_token_lines(file_name):
        # One token list per line of the (stripped) file content
        content = IOUtils.load(os.path.join(data_dir, file_name), IOUtils.Format.txt)
        return [line.split() for line in content.strip().splitlines()]

    stats = list()
    lhs_seqs = list()
    for mode in modes:
        lhs_seqs += read_token_lines(f"src.l.{mode}.txt")
    stats.append(get_seq_len_stat(lhs_seqs))

    for pa_idx in range(num_pa):
        pa_seqs = list()
        for mode in modes:
            pa_seqs += read_token_lines(f"src.prevassign{pa_idx}.{mode}.txt")

        extended_seqs = list()
        for pos, pa_tokens in enumerate(pa_seqs):
            if pa_tokens == ["<empty>"]:
                continue
            # The lhs sequence is updated in place, so later rounds build on it
            lhs_seqs[pos] = pa_tokens + lhs_seqs[pos]
            extended_seqs.append(lhs_seqs[pos])
        stats.append(get_seq_len_stat(extended_seqs))

    output_file = os.path.join("../slpproject/_results/vhdl/ALL/metrics",
                               "lhs-pa-len-stat.json")
    IOUtils.dump(output_file, stats, IOUtils.Format.json)
    return
示例#4
0
文件: ex_ngram.py 项目: mfkiwl/hdlp
def load_data(num_pa, ref_modelname):
    """
    Loads the train/val/test token sequences used by the ngram experiments.

    For each mode, every sequence is `lhs tokens + ["<="] + rhs tokens`, read
    from `src.l.{mode}.txt` and `tgt.{mode}.txt`. Then, for each index in
    `range(num_pa)`, the tokens of the matching line of
    `src.prevassign{index}.{mode}.txt` are prepended to the sequence.

    :param num_pa: number of previous-assignment files to prepend per mode.
    :param ref_modelname: name of the sub-directory of data/vhdl holding the data.
    :return: mapping of mode -> list of token sequences.
    """
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            f"data/vhdl/{ref_modelname}")

    def read_token_lines(file_name):
        # One token list per line of the (stripped) file content
        content = IOUtils.load(os.path.join(data_dir, file_name), IOUtils.Format.txt)
        return [line.split() for line in content.strip().splitlines()]

    seqs_by_mode = dict()
    for mode in ["train", "val", "test"]:
        lhs_seqs = read_token_lines(f"src.l.{mode}.txt")
        rhs_seqs = read_token_lines(f"tgt.{mode}.txt")
        seqs = [lhs + ["<="] + rhs for lhs, rhs in zip(lhs_seqs, rhs_seqs)]

        for pa_idx in range(num_pa):
            pa_seqs = read_token_lines(f"src.prevassign{pa_idx}.{mode}.txt")
            for pos, pa_tokens in enumerate(pa_seqs):
                seqs[pos] = pa_tokens + seqs[pos]

        seqs_by_mode[f"{mode}"] = seqs
    return seqs_by_mode
示例#5
0
    def process_data(self, project_dir):
        """
        Builds the beta-filtered, year-over-year method evolution data of one project.

        Reads `method-project-revision.json` and `method-data.json` from
        `project_dir / "collector"`. For every year in `BetaFilter.YEARS` except
        the last, takes the method ids present in the "{year + 1}_Jan_1" snapshot
        but not in the "{year}_Jan_1" snapshot, passes them through
        `BetaFilter.beta_filter`, and records the result. The records are dumped
        to `method-project-beta-filtered.json` in the same directory.

        Errors are logged rather than raised; in that case nothing is written.
        Only `Exception` subclasses are handled this way, so KeyboardInterrupt and
        SystemExit still propagate.

        :param project_dir: directory holding the project's "collector" directory.
        """
        try:
            output_dir = project_dir / "collector"
            revision_data = IOUtils.load(output_dir /
                                         "method-project-revision.json")
            method_data = IOUtils.load(output_dir / "method-data.json")

            def method_ids_at(time):
                # Method ids of the first snapshot taken at `time`;
                # raises IndexError if there is no such snapshot
                return [
                    year_data["method_ids"] for year_data in revision_data
                    if year_data["year"] == time
                ][0]

            method_project_evo = []
            for year in BetaFilter.YEARS[:-1]:
                curr_time = f"{year}_Jan_1"
                next_time = f"{year + 1}_Jan_1"
                curr_method_ids = method_ids_at(curr_time)
                next_method_ids = method_ids_at(next_time)
                # Methods that appeared between the two snapshots
                new_method_ids = list(
                    set(next_method_ids) - set(curr_method_ids))
                filtered_method_ids = BetaFilter.beta_filter(
                    new_method_ids, curr_method_ids, method_data)
                method_project_evo.append({
                    "prj_name": revision_data[0]["prj_name"],
                    "time": f"{curr_time}-{next_time}",
                    "method_ids": filtered_method_ids,
                })

            IOUtils.dump(output_dir / "method-project-beta-filtered.json",
                         IOUtils.jsonfy(method_project_evo),
                         IOUtils.Format.json)
            return
        except Exception as e:
            # Report the exception message as well as its type, so the failure can be diagnosed
            self.logger.info(f"Unexpected error: {type(e)}: {e}")
            return
    def load_configs(self,
                     prj_root: Optional[Path] = None,
                     force_reload: bool = False):
        """
        Loads configs into this user interface: the global config file first, then
        the config file local to the project (so local values are applied last).

        Nothing is done if `prj_root` is the project whose configs are already
        loaded, unless `force_reload` is set. A config file that does not exist is
        ignored. When `prj_root` is given, it is remembered as the loaded project.
        """
        if not force_reload:
            # The configs of this project are already loaded, skip
            if prj_root is not None and prj_root == self.loaded_config_prj:
                return

        # Forget which project-local config was loaded
        self.loaded_config_prj = None

        # Global config
        global_file = RoosterizeDirUtils.get_global_config_file()
        if global_file.exists():
            global_values = IOUtils.load(global_file, IOUtils.Format.yaml)
            self.set_configs_from_dict(global_values, self.GLOBAL_CONFIGS)

        if prj_root is None:
            return

        # Project-local config
        local_file = RoosterizeDirUtils.get_local_config_file(prj_root)
        if local_file.exists():
            local_values = IOUtils.load(local_file, IOUtils.Format.yaml)
            self.set_configs_from_dict(local_values, self.LOCAL_CONFIGS)

        self.loaded_config_prj = prj_root
示例#7
0
    def process_data_impl(
        self,
        data_dir: Path,
        output_processed_data_dir: Path,
    ) -> NoReturn:
        """
        Serializes the model inputs and outputs of all lemmas as text files, then
        delegates to the parent class implementation.

        Reads `lemmas.json` and `definitions.json` from `data_dir`. Writes one
        `src.{input_type}.txt` per input type returned by `get_all_inputs`, and
        `tgt.txt`, into `output_processed_data_dir`; each line is the
        space-joined tokens of one sequence.
        """
        def to_text(token_seqs) -> str:
            # One line per sequence, tokens separated by a single space
            return "".join(" ".join(tokens) + "\n" for tokens in token_seqs)

        lemmas_json = IOUtils.load(data_dir / "lemmas.json", IOUtils.Format.json)
        lemmas: List[Lemma] = IOUtils.dejsonfy(lemmas_json, List[Lemma])
        definitions_json = IOUtils.load(data_dir / "definitions.json", IOUtils.Format.json)
        definitions: List[Definition] = IOUtils.dejsonfy(definitions_json, List[Definition])

        docs_sub_tokenizers = SubTokenizer.get_docs_sub_tokenizers(lemmas, definitions)

        # Inputs
        inputs_by_type: Dict[str, List[List[str]]] = self.get_all_inputs(lemmas, docs_sub_tokenizers)
        for input_type, sentences in inputs_by_type.items():
            IOUtils.dump(
                output_processed_data_dir / f"src.{input_type}.txt",
                to_text(sentences),
                IOUtils.Format.txt)
        # end for

        # Outputs
        IOUtils.dump(
            output_processed_data_dir / "tgt.txt",
            to_text(self.get_output(lemma, docs_sub_tokenizers) for lemma in lemmas),
            IOUtils.Format.txt)

        super().process_data_impl(data_dir, output_processed_data_dir)
        return
示例#8
0
    def load_data(self,
            rel_path: Union[str, List[str]],
            fmt: IOUtils.Format,
            is_batched: bool = False,
            clz = None,
    ) -> Any:
        """
        Loads the data stored at `rel_path` under this manager's data directory.

        :param rel_path: path relative to the data directory, passed to `assemble_rel_path`.
        :param fmt: format to load the file(s) with.
        :param is_batched: if set, `rel_path` is a directory of `batch-{number}.{extension}`
            files, which are loaded in increasing batch number and concatenated into one list.
        :param clz: for json formats, the class to convert the loaded data to via
            `IOUtils.dejsonfy`; a warning is logged when it is not given.
        :return: the loaded (and possibly converted) data.
        """
        is_json = self.is_json_format(fmt)
        if is_json and clz is None:
            self.logger.warning(f"Load data from {rel_path} with json format, but did not specify clz (at {traceback.format_stack()})")
        # end if

        abs_path = self.data_dir / self.assemble_rel_path(rel_path)
        if not abs_path.exists():
            LoggingUtils.log_and_raise(self.logger, f"Cannot find data at {abs_path}", IOError)
        # end if

        def load_one(path):
            # Loads one file, converting it to clz when applicable
            loaded = IOUtils.load(path, fmt)
            if is_json and clz is not None:
                loaded = IOUtils.dejsonfy(loaded, clz)
            # end if
            return loaded

        if not is_batched:
            return load_one(abs_path)
        # end if

        # The batch number is the second "-" separated part of each file's stem
        batch_numbers = sorted(int(str(f.stem).split("-")[1]) for f in abs_path.iterdir())
        merged = list()
        for batch_number in tqdm(batch_numbers):
            merged.extend(load_one(abs_path / f"batch-{batch_number}.{fmt.get_extension()}"))
        # end for
        return merged
示例#9
0
def read_data_preds(data_dir: Path, pred_file: Path, target_file: Path) -> Tuple[List[List[str]], List[List[str]], List[List[str]]]:
    """
    Reads the test inputs, the predictions and the targets as lists of token lists.

    :param data_dir: directory containing `src.l.test.txt` (the input lhs) and the target file.
    :param pred_file: file with the predicted rhs, used as given.
    :param target_file: file with the target rhs, resolved against `data_dir`.
    :return: (inputs, preds, targets), each with one whitespace-split token list per line.
    """
    def read_token_lines(path) -> List[List[str]]:
        return [line.split() for line in IOUtils.load(path, IOUtils.Format.txt).splitlines()]

    inputs = read_token_lines(data_dir / "src.l.test.txt")
    preds = read_token_lines(pred_file)
    targets = read_token_lines(data_dir / target_file)
    return inputs, preds, targets
示例#10
0
def clean_comgen_data(**options):
    """
    Cleans the method data of each project for the comment generation task.

    For every project listed in the project file, each method's code and comment
    summary are passed through `DataFilter.data_filter`; methods for which both
    results are non-empty are "clean". Three files are then written to the
    project's "collector" directory:
      clean-method-idx.json           -- ids of the clean methods
      method-project-CG-filtered.json -- method-project-alpha-filtered.json with
                                         each "method_ids" restricted to clean ids
      method-project-CG-revision.json -- method-project-revision.json restricted
                                         the same way

    Options read:
      config    -- name of the filter config file under Macros.config_dir
      proj_file -- file listing the projects
                   (default: projects-github-CG-100.json under Macros.data_dir)
    """
    from csevo.filter.DataFilter import DataFilter

    def restrict_to(entries, allowed_ids):
        # Keeps, in each entry, only the method ids that are in allowed_ids
        for entry in entries:
            entry["method_ids"] = list(
                set(entry["method_ids"]).intersection(allowed_ids))
        return entries

    config_file = Macros.config_dir / options.get("config")
    df = DataFilter(config_file)
    project_file = options.get("proj_file",
                               Macros.data_dir / "projects-github-CG-100.json")
    for proj in tqdm(IOUtils.load(project_file)):
        collector_dir = Macros.repos_results_dir / proj / "collector"

        # Data filtering and cleaning (the loaded examples are updated in place)
        clean_ids = list()
        for ex in IOUtils.load(collector_dir / "method-data.json"):
            ex["code"], ex["comment_summary"] = df.data_filter(
                ex["code"], ex["comment_summary"])
            if ex["code"] != "" and ex["comment_summary"] != "":
                clean_ids.append(ex["id"])

        # dump the clean method index for comment generation task
        IOUtils.dump(collector_dir / "clean-method-idx.json", clean_ids,
                     IOUtils.Format.jsonNoSort)

        # update alpha-filtered data
        alpha_filtered = IOUtils.load(
            collector_dir / "method-project-alpha-filtered.json")
        IOUtils.dump(collector_dir / "method-project-CG-filtered.json",
                     restrict_to(alpha_filtered, clean_ids),
                     IOUtils.Format.jsonNoSort)

        # update project revision data
        revisions = IOUtils.load(
            collector_dir / "method-project-revision.json")
        IOUtils.dump(collector_dir / "method-project-CG-revision.json",
                     restrict_to(revisions, clean_ids),
                     IOUtils.Format.jsonNoSort)
示例#11
0
 def load_config(self) -> NoReturn:
     """
     Merges the content of the config file into `self.config_dict`.

     :raises ValueError: if `self.config_file` is None.
     """
     if self.config_file is None:
         raise ValueError("Config file not set!")
     # end if
     self.config_dict.update(IOUtils.load(self.config_file, IOUtils.Format.jsonPretty))
     return
示例#12
0
    def get_model_results_all_trials(cls, model: str) -> Dict[str, Dict[str, list]]:
        """
        Gets the model's results, on each exp, of each metric, on test_common set,
        combining all trials.

        Results are read from `results-trials-{model}.json` under the "metrics"
        directory of `Macros.results_dir`. Trials without results (None) are
        skipped, and NaN values (float NaN or the string "NaN") are dropped.

        Args:
            model: name of the model, used to locate the results file.

        Returns:
            mapping of exp -> (mapping of metric -> list of results)
        """
        # Local import, as the file-level imports are not visible from this method
        import math

        def is_nan(value) -> bool:
            # NaN compares unequal to everything, itself included, so `value != NaN`
            # is always True and cannot be used to detect it
            return value == "NaN" or (isinstance(value, float) and math.isnan(value))

        results = IOUtils.load(Macros.results_dir / "metrics" / f"results-trials-{model}.json")
        results_all_trials = dict()

        for exp, exp_results in results.items():
            exp_results_all_trials = dict()
            for test_set, set_results in exp_results.items():
                # Only use test_common set
                if test_set != Macros.test_common:
                    continue

                for metric, trials_results in set_results.items():
                    metric_results_all_trials = list()

                    # Merge the results from all trials
                    for trial_results in trials_results:
                        if trial_results is not None:
                            metric_results_all_trials += [n for n in trial_results if not is_nan(n)]

                    exp_results_all_trials[metric] = metric_results_all_trials
            results_all_trials[exp] = exp_results_all_trials

        return results_all_trials
示例#13
0
    def test_format_yaml(self):
        """
        Tests for IOUtils.Format.yaml

        Each object is dumped to a file, the file content is compared with the
        expected yaml text, and the object loaded back is compared with the original.
        """
        objs = [
            42.001,
            "aaa",
            [13, "24", 56.7],
            {
                "name": "K",
                "job": "Y"
            },
        ]
        exp_strs = [
            "42.001\n...\n",
            "aaa\n...\n",
            "- 13\n- '24'\n- 56.7\n",
            "job: Y\nname: K\n",  # dictionary are forced to be sorted
        ]

        for obj, exp_str in zip(objs, exp_strs):
            # A private temporary directory is used instead of the deprecated
            # tempfile.mktemp(), whose returned name can be taken by another process
            # before the file is created; the directory is also removed when an
            # assertion fails
            with tempfile.TemporaryDirectory() as tmp_dir:
                path = Path(tmp_dir) / "obj"

                # Test dump
                IOUtils.dump(path, obj, IOUtils.Format.yaml)
                self.assertEqual(exp_str, self.load_plain(path))

                # Test load
                loaded = IOUtils.load(path, IOUtils.Format.yaml)
                self.assertEqual(obj, loaded)
示例#14
0
    def make_numbers_timewise_filtered_dataset_metrics(self,
                                                       dataset: str = "large",
                                                       filter: str = "beta"):
        """
        Writes the latex macros file holding the number of methods of each entry
        of the time-wise filtered dataset statistics.

        The statistics are read from
        `time-wise-{filter}-filtered-{dataset}-dataset-stats.json` under the
        "metrics" directory of `Macros.results_dir`; the macros are saved to
        `numbers-time-wise-{filter}-filtered-{dataset}-dataset-metrics.tex` in
        `self.tables_dir`. Only the "num-methods" statistic is emitted.
        """
        tex_name = f"numbers-time-wise-{filter}-filtered-{dataset}-dataset-metrics.tex"
        file = latex.File(self.tables_dir / tex_name)

        stats_name = f"time-wise-{filter}-filtered-{dataset}-dataset-stats.json"
        metrics = IOUtils.load(Macros.results_dir / "metrics" / stats_name,
                               IOUtils.Format.json)

        for time_key, time_stats in metrics.items():
            for stat_name, value in time_stats.items():
                # TODO: change back -- every other statistic used to be emitted as
                #   latex.Macro(f"{dataset}-{filter}-{t}-{k}", "{:.1f}".format(v))
                if stat_name != "num-methods":
                    continue
                file.append_macro(
                    latex.Macro(f"{dataset}-{filter}-{time_key}-{stat_name}", f"{value}"))
        # end for

        file.save()
        return
    def parse_file(self, file_path: Path, prj_root: Path, serapi_options: str):
        """
        Parses one source file and collects its lemmas and definitions.

        The `sercomp` and `sertok` commands are run from `prj_root` on the file's
        path relative to it; both are expected to return code 0. Their outputs are
        parsed as s-expression lists and combined with the source code into a
        document, whose `file_name` is set to the relative path.

        :param file_path: the file to parse; must be located under `prj_root`.
        :param prj_root: the directory the commands are run from.
        :param serapi_options: options inserted into both command lines.
        :return: a ProcessedFile bundling the source code, the document, both
            s-expression lists, the unicode offsets, the lemmas and the definitions.
        """
        source_code = IOUtils.load(file_path, IOUtils.Format.txt)
        unicode_offsets = ParserUtils.get_unicode_offsets(source_code)

        with IOUtils.cd(prj_root):
            rel_path = file_path.relative_to(prj_root)

            # Run both tools first, then parse what they printed
            sercomp_cmd = f"sercomp {serapi_options} --mode=sexp -- {rel_path}"
            sertok_cmd = f"sertok {serapi_options} -- {rel_path}"
            sercomp_out = BashUtils.run(sercomp_cmd, expected_return_code=0).stdout
            sertok_out = BashUtils.run(sertok_cmd, expected_return_code=0).stdout

            ast_sexp_list: List[SexpNode] = SexpParser.parse_list(sercomp_out)
            tok_sexp_list: List[SexpNode] = SexpParser.parse_list(sertok_out)

            doc = CoqParser.parse_document(
                source_code,
                ast_sexp_list,
                tok_sexp_list,
                unicode_offsets=unicode_offsets,
            )
            doc.file_name = str(rel_path)

            # Collect lemmas & definitions
            lemmas: List[Lemma] = DataMiner.collect_lemmas_doc(doc, ast_sexp_list, serapi_options)
            definitions: List[Definition] = DataMiner.collect_definitions_doc(doc, ast_sexp_list)

        return ProcessedFile(
            file_path, source_code, doc,
            ast_sexp_list, tok_sexp_list, unicode_offsets,
            lemmas, definitions,
        )
示例#16
0
    def collect_data(cls, **options) -> NoReturn:
        """
        Runs the data collection task named by the "task" option.

        The list of projects is always loaded first, from the file given by the
        "corpus" option (default: `projects-standalone-8.10.yml` in the dataset
        directory). The "files" option is forwarded to the tasks collecting coq
        documents and lemmas, and "verify-tokenizer" to the coq documents task.
        An unknown task is reported through `LoggingUtils.log_and_raise` with
        ValueError.
        """
        data_mgr = FilesManager(cls.dataset_dir)

        task = options["task"]

        default_corpus = cls.dataset_dir / "projects-standalone-8.10.yml"
        projects_path = Path(options.get("corpus", default_corpus))
        projects_json = IOUtils.load(projects_path, "json")
        projects: List[Project] = IOUtils.dejsonfy(projects_json, Project)

        # Dispatch on the task; each branch handles exactly one task
        if task == cls.TASK_COQ_DOCUMENTS:
            files = Utils.get_option_as_list(options, "files", None)
            is_verifying_tokenizer = Utils.get_option_as_boolean(options, "verify-tokenizer")
            cls.collect_coq_documents_projects(data_mgr, projects, files, is_verifying_tokenizer)
            return
        if task == cls.TASK_DATA_INDEXES:
            cls.collect_data_indexes(data_mgr, projects)
            return
        if task == cls.TASK_DEFINITIONS:
            cls.collect_definitions(data_mgr)
            return
        if task == cls.TASK_INSTALL_COQ_PROJECTS:
            cls.install_coq_projects(projects)
            return
        if task == cls.TASK_LEMMA:
            files = Utils.get_option_as_list(options, "files", None)
            cls.collect_lemmas(data_mgr, projects, files)
            return
        if task == cls.TASK_LEMMA_BACKEND_SEXP_TRANSFORMATIONS:
            cls.collect_lemmas_backend_sexp_transformations(data_mgr)
            return
        if task == cls.TASK_LEMMA_FILTERED:
            cls.filter_lemmas(data_mgr)
            return
        if task == cls.TASK_LEMMA_FOREEND_SEXP_TRANSFORMATIONS:
            cls.collect_lemmas_foreend_sexp_transformations(data_mgr)
            return

        LoggingUtils.log_and_raise(cls.logger, f"Unknown task {task}", ValueError)
        return
    def process_data_impl(self,
            data_dir: Path,
            output_processed_data_dir: Path,
    ) -> NoReturn:
        """
        Serializes the model input and output of all lemmas as text files.

        Reads `lemmas.json` and `definitions.json` from `data_dir`, and writes
        `src.txt` (inputs) and `tgt.txt` (outputs) into
        `output_processed_data_dir`; each line is the space-joined tokens of one lemma.
        """
        def to_text(token_seqs) -> str:
            # One line per sequence, tokens separated by a single space
            return "".join(" ".join(tokens) + "\n" for tokens in token_seqs)

        lemmas_json = IOUtils.load(data_dir/"lemmas.json", IOUtils.Format.json)
        lemmas: List[Lemma] = IOUtils.dejsonfy(lemmas_json, List[Lemma])
        definitions_json = IOUtils.load(data_dir/"definitions.json", IOUtils.Format.json)
        definitions: List[Definition] = IOUtils.dejsonfy(definitions_json, List[Definition])

        docs_sub_tokenizers = SubTokenizer.get_docs_sub_tokenizers(lemmas, definitions)

        # Put data in serialized files
        src_text = to_text(self.get_input(lemma, docs_sub_tokenizers) for lemma in lemmas)
        IOUtils.dump(output_processed_data_dir/"src.txt", src_text, IOUtils.Format.txt)

        tgt_text = to_text(self.get_output(lemma, docs_sub_tokenizers) for lemma in lemmas)
        IOUtils.dump(output_processed_data_dir/"tgt.txt", tgt_text, IOUtils.Format.txt)
        return
    def eval_impl(self,
            processed_data_dir: Path,
            model_dir: Path,
            beam_search_size: int,
            k: int
    ) -> List[List[Tuple[str, float]]]:
        """
        Translates the processed data with the checkpoint of the best step.

        The best step is read from `best-step.json` in `model_dir`. Translation
        runs from `self.open_nmt_path` on `src.txt`/`tgt.txt` of
        `processed_data_dir`, shard by shard, on GPU 0 if CUDA is available.

        :param processed_data_dir: directory holding src.txt and tgt.txt.
        :param model_dir: directory holding best-step.json and the checkpoints;
            the translator's output file is set to last-pred.txt inside it.
        :param beam_search_size: the beam size.
        :param k: the number of candidates to keep per input (n_best).
        :return: for each input, its candidates as (string, log-probability) pairs,
            the string being the candidate's tokens joined without separator.
        """
        from roosterize.ml.onmt.CustomTranslator import CustomTranslator
        from onmt.utils.misc import split_corpus
        from onmt.utils.parse import ArgumentParser
        from translate import _get_parser as translate_get_parser

        src_path = processed_data_dir/"src.txt"
        tgt_path = processed_data_dir/"tgt.txt"

        best_step = IOUtils.load(model_dir/"best-step.json", IOUtils.Format.json)
        self.logger.info(f"Taking best step at {best_step}")

        token_candidates: List[List[Tuple[List[str], float]]] = list()

        with IOUtils.cd(self.open_nmt_path):
            cli_args = (
                f" -model {model_dir}/models/ckpt_step_{best_step}.pt"
                f" -src {src_path}"
                f" -tgt {tgt_path}"
            )
            opt = translate_get_parser().parse_args(cli_args)

            # Settings that are not passed through the command line
            opt.output = f"{model_dir}/last-pred.txt"
            opt.beam_size = beam_search_size
            if torch.cuda.is_available():
                opt.gpu = 0
            else:
                opt.gpu = -1
            # end if
            opt.n_best = k
            opt.block_ngram_repeat = 1
            opt.ignore_when_blocking = ["_"]

            # translate.main
            ArgumentParser.validate_translate_opts(opt)

            translator = CustomTranslator.build_translator(opt, report_score=False)
            src_shards = split_corpus(opt.src, opt.shard_size)
            if opt.tgt is not None:
                tgt_shards = split_corpus(opt.tgt, opt.shard_size)
            else:
                tgt_shards = repeat(None)
            # end if

            for shard_i, (src_shard, tgt_shard) in enumerate(zip(src_shards, tgt_shards)):
                self.logger.info("Translating shard %d." % shard_i)
                _, _, shard_candidates = translator.translate(
                    src=src_shard,
                    tgt=tgt_shard,
                    src_dir=opt.src_dir,
                    batch_size=opt.batch_size,
                    attn_debug=opt.attn_debug
                )
                token_candidates.extend(shard_candidates)
            # end for
        # end with

        # Reformat candidates: merge the tokens of each candidate into one string
        return [
            [("".join(tokens), logprob) for tokens, logprob in input_candidates]
            for input_candidates in token_candidates
        ]
示例#19
0
def get_baseline_preds(ref_modelname, data_mode):
    """
    Gets the baseline predictions, i.e., the rhs of the previous assignments.

    Reads `src.prevassign.{data_mode}.txt` of the model's directory under DATA_DIR.
    A line equal to "<empty>" (after stripping) is kept as is; for any other line
    the prediction is the stripped text between the first "<= " and the next ";".

    :return: one prediction per line of the file.
    """
    def rhs_of(line: str) -> str:
        pred = line.strip()
        if pred == "<empty>":
            return pred
        return pred.split("<= ")[1].split(";")[0].strip()

    filename = f"{DATA_DIR}/{ref_modelname}/src.prevassign.{data_mode}.txt"
    return [rhs_of(line) for line in IOUtils.load(filename, IOUtils.Format.txt).splitlines()]
示例#20
0
文件: ex_ms2.py 项目: mfkiwl/hdlp
def convert_json2txt(config_dict, data_types: List[str] = None):
    """
    Converts the intermediate json data of each data type into text files.

    For each data type, `{data_type}.json` is read from the intermediate data
    directory, and the following files are written to the save directory (both
    under DATADIR, as named by `config_dict`):
      src.{src_type}.{data_type}.txt -- for each of config_dict["src_types"], the
          matching field of every data point; an empty field is written as
          "<pad>" for type fields and as "<empty>" otherwise
      src.fn.{data_type}.txt         -- the "file_sha" of every data point
      tgt.{data_type}.txt            -- the "r" field without its first 3 and last 2 characters

    :param config_dict: the configuration; keys read are "intermediate_data_dir",
        "save_dir", "src_types" and "augment".
    :param data_types: the data types to convert, default ["train", "val", "test"].
    :raises ValueError: if a src type is not one of l, type, prevassign, patype.
    """
    if data_types is None:
        data_types = ["train", "val", "test"]

    def save_path(file_name):
        return os.path.join(DATADIR, config_dict["save_dir"], file_name)

    for data_type in data_types:
        json_path = os.path.join(DATADIR, config_dict["intermediate_data_dir"],
                                 f"{data_type}.json")
        data_list = IOUtils.load(json_path, IOUtils.Format.json)

        for src_type in config_dict["src_types"]:
            output_path = save_path(f"src.{src_type}.{data_type}.txt")
            pa_i = int(config_dict["augment"])
            field_of_src_type = {
                "l": "l",
                "type": "l-type",
                "prevassign": f"pa{pa_i}",
                "patype": f"pa{pa_i}-type",
            }
            if src_type not in field_of_src_type:
                raise ValueError(f"Unknown src_type {src_type}")
            # end if
            field = field_of_src_type[src_type]

            # What to write in place of an empty field
            placeholder = "<pad>\n" if field.endswith("-type") else "<empty>\n"
            with open(output_path, "w") as f:
                f.writelines(
                    placeholder if len(data[field]) == 0 else data[field] + "\n"
                    for data in data_list)
            # end with
        # end for

        IOUtils.dump(save_path(f"src.fn.{data_type}.txt"),
                     "".join(data["file_sha"] + "\n" for data in data_list),
                     IOUtils.Format.txt)

        # [3:-2]: remove prefix "<= " and suffix " ;"
        IOUtils.dump(save_path(f"tgt.{data_type}.txt"),
                     "".join(data["r"][3:-2] + "\n" for data in data_list),
                     IOUtils.Format.txt)
    # end for
    print("Conversion into txt is done.")
    return
示例#21
0
    def collect_all_results(self, model: str, metrics: List[str]):
        """
        Collects the per-trial results of a model into one json file.

        Starts from the existing `results-trials-{model}.json` (if any), then, for
        each eval setting, year, test set and trial, reads
        `results_{test_set}.json` of the trial's work directory and stores the
        requested metrics at the trial's position. A missing results file is
        warned about; it only creates the metrics' lists of None when absent, and
        never overwrites values collected before.

        :param model: name of the model, used to locate the work directory and the output file.
        :param metrics: names of the metrics to collect.
        """
        # Mapping of eval_setting-year -> metric -> test_set -> [trials]
        all_results: Dict[str, Dict[str, Dict[str, List[any]]]]

        # Load existing results, if any
        results_file = Macros.results_dir / "metrics" / f"results-trials-{model}.json"
        if not results_file.exists():
            all_results = {}
        else:
            self.logger.info(f"Loading existing metrics from {results_file}")
            all_results = IOUtils.load(results_file)

        model_work_dir = Macros.data_dir / "models-work" / model
        for eval_setting in self.EVAL_SETTINGS:
            for year in self.YEARS:
                exp = f"{eval_setting}-{year}"
                exp_results = all_results.setdefault(exp, {})
                for test_set in [Macros.test_common, Macros.test_standard]:
                    set_results = exp_results.setdefault(test_set, {})

                    for trial in range(Macros.trials):
                        cur_results_file = model_work_dir / exp / f"trial-{trial}" / f"results_{test_set}.json"
                        if cur_results_file.exists():
                            trial_results = IOUtils.load(cur_results_file)
                            for mname in metrics:
                                value = trial_results[mname]
                                per_trial = set_results.setdefault(mname, [None] * Macros.trials)
                                per_trial[trial] = value
                        else:
                            self.logger.warning(
                                f"Results not found at {cur_results_file}")
                            # Only make sure each metric has its list; existing values are kept
                            for mname in metrics:
                                set_results.setdefault(mname, [None] * Macros.trials)

        # Save extracted/updated results
        IOUtils.dump(results_file, all_results, IOUtils.Format.jsonPretty)
        return
示例#22
0
文件: Vocabulary.py 项目: mfkiwl/hdlp
 def load(cls, path: Path) -> "Vocabulary":
     """
     Restores a vocabulary from the json file at `path`.

     The pad and unk words passed to the constructor are the words stored at the
     PAD and UNK indexes. The `index_to_word` keys are converted back to int
     (json keys can only be strings), and `counter` back to a Counter.
     """
     state = IOUtils.load(path, IOUtils.Format.json)
     raw_index_to_word = state["index_to_word"]
     vocab = Vocabulary(raw_index_to_word[str(VocabularyConsts.PAD_INDEX)],
                        raw_index_to_word[str(VocabularyConsts.UNK_INDEX)])

     vocab.word_to_index = state["word_to_index"]
     vocab.next_index = state["next_index"]
     raw_counter = state["counter"]

     # Fix json key can only be string
     vocab.index_to_word = {
         int(index): word for index, word in raw_index_to_word.items()
     }
     # Fix Counter type
     vocab.counter = collections.Counter(raw_counter)
     return vocab
示例#23
0
    def load_data_list(cls,
                       assignments_path: Path) -> List[Dict[str, List[str]]]:
        """
        Loads the assignments file and flattens it into a list of data points,
        one per assignment, dropping the file/entity structure.

        Fields of each data point:
          file_sha           -- singleton list with the sha taken from the file's names
          l, r               -- the lhs and rhs of the assignment
          l-type             -- singleton list with one type for the entire lhs
          l-type-each-token  -- the type of each lhs token
          l-raw-type         -- the raw types of the lhs
          pa{i}, pa{i}-type  -- for i in 1..Macros.MAX_PA_IN_MODEL, the i-th most
                                recent previous assignment (empty tokens removed;
                                empty list if there is none) and its types
        """
        data_list: List[Dict[str, List[str]]] = list()

        for file_entry in IOUtils.load(assignments_path, IOUtils.Format.json):
            # "fn" currently is "{sha}.asg, {sha}.typ": take the second name, drop its 4-char extension
            file_sha = file_entry["fn"].split()[1][:-4]

            for ent in file_entry["entity"]:
                var_types = ent["type"]
                var_raw_types = ent["raw_type"]

                for assignment in ent["agn"]:
                    lhs = assignment["l"]
                    data = {
                        # a singleton list rather than string, to be consistent with other fields
                        "file_sha": [file_sha],
                        "l": lhs,
                        "l-type": [cls.get_one_type_token(lhs, var_types)],
                        "l-type-each-token": cls.get_type_tokens(lhs, var_types),
                        "l-raw-type": cls.get_raw_type_tokens(lhs, var_raw_types),
                        "r": assignment["r"],
                    }

                    prev_assigns = assignment["prevassign"]
                    # Hack: [[""]] is actually fully empty
                    is_fully_empty = (len(prev_assigns) == 1
                                      and len(prev_assigns[0]) == 1
                                      and len(prev_assigns[0][0]) == 0)
                    if is_fully_empty:
                        prev_assigns = []
                    # end if

                    for pa_i in range(Macros.MAX_PA_IN_MODEL):
                        pa_key = f"pa{pa_i+1}"
                        if pa_i >= len(prev_assigns):
                            data[pa_key] = []
                        else:
                            # Hack: remove empty token ("") in pa; counted from the end of the list
                            data[pa_key] = [t for t in prev_assigns[-(pa_i + 1)] if t != ""]
                        # end if
                        data[f"pa{pa_i + 1}-type"] = cls.get_type_tokens(data[pa_key], var_types)
                    # end for

                    data_list.append(data)
                # end for
            # end for
        # end for
        return data_list
示例#24
0
    def install_coq_project(cls, project: Project, names_projects: Dict[str, Project]) -> None:
        """
        Build and install a Coq project, installing its declared dependencies first.

        A project that is not cloned yet is cloned and force-checked-out to
        project.data["sha"]. A marker file in the checkout directory stores the
        project revision together with the output of `opam list coq -s`; when the
        marker already holds the expected content the project is considered
        installed and nothing else is done.

        :param project: the project to install.
        :param names_projects: maps dependency names to projects; used to resolve
            the entries of project.data["dependencies"].
        :raises Exception: if a dependency is missing from names_projects, if the
            project has no "build_cmd" or "install_cmd", or if the build or install
            command exits with a non-zero return code.
        """
        if not project.is_cloned:
            project.clone()
            project.checkout(project.data["sha"], is_forced=True)

        # The marker content ties an installation to both the project revision
        # and the Coq package currently reported by opam.
        marker_name = "lpc-installed.txt"
        revision = project.revision
        coq_listing = BashUtils.run("opam list coq -s", expected_return_code=0).stdout.strip()
        expected_marker = revision + " " + coq_listing

        marker_path = project.checkout_dir/marker_name
        if marker_path.is_file():
            if IOUtils.load(marker_path, "txt") == expected_marker:
                cls.logger.debug(f"Project {project.full_name} already installed")
                return

        project.clean()

        # Dependencies are installed recursively before the project itself
        for dep_name in project.data.get("dependencies", []):
            dep_project = names_projects.get(dep_name)
            if dep_project is None:
                raise Exception(f"Cannot find dependency {dep_name}")
            cls.logger.info(f"For Project {project.full_name}, installing dependency {dep_name}")
            cls.install_coq_project(dep_project, names_projects)

        # Both commands must be present before either of them is run
        for required_key in ("build_cmd", "install_cmd"):
            if required_key not in project.data:
                raise Exception(f"Project {project.full_name} does not have {required_key}")

        # (key of the command, verb for the start message, noun for the outcome message)
        steps = (
            ("build_cmd", "Building", "Compilation"),
            ("install_cmd", "Installing", "Installation"),
        )

        with IOUtils.cd(project.checkout_dir):
            for cmd_key, verb, noun in steps:
                cls.logger.info(f"Project {project.full_name}: {verb} with {project.data[cmd_key]}")
                result = BashUtils.run(project.data[cmd_key])
                streams = f"stdout:\n{result.stdout}\n; stderr:\n{result.stderr}"
                if result.return_code != 0:
                    raise Exception(f"{noun} failed! Return code is {result.return_code}! {streams}")
                cls.logger.debug(f"{noun} finished. Return code is {result.return_code}. {streams}")

            # Written only after both steps succeeded, so a failed attempt is retried next time
            IOUtils.dump(project.checkout_dir / marker_name, expected_marker, "txt")
示例#25
0
    def make_numbers_dataset_metrics(self):
        """
        Write, for every task, a LaTeX file of macros holding the dataset metrics.

        For each task in Macros.tasks the metrics are read from
        "{task}-dataset.json" and "{task}-raw-dataset.json" under
        Macros.results_dir / "metrics" and emitted as macros named
        "ds-{task}-{key}" and "raw-ds-{task}-{key}" respectively. Values whose
        type is exactly int are formatted with ",d"; everything else with
        ",.2f".
        """
        # (infix of the metrics file name, prefix of the macro name), in the
        # order in which the macros are appended to the file.
        sources = (("dataset", "ds"), ("raw-dataset", "raw-ds"))

        for task in Macros.tasks:
            tex_file = latex.File(self.tables_dir /
                                  f"numbers-{task}-dataset-metrics.tex")

            for json_infix, macro_prefix in sources:
                metrics = IOUtils.load(
                    Macros.results_dir / "metrics" /
                    f"{task}-{json_infix}.json", IOUtils.Format.json)
                for name, value in metrics.items():
                    # Exact type check on purpose: only plain ints take the
                    # integer format.
                    spec = ",d" if type(value) is int else ",.2f"
                    macro = latex.Macro(f"{macro_prefix}-{task}-{name}",
                                        format(value, spec))
                    tex_file.append_macro(macro)

            tex_file.save()
示例#26
0
    def prepare_configs_and_scripts(self, trials: List[int]):
        """
        Generate the config files and runnable bash scripts of every trial.

        For each trial a directory "trial-{trial}" is created under
        self.work_dir and filled with:

        * "config.json" and a "{Macros.train}.sh" training script;
        * for each of Macros.test_common and Macros.test_standard, a
          "config_{test_type}.json" and a "{test_type}.sh" script that runs the
          evaluation and then Bleu.py.

        All scripts are made executable with chmod +x.

        :param trials: the trial numbers to prepare.
        """
        data_dir = self.work_dir / "data"
        base_config = IOUtils.load(self.base_config_file,
                                   IOUtils.Format.jsonPretty)

        def script_header() -> List[str]:
            # Lines shared by every generated script; rebuilt for each script
            # so the cluster lookup happens exactly when a script is written.
            cluster = TACCRunnerConsts.get_cur_cluster()
            return [
                "#!/bin/bash",
                f"source {TACCRunnerConsts.conda_init_path[cluster]}",
                f"conda activate {self.CONDA_ENV}",
                "module load cuda/10.0 cudnn/7.6.2",
                f"cd {self.code_dir}/translate",
            ]

        def write_executable(script_file, lines: List[str]) -> None:
            # Every line is newline-terminated, including the last one.
            IOUtils.dump(script_file, "".join(line + "\n" for line in lines),
                         IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {script_file}", expected_return_code=0)

        for trial in trials:
            trial_dir = self.work_dir / f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            # Shallow copy is enough: only top-level keys are overwritten.
            config = copy.copy(base_config)
            config.update(
                data_dir=str(data_dir),
                model_dir=str(trial_dir / "model"),
                output=str(trial_dir / "output.txt"),
            )

            config_file = trial_dir / "config.json"
            IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

            training_trace_file = trial_dir / "training-trace.json"

            # The gpu-id argument is necessary for tensorflow, even if we are using CUDA_VISIBLE_DEVICES
            train_command = (
                f"python3 __main__.py {config_file} --train -v"
                f" --train-log {training_trace_file} --gpu-id $1"
                f" &> {trial_dir}/log-{Macros.train}.txt")
            write_executable(trial_dir / f"{Macros.train}.sh",
                             script_header() + [train_command])

            for test_type in (Macros.test_common, Macros.test_standard):
                # The same config object is reused; only its output differs.
                config["output"] = str(trial_dir / f"output_{test_type}.txt")
                test_config_file = trial_dir / f"config_{test_type}.json"
                IOUtils.dump(test_config_file, config,
                             IOUtils.Format.jsonPretty)

                test_data = f"{data_dir}/{test_type}"
                eval_command = (
                    f"python3 __main__.py {test_config_file} --eval"
                    f" {test_data}/test.token.code"
                    f" {test_data}/test.token.sbt"
                    f" {test_data}/test.token.nl"
                    f" --gpu-id $1 &> {trial_dir}/log-{test_type}.txt")
                bleu_command = (
                    f"python3 Bleu.py {test_data}/test.token.nl"
                    f" {trial_dir}/output_{test_type}.txt"
                    f" {trial_dir}/results_{test_type}.json")
                write_executable(
                    trial_dir / f"{test_type}.sh",
                    script_header() + [eval_command, bleu_command])
示例#27
0
 def parse_projects(cls, project_list_file):
     """
     Convert the project list provided by DeepCom into a file of GitHub URLs.

     Every line of the list is split at its first "_"; the two parts become
     the two path components of a "https://github.com/.../....git" URL. The
     URLs are written, one per line, to
     Macros.data_dir / "DeepCom-projects-github.txt". Nothing is returned.
     """
     entries = IOUtils.load(project_list_file, IOUtils.Format.txt).splitlines()
     # Only the first underscore separates the two parts; any later ones stay
     # in the second part.
     split_entries = [entry.split("_", 1) for entry in entries]
     git_urls = [
         f"https://github.com/{parts[0]}/{parts[1]}.git"
         for parts in split_entries
     ]
     url_file = Macros.data_dir / "DeepCom-projects-github.txt"
     IOUtils.dump(url_file, "".join(f"{url}\n" for url in git_urls),
                  IOUtils.Format.txt)
示例#28
0
 def process_data_concurrent(self, proj_list: Path):
     """
     Process all projects named in the project list concurrently.

     Each entry loaded from proj_list is joined onto Macros.repos_results_dir
     and passed to self.process_data on a pool of 8 worker threads, while a
     progress bar advances as tasks finish.

     A project whose processing raises does not abort the others; the failure
     is reported as a warning on this module's logger instead of being
     silently dropped.

     :param proj_list: path of the file listing the projects, read with
         IOUtils.load.
     """
     import logging

     logger = logging.getLogger(__name__)
     projects = IOUtils.load(proj_list)
     with ThreadPoolExecutor(8) as executor:
         # Remember which project each future belongs to, so that a failure
         # can be attributed to it.
         project_of = {
             executor.submit(self.process_data,
                             Macros.repos_results_dir / proj): proj
             for proj in projects
         }
         for finished in tqdm(as_completed(project_of), total=len(projects)):
             # A future keeps the exception raised by its task; without this
             # check it would vanish when the future is discarded.
             error = finished.exception()
             if error is not None:
                 logger.warning("Processing project %s failed: %r",
                                project_of[finished], error)
示例#29
0
    def make_numbers_timewise_dataset_metrics(self, dataset: str = "large"):
        """
        Write a LaTeX file of macros holding the time-wise dataset statistics.

        The statistics are read from
        Macros.results_dir / "metrics" / "time-wise-{dataset}-dataset-stats.json",
        a two-level mapping; every inner value becomes a macro named
        "{dataset}-{outer key}-{inner key}".

        :param dataset: name of the dataset, used in the input and output file
            names and in the macro names.
        """
        tex_file = latex.File(
            self.tables_dir /
            f"numbers-time-wise-{dataset}-dataset-metrics.tex")
        stats_path = (Macros.results_dir / "metrics" /
                      f"time-wise-{dataset}-dataset-stats.json")
        all_stats = IOUtils.load(stats_path, IOUtils.Format.json)

        for outer_key, inner_stats in all_stats.items():
            for inner_key, value in inner_stats.items():
                macro_name = f"{dataset}-{outer_key}-{inner_key}"
                tex_file.append_macro(latex.Macro(macro_name, f"{value}"))

        tex_file.save()
        return
 def load_local_model(self, prj_root: Path) -> None:
     """
     Load the project-local model into self.model when one is available.

     Nothing happens if self.model is already set, or if the directory given
     by RoosterizeDirUtils.get_local_model_dir(prj_root) does not exist.

     :param prj_root: project root handed to
         RoosterizeDirUtils.get_local_model_dir.
     """
     if self.model is None:
         model_dir = RoosterizeDirUtils.get_local_model_dir(prj_root)
         if model_dir.is_dir():
             # The model is described by the spec.json stored next to it.
             raw_spec = IOUtils.load(model_dir / "spec.json",
                                     IOUtils.Format.json)
             spec = IOUtils.dejsonfy(raw_spec, ModelSpec)
             self.model = MLModels.get_model(model_dir, spec, is_eval=True)