Example #1
    def extract_data_from_corpus(cls,
            corpus_path: Path,
            trainevals: List[str],
            groups: List[str],
            output_path: Path,
    ):
        # 1. Prepare output path
        if output_path.is_dir():
            cls.logger.warning(f"{output_path} already exists, will overwrite the files.")
        elif output_path.is_file():
            LoggingUtils.log_and_raise(cls.logger, f"{output_path} already exists as a file. Aborting.", Exception)
        else:
            IOUtils.mk_dir(output_path)
        # end if

        assert all([traineval in Macros.DS_TRAINEVALS for traineval in trainevals])
        assert all([group in Macros.DS_GROUPS+[Macros.DS_GROUP_TA] for group in groups])

        data_mgr = FilesManager(corpus_path)

        # 2. Load lemmas and definitions
        lemmas_filtered: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS_FILTERED], IOUtils.Format.json, is_batched=True, clz=Lemma)
        definitions: List[Definition] = data_mgr.load_data([FilesManager.DEFINITIONS, "definitions.json"], IOUtils.Format.json, clz=Definition)

        # 3. Output to output_path for each combination of traineval and group
        for traineval in trainevals:
            for group in groups:
                IOUtils.mk_dir(output_path/f"{group}-{traineval}")
                data_indexes = IOUtils.load(project_dir/"training"/f"{group}-{traineval}.json", IOUtils.Format.json)
                IOUtils.dump(output_path/f"{group}-{traineval}/lemmas.json", IOUtils.jsonfy([l for l in lemmas_filtered if l.data_index in data_indexes]), IOUtils.Format.json)
                IOUtils.dump(output_path/f"{group}-{traineval}/definitions.json", IOUtils.jsonfy([d for d in definitions if d.data_index in data_indexes]), IOUtils.Format.json)
            # end for
        # end for
        return
Example #2
File: ex_ngram.py Project: mfkiwl/hdlp
def write_seq_len_stat(num_pa, ref_modelname):
    stat_list = list()
    src_l = list()
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            f"data/vhdl/{ref_modelname}")
    for mode in ["train", "val", "test"]:
        src_l_file = os.path.join(data_dir, f"src.l.{mode}.txt")
        src_l += [
            l.split() for l in IOUtils.load(
                src_l_file, IOUtils.Format.txt).strip().splitlines()
        ]
    stat_list.append(get_seq_len_stat(src_l))

    for i in range(num_pa):
        src_pa = list()
        result_list = list()
        for mode in ["train", "val", "test"]:
            src_pa_file = os.path.join(data_dir,
                                       f"src.prevassign{i}.{mode}.txt")
            src_pa += [
                l.split() for l in IOUtils.load(
                    src_pa_file, IOUtils.Format.txt).strip().splitlines()
            ]
        for j, pa in enumerate(src_pa):
            if pa != ["<empty>"]:
                src_l[j] = pa + src_l[j]
                result_list.append(src_l[j])
        stat_list.append(get_seq_len_stat(result_list))
    results_file = os.path.join("../slpproject/_results/vhdl/ALL/metrics",
                                "lhs-pa-len-stat.json")
    IOUtils.dump(results_file, stat_list, IOUtils.Format.json)
    return
Example #3
    def test_format_yaml(self):
        """
        Tests for IOUtils.Format.yaml
        """
        objs = [
            42.001,
            "aaa",
            [13, "24", 56.7],
            {
                "name": "K",
                "job": "Y"
            },
        ]
        exp_strs = [
            "42.001\n...\n",
            "aaa\n...\n",
            "- 13\n- '24'\n- 56.7\n",
            "job: Y\nname: K\n",  # dictionary are forced to be sorted
        ]

        for obj, exp_str in zip(objs, exp_strs):
            path = Path(tempfile.mktemp())

            # Test dump
            IOUtils.dump(path, obj, IOUtils.Format.yaml)
            self.assertEqual(exp_str, self.load_plain(path))

            # Test load
            loaded = IOUtils.load(path, IOUtils.Format.yaml)
            self.assertEqual(obj, loaded)

            self.rm(path)
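A note on the sorted-keys expectation above: PyYAML's dumper sorts mapping keys by default, which is why {"name": "K", "job": "Y"} serializes with job before name. A minimal standalone sketch of that behavior (assuming PyYAML; this is independent of IOUtils):

import yaml

# sort_keys defaults to True, so "job" is emitted before "name"
assert yaml.safe_dump({"name": "K", "job": "Y"}) == "job: Y\nname: K\n"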
Example #4
    def data_cut(self, data_size: int):
        """cut down the dataset to data_size, then save the projects list to data_dir"""
        collected_projects_file = Macros.data_dir / "projects-github.txt"
        self.collected_projects_list = list()
        if collected_projects_file.exists():
            self.collected_projects_list += IOUtils.load(
                collected_projects_file, IOUtils.Format.txt).splitlines()
        # end if
        project_name_list = list()
        for project_url in self.collected_projects_list:
            user_repo = self.parse_github_url(project_url)
            project_name_list.append(f"{user_repo[0]}_{user_repo[1]}")
        all_used_projects = [
            str(x).split("/")[-1] for x in Macros.repos_results_dir.iterdir()
            if x.is_dir()
        ]
        # Find the overlapping projects and select the top data_size projects
        overall_project_num = 0
        reduced_project_list = list()
        for p in project_name_list:
            if p in all_used_projects and overall_project_num < data_size:
                # load the revision data
                filtered_methods = IOUtils.load(Macros.repos_results_dir / p /
                                                "collector" /
                                                "method-project-revision.json")
                new_method_ids = [
                    delta_data["method_ids"] for delta_data in filtered_methods
                    if delta_data["year"] == "2020_Jan_1"
                ][0]
                if len(new_method_ids) > 0:
                    reduced_project_list.append(p)
                    overall_project_num += 1
                    all_used_projects.remove(p)
        IOUtils.dump(Macros.data_dir / f"projects-github-{data_size}.json",
                     reduced_project_list, IOUtils.Format.jsonNoSort)
Example #5
    def suggest_naming(self, file_path: Path, prj_root: Optional[Path] = None):
        """
        Processes a file to get its lemmas and runs the model to get predictions.
        """
        # Figure out which project we're at, and then load configs
        if prj_root is None:
            prj_root = RoosterizeDirUtils.auto_infer_project_root(file_path)
        self.load_configs(prj_root)

        # Infer SerAPI options
        serapi_options = self.infer_serapi_options(prj_root)

        # If user provided compile_cmd, first compile the project
        if self.compile_cmd is not None:
            with IOUtils.cd(prj_root):
                BashUtils.run(self.compile_cmd, expected_return_code=0)

        # Parse file
        data = self.parse_file(file_path, prj_root, serapi_options)

        # Load model
        self.load_local_model(prj_root)
        model = self.get_model()

        # Use the model to make predictions
        # Temp dirs for processed data and results
        temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))

        # Dump lemmas & definitions
        temp_raw_data_dir = temp_data_dir / "raw"
        temp_raw_data_dir.mkdir()
        IOUtils.dump(
            temp_raw_data_dir / "lemmas.json",
            IOUtils.jsonfy(data.lemmas),
            IOUtils.Format.json,
        )
        IOUtils.dump(
            temp_raw_data_dir / "definitions.json",
            IOUtils.jsonfy(data.definitions),
            IOUtils.Format.json,
        )

        # Model-specific process
        temp_processed_data_dir = temp_data_dir / "processed"
        temp_processed_data_dir.mkdir()
        model.process_data_impl(temp_raw_data_dir, temp_processed_data_dir)

        # Invoke eval
        candidates_logprobs = model.eval_impl(
            temp_processed_data_dir,
            beam_search_size=self.beam_search_size,
            k=self.k,
        )

        # Clean up temp dirs
        IOUtils.rm_dir(temp_data_dir)

        # Report predictions
        self.report_predictions(data, candidates_logprobs)
        return
Example #6
    def split_project(self,
                      method_file: Path,
                      random_seed: int,
                      debug: bool = False):
        """
        Split projects into train, val, test according to the project names. Produces two new files:
        project-list.json and project-split.json.
        """
        proj_list = set()
        with open(method_file, "r") as f:
            objects = ijson.items(f, "item")
            for o in objects:
                proj_list.add(o["prj_name"])
        num_proj = len(proj_list)
        proj_list = list(proj_list)
        if debug:
            output_dir = Path("/tmp/nlpast-data-10")
        else:
            output_dir = Path("/tmp/nlpast-data-880")

        IOUtils.dump(output_dir / "project-list.json", proj_list)

        random.seed(random_seed)
        random.shuffle(proj_list)
        train_index = round(num_proj * 0.8)
        valid_index = train_index + round(num_proj * 0.1)
        train_projs = proj_list[:train_index]
        valid_projs = proj_list[train_index:valid_index]
        test_projs = proj_list[valid_index:]
        project_split = {
            "train": train_projs,
            "val": valid_projs,
            "test": test_projs
        }
        IOUtils.dump(output_dir / "project-split.json", project_split)
Example #7
    def process_data_impl(
        self,
        data_dir: Path,
        output_processed_data_dir: Path,
    ) -> NoReturn:
        lemmas: List[Lemma] = IOUtils.dejsonfy(
            IOUtils.load(data_dir / "lemmas.json", IOUtils.Format.json),
            List[Lemma])
        definitions: List[Definition] = IOUtils.dejsonfy(
            IOUtils.load(data_dir / "definitions.json", IOUtils.Format.json),
            List[Definition])

        docs_sub_tokenizers = SubTokenizer.get_docs_sub_tokenizers(
            lemmas, definitions)

        # Inputs
        all_inputs: Dict[str, List[List[str]]] = self.get_all_inputs(
            lemmas, docs_sub_tokenizers)
        for input_type, src_sentences in all_inputs.items():
            IOUtils.dump(
                output_processed_data_dir / f"src.{input_type}.txt",
                "".join([" ".join(sent) + "\n" for sent in src_sentences]),
                IOUtils.Format.txt)
        # end for

        # Outputs
        IOUtils.dump(
            output_processed_data_dir / f"tgt.txt", "".join([
                " ".join(self.get_output(lemma, docs_sub_tokenizers)) + "\n"
                for lemma in lemmas
            ]), IOUtils.Format.txt)

        super().process_data_impl(data_dir, output_processed_data_dir)
        return
Example #8
    def process_data(self, project_dir):
        try:
            revision_data = IOUtils.load(project_dir / "collector" /
                                         "method-project-revision.json")
            method_data = IOUtils.load(project_dir / "collector" /
                                       "method-data.json")
            output_dir = project_dir / "collector"
            method_project_evo = []
            for year in BetaFilter.YEARS[:-1]:
                curr_time = f"{year}_Jan_1"
                curr_method_ids = \
                    [year_data["method_ids"] for year_data in revision_data if year_data["year"] == curr_time][0]
                next_time = f"{year + 1}_Jan_1"
                next_method_ids = \
                    [year_data["method_ids"] for year_data in revision_data if year_data["year"] == next_time][0]
                new_method_ids = list(
                    set(next_method_ids) - set(curr_method_ids))
                filtered_method_ids = BetaFilter.beta_filter(
                    new_method_ids, curr_method_ids, method_data)
                method_project_evo.append({
                    "prj_name":
                    revision_data[0]["prj_name"],
                    "time":
                    f"{curr_time}-{next_time}",
                    "method_ids":
                    filtered_method_ids
                })

            IOUtils.dump(output_dir / "method-project-beta-filtered.json",
                         IOUtils.jsonfy(method_project_evo),
                         IOUtils.Format.json)
            return
        except Exception:
            self.logger.info(f"Unexpected error: {sys.exc_info()[0]}")
            return
Example #9
def main_val(modelname, ref_modelname):
    bleus, accs, exact_accs = [], [], []
    target_list = get_targets(ref_modelname, "val")
    preds_list = get_baseline_preds(ref_modelname, "val")
    for pred, target in zip(preds_list, target_list):
        pred_split = [t for t in pred.split(" ") if t!='']
        target_split = [t for t in target.split(" ") if t!='']
        bleu = get_bleu(target=target_split, pred=pred_split)
        acc = get_accuracy(target=target_split, pred=pred_split)
        exact_acc = get_exact_match_accuracy(target=target_split, pred=pred_split)
        bleus.append(bleu)
        accs.append(acc)
        exact_accs.append(exact_acc)

    avg_bleu = np.mean(bleus)
    avg_acc = np.mean(accs)
    avg_exact_acc = np.mean(exact_accs)
    print(f"Average BLEU: {avg_bleu:.3f}, average accuracy: {avg_acc:.3f}, average exact match accuracy: {avg_exact_acc:.3f}")
    results_file = os.path.join(SAVE_DIR, modelname, "testlog.val.assignments.baseline.log")
    results = {
        "bleu-AVG": avg_bleu,
        "acc-AVG": avg_acc,
        "exact-acc-AVG": avg_exact_acc,
        "bleu": bleus,
        "acc": accs,
        "exact-acc": exact_accs,
    }
    IOUtils.dump(results_file, results, IOUtils.Format.jsonNoSort)

    IOUtils.dump(os.path.join(SAVE_DIR, modelname, "pred.val.assignments.baseline.log"), "".join([pred.strip()+"\n" for pred in preds_list]), IOUtils.Format.txt)
    return
Example #10
File: Vocabulary.py Project: mfkiwl/hdlp
    def dump(self, path: Path):
        d = dict()
        for f in ["word_to_index", "index_to_word", "next_index", "counter"]:
            d[f] = getattr(self, f)
        # end for
        IOUtils.dump(path, d, IOUtils.Format.jsonPretty)
        return
Example #11
    def dump_data(self,
            rel_path: Union[str, List[str]],
            data: Any,
            fmt: IOUtils.Format,
            is_batched: bool = False,
            per_batch: int = 100,
            exist_ok: bool = False,
    ):
        abs_path = self.data_dir / self.assemble_rel_path(rel_path)
        if abs_path.exists() and not exist_ok:
            LoggingUtils.log_and_raise(self.logger, f"Cannot rewrite existing data at {abs_path}", IOError)
        # end if

        abs_path.parent.mkdir(parents=True, exist_ok=True)
        if not is_batched:
            if self.is_json_format(fmt):
                data = IOUtils.jsonfy(data)
            # end if
            IOUtils.dump(abs_path, data, fmt)
        else:
            # In batched mode, the data must be sliceable and support len()
            IOUtils.rm(abs_path)
            abs_path.mkdir(parents=True)

            for batch_i in tqdm(range(math.ceil(len(data)/per_batch))):
                data_batch = data[per_batch*batch_i : per_batch*(batch_i+1)]
                if self.is_json_format(fmt):
                    data_batch = IOUtils.jsonfy(data_batch)
                # end if
                IOUtils.dump(abs_path/f"batch-{batch_i}.{fmt.get_extension()}", data_batch, fmt)
            # end for
        # end if
        return
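The batched branch above slices the data into fixed-size chunks before dumping one file per chunk. A self-contained sketch of just that slicing logic (illustrative data, mirroring the loop above):

import math

data = list(range(250))
per_batch = 100
# Same arithmetic as the dump loop: ceil(len/per_batch) batches of at most per_batch items
batches = [data[per_batch * i: per_batch * (i + 1)]
           for i in range(math.ceil(len(data) / per_batch))]
assert [len(b) for b in batches] == [100, 100, 50]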
Example #12
File: ex_ms2.py Project: mfkiwl/hdlp
def convert_json2txt(config_dict, data_types: List[str] = None):
    if data_types is None:
        data_types = ["train", "val", "test"]

    for data_type in data_types:
        data_list = IOUtils.load(
            os.path.join(DATADIR, config_dict["intermediate_data_dir"],
                         f"{data_type}.json"), IOUtils.Format.json)

        for src_type in config_dict["src_types"]:
            output_path = os.path.join(DATADIR, config_dict["save_dir"],
                                       f"src.{src_type}.{data_type}.txt")
            pa_i = int(config_dict["augment"])
            if src_type == "l":
                field = "l"
            elif src_type == "type":
                field = "l-type"
            elif src_type == "prevassign":
                field = f"pa{pa_i}"
            elif src_type == "patype":
                field = f"pa{pa_i}-type"
            else:
                raise ValueError(f"Unknown src_type {src_type}")
            # end if

            with open(output_path, "w") as f:
                for data in data_list:
                    if len(data[field]) == 0:
                        if field.endswith("-type"):
                            f.write("<pad>\n")
                        else:
                            f.write("<empty>\n")
                        # end if
                    else:
                        f.write(data[field] + "\n")
                    # end if
                # end for
            # end with
        # end for

        fn_output_path = os.path.join(DATADIR, config_dict["save_dir"],
                                      f"src.fn.{data_type}.txt")
        IOUtils.dump(fn_output_path,
                     "".join([data["file_sha"] + "\n" for data in data_list]),
                     IOUtils.Format.txt)

        tgt_output_path = os.path.join(DATADIR, config_dict["save_dir"],
                                       f"tgt.{data_type}.txt")
        # [3:-2]: remove prefix "<= " and suffix " ;"
        IOUtils.dump(tgt_output_path,
                     "".join([data["r"][3:-2] + "\n" for data in data_list]),
                     IOUtils.Format.txt)
    # end for
    print("Conversion into txt is done.")
    return
Example #13
    def prepare_configs_and_scripts(self, trials: List[int]):
        exp_dir = self.work_dir

        for trial in trials:
            trial_dir = exp_dir / f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            model_dir = trial_dir / "models"
            IOUtils.mk_dir(model_dir)
            log_dir = trial_dir / "logs"
            IOUtils.mk_dir(log_dir)
            data = str(exp_dir / "data/code2seq")
            val_data = data + ".val.c2s"
            train_log = trial_dir / "training-trace.json"

            train_script_file = trial_dir / f"{Macros.train}.sh"
            # Copy config file
            BashUtils.run(
                f"cp {self.base_config_file} {trial_dir}/config.yaml",
                expected_return_code=0)
            output_file = trial_dir / "output_tmp.txt"
            reference_file = trial_dir / "ref_tmp.txt"
            config_file = trial_dir / "config.yaml"
            train_script = f"#!/bin/bash\n" \
                           f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                           f"conda activate {self.CONDA_ENV}\n" \
                           f"module load cuda/10.0 cudnn/7.6.2\n" \
                           f"cd {self.code_dir}\n" \
                           f"python -u code2seq.py --data {data} --test {val_data} --log {train_log} --config {config_file} " \
                           f"--pred_file {output_file} --ref_file {reference_file} "\
                           f"--save_prefix {model_dir}/model --gpu_id $1 &> {trial_dir}/train-log.txt"
            IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {train_script_file}",
                          expected_return_code=0)

            for test_type in [Macros.test_common, Macros.test_standard]:
                test_data = exp_dir / "data" / f"code2seq.{test_type}.c2s"
                output_file = trial_dir / f"output_{test_type}.txt"
                reference_file = trial_dir / f"ref_{test_type}.txt"
                test_script_file = trial_dir / f"{test_type}.sh"
                test_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"module load cuda/10.0 cudnn/7.6.2\n" \
                              f"cd {self.code_dir}\n" \
                              f"python3 code2seq.py --load {model_dir}/model_best --test {test_data} --config {config_file} " \
                              f"--pred_file {output_file} --ref_file {reference_file} "\
                              f"--gpu_id $1 &> {trial_dir}/{test_type}-log.txt\n" \
                              f"python3 eval_utils.py {reference_file} {output_file} {trial_dir}/results_{test_type}.json\n"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}",
                              expected_return_code=0)

        return
Example #14
def get_eval_stats(pred_file: Path, ref_file: Path, result_dir: Path):
    true_positive, false_positive, false_negative = 0, 0, 0
    with open(pred_file, "r") as pf, open(ref_file, "r") as rf:
        pred_lines = pf.readlines()
        ref_lines = rf.readlines()
    true_positive, false_positive, false_negative = update_per_subtoken_statistics(
        zip(ref_lines, pred_lines), true_positive, false_positive,
        false_negative)
    precision, recall, f1 = calculate_results(true_positive, false_positive,
                                              false_negative)
    test_result = {"f1": f1, "precision": precision, "recall": recall}
    IOUtils.dump(result_dir, test_result, IOUtils.Format.jsonPretty)
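For reference, a minimal sketch of the precision/recall/F1 computation that a calculate_results-style helper performs on the subtoken counts (standard definitions; not necessarily this project's exact implementation):

def calculate_results_sketch(true_positive: int, false_positive: int, false_negative: int):
    # Standard precision/recall/F1, with zero-division guards
    precision = true_positive / (true_positive + false_positive) if true_positive + false_positive else 0.0
    recall = true_positive / (true_positive + false_negative) if true_positive + false_negative else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1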
Example #15
    def install_coq_project(cls, project: Project, names_projects: Dict[str, Project]) -> None:
        """
        :requires: the project is cloned and checked-out to the desired version.
        """
        if not project.is_cloned:
            project.clone()
            project.checkout(project.data["sha"], is_forced=True)
        # end if

        # Check if the project is already compiled
        confirmation_file = "lpc-installed.txt"
        confirmation_content = project.revision + " " + BashUtils.run("opam list coq -s", expected_return_code=0).stdout.strip()
        if (project.checkout_dir/confirmation_file).is_file() and IOUtils.load(project.checkout_dir/confirmation_file, IOUtils.Format.txt) == confirmation_content:
            cls.logger.debug(f"Project {project.full_name} already installed")
            return
        # end if

        project.clean()

        # Install dependencies
        for dependency in project.data.get("dependencies", []):
            dependency_project = names_projects.get(dependency)
            if dependency_project is None:  raise Exception(f"Cannot find dependency {dependency}")
            cls.logger.info(f"For Project {project.full_name}, installing dependency {dependency}")
            cls.install_coq_project(dependency_project, names_projects)
        # end for

        if "build_cmd" not in project.data:  raise Exception(f"Project {project.full_name} does not have build_cmd")
        if "install_cmd" not in project.data:  raise Exception(f"Project {project.full_name} does not have install_cmd")

        with IOUtils.cd(project.checkout_dir):
            # Build
            cls.logger.info(f"Project {project.full_name}: Building with {project.data['build_cmd']}")
            r = BashUtils.run(project.data["build_cmd"])
            if r.return_code != 0:
                raise Exception(f"Compilation failed! Return code is {r.return_code}! stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
            else:
                cls.logger.debug(f"Compilation finished. Return code is {r.return_code}. stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
            # end if

            # Install
            cls.logger.info(f"Project {project.full_name}: Installing with {project.data['install_cmd']}")
            r = BashUtils.run(project.data["install_cmd"])
            if r.return_code != 0:
                raise Exception(f"Installation failed! Return code is {r.return_code}! stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
            else:
                cls.logger.debug(f"Installation finished. Return code is {r.return_code}. stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
            # end if

            IOUtils.dump(project.checkout_dir / confirmation_file, confirmation_content, IOUtils.Format.txt)
        # end with
        return
Example #16
    def split_dataset_cross_file(
        cls,
        assignments_path: Path,
        output_dir: Path,
        seed,
        use_new_sub_tokenizer: bool,
    ):
        """Split dataset in a way that assignments that are in test set do not
overlap with those in training/validation set.  Specifically, we split
the entire set of files in testing/training/validation.

        """

        # Load the assignments dataset, as a flattened list
        data_list = cls.load_data_list(assignments_path)
        file_list = cls.shuffle_data(cls.extract_file_list(data_list), seed)

        val_data_list = list()
        test_data_list = list()
        train_data_list = list()

        for fsha in file_list:
            if len(test_data_list) < int(len(data_list) * 0.1):
                test_data_list.extend(
                    cls.extract_assignments_from(fsha, data_list))
            elif len(test_data_list) + len(train_data_list) < int(
                    len(data_list) * 0.9):
                train_data_list.extend(
                    cls.extract_assignments_from(fsha, data_list))
            else:
                val_data_list.extend(
                    cls.extract_assignments_from(fsha, data_list))

        statistics = {
            "num-data": len(data_list),
            "num-data-train": len(train_data_list),
            "num-data-val": len(val_data_list),
            "num-data-test": len(test_data_list),
            "num-files": len(file_list),
        }

        IOUtils.mk_dir(output_dir)
        cls.dump_data_list(output_dir / "train.json", train_data_list)
        cls.dump_data_list(output_dir / "val.json", val_data_list)
        cls.dump_data_list(output_dir / "test.json", test_data_list)
        IOUtils.dump(output_dir / "statistics.json", statistics,
                     IOUtils.Format.jsonNoSort)
        IOUtils.dump(output_dir / "files.json", file_list,
                     IOUtils.Format.jsonNoSort)
        return
Example #17
    def parse_projects(cls, project_list_file):
        """
        Parse the project list file provided by DeepCom and write the GitHub URLs file.
        """
        project_list = IOUtils.load(project_list_file,
                                    IOUtils.Format.txt).splitlines()
        git_urls = list()
        for project in project_list:
            project_name = project.split("_", 1)
            git_urls.append(
                f"https://github.com/{project_name[0]}/{project_name[1]}.git")
        IOUtils.dump(Macros.data_dir / "DeepCom-projects-github.txt",
                     "".join([url + "\n" for url in git_urls]),
                     IOUtils.Format.txt)
Example #18
    def prepare_configs_and_scripts(self, trials: List[int]):

        exp_dir = self.work_dir
        for trial in trials:
            trial_dir = exp_dir/f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            train_script_file = trial_dir/"train.sh"
            train_script = f"#!/bin/bash\n" \
                           f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                           f"module load cuda/10.1 cudnn/7.6.2\n" \
                           f"conda activate {self.CONDA_ENV}\n" \
                           f"cd {self.code_dir}\n" \
                           f"export MKL_SERVICE_FORCE_INTEL=1\n"\
                           f"python3 train.py " \
                           f"-data {self.data_dir}/transformer -save_model {trial_dir}/bestTransformer "\
                           f"-layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8 "\
                           f"-encoder_type transformer -decoder_type transformer -position_encoding "\
                           f"-train_steps 50000  -max_generator_batches 2 -dropout 0.1 "\
                           f"-batch_size 4096 -batch_type tokens -normalization tokens  -accum_count 2 "\
                           f"-optim adam -adam_beta2 0.998 -decay_method noam -warmup_steps 8000 -learning_rate 2 " \
                           f"-max_grad_norm 0 -param_init 0 -param_init_glorot -early_stopping 10 -keep_checkpoint 1 " \
                           f"-label_smoothing 0.1 -valid_steps 500 -save_checkpoint_steps 500 -report_every 500 " \
                           f"--world_size 1 --gpu_ranks 0 " \
                           f"&> {trial_dir}/train-log.txt\n"
            IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

            for test_type in [Macros.test_common, Macros.test_standard]:

                test_script_file = trial_dir/f"{test_type}.sh"
                output_file = trial_dir / f"output_{test_type}.txt"
                test_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"module load cuda/10.1 cudnn/7.6.2\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"cd {self.code_dir}\n" \
                              f"export MKL_SERVICE_FORCE_INTEL=1\n"\
                              f"python3 translate.py "\
                              f"--model {trial_dir}/*.pt --output {output_file} --src {self.data_dir}/src-{test_type}.txt "\
                              f"&> {trial_dir}/{test_type}-log.txt\n" \
                              f"python3 eval_utils.py " \
                              f"{self.data_dir}/tgt-{test_type}.txt {output_file} {trial_dir}/results_{test_type}.json\n"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)

            # end for

        return
Example #19
File: ex_s2s.py Project: mfkiwl/hdlp
def convert_json2txt(config_dict, data_types: List[str] = None):
    if data_types is None:  data_types = ["train", "val", "test"]
    for data_type in data_types:
        data_list = IOUtils.load(os.path.join(DATADIR, config_dict["intermediate_data_dir"], f"{data_type}.json"), IOUtils.Format.json)

        for src_type in config_dict["src_types"]:
            output_path = os.path.join(DATADIR, config_dict["save_dir"], f"src.{src_type}.{data_type}.txt")
            num_pa = int(config_dict["num_pa"])

            with open(output_path, "w") as f:
                for data in data_list:
                    if src_type == "l":
                        seq = data["l"]
                        for pa_i in range(num_pa):
                            seq += " " + data[f"pa{pa_i+1}"]
                        # end for
                    elif src_type == "type":
                        seq = data["l-type-each-token"]
                        for pa_i in range(num_pa):
                            seq += " " + data[f"pa{pa_i+1}-type"]
                        # end for
                    else:
                        raise ValueError(f"Unknown src_type {src_type}")
                    # end if
                    
                    if len(seq) == 0:
                        if src_type == "type":
                            f.write("<pad>\n")
                        else:
                            f.write("<empty>\n")
                        # end if
                    else:
                        f.write(seq+"\n")
                    # end if
                # end for
            # end with
        # end for

        fn_output_path = os.path.join(DATADIR, config_dict["save_dir"], f"src.fn.{data_type}.txt")
        IOUtils.dump(fn_output_path, "".join([data["file_sha"]+"\n" for data in data_list]), IOUtils.Format.txt)

        tgt_output_path = os.path.join(DATADIR, config_dict["save_dir"], f"tgt.{data_type}.txt")
        # [3:-2]: remove prefix "<= " and suffix " ;"
        IOUtils.dump(tgt_output_path, "".join([data["r"][3:-2]+"\n" for data in data_list]), IOUtils.Format.txt)
    # end for
    print("Conversion into txt is done.")
    return
Example #20
    def process_data_impl(self,
            data_dir: Path,
            output_processed_data_dir: Path,
    ) -> NoReturn:
        lemmas: List[Lemma] = IOUtils.dejsonfy(IOUtils.load(data_dir/"lemmas.json", IOUtils.Format.json), List[Lemma])
        definitions: List[Definition] = IOUtils.dejsonfy(IOUtils.load(data_dir/"definitions.json", IOUtils.Format.json), List[Definition])

        docs_sub_tokenizers = SubTokenizer.get_docs_sub_tokenizers(lemmas, definitions)

        # Put data in serialized files
        IOUtils.dump(output_processed_data_dir/f"src.txt",
            "".join([" ".join(self.get_input(lemma, docs_sub_tokenizers)) + "\n" for lemma in lemmas]),
            IOUtils.Format.txt)
        IOUtils.dump(output_processed_data_dir/f"tgt.txt",
            "".join([" ".join(self.get_output(lemma, docs_sub_tokenizers)) + "\n" for lemma in lemmas]),
            IOUtils.Format.txt)
        return
Example #21
    def prepare_configs_and_scripts(self, trials: List[int]):
        with open(self.base_config_file, "r") as f:
            base_config = yaml.load(f, Loader=yaml.FullLoader)
        exp_dir = self.work_dir
        for trial in trials:
            seed = random.randint(0, 9)
            trial_dir = exp_dir / f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            config = copy.copy(base_config)
            config["data"] = str(self.data_dir / "biLSTM")
            config["save_model"] = str(trial_dir / "bestLSTM")
            config_file = trial_dir / "config.yaml"
            with open(config_file, "w+") as f:
                yaml.dump(config, f)
            train_script_file = trial_dir/"train.sh"
            train_script = f"#!/bin/bash\n" \
                           f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                           f"module load cuda/10.1 cudnn/7.6.2\n" \
                           f"conda activate {self.CONDA_ENV}\n" \
                           f"cd {self.code_dir}\n" \
                           f"export MKL_SERVICE_FORCE_INTEL=1\n"\
                           f"python3 train.py --config {config_file} --world_size 1 --gpu_ranks 0 -keep_checkpoint 1 " \
                           f"--seed {seed} &> {trial_dir}/train-log.txt\n"
            IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

            for test_type in [Macros.test_common, Macros.test_standard]:
                test_script_file = trial_dir/f"{test_type}.sh"
                output_file = trial_dir / f"output_{test_type}.txt"
                test_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"module load cuda/10.1 cudnn/7.6.2\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"cd {self.code_dir}\n" \
                              f"export MKL_SERVICE_FORCE_INTEL=1\n"\
                              f"python3 translate.py "\
                              f"--model {trial_dir}/*.pt --output {output_file} --src {self.data_dir}/src-{test_type}.txt "\
                              f"&> {trial_dir}/{test_type}-log.txt\n" \
                              f"python3 eval_utils.py " \
                              f"{self.data_dir}/tgt-{test_type}.txt {output_file} {trial_dir}/results_{test_type}.json\n"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)

            # end for
        return
Example #22
    def test_format_txt_list(self):
        """
        Tests for IOUtils.Format.txtList
        """
        obj = ["abcde", "12345", "x y z"]
        path = Path(tempfile.mktemp())
        expected = "abcde\n12345\nx y z\n"

        # Test dump
        IOUtils.dump(path, obj, IOUtils.Format.txtList)
        self.assertEqual(expected, self.load_plain(path))

        # Test load
        loaded = IOUtils.load(path, IOUtils.Format.txtList)
        self.assertEqual(obj, loaded)

        self.rm(path)
Example #23
    def test_format_json_list(self):
        """
        Tests for IOUtils.Format.jsonList
        """
        obj = ["abcde", [1, 2, 3], {"abc": "def"}]
        path = Path(tempfile.mktemp())
        expected = '"abcde"\n[1, 2, 3]\n{"abc": "def"}\n'

        # Test dump
        IOUtils.dump(path, obj, IOUtils.Format.jsonList)
        self.assertEqual(expected, self.load_plain(path))

        # Test load
        loaded = IOUtils.load(path, IOUtils.Format.jsonList)
        self.assertEqual(obj, loaded)

        self.rm(path)
Example #24
    def compute_bleu(cls, references: str, hypotheses: str, test_result_file: str) -> float:
        with open(references, 'r') as fr, open(hypotheses, 'r') as fh:
            refs = fr.readlines()
            hyps = fh.readlines()
        bleu_4_sentence_scores = []
        for ref, hyp in zip(refs, hyps):
            if len(hyp.strip().split()) < 2:
                bleu_4_sentence_scores.append(0)
            else:
                bleu_4_sentence_scores.append(sentence_bleu([ref.strip().split()], hyp.strip().split(),
                                                        smoothing_function=SmoothingFunction().method2,
                                                        auto_reweigh=True))
        score = 100 * sum(bleu_4_sentence_scores) / float(len(bleu_4_sentence_scores))

        result = {"bleu": score}
        IOUtils.dump(test_result_file, result)
        return score
Example #25
    def process_data(self,
                     method_data_list: List[MethodData],
                     data_type: str,
                     output_dir: Path,
                     split: bool = True):
        Environment.require_collector()

        log_file = output_dir / "collector-log.txt"
        data_file = output_dir / "method-data.json"
        IOUtils.dump(data_file, IOUtils.jsonfy(method_data_list),
                     IOUtils.Format.json)

        config = {
            "transform": True,
            "model": "BiLSTM",
            "dataType": data_type,
            "dataFile": str(data_file),
            "logFile": str(log_file),
            "outputDir": str(output_dir),
        }
        config_file = output_dir / "collector-config.json"
        IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

        self.logger.info(
            f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}"
        )
        rr = BashUtils.run(
            f"java -jar {Environment.collector_jar} {config_file}",
            expected_return_code=0)
        if rr.stdout:
            self.logger.warning(f"Stdout of collector:\n{rr.stdout}")
        # end if
        if rr.stderr:
            self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
        # end if
        # build raw dataset
        if split:
            self.tokenizeFile(output_dir / f"{data_type}.raw.txt", data_type)
        else:
            self.noSplit(output_dir / f"{data_type}.raw.txt", data_type)

        error_ids = IOUtils.load(str(output_dir) + "-error-ids.json")
        print(f"Number of error id is: {len(error_ids)}")
        # BashUtils.run(f"rm {output_dir}-error-ids.json", expected_return_code=0)
        return error_ids
Example #26
    def collect_lat_results(self,
                            model: str,
                            metrics: List[str],
                            debug: bool = False):
        lat_results = {k: 0 for k in metrics}
        model_work_dir = Macros.data_dir / "models-work" / f"{model}-latest" if not debug \
            else Macros.data_dir / "models-work" / f"{model}-latest-debug"
        for trial in range(Macros.trials):
            trial_dir = model_work_dir / "latest" / f"trial-{trial}"
            result_file = f"{trial_dir}/test_result.json"
            metrics_dict = IOUtils.load(result_file)
            for k, v in metrics_dict.items():
                lat_results[k.lower()] += v
        for k, v in lat_results.items():
            lat_results[k] = round(v / Macros.trials, 2)

        output_dir = Macros.results_dir / "metrics"
        IOUtils.dump(output_dir / f"{model}-latest-results.json", lat_results,
                     IOUtils.Format.jsonPretty)
Example #27
File: ex_ngram.py Project: mfkiwl/hdlp
def write_results(targets: List[List[str]],
                  preds: List[List[str]],
                  results_dir: Path,
                  mode="test"):
    bleu_scores = list()
    acc_scores = list()
    exact_acc_scores = list()
    results_preds = ""
    for t, p in zip(targets, preds):
        bleu_score = get_bleu(t, p)
        acc_score = get_accuracy(t, p)
        exact_acc_score = get_exact_match_accuracy(t, p)
        bleu_scores.append(bleu_score)
        acc_scores.append(acc_score)
        exact_acc_scores.append(exact_acc_score)
        results_preds += " ".join(p)
        results_preds += "\n"

    # end for
    avg_bleu = np.mean(bleu_scores)
    avg_acc = np.mean(acc_scores)
    avg_exact_acc = np.mean(exact_acc_scores)

    print(
        f"Average BLEU: {avg_bleu:.3f}, average accuracy: {avg_acc:.3f}, average exact match accuracy: {avg_exact_acc:.3f}"
    )

    results = {
        "bleu-AVG": avg_bleu,
        "acc-AVG": avg_acc,
        "exact-acc-AVG": avg_exact_acc,
        "bleu": bleu_scores,
        "acc": acc_scores,
        "exact-acc": exact_acc_scores,
    }
    isval = ".assignments" if mode == "test" else ".val.assignments"
    results_dir.mkdir(parents=True, exist_ok=True)
    results_file: Path = results_dir / f"testlog{isval}.ngram.log"
    pred_file: Path = results_dir / f"pred{isval}.ngram.log"
    IOUtils.dump(results_file, results, IOUtils.Format.jsonNoSort)
    IOUtils.dump(pred_file, results_preds, IOUtils.Format.txt)
    return
Example #28
    def get_available_projects(self) -> List[str]:
        project_urls = IOUtils.load(Macros.data_dir / "projects.txt",
                                    IOUtils.Format.txt).splitlines()
        project_names = DataCollector.urls_to_names(project_urls)

        project_names_in_db = self.database.ls_projects()
        projects_not_collected = [
            p for p in project_names if p not in project_names_in_db
        ]
        if len(projects_not_collected) > 0:
            self.logger.warning(
                f"Ignoring {len(projects_not_collected)} projects whose data is not collected."
            )
            IOUtils.dump(self.output_dir / "projects-not-collected.txt",
                         "".join([p + "\n" for p in projects_not_collected]),
                         IOUtils.Format.txt)
        # end if
        project_names = [p for p in project_names if p in project_names_in_db]

        return project_names
Example #29
    def collect_all_results(self, model: str, metrics: List[str]):
        # Mapping of eval_setting-year -> test_set -> metric -> [trials]
        all_results: Dict[str, Dict[str, Dict[str, List[Any]]]]

        # Load existing results, if any
        results_file = Macros.results_dir / "metrics" / f"results-trials-{model}.json"
        if results_file.exists():
            self.logger.info(f"Loading existing metrics from {results_file}")
            all_results = IOUtils.load(results_file)
        else:
            all_results = {}

        model_work_dir = Macros.data_dir / "models-work" / model
        for eval_setting in self.EVAL_SETTINGS:
            for year in self.YEARS:
                exp = f"{eval_setting}-{year}"
                exp_results = all_results.setdefault(exp, {})
                for test_set in [Macros.test_common, Macros.test_standard]:
                    set_results = exp_results.setdefault(test_set, {})

                    for trial in range(Macros.trials):
                        trial_dir = model_work_dir / exp / f"trial-{trial}"
                        cur_results_file = trial_dir / f"results_{test_set}.json"
                        if not cur_results_file.exists():
                            self.logger.warning(
                                f"Results not found at {cur_results_file}")
                            # Set default value for set_results[mname], but don't touch existing results if any
                            for mname in metrics:
                                set_results.setdefault(mname,
                                                       [None] * Macros.trials)
                        else:
                            results = IOUtils.load(cur_results_file)
                            for mname in metrics:
                                metric = results[mname]
                                set_results.setdefault(
                                    mname,
                                    [None] * Macros.trials)[trial] = metric

        # Save extracted/updated results
        IOUtils.dump(results_file, all_results, IOUtils.Format.jsonPretty)
        return
Example #30
    def prepare_configs_and_scripts(self, trials: List[int]):
        data_dir = self.work_dir / "data"
        base_config = IOUtils.load(self.base_config_file,
                                   IOUtils.Format.jsonPretty)

        for trial in trials:
            trial_dir = self.work_dir / f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            config = copy.copy(base_config)
            config["data_dir"] = str(data_dir)
            config["model_dir"] = str(trial_dir / "model")
            config["output"] = str(trial_dir / "output.txt")

            config_file = trial_dir / "config.json"
            IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

            training_trace_file = trial_dir / "training-trace.json"

            train_script_file = trial_dir / f"{Macros.train}.sh"
            # The gpu-id argument is necessary for tensorflow, even if we are using CUDA_VISIBLE_DEVICES
            train_script = f"#!/bin/bash\n" \
                           f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                           f"conda activate {self.CONDA_ENV}\n" \
                           f"module load cuda/10.0 cudnn/7.6.2\n" \
                           f"cd {self.code_dir}/translate\n" \
                           f"python3 __main__.py {config_file} --train -v --train-log {training_trace_file} --gpu-id $1 &> {trial_dir}/log-{Macros.train}.txt\n"
            IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {train_script_file}",
                          expected_return_code=0)

            for test_type in [Macros.test_common, Macros.test_standard]:
                output_file = trial_dir / f"output_{test_type}.txt"
                config["output"] = str(output_file)
                test_config_file = trial_dir / f"config_{test_type}.json"
                IOUtils.dump(test_config_file, config,
                             IOUtils.Format.jsonPretty)

                test_script_file = trial_dir / f"{test_type}.sh"
                test_script = f"#!/bin/bash\n" \
                              f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                              f"conda activate {self.CONDA_ENV}\n" \
                              f"module load cuda/10.0 cudnn/7.6.2\n" \
                              f"cd {self.code_dir}/translate\n" \
                              f"python3 __main__.py {test_config_file} --eval {data_dir}/{test_type}/test.token.code {data_dir}/{test_type}/test.token.sbt {data_dir}/{test_type}/test.token.nl --gpu-id $1 &> {trial_dir}/log-{test_type}.txt\n" \
                              f"python3 Bleu.py {data_dir}/{test_type}/test.token.nl {trial_dir}/output_{test_type}.txt {trial_dir}/results_{test_type}.json\n"
                IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {test_script_file}",
                              expected_return_code=0)

        return
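Across these examples the recurring pattern is a dump/load round trip keyed by an IOUtils.Format value. A minimal self-contained sketch of that pattern (assuming seutil's IOUtils; the temp path is illustrative):

import tempfile
from pathlib import Path

from seutil import IOUtils

path = Path(tempfile.mktemp(suffix=".json"))
IOUtils.dump(path, {"k": [1, 2, 3]}, IOUtils.Format.json)            # serialize
assert IOUtils.load(path, IOUtils.Format.json) == {"k": [1, 2, 3]}   # round trip
IOUtils.rm(path)                                                     # clean up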