def extract_data_from_corpus(cls,
        corpus_path: Path,
        trainevals: List[str],
        groups: List[str],
        output_path: Path,
):
    """
    Extracts lemmas and definitions from a corpus, split by group and traineval.

    For each (traineval, group) combination, creates the directory
    output_path/"{group}-{traineval}" and writes lemmas.json and definitions.json
    there, keeping only the items whose data_index is listed in
    project_dir/"training"/"{group}-{traineval}.json".

    :param corpus_path: root of the corpus, handed to FilesManager.
    :param trainevals: each must be a member of Macros.DS_TRAINEVALS.
    :param groups: each must be in Macros.DS_GROUPS or equal Macros.DS_GROUP_TA.
    :param output_path: output directory; created if missing, reused if it is
        already a directory.
    :raises AssertionError: if a traineval or group is not recognized.
    :raises: whatever LoggingUtils.log_and_raise raises (it is passed Exception)
        when output_path exists as a regular file.
    """
    # 1. Prepare output path
    if output_path.is_dir():
        cls.logger.warning(f"{output_path} already exists, will overwrite the files.")
    elif output_path.is_file():
        LoggingUtils.log_and_raise(cls.logger, f"{output_path} already exists as a file. Aborting.", Exception)
    else:
        IOUtils.mk_dir(output_path)
    # end if

    assert all([traineval in Macros.DS_TRAINEVALS for traineval in trainevals])
    assert all([group in Macros.DS_GROUPS+[Macros.DS_GROUP_TA] for group in groups])

    data_mgr = FilesManager(corpus_path)

    # 2. Load lemmas and definitions
    lemmas_filtered: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS_FILTERED], IOUtils.Format.json, is_batched=True, clz=Lemma)
    definitions: List[Definition] = data_mgr.load_data([FilesManager.DEFINITIONS, "definitions.json"], IOUtils.Format.json, clz=Definition)

    # 3. Output to output_path for each combination of traineval and group
    for traineval in trainevals:
        for group in groups:
            IOUtils.mk_dir(output_path/f"{group}-{traineval}")
            # The original call had a stray "]" after the path, which is a syntax error.
            # NOTE(review): project_dir is not defined inside this function; it must be
            # provided by the enclosing module scope -- confirm.
            data_indexes = IOUtils.load(project_dir/"training"/f"{group}-{traineval}.json", IOUtils.Format.json, clz=str)
            IOUtils.dump(output_path/f"{group}-{traineval}/lemmas.json", IOUtils.jsonfy([l for l in lemmas_filtered if l.data_index in data_indexes]), IOUtils.Format.json)
            IOUtils.dump(output_path/f"{group}-{traineval}/definitions.json", IOUtils.jsonfy([d for d in definitions if d.data_index in data_indexes]), IOUtils.Format.json)
        # end for
    # end for
    return
def test_dejsonfy_basic(self):
    """Checks that IOUtils.dejsonfy returns plain JSON-compatible values unchanged."""
    # Each factory is called twice so the expected value and the argument are
    # distinct objects, exactly as with two separate literals.
    value_factories = [
        lambda: "aaa",
        lambda: 42,
        lambda: 1.111,
        lambda: [1, 2.0, "ccc"],
        lambda: {"f1": 1, "f2": 2.0, "f3": "ccc"},
    ]
    for make_value in value_factories:
        expected = make_value()
        self.assertEqual(expected, IOUtils.dejsonfy(make_value()))
    return
def collect_data(cls, **options) -> None:
    """
    Dispatches one data collection task, selected by options["task"].

    Recognized options:
    - "task" (required): one of the cls.TASK_* constants handled below.
    - "corpus": path of the projects file; defaults to
      cls.dataset_dir / "projects-standalone-8.10.yml".
    - "files": read (as a list) for the coq-documents and lemma tasks.
    - "verify-tokenizer": read (as a boolean) for the coq-documents task.

    :raises KeyError: if "task" is not given.
    :raises: whatever LoggingUtils.log_and_raise raises (it is passed ValueError)
        for an unknown task.
    """
    data_mgr = FilesManager(cls.dataset_dir)
    task = options["task"]

    # NOTE(review): the default projects file has a .yml extension but is loaded with
    # the "json" format -- confirm the file content is JSON-compatible.
    projects_path = Path(options.get("corpus", cls.dataset_dir / "projects-standalone-8.10.yml"))
    projects: List[Project] = IOUtils.dejsonfy(IOUtils.load(projects_path, "json"), Project)

    if task == cls.TASK_COQ_DOCUMENTS:
        files = Utils.get_option_as_list(options, "files", None)
        is_verifying_tokenizer = Utils.get_option_as_boolean(options, "verify-tokenizer")
        cls.collect_coq_documents_projects(data_mgr, projects, files, is_verifying_tokenizer)
    elif task == cls.TASK_DATA_INDEXES:
        cls.collect_data_indexes(data_mgr, projects)
    elif task == cls.TASK_DEFINITIONS:
        cls.collect_definitions(data_mgr)
    elif task == cls.TASK_INSTALL_COQ_PROJECTS:
        cls.install_coq_projects(projects)
    elif task == cls.TASK_LEMMA:
        files = Utils.get_option_as_list(options, "files", None)
        cls.collect_lemmas(data_mgr, projects, files)
    elif task == cls.TASK_LEMMA_BACKEND_SEXP_TRANSFORMATIONS:
        cls.collect_lemmas_backend_sexp_transformations(data_mgr)
    elif task == cls.TASK_LEMMA_FILTERED:
        cls.filter_lemmas(data_mgr)
    elif task == cls.TASK_LEMMA_FOREEND_SEXP_TRANSFORMATIONS:
        cls.collect_lemmas_foreend_sexp_transformations(data_mgr)
    else:
        LoggingUtils.log_and_raise(cls.logger, f"Unknown task {task}", ValueError)
    # end if
    return
def write_seq_len_stat(num_pa, ref_modelname):
    """
    Computes sequence length statistics of the lhs inputs and writes them as json.

    The first statistic covers the lhs sequences of train+val+test. Then, for each
    previous-assignment index in range(num_pa), every non-"<empty>" previous
    assignment is prepended (cumulatively) to the matching lhs sequence and the
    statistic of the resulting sequences is appended.

    :param num_pa: number of src.prevassign{i} files to fold in.
    :param ref_modelname: name of the sub-directory of data/vhdl to read from.
    """
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), f"data/vhdl/{ref_modelname}")

    def read_tokenized(prefix):
        # Concatenates the whitespace-tokenized lines of the train, val, test files.
        rows = list()
        for mode in ["train", "val", "test"]:
            file_name = os.path.join(data_dir, f"{prefix}.{mode}.txt")
            content = IOUtils.load(file_name, IOUtils.Format.txt)
            rows.extend(line.split() for line in content.strip().splitlines())
        return rows

    src_l = read_tokenized("src.l")
    stat_list = [get_seq_len_stat(src_l)]

    for i in range(num_pa):
        src_pa = read_tokenized(f"src.prevassign{i}")
        result_list = list()
        for j, pa in enumerate(src_pa):
            # src_l is updated in place, so later rounds build on earlier ones
            if pa != ["<empty>"]:
                src_l[j] = pa + src_l[j]
            result_list.append(src_l[j])
        stat_list.append(get_seq_len_stat(result_list))

    results_file = os.path.join("../slpproject/_results/vhdl/ALL/metrics", f"lhs-pa-len-stat.json")
    IOUtils.dump(results_file, stat_list, IOUtils.Format.json)
    return
def test_format_yaml(self):
    """
    Tests for IOUtils.Format.yaml: dumps each object, compares the file content
    with the expected text, then loads it back and compares with the object.
    """
    objs = [
        42.001,
        "aaa",
        [13, "24", 56.7],
        {
            "name": "K",
            "job": "Y"
        },
    ]
    exp_strs = [
        "42.001\n...\n",
        "aaa\n...\n",
        "- 13\n- '24'\n- 56.7\n",
        "job: Y\nname: K\n",  # dictionary are forced to be sorted
    ]

    for obj, exp_str in zip(objs, exp_strs):
        # A private temporary directory replaces the deprecated, race-prone
        # tempfile.mktemp(), and is removed even when an assertion fails.
        with tempfile.TemporaryDirectory() as temp_dir:
            path = Path(temp_dir) / "obj"

            # Test dump
            IOUtils.dump(path, obj, IOUtils.Format.yaml)
            self.assertEqual(exp_str, self.load_plain(path))

            # Test load
            loaded = IOUtils.load(path, IOUtils.Format.yaml)
            self.assertEqual(obj, loaded)
def split_project(self, method_file: Path, random_seed: int, debug: bool = False):
    """
    Split projects into train, val, test according to the project names.

    Will get 2 new files: project-list.json, project-split.json.

    :param method_file: json file holding a top-level array of objects, each with
        a "prj_name" field; it is streamed with ijson.
    :param random_seed: seed for the (global) random module before shuffling.
    :param debug: selects /tmp/nlpast-data-10 instead of /tmp/nlpast-data-880 as
        the output directory.
    """
    proj_names = set()
    with open(method_file, "r") as f:
        for o in ijson.items(f, "item"):
            proj_names.add(o["prj_name"])

    # Sort before shuffling: the iteration order of a set of strings changes between
    # interpreter runs (hash randomization), which would make the seeded shuffle,
    # and therefore the split, irreproducible.
    proj_list = sorted(proj_names)
    num_proj = len(proj_list)

    if debug:
        output_dir = Path("/tmp/nlpast-data-10")
    else:
        output_dir = Path("/tmp/nlpast-data-880")
    IOUtils.dump(output_dir / "project-list.json", proj_list)

    random.seed(random_seed)
    random.shuffle(proj_list)

    # 80% train, 10% val, the remainder test
    train_index = round(num_proj * 0.8)
    valid_index = train_index + round(num_proj * 0.1)
    project_split = {
        "train": proj_list[:train_index],
        "val": proj_list[train_index:valid_index],
        "test": proj_list[valid_index:],
    }
    IOUtils.dump(output_dir / "project-split.json", project_split)
def load_configs(self, prj_root: Optional[Path] = None, force_reload: bool = False):
    """
    Loads configs into this user interface: the global config file first, then
    the project-local one (when prj_root is given).

    Nothing is done when prj_root equals the project whose configs were loaded
    last, unless force_reload is set.
    """
    if not force_reload:
        if prj_root is not None and prj_root == self.loaded_config_prj:
            # Configs of this project are already in place
            return

    # Forget which project-local config was loaded
    self.loaded_config_prj = None

    # Global config
    global_path = RoosterizeDirUtils.get_global_config_file()
    if global_path.exists():
        self.set_configs_from_dict(
            IOUtils.load(global_path, IOUtils.Format.yaml),
            self.GLOBAL_CONFIGS,
        )

    if prj_root is None:
        return

    # Project-local config
    local_path = RoosterizeDirUtils.get_local_config_file(prj_root)
    if local_path.exists():
        self.set_configs_from_dict(
            IOUtils.load(local_path, IOUtils.Format.yaml),
            self.LOCAL_CONFIGS,
        )

    self.loaded_config_prj = prj_root
def main_val(modelname, ref_modelname):
    """
    Scores the baseline predictions on the val set and writes the results.

    Computes BLEU, accuracy and exact match accuracy for every
    (prediction, target) pair, prints the averages, and dumps both the metrics
    and the stripped predictions under SAVE_DIR/modelname.
    """
    bleus, accs, exact_accs = [], [], []
    target_list = get_targets(ref_modelname, "val")
    preds_list = get_baseline_preds(ref_modelname, "val")

    for pred, target in zip(preds_list, target_list):
        # Split on single spaces and drop the empty tokens
        pred_split = [tok for tok in pred.split(" ") if tok != ""]
        target_split = [tok for tok in target.split(" ") if tok != ""]
        bleus.append(get_bleu(target=target_split, pred=pred_split))
        accs.append(get_accuracy(target=target_split, pred=pred_split))
        exact_accs.append(get_exact_match_accuracy(target=target_split, pred=pred_split))

    avg_bleu = np.mean(bleus)
    avg_acc = np.mean(accs)
    avg_exact_acc = np.mean(exact_accs)
    print(f"Average BLEU: {avg_bleu:.3f}, average accuracy: {avg_acc:.3f}, average exact match accuracy: {avg_exact_acc:.3f}")

    model_save_dir = os.path.join(SAVE_DIR, modelname)
    results = {
        "bleu-AVG": avg_bleu,
        "acc-AVG": avg_acc,
        "exact-acc-AVG": avg_exact_acc,
        "bleu": bleus,
        "acc": accs,
        "exact-acc": exact_accs,
    }
    IOUtils.dump(os.path.join(model_save_dir, "testlog.val.assignments.baseline.log"), results, IOUtils.Format.jsonNoSort)

    # One stripped prediction per line
    preds_text = "".join(pred.strip() + "\n" for pred in preds_list)
    IOUtils.dump(os.path.join(model_save_dir, "pred.val.assignments.baseline.log"), preds_text, IOUtils.Format.txt)
    return
def load_data(num_pa, ref_modelname):
    """
    Loads the full assignment sequences of the train, val, test sets.

    Each sequence is the lhs tokens, the token "<=", and the target tokens; the
    tokens of previous assignments 0..num_pa-1 are then prepended in turn, so the
    highest-numbered one ends up first.

    :param num_pa: number of src.prevassign{i} files to prepend.
    :param ref_modelname: name of the sub-directory of data/vhdl to read from.
    :return: dict from mode ("train", "val", "test") to its list of token lists.
    """
    src_dict = dict()
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), f"data/vhdl/{ref_modelname}")
    for mode in ["train", "val", "test"]:
        src_l_file = os.path.join(data_dir, f"src.l.{mode}.txt")
        src_l = [
            l.split()
            for l in IOUtils.load(src_l_file, IOUtils.Format.txt).strip().splitlines()
        ]
        src_r_file = os.path.join(data_dir, f"tgt.{mode}.txt")
        src_r = [
            l.split()
            for l in IOUtils.load(src_r_file, IOUtils.Format.txt).strip().splitlines()
        ]
        src_seq = [l + ["<="] + r for l, r in zip(src_l, src_r)]

        for i in range(num_pa):
            src_pa_file = os.path.join(data_dir, f"src.prevassign{i}.{mode}.txt")
            src_pa = [
                l.split()
                for l in IOUtils.load(src_pa_file, IOUtils.Format.txt).strip().splitlines()
            ]
            for j, pa in enumerate(src_pa):
                src_seq[j] = pa + src_seq[j]

        src_dict[mode] = src_seq
    return src_dict
def download_global_model(self, force_yes: bool = False):
    """
    Downloads a global Roosterize model.

    If a model already exists at the global model directory, it is deleted and
    downloaded again only after confirmation.

    :param force_yes: if True, an existing model is replaced without asking.
    """
    global_model_dir = RoosterizeDirUtils.get_global_model_dir()
    if global_model_dir.exists():
        # Only prompt when the answer is not already forced to "yes"
        if not force_yes:
            ans = self.ask_for_confirmation(
                f"A Roosterize model already exists at {global_model_dir}. "
                f"Do you want to delete it and download again?")
            # Deliberately compares with True, as before, rather than testing truthiness
            if ans != True:
                return
        IOUtils.rm_dir(global_model_dir)

    self.show_message("Downloading Roosterize model...")

    temp_model_dir = Path(tempfile.mkdtemp(prefix="roosterize"))
    try:
        # Download and unpack
        urllib.request.urlretrieve(self.model_url, str(temp_model_dir / "model.tgz"))
        with IOUtils.cd(temp_model_dir):
            BashUtils.run("tar xzf model.tgz", expected_return_code=0)

            # Move the stuff to global model place
            shutil.move(str(Path.cwd() / "model"), global_model_dir)
    finally:
        # Delete temp dir, also when the download or unpacking failed
        IOUtils.rm_dir(temp_model_dir)

    self.show_message("Finish downloading Roosterize model.")
def prepare_data(self):
    """
    Recreates work_dir/data and copies into it the train, valid, test_common and
    test_standard directories plus the vocab files of the current eval setting
    and year, all taken from model_data_dir.
    """
    data_prefix = f"{self.eval_setting}-{self.year}"
    data_dir = self.work_dir / "data"

    self.logger.info(
        f"Preparing the data for {self.eval_setting} {self.year} at {data_dir}"
    )
    IOUtils.rm_dir(data_dir)
    IOUtils.mk_dir(data_dir)

    copy_cmds = [
        # train/val/test_common/test_standard data
        f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.train}/train {data_dir}/train",
        f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.val}/valid {data_dir}/valid",
        f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.test_common}/test {data_dir}/{Macros.test_common}",
        f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/test {data_dir}/{Macros.test_standard}",
        # vocab
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/vocab* {data_dir}/",
    ]
    for cmd in copy_cmds:
        BashUtils.run(cmd, expected_return_code=0)
    return
def process_data(self, project_dir):
    """
    Beta-filters the methods added between consecutive years of one project.

    Reads collector/method-project-revision.json and collector/method-data.json
    under project_dir. For each pair of consecutive years in BetaFilter.YEARS, the
    method ids present on Jan 1 of the next year but not of the current one are
    passed through BetaFilter.beta_filter; the results are written to
    collector/method-project-beta-filtered.json.

    This is best-effort: any ordinary exception is logged and swallowed.

    :param project_dir: path of the project's results directory.
    """
    try:
        revision_data = IOUtils.load(project_dir / "collector" / "method-project-revision.json")
        method_data = IOUtils.load(project_dir / "collector" / "method-data.json")
        output_dir = project_dir / "collector"

        def method_ids_at(time):
            # First revision entry recorded for the given time; IndexError if none
            return [year_data["method_ids"] for year_data in revision_data if year_data["year"] == time][0]

        method_project_evo = []
        for year in BetaFilter.YEARS[:-1]:
            curr_time = f"{year}_Jan_1"
            curr_method_ids = method_ids_at(curr_time)
            next_time = f"{year + 1}_Jan_1"
            next_method_ids = method_ids_at(next_time)
            new_method_ids = list(set(next_method_ids) - set(curr_method_ids))
            filtered_method_ids = BetaFilter.beta_filter(new_method_ids, curr_method_ids, method_data)
            method_project_evo.append({
                "prj_name": revision_data[0]["prj_name"],
                "time": f"{curr_time}-{next_time}",
                "method_ids": filtered_method_ids
            })

        IOUtils.dump(output_dir / "method-project-beta-filtered.json",
                     IOUtils.jsonfy(method_project_evo),
                     IOUtils.Format.json)
        return
    except Exception as e:
        # Not a bare except: KeyboardInterrupt and SystemExit must still propagate.
        # exc_info attaches the traceback so the failure can be diagnosed.
        self.logger.warning(f"Unexpected error: {type(e)}", exc_info=True)
        return
def load_data(self,
        rel_path: Union[str, List[str]],
        fmt: IOUtils.Format,
        is_batched: bool = False,
        clz = None,
) -> Any:
    """
    Loads data stored under data_dir at rel_path.

    :param rel_path: relative path, as accepted by assemble_rel_path.
    :param fmt: format of the file(s).
    :param is_batched: if True, rel_path is a directory of
        batch-{number}.{extension} files, loaded in increasing batch number and
        concatenated into one list.
    :param clz: for json formats, the class passed to IOUtils.dejsonfy for the
        loaded data; a warning is logged when it is missing.
    :return: the loaded (and possibly dejsonfied) data.
    :raises: whatever LoggingUtils.log_and_raise raises (it is passed IOError)
        when nothing exists at the path.
    """
    if clz is None and self.is_json_format(fmt):
        self.logger.warning(f"Load data from {rel_path} with json format, but did not specify clz (at {traceback.format_stack()})")
    # end if

    abs_path = self.data_dir / self.assemble_rel_path(rel_path)
    if not abs_path.exists():
        LoggingUtils.log_and_raise(self.logger, f"Cannot find data at {abs_path}", IOError)
    # end if

    def load_one(path):
        # Loads a single file, dejsonfying it when applicable
        content = IOUtils.load(path, fmt)
        if self.is_json_format(fmt) and clz is not None:
            content = IOUtils.dejsonfy(content, clz)
        # end if
        return content

    if not is_batched:
        return load_one(abs_path)
    # end if

    # The batch number is the part of the file stem after the first "-"
    batch_numbers = sorted(int(str(f.stem).split("-")[1]) for f in abs_path.iterdir())
    data = list()
    for batch_number in tqdm(batch_numbers):
        data.extend(load_one(abs_path / f"batch-{batch_number}.{fmt.get_extension()}"))
    # end for
    return data
def suggest_lemmas(**options):
    """
    Extracts the lemmas of a project, applies a trained model to them and prints
    the suggestions.

    Required options: "project", "output", "model-dir" (all paths).
    Optional options: "files", "exclude-files" (lists), "exclude-pattern",
    "serapi-options".
    """
    from roosterize.data.DataMiner import DataMiner
    from roosterize.data.ModelSpec import ModelSpec
    from roosterize.ml.MLModels import MLModels

    project_path = Path(options["project"]).absolute()
    files = Utils.get_option_as_list(options, "files", None)
    exclude_files = Utils.get_option_as_list(options, "exclude-files", None)
    exclude_pattern = options.get("exclude-pattern", None)
    serapi_options = options.get("serapi-options", "")
    output_dir = Path(options["output"]).absolute()
    model_dir = Path(options["model-dir"]).absolute()

    # Intermediate and final locations under the output directory
    raw_data_dir = output_dir/"raw-data"
    processed_data_dir = output_dir/"eval-processed-data"
    result_dir = output_dir/"eval-result"

    # Extract data
    print(">>>>> Extracting lemmas ...")
    DataMiner.extract_data_project(project_path, files, exclude_files, exclude_pattern, serapi_options, raw_data_dir)

    # Get the ML model
    print(">>>>> Initializing model ...")
    spec_json = IOUtils.load(model_dir/"spec.json", IOUtils.Format.json)
    model_spec = IOUtils.dejsonfy(spec_json, ModelSpec)
    model = MLModels.get_model(model_dir, model_spec, is_eval=True)

    # Process data
    print(">>>>> Processing data ...")
    model.process_data(raw_data_dir, processed_data_dir)

    # Eval
    print(">>>>> Applying model ...")
    model.eval(processed_data_dir, result_dir)

    # Print suggestions
    print(">>>>> Suggestions:")
    suggestions = IOUtils.load(result_dir/"suggestions.txt", IOUtils.Format.txt)
    print(suggestions)
    return
def data_cut(self, data_size: int):
    """
    Cuts down the dataset to data_size projects, then saves the projects list to
    data_dir.

    Walks the projects listed in data_dir/projects-github.txt in order and keeps
    the first data_size of them that have a directory under repos_results_dir and
    a non-empty "method_ids" list for "2020_Jan_1" in their
    collector/method-project-revision.json. The kept names are written to
    data_dir/projects-github-{data_size}.json.

    :param data_size: maximum number of projects to keep.
    :raises IndexError: if a checked project has no "2020_Jan_1" entry.
    """
    collected_projects_file = Macros.data_dir / "projects-github.txt"
    self.collected_projects_list = list()
    if collected_projects_file.exists():
        self.collected_projects_list += IOUtils.load(collected_projects_file, IOUtils.Format.txt).splitlines()
    # end if

    project_name_list = list()
    for project_url in self.collected_projects_list:
        user_repo = self.parse_github_url(project_url)
        project_name_list.append(f"{user_repo[0]}_{user_repo[1]}")

    # Names of the projects with results; a set gives O(1) membership and removal
    remaining_projects = {x.name for x in Macros.repos_results_dir.iterdir() if x.is_dir()}

    # Find the overlapping projects and select the top data_size projects
    reduced_project_list = list()
    for p in project_name_list:
        if len(reduced_project_list) >= data_size:
            # Enough projects selected; no need to load further revision data
            break
        if p not in remaining_projects:
            continue
        # load the revision data
        filtered_methods = IOUtils.load(Macros.repos_results_dir / p / "collector" / "method-project-revision.json")
        new_method_ids = [
            delta_data["method_ids"]
            for delta_data in filtered_methods
            if delta_data["year"] == "2020_Jan_1"
        ][0]
        if new_method_ids:
            reduced_project_list.append(p)
            # A project listed twice must not be selected twice
            remaining_projects.discard(p)

    IOUtils.dump(Macros.data_dir / f"projects-github-{data_size}.json", reduced_project_list, IOUtils.Format.jsonNoSort)
def parse_file(self, file_path: Path, prj_root: Path, serapi_options: str):
    """
    Parses one file with sercomp/sertok and collects its lemmas and definitions.

    :param file_path: the file to parse; must be located under prj_root.
    :param prj_root: project root; the commands run with it as working directory
        and the document's file name is recorded relative to it.
    :param serapi_options: options inserted into the sercomp and sertok command
        lines, and passed on to lemma collection.
    :return: a ProcessedFile bundling the source code, the parsed document, both
        sexp lists, the unicode offsets, the lemmas and the definitions.
    """
    source_code = IOUtils.load(file_path, IOUtils.Format.txt)
    unicode_offsets = ParserUtils.get_unicode_offsets(source_code)
    rel_path = file_path.relative_to(prj_root)

    with IOUtils.cd(prj_root):
        sercomp_result = BashUtils.run(
            f"sercomp {serapi_options} --mode=sexp -- {rel_path}",
            expected_return_code=0)
        ast_sexp_str = sercomp_result.stdout
        sertok_result = BashUtils.run(
            f"sertok {serapi_options} -- {rel_path}",
            expected_return_code=0)
        tok_sexp_str = sertok_result.stdout

        ast_sexp_list: List[SexpNode] = SexpParser.parse_list(ast_sexp_str)
        tok_sexp_list: List[SexpNode] = SexpParser.parse_list(tok_sexp_str)

        doc = CoqParser.parse_document(
            source_code,
            ast_sexp_list,
            tok_sexp_list,
            unicode_offsets=unicode_offsets,
        )
        doc.file_name = str(rel_path)

        # Collect lemmas & definitions
        lemmas: List[Lemma] = DataMiner.collect_lemmas_doc(doc, ast_sexp_list, serapi_options)
        definitions: List[Definition] = DataMiner.collect_definitions_doc(doc, ast_sexp_list)

    return ProcessedFile(
        file_path,
        source_code,
        doc,
        ast_sexp_list,
        tok_sexp_list,
        unicode_offsets,
        lemmas,
        definitions,
    )
def prepare_data(self):
    """
    Recreates work_dir/data and copies into it the code2seq train, val,
    test_common, test_standard and dict files of the current eval setting and
    year, all taken from model_data_dir.
    """
    data_prefix = f"{self.eval_setting}-{self.year}"
    data_dir = self.work_dir / "data"

    self.logger.info(
        f"Preparing the data for {self.eval_setting} {self.year} at {self.work_dir}"
    )
    IOUtils.rm_dir(data_dir)
    IOUtils.mk_dir(data_dir)

    copy_cmds = [
        # train/val/test_common/test_standard data; the test files are renamed
        # after their test type
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/code2seq.train.c2s {self.work_dir}/data/",
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.val}/code2seq.val.c2s {self.work_dir}/data/",
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/code2seq.test.c2s {self.work_dir}/data/code2seq.{Macros.test_common}.c2s",
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/code2seq.test.c2s {self.work_dir}/data/code2seq.{Macros.test_standard}.c2s",
        # vocab
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/code2seq.dict.c2s {self.work_dir}/data/",
    ]
    for cmd in copy_cmds:
        BashUtils.run(cmd, expected_return_code=0)
    return
def dump(self, path: Path):
    """
    Writes the word_to_index, index_to_word, next_index and counter attributes of
    this object to path as pretty-printed json.
    """
    field_names = ("word_to_index", "index_to_word", "next_index", "counter")
    state = {name: getattr(self, name) for name in field_names}
    IOUtils.dump(path, state, IOUtils.Format.jsonPretty)
    return
def clean_path(self, rel_path: Union[str, List[str]]):
    """
    Removes whatever exists under data_dir at rel_path; does nothing if the path
    does not exist.

    :param rel_path: relative path, as accepted by assemble_rel_path.
    """
    abs_path = self.data_dir / self.assemble_rel_path(rel_path)
    if not abs_path.exists():
        return
    # end if

    self.logger.info(f"Removing existing things at {abs_path}")
    IOUtils.rm(abs_path)
    return
def __init__(self, database: Database):
    """
    Keeps the database, starts with empty statistics, and recreates (removes,
    then makes) the output directory data_dir/split.
    """
    self.database = database
    self.statistics = {}

    self.output_dir = Macros.data_dir / "split"
    IOUtils.rm_dir(self.output_dir)
    IOUtils.mk_dir(self.output_dir)
def eval_impl(self,
        processed_data_dir: Path,
        model_dir: Path,
        beam_search_size: int,
        k: int
) -> List[List[Tuple[str, float]]]:
    """
    Translates the processed data with the checkpoint recorded in best-step.json.

    :param processed_data_dir: directory holding src.txt and tgt.txt.
    :param model_dir: directory holding best-step.json and
        models/ckpt_step_{best_step}.pt; last-pred.txt is set as the output there.
    :param beam_search_size: beam size for the translator.
    :param k: number of candidates (n_best) requested per input.
    :return: per input, the candidates as (tokens concatenated without separator,
        log probability) pairs.
    """
    from roosterize.ml.onmt.CustomTranslator import CustomTranslator
    from onmt.utils.misc import split_corpus
    from onmt.utils.parse import ArgumentParser
    from translate import _get_parser as translate_get_parser

    src_path = processed_data_dir/"src.txt"
    tgt_path = processed_data_dir/"tgt.txt"

    best_step = IOUtils.load(model_dir/"best-step.json", IOUtils.Format.json)
    self.logger.info(f"Taking best step at {best_step}")

    tokenized_candidates: List[List[Tuple[List[str], float]]] = list()

    with IOUtils.cd(self.open_nmt_path):
        cli_args = (
            f" -model {model_dir}/models/ckpt_step_{best_step}.pt"
            f" -src {src_path}"
            f" -tgt {tgt_path}"
        )
        opt = translate_get_parser().parse_args(cli_args)
        opt.output = f"{model_dir}/last-pred.txt"
        opt.beam_size = beam_search_size
        opt.gpu = 0 if torch.cuda.is_available() else -1
        opt.n_best = k
        opt.block_ngram_repeat = 1
        opt.ignore_when_blocking = ["_"]

        # translate.main
        ArgumentParser.validate_translate_opts(opt)
        translator = CustomTranslator.build_translator(opt, report_score=False)

        src_shards = split_corpus(opt.src, opt.shard_size)
        if opt.tgt is not None:
            tgt_shards = split_corpus(opt.tgt, opt.shard_size)
        else:
            tgt_shards = repeat(None)
        # end if

        for shard_i, (src_shard, tgt_shard) in enumerate(zip(src_shards, tgt_shards)):
            self.logger.info("Translating shard %d." % shard_i)
            _, _, shard_candidates = translator.translate(
                src=src_shard,
                tgt=tgt_shard,
                src_dir=opt.src_dir,
                batch_size=opt.batch_size,
                attn_debug=opt.attn_debug
            )
            tokenized_candidates.extend(shard_candidates)
        # end for
    # end with

    # Reformat candidates: glue the tokens of each candidate together
    candidates_logprobs: List[List[Tuple[str, float]]] = [
        [("".join(tokens), logprob) for tokens, logprob in candidates]
        for candidates in tokenized_candidates
    ]
    return candidates_logprobs
def suggest_naming(self, file_path: Path, prj_root: Optional[Path] = None):
    """
    Processes a file to get its lemmas and runs the model to get predictions.

    :param file_path: the file to process.
    :param prj_root: root of the project the file belongs to; inferred from
        file_path when None.
    """
    # Figure out which project we're at, and then load configs
    if prj_root is None:
        prj_root = RoosterizeDirUtils.auto_infer_project_root(file_path)
    self.load_configs(prj_root)

    # Infer SerAPI options
    serapi_options = self.infer_serapi_options(prj_root)

    # If user provided compile_cmd, first compile the project
    if self.compile_cmd is not None:
        with IOUtils.cd(prj_root):
            BashUtils.run(self.compile_cmd, expected_return_code=0)

    # Parse file
    data = self.parse_file(file_path, prj_root, serapi_options)

    # Load model
    self.load_local_model(prj_root)
    model = self.get_model()

    # Use the model to make predictions
    # Temp dirs for processed data and results
    temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))
    try:
        # Dump lemmas & definitions
        temp_raw_data_dir = temp_data_dir / "raw"
        temp_raw_data_dir.mkdir()
        IOUtils.dump(
            temp_raw_data_dir / "lemmas.json",
            IOUtils.jsonfy(data.lemmas),
            IOUtils.Format.json,
        )
        IOUtils.dump(
            temp_raw_data_dir / "definitions.json",
            IOUtils.jsonfy(data.definitions),
            IOUtils.Format.json,
        )

        # Model-specific process
        temp_processed_data_dir = temp_data_dir / "processed"
        temp_processed_data_dir.mkdir()
        model.process_data_impl(temp_raw_data_dir, temp_processed_data_dir)

        # Invoke eval
        candidates_logprobs = model.eval_impl(
            temp_processed_data_dir,
            beam_search_size=self.beam_search_size,
            k=self.k,
        )
    finally:
        # Delete the temp dir, also when processing or eval raised
        IOUtils.rm_dir(temp_data_dir)

    # Report predictions
    self.report_predictions(data, candidates_logprobs)
    return
def convert_json2txt(config_dict, data_types: List[str] = None):
    """
    Converts the intermediate json data into per-field text files.

    For each data type, writes one src.{src_type}.{data_type}.txt per configured
    src type (one line per data item, with a placeholder for empty fields), plus
    src.fn.{data_type}.txt with the file shas and tgt.{data_type}.txt with the
    rhs.

    :param config_dict: reads the keys "intermediate_data_dir", "save_dir",
        "src_types" and "augment".
    :param data_types: data types to convert; defaults to train, val, test.
    :raises ValueError: on a src type other than l, type, prevassign, patype.
    """
    if data_types is None:
        data_types = ["train", "val", "test"]

    save_dir = os.path.join(DATADIR, config_dict["save_dir"])

    for data_type in data_types:
        json_path = os.path.join(DATADIR, config_dict["intermediate_data_dir"], f"{data_type}.json")
        data_list = IOUtils.load(json_path, IOUtils.Format.json)

        for src_type in config_dict["src_types"]:
            output_path = os.path.join(save_dir, f"src.{src_type}.{data_type}.txt")
            pa_i = int(config_dict["augment"])

            # src type -> name of the field holding its content
            field_by_src_type = {
                "l": "l",
                "type": "l-type",
                "prevassign": f"pa{pa_i}",
                "patype": f"pa{pa_i}-type",
            }
            if src_type not in field_by_src_type:
                raise ValueError(f"Unknown src_type {src_type}")
            # end if
            field = field_by_src_type[src_type]

            with open(output_path, "w") as f:
                for data in data_list:
                    if len(data[field]) != 0:
                        f.write(data[field] + "\n")
                    elif field.endswith("-type"):
                        # Empty type fields are padded
                        f.write("<pad>\n")
                    else:
                        f.write("<empty>\n")
                    # end if
                # end for
            # end with
        # end for

        fn_output_path = os.path.join(save_dir, f"src.fn.{data_type}.txt")
        file_shas = "".join(data["file_sha"] + "\n" for data in data_list)
        IOUtils.dump(fn_output_path, file_shas, IOUtils.Format.txt)

        tgt_output_path = os.path.join(save_dir, f"tgt.{data_type}.txt")
        # [3:-2]: remove prefix "<= " and suffix " ;"
        targets = "".join(data["r"][3:-2] + "\n" for data in data_list)
        IOUtils.dump(tgt_output_path, targets, IOUtils.Format.txt)
    # end for

    print("Conversion into txt is done.")
    return
def prepare_configs_and_scripts(self, trials: List[int]):
    """
    Prepares, for each trial, the trial directory (with models and logs
    sub-directories), a copy of the base config file, one executable training
    script, and one executable test script per test type (test_common,
    test_standard).

    :param trials: trial numbers; each gets the directory work_dir/trial-{trial}.
    """
    def script_preamble() -> str:
        # Lines shared by every script: shebang, conda setup, cuda modules, cd to code
        return (
            f"#!/bin/bash\n"
            f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n"
            f"conda activate {self.CONDA_ENV}\n"
            f"module load cuda/10.0 cudnn/7.6.2\n"
            f"cd {self.code_dir}\n"
        )

    def write_executable(script_file, script):
        # Writes the script and marks it executable
        IOUtils.dump(script_file, script, IOUtils.Format.txt)
        BashUtils.run(f"chmod +x {script_file}", expected_return_code=0)

    exp_dir = self.work_dir
    for trial in trials:
        trial_dir = exp_dir / f"trial-{trial}"
        model_dir = trial_dir / "models"
        log_dir = trial_dir / "logs"
        for new_dir in (trial_dir, model_dir, log_dir):
            IOUtils.mk_dir(new_dir)

        data = str(exp_dir / "data/code2seq")
        val_data = data + ".val.c2s"
        train_log = trial_dir / "training-trace.json"
        train_script_file = trial_dir / f"{Macros.train}.sh"

        # Copy config file
        BashUtils.run(
            f"cp {self.base_config_file} {trial_dir}/config.yaml",
            expected_return_code=0)
        config_file = trial_dir / "config.yaml"

        # Training script (no trailing newline after the command)
        output_file = trial_dir / "output_tmp.txt"
        reference_file = trial_dir / "ref_tmp.txt"
        train_script = script_preamble() + (
            f"python -u code2seq.py --data {data} --test {val_data} --log {train_log} --config {config_file} "
            f"--pred_file {output_file} --ref_file {reference_file} "
            f"--save_prefix {model_dir}/model --gpu_id $1 &> {trial_dir}/train-log.txt"
        )
        write_executable(train_script_file, train_script)

        # Test scripts: run the best model, then score the output
        for test_type in [Macros.test_common, Macros.test_standard]:
            test_data = exp_dir / "data" / f"code2seq.{test_type}.c2s"
            output_file = trial_dir / f"output_{test_type}.txt"
            reference_file = trial_dir / f"ref_{test_type}.txt"
            test_script_file = trial_dir / f"{test_type}.sh"
            test_script = script_preamble() + (
                f"python3 code2seq.py --load {model_dir}/model_best --test {test_data} --config {config_file} "
                f"--pred_file {output_file} --ref_file {reference_file} "
                f"--gpu_id $1 &> {trial_dir}/{test_type}-log.txt\n"
                f"python3 eval_utils.py {reference_file} {output_file} {trial_dir}/results_{test_type}.json\n"
            )
            write_executable(test_script_file, test_script)
    return
def get_eval_stats(pred_file: Path, ref_file: Path, result_dir: Path):
    """
    Computes precision, recall and f1 of the predictions against the references
    and writes them as pretty-printed json.

    :param pred_file: file with one prediction per line.
    :param ref_file: file with one reference per line.
    :param result_dir: path the results are dumped to.
    """
    with open(pred_file, "r") as pf, open(ref_file, "r") as rf:
        pred_lines = pf.readlines()
        ref_lines = rf.readlines()

    # Counters start at zero and are accumulated over the (reference, prediction) pairs
    counts = update_per_subtoken_statistics(zip(ref_lines, pred_lines), 0, 0, 0)
    true_positive, false_positive, false_negative = counts
    precision, recall, f1 = calculate_results(true_positive, false_positive, false_negative)

    IOUtils.dump(
        result_dir,
        {"f1": f1, "precision": precision, "recall": recall},
        IOUtils.Format.jsonPretty,
    )
def prepare_code(self):
    """
    Removes code_dir, clones REPO_URL into it again, and checks out REPO_SHA.
    """
    IOUtils.rm_dir(self.code_dir)

    # (working directory, git command), run in order
    git_steps = [
        (self.code_dir.parent, f"git clone {self.REPO_URL} {self.code_dir.name}"),
        (self.code_dir, f"git checkout {self.REPO_SHA}"),
    ]
    for working_dir, git_cmd in git_steps:
        with IOUtils.cd(working_dir):
            BashUtils.run(git_cmd, expected_return_code=0)
        # end with
    # end for
    return
def install_coq_project(cls, project: Project, names_projects: Dict[str, Project]) -> None:
    """
    Builds and installs a project, after recursively installing its dependencies.

    The project is cloned and checked out to its recorded sha first if it is not
    cloned yet. Installation is skipped when the confirmation file in the
    checkout directory already holds the current revision and opam coq listing;
    that file is written after a successful installation.

    :param project: the project to install.
    :param names_projects: project name -> project, used to resolve dependencies.
    :raises Exception: if a dependency is unknown, build_cmd or install_cmd is
        missing, or the build or installation command returns non-zero.
    """
    if not project.is_cloned:
        project.clone()
        project.checkout(project.data["sha"], is_forced=True)
    # end if

    # Check if the project is already compiled
    confirmation_file = "lpc-installed.txt"
    coq_listing = BashUtils.run("opam list coq -s", expected_return_code=0).stdout.strip()
    confirmation_content = project.revision + " " + coq_listing
    if (project.checkout_dir/confirmation_file).is_file() and IOUtils.load(project.checkout_dir/confirmation_file, "txt") == confirmation_content:
        cls.logger.debug(f"Project {project.full_name} already installed")
        return
    # end if

    project.clean()

    # Install dependencies
    for dependency in project.data.get("dependencies", []):
        dependency_project = names_projects.get(dependency)
        if dependency_project is None:
            raise Exception(f"Cannot find dependency {dependency}")
        cls.logger.info(f"For Project {project.full_name}, installing dependency {dependency}")
        cls.install_coq_project(dependency_project, names_projects)
    # end for

    for required_cmd in ("build_cmd", "install_cmd"):
        if required_cmd not in project.data:
            raise Exception(f"Project {project.full_name} does not have {required_cmd}")
    # end for

    def run_step(cmd_key: str, verb: str, noun: str):
        # Runs one configured command, raising if its return code is non-zero
        cls.logger.info(f"Project {project.full_name}: {verb} with {project.data[cmd_key]}")
        r = BashUtils.run(project.data[cmd_key])
        if r.return_code != 0:
            raise Exception(f"{noun} failed! Return code is {r.return_code}! stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
        cls.logger.debug(f"{noun} finished. Return code is {r.return_code}. stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")

    with IOUtils.cd(project.checkout_dir):
        # Build
        run_step("build_cmd", "Building", "Compilation")

        # Install
        run_step("install_cmd", "Installing", "Installation")

        # Record what was installed so the next call can skip the work
        IOUtils.dump(project.checkout_dir / confirmation_file, confirmation_content, "txt")
    # end with
    return
def parse_projects(cls, project_list_file):
    """
    Parses the project list file provided by DeepCom and writes the matching
    github urls, one per line, to data_dir/DeepCom-projects-github.txt.

    Each line of the list is split at its first "_" into the two path components
    of the url.

    :param project_list_file: text file with one project per line.
    """
    project_list = IOUtils.load(project_list_file, IOUtils.Format.txt).splitlines()

    url_lines = list()
    for project in project_list:
        name_parts = project.split("_", 1)
        url_lines.append(f"https://github.com/{name_parts[0]}/{name_parts[1]}.git" + "\n")
    # end for

    IOUtils.dump(Macros.data_dir / "DeepCom-projects-github.txt",
                 "".join(url_lines),
                 IOUtils.Format.txt)
def dump_data(self,
        rel_path: Union[str, List[str]],
        data: Any,
        fmt: IOUtils.Format,
        is_batched: bool = False,
        per_batch: int = 100,
        exist_ok: bool = False,
):
    """
    Dumps data under data_dir at rel_path.

    :param rel_path: relative path, as accepted by assemble_rel_path.
    :param data: the data; jsonfied first when fmt is a json format.
    :param fmt: format of the file(s).
    :param is_batched: if True, rel_path becomes a directory of
        batch-{i}.{extension} files holding per_batch items each; data must then
        support len() and slicing.
    :param per_batch: number of items per batch file.
    :param exist_ok: if False, existing data at the path is an error.
    :raises: whatever LoggingUtils.log_and_raise raises (it is passed IOError)
        when data exists at the path and exist_ok is False.
    """
    abs_path = self.data_dir / self.assemble_rel_path(rel_path)
    if not exist_ok and abs_path.exists():
        LoggingUtils.log_and_raise(self.logger, f"Cannot rewrite existing data at {abs_path}", IOError)
    # end if

    abs_path.parent.mkdir(parents=True, exist_ok=True)

    if not is_batched:
        if self.is_json_format(fmt):
            data = IOUtils.jsonfy(data)
        # end if
        IOUtils.dump(abs_path, data, fmt)
        return
    # end if

    # In batched mode, the data need to be slice-able and sizable
    IOUtils.rm(abs_path)
    abs_path.mkdir(parents=True)

    num_batches = math.ceil(len(data)/per_batch)
    for batch_i in tqdm(range(num_batches)):
        begin = per_batch*batch_i
        end = per_batch*(batch_i+1)
        data_batch = data[begin:end]
        if self.is_json_format(fmt):
            data_batch = IOUtils.jsonfy(data_batch)
        # end if
        IOUtils.dump(abs_path/f"batch-{batch_i}.{fmt.get_extension()}", data_batch, fmt)
    # end for
    return
def process_data_impl(
        self,
        data_dir: Path,
        output_processed_data_dir: Path,
) -> None:
    """
    Turns the raw lemmas and definitions into the model's text inputs and outputs.

    Writes one src.{input_type}.txt per input type and a tgt.txt, each with one
    space-joined token sequence per lemma, then delegates to the parent class
    implementation.

    :param data_dir: directory holding lemmas.json and definitions.json.
    :param output_processed_data_dir: directory the text files are written to.
    """
    lemmas: List[Lemma] = IOUtils.dejsonfy(
        IOUtils.load(data_dir / "lemmas.json", IOUtils.Format.json),
        List[Lemma])
    definitions: List[Definition] = IOUtils.dejsonfy(
        IOUtils.load(data_dir / "definitions.json", IOUtils.Format.json),
        List[Definition])

    docs_sub_tokenizers = SubTokenizer.get_docs_sub_tokenizers(lemmas, definitions)

    # Inputs
    all_inputs: Dict[str, List[List[str]]] = self.get_all_inputs(lemmas, docs_sub_tokenizers)
    for input_type, src_sentences in all_inputs.items():
        IOUtils.dump(
            output_processed_data_dir / f"src.{input_type}.txt",
            "".join([" ".join(sent) + "\n" for sent in src_sentences]),
            IOUtils.Format.txt)
    # end for

    # Outputs
    IOUtils.dump(
        output_processed_data_dir / "tgt.txt",
        "".join([
            " ".join(self.get_output(lemma, docs_sub_tokenizers)) + "\n"
            for lemma in lemmas
        ]),
        IOUtils.Format.txt)

    super().process_data_impl(data_dir, output_processed_data_dir)
    return