def suggest_naming(self, file_path: Path, prj_root: Optional[Path] = None):
    """
    Processes a file to get its lemmas and runs the model to get predictions.
    """
    # Figure out which project we're in, then load configs
    if prj_root is None:
        prj_root = RoosterizeDirUtils.auto_infer_project_root(file_path)
    self.load_configs(prj_root)

    # Infer SerAPI options
    serapi_options = self.infer_serapi_options(prj_root)

    # If the user provided a compile_cmd, compile the project first
    if self.compile_cmd is not None:
        with IOUtils.cd(prj_root):
            BashUtils.run(self.compile_cmd, expected_return_code=0)

    # Parse file
    data = self.parse_file(file_path, prj_root, serapi_options)

    # Load model
    self.load_local_model(prj_root)
    model = self.get_model()

    # Use the model to make predictions
    # Temp dirs for processed data and results
    temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))

    # Dump lemmas & definitions
    temp_raw_data_dir = temp_data_dir / "raw"
    temp_raw_data_dir.mkdir()
    IOUtils.dump(
        temp_raw_data_dir / "lemmas.json",
        IOUtils.jsonfy(data.lemmas),
        IOUtils.Format.json,
    )
    IOUtils.dump(
        temp_raw_data_dir / "definitions.json",
        IOUtils.jsonfy(data.definitions),
        IOUtils.Format.json,
    )

    # Model-specific processing
    temp_processed_data_dir = temp_data_dir / "processed"
    temp_processed_data_dir.mkdir()
    model.process_data_impl(temp_raw_data_dir, temp_processed_data_dir)

    # Invoke eval
    candidates_logprobs = model.eval_impl(
        temp_processed_data_dir,
        beam_search_size=self.beam_search_size,
        k=self.k,
    )

    # Clean up temp dirs
    IOUtils.rm_dir(temp_data_dir)

    # Report predictions
    self.report_predictions(data, candidates_logprobs)
    return
def dump_data(self,
        rel_path: Union[str, List[str]],
        data: Any,
        fmt: IOUtils.Format,
        is_batched: bool = False,
        per_batch: int = 100,
        exist_ok: bool = False,
):
    abs_path = self.data_dir / self.assemble_rel_path(rel_path)
    if abs_path.exists() and not exist_ok:
        LoggingUtils.log_and_raise(self.logger, f"Cannot rewrite existing data at {abs_path}", IOError)
    # end if

    abs_path.parent.mkdir(parents=True, exist_ok=True)

    if not is_batched:
        if self.is_json_format(fmt):
            data = IOUtils.jsonfy(data)
        # end if
        IOUtils.dump(abs_path, data, fmt)
    else:
        # In batched mode, the data needs to be sliceable and support len()
        IOUtils.rm(abs_path)
        abs_path.mkdir(parents=True)
        for batch_i in tqdm(range(math.ceil(len(data) / per_batch))):
            data_batch = data[per_batch * batch_i: per_batch * (batch_i + 1)]
            if self.is_json_format(fmt):
                data_batch = IOUtils.jsonfy(data_batch)
            # end if
            IOUtils.dump(abs_path / f"batch-{batch_i}.{fmt.get_extension()}", data_batch, fmt)
        # end for
    # end if
    return
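# Illustrative usage sketch (not part of the original code): how dump_data's batched
# mode slices data into files. It only assumes `data` supports len() and slicing;
# the helper name below is hypothetical.
import math

def split_into_batches(data, per_batch: int = 100):
    """Yield consecutive slices of `data`, each holding at most `per_batch` items."""
    for batch_i in range(math.ceil(len(data) / per_batch)):
        yield data[per_batch * batch_i: per_batch * (batch_i + 1)]

# Example: 250 items with per_batch=100 produce batches of sizes 100, 100, 50,
# matching the batch-{i}.<ext> files that dump_data writes.
assert [len(b) for b in split_into_batches(list(range(250)))] == [100, 100, 50]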
def train(
        self,
        train_processed_data_dir: Path,
        val_processed_data_dir: Path,
        force_retrain: bool = False,
) -> NoReturn:
    """
    Trains the model on the training data. The trained model is saved to output_dir.
    At the end of training, this function saves a training-completed.txt file as proof that training completed.

    :param train_processed_data_dir: the directory containing the processed training data
    :param val_processed_data_dir: the directory containing the processed validation data
    :param force_retrain: if True, re-train the model even if it was already trained (removes the previously trained model)
    """
    if force_retrain or not self.is_training_completed():
        self.logger.info(self.logging_prefix + f"Training model at {self.model_dir}; train: {train_processed_data_dir}, val: {val_processed_data_dir}")
        IOUtils.rm_dir(self.model_dir)
        IOUtils.mk_dir(self.model_dir)

        # Save spec & configs of this model
        IOUtils.dump(self.model_dir / "config-dict.json", IOUtils.jsonfy(self.config), IOUtils.Format.jsonPretty)
        IOUtils.dump(self.model_dir / "spec.json", IOUtils.jsonfy(self.spec), IOUtils.Format.jsonPretty)

        self.train_impl(train_processed_data_dir, val_processed_data_dir)

        IOUtils.dump(self.model_dir / self.TRAINING_COMPLETED_FILE_NAME, str(time.time_ns()), IOUtils.Format.txt)
    # end if
    return
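# Minimal sketch (an assumption, not the original implementation): since train() writes
# TRAINING_COMPLETED_FILE_NAME at the end, is_training_completed() presumably just checks
# whether that marker file exists under model_dir. The class name here is hypothetical.
from pathlib import Path

class TrainingMarkerMixin:
    TRAINING_COMPLETED_FILE_NAME = "training-completed.txt"

    def __init__(self, model_dir: Path):
        self.model_dir = model_dir

    def is_training_completed(self) -> bool:
        # Training counts as completed iff the marker written by train() is present
        return (self.model_dir / self.TRAINING_COMPLETED_FILE_NAME).is_file()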
def extract_data_from_corpus(cls,
        corpus_path: Path,
        trainevals: List[str],
        groups: List[str],
        output_path: Path,
):
    # 1. Prepare output path
    if output_path.is_dir():
        cls.logger.warning(f"{output_path} already exists, will overwrite the files.")
    elif output_path.is_file():
        LoggingUtils.log_and_raise(cls.logger, f"{output_path} already exists as a file. Aborting.", Exception)
    else:
        IOUtils.mk_dir(output_path)
    # end if

    assert all([traineval in Macros.DS_TRAINEVALS for traineval in trainevals])
    assert all([group in Macros.DS_GROUPS + [Macros.DS_GROUP_TA] for group in groups])

    data_mgr = FilesManager(corpus_path)

    # 2. Load lemmas and definitions
    lemmas_filtered: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS_FILTERED], IOUtils.Format.json, is_batched=True, clz=Lemma)
    definitions: List[Definition] = data_mgr.load_data([FilesManager.DEFINITIONS, "definitions.json"], IOUtils.Format.json, clz=Definition)

    # 3. Output to output_path for each combination of traineval and group
    for traineval in trainevals:
        for group in groups:
            IOUtils.mk_dir(output_path / f"{group}-{traineval}")
            data_indexes = IOUtils.load(corpus_path / "training" / f"{group}-{traineval}.json", IOUtils.Format.json, clz=str)
            IOUtils.dump(output_path / f"{group}-{traineval}/lemmas.json", IOUtils.jsonfy([l for l in lemmas_filtered if l.data_index in data_indexes]), IOUtils.Format.json)
            IOUtils.dump(output_path / f"{group}-{traineval}/definitions.json", IOUtils.jsonfy([d for d in definitions if d.data_index in data_indexes]), IOUtils.Format.json)
        # end for
    # end for
    return
def process_data(self, project_dir):
    try:
        revision_data = IOUtils.load(project_dir / "collector" / "method-project-revision.json")
        method_data = IOUtils.load(project_dir / "collector" / "method-data.json")
        output_dir = project_dir / "collector"
        method_project_evo = []
        for year in BetaFilter.YEARS[:-1]:
            curr_time = f"{year}_Jan_1"
            curr_method_ids = [year_data["method_ids"] for year_data in revision_data
                               if year_data["year"] == curr_time][0]
            next_time = f"{year + 1}_Jan_1"
            next_method_ids = [year_data["method_ids"] for year_data in revision_data
                               if year_data["year"] == next_time][0]
            # Methods added between curr_time and next_time
            new_method_ids = list(set(next_method_ids) - set(curr_method_ids))
            filtered_method_ids = BetaFilter.beta_filter(new_method_ids, curr_method_ids, method_data)
            method_project_evo.append({
                "prj_name": revision_data[0]["prj_name"],
                "time": f"{curr_time}-{next_time}",
                "method_ids": filtered_method_ids,
            })
        IOUtils.dump(output_dir / "method-project-beta-filtered.json",
                     IOUtils.jsonfy(method_project_evo), IOUtils.Format.json)
        return
    except Exception:
        self.logger.info(f"Unexpected error: {sys.exc_info()[0]}")
        return
def test_jsonfy_record_class(self):
    example_obj = test_IOUtils.ExampleRecordClass(
        field_str="aaa", field_int=42,
        field_list=[1, 2], nested_rc=test_IOUtils.ExampleSimpleRecordClass())
    jsonfied = IOUtils.jsonfy(example_obj)
    self.assertTrue(jsonfied.get("field_str") == "aaa")
    self.assertTrue(jsonfied.get("field_int") == 42)
    self.assertTrue(jsonfied.get("field_list") == [1, 2])
    self.assertTrue(jsonfied.get("nested_rc").get("f") == 1)
    return
def test_jsonfy_basic(self):
    self.assertEqual("aaa", IOUtils.jsonfy("aaa"))
    self.assertEqual(42, IOUtils.jsonfy(42))
    self.assertEqual(1.111, IOUtils.jsonfy(1.111))
    self.assertEqual([1, 2.0, "ccc"], IOUtils.jsonfy([1, 2.0, "ccc"]))
    self.assertEqual({1, 2.0, "ccc"}, set(IOUtils.jsonfy({1, 2.0, "ccc"})))
    self.assertEqual({"f1": 1, "f2": 2.0, "f3": "ccc"}, IOUtils.jsonfy({"f1": 1, "f2": 2.0, "f3": "ccc"}))
    return
def process_data(self, method_data_list: List[MethodData], data_type: str,
                 output_dir: Path, split: bool = True):
    Environment.require_collector()

    log_file = output_dir / "collector-log.txt"
    data_file = output_dir / "method-data.json"
    IOUtils.dump(data_file, IOUtils.jsonfy(method_data_list), IOUtils.Format.json)

    config = {
        "transform": True,
        "model": "BiLSTM",
        "dataType": data_type,
        "dataFile": str(data_file),
        "logFile": str(log_file),
        "outputDir": str(output_dir),
    }
    config_file = output_dir / "collector-config.json"
    IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

    self.logger.info(f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}")
    rr = BashUtils.run(f"java -jar {Environment.collector_jar} {config_file}", expected_return_code=0)
    if rr.stdout:
        self.logger.warning(f"Stdout of collector:\n{rr.stdout}")
    # end if
    if rr.stderr:
        self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
    # end if

    # Build raw dataset
    if split:
        self.tokenizeFile(output_dir / f"{data_type}.raw.txt", data_type)
    else:
        self.noSplit(output_dir / f"{data_type}.raw.txt", data_type)

    error_ids = IOUtils.load(str(output_dir) + "-error-ids.json")
    print(f"Number of error ids: {len(error_ids)}")
    # BashUtils.run(f"rm {output_dir}-error-ids.json", expected_return_code=0)
    return error_ids
def test_jsonfy_enum(self):
    example_obj = test_IOUtils.ExampleEnum.Item1
    jsonfied = IOUtils.jsonfy(example_obj)
    # An enum member is jsonfied to its value
    self.assertEqual(jsonfied, example_obj.value)
    return
def generate_configs(cls, name: str, path: Path, **options):
    config_files: Set[str] = set()

    ml_model_clz = cls.NAMES_MODELS[name]
    config = ml_model_clz.config_clz()
    type_hints = get_type_hints(ml_model_clz.config_clz)

    model_path = path / name
    model_path.mkdir(parents=True, exist_ok=True)

    cls.logger.info(f"Possible attrs and default values: {config.__dict__}")

    # Collect the candidate values for each config attribute
    attrs_choices: dict = dict()
    attrs: list = list()
    for k, default_v in config.__dict__.items():
        attrs.append(k)
        if k not in options:
            attrs_choices[k] = [default_v]
        else:
            if type_hints[k] == bool:
                attrs_choices[k] = [v == "True" for v in str(options[k]).split()]
            elif issubclass(type_hints[k], recordclass.mutabletuple):
                attrs_choices[k] = [IOUtils.dejsonfy(v, type_hints[k]) if v != "None" else None
                                    for v in str(options[k]).split()]
            else:
                attrs_choices[k] = [type_hints[k](v) for v in str(options[k]).split()]
            # end if
            attrs_choices[k] = list(set(attrs_choices[k]))
            cls.logger.debug(f"attr {k}, choices: {attrs_choices[k]}")
            options.pop(k)
        # end if
    # end for

    if len(options) > 0:
        cls.logger.warning(f"These options are not recognized: {options.keys()}")
    # end if

    # Enumerate all combinations of attribute choices
    candidate = [0] * len(attrs_choices)
    is_explore_finished = False
    while True:
        # Generate current candidate
        for i, attr in enumerate(attrs):
            config.__setattr__(attr, attrs_choices[attr][candidate[i]])
        # end for

        if config.repOk():
            # Adjust batch size
            adjust_batch_size_func = getattr(config, "adjust_batch_size", None)
            if callable(adjust_batch_size_func):
                adjust_batch_size_func()
            # end if

            config_file = model_path / (str(config) + ".json")
            cls.logger.info(f"Saving candidate to {config_file}: {config}")
            config_files.add(name + "/" + str(config) + ".json")
            IOUtils.dump(config_file, IOUtils.jsonfy(config), IOUtils.Format.jsonPretty)
        else:
            cls.logger.info(f"Skipping invalid candidate: {config}")
        # end if

        # To next candidate
        for i, attr in enumerate(attrs):
            candidate[i] += 1
            if candidate[i] >= len(attrs_choices[attr]):
                candidate[i] = 0
                if i == len(attrs) - 1:
                    is_explore_finished = True
                    break
                else:
                    continue
                # end if
            else:
                break
            # end if
        # end for
        if is_explore_finished:
            break
    # end while

    for config_file in config_files:
        print(f"- model: {name}")
        print(f"  config-file: {config_file}")
        print()
    # end for
    return
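# Illustrative sketch (not part of the original code) of the odometer-style loop that
# generate_configs uses to visit every combination of per-attribute choices; it is
# equivalent to itertools.product over the choice lists. Names here are hypothetical.
from typing import Dict, Iterator, List


def enumerate_candidates(attrs_choices: Dict[str, list]) -> Iterator[dict]:
    attrs: List[str] = list(attrs_choices.keys())
    candidate = [0] * len(attrs)  # one "digit" per attribute, indexing into its choice list
    while True:
        yield {attr: attrs_choices[attr][candidate[i]] for i, attr in enumerate(attrs)}
        # Advance the odometer: bump the first digit, carrying into the next on overflow
        for i, attr in enumerate(attrs):
            candidate[i] += 1
            if candidate[i] < len(attrs_choices[attr]):
                break
            candidate[i] = 0
            if i == len(attrs) - 1:
                return  # every combination has been visited

# Example: 2 choices for "a" and 3 for "b" give 2 * 3 = 6 candidate configurations.
assert len(list(enumerate_candidates({"a": [1, 2], "b": ["x", "y", "z"]}))) == 6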
def extract_data_project(
        cls,
        project_path: Path,
        files: Optional[List[str]],
        exclude_files: Optional[List[str]],
        exclude_pattern: Optional[str],
        serapi_options: str,
        output_path: Path,
):
    # 1. Prepare output path
    if output_path.is_dir():
        cls.logger.warning(f"{output_path} already exists, will overwrite the files.")
    elif output_path.is_file():
        LoggingUtils.log_and_raise(cls.logger, f"{output_path} already exists as a file. Aborting.", Exception)
    else:
        IOUtils.mk_dir(output_path)
    # end if

    # 2. Extract documents, tok.sexp and ast.sexp
    coq_documents: Dict[str, CoqDocument] = collections.OrderedDict()
    ast_sexp_lists: Dict[str, List[SexpNode]] = dict()
    tok_sexp_lists: Dict[str, List[SexpNode]] = dict()

    with IOUtils.cd(project_path):
        coq_files: List[str] = BashUtils.run("find -name '*.v' -type f").stdout.split("\n")[:-1]
        coq_files = [coq_file[2:] for coq_file in coq_files]  # strip the leading "./"

        if files is not None:
            coq_files = [f for f in coq_files if f in files]
        # end if

        if exclude_files is not None:
            coq_files = [f for f in coq_files if f not in exclude_files]
        # end if

        if exclude_pattern is not None:
            re_exclude_pattern = re.compile(exclude_pattern)
            coq_files = [f for f in coq_files if not re_exclude_pattern.fullmatch(f)]
        # end if

        for i, coq_file in enumerate(tqdm(coq_files)):
            try:
                # Read file
                with open(coq_file, "r", newline="") as f:
                    source_code = f.read()
                # end with

                # Get unicode offsets
                unicode_offsets = ParserUtils.get_unicode_offsets(source_code)

                # Call SerAPI
                ast_sexp_str: str = BashUtils.run(f"sercomp {serapi_options} --mode=sexp -- {coq_file}", expected_return_code=0).stdout
                tok_sexp_str: str = BashUtils.run(f"sertok {serapi_options} -- {coq_file}", expected_return_code=0).stdout

                # Parse ast sexp
                ast_sexp_list: List[SexpNode] = SexpParser.parse_list(ast_sexp_str)
                tok_sexp_list: List[SexpNode] = SexpParser.parse_list(tok_sexp_str)

                # Parse the document
                coq_document = CoqParser.parse_document(source_code, ast_sexp_list, tok_sexp_list, unicode_offsets=unicode_offsets)

                # Set meta data
                coq_document.file_name = coq_file
                coq_document.project_name = project_path.name

                coq_documents[coq_file] = coq_document
                ast_sexp_lists[coq_file] = ast_sexp_list
                tok_sexp_lists[coq_file] = tok_sexp_list
            except KeyboardInterrupt:
                cls.logger.warning("Keyboard interrupt!")
                raise
            except:
                cls.logger.warning(f"File {coq_file} failed! Exception was: {traceback.format_exc()}")
                continue
            # end try
        # end for

        # 3. Extract and save lemmas and definitions
        lemmas: List[Lemma] = list()
        definitions: List[Definition] = list()

        # Increase recursion limit because the backend sexps are CRAZZZZY deep
        sys.setrecursionlimit(10000)

        for file_path, doc in tqdm(coq_documents.items()):
            ast_sexp_list = ast_sexp_lists[file_path]
            lemmas_doc = cls.collect_lemmas_doc(doc, ast_sexp_list, serapi_options)
            lemmas.extend(lemmas_doc)
            definitions_doc = cls.collect_definitions_doc(doc, ast_sexp_list)
            definitions.extend(definitions_doc)
        # end for

        IOUtils.dump(output_path / "lemmas.json", IOUtils.jsonfy(lemmas), IOUtils.Format.json)
        IOUtils.dump(output_path / "definitions.json", IOUtils.jsonfy(definitions), IOUtils.Format.json)
    # end with
    return
def collect_project(self, project_name: str, project_url: str):
    Environment.require_collector()

    # 0. Download repo
    downloads_dir = self.repos_downloads_dir / project_name
    results_dir = self.repos_results_dir / project_name

    # Remove previous results, if any
    IOUtils.rm_dir(results_dir)
    IOUtils.mk_dir(results_dir)

    # Clone the repo if it does not exist
    if not downloads_dir.exists():
        with IOUtils.cd(self.repos_downloads_dir):
            with TimeUtils.time_limit(300):
                BashUtils.run(f"git clone {project_url} {project_name}", expected_return_code=0)
            # end with
        # end with
    # end if

    project_data = ProjectData.create()
    project_data.name = project_name
    project_data.url = project_url

    # 1. Get list of revisions
    with IOUtils.cd(downloads_dir):
        git_log_out = BashUtils.run(f"git log --pretty=format:'%H %P'", expected_return_code=0).stdout
        for line in git_log_out.splitlines()[:self.MAX_REVISIONS]:
            shas = line.split()
            project_data.revisions.append(shas[0])
            project_data.parent_revisions[shas[0]] = shas[1:]
        # end for
    # end with

    # 2. Get revisions in different years
    with IOUtils.cd(downloads_dir):
        for year in self.YEARS:
            git_log_out = BashUtils.run(f"git rev-list -1 --before=\"Jan 1 {year}\" origin", expected_return_code=0).stdout
            project_data.year_revisions[str(year) + "_Jan_1"] = git_log_out.rstrip()
        # end for
    # end with

    project_data_file = results_dir / "project.json"
    IOUtils.dump(project_data_file, IOUtils.jsonfy(project_data), IOUtils.Format.jsonPretty)

    # 3. Start the Java collector
    # Prepare config
    log_file = results_dir / "collector-log.txt"
    output_dir = results_dir / "collector"
    config = {
        "collect": True,
        "projectDir": str(downloads_dir),
        "projectDataFile": str(project_data_file),
        "logFile": str(log_file),
        "outputDir": str(output_dir),
        "year": True,  # To indicate whether to collect all evo data or yearly data
    }
    config_file = results_dir / "collector-config.json"
    IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

    self.logger.info(f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}")
    rr = BashUtils.run(f"java -jar {Environment.collector_jar} {config_file}", expected_return_code=0)
    if rr.stderr:
        self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
    # end if

    # 4. In some cases, save collected data to an appropriate location or database
    # TODO private info
    # On luzhou server for user pynie, move it to a dedicated location at /user/disk2
    if BashUtils.run("hostname").stdout.strip() == "luzhou" and BashUtils.run("echo $USER").stdout.strip() == "pynie":
        alter_results_dir = Path("/home/disk2/pynie/csevo-results") / project_name
        IOUtils.rm_dir(alter_results_dir)
        IOUtils.mk_dir(alter_results_dir.parent)
        BashUtils.run(f"mv {results_dir} {alter_results_dir}")
        self.logger.info(f"Results moved to {alter_results_dir}")
    # end if

    # -1. Remove repo
    IOUtils.rm_dir(downloads_dir)
    return