def parse_file(self, file_path: Path, prj_root: Path, serapi_options: str):
    """Parse one Coq source file into a ProcessedFile.

    Invokes the SerAPI tools (sercomp for the AST, sertok for the token
    stream) from the project root, parses both s-expression outputs, then
    collects the lemmas and definitions found in the document.
    """
    source_code = IOUtils.load(file_path, IOUtils.Format.txt)
    unicode_offsets = ParserUtils.get_unicode_offsets(source_code)

    # SerAPI must be run from the project root so that serapi_options
    # (e.g. -R/-Q load paths) resolve relative paths correctly.
    with IOUtils.cd(prj_root):
        rel_path = file_path.relative_to(prj_root)

        ast_sexp_str = BashUtils.run(
            f"sercomp {serapi_options} --mode=sexp -- {rel_path}",
            expected_return_code=0,
        ).stdout
        tok_sexp_str = BashUtils.run(
            f"sertok {serapi_options} -- {rel_path}",
            expected_return_code=0,
        ).stdout

        ast_sexp_list: List[SexpNode] = SexpParser.parse_list(ast_sexp_str)
        tok_sexp_list: List[SexpNode] = SexpParser.parse_list(tok_sexp_str)

        doc = CoqParser.parse_document(
            source_code,
            ast_sexp_list,
            tok_sexp_list,
            unicode_offsets=unicode_offsets,
        )
        doc.file_name = str(rel_path)

        # Mine lemmas and definitions from the parsed document.
        lemmas: List[Lemma] = DataMiner.collect_lemmas_doc(
            doc, ast_sexp_list, serapi_options)
        definitions: List[Definition] = DataMiner.collect_definitions_doc(
            doc, ast_sexp_list)

        return ProcessedFile(
            file_path, source_code, doc, ast_sexp_list, tok_sexp_list,
            unicode_offsets, lemmas, definitions)
def suggest_lemmas(**options):
    """Run the full suggestion pipeline on a project.

    Steps: mine lemmas from the project, load the trained ML model,
    preprocess the mined data, apply the model, and print its naming
    suggestions to stdout.
    """
    from roosterize.data.DataMiner import DataMiner
    from roosterize.data.ModelSpec import ModelSpec
    from roosterize.ml.MLModels import MLModels

    # Resolve CLI options.
    prj_path = Path(options["project"]).absolute()
    out_dir = Path(options["output"]).absolute()
    mdl_dir = Path(options["model-dir"]).absolute()
    files = Utils.get_option_as_list(options, "files", None)
    excl_files = Utils.get_option_as_list(options, "exclude-files", None)
    excl_pattern = options.get("exclude-pattern", None)
    serapi_options = options.get("serapi-options", "")

    # Mine raw lemma data from the project.
    print(">>>>> Extracting lemmas ...")
    DataMiner.extract_data_project(
        prj_path, files, excl_files, excl_pattern, serapi_options,
        out_dir / "raw-data")

    # Load the model described by spec.json in the model directory.
    print(">>>>> Initializing model ...")
    spec = IOUtils.dejsonfy(
        IOUtils.load(mdl_dir / "spec.json", IOUtils.Format.json), ModelSpec)
    model = MLModels.get_model(mdl_dir, spec, is_eval=True)

    # Convert raw data into the model's input format.
    print(">>>>> Processing data ...")
    model.process_data(out_dir / "raw-data", out_dir / "eval-processed-data")

    # Run inference.
    print(">>>>> Applying model ...")
    model.eval(out_dir / "eval-processed-data", out_dir / "eval-result")

    # Show the generated suggestions.
    print(">>>>> Suggestions:")
    print(IOUtils.load(out_dir / "eval-result" / "suggestions.txt",
                       IOUtils.Format.txt))
    return
def extract_data_from_corpus(**options):
    """CLI wrapper: mine lemma data from a corpus via DataMiner.

    Reads the corpus path, train/eval splits, and dataset groups from the
    parsed command-line options, falling back to the Macros defaults.
    """
    from roosterize.data.DataMiner import DataMiner
    DataMiner.extract_data_from_corpus(
        Path(options["corpus"]).absolute(),
        Utils.get_option_as_list(options, "trainevals", Macros.DS_TRAINEVALS),
        Utils.get_option_as_list(
            options, "groups", [Macros.DS_GROUP_T1, Macros.DS_GROUP_TA]),
        Path(options["output"]).absolute(),
    )
    return
def extract_data(**options):
    """CLI wrapper: mine lemma data from a single project via DataMiner.

    Gathers project path, file include/exclude filters, SerAPI options,
    and the output path from the parsed command-line options.
    """
    from roosterize.data.DataMiner import DataMiner

    prj_path = Path(options["project"]).absolute()
    out_path = Path(options["output"]).absolute()
    DataMiner.extract_data_project(
        prj_path,
        Utils.get_option_as_list(options, "files", None),
        Utils.get_option_as_list(options, "exclude-files", None),
        options.get("exclude-pattern", None),
        options.get("serapi-options", ""),
        out_path,
    )
    return
def improve_project_model(self, prj_root: Optional[Path]):
    """Prepare a project-local Roosterize model for retraining.

    Replaces any existing local model (after user confirmation) with a
    copy of the global model, loads it, and mines the project's lemmas
    into a temporary directory as training data. The actual train/val
    split and training steps are still TODO.

    :param prj_root: project root directory; auto-inferred when None.
    :raises Exception: if the global model has not been downloaded yet.
    """
    if prj_root is None:
        prj_root = RoosterizeDirUtils.auto_infer_project_root()

    # Deactivate any loaded model before replacing files on disk.
    self.model = None

    # Delete the existing local model, but only with user consent.
    local_model_dir = RoosterizeDirUtils.get_local_model_dir(prj_root)
    if local_model_dir.exists():
        # BUG FIX: the two f-strings were concatenated with no separator,
        # yielding "...exists at /pathDo you want...". Add ". " between them.
        ans = self.ask_for_confirmation(
            f"A Roosterize model already exists at {local_model_dir}. "
            f"Do you want to delete it and train again?")
        if not ans:
            return
        IOUtils.rm_dir(local_model_dir)

    # Copy global model to local model, but remove "training complete" marker
    global_model_dir = RoosterizeDirUtils.get_global_model_dir()
    if not global_model_dir.exists():
        raise Exception(
            "Global Roosterize model not found! Please download model first."
        )
    shutil.copytree(global_model_dir, local_model_dir)

    # Load local model
    self.load_local_model(prj_root)
    model = self.get_model()

    # Collect all lemmas in this project as training data.
    temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))
    DataMiner.extract_data_project(
        prj_root,
        files=None,
        exclude_files=self.exclude_files,
        exclude_pattern=self.exclude_pattern,
        serapi_options=self.infer_serapi_options(prj_root),
        output_path=temp_data_dir)

    # TODO: Split data into train/val set, then process each data (no pre-processing / rebuilding vocab!)

    # TODO: Train model

    # Delete temp file
    IOUtils.rm_dir(temp_data_dir)
def collect_data(**options):
    """CLI wrapper: forward all parsed options to DataMiner.collect_data."""
    from roosterize.data.DataMiner import DataMiner
    DataMiner.collect_data(**options)