def parse_file(self, file_path: Path, prj_root: Path, serapi_options: str):
        """Parse one Coq source file into a ProcessedFile.

        Loads the file, runs the SerAPI tools ``sercomp`` (AST s-expressions)
        and ``sertok`` (token s-expressions) on it, parses both outputs, and
        collects the lemmas and definitions of the resulting document.

        :param file_path: path of the .v file to parse.
        :param prj_root: project root; the SerAPI commands are run from this
            directory with the file path made relative to it.
        :param serapi_options: extra command-line flags forwarded verbatim to
            both ``sercomp`` and ``sertok`` (presumably load-path options —
            verify against callers).
        :return: a ProcessedFile bundling the source code, document, AST and
            token s-expression lists, unicode offsets, lemmas and definitions.
        """
        source_code = IOUtils.load(file_path, IOUtils.Format.txt)
        # Offsets of non-ASCII characters in the source; passed through to the
        # document parser (presumably to reconcile byte-based positions from
        # SerAPI with Python string indexes — TODO confirm).
        unicode_offsets = ParserUtils.get_unicode_offsets(source_code)

        # Run the SerAPI tools from the project root so that rel_path resolves.
        with IOUtils.cd(prj_root):
            rel_path = file_path.relative_to(prj_root)
            # AST s-expressions for the whole file; a non-zero exit raises.
            ast_sexp_str = BashUtils.run(
                f"sercomp {serapi_options} --mode=sexp -- {rel_path}",
                expected_return_code=0).stdout
            # Token s-expressions for the whole file; a non-zero exit raises.
            tok_sexp_str = BashUtils.run(
                f"sertok {serapi_options} -- {rel_path}",
                expected_return_code=0).stdout

            ast_sexp_list: List[SexpNode] = SexpParser.parse_list(ast_sexp_str)
            tok_sexp_list: List[SexpNode] = SexpParser.parse_list(tok_sexp_str)

            # Combine source text, AST and tokens into one document object.
            doc = CoqParser.parse_document(
                source_code,
                ast_sexp_list,
                tok_sexp_list,
                unicode_offsets=unicode_offsets,
            )
            doc.file_name = str(rel_path)

            # Collect lemmas & definitions
            lemmas: List[Lemma] = DataMiner.collect_lemmas_doc(
                doc, ast_sexp_list, serapi_options)
            definitions: List[Definition] = DataMiner.collect_definitions_doc(
                doc, ast_sexp_list)

        return ProcessedFile(file_path, source_code, doc, ast_sexp_list,
                             tok_sexp_list, unicode_offsets, lemmas,
                             definitions)
# ----- Example #2 -----
def suggest_lemmas(**options):
    """End-to-end suggestion pipeline for one project.

    Extracts lemmas from the project, loads the trained model, processes the
    extracted data, runs evaluation, and prints the resulting lemma-name
    suggestions to stdout.
    """
    from roosterize.data.DataMiner import DataMiner
    from roosterize.data.ModelSpec import ModelSpec
    from roosterize.ml.MLModels import MLModels

    # Resolve command-line options.
    project_path = Path(options["project"]).absolute()
    files = Utils.get_option_as_list(options, "files", None)
    exclude_files = Utils.get_option_as_list(options, "exclude-files", None)
    exclude_pattern = options.get("exclude-pattern", None)
    serapi_options = options.get("serapi-options", "")
    output_dir = Path(options["output"]).absolute()
    model_dir = Path(options["model-dir"]).absolute()

    # Working sub-directories under the output directory.
    raw_data_dir = output_dir / "raw-data"
    processed_data_dir = output_dir / "eval-processed-data"
    result_dir = output_dir / "eval-result"

    # Extract data
    print(">>>>> Extracting lemmas ...")
    DataMiner.extract_data_project(
        project_path, files, exclude_files, exclude_pattern,
        serapi_options, raw_data_dir)

    # Get the ML model
    print(">>>>> Initializing model ...")
    spec = IOUtils.dejsonfy(
        IOUtils.load(model_dir / "spec.json", IOUtils.Format.json),
        ModelSpec)
    model = MLModels.get_model(model_dir, spec, is_eval=True)

    # Process data
    print(">>>>> Processing data ...")
    model.process_data(raw_data_dir, processed_data_dir)

    # Eval
    print(">>>>> Applying model ...")
    model.eval(processed_data_dir, result_dir)

    # Print suggestions
    print(">>>>> Suggestions:")
    print(IOUtils.load(result_dir / "suggestions.txt", IOUtils.Format.txt))
# ----- Example #3 -----
def extract_data_from_corpus(**options):
    """Mine naming data from a pre-assembled corpus.

    Reads the corpus location, the requested train/eval splits, the data
    groups, and the output location from *options*, then delegates the
    actual extraction to DataMiner.
    """
    from roosterize.data.DataMiner import DataMiner

    # Resolve command-line options (defaults come from Macros).
    corpus_path = Path(options["corpus"]).absolute()
    trainevals = Utils.get_option_as_list(
        options, "trainevals", Macros.DS_TRAINEVALS)
    groups = Utils.get_option_as_list(
        options, "groups", [Macros.DS_GROUP_T1, Macros.DS_GROUP_TA])
    output_path = Path(options["output"]).absolute()

    DataMiner.extract_data_from_corpus(
        corpus_path, trainevals, groups, output_path)
# ----- Example #4 -----
def extract_data(**options):
    """Extract lemma data from a single project.

    Reads the project location, optional file include/exclude filters, the
    SerAPI options, and the output location from *options*, then delegates
    to DataMiner.
    """
    from roosterize.data.DataMiner import DataMiner

    # Resolve command-line options.
    project_path = Path(options["project"]).absolute()
    files = Utils.get_option_as_list(options, "files", None)
    exclude_files = Utils.get_option_as_list(options, "exclude-files", None)
    exclude_pattern = options.get("exclude-pattern", None)
    serapi_options = options.get("serapi-options", "")
    output_path = Path(options["output"]).absolute()

    DataMiner.extract_data_project(
        project_path, files, exclude_files, exclude_pattern,
        serapi_options, output_path)
    def improve_project_model(self, prj_root: Optional[Path]):
        """Set up (re)training of a project-local model from the global model.

        Copies the global model into the project's local model directory
        (after asking for confirmation if one already exists), loads it, and
        extracts the project's lemma data into a temporary directory for
        training. Training itself is still a TODO.

        :param prj_root: project root; auto-inferred when None.
        :raises Exception: if the global Roosterize model is not installed.
        """
        if prj_root is None:
            prj_root = RoosterizeDirUtils.auto_infer_project_root()

        # Fail early if there is no global model to copy from — before we
        # deactivate or delete anything (the original checked this only
        # after potentially deleting the local model).
        global_model_dir = RoosterizeDirUtils.get_global_model_dir()
        if not global_model_dir.exists():
            raise Exception(
                "Global Roosterize model not found! Please download model first."
            )

        # Deactivate loaded model
        self.model = None

        # Delete existing local model, after asking the user.
        local_model_dir = RoosterizeDirUtils.get_local_model_dir(prj_root)
        if local_model_dir.exists():
            # Fix: the two f-strings were previously concatenated with no
            # separator, yielding "...{dir}Do you want..."; add ". ".
            ans = self.ask_for_confirmation(
                f"A Roosterize model already exists at {local_model_dir}. "
                f"Do you want to delete it and train again?")
            if not ans:
                return
            IOUtils.rm_dir(local_model_dir)

        # Copy global model to local model, but remove "training complete" marker
        shutil.copytree(global_model_dir, local_model_dir)

        # Load local model
        self.load_local_model(prj_root)
        self.get_model()  # instantiate the model; return value was unused

        # Collect all lemmas in this project into a temp directory.
        temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))
        try:
            DataMiner.extract_data_project(
                prj_root,
                files=None,
                exclude_files=self.exclude_files,
                exclude_pattern=self.exclude_pattern,
                serapi_options=self.infer_serapi_options(prj_root),
                output_path=temp_data_dir)

            # TODO: Split data into train/val set, then process each data (no pre-processing / rebuilding vocab!)

            # TODO: Train model
        finally:
            # Always remove the temp directory, even if extraction fails
            # (the original leaked it on exception).
            IOUtils.rm_dir(temp_data_dir)
# ----- Example #6 -----
def collect_data(**options):
    """Thin CLI wrapper: forward all keyword options to DataMiner.collect_data."""
    from roosterize.data.DataMiner import DataMiner

    DataMiner.collect_data(**options)