예제 #1
0
def suggest_lemmas(**options):
    """Extract a project's data, apply a saved model to it, and print the suggestions.

    Keys read from ``options``:
        project: path of the project to process (required).
        files: optional, read via ``Utils.get_option_as_list``.
        exclude-files: optional, read via ``Utils.get_option_as_list``.
        exclude-pattern: optional, defaults to ``None``.
        serapi-options: optional, defaults to ``""``.
        output: directory under which ``raw-data``, ``eval-processed-data`` and
            ``eval-result`` are written (required).
        model-dir: directory holding ``spec.json`` and passed to
            ``MLModels.get_model`` (required).
    """
    from roosterize.data.DataMiner import DataMiner
    from roosterize.data.ModelSpec import ModelSpec
    from roosterize.ml.MLModels import MLModels

    # Read every option up front, in the same order as before, so a missing
    # required key fails before any work is started.
    prj_dir = Path(options["project"]).absolute()
    selected_files = Utils.get_option_as_list(options, "files", None)
    skipped_files = Utils.get_option_as_list(options, "exclude-files", None)
    skipped_pattern = options.get("exclude-pattern", None)
    serapi_opts = options.get("serapi-options", "")
    out_root = Path(options["output"]).absolute()
    model_root = Path(options["model-dir"]).absolute()

    # Locations of each pipeline stage's output below the output directory.
    raw_dir = out_root / "raw-data"
    processed_dir = out_root / "eval-processed-data"
    result_dir = out_root / "eval-result"

    # Stage 1: extract data from the project.
    print(">>>>> Extracting lemmas ...")
    DataMiner.extract_data_project(
        prj_dir,
        selected_files,
        skipped_files,
        skipped_pattern,
        serapi_opts,
        raw_dir,
    )

    # Stage 2: rebuild the model from its stored spec.
    print(">>>>> Initializing model ...")
    spec_json = IOUtils.load(model_root / "spec.json", IOUtils.Format.json)
    spec = IOUtils.dejsonfy(spec_json, ModelSpec)
    ml_model = MLModels.get_model(model_root, spec, is_eval=True)

    # Stage 3: turn the raw data into the model's input format.
    print(">>>>> Processing data ...")
    ml_model.process_data(raw_dir, processed_dir)

    # Stage 4: run the model.
    print(">>>>> Applying model ...")
    ml_model.eval(processed_dir, result_dir)

    # Stage 5: show what the model produced.
    print(">>>>> Suggestions:")
    suggestions_text = IOUtils.load(result_dir / "suggestions.txt", IOUtils.Format.txt)
    print(suggestions_text)
예제 #2
0
def extract_data(**options):
    """Run ``DataMiner.extract_data_project`` with settings taken from ``options``.

    Keys read from ``options``:
        project: path of the project to process (required).
        files: optional, read via ``Utils.get_option_as_list``.
        exclude-files: optional, read via ``Utils.get_option_as_list``.
        exclude-pattern: optional, defaults to ``None``.
        serapi-options: optional, defaults to ``""``.
        output: path handed to the extractor as its output location (required).
    """
    from roosterize.data.DataMiner import DataMiner

    prj_dir = Path(options["project"]).absolute()
    selected_files = Utils.get_option_as_list(options, "files", None)
    skipped_files = Utils.get_option_as_list(options, "exclude-files", None)
    skipped_pattern = options.get("exclude-pattern", None)
    serapi_opts = options.get("serapi-options", "")
    destination = Path(options["output"]).absolute()

    DataMiner.extract_data_project(
        prj_dir,
        selected_files,
        skipped_files,
        skipped_pattern,
        serapi_opts,
        destination,
    )
    def improve_project_model(self, prj_root: Optional[Path]):
        """Reset the project-local model from the global model and mine the project's data.

        The currently loaded model is dropped, any existing local model directory is
        replaced (after user confirmation) by a copy of the global model, the copy is
        loaded, and the project's data is extracted into a temporary directory.
        Splitting the data and training are not implemented yet (see the TODOs), so
        the extracted data is currently discarded.

        Args:
            prj_root: Project root. When ``None`` it is obtained from
                ``RoosterizeDirUtils.auto_infer_project_root()``.

        Raises:
            Exception: If the global model directory does not exist.
        """
        if prj_root is None:
            prj_root = RoosterizeDirUtils.auto_infer_project_root()

        # Deactivate loaded model
        self.model = None

        # Verify the global model is available BEFORE touching the local one, so
        # a missing global model can no longer leave the user without any model.
        global_model_dir = RoosterizeDirUtils.get_global_model_dir()
        if not global_model_dir.exists():
            raise Exception(
                "Global Roosterize model not found! Please download model first."
            )

        # Delete existing local model (only with the user's consent)
        local_model_dir = RoosterizeDirUtils.get_local_model_dir(prj_root)
        if local_model_dir.exists():
            # Adjacent string literals are concatenated verbatim, so the
            # separator between the two sentences has to be spelled out.
            ans = self.ask_for_confirmation(
                f"A Roosterize model already exists at {local_model_dir}. "
                f"Do you want to delete it and train again?")
            if not ans:
                return
            IOUtils.rm_dir(local_model_dir)

        # Copy global model to local model.
        # NOTE(review): the earlier comment here said the "training complete"
        # marker is removed, but no such step exists in this method -- confirm
        # whether it is still required.
        shutil.copytree(global_model_dir, local_model_dir)

        # Load local model
        self.load_local_model(prj_root)
        model = self.get_model()

        # Collect all lemmas in this project
        temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))
        try:
            DataMiner.extract_data_project(
                prj_root,
                files=None,
                exclude_files=self.exclude_files,
                exclude_pattern=self.exclude_pattern,
                serapi_options=self.infer_serapi_options(prj_root),
                output_path=temp_data_dir)

            # TODO: Split data into train/val set, then process each data (no pre-processing / rebuilding vocab!)

            # TODO: Train model
        finally:
            # Delete temp dir even when extraction fails; mkdtemp leaves cleanup
            # to the caller.
            IOUtils.rm_dir(temp_data_dir)