def suggest_lemmas(**options):
    """Extract lemmas from a project, run the model on them, print suggestions.

    Option keys read from ``options``:
        project, output, model-dir: required; each is converted to an
            absolute ``Path``.
        files, exclude-files: optional; read via ``Utils.get_option_as_list``.
        exclude-pattern: optional; defaults to ``None``.
        serapi-options: optional; defaults to ``""``.

    Artifacts are written below the output directory in ``raw-data``,
    ``eval-processed-data`` and ``eval-result``; the text of
    ``eval-result/suggestions.txt`` is printed at the end.
    """
    from roosterize.data.DataMiner import DataMiner
    from roosterize.data.ModelSpec import ModelSpec
    from roosterize.ml.MLModels import MLModels

    # Read the options in a fixed order so a missing required key is
    # reported the same way regardless of which other keys are present.
    project_root = Path(options["project"]).absolute()
    included_files = Utils.get_option_as_list(options, "files", None)
    excluded_files = Utils.get_option_as_list(options, "exclude-files", None)
    excluded_pattern = options.get("exclude-pattern")
    serapi_opts = options.get("serapi-options", "")
    out_root = Path(options["output"]).absolute()
    model_root = Path(options["model-dir"]).absolute()

    # Locations of the per-stage artifacts under the output directory.
    raw_data_dir = out_root / "raw-data"
    processed_data_dir = out_root / "eval-processed-data"
    result_dir = out_root / "eval-result"

    # Stage 1: extract data from the project.
    print(">>>>> Extracting lemmas ...")
    DataMiner.extract_data_project(
        project_root,
        included_files,
        excluded_files,
        excluded_pattern,
        serapi_opts,
        raw_data_dir,
    )

    # Stage 2: build the model from the spec stored next to it.
    print(">>>>> Initializing model ...")
    spec_json = IOUtils.load(model_root / "spec.json", IOUtils.Format.json)
    spec = IOUtils.dejsonfy(spec_json, ModelSpec)
    ml_model = MLModels.get_model(model_root, spec, is_eval=True)

    # Stage 3: turn the raw data into the model's input format.
    print(">>>>> Processing data ...")
    ml_model.process_data(raw_data_dir, processed_data_dir)

    # Stage 4: apply the model.
    print(">>>>> Applying model ...")
    ml_model.eval(processed_data_dir, result_dir)

    # Stage 5: show the suggestions produced by the evaluation.
    print(">>>>> Suggestions:")
    suggestions_text = IOUtils.load(result_dir / "suggestions.txt", IOUtils.Format.txt)
    print(suggestions_text)
def extract_data(**options):
    """Extract data from a project into the requested output path.

    Option keys read from ``options``:
        project, output: required; each is converted to an absolute ``Path``.
        files, exclude-files: optional; read via ``Utils.get_option_as_list``.
        exclude-pattern: optional; defaults to ``None``.
        serapi-options: optional; defaults to ``""``.

    The work itself is delegated to ``DataMiner.extract_data_project``.
    """
    from roosterize.data.DataMiner import DataMiner

    project_root = Path(options["project"]).absolute()
    included_files = Utils.get_option_as_list(options, "files", None)
    excluded_files = Utils.get_option_as_list(options, "exclude-files", None)
    excluded_pattern = options.get("exclude-pattern")
    serapi_opts = options.get("serapi-options", "")
    destination = Path(options["output"]).absolute()

    DataMiner.extract_data_project(
        project_root,
        included_files,
        excluded_files,
        excluded_pattern,
        serapi_opts,
        destination,
    )
def improve_project_model(self, prj_root: Optional[Path]):
    """Re-create the project-local model from the global one and collect data.

    Steps performed:
      1. Resolve the project root (auto-inferred when ``prj_root`` is None).
      2. Drop the currently loaded model (``self.model = None``).
      3. Check that the global model directory exists.
      4. If a local model directory already exists, ask the user for
         confirmation and delete it; return without changes if declined.
      5. Copy the global model directory to the local model directory and
         load it.
      6. Extract the project's data into a temporary directory, which is
         always removed afterwards.

    Training itself is not implemented yet (see the TODOs below).

    Args:
        prj_root: Root directory of the project, or None to infer it via
            ``RoosterizeDirUtils.auto_infer_project_root()``.

    Raises:
        Exception: If the global model directory does not exist.
    """
    if prj_root is None:
        prj_root = RoosterizeDirUtils.auto_infer_project_root()

    # Deactivate loaded model
    self.model = None

    # Check the global model *before* touching the local one: otherwise a
    # missing global model would leave the user with the local model already
    # deleted and nothing to replace it with.
    global_model_dir = RoosterizeDirUtils.get_global_model_dir()
    if not global_model_dir.exists():
        raise Exception(
            "Global Roosterize model not found! Please download model first."
        )

    # Delete existing local model (only after explicit confirmation)
    local_model_dir = RoosterizeDirUtils.get_local_model_dir(prj_root)
    if local_model_dir.exists():
        confirmed = self.ask_for_confirmation(
            f"A Roosterize model already exists at {local_model_dir}. "
            "Do you want to delete it and train again?"
        )
        if not confirmed:
            return
        IOUtils.rm_dir(local_model_dir)

    # Copy global model to local model.
    # NOTE(review): an earlier comment here said a "training complete" marker
    # is removed after copying, but no such step is implemented - confirm
    # whether it is still needed.
    shutil.copytree(global_model_dir, local_model_dir)

    # Load local model
    self.load_local_model(prj_root)
    model = self.get_model()  # kept for the pending training step below

    # Collect all lemmas in this project
    temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))
    try:
        DataMiner.extract_data_project(
            prj_root,
            files=None,
            exclude_files=self.exclude_files,
            exclude_pattern=self.exclude_pattern,
            serapi_options=self.infer_serapi_options(prj_root),
            output_path=temp_data_dir,
        )

        # TODO: Split data into train/val set, then process each data (no pre-processing / rebuilding vocab!)

        # TODO: Train model
    finally:
        # mkdtemp leaves cleanup to the caller; remove the directory even
        # when extraction fails so temp dirs do not accumulate.
        IOUtils.rm_dir(temp_data_dir)