Example #1
    def load_data(self,
            rel_path: Union[str, List[str]],
            fmt: IOUtils.Format,
            is_batched: bool = False,
            clz=None,
    ) -> Any:
        if self.is_json_format(fmt) and clz is None:
            self.logger.warning(f"Load data from {rel_path} with json format, but did not specify clz (at {traceback.format_stack()})")
        # end if

        abs_path = self.data_dir / self.assemble_rel_path(rel_path)
        if not abs_path.exists():
            LoggingUtils.log_and_raise(self.logger, f"Cannot find data at {abs_path}", IOError)
        # end if

        if not is_batched:
            data = IOUtils.load(abs_path, fmt)
            if self.is_json_format(fmt) and clz is not None:
                data = IOUtils.dejsonfy(data, clz)
            # end if
            return data
        else:
            data = list()
            batch_numbers = sorted([int(str(f.stem).split("-")[1]) for f in abs_path.iterdir()])
            for batch_number in tqdm(batch_numbers):
                batch_file = abs_path / f"batch-{batch_number}.{fmt.get_extension()}"
                data_batch = IOUtils.load(batch_file, fmt)
                if self.is_json_format(fmt) and clz is not None:
                    data_batch = IOUtils.dejsonfy(data_batch, clz)
                # end if
                data.extend(data_batch)
            # end for
            return data
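For reference, a minimal sketch of the directory layout the is_batched=True branch expects: one batch-<n>.<ext> file per batch under the resolved path. The dataset/lemmas path and the sample payload below are assumptions; IOUtils is seutil's:

from pathlib import Path

from seutil import IOUtils

# Hypothetical layout: load_data(..., is_batched=True) reads <abs_path>/batch-<n>.json
out = Path("dataset") / "lemmas"
out.mkdir(parents=True, exist_ok=True)
for i, batch in enumerate([[{"name": "a"}], [{"name": "b"}]]):
    IOUtils.dump(out / f"batch-{i}.json", batch, IOUtils.Format.json)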
Example #2
    def test_dejsonfy_basic(self):
        self.assertEqual("aaa", IOUtils.dejsonfy("aaa"))
        self.assertEqual(42, IOUtils.dejsonfy(42))
        self.assertEqual(1.111, IOUtils.dejsonfy(1.111))
        self.assertEqual([1, 2.0, "ccc"], IOUtils.dejsonfy([1, 2.0, "ccc"]))
        self.assertEqual({"f1": 1, "f2": 2.0, "f3": "ccc"}, IOUtils.dejsonfy({"f1": 1, "f2": 2.0, "f3": "ccc"}))
        return
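As the test shows, dejsonfy is the identity on plain JSON values; together with jsonfy (used in Example #15) it round-trips them. A quick check, assuming seutil's IOUtils:

from seutil import IOUtils

# Plain JSON-compatible data survives a jsonfy/dejsonfy round trip unchanged.
data = {"f1": 1, "f2": 2.0, "f3": "ccc"}
assert IOUtils.dejsonfy(IOUtils.jsonfy(data)) == data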
Example #3
    def process_data_impl(
        self,
        data_dir: Path,
        output_processed_data_dir: Path,
    ) -> NoReturn:
        lemmas: List[Lemma] = IOUtils.dejsonfy(
            IOUtils.load(data_dir / "lemmas.json", IOUtils.Format.json),
            List[Lemma])
        definitions: List[Definition] = IOUtils.dejsonfy(
            IOUtils.load(data_dir / "definitions.json", IOUtils.Format.json),
            List[Definition])

        docs_sub_tokenizers = SubTokenizer.get_docs_sub_tokenizers(
            lemmas, definitions)

        # Inputs
        all_inputs: Dict[str, List[List[str]]] = self.get_all_inputs(
            lemmas, docs_sub_tokenizers)
        for input_type, src_sentences in all_inputs.items():
            IOUtils.dump(
                output_processed_data_dir / f"src.{input_type}.txt",
                "".join([" ".join(sent) + "\n" for sent in src_sentences]),
                IOUtils.Format.txt)
        # end for

        # Outputs
        IOUtils.dump(
            output_processed_data_dir / "tgt.txt", "".join([
                " ".join(self.get_output(lemma, docs_sub_tokenizers)) + "\n"
                for lemma in lemmas
            ]), IOUtils.Format.txt)

        super().process_data_impl(data_dir, output_processed_data_dir)
        return
Example #4
def suggest_lemmas(**options):
    from roosterize.data.DataMiner import DataMiner
    from roosterize.data.ModelSpec import ModelSpec
    from roosterize.ml.MLModels import MLModels

    project_path = Path(options["project"]).absolute()
    files = Utils.get_option_as_list(options, "files", None)
    exclude_files = Utils.get_option_as_list(options, "exclude-files", None)
    exclude_pattern = options.get("exclude-pattern", None)
    serapi_options = options.get("serapi-options", "")
    output_dir = Path(options["output"]).absolute()
    model_dir = Path(options["model-dir"]).absolute()

    # Extract data
    print(">>>>> Extracting lemmas ...")
    DataMiner.extract_data_project(project_path, files, exclude_files, exclude_pattern, serapi_options, output_dir/"raw-data")

    # Get the ML model
    print(">>>>> Initializing model ...")
    model_spec = IOUtils.dejsonfy(IOUtils.load(model_dir/"spec.json", IOUtils.Format.json), ModelSpec)
    model = MLModels.get_model(model_dir, model_spec, is_eval=True)

    # Process data
    print(">>>>> Processing data ...")
    model.process_data(output_dir/"raw-data", output_dir/"eval-processed-data")

    # Eval
    print(">>>>> Applying model ...")
    model.eval(output_dir/"eval-processed-data", output_dir/"eval-result")

    # Print suggestions
    print(">>>>> Suggestions:")
    print(IOUtils.load(output_dir/"eval-result"/"suggestions.txt", IOUtils.Format.txt))
    return
Example #5
    def collect_data(cls, **options) -> NoReturn:
        data_mgr = FilesManager(cls.dataset_dir)

        task = options["task"]

        projects_path = Path(options.get("corpus", cls.dataset_dir / "projects-standalone-8.10.yml"))
        projects: List[Project] = IOUtils.dejsonfy(IOUtils.load(projects_path, IOUtils.Format.json), Project)

        if task == cls.TASK_COQ_DOCUMENTS:
            files = Utils.get_option_as_list(options, "files", None)
            is_verifying_tokenizer = Utils.get_option_as_boolean(options, "verify-tokenizer")
            cls.collect_coq_documents_projects(data_mgr, projects, files, is_verifying_tokenizer)
        elif task == cls.TASK_DATA_INDEXES:
            cls.collect_data_indexes(data_mgr, projects)
        elif task == cls.TASK_DEFINITIONS:
            cls.collect_definitions(data_mgr)
        elif task == cls.TASK_INSTALL_COQ_PROJECTS:
            cls.install_coq_projects(projects)
        elif task == cls.TASK_LEMMA:
            files = Utils.get_option_as_list(options, "files", None)
            cls.collect_lemmas(data_mgr, projects, files)
        elif task == cls.TASK_LEMMA_BACKEND_SEXP_TRANSFORMATIONS:
            cls.collect_lemmas_backend_sexp_transformations(data_mgr)
        elif task == cls.TASK_LEMMA_FILTERED:
            cls.filter_lemmas(data_mgr)
        elif task == cls.TASK_LEMMA_FOREEND_SEXP_TRANSFORMATIONS:
            cls.collect_lemmas_foreend_sexp_transformations(data_mgr)
        else:
            LoggingUtils.log_and_raise(cls.logger, f"Unknown task {task}", ValueError)
        # end if
        return
Example #6
    def process_data_impl(self,
            data_dir: Path,
            output_processed_data_dir: Path,
    ) -> NoReturn:
        lemmas: List[Lemma] = IOUtils.dejsonfy(IOUtils.load(data_dir/"lemmas.json", IOUtils.Format.json), List[Lemma])
        definitions: List[Definition] = IOUtils.dejsonfy(IOUtils.load(data_dir/"definitions.json", IOUtils.Format.json), List[Definition])

        docs_sub_tokenizers = SubTokenizer.get_docs_sub_tokenizers(lemmas, definitions)

        # Put data in serialized files
        IOUtils.dump(output_processed_data_dir/"src.txt",
            "".join([" ".join(self.get_input(lemma, docs_sub_tokenizers)) + "\n" for lemma in lemmas]),
            IOUtils.Format.txt)
        IOUtils.dump(output_processed_data_dir/"tgt.txt",
            "".join([" ".join(self.get_output(lemma, docs_sub_tokenizers)) + "\n" for lemma in lemmas]),
            IOUtils.Format.txt)
        return
Example #7
    def __init__(
            self,
            model_dir: Path,
            model_spec: ModelSpec,
            config_clz: type,
    ):
        self.model_dir = model_dir
        self.spec = model_spec
        self.config: TConfig = (
            IOUtils.dejsonfy(model_spec.config_dict, config_clz)
            if model_spec.config_dict is not None
            else config_clz()
        )

        self.logger.info(f"{type(self).__name__} {self.spec.model} created with config {self.config}")
        return
Example #8
    def load_local_model(self, prj_root: Path) -> None:
        """
        Try to load the local model, if it exists; otherwise do nothing.
        """
        if self.model is None:
            local_model_dir = RoosterizeDirUtils.get_local_model_dir(prj_root)
            if local_model_dir.is_dir():
                model_spec = IOUtils.dejsonfy(
                    IOUtils.load(local_model_dir / "spec.json", IOUtils.Format.json),
                    ModelSpec,
                )
                self.model = MLModels.get_model(local_model_dir, model_spec, is_eval=True)
Example #9
    def get_model(self) -> NamingModelBase:
        """
        Try to get the currently loaded model; if no model is loaded, get the global model.
        The local model can be loaded by invoking load_local_model (before invoking this method).
        """
        if self.model is None:
            # Load the global model
            global_model_dir = RoosterizeDirUtils.get_global_model_dir()
            model_spec = IOUtils.dejsonfy(
                IOUtils.load(global_model_dir / "spec.json", IOUtils.Format.json),
                ModelSpec,
            )
            self.model = MLModels.get_model(global_model_dir, model_spec, is_eval=True)
        return self.model
Example #10
def eval_model(**options):
    from roosterize.data.ModelSpec import ModelSpec
    from roosterize.ml.MLModels import MLModels

    data_dir = Path(options["data"]).absolute()
    model_dir = Path(options["model-dir"]).absolute()
    output_dir = Path(options["output"]).absolute()

    # Get the ML model
    model_spec = IOUtils.dejsonfy(IOUtils.load(model_dir/"spec.json", IOUtils.Format.json), ModelSpec)
    model = MLModels.get_model(model_dir, model_spec, is_eval=True)

    # Process data
    model.process_data(data_dir, output_dir/"eval-processed-data")

    # Eval
    model.eval(output_dir/"eval-processed-data", output_dir/"eval-result")
    return
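Several examples dejsonfy model_dir/spec.json into a ModelSpec. Judging only from the fields used in Example #7 (spec.model and spec.config_dict), the file plausibly carries something like the dict below; this is a guess from usage, not the actual schema:

# Hypothetical spec.json contents (field names inferred from Example #7;
# the real record class may carry more fields).
spec_json = {
    "model": "SomeSeq2SeqModel",      # assumed: the name MLModels.get_model dispatches on
    "config_dict": {"beam_size": 5},  # assumed: dejsonfied into the model's config class
}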
Example #11
    def test_dejsonfy_record_class(self):
        example_obj = test_IOUtils.ExampleRecordClass(
            field_str="aaa",
            field_int=42,
            field_int_2=66,
            field_list=[1, 2],
            nested_rc=test_IOUtils.ExampleSimpleRecordClass(f=225))
        dejsonfied = IOUtils.dejsonfy(
            {
                "field_str": "aaa",
                "field_int": 42,
                "field_int_2": "66",
                "field_list": [1, 2],
                "nested_rc": {"f": 225},
            }, test_IOUtils.ExampleRecordClass)
        self.assertEqual(example_obj, dejsonfied)
        return
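The record classes this test references are not shown. One plausible definition uses the recordclass library, which Example #15 also relies on; the exact base class in test_IOUtils is an assumption:

import recordclass

# Assumed definitions; only the field names and types are grounded in the test above.
class ExampleSimpleRecordClass(recordclass.RecordClass):
    f: int

class ExampleRecordClass(recordclass.RecordClass):
    field_str: str
    field_int: int
    field_int_2: int
    field_list: list
    nested_rc: ExampleSimpleRecordClass

Note that the test feeds field_int_2 as the string "66" yet expects the int 66 back, so dejsonfy evidently coerces values to the annotated field types.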
Example #12
    def iter_batched_data(
            self,
            rel_path: Union[str, List[str]],
            fmt: IOUtils.Format,
            clz=None,
    ) -> Iterator:
        if self.is_json_format(fmt) and clz is None:
            logger.warning(f"Load data from {rel_path} with json format, but did not specify clz")

        abs_path = self.data_dir / self.assemble_rel_path(rel_path)
        if not abs_path.exists():
            raise IOError(f"Cannot find data at {abs_path}")

        batch_numbers = sorted([int(str(f.stem).split("-")[1]) for f in abs_path.iterdir()])
        for batch_number in batch_numbers:
            batch_file = abs_path / f"batch-{batch_number}.{fmt.get_extension()}"
            for data_entry in IOUtils.load(batch_file, fmt):
                if self.is_json_format(fmt) and clz is not None:
                    data_entry = IOUtils.dejsonfy(data_entry, clz)
                # end if
                yield data_entry
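A hypothetical call site, reusing FilesManager from Example #5 and a Lemma class as in Example #3. Unlike load_data in Example #1, entries are yielded lazily, one batch file at a time:

# Hypothetical usage; FilesManager, Lemma, and the "lemmas" path come from
# the surrounding examples and may differ in the real project.
mgr = FilesManager(Path("dataset"))
for lemma in mgr.iter_batched_data(["lemmas"], IOUtils.Format.json, clz=Lemma):
    print(lemma)  # each entry is dejsonfied into a Lemma before being yielded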
Example #13
    def test_dejsonfy_seqs(self):
        self.assertEqual([1, 2, 3], IOUtils.dejsonfy([1, 2, 3], List[int]))
        self.assertEqual((1, 2, 3),
                         IOUtils.dejsonfy([1, 2, 3], Tuple[int, int, int]))
        self.assertEqual({1, 2, 3}, IOUtils.dejsonfy([1, 2, 3], Set[int]))
        return
Example #14
    def test_dejsonfy_enum(self):
        example_obj = test_IOUtils.ExampleEnum.Item3
        dejsonfied = IOUtils.dejsonfy(3, test_IOUtils.ExampleEnum)
        self.assertEqual(example_obj, dejsonfied)
        return
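The enum under test is not shown, but since dejsonfy(3, ...) yields Item3, enums are evidently (de)serialized by value. A minimal definition consistent with the test; the other members are assumptions:

from enum import Enum

class ExampleEnum(Enum):
    Item1 = 1
    Item2 = 2
    Item3 = 3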
Example #15
    def generate_configs(cls, name: str, path: Path, **options):
        config_files: Set[str] = set()
        ml_model_clz = cls.NAMES_MODELS[name]
        config = ml_model_clz.config_clz()

        type_hints = get_type_hints(ml_model_clz.config_clz)

        model_path = path/name
        model_path.mkdir(parents=True, exist_ok=True)

        cls.logger.info(f"Possible attrs and default values: {config.__dict__}")

        attrs_choices: dict = dict()
        attrs: list = list()

        for k, default_v in config.__dict__.items():
            attrs.append(k)
            if k not in options:
                attrs_choices[k] = [default_v]
            else:
                if type_hints[k] == bool:
                    attrs_choices[k] = [v == "True" for v in str(options[k]).split()]
                elif issubclass(type_hints[k], recordclass.mutabletuple):
                    attrs_choices[k] = [IOUtils.dejsonfy(v, type_hints[k]) if v != "None" else None for v in str(options[k]).split()]
                else:
                    attrs_choices[k] = [type_hints[k](v) for v in str(options[k]).split()]
                # end if
                attrs_choices[k] = list(set(attrs_choices[k]))
                cls.logger.debug(f"attr {k}, choices: {attrs_choices[k]}")
                options.pop(k)
            # end if
        # end for

        if len(options) > 0:
            cls.logger.warning(f"These options are not recognized: {options.keys()}")
        # end if

        candidate = [0] * len(attrs_choices)
        is_explore_finished = False
        while True:
            # Generate current candidate
            for i, attr in enumerate(attrs):
                config.__setattr__(attr, attrs_choices[attr][candidate[i]])
            # end for
            if config.repOk():
                # Adjust batch size
                adjust_batch_size_func = getattr(config, "adjust_batch_size", None)
                if callable(adjust_batch_size_func):
                    adjust_batch_size_func()
                # end if

                config_file = model_path / (str(config) + ".json")
                cls.logger.info(f"Saving candidate to {config_file}: {config}")
                config_files.add(name + "/" + str(config) + ".json")
                IOUtils.dump(config_file, IOUtils.jsonfy(config), IOUtils.Format.jsonPretty)
            else:
                cls.logger.info(f"Skipping invalid candidate: {config}")
            # end if

            # To next candidate
            for i, attr in enumerate(attrs):
                candidate[i] += 1
                if candidate[i] >= len(attrs_choices[attr]):
                    candidate[i] = 0
                    if i == len(attrs) - 1:
                        is_explore_finished = True
                        break
                    else:
                        continue
                    # end if
                else:
                    break
                # end if
            # end for
            if is_explore_finished:
                break
        # end while

        for config_file in config_files:
            print(f"- model: {name}")
            print(f"  config-file: {config_file}")
            print()
        # end for

        return
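The candidate loop above is a mixed-radix counter over the per-attribute choice lists, i.e., it enumerates the Cartesian product of all choices. A standalone sketch of the same enumeration via itertools.product (illustrative attribute names and values, not the original code):

import itertools

# Two choices for "dropout" and "lr", one for "layers": 2 * 1 * 2 = 4 candidates.
attrs_choices = {"dropout": [0.0, 0.5], "layers": [2], "lr": [0.1, 0.01]}
attrs = list(attrs_choices)
for combo in itertools.product(*(attrs_choices[a] for a in attrs)):
    candidate = dict(zip(attrs, combo))
    print(candidate)  # each dict is one grid-search candidate configuration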