    def suggest_naming(self, file_path: Path, prj_root: Optional[Path] = None):
        """
        Processes a file to get its lemmas and runs the model to get predictions.
        """
        # Figure out which project we are in, then load its configs
        if prj_root is None:
            prj_root = RoosterizeDirUtils.auto_infer_project_root(file_path)
        self.load_configs(prj_root)

        # Infer SerAPI options
        serapi_options = self.infer_serapi_options(prj_root)

        # If user provided compile_cmd, first compile the project
        if self.compile_cmd is not None:
            with IOUtils.cd(prj_root):
                BashUtils.run(self.compile_cmd, expected_return_code=0)

        # Parse file
        data = self.parse_file(file_path, prj_root, serapi_options)

        # Load model
        self.load_local_model(prj_root)
        model = self.get_model()

        # Use the model to make predictions
        # Temp dirs for processed data and results
        temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))

        # Dump lemmas & definitions
        temp_raw_data_dir = temp_data_dir / "raw"
        temp_raw_data_dir.mkdir()
        IOUtils.dump(
            temp_raw_data_dir / "lemmas.json",
            IOUtils.jsonfy(data.lemmas),
            IOUtils.Format.json,
        )
        IOUtils.dump(
            temp_raw_data_dir / "definitions.json",
            IOUtils.jsonfy(data.definitions),
            IOUtils.Format.json,
        )

        # Model-specific process
        temp_processed_data_dir = temp_data_dir / "processed"
        temp_processed_data_dir.mkdir()
        model.process_data_impl(temp_raw_data_dir, temp_processed_data_dir)

        # Invoke eval
        candidates_logprobs = model.eval_impl(
            temp_processed_data_dir,
            beam_search_size=self.beam_search_size,
            k=self.k,
        )

        # Clean up temporary data
        IOUtils.rm_dir(temp_data_dir)

        # Report predictions
        self.report_predictions(data, candidates_logprobs)
        return
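A quick usage sketch for this method. The enclosing class is not shown in this snippet, so the RoosterizeUI name and its construction below are placeholder assumptions:

from pathlib import Path

ui = RoosterizeUI()  # hypothetical: whatever class owns suggest_naming
# The project root is auto-inferred from the file's location when prj_root is omitted
ui.suggest_naming(Path("theories/Lists.v"))
# Or pin the project root explicitly to skip the auto-inference step
ui.suggest_naming(Path("theories/Lists.v"), prj_root=Path("."))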
Example #2
    def dump_data(self,
            rel_path: Union[str, List[str]],
            data: Any,
            fmt: IOUtils.Format,
            is_batched: bool = False,
            per_batch: int = 100,
            exist_ok: bool = False,
    ):
        abs_path = self.data_dir / self.assemble_rel_path(rel_path)
        if abs_path.exists() and not exist_ok:
            LoggingUtils.log_and_raise(self.logger, f"Cannot rewrite existing data at {abs_path}", IOError)
        # end if

        abs_path.parent.mkdir(parents=True, exist_ok=True)
        if not is_batched:
            if self.is_json_format(fmt):
                data = IOUtils.jsonfy(data)
            # end if
            IOUtils.dump(abs_path, data, fmt)
        else:
            # In batched mode, the data needs to be sliceable and support len()
            IOUtils.rm(abs_path)
            abs_path.mkdir(parents=True)

            for batch_i in tqdm(range(math.ceil(len(data)/per_batch))):
                data_batch = data[per_batch*batch_i : per_batch*(batch_i+1)]
                if self.is_json_format(fmt):
                    data_batch = IOUtils.jsonfy(data_batch)
                # end if
                IOUtils.dump(abs_path/f"batch-{batch_i}.{fmt.get_extension()}", data_batch, fmt)
            # end for
        # end if
        return
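A sketch of what the batched mode produces, assuming a FilesManager-like instance fm with data_dir already set; the file names follow the batch-{i}.{extension} pattern used above:

items = [{"id": i} for i in range(250)]
fm.dump_data(["lemmas", "filtered"], items, IOUtils.Format.json, is_batched=True, per_batch=100)
# With 250 items and per_batch=100, three files are written under
# fm.data_dir/lemmas/filtered/:
#   batch-0.json   (items 0..99)
#   batch-1.json   (items 100..199)
#   batch-2.json   (items 200..249)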
Example #3
    def train(
            self,
            train_processed_data_dir: Path,
            val_processed_data_dir: Path,
            force_retrain: bool = False,
    ) -> NoReturn:
        """
        Trains the model on the training data.

        The trained model should be saved to output_dir.
        At the end, this function saves a training-completed.txt marker file as proof that training finished.

        :param train_processed_data_dir: the directory containing the processed train data
        :param val_processed_data_dir: the directory containing the processed val data
        :param force_retrain: if set to True, re-train the model even if it was already trained (will remove previously trained model)
        """
        if force_retrain or not self.is_training_completed():
            self.logger.info(self.logging_prefix + f"Training model at {self.model_dir}; train: {train_processed_data_dir}, val: {val_processed_data_dir}")
            IOUtils.rm_dir(self.model_dir)
            IOUtils.mk_dir(self.model_dir)

            # Save spec & configs of this model
            IOUtils.dump(self.model_dir/"config-dict.json", IOUtils.jsonfy(self.config), IOUtils.Format.jsonPretty)
            IOUtils.dump(self.model_dir/"spec.json", IOUtils.jsonfy(self.spec), IOUtils.Format.jsonPretty)
            self.train_impl(train_processed_data_dir, val_processed_data_dir)
            IOUtils.dump(self.model_dir / self.TRAINING_COMPLETED_FILE_NAME, str(time.time_ns()), IOUtils.Format.txt)
        # end if
        return
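is_training_completed is called above but not shown; a minimal sketch consistent with the marker file written at the end of train could look like this:

    def is_training_completed(self) -> bool:
        # The marker is written only after train_impl returns, so its
        # presence is treated as proof of a completed training run.
        return (self.model_dir / self.TRAINING_COMPLETED_FILE_NAME).exists()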
Example #4
    def extract_data_from_corpus(cls,
            corpus_path: Path,
            trainevals: List[str],
            groups: List[str],
            output_path: Path,
    ):
        # 1. Prepare output path
        if output_path.is_dir():
            cls.logger.warning(f"{output_path} already exists, will overwrite the files.")
        elif output_path.is_file():
            LoggingUtils.log_and_raise(cls.logger, f"{output_path} already exists as a file. Aborting.", Exception)
        else:
            IOUtils.mk_dir(output_path)
        # end if

        assert all([traineval in Macros.DS_TRAINEVALS for traineval in trainevals])
        assert all([group in Macros.DS_GROUPS+[Macros.DS_GROUP_TA] for group in groups])

        data_mgr = FilesManager(corpus_path)

        # 2. Load lemmas and definitions
        lemmas_filtered: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS_FILTERED], IOUtils.Format.json, is_batched=True, clz=Lemma)
        definitions: List[Definition] = data_mgr.load_data([FilesManager.DEFINITIONS, "definitions.json"], IOUtils.Format.json, clz=Definition)

        # 3. Output to output_path for each combination of traineval and group
        for traineval in trainevals:
            for group in groups:
                IOUtils.mk_dir(output_path/f"{group}-{traineval}")
                # Assumes the per-split index files live under corpus_path/"training"
                data_indexes = IOUtils.load(corpus_path/"training"/f"{group}-{traineval}.json", IOUtils.Format.json, clz=str)
                IOUtils.dump(output_path/f"{group}-{traineval}/lemmas.json", IOUtils.jsonfy([l for l in lemmas_filtered if l.data_index in data_indexes]), IOUtils.Format.json)
                IOUtils.dump(output_path/f"{group}-{traineval}/definitions.json", IOUtils.jsonfy([d for d in definitions if d.data_index in data_indexes]), IOUtils.Format.json)
            # end for
        # end for
        return
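A hypothetical invocation; the real group and train/eval names come from Macros.DS_GROUPS and Macros.DS_TRAINEVALS, which are not shown here, so the class name, paths, and strings below are illustrative only:

SomeExtractor.extract_data_from_corpus(
    corpus_path=Path("math-comp-corpus"),
    trainevals=["train", "val"],
    groups=["ta"],
    output_path=Path("data-extracted"),
)
# Produces data-extracted/ta-train/{lemmas.json, definitions.json},
# data-extracted/ta-val/{lemmas.json, definitions.json}, and so on.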
Example #5
    def process_data(self, project_dir):
        try:
            revision_data = IOUtils.load(project_dir / "collector" /
                                         "method-project-revision.json")
            method_data = IOUtils.load(project_dir / "collector" /
                                       "method-data.json")
            output_dir = project_dir / "collector"
            method_project_evo = []
            for year in BetaFilter.YEARS[:-1]:
                curr_time = f"{year}_Jan_1"
                curr_method_ids = \
                    [year_data["method_ids"] for year_data in revision_data if year_data["year"] == curr_time][0]
                next_time = f"{year + 1}_Jan_1"
                next_method_ids = \
                    [year_data["method_ids"] for year_data in revision_data if year_data["year"] == next_time][0]
                new_method_ids = list(
                    set(next_method_ids) - set(curr_method_ids))
                filtered_method_ids = BetaFilter.beta_filter(
                    new_method_ids, curr_method_ids, method_data)
                method_project_evo.append({
                    "prj_name": revision_data[0]["prj_name"],
                    "time": f"{curr_time}-{next_time}",
                    "method_ids": filtered_method_ids,
                })

            IOUtils.dump(output_dir / "method-project-beta-filtered.json",
                         IOUtils.jsonfy(method_project_evo),
                         IOUtils.Format.json)
            return
        except Exception:
            self.logger.warning(f"Unexpected error: {sys.exc_info()[0]}")
            return
Example #6
    def test_jsonfy_record_class(self):
        example_obj = test_IOUtils.ExampleRecordClass(field_str="aaa", field_int=42, field_list=[1, 2], nested_rc=test_IOUtils.ExampleSimpleRecordClass())
        jsonfied = IOUtils.jsonfy(example_obj)
        self.assertEqual(jsonfied.get("field_str"), "aaa")
        self.assertEqual(jsonfied.get("field_int"), 42)
        self.assertEqual(jsonfied.get("field_list"), [1, 2])
        self.assertEqual(jsonfied.get("nested_rc").get("f"), 1)
        return
Example #7
    def test_jsonfy_basic(self):
        self.assertEqual("aaa", IOUtils.jsonfy("aaa"))
        self.assertEqual(42, IOUtils.jsonfy(42))
        self.assertEqual(1.111, IOUtils.jsonfy(1.111))
        self.assertEqual([1, 2.0, "ccc"], IOUtils.jsonfy([1, 2.0, "ccc"]))
        self.assertEqual({1, 2.0, "ccc"}, set(IOUtils.jsonfy({1, 2.0, "ccc"})))
        self.assertEqual({"f1": 1, "f2": 2.0, "f3": "ccc"}, IOUtils.jsonfy({"f1": 1, "f2": 2.0, "f3": "ccc"}))
        return
Example #8
    def process_data(self,
                     method_data_list: List[MethodData],
                     data_type: str,
                     output_dir: Path,
                     split: bool = True):
        Environment.require_collector()

        log_file = output_dir / "collector-log.txt"
        data_file = output_dir / "method-data.json"
        IOUtils.dump(data_file, IOUtils.jsonfy(method_data_list),
                     IOUtils.Format.json)

        config = {
            "transform": True,
            "model": "BiLSTM",
            "dataType": data_type,
            "dataFile": str(data_file),
            "logFile": str(log_file),
            "outputDir": str(output_dir),
        }
        config_file = output_dir / "collector-config.json"
        IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

        self.logger.info(
            f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}"
        )
        rr = BashUtils.run(
            f"java -jar {Environment.collector_jar} {config_file}",
            expected_return_code=0)
        if rr.stdout:
            self.logger.warning(f"Stdout of collector:\n{rr.stdout}")
        # end if
        if rr.stderr:
            self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
        # end if
        # build raw dataset
        if split:
            self.tokenizeFile(output_dir / f"{data_type}.raw.txt", data_type)
        else:
            self.noSplit(output_dir / f"{data_type}.raw.txt", data_type)

        error_ids = IOUtils.load(str(output_dir) + "-error-ids.json")
        print(f"Number of error id is: {len(error_ids)}")
        # BashUtils.run(f"rm {output_dir}-error-ids.json", expected_return_code=0)
        return error_ids
Example #9
    def test_jsonfy_enum(self):
        example_obj = test_IOUtils.ExampleEnum.Item1
        jsonfied = IOUtils.jsonfy(example_obj)
        self.assertEqual(jsonfied, example_obj.value)
        return
Example #10
    def generate_configs(cls, name: str, path: Path, **options):
        config_files: Set[str] = set()
        ml_model_clz = cls.NAMES_MODELS[name]
        config = ml_model_clz.config_clz()

        type_hints = get_type_hints(ml_model_clz.config_clz)

        model_path = path/name
        model_path.mkdir(parents=True, exist_ok=True)

        cls.logger.info(f"Possible attrs and default values: {config.__dict__}")

        attrs_choices: dict = dict()
        attrs: list = list()

        for k, default_v in config.__dict__.items():
            attrs.append(k)
            if k not in options:
                attrs_choices[k] = [default_v]
            else:
                if type_hints[k] == bool:
                    attrs_choices[k] = [v == "True" for v in str(options[k]).split()]
                elif issubclass(type_hints[k], recordclass.mutabletuple):
                    attrs_choices[k] = [IOUtils.dejsonfy(v, type_hints[k]) if v != "None" else None for v in str(options[k]).split()]
                else:
                    attrs_choices[k] = [type_hints[k](v) for v in str(options[k]).split()]
                # end if
                attrs_choices[k] = list(set(attrs_choices[k]))
                cls.logger.debug(f"attr {k}, choices: {attrs_choices[k]}")
                options.pop(k)
            # end if
        # end for

        if len(options) > 0:
            cls.logger.warning(f"These options are not recognized: {options.keys()}")
        # end if

        candidate = [0] * len(attrs_choices)
        is_explore_finished = False
        while True:
            # Generate current candidate
            for i, attr in enumerate(attrs):
                config.__setattr__(attr, attrs_choices[attr][candidate[i]])
            # end for
            if config.repOk():
                # Adjust batch size
                adjust_batch_size_func = getattr(config, "adjust_batch_size", None)
                if callable(adjust_batch_size_func):
                    adjust_batch_size_func()
                # end if

                config_file = model_path / (str(config)+".json")
                cls.logger.info(f"Saving candidate to {config_file}: {config}")
                config_files.add(name + "/" + str(config) + ".json")
                IOUtils.dump(config_file, IOUtils.jsonfy(config), IOUtils.Format.jsonPretty)
            else:
                cls.logger.info(f"Skipping invalid candidate: {config}")
            # end if

            # To next candidate
            for i, attr in enumerate(attrs):
                candidate[i] += 1
                if candidate[i] >= len(attrs_choices[attr]):
                    candidate[i] = 0
                    if i == len(attrs) - 1:
                        is_explore_finished = True
                        break
                    else:
                        continue
                    # end if
                else:
                    break
                # end if
            # end for
            if is_explore_finished:  break
        # end while

        for config_file in config_files:
            print(f"- model: {name}")
            print(f"  config-file: {config_file}")
            print()
        # end for

        return
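The nested counter above enumerates every combination of per-attribute choices (a mixed-radix odometer); an equivalent formulation with itertools.product, shown here only as a sketch of the same idea (the visiting order differs, the set of candidates is the same), with hypothetical attribute names:

import itertools

attrs = ["learning_rate", "hidden_size"]
attrs_choices = {"learning_rate": [0.001, 0.01], "hidden_size": [256, 512]}

for combo in itertools.product(*(attrs_choices[a] for a in attrs)):
    candidate = dict(zip(attrs, combo))
    # Each candidate corresponds to one config the method above would
    # validate with repOk() and dump to a json file.
    print(candidate)
# {'learning_rate': 0.001, 'hidden_size': 256}
# {'learning_rate': 0.001, 'hidden_size': 512}
# {'learning_rate': 0.01, 'hidden_size': 256}
# {'learning_rate': 0.01, 'hidden_size': 512}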
Example #11
    def extract_data_project(
        cls,
        project_path: Path,
        files: Optional[List[str]],
        exclude_files: Optional[List[str]],
        exclude_pattern: Optional[str],
        serapi_options: str,
        output_path: Path,
    ):
        # 1. Prepare output path
        if output_path.is_dir():
            cls.logger.warning(
                f"{output_path} already exists, will overwrite the files.")
        elif output_path.is_file():
            LoggingUtils.log_and_raise(
                cls.logger,
                f"{output_path} already exists as a file. Aborting.",
                Exception)
        else:
            IOUtils.mk_dir(output_path)
        # end if

        # 2. Extract documents, tok.sexp and ast.sexp
        coq_documents: Dict[str, CoqDocument] = collections.OrderedDict()
        ast_sexp_lists: Dict[str, List[SexpNode]] = dict()
        tok_sexp_lists: Dict[str, List[SexpNode]] = dict()

        with IOUtils.cd(project_path):
            coq_files: List[str] = BashUtils.run(
                "find . -name '*.v' -type f").stdout.split("\n")[:-1]
            # Strip the leading "./" that find prepends to each path
            coq_files = [coq_file[2:] for coq_file in coq_files]

            if files is not None:
                coq_files = [f for f in coq_files if f in files]
            # end if

            if exclude_files is not None:
                coq_files = [f for f in coq_files if f not in exclude_files]
            # end if

            if exclude_pattern is not None:
                re_exclude_pattern = re.compile(exclude_pattern)
                coq_files = [
                    f for f in coq_files if not re_exclude_pattern.fullmatch(f)
                ]
            # end if

            for i, coq_file in enumerate(tqdm(coq_files)):
                try:
                    # Read file
                    with open(coq_file, "r", newline="") as f:
                        source_code = f.read()
                    # end with

                    # Get unicode offsets
                    unicode_offsets = ParserUtils.get_unicode_offsets(
                        source_code)

                    # Call SerAPI
                    ast_sexp_str: str = BashUtils.run(
                        f"sercomp {serapi_options} --mode=sexp -- {coq_file}",
                        expected_return_code=0).stdout
                    tok_sexp_str: str = BashUtils.run(
                        f"sertok {serapi_options} -- {coq_file}",
                        expected_return_code=0).stdout

                    # Parse ast sexp
                    ast_sexp_list: List[SexpNode] = SexpParser.parse_list(
                        ast_sexp_str)
                    tok_sexp_list: List[SexpNode] = SexpParser.parse_list(
                        tok_sexp_str)

                    # Parse the document
                    coq_document = CoqParser.parse_document(
                        source_code,
                        ast_sexp_list,
                        tok_sexp_list,
                        unicode_offsets=unicode_offsets)

                    # Set meta data
                    coq_document.file_name = coq_file
                    coq_document.project_name = project_path.name

                    coq_documents[coq_file] = coq_document
                    ast_sexp_lists[coq_file] = ast_sexp_list
                    tok_sexp_lists[coq_file] = tok_sexp_list
                except KeyboardInterrupt:
                    cls.logger.warning("Keyboard interrupt!")
                    raise
                except:
                    cls.logger.warning(
                        f"File {coq_file} failed! Exception was: {traceback.format_exc()}"
                    )
                    continue
                # end try
            # end for

            # 3. Extract and save lemmas and definitions
            lemmas: List[Lemma] = list()
            definitions: List[Definition] = list()

            # Increase recursion limit because the backend sexps are CRAZZZZY deep
            sys.setrecursionlimit(10000)

            for file_path, doc in tqdm(coq_documents.items()):
                ast_sexp_list = ast_sexp_lists[file_path]
                lemmas_doc = cls.collect_lemmas_doc(doc, ast_sexp_list,
                                                    serapi_options)
                lemmas.extend(lemmas_doc)
                definitions_doc = cls.collect_definitions_doc(
                    doc, ast_sexp_list)
                definitions.extend(definitions_doc)
            # end for

            IOUtils.dump(output_path / "lemmas.json", IOUtils.jsonfy(lemmas),
                         IOUtils.Format.json)
            IOUtils.dump(output_path / "definitions.json",
                         IOUtils.jsonfy(definitions), IOUtils.Format.json)
        # end with
        return
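A hypothetical call; serapi_options is passed verbatim to sercomp and sertok, so it typically carries the -Q/-R logical-path mappings of the project. The class name, project name, paths, and mapping below are illustrative, not taken from the snippet:

SomeExtractor.extract_data_project(
    project_path=Path("repos/StructTact"),
    files=None,
    exclude_files=None,
    exclude_pattern=r".*_test\.v",
    serapi_options="-R . StructTact",
    output_path=Path("data/StructTact"),
)
# Writes data/StructTact/lemmas.json and data/StructTact/definitions.json.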
Example #12
    def collect_project(self, project_name: str, project_url: str):
        Environment.require_collector()

        # 0. Download repo
        downloads_dir = self.repos_downloads_dir / project_name
        results_dir = self.repos_results_dir / project_name

        # Remove previous results if any
        IOUtils.rm_dir(results_dir)
        IOUtils.mk_dir(results_dir)

        # Clone the repo if not exists
        if not downloads_dir.exists():
            with IOUtils.cd(self.repos_downloads_dir):
                with TimeUtils.time_limit(300):
                    BashUtils.run(f"git clone {project_url} {project_name}",
                                  expected_return_code=0)
                # end with
            # end with
        # end if

        project_data = ProjectData.create()
        project_data.name = project_name
        project_data.url = project_url

        # 1. Get list of revisions
        with IOUtils.cd(downloads_dir):
            git_log_out = BashUtils.run(f"git log --pretty=format:'%H %P'",
                                        expected_return_code=0).stdout
            for line in git_log_out.splitlines()[:self.MAX_REVISIONS]:
                shas = line.split()
                project_data.revisions.append(shas[0])
                project_data.parent_revisions[shas[0]] = shas[1:]
            # end for
        # end with

        # 2. Get revisions in different year
        with IOUtils.cd(downloads_dir):
            for year in self.YEARS:
                git_log_out = BashUtils.run(
                    f"git rev-list -1 --before=\"Jan 1 {year}\" origin",
                    expected_return_code=0).stdout
                project_data.year_revisions[f"{year}_Jan_1"] = git_log_out.rstrip()
            # end for
        # end with

        project_data_file = results_dir / "project.json"
        IOUtils.dump(project_data_file, IOUtils.jsonfy(project_data),
                     IOUtils.Format.jsonPretty)

        # 3. Start the Java collector
        # Prepare config
        log_file = results_dir / "collector-log.txt"
        output_dir = results_dir / "collector"

        config = {
            "collect": True,
            "projectDir": str(downloads_dir),
            "projectDataFile": str(project_data_file),
            "logFile": str(log_file),
            "outputDir": str(output_dir),
            "year":
            True  # To indicate whether to collect all evo data or yearly data
        }
        config_file = results_dir / "collector-config.json"
        IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

        self.logger.info(
            f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}"
        )
        rr = BashUtils.run(
            f"java -jar {Environment.collector_jar} {config_file}",
            expected_return_code=0)
        if rr.stderr:
            self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
        # end if

        # 4. In some cases, save collected data to an appropriate location or database
        # TODO private info
        # On the luzhou server, for user pynie, move results to a dedicated location on /home/disk2
        if BashUtils.run("hostname").stdout.strip() == "luzhou" \
                and BashUtils.run("echo $USER").stdout.strip() == "pynie":
            alter_results_dir = Path("/home/disk2/pynie/csevo-results") / project_name
            IOUtils.rm_dir(alter_results_dir)
            IOUtils.mk_dir(alter_results_dir.parent)
            BashUtils.run(f"mv {results_dir} {alter_results_dir}")
            self.logger.info(f"Results moved to {alter_results_dir}")
        # end if

        # 5. Remove the downloaded repo
        IOUtils.rm_dir(downloads_dir)
        return
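A hypothetical usage; the collector class name and its directory attributes (repos_downloads_dir, repos_results_dir) are configured elsewhere and are not part of this snippet, and the project name and URL are illustrative only:

collector = ProjectCollector()  # placeholder name for the class that owns collect_project
collector.collect_project(
    project_name="apache_commons-lang",
    project_url="https://github.com/apache/commons-lang.git",
)
# On success, results (project.json plus the Java collector's outputs)
# end up under repos_results_dir/apache_commons-lang/.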