Пример #1
0
 def setUpClass(cls):
     """
     Extract features from the bundled JavaScript benchmark once for the whole test case.

     The feature extractor is configured with windows and parents depth of 1 and with \
     the "start_line", "reserved" and "roles" node features. The results are stored as \
     class attributes.
     """
     extractor_overrides = {
         "left_siblings_window": 1,
         "right_siblings_window": 1,
         "parents_depth": 1,
         "node_features": ["start_line", "reserved", "roles"],
     }
     user_config = merge_dicts(
         get_config(),
         {"train": {"javascript": {"feature_extractor": extractor_overrides}}})
     train_config = FormatAnalyzer._load_config(user_config)["train"]

     # The fixtures are xz-compressed and live next to this test module.
     fixtures_dir = Path(__file__).parent
     with lzma.open(str(fixtures_dir / "benchmark.js.xz"), mode="rt") as code_in:
         source_code = code_in.read()
     with lzma.open(str(fixtures_dir / "benchmark.uast.xz")) as uast_in:
         parsed_uast = bblfsh.Node.FromString(uast_in.read())
     benchmark_file = File(content=source_code.encode("utf-8"), uast=parsed_uast)

     extractor_kwargs = train_config["javascript"]["feature_extractor"]
     cls.fe = FeatureExtractor(language="javascript", **extractor_kwargs)
     # The very same file object is passed twice.
     cls.fe.extract_features([benchmark_file] * 2)
     cls.class_representations = cls.fe.composite_class_representations
     cls.n_classes = len(cls.fe.labels_to_class_sequences)
     cls.ordinal = cls.return_node_feature(FeatureId.start_line)
     cls.categorical = cls.return_node_feature(FeatureId.reserved)
     cls.bag = cls.return_node_feature(FeatureId.roles)
Пример #2
0
def evaluate_smoke_entry(
    inputpath: str,
    reportdir: str,
    database: str,
    bblfsh: str,
    config: dict,
) -> None:
    """
    CLI entry point.

    Review every repository listed in `<inputpath>/index.csv` with SmokeEvalFormatAnalyzer \
    and print the `describe()` summary of the resulting `<reportdir>/report.csv`.

    :param inputpath: Directory which contains index.csv (read columns: "repo", "style", \
                      "from", "to") and the repositories referenced by the "repo" column. \
                      Temporary files are created inside this directory.
    :param reportdir: Directory where report.csv is written. It is created if missing.
    :param database: Path to the database passed to AnalyzerContextManager. If None, \
                     a temporary file inside `inputpath` is used.
    :param bblfsh: bblfsh address, forwarded to every review request.
    :param config: Analyzer config. It is merged with the per-repository "style_name" and \
                   "report_path" values before each review.
    """
    start_time = time.time()
    report_filename = os.path.join(reportdir, "report.csv")
    log = logging.getLogger("evaluate_smoke")
    if database is None:
        # Keep the file object referenced in `db`: a NamedTemporaryFile is removed as soon
        # as it is closed, which must not happen before the function finishes.
        db = tempfile.NamedTemporaryFile(dir=inputpath,
                                         prefix="db",
                                         suffix=".sqlite3")
        database = db.name
        log.info("Database %s created", database)
    elif os.path.exists(database):
        log.info("Found existing database %s", database)
    else:
        log.info("Database %s not found and will be created.", database)
    input_dir = Path(inputpath)
    with tempfile.TemporaryDirectory(dir=inputpath) as fs:
        with AnalyzerContextManager(SmokeEvalFormatAnalyzer,
                                    db=database,
                                    fs=fs) as server:
            os.makedirs(reportdir, exist_ok=True)
            # The csv module requires newline="" so that it controls line endings itself.
            with open(report_filename, "w", newline="") as report:
                # Only the header is written here; the file is read back after the reviews.
                csv.DictWriter(
                    report,
                    fieldnames=SmokeEvalFormatAnalyzer.REPORT_COLNAMES,
                ).writeheader()
            with open(str(input_dir / "index.csv"), newline="") as index:
                reader = csv.DictReader(index)
                for row in tqdm(reader):
                    repopath = input_dir / row["repo"]
                    config_json = {
                        SmokeEvalFormatAnalyzer.name:
                        merge_dicts(config, {
                            "style_name": row["style"],
                            "report_path": reportdir,
                        })
                    }
                    server.review(fr=row["from"],
                                  to=row["to"],
                                  git_dir=str(repopath),
                                  log_level="warning",
                                  bblfsh=bblfsh,
                                  config_json=config_json)
            log.info("Quality report saved to %s", reportdir)

    report = pandas.read_csv(report_filename)
    with pandas.option_context("display.max_columns", 10,
                               "display.expand_frame_repr", False):
        print(report.describe())
    log.info("Time spent: %.3f", time.time() - start_time)
Пример #3
0
def train_from_scratch(
        config: Optional[Mapping[str, Any]] = None) -> TyposCorrector:
    """
    Build a TyposCorrector model starting from the raw data.

    Steps:
    1. Prepare the data, see :func:`prepare_data`.
    2. Train a fasttext model with :func:`train_fasttext` if the configured fasttext path \
       is None or does not exist.
    3. Create the train and test datasets, see :func:`get_datasets`.
    4. Train and evaluate the corrector, see :func:`train_and_evaluate`.
    5. Save the model if "corrector_path" is set and return it.
    :param config: Parameters for data preparation and corrector training. They are merged \
                   on top of DEFAULT_CORRECTOR_CONFIG.
    :return: Trained TyposCorrector model.
    """
    log = logging.getLogger("train_from_scratch")
    config = merge_dicts(DEFAULT_CORRECTOR_CONFIG, {} if config is None else config)
    log.info("effective config:\n%s", pformat(config, width=120, compact=True))

    preparation = config["preparation"]
    prepared_data = prepare_data(preparation)

    fasttext_config = config["fasttext"]
    fasttext_path = fasttext_config["path"]
    fasttext_is_ready = fasttext_path is not None and os.path.exists(fasttext_path)
    if not fasttext_is_ready:
        log.info("fasttext model is not found and will be trained")
        train_fasttext(prepared_data, fasttext_config)

    train_data, test_data = get_datasets(prepared_data, config["datasets"])
    vocabulary_path = os.path.join(preparation["data_dir"],
                                   preparation["vocabulary_filename"])
    frequencies_path = os.path.join(preparation["data_dir"],
                                    preparation["frequencies_filename"])
    model = train_and_evaluate(train_data, test_data, vocabulary_path, frequencies_path,
                               fasttext_config["path"], config["generation"],
                               config["ranking"], config["processes_number"])

    corrector_path = config["corrector_path"]
    if corrector_path is not None:
        model.save(corrector_path, series=0.0)
        log.info("corrector model is saved to %s", corrector_path)
    return model
Пример #4
0
    def _load_config(cls, config: Mapping[str, Any]) -> Mapping[str, Any]:
        """
        Overlay the user-defined config on top of the class default config.

        :param config: User-defined config.
        :return: Result of merging `config` into `cls.default_config`.
        """
        defaults = cls.default_config
        return merge_dicts(defaults, config)
Пример #5
0
    def _load_config(cls, config: Mapping[str, Any]) -> Mapping[str, Any]:
        """
        Overlay the user-defined config on the defaults and expand the per-language sections.

        For each of the "train" and "analyze" sections the "language_defaults" entry is \
        removed and merged with the section of every supported language.

        :param config: User-defined config.
        :return: Full config.
        :raise ValueError: If an AttributeError occurs while the language sections are merged.
        """
        languages = get_supported_languages()
        merged = merge_dicts(cls.default_config, config)
        for section_name in ("train", "analyze"):
            section = merged[section_name]
            language_defaults = section.pop("language_defaults")
            try:
                for language in languages:
                    language_overrides = section.get(language, {})
                    section[language] = merge_dicts(language_defaults, language_overrides)
            except AttributeError as e:
                message = ("Config %s can not be merged with default values config: "
                           "%s: %s" % (config, language_defaults, e))
                raise ValueError(message) from None
        return merged
Пример #6
0
def get_datasets(
    prepared_data: pandas.DataFrame,
    config: Optional[Mapping[str, Any]] = None,
    processes_number: int = DEFAULT_CORRECTOR_CONFIG["processes_number"],
) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """
    Build the train and the test datasets of typos.

    1. Sample train_size + test_size rows, weighted by frequency, among the tokens which \
       are longer than one character.
    2. Split the sample into train and test and make artificial typos in both parts.
    3. Optionally dump both parts to .csv files and return them.
    :param prepared_data: Dataframe of correct split identifiers. Must contain columns \
                          Columns.Split, Columns.Frequency and Columns.Token.
    :param config: Parameters for creating train and test datasets, options:
                   train_size: Train dataset size.
                   test_size: Test dataset size.
                   typo_probability: Probability of token corruption.
                   add_typo_probability: Probability of second corruption for a corrupted token.
                   train_path: Path to the .csv file where to save the train dataset.
                   test_path: Path to the .csv file where to save the test dataset.
    :param processes_number: Number of processes for multiprocessing.
    :return: Train and test datasets.
    """
    log = logging.getLogger("get_datasets")
    config = merge_dicts(DEFAULT_CORRECTOR_CONFIG["datasets"],
                         {} if config is None else config)

    def _corrupt(dataset: pandas.DataFrame) -> pandas.DataFrame:
        return corrupt_tokens_in_df(dataset, config["typo_probability"],
                                    config["add_typo_probability"], processes_number)

    # Sampling with replacement keeps the real distribution of the examples. The price is
    # a small chance that the same misspelling lands in both train and test; it IS small
    # because a single word can be corrupted in a great number of ways.
    long_enough = [len(token) > 1 for token in prepared_data[Columns.Token]]
    sample_size = config["train_size"] + config["test_size"]
    data = prepared_data[long_enough].sample(sample_size,
                                             weights=Columns.Frequency,
                                             replace=True)
    train, test = train_test_split(data[[Columns.Token, Columns.Split]],
                                   test_size=config["test_size"])
    for part in (train, test):
        part.reset_index(drop=True, inplace=True)
    log.info("train dataset shape: %s", train.shape)
    log.info("test dataset shape: %s", test.shape)

    train = _corrupt(train)
    test = _corrupt(test)

    test_path = config["test_path"]
    if test_path is not None:
        test.to_csv(test_path)
        log.info("test dataset is saved to %s", test_path)
    train_path = config["train_path"]
    if train_path is not None:
        train.to_csv(train_path)
        log.info("train dataset is saved to %s", train_path)
    return train, test
Пример #7
0
    def set_config(self, config: Optional[Mapping[str, Any]] = None) -> None:
        """
        Merge new values into the current ranking configuration.

        :param config: Ranking configuration, options:
                       train_rounds: Number of training rounds (int).
                       early_stopping: Early stopping parameter (int).
                       boost_param: Boosting parameters (dict).
                       None is treated as an empty config.
        """
        overrides = {} if config is None else config
        self.config = merge_dicts(self.config, overrides)
Пример #8
0
def train_fasttext(data: pandas.DataFrame, config: Optional[Mapping[str, Any]] = None) -> None:
    """
    Train fasttext model on the given dataset of code identifiers.

    Only the identifiers which consist of more than one token are sampled for training. \
    The process exits through `sys.exit` with installation instructions if the `fastText` \
    package can not be imported.

    :param data: Dataframe with columns Columns.Split and Columns.Frequency.
    :param config: Parameters for training the model, options:
                   size: Number of identifiers to pick from the given data to train fasttext on.
                   corrupt: Value indicating whether to make random artificial typos in \
                            the training data. Identifiers are corrupted with `typo_probability`.
                   typo_probability: Token corruption probability if `corrupt == True`.
                   add_typo_probability: Probability of second corruption in a corrupted token. \
                                         used if `corrupt == True`.
                   path: Path where to store the trained fasttext model.
                   dim: Number of dimensions for embeddings in the new model.
                   bucket: Number of hash buckets to keep in the fasttext model: \
                           the less there are, the more compact the model gets.
                   adjust_frequencies: Whether to divide frequencies by the number of tokens in \
                                       the identifiers. Needs to be done when the result of the \
                                       `prepare` function is used as data to have a true \
                                       identifiers distribution.
    """
    try:
        import fastText
    except ImportError:
        # Adjacent string literals are glued together as is, so the space between the two
        # sentences has to be written explicitly.
        sys.exit("Please install fastText. "
                 "Run `pip3 install git+https://github.com/facebookresearch/fastText"
                 "@51e6738d734286251b6ad02e4fdbbcfe5b679382`")
    log = logging.getLogger("train_fasttext")
    if config is None:
        config = {}
    config = merge_dicts(DEFAULT_CORRECTOR_CONFIG["fasttext"], config)
    # Number of whitespace-separated tokens in every identifier.
    tokens_number = data[Columns.Split].apply(lambda x: len(str(x).split()))
    if config["adjust_frequencies"]:
        weights = data[Columns.Frequency] / tokens_number
    else:
        weights = data[Columns.Frequency]
    train_data = data[tokens_number > 1].sample(config["size"], weights=weights, replace=True)
    if config["corrupt"]:
        train_data = corrupt_tokens_in_df(train_data, config["typo_probability"],
                                          config["add_typo_probability"])
    # fastText is given a file name, so the identifiers are dumped one per line to
    # a temporary file which lives only for the duration of the training.
    with tempfile.NamedTemporaryFile() as ids_file:
        with open(ids_file.name, "w") as f:
            for token_split in train_data[Columns.Split]:
                f.write(token_split + "\n")
        log.info("Training fasttext model...")
        model = fastText.train_unsupervised(ids_file.name, minCount=1, epoch=10,
                                            dim=config["dim"],
                                            bucket=config["bucket"])
    model.save_model(config["path"])
    log.info("fasttext model is saved to %s", config["path"])
Пример #9
0
 def _trigger_review_event(
         self, dataset_row: Dict[str, Any]) -> Sequence[TypoFix]:
     """
     Run a review for one dataset row and parse the returned comments into typo fixes.

     The elapsed wall-clock time of the review call is stored in `self._review_time`.

     :param dataset_row: Row with the "file", "commit_typo" and "repo_path" keys.
     :return: TypoFix objects built from the JSON text of every returned comment.
     """
     base_config = {} if self._config is None else self._config
     spy_overrides = {
         IdTyposAnalyzerSpy.name: {
             "filepath_to_analyze": dataset_row["file"],
         },
     }
     config = merge_dicts(base_config, spy_overrides)
     started_at = time.perf_counter()
     comments = self._analyzer_context_manager.review(
         dataset_row["commit_typo"],
         "HEAD",
         git_dir=dataset_row["repo_path"],
         bblfsh=self._bblfsh,
         log_level="info",
         config_json=config)
     self._review_time = time.perf_counter() - started_at
     fixes = []
     for comment in comments:
         payload = json.loads(comment.text)
         fixes.append(TypoFix(**payload))
     return fixes
Пример #10
0
 def test_merge_two_dicts(self):
     """
     Check merge_dicts on flat, nested and deeply nested dictionaries.

     Every case is a (first dict, second dict, expected result) triple. Each case runs \
     in its own subTest so that a failure reports the offending inputs and does not hide \
     the remaining cases.
     """
     cases = [
         # Flat dictionaries: values from the second dict win.
         ({}, {}, {}),
         ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}),
         ({"a": 1}, {"a": 2, "b": 2}, {"a": 2, "b": 2}),
         ({"a": 1, "b": 1}, {"b": 2}, {"a": 1, "b": 2}),
         # One level of nesting.
         ({"a": 1, "b": {"c": 1}}, {"b": {"c": 2}}, {"a": 1, "b": {"c": 2}}),
         ({"a": 1}, {"b": {"c": 2}}, {"a": 1, "b": {"c": 2}}),
         # Deep nesting: override a leaf, add a leaf, add a sibling branch.
         ({"a": {"b": {"c": {"d": 1}}}},
          {"a": {"b": {"c": {"d": 2}}}},
          {"a": {"b": {"c": {"d": 2}}}}),
         ({"a": {"b": {"c": {"d": 1}}}},
          {"a": {"b": {"c": {"d2": 2}}}},
          {"a": {"b": {"c": {"d": 1, "d2": 2}}}}),
         ({"a": {"b": {"c": {"d": 1}}}},
          {"a": {"b": {"c2": {"d": 2}}}},
          {"a": {"b": {"c": {"d": 1}, "c2": {"d": 2}}}}),
     ]
     for d1, d2, res in cases:
         with self.subTest(d1=d1, d2=d2):
             self.assertEqual(merge_dicts(d1, d2), res)
Пример #11
0
    def set_config(self, config: Optional[Mapping[str, Any]] = None) -> None:
        """
        Merge new values into the current candidates generation config.

        :param config: Candidates generation configuration, options:
                       neighbors_number: Number of neighbors of context and typo embeddings \
                                         to consider as candidates (int).
                       edit_dist_number: Number of the most frequent tokens among tokens at \
                                         equal edit distance from the typo to consider as \
                                         candidates (int).
                       max_distance: Maximum edit distance for symspell lookup for candidates \
                                    (int).
                       radius: Maximum edit distance from typo allowed for candidates (int).
                       max_corrected_length: Maximum length of prefix in which symspell lookup \
                                             for typos is conducted (int).
                       start_pool_size: Length of data, starting from which multiprocessing is \
                                        desired (int).
                       chunksize: Max size of a chunk for one process during multiprocessing (int).
                       None is treated as an empty config.
        """
        overrides = {} if config is None else config
        self.config = merge_dicts(self.config, overrides)
Пример #12
0
def generate_quality_report(input: str,
                            output: str,
                            force: bool,
                            bblfsh: str,
                            config: dict,
                            database: Optional[str] = None,
                            fs: Optional[str] = None) -> None:
    """
    Generate quality report for the given data. Entry point for command line interface.

    For every repository the train, model and test reports are written to \
    `<output>/<repo name>.{train,model,test}_report.md`. Summaries of the train and test \
    reports over all the repositories are written to `<output>/summary-<report name>.md`. \
    Messages logged at ERROR level are additionally written to `<output>/errors.txt`.

    :param input: csv file with repositories to make report. Should contain url, to and from \
                  columns. If a vnodes_number column is present, it is converted to int and \
                  passed to `measure_quality` as the expected number of vnodes.
    :param output: Directory where to save results. It is created if it does not exist.
    :param force: force to overwrite results stored in output directory if True. \
                  Stored results will be used if False.
    :param bblfsh: bblfsh address to use.
    :param config: config for FormatAnalyzer. It is merged with `{"aggregate": True}`.
    :param database: sqlite3 database path to store the models. "db.sqlite3" inside \
                     a temporary directory is used if not set.
    :param fs: Model repository file system root. "models" inside a temporary directory \
               is used if not set.
    :return: None. The results are written to the `output` directory.
    """
    os.makedirs(output, exist_ok=True)
    assert os.path.isdir(output), "Output should be a directory"
    log = logging.getLogger("QualityAnalyzer")
    # Duplicate the ERROR (and above) messages of this logger into errors.txt.
    # NOTE(review): the handler is never removed from the logger afterwards.
    handler = logging.handlers.RotatingFileHandler(
        os.path.join(output, "errors.txt"))
    handler.setLevel(logging.ERROR)
    log.addHandler(handler)
    if not server.exefile.exists():
        server.fetch()  # download executable
    # (repository url, report) pairs of the repositories which have all three reports.
    reports = []
    # The same port is given to both AnalyzerContextManager and measure_quality below.
    port = server.find_port()
    # Nest the config under the analyzer name and force the aggregated report mode.
    config = {
        QualityReportAnalyzer.name: merge_dicts(config, {"aggregate": True})
    }
    repositories = list(csv.DictReader(handle_input_arg(input)))
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Fall back to locations inside the temporary directory if they are not given.
        database = database if database else os.path.join(
            tmpdirname, "db.sqlite3")
        fs = fs if fs else os.path.join(tmpdirname, "models")
        os.makedirs(fs, exist_ok=True)
        with AnalyzerContextManager(QualityReportAnalyzer,
                                    port=port,
                                    db=database,
                                    fs=fs,
                                    init=False):
            start_time = datetime.now()
            for ri, row in enumerate(repositories):
                now = datetime.now()
                # Estimate the remaining time by linear extrapolation: the average time per
                # processed repository multiplied by the number of the remaining ones.
                # There is nothing to extrapolate from on the first iteration.
                if ri > 0:
                    left = (len(repositories) - ri) / ri * (now - start_time)
                else:
                    left = None
                # Progress banner: repository url, index, current time, estimated time
                # left and estimated end time.
                log.info(
                    "\n%s\n"
                    "= %-76s =\n"
                    "= %2d / %2d%s=\n"
                    "= Now:  %-60s%s=\n"
                    "= Left: %-40s%s=\n"
                    "= Ends: %-60s%s=\n"
                    "%s",
                    "=" * 80,
                    row["url"],
                    ri + 1,
                    len(repositories),
                    " " * 70,
                    now,
                    " " * 11,
                    left,
                    " " * 31,
                    now + left if left is not None else None,
                    " " * 11,
                    "=" * 80,
                )
                report_loc = os.path.join(output, get_repo_name(row["url"]))
                train_rep_loc = report_loc + ".train_report.md"
                model_rep_loc = report_loc + ".model_report.md"
                test_rep_loc = report_loc + ".test_report.md"
                # generate or read report
                try:
                    # NOTE(review): only the train and model report files are checked here.
                    # The else branch reads the test report file unconditionally, so
                    # a missing test report raises inside this try and the repository is
                    # logged as failed by the except clause below.
                    if force or not os.path.exists(train_rep_loc) or \
                            not os.path.exists(model_rep_loc):
                        # (Re)generate the reports. This step is skipped if the reports
                        # were already generated and `force` is not set.
                        vnodes_expected_number = int(row["vnodes_number"]) \
                            if "vnodes_number" in row else None
                        report = measure_quality(
                            row["url"],
                            to_commit=row["to"],
                            from_commit=row["from"],
                            port=port,
                            config=config,
                            bblfsh=bblfsh,
                            vnodes_expected_number=vnodes_expected_number)
                        # Persist only the reports which were actually produced.
                        if report.train_report is not None:
                            with open(train_rep_loc, "w",
                                      encoding="utf-8") as f:
                                f.write(report.train_report)
                        if report.model_report is not None:
                            with open(model_rep_loc, "w",
                                      encoding="utf-8") as f:
                                f.write(report.model_report)
                        if report.test_report is not None:
                            with open(test_rep_loc, "w",
                                      encoding="utf-8") as f:
                                f.write(report.test_report)
                    else:
                        # Reuse the reports stored by a previous run.
                        log.info("Found existing reports for %s in %s",
                                 row["url"], output)
                        report = QualityReport()
                        with open(train_rep_loc, encoding="utf-8") as f:
                            report.train_report = f.read()
                        with open(model_rep_loc, encoding="utf-8") as f:
                            report.model_report = f.read()
                        with open(test_rep_loc, encoding="utf-8") as f:
                            report.test_report = f.read()
                    # Only complete reports take part in the summaries.
                    if (report.train_report is not None
                            and report.model_report is not None
                            and report.test_report is not None):
                        reports.append((row["url"], report))
                    else:
                        log.warning(
                            "skipped %s: train_report %s, model_report %s, test_report %s",
                            row["url"], report.train_report is not None,
                            report.model_report is not None, report.test_report
                            is not None)
                except Exception:
                    # Best effort: a failure in one repository must not stop the others.
                    # log.exception is at ERROR level, so it also reaches errors.txt.
                    log.exception("-" * 20 + "\nFailed to process %s repo",
                                  row["url"])
                    continue

        # Summaries are generated for the train and test reports only.
        for report_name in ("train_report", "test_report"):
            summary = _generate_report_summary(reports, report_name)
            log.info("\n%s\n%s", report_name, summary)
            summary_loc = os.path.join(output, "summary-%s.md" % report_name)
            with open(summary_loc, "w", encoding="utf-8") as f:
                f.write(summary)
Пример #13
0
def prepare_data(
        config: Optional[Mapping[str, Any]] = None) -> pandas.DataFrame:
    """
    Bake the dataset for typos correction out of the raw dataset of split identifiers.

    Brief algorithm description:
    1. Load the raw dataset, downloading it first if the input file is not available.
    2. Derive the vocabulary for typos correction: the set of the most frequent tokens \
       (based on the given statistics) which are considered correctly spelled.
    3. Save the vocabulary and the frequencies of the most frequent tokens for future use.
    4. Filter the raw data, leaving the identifiers which contain only tokens from \
       the vocabulary. The result is used to create artificial misspelling cases for \
       training and testing the corrector model.
    5. Save the prepared dataset, if needed.
    :param config: Dictionary with parameters for data preparation. Used fields are:
                   data_dir: Directory to put all derived data to.
                   dataset_url: URL to download the raw dataset from.
                   input_path: Path to a .csv dump of input dataframe. Should contain \
                               column Columns.Split. If None or file doesn't exist, \
                               the dataset will be downloaded from dataset_url.
                   frequency_column: Name of the column with identifiers frequencies. If \
                                     there is no such column, every split is considered \
                                     to have frequency 1.
                   vocabulary_size: Number of most frequent tokens to take as a vocabulary.
                   frequencies_size: Number of most frequent tokens to save frequencies info \
                                     for. If not specified, frequencies for all present \
                                     tokens will be saved.
                   raw_data_filename: Name of the .csv file in data_dir to put raw dataset \
                                      in case of downloading.
                   vocabulary_filename: Name of the .csv file in data_dir to save vocabulary to.
                   frequencies_filename: Name of the .csv file in data_dir to save frequencies to.
                   prepared_filename: Name of the .csv file in data_dir to save prepared \
                                      dataset to. Nothing is saved if None.
    :return: Dataset baked for training the typos correction.
    """
    log = logging.getLogger("prepare_data")
    config = merge_dicts(DEFAULT_CORRECTOR_CONFIG["preparation"],
                         {} if config is None else config)
    data_dir = config["data_dir"]
    os.makedirs(data_dir, exist_ok=True)

    # Locate the raw dataset, fetching it when there is no usable local file.
    raw_data_path = config["input_path"]
    have_local_input = raw_data_path is not None and os.path.exists(raw_data_path)
    if not have_local_input:
        raw_data_path = os.path.join(data_dir, config["raw_data_filename"])
        log.warning("raw dataset was not found, downloading from %s to %s",
                    config["dataset_url"], raw_data_path)
        _download_url(config["dataset_url"], raw_data_path)

    data = pandas.read_csv(raw_data_path, index_col=0, keep_default_na=False)
    log.debug("raw dataset shape: %s", data.shape)
    frequency_column = config["frequency_column"]
    if frequency_column in data.columns:
        log.info("frequency column `%s` is found", frequency_column)
        data = data.rename(columns={frequency_column: Columns.Frequency})
    else:
        log.info("frequency column is not found. Set all frequencies to 1")
        data[Columns.Frequency] = 1

    def _tokenize(split):
        return split.split()

    # Repeat every row once per token of its split.
    data[Columns.Split] = data[Columns.Split].astype(str)
    log.debug("expand data by splits")
    flat_data = flatten_df_by_column(data,
                                     Columns.Split,
                                     Columns.Token,
                                     apply_function=_tokenize)
    log.debug("expanded data shape %s", flat_data.shape)

    log.info("collect statistics for tokens")
    token_frequencies = flat_data[[Columns.Frequency, Columns.Token]]
    totals = token_frequencies.groupby([Columns.Token]).sum()
    stats = totals.sort_values(by=Columns.Frequency, ascending=False)[Columns.Frequency]

    log.info("derive the new vocabulary")
    frequencies_limit = config["frequencies_size"] or len(stats)
    frequencies = stats.iloc[:frequencies_limit].to_dict()
    log.info("tokens with frequencies data size: %d", len(frequencies))
    vocabulary = stats.iloc[:config["vocabulary_size"]].to_dict()
    log.info("vocabulary size: %d", len(vocabulary))

    vocabulary_filepath = os.path.join(data_dir, config["vocabulary_filename"])
    print_frequencies(vocabulary, vocabulary_filepath)
    log.info("vocabulary saved to %s", vocabulary_filepath)
    frequencies_filepath = os.path.join(data_dir, config["frequencies_filename"])
    print_frequencies(frequencies, frequencies_filepath)
    log.info("tokens with frequencies data are saved to %s",
             frequencies_filepath)

    # Keep only the splits whose tokens all belong to the vocabulary.
    kept_columns = [Columns.Frequency, Columns.Split, Columns.Token]
    prepared_data = filter_splits(flat_data, set(vocabulary))[kept_columns]
    prepared_data.reset_index(drop=True, inplace=True)
    log.info("final dataset shape: %s", prepared_data.shape)

    prepared_filename = config["prepared_filename"]
    if prepared_filename is not None:
        prepared_data_filepath = os.path.join(data_dir, prepared_filename)
        prepared_data.to_csv(prepared_data_filepath)
        log.info("final dataset is saved to %s", prepared_data_filepath)
    return prepared_data
Пример #14
0
class ReportAnalyzer(FormatAnalyzerSpy):
    """
    Common base of the analyzers which produce reports instead of review comments.

    * analyze - generate the train report for every file, or a single aggregated train \
    report if the aggregate flag is set to True in the config, followed by the model report \
    and the test report.
    * train - train or load the model.

    Child classes have to implement:
    * generate_train_report
    * generate_test_report
    * generate_model_report (optional - by default it returns an empty string)
    """

    default_config = merge_dicts(FormatAnalyzer.default_config,
                                 {"aggregate": False})

    def generate_train_report(self, fixes: Iterable[FileFix]) -> str:
        """
        Build the report on the train dataset. Must be overridden.

        :param fixes: fixes with all required information for report generation.
        :return: Report.
        """
        raise NotImplementedError()

    def generate_model_report(self) -> str:
        """
        Build the report about the trained model. Empty by default.

        :return: Report.
        """
        return ""

    def generate_test_report(self) -> str:
        """
        Build the report on the test dataset. Must be overridden.

        :return: Report.
        """
        raise NotImplementedError()

    def analyze(self, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
                data_service: DataService, **data) -> List[Comment]:
        """
        Analyze ptr_from revision and return the reports wrapped into comments.

        Fixes with an error are skipped. Set the aggregate flag to True in the config to \
        get one train report for all files instead of one train report per file. \
        The test report is silently omitted if its generation raises ValueError.

        :param ptr_from: Git repository state pointer to the base revision.
        :param ptr_to: Git repository state pointer to the head revision. Not used.
        :param data_service: Connection to the Lookout data retrieval service.
        :param data: Contains "files" - the list of changes in the pointed state.
        :return: List of comments.
        """

        def _report_comment(text: str, filename: str = "") -> Comment:
            # Every report is attached to line 0 with the full confidence.
            return generate_comment(filename=filename,
                                    line=0,
                                    confidence=100,
                                    text=text)

        comments = []
        aggregated_fixes = []
        for fix in self.run(ptr_from, data_service):
            filepath = fix.head_file.path
            if fix.error:
                continue
            if not self.config["aggregate"]:
                file_report = self.generate_train_report(fixes=[fix])
                comments.append(_report_comment(file_report, filename=filepath))
            else:
                aggregated_fixes.append(fix)
        if self.config["aggregate"]:
            overall_report = self.generate_train_report(fixes=aggregated_fixes)
            comments.append(_report_comment(overall_report))
        comments.append(_report_comment(self.generate_model_report()))
        try:
            comments.append(_report_comment(self.generate_test_report()))
        except ValueError:
            pass
        return comments
Пример #15
0
def generate_quality_report(input: str, output: str, force: bool, bblfsh: str, config: dict,
                            database: Optional[str] = None, fs: Optional[str] = None) -> None:
    """
    Generate quality report for the given data. Entry point for command line interface.

    For every repository listed in the input csv the reports are written to
    `<output>/<repository name>-<report name>_report.md`. Afterwards the "train" and "test"
    summaries over all collected reports are written to
    `<output>/summary-<report name>_report.md`. A failure on a single repository is logged
    (errors also go to `<output>/errors.txt`) and does not interrupt the run.

    :param input: csv file with repositories to make report. Should contain url, to and from \
                  columns. If a vnodes_number column is present, its integer value is passed \
                  to the quality measurement as the expected number of vnodes.
    :param output: Directory where to save results.
    :param force: force to overwrite results stored in output directory if True. \
                  Stored results will be used if False.
    :param bblfsh: bblfsh address to use.
    :param config: config for FormatAnalyzer.
    :param database: sqlite3 database path to store the models. Temporary file is used if not set.
    :param fs: Model repository file system root. Temporary directory is used if not set.
    :return: None.
    """
    os.makedirs(output, exist_ok=True)
    assert os.path.isdir(output), "Output should be a directory"
    log = logging.getLogger("QualityAnalyzer")
    handler = logging.handlers.RotatingFileHandler(os.path.join(output, "errors.txt"))
    handler.setLevel(logging.ERROR)
    log.addHandler(handler)
    try:
        reports = []
        # The summaries below need aggregated reports, so the flag is always forced on.
        config = {QualityReportAnalyzer.name: merge_dicts(config, {"aggregate": True})}
        repositories = list(csv.DictReader(handle_input_arg(input)))
        with tempfile.TemporaryDirectory() as tmpdirname:
            database = database if database else os.path.join(tmpdirname, "db.sqlite3")
            fs = fs if fs else os.path.join(tmpdirname, "models")
            os.makedirs(fs, exist_ok=True)
            with AnalyzerContextManager(QualityReportAnalyzer, db=database, fs=fs,
                                        init=False) as context:
                for row in huge_progress_bar(repositories, log, lambda row: row["url"]):
                    path_tmpl = os.path.join(output, get_repo_name(row["url"])) + "-%s_report.md"
                    try:
                        # Recompute when forced or when none of the report files exists yet.
                        if force or not any(
                                os.path.exists(path_tmpl % name)
                                for name in QualityReportAnalyzer.get_report_names()):
                            vnodes_expected_number = int(row["vnodes_number"]) \
                                if "vnodes_number" in row else None
                            report = measure_quality(
                                row["url"], to_commit=row["to"], from_commit=row["from"],
                                context=context, config=config, bblfsh=bblfsh,
                                vnodes_expected_number=vnodes_expected_number)
                            for report_name in report:
                                with open(path_tmpl % report_name, "w", encoding="utf-8") as f:
                                    f.write(report[report_name])
                            reports.append((row["url"], report))
                        else:
                            report = {}
                            log.info("Found existing reports for %s in %s", row["url"], output)
                            for report_name in QualityReportAnalyzer.get_report_names():
                                report_path = path_tmpl % report_name
                                if not os.path.exists(report_path):
                                    # A partial set of stored reports is unusable: skip the repo.
                                    log.warning("skipped %s. %s report is missing",
                                                row["url"], report_name)
                                    break
                                with open(report_path, encoding="utf-8") as f:
                                    report[report_name] = f.read()
                            else:
                                # Reached only if every stored report was found and loaded.
                                reports.append((row["url"], report))
                    except Exception:
                        # Best effort: one broken repository must not abort the whole run.
                        log.exception("-" * 20 + "\nFailed to process %s repo", row["url"])
                        continue

            for report_name in ("train", "test"):
                summary = _generate_report_summary(reports, report_name)
                log.info("\n%s\n%s", report_name, summary)
                summary_loc = os.path.join(output, "summary-%s_report.md" % report_name)
                with open(summary_loc, "w", encoding="utf-8") as f:
                    f.write(summary)
    finally:
        # Detach and close the file handler: otherwise the errors.txt handle leaks and every
        # further call stacks one more handler on the shared "QualityAnalyzer" logger.
        log.removeHandler(handler)
        handler.close()
Пример #16
0
class QualityReportAnalyzer(ReportAnalyzer):
    """
    Generate basic quality reports for the model.

    * analyze - generate report for all files. If you want only aggregated report set aggregate
    flag to True in analyze config.
    * train - train or load the model.

    It is possible to run this analyzer independently and query it with lookout-sdk.
    If you want to use pretrained model it is possible to specify it in config, for example:
    `--config-json='{"style.format.analyzer.FormatAnalyzer": {"model": "/saved/model.asdf"}}`
    Otherwise model will be trained with `FormatAnalyzer.train()`

    Usage examples:
    1) Launch analyzer: `analyzer run lookout.style.format.quality_report_analyzer -c config.yml`
    2) Query analyzer
    2.1) Get one quality report per file for pretrained model /saved/model.asdf:
    ```
    lookout-sdk review ipv4://localhost:2000 --git-dir /git/dir/ --from REV1 --to REV2 \
    --config-json='{"style.format.analyzer.FormatAnalyzer": {"model": "/saved/model.asdf"}}'
    ```
    2.2) Get aggregated quality report for all files without pretrained model
    ```
    lookout-sdk review ipv4://localhost:2000 --git-dir /git/dir/ --from REV1 --to REV2 \
    --config-json='{"style.format.analyzer.FormatAnalyzer": {"aggregate": true}}'
    ```
    """

    version = 1
    description = "Source code formatting quality report generator: " \
                  "whitespace, new lines, quotes, etc."
    # Extends the base report config with the file limit for the train report and the
    # default test dataset ratio used for training.
    default_config = merge_dicts(
        ReportAnalyzer.default_config, {
            "max_files": 10,
            "train": {
                "language_defaults": {
                    "test_dataset_ratio": 0.2
                }
            },
        })

    @classmethod
    def get_report_names(cls) -> Tuple[str, str, str]:
        """
        Get all available report names.

        :return: Tuple with report names.
        """
        return "model", "train", "test"

    def generate_reports(self, fixes: Iterable[FileFix]) -> Dict[str, str]:
        """
        Generate model train and test reports.

        Model report generated only if config["aggregate"] is True.
        The train report is omitted (a warning is logged) when its generation raises ValueError.

        :param fixes: List of fixes per file or for all files if config["aggregate"] is True.
        :return: Ordered dictionary with report names as keys and report string as values.
        """
        reports = OrderedDict()  # to keep reports order.
        if self.config["aggregate"]:
            reports["model"] = self.generate_model_report()
        try:
            reports["train"] = self.generate_train_report(fixes)
        except ValueError as e:
            self._log.warning("Train report generation failed. %s", e.args[0])
        reports["test"] = self.generate_test_report()
        return reports

    def generate_model_report(self) -> str:
        """
        Generate report about the trained model.

        :return: report.
        """
        return generate_model_report(model=self.model,
                                     analyze_config=self.analyze_config)

    def generate_train_report(self, fixes: Iterable[FileFix]) -> str:
        """
        Generate train report: classification report, confusion matrix, files with most errors.

        :param fixes: Fixes to build the report from. The iterable is consumed once.
        :return: report.
        :raises ValueError: if there are no fixes.
        """
        fixes = list(fixes)
        if not fixes:
            raise ValueError("There are no fixes for %s" % self.model.dump())
        vnodes = chain.from_iterable(fix.file_vnodes for fix in fixes)
        # numpy.hstack requires a real sequence: generators are deprecated since NumPy 1.16
        # and rejected with TypeError by newer releases.
        ys = numpy.hstack([fix.y for fix in fixes])
        y_pred_pure = numpy.hstack([fix.y_pred_pure for fix in fixes])
        report = get_classification_report(
            y_pred_pure, ys,
            fixes[0].feature_extractor.composite_class_representations)
        # FIXME(vmarkovtsev): we are taking the first fix here which does not work for >1 language
        return generate_quality_report(fixes[0].language,
                                       report,
                                       self.model.ptr,
                                       vnodes,
                                       self.config["max_files"],
                                       name="Train")

    def generate_test_report(self) -> str:
        """
        Generate report on the test dataset.

        Only the first language yielded by the model is reported.

        :return: Report.
        :raises ValueError: if the test classification report of that language is empty.
        """
        for lang in self.model:
            classification_report = self.model[lang].classification_report[
                "test"]
            if not classification_report:
                raise ValueError(
                    "Test classification report is unavailable for language %s. Skipping."
                    % lang)
            # NOTE(review): returns inside the loop, so further languages are never reported,
            # and a model without languages falls through and implicitly returns None.
            return generate_quality_report(lang,
                                           classification_report,
                                           self.model.ptr, [],
                                           0,
                                           name="Test")
Пример #17
0
 def test_merge_three_dicts(self):
     """Merging three dicts keeps the value of the last one for the shared nested key."""
     first = {"a": 1, "b": {"c": 1}}
     second = {"b": {"c": 2}}
     third = {"b": {"c": 3}, "d": 4}
     expected = {"a": 1, "b": {"c": 3}, "d": 4}
     merged = merge_dicts(first, second, third)
     self.assertEqual(merged, expected)
Пример #18
0
def prepare_data(
        params: Optional[Mapping[str, Any]] = None) -> pandas.DataFrame:
    """
    Generate all the necessary data from the raw dataset of split identifiers.

    Brief algorithm description:
    1. Load the raw dataset from a .csv file, downloading it first if no usable input file
       is given.
    2. Repeat every row once per token of its split and sum the frequencies per token.
    3. Derive the vocabulary for typos correction: the set of the most frequent tokens, which
       are considered correctly spelled. All typos corrections will belong to the vocabulary.
    4. Hand the vocabulary and the frequencies of the requested amount of most frequent tokens
       to `print_frequencies` together with their target paths inside data_dir.
    5. Filter the expanded data by the vocabulary with `filter_splits`. The result is meant to
       be used for creating artificial misspelling cases for training and testing the
       corrector model.

    :param params: Dictionary with parameters for data preparation, merged over \
                   `defaults_for_preparation`. Used fields are:
                   data_dir: Directory to put all derived data to.
                   dataset_url: URL the raw dataset is downloaded from when needed.
                   input_path: Path to a .csv dump of input dataframe. Should contain \
                               column Columns.Split. If None or file doesn't exist, \
                               the dataset is downloaded from dataset_url.
                   frequency_column: Name of column with identifiers frequencies. If the \
                                     column is absent, every split is considered to have \
                                     frequency 1.
                   vocabulary_size: Number of most frequent tokens to take as a vocabulary.
                   frequencies_size: Number of most frequent tokens to keep frequencies for. \
                                     If it is falsy, the frequencies of all present tokens \
                                     are kept.
                   raw_data_filename: Name of .csv file in data_dir to put raw dataset in case \
                                      of downloading.
                   vocabulary_filename: Name of the file in data_dir for the vocabulary.
                   frequencies_filename: Name of the file in data_dir for the frequencies.
    :return: Dataset baked for training the typos correction.
    """
    settings = (deepcopy(defaults_for_preparation) if params is None
                else merge_dicts(defaults_for_preparation, params))

    # Fall back to downloading when there is no readable local dump.
    csv_path = settings["input_path"]
    if not (csv_path is not None and os.path.exists(csv_path)):
        csv_path = os.path.join(settings["data_dir"],
                                settings["raw_data_filename"])
        _download_url(settings["dataset_url"], csv_path)

    frame = pandas.read_csv(csv_path, index_col=0)
    if settings["frequency_column"] in frame.columns:
        frame = frame.rename(
            columns={settings["frequency_column"]: Columns.Frequency})
    else:
        frame[Columns.Frequency] = 1

    # One row per token: every split is broken on whitespace and its row is repeated.
    frame[Columns.Split] = frame[Columns.Split].astype(str)
    exploded = flatten_df_by_column(frame,
                                    Columns.Split,
                                    Columns.Token,
                                    apply_function=lambda split: split.split())

    # Total frequency per token, most frequent tokens first.
    token_frequencies = exploded[[Columns.Frequency, Columns.Token]]
    stats = token_frequencies.groupby([Columns.Token]).sum().sort_values(
        by=Columns.Frequency, ascending=False)

    frequencies_limit = settings["frequencies_size"] or len(stats)
    frequencies_tokens = set(stats.index[:frequencies_limit])
    vocabulary_tokens = set(stats.index[:settings["vocabulary_size"]])

    vocabulary_target = os.path.join(settings["data_dir"],
                                     settings["vocabulary_filename"])
    print_frequencies(vocabulary_tokens, stats, vocabulary_target)
    frequencies_target = os.path.join(settings["data_dir"],
                                      settings["frequencies_filename"])
    print_frequencies(frequencies_tokens, stats, frequencies_target)

    # Keep only the data accepted by the vocabulary filter and the columns of interest.
    kept_columns = [Columns.Frequency, Columns.Split, Columns.Token]
    return filter_splits(exploded, vocabulary_tokens)[kept_columns]
Пример #19
0
class ReportAnalyzer(FormatAnalyzerSpy):
    """
    Common foundation for the analyzers which produce reports.

    * analyze - build the reports for every file. Set the aggregate flag to True in the
    analyze config to get the reports over all files at once instead.
    * train - train or load the model.

    Child classes are required to implement 2 methods:
    * get_report_names
    * generate_reports
    """

    # Reports are generated per file unless "aggregate" is switched on.
    default_config = merge_dicts(
        FormatAnalyzer.default_config,
        {"aggregate": False},
    )

    @classmethod
    def get_report_names(cls) -> Tuple[str, ...]:
        """
        Return the names of all the reports this analyzer can produce.

        Must be overridden by child classes.

        :return: Tuple of report names.
        """
        raise NotImplementedError()

    def generate_reports(self, fixes: Iterable[FileFix]) -> Dict[str, str]:
        """
        Build the reports from the given fixes. Must be overridden by child classes.

        :param fixes: Fixes of a single file, or of all files if config["aggregate"] is True.
        :return: Mapping from report name to report text.
        """
        raise NotImplementedError()

    def analyze(self, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
                data_service: DataService, **data) -> List[Comment]:
        """
        Generate report comments for the files of the ptr_from revision.

        Without aggregation every file without an error gets its own reports, attached to its
        path. With config["aggregate"] set, one set of reports is built from all the fixes
        without errors and attached to an empty file name.

        :param ptr_from: Git repository state pointer to the base revision.
        :param ptr_to: Git repository state pointer to the head revision. Not used.
        :param data_service: Connection to the Lookout data retrieval service.
        :param data: Contains "files" - the list of changes in the pointed state.
        :return: List of comments.
        """
        def reports_as_comments(file_fixes: List[FileFix], path: str) -> List[Comment]:
            # Every generated report becomes one comment with line 0 and confidence 100.
            generated = self.generate_reports(fixes=file_fixes)
            return [
                generate_comment(filename=path, line=0, confidence=100, text=text)
                for text in generated.values()
            ]

        if self.config["aggregate"]:
            healthy = [
                fix for fix in self.run(ptr_from, data_service)
                if not fix.error
            ]
            return reports_as_comments(healthy, "")

        result = []
        for fix in self.run(ptr_from, data_service):
            path = fix.head_file.path
            if fix.error:
                continue
            result += reports_as_comments([fix], path)
        return result