Пример #1
0
def test_repro_with_stages(subprocess_mock):
    """Explicitly listed stages must be appended to the ``dvc repro`` command."""
    project_root = Path("/project_root")
    path_config = PathConfig(project_root, Path("/software-path"))

    repro(["stage1", "stage2"], path_config=path_config)

    expected_command = ["dvc", "repro", "stage1", "stage2"]
    subprocess_mock.check_output.assert_called_with(
        expected_command,
        cwd=project_root,
        encoding="utf8",
    )
Пример #2
0
def test_repro_force(subprocess_mock):
    """``force=True`` must add the ``--force`` flag to the ``dvc repro`` command."""
    project_root = Path("/project_root")
    path_config = PathConfig(project_root, Path("/software-path"))

    repro(force=True, path_config=path_config)

    expected_command = ["dvc", "repro", "--force"]
    subprocess_mock.check_output.assert_called_with(
        expected_command,
        cwd=project_root,
        encoding="utf8",
    )
Пример #3
0
def refresh_if_necessary(path_config: Optional[PathConfig] = None) -> None:
    """Refresh the workspace unless the bohr config is reported as up to date.

    :param path_config: paths to operate on; obtained from
        ``PathConfig.load()`` when omitted.
    """
    if not path_config:
        path_config = PathConfig.load()

    if bohr_up_to_date(path_config):
        logger.info("Bohr config hasn't changed.")
        return

    logger.info(
        "There are changes to the bohr config. Refreshing the workspace..."
    )
    refresh(path_config)
Пример #4
0
 def load(project_root: Optional[AbsolutePath] = None) -> "AppConfig":
     """Build an ``AppConfig`` from the config file of a project.

     :param project_root: root of the project whose config is read; located
         with ``find_project_root()`` when omitted.
     :return: ``AppConfig`` carrying the ``verbose`` flag and the path config
         loaded for the same project root.
     """
     project_root = project_root or find_project_root()
     config_dict = load_config_dict_from_file(project_root)
     try:
         # Only the literal strings "true" / "True" switch verbosity on.
         verbose = config_dict["core"]["verbose"] in ("true", "True")
     except KeyError:
         # Missing "core" section or "verbose" key means non-verbose.
         verbose = False
     # Pass the resolved root on so that an explicitly given project_root is
     # honoured for the path config too, instead of being re-discovered.
     return AppConfig(verbose, PathConfig.load(project_root))
def combine_applied_heuristics(task: Task,
                               path_config: Optional[PathConfig] = None
                               ) -> None:
    """Merge the per-group heuristic matrices of a task and analyse the result.

    For every dataset of ``task`` the pickled matrices of all heuristic groups
    are concatenated column-wise, the combined matrix is pickled, and
    ``run_analysis`` / ``calculate_metrics`` are run on it. The computed
    metrics are pretty-printed.

    :param task: task whose datasets and heuristic groups are processed.
    :param path_config: paths to use; obtained from ``PathConfig.load()``
        when omitted.
    :raises ValueError: if the same heuristic (column name) occurs in more
        than one of the concatenated matrices.
    """
    path_config = path_config or PathConfig.load()
    task_dir_generated = path_config.generated / task.name
    for dataset_name, dataset in task.datasets.items():
        matrix_file_name = f"heuristic_matrix_{dataset_name}.pkl"
        matrix_list = []
        all_heuristics = []
        for heuristic_module_path in task.heuristic_groups:
            partial_heuristics_file = (task_dir_generated /
                                       heuristic_module_path /
                                       matrix_file_name)
            matrix_list.append(pd.read_pickle(str(partial_heuristics_file)))
            all_heuristics.extend(
                load_heuristics_from_module(task.top_artifact,
                                            heuristic_module_path))
        labeling_functions = to_labeling_functions(all_heuristics,
                                                   dataset.mapper, task.labels)
        all_heuristics_matrix = pd.concat(matrix_list, axis=1)

        # Report the first repeated column name; the mask marks every
        # occurrence of a name after its first one.
        column_names = all_heuristics_matrix.columns
        duplicate_mask = column_names.duplicated()
        if duplicate_mask.any():
            first_duplicate = column_names[duplicate_mask][0]
            raise ValueError(
                f"Duplicate heuristics are present: {first_duplicate}")

        all_heuristics_matrix.to_pickle(
            str(task_dir_generated / matrix_file_name))
        artifact_df = dataset.load()
        # Ground-truth labels are optional: pass None when the column is absent.
        label_series = (artifact_df[task.label_column_name]
                        if task.label_column_name in artifact_df.columns else
                        None)
        task_dir_metrics = path_config.metrics / task.name
        save_csv_to = task_dir_generated / f"analysis_{dataset_name}.csv"
        save_json_to = task_dir_metrics / f"analysis_{dataset_name}.json"
        save_metrics_to = (task_dir_metrics /
                           f"heuristic_metrics_{dataset_name}.json")

        run_analysis(
            all_heuristics_matrix.to_numpy(),
            labeling_functions,
            save_csv_to,
            save_json_to,
            label_series,
        )

        stats = calculate_metrics(
            all_heuristics_matrix.to_numpy(),
            labeling_functions,
            label_series,
            save_to=save_metrics_to,
        )

        pprint(stats)
Пример #6
0
def test_pull(subprocess_mock):
    """The given dataset paths must be appended to the ``dvc pull`` command."""
    project_root = Path("/project_root")
    dataset_paths = ["path/to/dataset1.csv", "path/to/dataset2.csv"]

    pull(dataset_paths, PathConfig(project_root, Path("/software-path")))

    expected_command = [
        "dvc",
        "pull",
        "path/to/dataset1.csv",
        "path/to/dataset2.csv",
    ]
    subprocess_mock.check_output.assert_called_with(
        expected_command,
        cwd=project_root,
        encoding="utf8",
    )
Пример #7
0
def save_transient_stages_to_config(
        transient_stages: List[str],
        path_config: Optional[PathConfig] = None) -> None:
    """Write the transient stage names to ``.bohr/transient_stages.json``.

    :param transient_stages: stage names to store, serialised as a JSON list.
    :param path_config: paths to use; obtained from ``PathConfig.load()``
        when omitted. The file is written under ``project_root / ".bohr"``.
    """
    path_config = path_config or PathConfig.load()
    conf_dir = path_config.project_root / ".bohr"
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    conf_dir.mkdir(exist_ok=True)
    transient_stages_file = conf_dir / "transient_stages.json"
    with transient_stages_file.open("w") as f:
        json.dump(transient_stages, f)
Пример #8
0
def add(
    path: Path,
    artifact: str,
    name: Optional[str] = None,
    author: Optional[str] = None,
    description: Optional[str] = "",
    format: Optional[str] = None,
    preprocessor: Optional[str] = None,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> Dataset:
    """Register a new dataset: copy it, track it with dvc and record it in the repo.

    All validation (dataset name, artifact mapper) is done before the file is
    copied and added to dvc, so a rejected request leaves no files behind.

    :param path: file to add; only ``path.name`` is used for the destination.
    :param artifact: key of ``artifact_map`` or, failing that, a full class
        path handed to ``load_class_by_full_path``.
    :param name: dataset name; derived from the file name when omitted.
    :param author: stored on the created ``Dataset``.
    :param description: stored on the created ``Dataset``.
    :param format: file format; extracted from the file name when omitted.
    :param preprocessor: preprocessor name; extracted from the file name
        when omitted.
    :param bohr_repo: repo to update; loaded from the project root when omitted.
    :param path_config: paths to use; obtained from ``PathConfig.load()``
        when omitted.
    :return: the created ``Dataset``.
    :raises ValueError: if a dataset with the resulting name already exists.
    """
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)

    # Resolve the dataset name and reject duplicates before touching any file.
    file_name = path.name
    if preprocessor is None:
        file_name, preprocessor = extract_preprocessor_from_file_name(
            file_name)
    if format is None:
        file_name, format = extract_format_from_file_name(file_name)
    dataset_name = name or file_name
    if dataset_name in bohr_repo.datasets:
        message = f"Dataset with name {dataset_name} already exists."
        if name is None:
            message += (
                "\nAre you trying to add the same dataset twice?\n"
                "If not, please specifying the `name` parameter explicitly.")
        raise ValueError(message)

    # Known artifact names map to default mappers; anything else is treated
    # as a class path.
    try:
        mapper = default_mappers[artifact_map[artifact]]
    except KeyError:
        mapper = load_class_by_full_path(artifact)

    destination_path = path_config.downloaded_data / path.name
    logger.info(f"Copying {path.name} to {destination_path} ...")
    shutil.copy(path, destination_path)
    dvc_output = dvc.add(destination_path, path_config.project_root)
    logger.info(dvc_output)

    path_preprocessed: RelativePath = get_preprocessed_path(
        None,
        relative_to_safe(destination_path, path_config.downloaded_data),
        path_config.data_dir,
        preprocessor,
    )
    dataset = Dataset(
        dataset_name,
        author,
        description,
        path_preprocessed=path_preprocessed,
        path_dist=path_config.downloaded_data_dir / path.name,
        dataloader=CsvDatasetLoader(path_preprocessed, mapper()),
        preprocessor=preprocessor,
    )
    bohr_repo.datasets[dataset.name] = dataset
    bohr_repo.dump(path_config.project_root)
    repro(bohr_repo=bohr_repo, path_config=path_config)
    return dataset
Пример #9
0
def create_directories_if_necessary(
        bohr_repo: Optional[BohrRepo] = None) -> None:
    """Create the output directories used by the tasks of a bohr repo.

    For every heuristic group of every task a directory is created under both
    the ``generated`` and the ``metrics`` path; the ``labeled_data`` directory
    is created as well. Existing directories are left as they are.

    :param bohr_repo: repo whose tasks are inspected; loaded with
        ``load_bohr_repo()`` when omitted.
    """
    bohr_repo = bohr_repo or load_bohr_repo()
    path_config = PathConfig.load()
    for task in bohr_repo.tasks.values():
        for heuristic_group in task.heuristic_groups:
            for base_dir in (path_config.generated, path_config.metrics):
                group_dir = base_dir / task.name / heuristic_group
                group_dir.mkdir(exist_ok=True, parents=True)
    path_config.labeled_data.mkdir(exist_ok=True, parents=True)
Пример #10
0
def pull(task: str, target: str, verbose: bool = False):
    """Command-line entry point: pull a labeled dataset and log where it is.

    :param task: task name, forwarded to ``api.pull``.
    :param target: dataset name, forwarded to ``api.pull``.
    :param verbose: passed to the ``verbosity`` context manager.

    When ``BohrDatasetNotFound`` is raised the error is logged (with the
    traceback only at DEBUG level) and ``exit(404)`` is called.
    """
    try:
        with verbosity(verbose):
            path_config = PathConfig.load()
            refresh_if_necessary(path_config)
            relative_path = api.pull(task, target, path_config=path_config)
            dataset_location = path_config.project_root / relative_path
            logger.info(
                f"The dataset is available at {dataset_location}"
            )
    except BohrDatasetNotFound as ex:
        show_traceback = logger.getEffectiveLevel() == logging.DEBUG
        logger.error(ex, exc_info=show_traceback)
        exit(404)
Пример #11
0
def parse_labels(path_config: Optional[PathConfig] = None) -> None:
    """Render ``resources/labels.template`` and write the result to ``labels.py``.

    The template receives, as ``hierarchies``, the flattened entries of every
    label tree loaded from ``path_config.labels``. The output file is opened
    by its bare name, i.e. relative to the current working directory.

    :param path_config: paths to use; obtained from ``PathConfig.load()``
        when omitted.
    """
    path_config = path_config or PathConfig.load()
    label_tree_list = load_label_tree(path_config.labels)
    from jinja2 import Environment

    templates_root = Path(__file__).parent.parent
    env = Environment(loader=FileSystemLoader(templates_root))
    template = env.get_template("resources/labels.template")

    hierarchies = []
    for label_tree in label_tree_list:
        hierarchies.extend(label_tree.flatten())

    rendered = template.render(hierarchies=hierarchies)
    with open("labels.py", "w") as f:
        f.write(rendered)
Пример #12
0
def add(
    name: str,
    artifact: str,
    labels: str,
    label_column: str,
    authors: str,
    description: str,
    use_all_datasets: bool,
    repro: bool,
    force: bool,
    verbose: bool,
) -> None:
    """Command-line entry point: define a task and store it in the bohr repo.

    :param name: task name; an existing task is only replaced with ``force``.
    :param artifact: key looked up in ``artifact_map``.
    :param labels: comma-separated label names; surrounding whitespace of
        each entry is stripped.
    :param label_column: passed to ``train_and_test`` and stored on the task.
    :param authors: stored on the task.
    :param description: stored on the task.
    :param use_all_datasets: when set, every repo dataset with the task's
        artifact type is split into train and test sets by
        ``train_and_test``; otherwise the task starts without datasets.
    :param repro: when set, ``api.repro`` is run for the new task.
    :param force: allow overwriting an already defined task.
    :param verbose: passed to the ``verbosity`` context manager.

    Calls ``exit(400)`` for an already defined task (without ``force``) and
    ``exit(404)`` for an unknown artifact.
    """
    with verbosity(verbose):
        project_root = find_project_root()
        bohr_repo = load_bohr_repo(project_root)
        path_config = PathConfig.load(project_root)

        already_defined = name in bohr_repo.tasks
        if already_defined and not force:
            logger.error(f"Task {name} is already defined")
            exit(400)

        try:
            artifact_type = artifact_map[artifact]
        except KeyError:
            logger.error(f"Artifact not found: {artifact}")
            exit(404)

        label_list = [label.strip() for label in labels.split(",")]

        if use_all_datasets:
            matching_datasets = {
                dataset_name: dataset
                for dataset_name, dataset in bohr_repo.datasets.items()
                if dataset.artifact_type == artifact_type
            }
            train_datasets, test_datasets = train_and_test(
                matching_datasets, label_column)
        else:
            train_datasets, test_datasets = {}, {}

        heuristic_groups = get_heuristic_module_list(
            artifact_type, path_config.heuristics
        )
        new_task = Task(
            name,
            authors,
            description,
            artifact_type,
            label_list,
            train_datasets,
            test_datasets,
            label_column,
            heuristic_groups,
        )
        bohr_repo.tasks[name] = new_task
        bohr_repo.dump(project_root)

        if repro:
            logger.info("Re-running the pipeline ...")
            api.repro(name, bohr_repo=bohr_repo)
Пример #13
0
def train_label_model(task: str, target_dataset: str):
    """Train the label model for a task and store its metrics as JSON.

    :param task: name of a task in the bohr repo.
    :param target_dataset: name of a dataset in the bohr repo.

    The metrics returned by the pipeline function are written to
    ``label_model_metrics.json`` in the task's metrics directory and
    pretty-printed.
    """
    # Imported locally; inside this function the name refers to the pipeline
    # implementation, not to this wrapper.
    from bohr.pipeline.train_label_model import train_label_model

    setup_loggers()
    bohr_repo = load_bohr_repo()
    path_config = PathConfig.load()
    task_object = bohr_repo.tasks[task]
    dataset_object = bohr_repo.datasets[target_dataset]
    stats = train_label_model(task_object, dataset_object, path_config)

    metrics_file = (path_config.metrics / task_object.name /
                    "label_model_metrics.json")
    with open(metrics_file, "w") as f:
        json.dump(stats, f)
    pprint(stats)
Пример #14
0
    def __init__(
        self,
        task: str,
        labeled_dataset_name: str,
        rev: Optional[str] = "master",
        force_update: bool = False,
    ):
        """Load a labeled dataset at an old revision and now, and combine them.

        :param task: task name; part of the labeled dataset path.
        :param labeled_dataset_name: dataset whose ``.labeled.csv`` is read.
        :param rev: revision handed to ``get_path_to_revision``.
        :param force_update: handed to ``get_path_to_revision``.

        Sets ``self.labeled_dataset_name``, ``self.is_test_set`` (whether the
        old frame has a ``bug`` column) and ``self.combined_df``.
        """
        path_config = PathConfig.load()
        path_to_old_revision = get_path_to_revision(path_config.project_root,
                                                    rev, force_update)
        self.labeled_dataset_name = labeled_dataset_name
        labeled_dataset_path = (path_config.labeled_data_dir / task /
                                f"{labeled_dataset_name}.labeled.csv")

        # Logging up to WARNING is silenced while the files are read; the
        # finally clause re-enables it even if reading fails.
        logging.disable(logging.WARNING)
        try:
            with dvc.api.open(labeled_dataset_path, path_to_old_revision) as f:
                old_df = pd.read_csv(f)
            with dvc.api.open(labeled_dataset_path) as f:
                new_df = pd.read_csv(f)
        finally:
            logging.disable(logging.NOTSET)

        self.is_test_set = "bug" in old_df.columns

        old_prob = "prob_CommitLabel.BugFix"
        new_prob = "prob_CommitLabel.BugFix_new"

        old_df_columns = [old_prob]
        if self.is_test_set:
            old_df_columns.append("bug")
        self.combined_df = pd.concat(
            [
                old_df[old_df_columns],
                new_df[old_prob].rename(new_prob),
            ],
            axis=1,
        )
        if self.is_test_set:
            # (bug * 2 - 1) flips the sign of the probability change for
            # rows where "bug" is 0.
            self.combined_df.loc[:, "improvement"] = (
                self.combined_df[new_prob] -
                self.combined_df[old_prob]) * (
                    self.combined_df["bug"] * 2 - 1)

        # Distance of the new probability from 0.5, rescaled by 2.
        self.combined_df.loc[:, "certainty"] = (
            np.abs(self.combined_df[new_prob] - 0.5) * 2)
        if self.is_test_set:
            self.combined_df.loc[:, "precision"] = 1 - np.abs(
                self.combined_df[new_prob] -
                self.combined_df["bug"])

        self.combined_df = pd.concat([self.combined_df, old_df["message"]],
                                     axis=1)
        if "url" in old_df.columns:
            self.combined_df["url"] = old_df["url"]
Пример #15
0
def deserialize_bohr_repo(dct,
                          cls,
                          path_config: Optional[PathConfig] = None,
                          **kwargs) -> BohrRepo:
    """
    Build a ``BohrRepo`` from its dictionary form.

    Datasets are loaded first, then the dataset linkers (which receive the
    loaded datasets), then the tasks (which receive the loaded datasets too).

    >>> jsons.loads('{"bohr_framework_version": 0.1, "tasks": {}, "datasets": {}, "dataset-linkers": {}}', BohrRepo, \
path_config={'project_root': '/'})
    BohrRepo(bohr_framework_version=0.1, tasks={}, datasets={}, linkers=[])
    """
    path_config = path_config or PathConfig.load()

    datasets: Dict[str, Dataset] = {}
    for dataset_name, dataset_object in dct["datasets"].items():
        datasets[dataset_name] = jsons.load(
            dataset_object,
            Dataset,
            dataset_name=dataset_name,
            downloaded_data_dir=path_config.downloaded_data_dir,
            data_dir=path_config.data_dir,
        )

    linkers = []
    for dataset_linker_obj in dct["dataset-linkers"]:
        linkers.append(
            jsons.load(
                dataset_linker_obj,
                DatasetLinker,
                datasets=datasets,
                data_dir=path_config.data_dir,
            ))

    # Every mapper starts without linkers ...
    for dataset in datasets.values():
        dataset.mapper.linkers = []

    # ... and the mapper of each linker's source dataset receives the full
    # linker list.
    # NOTE(review): the whole list is assigned, not only the linkers that
    # start at that dataset -- confirm this is intended.
    for linker in linkers:
        linker.from_.mapper.linkers = linkers

    tasks = dict()
    for task_name, task_json in dct["tasks"].items():
        tasks[task_name] = jsons.load(
            task_json,
            Task,
            task_name=task_name,
            heuristic_path=path_config.heuristics,
            datasets=datasets,
        )

    return BohrRepo(
        dct["bohr_framework_version"],
        tasks,
        datasets,
        linkers,
    )
Пример #16
0
def add_dataset(
    task: Task,
    dataset: Dataset,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> Dataset:
    """Attach a dataset to a task and dump the bohr repo.

    The dataset is added as a test set when it contains the task's label
    column, otherwise as a train set.

    :param task: task that receives the dataset.
    :param dataset: dataset to add.
    :param bohr_repo: repo to dump; loaded from the project root when omitted.
    :param path_config: paths to use; obtained from ``PathConfig.load()``
        when omitted.
    :return: the dataset that was passed in.
    """
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)

    is_test_set = dataset.is_column_present(task.label_column_name)
    role = "test" if is_test_set else "train"
    logger.info(f"Adding dataset {dataset.name} as a {role} set")

    task.add_dataset(dataset, is_test_set)
    bohr_repo.dump(path_config.project_root)
    return dataset
Пример #17
0
def pull(
    task: str,
    target: str,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> RelativePath:
    """Pull the labeled version of a dataset with dvc.

    :param task: task name; part of the labeled dataset path.
    :param target: dataset name; the file is ``<target>.labeled.csv``.
    :param bohr_repo: repo used to list the available names in error
        messages; loaded from the project root when omitted.
    :param path_config: paths to use; obtained from ``PathConfig.load()``
        when omitted.
    :return: the path of the pulled file, built from ``labeled_data_dir``.
    :raises BohrDatasetNotFound: if the labeled dataset path does not exist,
        including the case of an unknown task.
    """
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)

    # NOTE(review): the path is built from ``labeled_data_dir`` and checked
    # as-is, so a relative value resolves against the current working
    # directory -- confirm that is the intended base.
    path = path_config.labeled_data_dir / task / f"{target}.labeled.csv"
    if path.exists():
        logger.info(dvc.pull([str(path)]))
        return path

    # Report an unknown task with the same exception type instead of letting
    # the task lookup below fail with a bare KeyError.
    if task not in bohr_repo.tasks:
        raise BohrDatasetNotFound(
            f"Task {task} not found! Available tasks: {list(bohr_repo.tasks.keys())}"
        )
    raise BohrDatasetNotFound(
        f"Dataset {target} in task {task} not found! Available datasets in this task: {list(bohr_repo.tasks[task].datasets.keys())}"
    )
def train_label_model(
        task: Task,
        target_dataset: Dataset,
        path_config: Optional[PathConfig] = None) -> Dict[str, Any]:
    """Fit a label model on a heuristic matrix and evaluate it on the test sets.

    The fitted model is saved as ``label_model.pkl`` and its ``mu`` values as
    ``label_model_weights.csv`` in the task's ``generated`` directory.

    :param task: task whose test datasets are used for evaluation.
    :param target_dataset: dataset whose heuristic matrix is used for fitting.
    :param path_config: paths to use; obtained from ``PathConfig.load()``
        when omitted.
    :return: metrics collected from ``calculate_metrics`` over all test sets.
    :raises GroundTruthColumnNotFound: if a test set lacks the task's label
        column.
    """
    path_config = path_config or PathConfig.load()

    task_dir_generated = path_config.generated / task.name
    if not task_dir_generated.exists():
        task_dir_generated.mkdir()

    matrix_file = (task_dir_generated /
                   f"heuristic_matrix_{target_dataset.name}.pkl")
    lines_train = pd.read_pickle(str(matrix_file))
    label_model = fit_label_model(lines_train.to_numpy())
    label_model.save(str(task_dir_generated / "label_model.pkl"))
    label_model.eval()

    label_model_weights_file = task_dir_generated / "label_model_weights.csv"

    # One row of four values per heuristic (rows are indexed by the matrix
    # columns).
    weights = label_model.mu.cpu().detach().numpy().reshape(-1, 4)
    weights_df = pd.DataFrame(
        weights,
        columns=["00", "01", "10", "11"],
        index=lines_train.columns,
    )
    weights_df.to_csv(label_model_weights_file, index_label="heuristic_name")

    stats = {}
    for test_set_name, test_set in task._test_datasets.items():
        test_df = test_set.load()
        if task.label_column_name not in test_df.columns:
            raise GroundTruthColumnNotFound(
                f"Dataset {test_set_name} is added as a test set to the {task.name} task.\n"
                f"However, column with ground-thruth labels '{task.label_column_name}' not found."
            )
        test_set_metrics = calculate_metrics(
            label_model,
            test_set_name,
            test_df[task.label_column_name].values,
            save_to=task_dir_generated,
        )
        stats.update(test_set_metrics)

    return stats
Пример #19
0
def test_parse_labels_command():
    """Check summary, name and generated dvc command of ``ParseLabelsCommand``."""
    path_config = PathConfig(Path("/project_root"), Path("/software_root"))
    command = ParseLabelsCommand(path_config)

    expected_command = [
        "dvc",
        "run",
        "-v",
        "--no-exec",
        "--force",
        "-n",
        "parse_labels",
        "-d",
        "/project_root/labels",
        "-O",
        "labels.py",
        "bohr",
        "porcelain",
        "parse-labels",
    ]

    assert command.summary() == "parse labels"
    assert command.get_name() == "parse_labels"
    assert command.to_string() == expected_command
Пример #20
0
def repro(
    task: Optional[str] = None,
    only_transient: bool = False,
    force: bool = False,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> None:
    """Pull the datasets and reproduce the dvc pipeline.

    Steps, in order: refresh the workspace if necessary, pull the ``path_dist``
    of every dataset in the repo, reproduce the transient stages, and, unless
    ``only_transient`` is set, reproduce the pipeline (restricted to stages
    matching ``<task>_*`` when a task is given).

    :param task: optional task name used to restrict the final reproduction.
    :param only_transient: skip the final reproduction step.
    :param force: forwarded to ``dvc.repro``.
    :param bohr_repo: repo to use; loaded from the project root when omitted.
    :param path_config: paths to use; obtained from ``PathConfig.load()``
        when omitted.
    :raises ValueError: if ``task`` is given but is not defined in the repo.
    """
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)

    refresh_if_necessary(path_config)

    paths_to_pull = [
        str(dataset.path_dist) for dataset in bohr_repo.datasets.values()
    ]
    if paths_to_pull:
        logger.info(dvc.pull(paths_to_pull))

    # TODO run only task-related transient stages if task is passed:
    transient_stages = load_transient_stages(path_config)
    if len(transient_stages) > 0:
        transient_output = dvc.repro(transient_stages,
                                     force=force,
                                     path_config=path_config)
        logger.info(transient_output)

    if only_transient:
        return

    glob = None
    if task:
        if task not in bohr_repo.tasks:
            raise ValueError(f"Task {task} not found in bohr.json")
        glob = f"{task}_*"
    pipeline_output = dvc.repro(pull=True,
                                glob=glob,
                                force=force,
                                path_config=path_config)
    logger.info(pipeline_output)
Пример #21
0
def test_apply_heuristics_command():
    """Check summary, name and generated dvc command of ``ApplyHeuristicsCommand``."""
    path_config = PathConfig(Path("/project_root"), Path("/software_root"))
    command = ApplyHeuristicsCommand(
        path_config,
        stub_task,
        "group.1",
        datasets=["dataset1"],
        execute_immediately=True,
    )

    expected_name = "bugginess_apply_heuristics__group_1__dataset1"
    expected_command = [
        "dvc",
        "run",
        "-v",
        "--force",
        "-n",
        "bugginess_apply_heuristics__group_1__dataset1",
        "-d",
        "labels.py",
        "-d",
        "group/1.py",
        "-d",
        "prep_path/dataset1",
        "-p",
        "bohr.json:bohr_framework_version",
        "-o",
        "generated/bugginess/group.1/heuristic_matrix_dataset1.pkl",
        "-M",
        "metrics/bugginess/group.1/heuristic_metrics_dataset1.json",
        "bohr",
        "porcelain",
        "apply-heuristics",
        "bugginess",
        "--heuristic-group",
        "group.1",
        "--dataset",
        "dataset1",
    ]

    assert (command.summary() ==
            "[bugginess] apply heuristics (group: group.1) to dataset1")
    assert command.get_name() == expected_name
    assert command.to_string() == expected_command
Пример #22
0
def label_dataset(
    task: Task,
    dataset: Dataset,
    path_config: Optional[PathConfig] = None,
    debug: bool = False,
):
    """Label a dataset with the task's saved label model and write it as CSV.

    The heuristic matrix of the dataset and ``label_model.pkl`` are read from
    the task's ``generated`` directory; the result is written to
    ``<labeled_data>/<task name>/<dataset name>.labeled.csv``.

    :param task: task providing the name and the labels.
    :param dataset: dataset to load and label.
    :param path_config: paths to use; obtained from ``PathConfig.load()``
        when omitted.
    :param debug: when set, an ``lfs`` column is added that joins, per row,
        the names of the heuristics whose matrix value is 0 or 1.
    """
    path_config = path_config or PathConfig.load()
    task_dir_generated = path_config.generated / task.name

    applied_heuristics_df = pd.read_pickle(
        str(task_dir_generated / f"heuristic_matrix_{dataset.name}.pkl"))

    label_model = LabelModel()
    label_model.load(str(task_dir_generated / "label_model.pkl"))
    df = dataset.load()
    df_labeled = do_labeling(label_model, applied_heuristics_df.to_numpy(), df,
                             task.labels)

    if debug:
        # ``items()`` replaces ``iteritems()``, which pandas 2.0 removed.
        for (
                heuristic_name,
                applied_heuristic_series,
        ) in applied_heuristics_df.items():
            # 0 and 1 become the heuristic's name, -1 becomes an empty string.
            applied_heuristics_df[
                heuristic_name] = applied_heuristic_series.map({
                    0: heuristic_name,
                    1: heuristic_name,
                    -1: ""
                })
        col_lfs = applied_heuristics_df.apply(
            lambda row: ";".join([elm for elm in row if elm]), axis=1)
        df_labeled["lfs"] = col_lfs

    labeled_data_path = path_config.labeled_data / task.name
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    labeled_data_path.mkdir(parents=True, exist_ok=True)
    target_file = labeled_data_path / f"{dataset.name}.labeled.csv"
    df_labeled.to_csv(target_file, index=False)
    print(f"Labeled dataset has been written to {target_file}.")
Пример #23
0
def test_status(subprocess_mock):
    """``status`` must run ``dvc status`` in the project root."""
    project_root = Path("/project_root")

    status(PathConfig(project_root, Path("/software-root")))

    subprocess_mock.check_output.assert_called_with(
        ["dvc", "status"],
        cwd=project_root,
        encoding="utf8",
    )
 def __init__(self):
     """Locate the RefactoringMiner ``bin`` directory under the software path."""
     super().__init__()
     software_path = PathConfig.load().software_path
     # NOTE(review): takes the first entry returned by os.listdir, whose
     # order is arbitrary, and fails with IndexError on an empty directory.
     installed_entries = os.listdir(software_path)
     miner_dir_name = installed_entries[0]
     logger.debug(f"Using RefactoringMiner version {miner_dir_name}")
     self.path = software_path / miner_dir_name / "bin"
Пример #25
0
    if not heuristics:
        raise ValueError(
            f"Heuristics not found for artifact: {task.top_artifact}")

    save_to_matrix = task_dir_generated / f"heuristic_matrix_{dataset.name}.pkl"
    save_to_metrics = task_dir_metrics / f"heuristic_metrics_{dataset.name}.json"
    labeling_functions = to_labeling_functions(heuristics, dataset.mapper,
                                               task.labels)
    artifact_df = dataset.load()
    apply_lf_matrix = apply_lfs_to_dataset(labeling_functions,
                                           artifact_df=artifact_df,
                                           save_to=save_to_matrix)
    label_series = (artifact_df[task.label_column_name]
                    if task.label_column_name in artifact_df.columns else None)
    calculate_metrics(apply_lf_matrix,
                      labeling_functions,
                      label_series,
                      save_to=save_to_metrics)


if __name__ == "__main__":
    # Manual run: apply one hard-coded heuristic module of the "bugginess"
    # task to the "1151-commits" dataset.
    repo = load_bohr_repo()
    bugginess_task = repo.tasks["bugginess"]
    commits_dataset = repo.datasets["1151-commits"]
    apply_heuristics(
        bugginess_task,
        PathConfig.load(),
        "heuristics.bugginess.main_heurstics",
        commits_dataset,
    )
Пример #26
0
def refresh(path_config: Optional[PathConfig] = None) -> None:
    """Regenerate the dvc pipeline definition of the project.

    Deletes ``dvc.yaml`` in the project root (if it exists), re-adds all
    tasks to the dvc pipeline and updates the lock.

    :param path_config: paths to use; obtained from ``PathConfig.load()``
        when omitted.
    """
    if not path_config:
        path_config = PathConfig.load()

    dvc_yaml = path_config.project_root / "dvc.yaml"
    dvc_yaml.unlink(missing_ok=True)

    # NOTE(review): called without ``path_config`` -- confirm it resolves the
    # same project as the one handled here.
    add_all_tasks_to_dvc_pipeline()
    update_lock(path_config)
Пример #27
0
import sys
from pathlib import Path

import pandas as pd

from bohr.config.pathconfig import PathConfig


def combine_labels(path_to_labeled_dataset: Path, path_to_transformer_labels,
                   output_path: Path) -> None:
    """Append transformer predictions to a labeled dataset and save it as CSV.

    :param path_to_labeled_dataset: CSV file with the labeled dataset.
    :param path_to_transformer_labels: CSV file with a ``prediction`` column.
    :param output_path: where the combined CSV is written (the index is
        written too).

    The ``prediction`` column is added, row-aligned by index, under the name
    ``transformer_preds``.
    """
    labeled_dataset = pd.read_csv(path_to_labeled_dataset)
    predictions = pd.read_csv(path_to_transformer_labels)['prediction']
    predictions = predictions.rename('transformer_preds')
    pd.concat([labeled_dataset, predictions], axis=1).to_csv(output_path)


if __name__ == '__main__':
    # The three command-line arguments are paths relative to the project
    # root: labeled dataset, transformer labels, output file.
    root = PathConfig.load().project_root
    labeled_dataset_file = root / Path(sys.argv[1])
    transformer_labels_file = root / Path(sys.argv[2])
    output_file = root / Path(sys.argv[3])
    combine_labels(labeled_dataset_file, transformer_labels_file, output_file)
Пример #28
0
def test_pull_paths_not_iterable(subprocess_mock):
    """Passing a single ``Path`` as the first argument of ``pull`` must raise ``ValueError``."""
    with pytest.raises(ValueError):
        single_path = Path("path/to/dataset.csv")
        path_config = PathConfig(Path("/project_root"), Path("/software-path"))
        pull(single_path, path_config)