Example #1
def test_prepare_restart(
    original_existing_results: Optional[RequestResult],
    restart: bool,
    expected_message: Optional[str],
    caplog,
):

    with temporary_cd():

        # Create a mock data set
        original_data_set = DataSetCollection(
            data_sets=[create_data_set("data-set-1", 1)])
        original_data_set.to_file("test-set-collection.json")

        if original_existing_results is not None:
            original_existing_results.json("results.json")

        with caplog.at_level(logging.INFO):
            data_set, existing_result = _prepare_restart(restart)

    if original_existing_results is not None and restart:
        assert existing_result.json() == original_existing_results.json()
    else:
        assert existing_result is None

    if expected_message is not None:
        assert expected_message in caplog.text
    else:
        assert caplog.text == ""

    expected_n_data_points = (1 if existing_result is None else len(
        existing_result.unsuccessful_properties))

    assert len(data_set) == expected_n_data_points
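
The test above receives `original_existing_results`, `restart`, and `expected_message` as arguments, so it is presumably driven by a `pytest.mark.parametrize` decorator that this excerpt omits. A minimal sketch of how such cases could be supplied, with purely hypothetical parameter values:

import pytest

# Hypothetical cases -- the real parameter sets are not shown in the excerpt.
RESTART_CASES = [
    (None, True, None),
    (None, False, None),
]


@pytest.mark.parametrize(
    "original_existing_results, restart, expected_message", RESTART_CASES
)
def test_prepare_restart(
    original_existing_results, restart, expected_message, caplog
):
    ...  # body as in the example above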
Example #2
    async def get_all(
        db: Session = Depends(depends.get_db),
        skip: int = 0,
        limit: int = 100,
        children: bool = True,
    ):

        data_set_collection = DataSetCollection(data_sets=DataSetCRUD.read_all(
            db, skip=skip, limit=limit, include_children=children))
        data_set_collection.metadata = CollectionMeta(
            skip=skip, limit=limit, total_records=DataSetCRUD.n_total(db))

        return data_set_collection
Example #3
def test_data_set_collection_validation():
    """Check that the data set correctly validates for unique set ids."""

    # Create a set which should not error
    DataSetCollection(data_sets=[create_data_set("data-set-1")])

    # Create a set which should error
    with pytest.raises(ValidationError):

        DataSetCollection(data_sets=[
            create_data_set("data-set-1"),
            create_data_set("data-set-1")
        ])
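
The test above relies on `DataSetCollection` rejecting duplicate data set ids at construction time. A minimal sketch of how such a check can be expressed with a pydantic v1 validator (the v1 style matches the `parse_file` / `.json()` calls used elsewhere in these examples); this is an illustration of the pattern, not the library's actual implementation:

from typing import List

from pydantic import BaseModel, validator


class UniqueIdCollection(BaseModel):
    """Toy stand-in for a collection model that enforces unique member ids."""

    data_sets: List[dict]

    @validator("data_sets")
    def _validate_unique_ids(cls, value):
        ids = [entry["id"] for entry in value]
        if len(ids) != len(set(ids)):
            raise ValueError("data sets must have unique ids")
        return value


# Raises pydantic.ValidationError because "data-set-1" appears twice:
# UniqueIdCollection(data_sets=[{"id": "data-set-1"}, {"id": "data-set-1"}])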
Example #4
    def _retrieve_data_sets(
        cls,
        benchmark: Benchmark,
        reference_data_sets: Optional[List[Union[DataSet, QCDataSet]]],
    ):
        """Retrieve the data sets to benchmark against from the RESTful API and
        store them in the current directory."""

        test_sets = cls._find_or_retrieve_data_sets(benchmark.test_set_ids,
                                                    DataSet,
                                                    reference_data_sets)
        test_set_collection = DataSetCollection(data_sets=test_sets)

        with open("test-set-collection.json", "w") as file:
            file.write(test_set_collection.json())
Example #5
    def _plot(
        cls,
        directories: List[str],
        sub_studies: List[Benchmark],
        results: List[BenchmarkResult],
        file_type: Literal["png", "pdf"],
    ):

        data_sets = {}

        # Load in the benchmarked data sets
        for directory in directories:

            reference_data_sets = DataSetCollection.parse_file(
                os.path.join(directory, "test-set-collection.json"))

            data_sets.update({
                data_set.id: data_set
                for data_set in reference_data_sets.data_sets
            })

        # Plot overall statistics about the optimization.
        for statistic_type in [StatisticType.RMSE, StatisticType.R2]:
            plot_overall_statistics(sub_studies, results, statistic_type, "",
                                    file_type)

        # Plot statistics about each category
        plot_categorized_rmse(sub_studies, results, "", file_type)

        # Plot the results as a scatter plot.
        plot_scatter_results(sub_studies, results, [*data_sets.values()], "",
                             file_type)
Example #6
def mock_target(tmpdir) -> Tuple[Optimization, EvaluatorTarget, str]:
    """Create a mock evaluator target directory which is populated with a dummy
    set of results.

    Returns
    -------
        A tuple of the parent optimization, the mock target and the path to the
        directory in which the files were created.
    """

    with temporary_cd(str(tmpdir)):

        # Mock the target to analyze.
        target = create_evaluator_target("evaluator-target-1", ["data-set-1"])

        optimization = create_optimization("project-1", "study-1",
                                           "optimization-1", [target])
        optimization.analysis_environments = []

        # Create a dummy data set and estimated result.
        reference_data_set = create_data_set("data-set-1", 1)
        DataSetCollection(data_sets=[reference_data_set]).to_evaluator().json(
            "training-set.json")

        results = RequestResult()
        results.estimated_properties = reference_data_set.to_evaluator()
        results.json("results.json")

        lp_dump({"X": 1.0}, "objective.p")

    return optimization, target, str(tmpdir)
Example #7
def _prepare_restart(
    restart: bool,
) -> Tuple["PhysicalPropertyDataSet", Optional["RequestResult"]]:
    """Prepare the requisite files for restarting if requested, or give a warning
    if not and files will be overwritten.

    Parameters
    ----------
    restart
        Whether to restart previous calculations.

    Returns
    -------
        The data set to be estimated (in the case of restarts this will be the set
        of un-estimated properties) and any existing results.
    """

    from openff.evaluator.client import RequestResult

    # Check for existing results files to restart from.
    existing_results: Optional[RequestResult] = None

    if os.path.isfile("results.json"):

        message = "An existing results file was found."

        if not restart:
            message = f"{message} These results will be overwritten."
        else:

            existing_results: RequestResult = RequestResult.from_json(
                "results.json")

            if len(existing_results.unsuccessful_properties) == 0:
                message = (
                    f"{message} All properties were successfully estimated and so "
                    f"this command will now exit.")

            else:
                message = (
                    f"{message} {len(existing_results.estimated_properties)} data "
                    f"points were successfully estimated, while "
                    f"{len(existing_results.unsuccessful_properties)} could not be. "
                    f"Attempting to re-estimate these unsuccessful data points."
                )

        logger.info(message)

    # Load in the data set.
    if existing_results is None:
        data_set = DataSetCollection.parse_file(
            "test-set-collection.json").to_evaluator()
    else:
        data_set = existing_results.unsuccessful_properties

    return data_set, existing_results
Example #8
def test_plot(force_field, monkeypatch):

    from nonbonded.library.plotting.seaborn import optimization as optimization_module

    # Mock the required file inputs
    data_set = create_data_set("data-set-1", 1)
    data_set_collection = DataSetCollection(data_sets=[data_set])

    optimization = create_optimization(
        "project-1",
        "study-1",
        "optimization-1",
        [create_evaluator_target("target-1", ["data-set-1"])],
    )
    optimization_result = create_optimization_result("project-1", "study-1",
                                                     "optimization-1",
                                                     ["target-1"], [])

    # Mock the already tested plotting methods.
    monkeypatch.setattr(optimization_module, "plot_parameter_changes",
                        lambda *args: None)
    monkeypatch.setattr(optimization_module, "plot_objective_per_iteration",
                        lambda *args: None)
    monkeypatch.setattr(optimization_module, "plot_rmse_change",
                        lambda *args: None)

    if "nonbonded.library.factories.plots.optimization" in sys.modules:
        sys.modules.pop("nonbonded.library.factories.plots.optimization")

    from nonbonded.library.factories.plots.optimization import OptimizationPlotFactory

    with temporary_cd():

        # Save the inputs in their expected locations.
        data_set_collection.to_file("test-set-collection.json")
        optimization.to_file("optimization.json")
        os.makedirs("analysis")
        optimization_result.to_file(
            os.path.join("analysis", "optimization-results.json"))

        OptimizationPlotFactory.plot([""], "png")

        assert os.path.isdir("plots")
Example #9
def test_plot(force_field, monkeypatch):

    from nonbonded.library.plotting.seaborn import benchmark as benchmark_module

    # Mock the required file inputs
    data_set = create_data_set("data-set-1", 1)
    data_set_collection = DataSetCollection(data_sets=[data_set])

    benchmark = create_benchmark(
        "project-1",
        "study-1",
        "benchmark-1",
        ["data-set-1"],
        None,
        force_field,
    )
    benchmark_result = create_benchmark_result(
        "project-1", "study-1", "benchmark-1", [create_data_set("data-set-1", 1)]
    )

    # Mock the already tested plotting methods.
    monkeypatch.setattr(benchmark_module, "plot_categorized_rmse", lambda *args: None)
    monkeypatch.setattr(benchmark_module, "plot_overall_statistics", lambda *args: None)
    monkeypatch.setattr(benchmark_module, "plot_scatter_results", lambda *args: None)

    if "nonbonded.library.factories.plots.benchmark" in sys.modules:
        sys.modules.pop("nonbonded.library.factories.plots.benchmark")

    from nonbonded.library.factories.plots.benchmark import BenchmarkPlotFactory

    with temporary_cd():

        # Save the inputs in their expected locations.
        data_set_collection.to_file("test-set-collection.json")
        benchmark.to_file("benchmark.json")
        os.makedirs("analysis")
        benchmark_result.to_file(os.path.join("analysis", "benchmark-results.json"))

        BenchmarkPlotFactory.plot([""], "png")

        assert os.path.isdir("plots")
Example #10
    def test_list(self, requests_mock, runner):

        data_sets = DataSetCollection(
            data_sets=[create_data_set("data-set-1")])
        mock_get_data_sets(requests_mock, data_sets)

        result = runner.invoke(dataset_cli, ["list"])

        if result.exit_code != 0:
            raise result.exception

        assert data_sets.data_sets[0].id in result.output
Example #11
def test_benchmark_analysis(caplog, monkeypatch, dummy_conda_env):

    from openff.evaluator.client import RequestResult
    from openff.evaluator.datasets import PhysicalPropertyDataSet

    benchmark = create_benchmark(
        "project-1", "study-1", "benchmark-1", ["data-set-1"], "optimization-1", None
    )

    # Create a reference data set.
    reference_data_set = create_data_set("data-set-1")
    reference_data_set.entries.append(reference_data_set.entries[0].copy())
    reference_data_set.entries[0].id = 1
    reference_data_set.entries[1].id = 2

    # Create a set of evaluator results
    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(reference_data_set.entries[0].to_evaluator())

    unsuccessful_properties = PhysicalPropertyDataSet()
    unsuccessful_properties.add_properties(reference_data_set.entries[1].to_evaluator())

    results = RequestResult()
    results.estimated_properties = estimated_data_set
    results.unsuccessful_properties = unsuccessful_properties

    with temporary_cd(os.path.dirname(dummy_conda_env)):

        # Save the expected input files.
        with open("benchmark.json", "w") as file:
            file.write(benchmark.json())

        with open("test-set-collection.json", "w") as file:
            file.write(DataSetCollection(data_sets=[reference_data_set]).json())

        results.json("results.json")

        with caplog.at_level(logging.WARNING):
            BenchmarkAnalysisFactory.analyze(True)

        assert (
            "1 properties could not be estimated and so were not analyzed"
            in caplog.text
        )

        assert os.path.isdir("analysis")
        assert os.path.isfile(os.path.join("analysis", "benchmark-results.json"))

        results_object = BenchmarkResult.parse_file(
            os.path.join("analysis", "benchmark-results.json")
        )
        assert len(results_object.calculation_environment) > 0
        assert len(results_object.analysis_environment) > 0
Example #12
    def _generate_evaluator_target(
        cls,
        target: EvaluatorTarget,
        port: int,
        reference_data_sets: Optional[List[Union[DataSet, QCDataSet]]],
    ):
        """Generates the input files for an evaluator target."""

        from forcebalance.evaluator_io import Evaluator_SMIRNOFF
        from openff.evaluator import unit

        # Store the data set in the targets directory
        training_sets: List[DataSet] = cls._find_or_retrieve_data_sets(
            target.data_set_ids, DataSet, reference_data_sets)
        training_set_collection = DataSetCollection(data_sets=training_sets)

        evaluator_set = training_set_collection.to_evaluator()
        evaluator_set.json("training-set.json")

        # Create the target options
        target_options = Evaluator_SMIRNOFF.OptionsFile()
        target_options.connection_options.server_port = port

        target_options.estimation_options = cls._generate_request_options(
            target, evaluator_set)

        target_options.data_set_path = "training-set.json"

        target_options.weights = {
            property_type: 1.0
            for property_type in evaluator_set.property_types
        }
        target_options.denominators = {
            property_type: unit.Quantity(value)
            for property_type, value in target.denominators.items()
        }
        target_options.polling_interval = 600

        with open("options.json", "w") as file:
            file.write(target_options.to_json())
Example #13
    def analyze(cls, reindex):

        from openff.evaluator.client import RequestResult

        # Load in the definition of the benchmark to optimize.
        benchmark = Benchmark.parse_file("benchmark.json")

        # Create a directory to store the results in
        output_directory = "analysis"
        os.makedirs(output_directory, exist_ok=True)

        # Load the reference data set
        reference_data_sets = DataSetCollection.parse_file(
            "test-set-collection.json")

        # Load in the request results.
        request_results: RequestResult = RequestResult.from_json(
            "results.json")

        if reindex:
            request_results = reindex_results(request_results,
                                              reference_data_sets)

        if len(request_results.unsuccessful_properties) > 0:

            logger.warning(
                f"{len(request_results.unsuccessful_properties)} properties could "
                f"not be estimated and so were not analyzed:")

            for unsuccessful_property in request_results.unsuccessful_properties:
                logger.warning(
                    f"{unsuccessful_property.id} could not be estimated.")

        estimated_data_set = request_results.estimated_properties

        # Generate statistics for the estimated properties.
        benchmark_results = BenchmarkResult.from_evaluator(
            project_id=benchmark.project_id,
            study_id=benchmark.study_id,
            benchmark_id=benchmark.id,
            reference_data_set=reference_data_sets,
            estimated_data_set=estimated_data_set,
            analysis_environments=benchmark.analysis_environments,
        )
        benchmark_results.calculation_environment = (
            cls._parse_calculation_environment()
        )
        benchmark_results.analysis_environment = summarise_current_versions()

        # Save the results
        with open(os.path.join(output_directory, "benchmark-results.json"),
                  "w") as file:
            file.write(benchmark_results.json())
Example #14
def list_data_sets():
    """Lists all of the data sets which are available from the RESTful API."""

    data_sets = DataSetCollection.from_rest()

    text_wrapper = TextWrapper(initial_indent="    ", subsequent_indent="    ")

    for index, data_set in enumerate(data_sets.data_sets):

        print(f"{index}) {data_set.id}\n")
        print("\n".join(text_wrapper.wrap(
            data_set.description.split("\n")[0])))
        print()
Example #15
    def test_get_all(self, rest_client: TestClient, db: Session):

        data_set = commit_data_set(db)
        rest_data_collection = DataSetCollection.from_rest(
            requests_class=rest_client)

        assert rest_data_collection is not None
        assert len(rest_data_collection.data_sets) == 1

        assert rest_data_collection.metadata is not None
        assert rest_data_collection.metadata.skip == 0
        assert rest_data_collection.metadata.limit == 100
        assert rest_data_collection.metadata.total_records == 1

        compare_pydantic_models(data_set, rest_data_collection.data_sets[0])
Example #16
def test_collection_to_evaluator(evaluator_data_set):
    """A simple test that the `DataSetCollection.to_evaluator` function
    works as expected."""

    data_frame = evaluator_data_set.to_pandas()

    data_set = DataSet.from_pandas(
        data_frame,
        "id",
        description="Lorem Ipsum",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
    )

    data_set_collection = DataSetCollection(data_sets=[data_set])

    recreated_data_set = data_set_collection.to_evaluator()
    assert len(recreated_data_set) == len(evaluator_data_set)

    evaluator_properties_by_id = {x.id: x for x in evaluator_data_set}

    for recreated_property in recreated_data_set:

        evaluator_property = evaluator_properties_by_id[recreated_property.id]
        compare_evaluator_properties(evaluator_property, recreated_property)
Example #17
    def test_retrieve_data_sets(self, benchmark, requests_mock):

        # Mock the data set to retrieve.
        data_set = create_data_set("data-set-1", 1)
        mock_get_data_set(requests_mock, data_set)

        with temporary_cd():

            BenchmarkInputFactory._retrieve_data_sets(benchmark, None)

            assert os.path.isfile("test-set-collection.json")
            from nonbonded.library.models.datasets import DataSetCollection

            data_set_collection = DataSetCollection.parse_file(
                "test-set-collection.json")
            assert data_set_collection.data_sets[0].json() == data_set.json()
Example #18
def commit_data_set_collection(db: Session) -> DataSetCollection:
    """Commits two data sets to the current session and returns
    them in a collection object.

    Parameters
    ----------
    db
        The current database session.
    """

    # Create the training set.
    data_set_ids = ["data-set-1", "data-set-2"]

    data_sets = [commit_data_set(db, x) for x in data_set_ids]
    data_set_collection = DataSetCollection(data_sets=data_sets)

    return data_set_collection
Example #19
def test_reindex_data_set():
    """Tests that the ``reindex_data_set`` function behaves as expected."""

    setup_timestamp_logging(logging.INFO)

    evaluator_data_set = PhysicalPropertyDataSet()

    evaluator_data_set.add_properties(
        Density(
            thermodynamic_state=ThermodynamicState(
                temperature=298.15 * unit.kelvin,
                pressure=1.0 * unit.atmosphere),
            phase=PropertyPhase.Liquid,
            substance=substances.Substance.from_components("O"),
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
        ),
        Density(
            thermodynamic_state=ThermodynamicState(
                temperature=298.15 * unit.kelvin,
                pressure=1.0 * unit.atmosphere),
            phase=PropertyPhase.Liquid,
            substance=substances.Substance.from_components("C", "O"),
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
        ),
        Density(
            thermodynamic_state=ThermodynamicState(
                temperature=300.0 * unit.kelvin,
                pressure=1.0 * unit.atmosphere),
            phase=PropertyPhase.Liquid,
            substance=substances.Substance.from_components("C", "O"),
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
        ),
    )

    data_set = DataSet(
        id="data-set",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[
            DataSetEntry(
                id=1,
                property_type="Density",
                temperature=298.15,
                pressure=101.325,
                value=1.0,
                std_error=1.0,
                doi=" ",
                components=[
                    Component(smiles="O", mole_fraction=0.5),
                    Component(smiles="C", mole_fraction=0.5),
                ],
            ),
            DataSetEntry(
                id=2,
                property_type="Density",
                temperature=298.15,
                pressure=101.325,
                value=1.0,
                std_error=1.0,
                doi=" ",
                components=[Component(smiles="O", mole_fraction=1.0)],
            ),
        ],
    )

    un_indexed_id = evaluator_data_set.properties[2].id

    reindex_data_set(evaluator_data_set, data_set)

    assert evaluator_data_set.properties[0].id == "2"
    assert evaluator_data_set.properties[1].id == "1"
    assert evaluator_data_set.properties[2].id == un_indexed_id

    data_set_collection = DataSetCollection(data_sets=[
        DataSet(
            id="0",
            description=" ",
            authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
            entries=[
                DataSetEntry(
                    id=3,
                    property_type="Density",
                    temperature=298.15,
                    pressure=101.325,
                    value=1.0,
                    std_error=1.0,
                    doi=" ",
                    components=[
                        Component(smiles="O", mole_fraction=0.5),
                        Component(smiles="C", mole_fraction=0.5),
                    ],
                )
            ],
        ),
        DataSet(
            id="1",
            description=" ",
            authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
            entries=[
                DataSetEntry(
                    id=4,
                    property_type="Density",
                    temperature=298.15,
                    pressure=101.325,
                    value=1.0,
                    std_error=1.0,
                    doi=" ",
                    components=[Component(smiles="O", mole_fraction=1.0)],
                )
            ],
        ),
    ])

    reindex_data_set(evaluator_data_set, data_set_collection)

    assert evaluator_data_set.properties[0].id == "4"
    assert evaluator_data_set.properties[1].id == "3"
    assert evaluator_data_set.properties[2].id == un_indexed_id
Example #20
def mock_get_data_sets(requests_mock, data_sets: DataSetCollection):
    """Mock the get data sets endpoint."""
    requests_mock.get(
        f"{settings.API_URL}/datasets/phys-prop/",
        text=data_sets.json(),
    )
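
Taken together, most of the examples above follow the same round trip: build a `DataSetCollection` from one or more data sets, write it to `test-set-collection.json`, and later reload it with `parse_file`, optionally converting it to an `openff.evaluator` data set with `to_evaluator()`. A condensed sketch of that pattern, using the import path shown in Example #17 and a hypothetical `round_trip` helper:

from nonbonded.library.models.datasets import DataSetCollection


def round_trip(data_set, file_name: str = "test-set-collection.json"):
    """Hypothetical helper: write a collection holding ``data_set`` to disk,
    reload it, and return the equivalent ``openff.evaluator`` data set."""

    # Wrap the data set in a collection and serialize it to the conventional
    # file name expected by the benchmark and plotting factories.
    collection = DataSetCollection(data_sets=[data_set])
    collection.to_file(file_name)

    # Reload the collection and convert it for estimation with the evaluator.
    reloaded = DataSetCollection.parse_file(file_name)
    return reloaded.to_evaluator()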