Exemplo n.º 1
0
def test_pandas_round_trip(evaluator_data_set):
    """Check that entries survive repeated ``DataSet.to_pandas`` /
    ``DataSet.from_pandas`` conversions.

    The evaluator data set is exported to a data frame and loaded as a
    ``DataSet``; that data set is then exported and loaded a second time.
    Every entry of the final data set is compared against the evaluator
    property which carries the same id.
    """

    def load_from_frame(frame):
        # Both passes are loaded with exactly the same metadata.
        return DataSet.from_pandas(
            frame,
            "id",
            description="Lorem Ipsum",
            authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        )

    first_pass = load_from_frame(evaluator_data_set.to_pandas())
    second_pass = load_from_frame(first_pass.to_pandas())

    originals_by_id = {original.id: original for original in evaluator_data_set}

    for entry in second_pass.entries:
        # The evaluator properties are keyed by the string form of the entry id.
        compare_properties(originals_by_id[str(entry.id)], entry)
Exemplo n.º 2
0
    def test_retrieve(self, requests_mock, runner, as_pandas):
        """Invoke the ``retrieve`` command against a mocked endpoint and check
        the file that it writes.

        When ``as_pandas`` is truthy the ``--pandas`` flag is passed and the
        output is read back as CSV, checking only the number of rows.
        Otherwise the output is parsed as a ``DataSet`` and its JSON (with
        newlines removed) must equal the JSON of the mocked data set.
        """

        expected = create_data_set("data-set-1")
        expected.entries[0].id = 1
        mock_get_data_set(requests_mock, expected)

        output_path = "dataset.csv" if as_pandas else "dataset.json"

        arguments = ["retrieve", "--id", expected.id, "--output", output_path]

        if as_pandas:
            arguments += ["--pandas"]

        result = runner.invoke(dataset_cli, arguments)

        # Surface the underlying exception rather than a bare exit code.
        if result.exit_code != 0:
            raise result.exception

        if as_pandas:
            retrieved_frame = pandas.read_csv(output_path)
            assert len(retrieved_frame) == len(expected.entries)
            return

        retrieved = DataSet.parse_file(output_path)
        assert retrieved.json().replace("\n", "") == expected.json()
Exemplo n.º 3
0
def estimated_reference_sets():
    """Build a pair of data sets holding an estimated and a reference copy of
    the same two properties.

    The estimated set is a ``PhysicalPropertyDataSet`` containing a
    ``Density`` (id "1") and an ``EnthalpyOfMixing`` (id "2") for the
    "O" / "CC=O" substance. The reference set is a ``DataSet`` (id "ref")
    whose entries use the integer ids 1 and 2 and list both components with a
    mole fraction of 0.5.

    Returns
    -------
    tuple
        The estimated data set followed by the reference data set.
    """

    def shared_property_kwargs():
        # Fresh state and substance objects are created for every property.
        return dict(
            thermodynamic_state=ThermodynamicState(
                298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
            ),
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("O", "CC=O"),
        )

    def reference_entry(entry_id, property_type, value, std_error):
        return DataSetEntry(
            id=entry_id,
            property_type=property_type,
            temperature=298.15,
            pressure=101.325,
            value=value,
            std_error=std_error,
            doi=" ",
            components=[
                Component(smiles="O", mole_fraction=0.5),
                Component(smiles="CC=O", mole_fraction=0.5),
            ],
        )

    density = Density(
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
        **shared_property_kwargs(),
    )
    density.id = "1"

    enthalpy = EnthalpyOfMixing(
        value=1.0 * unit.kilocalorie / unit.mole,
        uncertainty=0.1 * unit.kilojoule / unit.mole,
        **shared_property_kwargs(),
    )
    enthalpy.id = "2"

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(density, enthalpy)

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[
            reference_entry(1, "Density", 0.001, 0.0001),
            reference_entry(2, "EnthalpyOfMixing", 4.184, 0.1),
        ],
    )

    return estimated_data_set, reference_data_set
Exemplo n.º 4
0
def retrieve(data_set_id, return_pandas, output_path):
    """Fetch a data set through ``DataSet.from_rest`` and write it to disk.

    Parameters
    ----------
    data_set_id
        The id of the data set to retrieve.
    return_pandas
        If truthy, the data set is converted with ``to_pandas`` and written
        as CSV without the index column. Otherwise the JSON produced by
        ``DataSet.json`` is written.
    output_path
        The path of the file to write.
    """

    data_set = DataSet.from_rest(data_set_id=data_set_id)

    if return_pandas:
        data_set.to_pandas().to_csv(output_path, index=False)
        return

    # Pin the encoding so the JSON file does not depend on the locale default
    # of the platform that the command happens to run on.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(data_set.json())
Exemplo n.º 5
0
def test_reindex_data_set_no_mole_fraction():
    """Check ``reindex_data_set`` when a component is given as an exact amount.

    The evaluator data set holds one ``SolvationFreeEnergy`` whose substance
    has an "O" component (mole fraction 1.0) and a "CO" solute (exact amount
    1). After re-indexing against a data set with one corresponding entry of
    id 1, the evaluator property is expected to have the id "1".
    """

    setup_timestamp_logging(logging.INFO)

    solvent = substances.Component(smiles="O")
    solute = substances.Component(
        smiles="CO", role=substances.Component.Role.Solute
    )

    substance = substances.Substance()
    substance.add_component(solvent, amount=substances.MoleFraction(1.0))
    substance.add_component(solute, amount=substances.ExactAmount(1))

    free_energy = SolvationFreeEnergy(
        thermodynamic_state=ThermodynamicState(
            temperature=298.15 * unit.kelvin,
            pressure=1.0 * unit.atmosphere,
        ),
        phase=PropertyPhase.Liquid,
        substance=substance,
        value=1.0 * SolvationFreeEnergy.default_unit(),
        uncertainty=1.0 * SolvationFreeEnergy.default_unit(),
    )

    evaluator_data_set = PhysicalPropertyDataSet()
    evaluator_data_set.add_properties(free_energy)

    reference_entry = DataSetEntry(
        id=1,
        property_type="SolvationFreeEnergy",
        temperature=298.15,
        pressure=101.325,
        value=1.0,
        std_error=1.0,
        doi=" ",
        components=[
            Component(smiles="O", mole_fraction=1.0),
            Component(
                smiles="CO", mole_fraction=0.0, exact_amount=1, role="Solute"
            ),
        ],
    )

    data_set = DataSet(
        id="data-set",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[reference_entry],
    )

    reindex_data_set(evaluator_data_set, data_set)

    assert evaluator_data_set.properties[0].id == "1"
Exemplo n.º 6
0
def create_data_set(data_set_id: str, entry_id: Optional[int] = None):
    """Build a data set with one author and one density entry.

    The entry lists two components: an "O" solvent (mole fraction 1.0,
    exact amount 0) and a "CO" solute (mole fraction 0.0, exact amount 1).

    Parameters
    ----------
    data_set_id: str
        The id given to the data set.
    entry_id
        The id given to the single data entry.

    Returns
    -------
    DataSet
    """

    author = create_author()

    solvent = Component(
        smiles="O", mole_fraction=1.0, exact_amount=0, role="Solvent"
    )
    solute = Component(
        smiles="CO", mole_fraction=0.0, exact_amount=1, role="Solute"
    )

    density_entry = DataSetEntry(
        id=entry_id,
        property_type="Density",
        temperature=298.15,
        pressure=101.325,
        value=1.0,
        std_error=0.1,
        doi=" ",
        components=[solvent, solute],
    )

    return DataSet(
        id=data_set_id,
        description=" ",
        authors=[author],
        entries=[density_entry],
    )
Exemplo n.º 7
0
def test_evaluator_round_trip(evaluator_data_set):
    """Check that ``DataSet.from_pandas`` followed by ``DataSet.to_evaluator``
    reproduces the original evaluator data set.

    The recreated set must have the same length as the original, and each
    recreated property is compared with the original property of the same id.
    """

    author = Author(name=" ", email="*****@*****.**", institute=" ")

    data_set = DataSet.from_pandas(
        evaluator_data_set.to_pandas(),
        "id",
        description="Lorem Ipsum",
        authors=[author],
    )

    round_tripped = data_set.to_evaluator()
    assert len(round_tripped) == len(evaluator_data_set)

    originals_by_id = {original.id: original for original in evaluator_data_set}

    for recreated in round_tripped:
        compare_evaluator_properties(originals_by_id[recreated.id], recreated)
Exemplo n.º 8
0
def test_collection_to_evaluator(evaluator_data_set):
    """Check that ``DataSetCollection.to_evaluator`` reproduces the evaluator
    data set that the collection's only data set was built from.

    The recreated set must have the same length as the original, and each
    recreated property is compared with the original property of the same id.
    """

    author = Author(name=" ", email="*****@*****.**", institute=" ")

    data_set = DataSet.from_pandas(
        evaluator_data_set.to_pandas(),
        "id",
        description="Lorem Ipsum",
        authors=[author],
    )

    collection = DataSetCollection(data_sets=[data_set])

    round_tripped = collection.to_evaluator()
    assert len(round_tripped) == len(evaluator_data_set)

    originals_by_id = {original.id: original for original in evaluator_data_set}

    for recreated in round_tripped:
        compare_evaluator_properties(originals_by_id[recreated.id], recreated)
Exemplo n.º 9
0
def test_from_pandas(evaluator_data_set):
    """Check the data set built by ``DataSet.from_pandas``.

    The id, description and number of authors must match what was passed in,
    the number of entries must equal the size of the evaluator data set, and
    each entry is compared with the evaluator property of the same id.
    """

    author = Author(name=" ", email="*****@*****.**", institute=" ")

    data_set = DataSet.from_pandas(
        evaluator_data_set.to_pandas(),
        "id",
        description="Lorem Ipsum",
        authors=[author],
    )

    assert data_set.id == "id"
    assert data_set.description == "Lorem Ipsum"
    assert len(data_set.authors) == 1

    assert len(data_set.entries) == len(evaluator_data_set)

    originals_by_id = {original.id: original for original in evaluator_data_set}

    for entry in data_set.entries:
        # The evaluator properties are keyed by the string form of the entry id.
        compare_properties(originals_by_id[str(entry.id)], entry)
Exemplo n.º 10
0
def test_analysed_result_from_evaluator():
    """Check the RMSE statistic produced by ``DataSetResult.from_evaluator``.

    1000 estimated densities are drawn from a normal distribution with a mean
    of zero and a random standard deviation, while every reference entry has
    the value zero. The RMSE of the un-categorised statistics entry is
    therefore expected to be close (within 10 %) to that standard deviation.
    """
    expected_mean = 0.0
    expected_std = numpy.random.rand() + 1.0

    sampled_values = numpy.random.normal(expected_mean, expected_std, 1000)

    estimated_properties = []
    reference_entries = []

    # Ids start at one; the estimated property stores the id as a string
    # while the reference entry stores it as an integer.
    for property_id, sampled_value in enumerate(sampled_values, start=1):

        estimated = Density(
            thermodynamic_state=ThermodynamicState(
                298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
            ),
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("O"),
            value=sampled_value * Density.default_unit(),
            uncertainty=0.0 * Density.default_unit(),
        )
        estimated.id = str(property_id)
        estimated_properties.append(estimated)

        reference_entries.append(
            DataSetEntry(
                id=property_id,
                property_type="Density",
                temperature=298.15,
                pressure=101.325,
                value=expected_mean,
                std_error=None,
                doi=" ",
                components=[Component(smiles="O", mole_fraction=1.0)],
            )
        )

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(*estimated_properties)

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=reference_entries,
    )

    analysed_results = DataSetResult.from_evaluator(
        reference_data_set=reference_data_set,
        estimated_data_set=estimated_data_set,
        analysis_environments=[ChemicalEnvironment.Aqueous],
        statistic_types=[StatisticType.RMSE],
        bootstrap_iterations=1000,
    )

    assert len(analysed_results.result_entries) == len(estimated_properties)

    # The first entry without a category is the one that gets inspected.
    full_statistics = next(
        statistic
        for statistic in analysed_results.statistic_entries
        if statistic.category is None
    )

    assert full_statistics.property_type == "Density"
    assert full_statistics.n_components == 1
    assert full_statistics.statistic_type == StatisticType.RMSE
    assert numpy.isclose(full_statistics.value, expected_std, rtol=0.10)
Exemplo n.º 11
0
    def analyze(
        cls,
        optimization: Optimization,
        target: EvaluatorTarget,
        target_directory: str,
        result_directory: str,
        reindex: bool = False,
    ) -> Optional[EvaluatorTargetResult]:
        """Summarise the evaluator results found in ``result_directory``.

        Parameters
        ----------
        optimization
            Provides the ``analysis_environments`` used for the statistics.
        target
            Provides the ``weight`` which scales the objective function.
        target_directory
            The directory containing the ``training-set.json`` reference set.
        result_directory
            The directory expected to contain ``results.json``.
        reindex
            Whether to pass the results through ``reindex_results``. This is
            forced to ``True`` when the reference ids had to be re-assigned.

        Returns
        -------
            ``None`` if ``results.json`` is not a file in ``result_directory``,
            otherwise the weighted objective function together with the RMSE
            statistic entries.
        """

        from openff.evaluator.client import RequestResult
        from openff.evaluator.datasets import PhysicalPropertyDataSet

        results_path = os.path.join(result_directory, "results.json")

        if not os.path.isfile(results_path):
            return None

        # Load the reference data set in its evaluator representation.
        training_set_path = os.path.join(target_directory, "training-set.json")
        evaluator_reference = PhysicalPropertyDataSet.from_json(training_set_path)

        # Ids which cannot be cast to integers are replaced by consecutive
        # ones, in which case the results must be re-indexed to match.
        try:
            for physical_property in evaluator_reference.properties:
                int(physical_property.id)
        except (TypeError, ValueError):

            _logger.warning(
                "The reference data set contains properties with ids that cannot be "
                "cast to integers - attempting to fix. Note this in general is not "
                "recommended and in future it is suggested to use integer ids in "
                "physical property data sets."
            )

            for index, physical_property in enumerate(evaluator_reference, start=1):
                physical_property.id = str(index)

            reindex = True

        placeholder_author = Author(
            name="empty", email="*****@*****.**", institute="empty"
        )

        reference_data_set = DataSet.from_pandas(
            evaluator_reference.to_pandas(),
            identifier="empty",
            description="empty",
            authors=[placeholder_author],
        )

        results = RequestResult.from_json(results_path)

        if reindex:
            results = reindex_results(results, reference_data_set)

        # Compare the estimated properties against the reference values.
        data_set_result = DataSetResult.from_evaluator(
            reference_data_set=reference_data_set,
            estimated_data_set=results.estimated_properties,
            analysis_environments=optimization.analysis_environments,
            statistic_types=[StatisticType.RMSE],
        )

        objective_function = cls._read_objective_function(result_directory)

        return EvaluatorTargetResult(
            objective_function=target.weight * objective_function,
            statistic_entries=data_set_result.statistic_entries,
        )
Exemplo n.º 12
0
def mock_get_data_set(requests_mock, data_set: DataSet):
    """Register a mocked GET response for the endpoint of ``data_set``.

    The response body is the JSON representation of ``data_set``.
    """
    endpoint = DataSet._get_endpoint(data_set_id=data_set.id)
    payload = data_set.json()

    requests_mock.get(endpoint, text=payload)
Exemplo n.º 13
0
def test_reindex_data_set():
    """Check ``reindex_data_set`` against a ``DataSet`` and a
    ``DataSetCollection``.

    The evaluator data set holds three densities: "O" at 298.15 K, "C" + "O"
    at 298.15 K and "C" + "O" at 300 K. The reference sets only contain
    entries at a temperature of 298.15, so the first two properties are
    expected to take the ids of the corresponding entries while the third
    keeps its original id.
    """

    setup_timestamp_logging(logging.INFO)

    def make_density(temperature, *smiles):
        return Density(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=1.0 * unit.atmosphere,
            ),
            phase=PropertyPhase.Liquid,
            substance=substances.Substance.from_components(*smiles),
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
        )

    def make_entry(entry_id, *mole_fractions):
        # ``mole_fractions`` is a sequence of (smiles, mole fraction) pairs.
        return DataSetEntry(
            id=entry_id,
            property_type="Density",
            temperature=298.15,
            pressure=101.325,
            value=1.0,
            std_error=1.0,
            doi=" ",
            components=[
                Component(smiles=smiles, mole_fraction=mole_fraction)
                for smiles, mole_fraction in mole_fractions
            ],
        )

    def make_data_set(data_set_id, *entries):
        return DataSet(
            id=data_set_id,
            description=" ",
            authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
            entries=list(entries),
        )

    evaluator_data_set = PhysicalPropertyDataSet()
    evaluator_data_set.add_properties(
        make_density(298.15, "O"),
        make_density(298.15, "C", "O"),
        make_density(300.0, "C", "O"),
    )

    data_set = make_data_set(
        "data-set",
        make_entry(1, ("O", 0.5), ("C", 0.5)),
        make_entry(2, ("O", 1.0)),
    )

    # The 300 K property has no reference entry and should keep this id.
    un_indexed_id = evaluator_data_set.properties[2].id

    reindex_data_set(evaluator_data_set, data_set)

    assert evaluator_data_set.properties[0].id == "2"
    assert evaluator_data_set.properties[1].id == "1"
    assert evaluator_data_set.properties[2].id == un_indexed_id

    data_set_collection = DataSetCollection(
        data_sets=[
            make_data_set("0", make_entry(3, ("O", 0.5), ("C", 0.5))),
            make_data_set("1", make_entry(4, ("O", 1.0))),
        ]
    )

    reindex_data_set(evaluator_data_set, data_set_collection)

    assert evaluator_data_set.properties[0].id == "4"
    assert evaluator_data_set.properties[1].id == "3"
    assert evaluator_data_set.properties[2].id == un_indexed_id