def estimated_reference_sets():
    """Build a matching pair of estimated and reference data sets.

    Both sets describe a density and an enthalpy of mixing for the binary
    substance made of the SMILES ``"O"`` and ``"CC=O"``. The estimated
    properties carry the string ids ``"1"`` and ``"2"`` while the reference
    entries carry the integer ids ``1`` and ``2``.

    Returns
    -------
    tuple
        ``(estimated_data_set, reference_data_set)`` where the first item is a
        ``PhysicalPropertyDataSet`` and the second a ``DataSet`` with id
        ``"ref"``.
    """

    # Each factory returns *fresh* objects on every call so that nothing is
    # shared between the two properties / the two entries.
    def _liquid_mixture_kwargs():
        return {
            "thermodynamic_state": ThermodynamicState(
                298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
            ),
            "phase": PropertyPhase.Liquid,
            "substance": Substance.from_components("O", "CC=O"),
        }

    def _equimolar_components():
        return [
            Component(smiles="O", mole_fraction=0.5),
            Component(smiles="CC=O", mole_fraction=0.5),
        ]

    def _reference_entry(entry_id, property_type, value, std_error):
        return DataSetEntry(
            id=entry_id,
            property_type=property_type,
            temperature=298.15,
            pressure=101.325,
            value=value,
            std_error=std_error,
            doi=" ",
            components=_equimolar_components(),
        )

    # --- Estimated (evaluator) side -------------------------------------
    density = Density(
        **_liquid_mixture_kwargs(),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
    )
    density.id = "1"

    # NOTE: the value is given in kcal / mol but the uncertainty in kJ / mol;
    # 1 kcal / mol == 4.184 kJ / mol, which is the reference value used below.
    enthalpy = EnthalpyOfMixing(
        **_liquid_mixture_kwargs(),
        value=1.0 * unit.kilocalorie / unit.mole,
        uncertainty=0.1 * unit.kilojoule / unit.mole,
    )
    enthalpy.id = "2"

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(density, enthalpy)

    # --- Reference side -------------------------------------------------
    # The bare numbers are presumably expressed in the entries' default units
    # (TODO confirm against ``DataSetEntry.default_units``).
    reference_entries = [
        _reference_entry(1, "Density", 0.001, 0.0001),
        _reference_entry(2, "EnthalpyOfMixing", 4.184, 0.1),
    ]

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=reference_entries,
    )

    return estimated_data_set, reference_data_set
def test_pandas_string_id():
    """Check that a non-integer ``Id`` in a series yields an entry without an id."""

    # Serialise the single entry of a freshly created data set ...
    source_entry = create_data_set("data-set-1").entries[0]
    series = source_entry.to_series()

    # ... overwrite its id with a plain string ...
    series["Id"] = "String"

    # ... and make sure the id does not survive the round trip.
    rebuilt_entry = DataSetEntry.from_series(series)
    assert rebuilt_entry.id is None
def property_type_to_title(property_type: str, n_components: int):
    """Build a LaTeX-flavoured title for a type of physical property.

    Parameters
    ----------
    property_type
        The name of the property type, e.g. ``"Density"``. It must be a key of
        ``DataSetEntry.default_units()``.
    n_components
        The number of components the property was measured for. Anything
        greater than one appends ``" (x)"`` to the symbol, except for property
        types whose name contains ``"FreeEnergy"``.

    Returns
    -------
    str
        A string of the form ``"$<symbol>$ <unit>"``. The symbol falls back to
        ``property_type`` itself when no abbreviation is known.

    Raises
    ------
    KeyError
        If ``property_type`` has no default unit.
    """

    # ``openff.evaluator`` is optional; without it the raw default unit string
    # is used verbatim.
    try:
        from openff.evaluator import unit
    except ImportError:
        unit = None

    latex_symbols = {
        "Density": r"\rho",
        "DielectricConstant": r"\epsilon",
        "EnthalpyOfMixing": r"H_{mix}",
        "EnthalpyOfVaporization": r"H_{vap}",
        "ExcessMolarVolume": r"V_{ex}",
        "SolvationFreeEnergy": r"G_{solv}",
    }

    unit_string = DataSetEntry.default_units()[property_type]

    if unit is not None:
        property_unit = unit.Unit(unit_string)

        # Dimensionless properties get no unit suffix at all; everything else
        # is rendered with the short, pretty unit format.
        if property_unit == unit.dimensionless:
            unit_string = ""
        else:
            unit_string = f" ({property_unit:~P})"

    symbol = latex_symbols.get(property_type, property_type)

    is_free_energy = "FreeEnergy" in property_type

    if not is_free_energy and n_components > 1:
        symbol = f"{symbol} (x)"

    return f"${symbol}$ {unit_string}"
def test_reindex_data_set_no_mole_fraction():
    """Check ``reindex_data_set`` when a component is given as an exact amount.

    A single solvation free energy (solvent ``"O"`` as a mole fraction, solute
    ``"CO"`` as an exact amount of one) must pick up the id of the one matching
    reference entry.
    """

    setup_timestamp_logging(logging.INFO)

    # Evaluator-side substance: mole-fraction solvent + exact-amount solute.
    solvent = substances.Component(smiles="O")
    solute = substances.Component(
        smiles="CO", role=substances.Component.Role.Solute
    )

    substance = substances.Substance()
    substance.add_component(solvent, amount=substances.MoleFraction(1.0))
    substance.add_component(solute, amount=substances.ExactAmount(1))

    evaluator_data_set = PhysicalPropertyDataSet()

    state = ThermodynamicState(
        temperature=298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )
    free_energy = SolvationFreeEnergy(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=substance,
        value=1.0 * SolvationFreeEnergy.default_unit(),
        uncertainty=1.0 * SolvationFreeEnergy.default_unit(),
    )
    evaluator_data_set.add_properties(free_energy)

    # Reference-side entry describing the same measurement.
    reference_entry = DataSetEntry(
        id=1,
        property_type="SolvationFreeEnergy",
        temperature=298.15,
        pressure=101.325,
        value=1.0,
        std_error=1.0,
        doi=" ",
        components=[
            Component(smiles="O", mole_fraction=1.0),
            Component(
                smiles="CO", mole_fraction=0.0, exact_amount=1, role="Solute"
            ),
        ],
    )
    data_set = DataSet(
        id="data-set",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[reference_entry],
    )

    reindex_data_set(evaluator_data_set, data_set)

    (reindexed_property,) = evaluator_data_set.properties[:1]
    assert reindexed_property.id == "1"
def create_data_set(data_set_id: str, entry_id: Optional[int] = None):
    """Create a one-author data set holding a single density entry.

    The entry is made of two components: a solvent with SMILES ``"O"``
    (mole fraction of one) and a solute with SMILES ``"CO"`` (exact amount of
    one).

    Parameters
    ----------
    data_set_id: str
        The id to assign to the data set.
    entry_id: int, optional
        The id to assign to the one data entry.

    Returns
    -------
    DataSet
    """

    authors = [create_author()]

    solvent = Component(
        smiles="O", mole_fraction=1.0, exact_amount=0, role="Solvent"
    )
    solute = Component(
        smiles="CO", mole_fraction=0.0, exact_amount=1, role="Solute"
    )

    density_entry = DataSetEntry(
        id=entry_id,
        property_type="Density",
        temperature=298.15,
        pressure=101.325,
        value=1.0,
        std_error=0.1,
        doi=" ",
        components=[solvent, solute],
    )

    return DataSet(
        id=data_set_id,
        description=" ",
        authors=authors,
        entries=[density_entry],
    )
def test_analysed_result_from_evaluator():
    """Tests the ``DataSetResult.from_evaluator`` function.

    Every reference entry has the value ``expected_mean`` while the estimated
    values are drawn from a normal distribution centred on that mean, so the
    RMSE over the whole set should approximate the distribution's standard
    deviation.
    """

    expected_mean = 0.0
    expected_std = numpy.random.rand() + 1.0

    samples = numpy.random.normal(expected_mean, expected_std, 1000)

    estimated_properties = []
    reference_entries = []

    # Ids start at one; the estimated side stores them as strings and the
    # reference side as integers.
    for property_id, sample in enumerate(samples, start=1):

        estimated = Density(
            thermodynamic_state=ThermodynamicState(
                298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
            ),
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("O"),
            value=sample * Density.default_unit(),
            uncertainty=0.0 * Density.default_unit(),
        )
        estimated.id = str(property_id)
        estimated_properties.append(estimated)

        reference_entries.append(
            DataSetEntry(
                id=property_id,
                property_type="Density",
                temperature=298.15,
                pressure=101.325,
                value=expected_mean,
                std_error=None,
                doi=" ",
                components=[Component(smiles="O", mole_fraction=1.0)],
            )
        )

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(*estimated_properties)

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=reference_entries,
    )

    analysed_results = DataSetResult.from_evaluator(
        reference_data_set=reference_data_set,
        estimated_data_set=estimated_data_set,
        analysis_environments=[ChemicalEnvironment.Aqueous],
        statistic_types=[StatisticType.RMSE],
        bootstrap_iterations=1000,
    )

    assert len(analysed_results.result_entries) == len(estimated_properties)

    # The statistic without a category is the one computed over all entries.
    overall = next(
        statistic
        for statistic in analysed_results.statistic_entries
        if statistic.category is None
    )

    assert overall.property_type == "Density"
    assert overall.n_components == 1
    assert overall.statistic_type == StatisticType.RMSE
    assert numpy.isclose(overall.value, expected_std, rtol=0.10)
def test_reindex_data_set():
    """Check that ``reindex_data_set`` assigns the expected ids.

    Three densities are re-indexed, first against a single ``DataSet`` and
    then against a ``DataSetCollection``. The first two must take the ids of
    the reference entries they are asserted to match, while the third (at
    300 K, for which no reference entry exists) must keep its original id.
    """

    setup_timestamp_logging(logging.INFO)

    # Small local builders; each call returns brand new objects.
    def _density(temperature, *smiles):
        return Density(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=1.0 * unit.atmosphere,
            ),
            phase=PropertyPhase.Liquid,
            substance=substances.Substance.from_components(*smiles),
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
        )

    def _binary_entry(entry_id):
        return _reference_entry(
            entry_id,
            [
                Component(smiles="O", mole_fraction=0.5),
                Component(smiles="C", mole_fraction=0.5),
            ],
        )

    def _pure_entry(entry_id):
        return _reference_entry(
            entry_id, [Component(smiles="O", mole_fraction=1.0)]
        )

    def _reference_entry(entry_id, components):
        return DataSetEntry(
            id=entry_id,
            property_type="Density",
            temperature=298.15,
            pressure=101.325,
            value=1.0,
            std_error=1.0,
            doi=" ",
            components=components,
        )

    def _reference_set(set_id, entries):
        return DataSet(
            id=set_id,
            description=" ",
            authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
            entries=entries,
        )

    evaluator_data_set = PhysicalPropertyDataSet()
    evaluator_data_set.add_properties(
        _density(298.15, "O"),
        _density(298.15, "C", "O"),
        _density(300.0, "C", "O"),
    )

    # --- Re-index against a single data set -----------------------------
    data_set = _reference_set("data-set", [_binary_entry(1), _pure_entry(2)])

    un_indexed_id = evaluator_data_set.properties[2].id

    reindex_data_set(evaluator_data_set, data_set)

    first, second, third = evaluator_data_set.properties[:3]
    assert first.id == "2"
    assert second.id == "1"
    assert third.id == un_indexed_id

    # --- Re-index against a collection of data sets ---------------------
    data_set_collection = DataSetCollection(
        data_sets=[
            _reference_set("0", [_binary_entry(3)]),
            _reference_set("1", [_pure_entry(4)]),
        ]
    )

    reindex_data_set(evaluator_data_set, data_set_collection)

    first, second, third = evaluator_data_set.properties[:3]
    assert first.id == "4"
    assert second.id == "3"
    assert third.id == un_indexed_id
def _evaluator_to_results_entries(
    cls,
    reference_data_set: Union[DataSet, DataSetCollection],
    estimated_data_set: "PhysicalPropertyDataSet",
    analysis_environments: List[ChemicalEnvironment],
) -> Tuple[List[DataSetResultEntry], pandas.DataFrame]:
    """Pair estimated properties with their reference entries by id.

    Reference entries and estimated properties are matched on ``int(id)``.
    Reference entries without an estimated counterpart are skipped with a
    warning; estimated properties without a reference counterpart are ignored.

    Parameters
    ----------
    cls
        Unused by this function.
    reference_data_set
        The data set (or collection of data sets) holding the reference
        entries. Every entry id must be convertible with ``int``.
    estimated_data_set
        The iterable of estimated properties. Every property id must be
        convertible with ``int`` and every ``value`` / ``uncertainty`` must
        support ``.to(unit).magnitude``.
    analysis_environments
        Passed through unchanged to ``components_to_categories``.

    Returns
    -------
    tuple
        A list with one ``DataSetResultEntry`` per matched reference entry,
        and a ``pandas.DataFrame`` with one row per (matched entry, category)
        pair. An entry without categories contributes a single row whose
        ``"Category"`` is ``None``. The frame has the columns
        ``"Property Type"``, ``"N Components"``, ``"Reference Value"``,
        ``"Reference Std"``, ``"Estimated Value"``, ``"Estimated Std"`` and
        ``"Category"`` (it has no columns when nothing was matched).

    Raises
    ------
    NotImplementedError
        If ``reference_data_set`` is neither a ``DataSet`` nor a
        ``DataSetCollection``.
    AssertionError
        If a matched pair disagrees on the type of property or on the number
        of components.
    """

    from openff.evaluator.datasets import PhysicalProperty

    if isinstance(reference_data_set, DataSet):
        reference_entries = reference_data_set.entries
    elif isinstance(reference_data_set, DataSetCollection):
        reference_entries = [
            entry
            for data_set in reference_data_set.data_sets
            for entry in data_set.entries
        ]
    else:
        raise NotImplementedError()

    # Both look-ups are keyed by the *integer* form of the id so that the
    # integer reference ids and the estimated property ids compare equal.
    reference_entries_by_id: Dict[int, DataSetEntry] = {
        int(entry.id): entry for entry in reference_entries
    }
    estimated_entries_by_id: Dict[int, PhysicalProperty] = {
        int(x.id): x for x in estimated_data_set
    }

    results_entries = []
    results_rows = []

    internal_units = DataSetEntry.default_units()

    for identifier, reference_entry in reference_entries_by_id.items():

        if identifier not in estimated_entries_by_id:

            logger.warning(
                f"The property with id={identifier} appears in the reference data "
                f"set but not in the estimated set."
            )
            continue

        estimated_entry = estimated_entries_by_id[identifier]

        # Check that at the very least the two types of property are of the same
        # type and were measured for the same number of components. These are
        # raised explicitly (rather than via ``assert``) so that the checks are
        # not stripped when running with ``python -O``.
        estimated_type = estimated_entry.__class__.__name__

        if reference_entry.property_type != estimated_type:
            raise AssertionError(
                f"The property with id={identifier} is a "
                f"{reference_entry.property_type} in the reference data set but "
                f"a {estimated_type} in the estimated set."
            )

        n_reference_components = len(reference_entry.components)
        n_estimated_components = len(estimated_entry.substance)

        if n_reference_components != n_estimated_components:
            raise AssertionError(
                f"The property with id={identifier} has "
                f"{n_reference_components} components in the reference data "
                f"set but {n_estimated_components} in the estimated set."
            )

        # Store the estimated value / uncertainty as bare numbers expressed in
        # the default unit registered for this type of property.
        internal_unit = internal_units[reference_entry.property_type]

        results_entry = DataSetResultEntry(
            reference_id=reference_entry.id,
            estimated_value=estimated_entry.value.to(internal_unit).magnitude,
            estimated_std_error=estimated_entry.uncertainty.to(
                internal_unit
            ).magnitude,
            categories=components_to_categories(
                reference_entry.components, analysis_environments
            ),
        )
        results_entries.append(results_entry)

        # An entry without categories still contributes one (un-categorised)
        # row to the frame.
        if len(results_entry.categories) == 0:
            row_categories = [None]
        else:
            row_categories = results_entry.categories

        results_rows.extend(
            {
                "Property Type": reference_entry.property_type,
                "N Components": n_reference_components,
                "Reference Value": reference_entry.value,
                "Reference Std": reference_entry.std_error,
                "Estimated Value": results_entry.estimated_value,
                "Estimated Std": results_entry.estimated_std_error,
                "Category": category,
            }
            for category in row_categories
        )

    results_frame = pandas.DataFrame(results_rows)

    return results_entries, results_frame