def test_filter_ionic_liquid():
    """Check ``FilterByIonicLiquid`` on a data set with one ionic entry.

    Two densities which differ only in their substance (``[Na+].[Cl-]`` and
    ``C``) are stored in a data set, converted to a data frame and passed
    through the filter with a default schema. A single row must remain.
    """

    state = ThermodynamicState(
        temperature=298.15 * unit.kelvin,
        pressure=101.325 * unit.kilopascal,
    )

    def _density_of(smiles):
        # Both entries share the same state, phase, value and source so that
        # the substance is the only thing distinguishing them.
        return Density(
            thermodynamic_state=state,
            phase=PropertyPhase.Liquid,
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=Substance.from_components(smiles),
        )

    # Ensure ionic liquids are filtered.
    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(_density_of("[Na+].[Cl-]"), _density_of("C"))

    remaining_rows = FilterByIonicLiquid.apply(
        data_set.to_pandas(),
        FilterByIonicLiquidSchema(),
    )

    assert len(remaining_rows) == 1
def test_from_pandas():
    """A test to ensure that data sets may be created from pandas objects.

    A data set with three different property types is round-tripped through
    ``to_pandas`` / ``from_pandas`` and every recreated property is compared,
    field by field, with the original which carries the same id.
    """

    state = ThermodynamicState(
        temperature=298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )

    original_data_set = PhysicalPropertyDataSet()

    # Only the density carries an uncertainty and a DOI; the other two
    # properties are identified by a plain reference string instead.
    density = Density(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("CO", "O"),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=1.0 * unit.kilogram / unit.meter**3,
        source=MeasurementSource(doi="10.5281/zenodo.596537"),
    )
    enthalpy = EnthalpyOfVaporization(
        thermodynamic_state=state,
        phase=PropertyPhase.from_string("Liquid + Gas"),
        substance=Substance.from_components("C"),
        value=2.0 * unit.kilojoule / unit.mole,
        source=MeasurementSource(reference="2"),
    )
    dielectric = DielectricConstant(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("C"),
        value=3.0 * unit.dimensionless,
        source=MeasurementSource(reference="3"),
    )
    original_data_set.add_properties(density, enthalpy, dielectric)

    recreated_data_set = PhysicalPropertyDataSet.from_pandas(
        original_data_set.to_pandas()
    )

    assert len(original_data_set) == len(recreated_data_set)

    for expected in original_data_set:

        # Pair the properties up by id rather than by position.
        actual = next(
            candidate
            for candidate in recreated_data_set
            if candidate.id == expected.id
        )

        assert expected.thermodynamic_state == actual.thermodynamic_state
        assert expected.phase == actual.phase
        assert expected.substance == actual.substance

        assert numpy.isclose(expected.value, actual.value)

        uncertainty_is_undefined = expected.uncertainty == UNDEFINED

        if uncertainty_is_undefined:
            assert expected.uncertainty == actual.uncertainty
        else:
            assert numpy.isclose(expected.uncertainty, actual.uncertainty)

        assert expected.source.doi == actual.source.doi
        assert expected.source.reference == actual.source.reference
def data_frame() -> pandas.DataFrame:
    """Build a data frame of pure and binary liquid-phase entries.

    Every combination of two temperatures, two pressures, two property types
    (``Density`` and ``EnthalpyOfMixing``) and four mole fraction tuples is
    visited, and for each combination one entry is created per SMILES tuple
    which has the same number of components as the mole fraction tuple.

    Returns
    -------
        The pandas representation of the data set holding all of the entries.
    """

    temperatures = [298.15, 318.15]
    pressures = [101.325, 101.0]
    property_types = [Density, EnthalpyOfMixing]

    # The pure composition appears twice, so each pure entry is added twice.
    compositions = [(1.0,), (1.0,), (0.25, 0.75), (0.75, 0.25)]

    # The SMILES tuples to use, keyed by the number of components.
    smiles_by_size = {
        1: [("C(F)(Cl)(Br)",), ("C",)],
        2: [("CO", "C"), ("C", "CO")],
    }

    def _entry(temperature, pressure, property_type, smiles_tuple, composition):
        substance = Substance()

        for pattern, fraction in zip(smiles_tuple, composition):
            substance.add_component(Component(pattern), MoleFraction(fraction))

        return property_type(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=pressure * unit.kilopascal,
            ),
            phase=PropertyPhase.Liquid,
            value=1.0 * property_type.default_unit(),
            uncertainty=1.0 * property_type.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=substance,
        )

    entries = []

    for temperature in temperatures:
        for pressure in pressures:
            for property_type in property_types:
                for composition in compositions:
                    for smiles_tuple in smiles_by_size[len(composition)]:
                        entries.append(
                            _entry(
                                temperature,
                                pressure,
                                property_type,
                                smiles_tuple,
                                composition,
                            )
                        )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*entries)

    return data_set.to_pandas()
def data_frame() -> pandas.DataFrame:
    """Build a data frame of entries measured at nearly identical temperatures.

    For each of two temperatures and each of two property types (``Density``
    and ``EnthalpyOfVaporization``) a pair of entries is created: one at the
    exact temperature and one at the temperature shifted by a small random
    offset. The offset has a magnitude between 0.051 K and 0.101 K, and is
    positive for the first property type and negative for the second.

    Returns
    -------
        The pandas representation of the data set holding all of the entries.
    """

    entries = []

    for temperature in (303.15, 298.15):

        for index, property_type in enumerate((Density, EnthalpyOfVaporization)):

            # A single random number is drawn per temperature / property pair.
            offset = (numpy.random.rand() / 2.0 + 0.51) / 10.0

            if index != 0:
                offset *= -1

            # The unperturbed entry is always added before the perturbed one.
            for entry_temperature in (temperature, temperature + offset):

                entries.append(
                    property_type(
                        thermodynamic_state=ThermodynamicState(
                            temperature=entry_temperature * unit.kelvin,
                            pressure=101.325 * unit.kilopascal,
                        ),
                        phase=PropertyPhase.Liquid,
                        value=1.0 * property_type.default_unit(),
                        uncertainty=1.0 * property_type.default_unit(),
                        source=MeasurementSource(doi=" "),
                        substance=Substance.from_components("C"),
                    )
                )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*entries)

    return data_set.to_pandas()
def data_frame() -> pandas.DataFrame:
    """Build a data frame of three methane (``C``) liquid densities.

    The first entry acts as a reference point, the second differs from it only
    in temperature and the third differs from it only in pressure.

    Returns
    -------
        The pandas representation of the data set holding the three entries.
    """

    # (temperature in kelvin, pressure in kilopascal) of each entry, in order.
    state_points = [
        (298.15, 101.325),
        (305.15, 101.325),
        (298.15, 105.325),
    ]

    data_set = PhysicalPropertyDataSet()

    densities = [
        Density(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=pressure * unit.kilopascal,
            ),
            phase=PropertyPhase.Liquid,
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=Substance.from_components("C"),
        )
        for temperature, pressure in state_points
    ]

    data_set.add_properties(*densities)

    return data_set.to_pandas()
def main():
    """Convert the JSON data sets in ``raw_data`` into ``raw_data_v2``.

    Each named data set is loaded from ``raw_data/<name>.json``, checked to be
    a serialized ``propertyestimator`` ``PhysicalPropertyDataSet``, and every
    entry it contains is rebuilt as a new property object. The rebuilt data
    set is then written to ``raw_data_v2`` as both a JSON and a CSV file.

    Raises
    ------
    NotImplementedError
        If an amount or a source of an unrecognised type is encountered.
    """

    output_directory = "raw_data_v2"
    os.makedirs(output_directory, exist_ok=True)

    data_set_names = [
        "curated_data_set",
        "gaff 1.81",
        "gaff 2.11",
        "parsley 1.0.0",
        "smirnoff99frosst 1.1.0",
    ]

    for data_set_name in data_set_names:

        with open(os.path.join("raw_data", f"{data_set_name}.json")) as file:
            raw_data_set = json.load(file)

        assert (
            raw_data_set["@type"]
            == "propertyestimator.datasets.datasets.PhysicalPropertyDataSet"
        )

        physical_properties = []

        for raw_entries in raw_data_set["properties"].values():

            for raw_entry in raw_entries:

                # Extract the substance this entry was measured for.
                substance = Substance()

                for raw_component in raw_entry["substance"]["components"]:

                    component = Component(
                        smiles=raw_component["smiles"],
                        role=Component.Role[raw_component["role"]["value"]],
                    )

                    # The amounts are stored separately, keyed by SMILES.
                    raw_amounts = raw_entry["substance"]["amounts"][
                        raw_component["smiles"]
                    ]

                    for raw_amount in raw_amounts["value"]:

                        amount_type = raw_amount["@type"]

                        if (
                            amount_type
                            == "propertyestimator.substances.Substance->MoleFraction"
                        ):
                            amount = MoleFraction(raw_amount["value"])
                        elif (
                            amount_type
                            == "propertyestimator.substances.Substance->ExactAmount"
                        ):
                            amount = ExactAmount(raw_amount["value"])
                        else:
                            raise NotImplementedError()

                        substance.add_component(component, amount)

                # Extract the source of the property
                raw_source = raw_entry["source"]

                if (
                    raw_source["@type"]
                    == "propertyestimator.properties.properties.CalculationSource"
                ):
                    source = CalculationSource(fidelity=raw_source["fidelity"])
                elif (
                    raw_source["@type"]
                    == "propertyestimator.properties.properties.MeasurementSource"
                ):
                    source = MeasurementSource(
                        doi=correct_doi(raw_source["reference"])
                    )
                else:
                    raise NotImplementedError()

                # Generate the new property object. The class is looked up
                # using the final segment of the serialized type name.
                property_class = getattr(
                    properties, raw_entry["@type"].split(".")[-1]
                )

                raw_state = raw_entry["thermodynamic_state"]
                raw_temperature = raw_state["temperature"]

                temperature = raw_temperature["value"] * unit.Unit(
                    raw_temperature["unit"]
                )
                pressure = raw_state["pressure"]["value"] * unit.Unit(
                    raw_state["pressure"]["unit"]
                )

                thermodynamic_state = ThermodynamicState(
                    temperature=temperature,
                    pressure=pressure,
                )
                phase = PropertyPhase(raw_entry["phase"])

                value = raw_entry["value"]["value"] * unit.Unit(
                    raw_entry["value"]["unit"]
                )

                # The stored uncertainty is only carried over for entries
                # which do not originate from a measurement source.
                if isinstance(source, MeasurementSource):
                    uncertainty = None
                else:
                    uncertainty = raw_entry["uncertainty"]["value"] * unit.Unit(
                        raw_entry["uncertainty"]["unit"]
                    )

                physical_property = property_class(
                    thermodynamic_state=thermodynamic_state,
                    phase=phase,
                    substance=substance,
                    value=value,
                    uncertainty=uncertainty,
                    source=source,
                )
                # Retain the id of the original entry.
                physical_property.id = raw_entry["id"]

                physical_properties.append(physical_property)

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*physical_properties)

        data_set.json(
            os.path.join(output_directory, f"{data_set_name}.json"), format=True
        )
        data_set.to_pandas().to_csv(
            os.path.join(output_directory, f"{data_set_name}.csv")
        )
def test_filter_by_environment_per_component():
    """Test that the ``FilterByEnvironments`` filter works well with the
    ``per_component_environments`` schema option"""

    aqueous = ChemicalEnvironment.Aqueous
    aldehyde = ChemicalEnvironment.Aldehyde

    def _split_by_size(frame):
        """Returns the pure and the binary rows of a frame respectively."""
        n_components = frame["N Components"]
        return frame[n_components == 1], frame[n_components == 2]

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(
        _build_entry("O"),
        _build_entry("C"),
        _build_entry("C", "O"),
        _build_entry("O", "CC(=O)CC=O"),
        _build_entry("CC(=O)CC=O", "O"),
    )

    data_frame = data_set.to_pandas()

    # Retain only aqueous functionality
    aqueous_only_schema = FilterByEnvironmentsSchema(
        per_component_environments={
            1: [[aqueous]],
            2: [[aqueous], [aqueous]],
        },
        at_least_one_environment=True,
    )
    filtered_frame = FilterByEnvironments.apply(data_frame, aqueous_only_schema)

    assert len(filtered_frame) == 1
    assert filtered_frame["N Components"].max() == 1
    assert set(filtered_frame["Component 1"].unique()) == {"O"}

    # Retain any pure component data, and only aqueous aldehyde mixture data.
    lenient_schema = FilterByEnvironmentsSchema(
        per_component_environments={2: [[aldehyde], [aqueous]]},
        at_least_one_environment=True,
    )
    filtered_frame = FilterByEnvironments.apply(data_frame, lenient_schema)

    assert len(filtered_frame) == 4

    assert filtered_frame["N Components"].min() == 1
    assert filtered_frame["N Components"].max() == 2

    pure_data, binary_data = _split_by_size(filtered_frame)

    assert len(pure_data) == 2
    assert set(pure_data["Component 1"].unique()) == {"O", "C"}

    assert len(binary_data) == 2
    assert (
        set(binary_data["Component 1"].unique())
        | set(binary_data["Component 2"].unique())
    ) == {"CC(=O)CC=O", "O"}

    # Repeat the last test but this time make the filtering strict.
    strict_schema = FilterByEnvironmentsSchema(
        per_component_environments={2: [[aldehyde], [aqueous]]},
        at_least_one_environment=False,
        strictly_specified_environments=True,
    )
    filtered_frame = FilterByEnvironments.apply(data_frame, strict_schema)

    assert len(filtered_frame) == 2
    assert filtered_frame["N Components"].max() == 1
    assert set(filtered_frame["Component 1"].unique()) == {"O", "C"}

    # Strict filtering once more, this time with three environments listed
    # for the first component.
    extended_strict_schema = FilterByEnvironmentsSchema(
        per_component_environments={
            2: [
                [
                    aldehyde,
                    ChemicalEnvironment.Ketone,
                    ChemicalEnvironment.Carbonyl,
                ],
                [aqueous],
            ]
        },
        at_least_one_environment=False,
        strictly_specified_environments=True,
    )
    filtered_frame = FilterByEnvironments.apply(data_frame, extended_strict_schema)

    assert len(filtered_frame) == 4

    assert filtered_frame["N Components"].min() == 1
    assert filtered_frame["N Components"].max() == 2

    pure_data, binary_data = _split_by_size(filtered_frame)

    assert len(pure_data) == 2
    assert set(pure_data["Component 1"].unique()) == {"O", "C"}

    assert len(binary_data) == 2
def test_filter_by_environment_list():
    """Test that the ``FilterByEnvironments`` filter works well with the
    ``environments`` schema option"""

    aqueous = ChemicalEnvironment.Aqueous
    aldehyde = ChemicalEnvironment.Aldehyde

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(
        _build_entry("O"),
        _build_entry("C"),
        _build_entry("C", "O"),
        _build_entry("O", "CC(=O)CC=O"),
        _build_entry("CC(=O)CC=O", "O"),
    )

    data_frame = data_set.to_pandas()

    # Retain only aqueous functionality
    aqueous_only_schema = FilterByEnvironmentsSchema(
        environments=[aqueous], at_least_one_environment=True
    )
    filtered_frame = FilterByEnvironments.apply(data_frame, aqueous_only_schema)

    assert len(filtered_frame) == 1
    assert filtered_frame["N Components"].max() == 1
    assert set(filtered_frame["Component 1"].unique()) == {"O"}

    # Retain both aqueous and aldehyde functionality but not strictly
    lenient_schema = FilterByEnvironmentsSchema(
        environments=[aqueous, aldehyde],
        at_least_one_environment=True,
    )
    filtered_frame = FilterByEnvironments.apply(data_frame, lenient_schema)

    assert len(filtered_frame) == 3

    component_counts = filtered_frame["N Components"]

    assert component_counts.min() == 1
    assert component_counts.max() == 2

    pure_data = filtered_frame[component_counts == 1]
    binary_data = filtered_frame[component_counts == 2]

    assert len(pure_data) == 1
    assert set(pure_data["Component 1"].unique()) == {"O"}

    assert len(binary_data) == 2
    assert (
        set(binary_data["Component 1"].unique())
        | set(binary_data["Component 2"].unique())
    ) == {"CC(=O)CC=O", "O"}

    # Ensure enforcing the strict behaviour correctly filters out the
    # combined aldehyde and ketone functionality when only aldehyde and
    # aqueous is permitted.
    strict_schema = FilterByEnvironmentsSchema(
        environments=[aqueous, aldehyde],
        at_least_one_environment=False,
        strictly_specified_environments=True,
    )
    filtered_frame = FilterByEnvironments.apply(data_frame, strict_schema)

    assert len(filtered_frame) == 1
    assert filtered_frame["N Components"].max() == 1
    assert set(filtered_frame["Component 1"].unique()) == {"O"}
def test_to_pandas():
    """A test to ensure that data sets are convertable to pandas objects.

    Pure component densities and dielectric constants, followed by binary
    enthalpies of mixing and excess molar volumes, are stored at three
    temperatures each. The resulting frame must expose a set of required
    columns and have the expected shape both before and after the columns
    which only contain missing values are dropped.
    """

    source = CalculationSource("Dummy", {})

    pure_substance = Substance.from_components("C")
    binary_substance = Substance.from_components("C", "O")

    data_set = PhysicalPropertyDataSet()

    temperatures = [298 * unit.kelvin, 300 * unit.kelvin, 302 * unit.kelvin]

    # All of the pure entries are added before any of the binary ones.
    for temperature in temperatures:

        pure_kwargs = dict(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature, pressure=1.0 * unit.atmosphere
            ),
            phase=PropertyPhase.Liquid,
            substance=pure_substance,
            source=source,
        )

        density_property = Density(
            value=1 * unit.gram / unit.milliliter,
            uncertainty=0.11 * unit.gram / unit.milliliter,
            **pure_kwargs,
        )
        dielectric_property = DielectricConstant(
            value=1 * unit.dimensionless,
            uncertainty=0.11 * unit.dimensionless,
            **pure_kwargs,
        )

        data_set.add_properties(density_property)
        data_set.add_properties(dielectric_property)

    for temperature in temperatures:

        binary_kwargs = dict(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature, pressure=1.0 * unit.atmosphere
            ),
            phase=PropertyPhase.Liquid,
            substance=binary_substance,
            source=source,
        )

        enthalpy_property = EnthalpyOfMixing(
            value=1 * unit.kilojoules / unit.mole,
            uncertainty=0.11 * unit.kilojoules / unit.mole,
            **binary_kwargs,
        )
        excess_property = ExcessMolarVolume(
            value=1 * unit.meter**3 / unit.mole,
            uncertainty=0.11 * unit.meter**3 / unit.mole,
            **binary_kwargs,
        )

        data_set.add_properties(enthalpy_property)
        data_set.add_properties(excess_property)

    data_set_pandas = data_set.to_pandas()

    required_columns = [
        "Id",
        "Temperature (K)",
        "Pressure (kPa)",
        "Phase",
        "N Components",
        "Source",
        "Component 1",
        "Role 1",
        "Mole Fraction 1",
        "Exact Amount 1",
        "Component 2",
        "Role 2",
        "Mole Fraction 2",
        "Exact Amount 2",
    ]

    missing_columns = [
        column for column in required_columns if column not in data_set_pandas
    ]
    assert not missing_columns

    assert data_set_pandas is not None
    assert data_set_pandas.shape == (12, 22)

    # Two of the columns are expected to contain no values at all.
    populated_columns = data_set_pandas.dropna(axis=1, how="all")
    assert populated_columns.shape == (12, 20)
def _apply(
    cls,
    data_frame: pandas.DataFrame,
    schema: ImportFreeSolvSchema,
    n_processes,
) -> pandas.DataFrame:
    """Append the FreeSolv solvation free energies to a data frame.

    The table returned by ``cls._download_free_solv`` is converted, row by
    row, into ``SolvationFreeEnergy`` entries, each of a single solute in
    water at 298.15 K and 101.325 kPa. The entries are then converted to a
    data frame which is concatenated onto the end of ``data_frame``.

    Parameters
    ----------
    data_frame
        The data frame to append the FreeSolv entries to.
    schema
        The schema of this component. It is not read by this method.
    n_processes
        Not read by this method.

    Returns
    -------
        A new data frame containing the rows of ``data_frame`` followed by
        the FreeSolv rows, with a fresh index.
    """
    from openff.evaluator import properties, substances, unit

    value_column = "experimental value (kcal/mol)"
    uncertainty_column = "experimental uncertainty (kcal/mol)"
    reference_column = (
        "experimental reference (original or paper this value was taken from)"
    )

    target_unit = properties.SolvationFreeEnergy.default_unit()

    # Convert the data frame into data rows.
    free_solv_data_frame = cls._download_free_solv()

    entries = []

    for _, row in free_solv_data_frame.iterrows():

        # Extract and standardize the SMILES pattern of the solute by
        # round-tripping it through a component object.
        solute_smiles = substances.Component(row["SMILES"].strip()).smiles

        # Build the substance.
        substance = Substance()
        substance.add_component(Component(smiles="O"), MoleFraction(1.0))
        substance.add_component(
            Component(smiles=solute_smiles, role=Component.Role.Solute),
            ExactAmount(1),
        )

        # Extract the value and uncertainty
        kcal_per_mol = unit.kilocalorie / unit.mole

        value = float(row[value_column]) * kcal_per_mol
        std_error = float(row[uncertainty_column]) * kcal_per_mol

        # Attempt to extract a DOI
        doi = cls._validate_doi(row[reference_column])

        entries.append(
            SolvationFreeEnergy(
                thermodynamic_state=ThermodynamicState(
                    temperature=298.15 * unit.kelvin,
                    pressure=101.325 * unit.kilopascal,
                ),
                phase=PropertyPhase.Liquid,
                substance=substance,
                value=value.to(target_unit),
                uncertainty=std_error.to(target_unit),
                source=MeasurementSource(doi=doi),
            )
        )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*entries)

    return pandas.concat(
        [data_frame, data_set.to_pandas()],
        ignore_index=True,
        sort=False,
    )