def test_filter_ionic_liquid(): thermodynamic_state = ThermodynamicState( temperature=298.15 * unit.kelvin, pressure=101.325 * unit.kilopascal, ) # Ensure ionic liquids are filtered. data_set = PhysicalPropertyDataSet() data_set.add_properties( Density( thermodynamic_state=thermodynamic_state, phase=PropertyPhase.Liquid, value=1.0 * Density.default_unit(), uncertainty=1.0 * Density.default_unit(), source=MeasurementSource(doi=" "), substance=Substance.from_components("[Na+].[Cl-]"), ), Density( thermodynamic_state=thermodynamic_state, phase=PropertyPhase.Liquid, value=1.0 * Density.default_unit(), uncertainty=1.0 * Density.default_unit(), source=MeasurementSource(doi=" "), substance=Substance.from_components("C"), ), ) data_frame = data_set.to_pandas() filtered_frame = FilterByIonicLiquid.apply( data_frame, FilterByIonicLiquidSchema(), ) assert len(filtered_frame) == 1
def results_to_pandas(force_fields: List[str]) -> pandas.DataFrame: """Imports the experimental and estimated data sets and stores them in a pandas data frame. """ # Load in the experimental data set. training_set = { physical_property.id: physical_property for physical_property in PhysicalPropertyDataSet.from_json( os.path.join("raw_data_v2", "curated_data_set.json")) } # Load in the results. estimated_results = { force_field: { physical_property.id: physical_property for physical_property in PhysicalPropertyDataSet.from_json( os.path.join("raw_data_v2", f"{force_field}.json")) } for force_field in force_fields } # Refactor the experimental and estimated data into a single data frame. data_rows = [] for property_id in training_set: experimental_property = training_set[property_id] estimated_properties = { force_field: estimated_results[force_field].get(property_id, None) for force_field in force_fields } if (any(estimated_property is None for estimated_property in estimated_properties.values()) or property_id in OUTLIERS): print(f"Skipping property {property_id}") continue data_rows.extend({ "Id": property_id, "Type": (f"{experimental_property.__class__.__name__}_" f"{len(experimental_property.substance)}"), "Force Field": force_field, "NIST ThermoML": experimental_property.value.to( experimental_property.default_unit()).magnitude, "Estimated": estimated_properties[force_field].value.to( experimental_property.default_unit()).magnitude, "Estimated Uncertainty": estimated_properties[force_field].uncertainty.to( experimental_property.default_unit()).magnitude, } for force_field in force_fields) return pandas.DataFrame(data_rows)
def create_filterable_data_set():
    """Creates a dummy data with a diverse set of properties to be
    filtered, namely:

        - a liquid density measured at 298 K and 0.5 atm with 1 component
          containing only carbon.
        - a gaseous dielectric measured at 288 K and 1 atm with 2 components
          containing only nitrogen.
        - a solid EoM measured at 308 K and 1.5 atm with 3 components
          containing only oxygen.

    Returns
    -------
    PhysicalPropertyDataSet
        The created data set.
    """
    # All three properties share a dummy calculation source.
    source = CalculationSource("Dummy", {})

    # Liquid density: 1 component, carbon only, 298 K / 0.5 atm.
    carbon_substance = create_dummy_substance(number_of_components=1,
                                              elements=["C"])

    density_property = Density(
        thermodynamic_state=ThermodynamicState(temperature=298 * unit.kelvin,
                                               pressure=0.5 * unit.atmosphere),
        phase=PropertyPhase.Liquid,
        substance=carbon_substance,
        value=1 * unit.gram / unit.milliliter,
        uncertainty=0.11 * unit.gram / unit.milliliter,
        source=source,
    )

    # Gaseous dielectric: 2 components, nitrogen only, 288 K / 1 atm.
    nitrogen_substance = create_dummy_substance(number_of_components=2,
                                                elements=["N"])

    dielectric_property = DielectricConstant(
        thermodynamic_state=ThermodynamicState(temperature=288 * unit.kelvin,
                                               pressure=1 * unit.atmosphere),
        phase=PropertyPhase.Gas,
        substance=nitrogen_substance,
        value=1 * unit.dimensionless,
        uncertainty=0.11 * unit.dimensionless,
        source=source,
    )

    # Solid enthalpy of mixing: 3 components, oxygen only, 308 K / 1.5 atm.
    oxygen_substance = create_dummy_substance(number_of_components=3,
                                              elements=["O"])

    enthalpy_property = EnthalpyOfMixing(
        thermodynamic_state=ThermodynamicState(temperature=308 * unit.kelvin,
                                               pressure=1.5 * unit.atmosphere),
        phase=PropertyPhase.Solid,
        substance=oxygen_substance,
        value=1 * unit.kilojoules / unit.mole,
        uncertainty=0.11 * unit.kilojoules / unit.mole,
        source=source,
    )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(density_property, dielectric_property,
                            enthalpy_property)

    return data_set
def estimated_reference_sets():
    """Create a matched pair of data sets: an evaluator
    ``PhysicalPropertyDataSet`` of 'estimated' properties and a ``DataSet``
    of corresponding 'reference' entries with the same integer ids.

    Returns
    -------
    tuple
        The estimated data set and the reference data set.
    """
    # Estimated binary density (id "1").
    estimated_density = Density(
        thermodynamic_state=ThermodynamicState(298.15 * unit.kelvin,
                                               pressure=1.0 * unit.atmosphere),
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("O", "CC=O"),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
    )
    estimated_density.id = "1"

    # Estimated enthalpy of mixing (id "2"). Note the value is given in
    # kcal / mol while the uncertainty is in kJ / mol.
    estimated_enthalpy = EnthalpyOfMixing(
        thermodynamic_state=ThermodynamicState(298.15 * unit.kelvin,
                                               pressure=1.0 * unit.atmosphere),
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("O", "CC=O"),
        value=1.0 * unit.kilocalorie / unit.mole,
        uncertainty=0.1 * unit.kilojoule / unit.mole,
    )
    estimated_enthalpy.id = "2"

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(estimated_density, estimated_enthalpy)

    # Reference density: 0.001 presumably corresponds to the estimated
    # 1.0 kg / m^3 expressed in g / ml — TODO confirm the expected unit.
    reference_density = DataSetEntry(
        id=1,
        property_type="Density",
        temperature=298.15,
        pressure=101.325,
        value=0.001,
        std_error=0.0001,
        doi=" ",
        components=[
            Component(smiles="O", mole_fraction=0.5),
            Component(smiles="CC=O", mole_fraction=0.5),
        ],
    )
    # Reference enthalpy: 4.184 kJ / mol matches the estimated 1 kcal / mol.
    reference_enthalpy = DataSetEntry(
        id=2,
        property_type="EnthalpyOfMixing",
        temperature=298.15,
        pressure=101.325,
        value=4.184,
        std_error=0.1,
        doi=" ",
        components=[
            Component(smiles="O", mole_fraction=0.5),
            Component(smiles="CC=O", mole_fraction=0.5),
        ],
    )

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[reference_density, reference_enthalpy],
    )

    return estimated_data_set, reference_data_set
def test_sources_substances():
    """Check that iterating ``sources`` / ``substances`` of a one-property
    data set yields that property's source and substance."""
    dummy_property = create_dummy_property(Density)

    dataset = PhysicalPropertyDataSet()
    dataset.add_properties(dummy_property)

    first_source = next(iter(dataset.sources))
    first_substance = next(iter(dataset.substances))

    assert first_source == dummy_property.source
    assert first_substance == dummy_property.substance
def to_evaluator(self) -> "PhysicalPropertyDataSet":
    """Convert this data set into an OpenFF Evaluator
    ``PhysicalPropertyDataSet`` by converting every entry in turn."""
    # Imported locally to avoid a hard module-level dependency.
    from openff.evaluator.datasets import PhysicalPropertyDataSet

    converted_entries = [entry.to_evaluator() for entry in self.entries]

    converted_set = PhysicalPropertyDataSet()
    converted_set.add_properties(*converted_entries)

    return converted_set
def test_reindex_data_set_no_mole_fraction():
    """Tests that the ``reindex_data_set`` function behaves as expected
    when exact amounts are present."""
    setup_timestamp_logging(logging.INFO)

    # Build a substance with a solvent (mole fraction) and a solute added
    # as an exact amount of one molecule.
    substance = substances.Substance()
    substance.add_component(substances.Component(smiles="O"),
                            amount=substances.MoleFraction(1.0))
    substance.add_component(
        substances.Component(smiles="CO", role=substances.Component.Role.Solute),
        amount=substances.ExactAmount(1),
    )

    evaluator_data_set = PhysicalPropertyDataSet()
    evaluator_data_set.add_properties(
        SolvationFreeEnergy(
            thermodynamic_state=ThermodynamicState(
                temperature=298.15 * unit.kelvin,
                pressure=1.0 * unit.atmosphere),
            phase=PropertyPhase.Liquid,
            substance=substance,
            value=1.0 * SolvationFreeEnergy.default_unit(),
            uncertainty=1.0 * SolvationFreeEnergy.default_unit(),
        ),
    )

    # A reference entry describing the same measurement; the solute appears
    # with mole_fraction=0.0 and exact_amount=1 to mirror the substance above.
    data_set = DataSet(
        id="data-set",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=[
            DataSetEntry(
                id=1,
                property_type="SolvationFreeEnergy",
                temperature=298.15,
                pressure=101.325,
                value=1.0,
                std_error=1.0,
                doi=" ",
                components=[
                    Component(smiles="O", mole_fraction=1.0),
                    Component(smiles="CO",
                              mole_fraction=0.0,
                              exact_amount=1,
                              role="Solute"),
                ],
            )
        ],
    )

    reindex_data_set(evaluator_data_set, data_set)

    # The evaluator property should have been re-assigned the reference id.
    assert evaluator_data_set.properties[0].id == "1"
def data_frame() -> pandas.DataFrame:
    """Build a pandas data frame covering the cross product of two
    temperatures, two pressures, two property types and several mole
    fraction tuples, with matching SMILES picked per component count."""
    temperatures = [298.15, 318.15]
    pressures = [101.325, 101.0]

    properties = [Density, EnthalpyOfMixing]

    # Pure (1-component) and binary (2-component) compositions.
    mole_fractions = [(1.0, ), (1.0, ), (0.25, 0.75), (0.75, 0.25)]
    # SMILES tuples keyed by the number of components they provide.
    smiles = {1: [("C(F)(Cl)(Br)", ), ("C", )], 2: [("CO", "C"), ("C", "CO")]}

    loop_variables = [(
        temperature,
        pressure,
        property_type,
        mole_fraction,
    ) for temperature in temperatures for pressure in pressures
                      for property_type in properties
                      for mole_fraction in mole_fractions]

    data_entries = []

    for temperature, pressure, property_type, mole_fraction in loop_variables:
        n_components = len(mole_fraction)

        for smiles_tuple in smiles[n_components]:
            substance = Substance()

            for smiles_pattern, x in zip(smiles_tuple, mole_fraction):
                substance.add_component(Component(smiles_pattern),
                                        MoleFraction(x))

            data_entries.append(
                property_type(
                    thermodynamic_state=ThermodynamicState(
                        temperature=temperature * unit.kelvin,
                        pressure=pressure * unit.kilopascal,
                    ),
                    phase=PropertyPhase.Liquid,
                    value=1.0 * property_type.default_unit(),
                    uncertainty=1.0 * property_type.default_unit(),
                    source=MeasurementSource(doi=" "),
                    substance=substance,
                ))

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*data_entries)

    return data_set.to_pandas()
def main(): setup_timestamp_logging() # Load in the force field force_field_path = "smirnoff99Frosst-1.1.0.offxml" force_field_source = SmirnoffForceFieldSource.from_path(force_field_path) # Load in the data set containing the pure and binary properties. data_set = PhysicalPropertyDataSet.from_json("pure_data_set.json") data_set.merge(PhysicalPropertyDataSet.from_json("binary_data_set.json")) # Set up a server object to run the calculations using. server = setup_server(backend_type=BackendType.LocalGPU, max_number_of_workers=1, port=8001) with server: # Request the estimates. property_estimator = EvaluatorClient( ConnectionOptions(server_port=8001)) for calculation_layer in ["SimulationLayer", "ReweightingLayer"]: options = RequestOptions() options.calculation_layers = [calculation_layer] parameter_gradient_keys = [ ParameterGradientKey(tag="vdW", smirks="[#6X4:1]", attribute="epsilon"), ParameterGradientKey(tag="vdW", smirks="[#6X4:1]", attribute="rmin_half"), ] request, _ = property_estimator.request_estimate( property_set=data_set, force_field_source=force_field_source, options=options, parameter_gradient_keys=parameter_gradient_keys, ) # Wait for the results. results, _ = request.results(True, 5) layer_name = re.sub(r"(?<!^)(?=[A-Z])", "_", calculation_layer).lower() results.json(f"pure_binary_{layer_name}.json", True)
def test_protocol_replacement(force_field_source, expected_protocol_type):
    """Check that the default request options replace the base build-system
    protocol with the one expected for the given force field source."""
    data_set = PhysicalPropertyDataSet()

    for property_type in property_types:
        data_set.add_properties(create_dummy_property(property_type))

    options = EvaluatorClient.default_request_options(data_set,
                                                      force_field_source)
    options_json = options.json(format=True)

    # The generic protocol must have been swapped out for the expected one.
    assert options_json.find('BaseBuildSystem"') < 0
    assert options_json.find(expected_protocol_type) >= 0
def test_analyze_non_integer_ids(mock_target, caplog):
    """Check that analysis still succeeds — but warns — when the reference
    data set contains property ids which cannot be cast to integers."""
    optimization, target, directory = mock_target

    reference_data_set: PhysicalPropertyDataSet = PhysicalPropertyDataSet.from_json(
        os.path.join(directory, "training-set.json"))
    assert len(reference_data_set
               ) == 1  # Sanity check in case this changes in future.

    # Give the single property a non-integer id and write it back out.
    reference_data_set.properties[0].id = "a"
    reference_data_set.json(os.path.join(directory, "training-set.json"))

    # Pretend the estimation reproduced the reference set exactly.
    results = RequestResult()
    results.estimated_properties = reference_data_set
    results.json(os.path.join(directory, "results.json"))

    with caplog.at_level(logging.WARNING):
        target_result = EvaluatorAnalysisFactory.analyze(
            optimization=optimization,
            target=target,
            target_directory=directory,
            result_directory=directory,
            reindex=False,
        )

    # A warning about the un-castable ids must have been logged.
    assert ("The reference data set contains properties "
            "with ids that cannot be cast to integers" in caplog.text)

    assert numpy.isclose(target_result.objective_function, 1.0)
    assert len(target_result.statistic_entries) == 1
def test_submission():
    """End-to-end smoke test: submit an empty data set to a local evaluator
    server and check that a well-formed result object comes back."""
    with tempfile.TemporaryDirectory() as directory:
        with temporarily_change_directory(directory):
            with DaskLocalCluster() as calculation_backend:
                # Spin up a server instance.
                server = EvaluatorServer(
                    calculation_backend=calculation_backend,
                    working_directory=directory,
                )

                with server:
                    # Connect a client.
                    client = EvaluatorClient()

                    # Submit an empty data set.
                    force_field_path = "smirnoff99Frosst-1.1.0.offxml"
                    force_field_source = SmirnoffForceFieldSource.from_path(
                        force_field_path)

                    request, error = client.request_estimate(
                        PhysicalPropertyDataSet(), force_field_source)
                    assert error is None
                    assert isinstance(request, Request)

                    result, error = request.results(polling_interval=0.01)
                    assert error is None
                    assert isinstance(result, RequestResult)
def data_frame() -> pandas.DataFrame:
    """Build a data frame where each (temperature, property type) pair has
    one entry at the exact temperature and one perturbed by a small random
    noise, to exercise temperature-matching logic."""
    temperatures = [303.15, 298.15]
    property_types = [Density, EnthalpyOfVaporization]

    data_set_entries = []

    def _temperature_noise():
        # A random offset in (0.051, 0.101) K.
        return (numpy.random.rand() / 2.0 + 0.51) / 10.0

    for temperature in temperatures:
        for index, property_type in enumerate(property_types):
            noise = _temperature_noise()
            # Perturb upward for the first property type, downward otherwise.
            noise *= 1 if index == 0 else -1

            # Entry at the exact temperature.
            data_set_entries.append(
                property_type(
                    thermodynamic_state=ThermodynamicState(
                        temperature=temperature * unit.kelvin,
                        pressure=101.325 * unit.kilopascal,
                    ),
                    phase=PropertyPhase.Liquid,
                    value=1.0 * property_type.default_unit(),
                    uncertainty=1.0 * property_type.default_unit(),
                    source=MeasurementSource(doi=" "),
                    substance=Substance.from_components("C"),
                ),
            )
            # Entry at the noise-shifted temperature.
            data_set_entries.append(
                property_type(
                    thermodynamic_state=ThermodynamicState(
                        temperature=(temperature + noise) * unit.kelvin,
                        pressure=101.325 * unit.kilopascal,
                    ),
                    phase=PropertyPhase.Liquid,
                    value=1.0 * property_type.default_unit(),
                    uncertainty=1.0 * property_type.default_unit(),
                    source=MeasurementSource(doi=" "),
                    substance=Substance.from_components("C"),
                ),
            )

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*data_set_entries)

    data_frame = data_set.to_pandas()
    return data_frame
def test_same_component_batching():
    """Check that the server batches properties by their component set:
    four properties over two distinct substances yield two batches of two."""
    thermodynamic_state = ThermodynamicState(temperature=1.0 * unit.kelvin,
                                             pressure=1.0 * unit.atmosphere)

    # Two properties for (O, C) and two for (O, CO).
    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(
        Density(
            thermodynamic_state=thermodynamic_state,
            substance=Substance.from_components("O", "C"),
            value=0.0 * unit.kilogram / unit.meter**3,
        ),
        EnthalpyOfVaporization(
            thermodynamic_state=thermodynamic_state,
            substance=Substance.from_components("O", "C"),
            value=0.0 * unit.kilojoule / unit.mole,
        ),
        Density(
            thermodynamic_state=thermodynamic_state,
            substance=Substance.from_components("O", "CO"),
            value=0.0 * unit.kilogram / unit.meter**3,
        ),
        EnthalpyOfVaporization(
            thermodynamic_state=thermodynamic_state,
            substance=Substance.from_components("O", "CO"),
            value=0.0 * unit.kilojoule / unit.mole,
        ),
    )

    options = RequestOptions()

    submission = EvaluatorClient._Submission()
    submission.dataset = data_set
    submission.options = options

    with DaskLocalCluster() as calculation_backend:
        server = EvaluatorServer(calculation_backend)
        batches = server._batch_by_same_component(submission, "")

    # One batch per distinct substance, each holding its two properties.
    assert len(batches) == 2

    assert len(batches[0].queued_properties) == 2
    assert len(batches[1].queued_properties) == 2
def test_benchmark_analysis(caplog, monkeypatch, dummy_conda_env):
    """Run ``BenchmarkAnalysisFactory.analyze`` on a benchmark where one of
    two properties failed to be estimated, and check both the warning and
    the produced results file."""
    from openff.evaluator.client import RequestResult
    from openff.evaluator.datasets import PhysicalPropertyDataSet

    benchmark = create_benchmark(
        "project-1", "study-1", "benchmark-1", ["data-set-1"], "optimization-1", None
    )

    # Create a reference data set.
    reference_data_set = create_data_set("data-set-1")
    reference_data_set.entries.append(reference_data_set.entries[0].copy())
    reference_data_set.entries[0].id = 1
    reference_data_set.entries[1].id = 2

    # Create a set of evaluator results
    # Entry 1 was estimated successfully; entry 2 was not.
    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(reference_data_set.entries[0].to_evaluator())

    unsuccessful_properties = PhysicalPropertyDataSet()
    unsuccessful_properties.add_properties(reference_data_set.entries[1].to_evaluator())

    results = RequestResult()
    results.estimated_properties = estimated_data_set
    results.unsuccessful_properties = unsuccessful_properties

    with temporary_cd(os.path.dirname(dummy_conda_env)):
        # Save the expected input files.
        with open("benchmark.json", "w") as file:
            file.write(benchmark.json())
        with open("test-set-collection.json", "w") as file:
            file.write(DataSetCollection(data_sets=[reference_data_set]).json())

        results.json("results.json")

        with caplog.at_level(logging.WARNING):
            BenchmarkAnalysisFactory.analyze(True)

        # The unsuccessful property must be reported as skipped.
        assert (
            "1 properties could not be estimated and so were not analyzed"
            in caplog.text
        )

        assert os.path.isdir("analysis")
        assert os.path.isfile(os.path.join("analysis", "benchmark-results.json"))

        results_object = BenchmarkResult.parse_file(
            os.path.join("analysis", "benchmark-results.json")
        )
        assert len(results_object.calculation_environment) > 0
        assert len(results_object.analysis_environment) > 0
def test_default_options():
    """Test creating the default estimation options."""
    force_field_source = SmirnoffForceFieldSource.from_path(
        "smirnoff99Frosst-1.1.0.offxml")

    # One dummy property per supported property type.
    data_set = PhysicalPropertyDataSet()
    for property_type in property_types:
        data_set.add_properties(create_dummy_property(property_type))

    options = EvaluatorClient.default_request_options(data_set,
                                                      force_field_source)
    options.validate()

    n_layers = len(options.calculation_layers)

    assert n_layers == 2
    assert len(options.calculation_schemas) == len(property_types)
    # Every property type must have a schema for every calculation layer.
    assert all(
        len(schemas) == n_layers
        for schemas in options.calculation_schemas.values())
def data_frame() -> pandas.DataFrame:
    """Create a frame of three liquid densities of methane which differ
    only in thermodynamic state: 298.15 K / 101.325 kPa, 305.15 K /
    101.325 kPa and 298.15 K / 105.325 kPa."""
    state_values = [
        (298.15, 101.325),
        (305.15, 101.325),
        (298.15, 105.325),
    ]

    densities = [
        Density(
            thermodynamic_state=ThermodynamicState(
                temperature=temperature * unit.kelvin,
                pressure=pressure * unit.kilopascal,
            ),
            phase=PropertyPhase.Liquid,
            value=1.0 * Density.default_unit(),
            uncertainty=1.0 * Density.default_unit(),
            source=MeasurementSource(doi=" "),
            substance=Substance.from_components("C"),
        )
        for temperature, pressure in state_values
    ]

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(*densities)

    return data_set.to_pandas()
def test_launch_batch():
    """Launch a batch of two dummy density properties through the dummy
    ``QuickCalculationLayer`` and wait for it to drain the queue."""
    # Set up a dummy data set
    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(create_dummy_property(Density),
                            create_dummy_property(Density))

    batch = Batch()
    batch.force_field_id = ""
    batch.options = RequestOptions()
    batch.options.calculation_layers = ["QuickCalculationLayer"]
    batch.options.calculation_schemas = {
        "Density": {
            "QuickCalculationLayer": CalculationLayerSchema()
        }
    }
    batch.parameter_gradient_keys = []
    batch.queued_properties = [*data_set]
    batch.validate()

    with tempfile.TemporaryDirectory() as directory:
        with temporarily_change_directory(directory):
            with DaskLocalCluster() as calculation_backend:
                server = EvaluatorServer(
                    calculation_backend=calculation_backend,
                    working_directory=directory,
                )

                server._queued_batches[batch.id] = batch
                server._launch_batch(batch)

                # Poll until the server has processed every queued property.
                while len(batch.queued_properties) > 0:
                    sleep(0.01)

                # The dummy layer is expected to estimate one property and
                # fail the other.
                assert len(batch.estimated_properties) == 1
                assert len(batch.unsuccessful_properties) == 1
class RequestResult(AttributeClass):
    """The current results of an estimation request - these results may be
    partial if the server hasn't yet completed the request.
    """

    # Properties still waiting to be (or currently being) estimated.
    queued_properties = Attribute(
        docstring="The set of properties which have yet to be, or "
        "are currently being estimated.",
        type_hint=PhysicalPropertyDataSet,
        default_value=PhysicalPropertyDataSet(),
    )

    # Properties which the server finished estimating successfully.
    estimated_properties = Attribute(
        docstring=
        "The set of properties which have been successfully estimated.",
        type_hint=PhysicalPropertyDataSet,
        default_value=PhysicalPropertyDataSet(),
    )
    # Properties which the server gave up on.
    unsuccessful_properties = Attribute(
        docstring=
        "The set of properties which could not be successfully estimated.",
        type_hint=PhysicalPropertyDataSet,
        default_value=PhysicalPropertyDataSet(),
    )

    # FIX: this docstring was copy-pasted from ``queued_properties`` and
    # wrongly described this attribute as a set of properties.
    exceptions = Attribute(
        docstring="The set of exceptions which were raised while estimating "
        "the properties in this request.",
        type_hint=list,
        default_value=[],
    )

    def validate(self, attribute_type=None):
        """Validate the attributes, additionally asserting that every entry
        in ``exceptions`` is an ``EvaluatorException``."""
        super(RequestResult, self).validate(attribute_type)
        assert all(isinstance(x, EvaluatorException) for x in self.exceptions)
def apply(cls, data_set, schema, n_processes=1):
    """Apply each component of this curation workflow to an initial data set
    in sequence.

    Parameters
    ----------
    data_set
        The data set to apply the workflow to. This may either be a data
        set object or it's pandas representation.
    schema
        The schema which defines the components to apply.
    n_processes
        The number of processes that each component is allowed to
        parallelize across.

    Returns
    -------
    The data set which has had the curation workflow applied to it.
    """
    component_classes = CurationComponent.components

    # Work on a pandas copy so the caller's object is never mutated.
    data_frame = data_set

    if isinstance(data_frame, PhysicalPropertyDataSet):
        data_frame = data_frame.to_pandas()

    data_frame = data_frame.copy()
    # Normalize missing values to NaN before any component runs.
    data_frame = data_frame.fillna(value=numpy.nan)

    for component_schema in schema.component_schemas:
        # Map e.g. "FilterByFooSchema" back to the "FilterByFoo" component.
        component_class_name = component_schema.__class__.__name__.replace(
            "Schema", "")
        component_class = component_classes[component_class_name]

        logger.info(f"Applying {component_class_name}")

        data_frame = component_class.apply(data_frame, component_schema,
                                           n_processes)

        logger.info(f"{component_class_name} applied")

        # Re-normalize in case a component introduced other null markers.
        data_frame = data_frame.fillna(value=numpy.nan)

    # Return the same representation the caller passed in.
    if isinstance(data_set, PhysicalPropertyDataSet):
        data_frame = PhysicalPropertyDataSet.from_pandas(data_frame)

    return data_frame
def test_from_pandas():
    """A test to ensure that data sets may be created from pandas objects."""
    thermodynamic_state = ThermodynamicState(temperature=298.15 * unit.kelvin,
                                             pressure=1.0 * unit.atmosphere)

    # Three properties chosen to cover different phases, substances, and
    # source styles (doi vs reference), with and without uncertainties.
    original_data_set = PhysicalPropertyDataSet()
    original_data_set.add_properties(
        Density(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("CO", "O"),
            value=1.0 * unit.kilogram / unit.meter**3,
            uncertainty=1.0 * unit.kilogram / unit.meter**3,
            source=MeasurementSource(doi="10.5281/zenodo.596537"),
        ),
        EnthalpyOfVaporization(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.from_string("Liquid + Gas"),
            substance=Substance.from_components("C"),
            value=2.0 * unit.kilojoule / unit.mole,
            source=MeasurementSource(reference="2"),
        ),
        DielectricConstant(
            thermodynamic_state=thermodynamic_state,
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("C"),
            value=3.0 * unit.dimensionless,
            source=MeasurementSource(reference="3"),
        ),
    )

    data_frame = original_data_set.to_pandas()

    # Round trip back to a data set and verify nothing was lost.
    recreated_data_set = PhysicalPropertyDataSet.from_pandas(data_frame)
    assert len(original_data_set) == len(recreated_data_set)

    for original_property in original_data_set:
        recreated_property = next(x for x in recreated_data_set
                                  if x.id == original_property.id)

        assert (original_property.thermodynamic_state ==
                recreated_property.thermodynamic_state)
        assert original_property.phase == recreated_property.phase
        assert original_property.substance == recreated_property.substance
        assert numpy.isclose(original_property.value,
                             recreated_property.value)

        if original_property.uncertainty == UNDEFINED:
            assert original_property.uncertainty == recreated_property.uncertainty
        else:
            assert numpy.isclose(original_property.uncertainty,
                                 recreated_property.uncertainty)

        assert original_property.source.doi == recreated_property.source.doi
        assert original_property.source.reference == recreated_property.source.reference
def simple_evaluator_data_set():
    """Create a simple evaluator `PhysicalPropertyDataSet` which contains
    a simple binary density measurement.

    Returns
    -------
    PhysicalPropertyDataSet
    """
    state = ThermodynamicState(298.15 * unit.kelvin,
                               pressure=1.0 * unit.atmosphere)

    binary_density = Density(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("O", "CC=O"),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=0.1 * unit.kilogram / unit.meter**3,
        source=MeasurementSource(doi="10.1000/xyz123"),
    )
    binary_density.id = "1"

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(binary_density)

    return data_set
def test_generate_evaluator_target(self, requests_mock):
    """Check that generating an evaluator target writes out both the
    training set and the options file, and that the training set matches
    the mocked source data."""
    source_data_set = create_data_set("data-set-1")
    mock_get_data_set(requests_mock, source_data_set)

    evaluator_target = create_evaluator_target("evaluator-target-1",
                                               [source_data_set.id])

    with temporary_cd():
        OptimizationInputFactory._generate_evaluator_target(
            evaluator_target, 8000, None)

        # The serialized training set must round-trip to the source data.
        assert os.path.isfile("training-set.json")
        written_data_set = PhysicalPropertyDataSet.from_json(
            "training-set.json")
        assert written_data_set.json() == source_data_set.to_evaluator().json()

        assert os.path.isfile("options.json")
def test_serialization():
    """A test to ensure that data sets are JSON serializable."""
    original_set = PhysicalPropertyDataSet()
    original_set.add_properties(create_dummy_property(Density))

    original_json = original_set.json()

    # Parse the JSON back and confirm it round-trips losslessly.
    round_tripped = PhysicalPropertyDataSet.parse_json(original_json)
    assert len(original_set) == len(round_tripped)

    assert round_tripped.json() == original_json
def apply(cls, data_set, schema, n_processes=1):
    """Apply this curation component to a data set.

    Parameters
    ----------
    data_set
        The data frame to apply the component to.
    schema
        The schema which defines how this component should be applied.
    n_processes
        The number of processes that this component is allowed to
        parallelize across.

    Returns
    -------
    The data set which has had the component applied to it.
    """
    # Always operate on the pandas representation internally.
    data_frame = data_set

    if isinstance(data_frame, PhysicalPropertyDataSet):
        data_frame = data_frame.to_pandas()

    modified_data_frame = cls._apply(data_frame, schema, n_processes)

    # Log how many rows the component added or removed.
    n_data_points = len(data_frame)
    n_filtered = len(modified_data_frame)

    if n_filtered != n_data_points:
        direction = "removed" if n_filtered < n_data_points else "added"
        logger.info(
            f"{abs(n_filtered - n_data_points)} data points were {direction} after "
            f"applying the {cls.__name__} component.")

    # Return the same representation the caller passed in.
    if isinstance(data_set, PhysicalPropertyDataSet):
        modified_data_frame = PhysicalPropertyDataSet.from_pandas(
            modified_data_frame)

    return modified_data_frame
def main(): setup_timestamp_logging() # Load in the force field force_field_path = "smirnoff99Frosst-1.1.0.offxml" force_field_source = SmirnoffForceFieldSource.from_path(force_field_path) # Create a data set containing three solvation free energies. data_set = PhysicalPropertyDataSet.from_json("hydration_data_set.json") data_set.json("hydration_data_set.json", format=True) # Set up a server object to run the calculations using. server = setup_server(backend_type=BackendType.LocalGPU, max_number_of_workers=1, port=8002) with server: # Request the estimates. property_estimator = EvaluatorClient( ConnectionOptions(server_port=8002)) options = RequestOptions() options.calculation_layers = ["SimulationLayer"] options.add_schema("SimulationLayer", "SolvationFreeEnergy", _get_fixed_lambda_schema()) request, _ = property_estimator.request_estimate( property_set=data_set, force_field_source=force_field_source, options=options, ) # Wait for the results. results, _ = request.results(True, 60) # Save the result to file. results.json("results.json", True)
def test_properties_by_type():
    """Check that ``properties_by_type`` yields exactly the properties of
    the requested type and nothing else."""
    density = create_dummy_property(Density)
    dielectric = create_dummy_property(DielectricConstant)

    data_set = PhysicalPropertyDataSet()
    data_set.add_properties(density, dielectric)

    densities = list(data_set.properties_by_type("Density"))

    assert len(densities) == 1
    assert densities[0] == density

    dielectrics = list(data_set.properties_by_type("DielectricConstant"))

    assert len(dielectrics) == 1
    assert dielectrics[0] == dielectric
def data_set(data_frame: pandas.DataFrame) -> PhysicalPropertyDataSet:
    """Convert the pandas representation back into a physical property
    data set."""
    converted = PhysicalPropertyDataSet.from_pandas(data_frame)
    return converted
def test_analysed_result_from_evaluator():
    """Tests the `AnalysedResult.from_evaluator` function."""
    expected_mean = 0.0
    expected_std = numpy.random.rand() + 1.0

    # Sample estimated values from a known normal distribution; every
    # reference value is the distribution mean, so the RMSE of the
    # estimates should approach the distribution's standard deviation.
    values = numpy.random.normal(expected_mean, expected_std, 1000)

    estimated_properties = []
    reference_entries = []

    for index, value in enumerate(values):
        # Ids are 1-based so they match the reference entry ids.
        property_id = index + 1

        estimated_density = Density(
            thermodynamic_state=ThermodynamicState(298.15 * unit.kelvin,
                                                   pressure=1.0 *
                                                   unit.atmosphere),
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("O"),
            value=value * Density.default_unit(),
            uncertainty=0.0 * Density.default_unit(),
        )
        estimated_density.id = str(property_id)
        estimated_properties.append(estimated_density)

        reference_density = DataSetEntry(
            id=property_id,
            property_type="Density",
            temperature=298.15,
            pressure=101.325,
            value=expected_mean,
            std_error=None,
            doi=" ",
            components=[Component(smiles="O", mole_fraction=1.0)],
        )
        reference_entries.append(reference_density)

    estimated_data_set = PhysicalPropertyDataSet()
    estimated_data_set.add_properties(*estimated_properties)

    reference_data_set = DataSet(
        id="ref",
        description=" ",
        authors=[Author(name=" ", email="*****@*****.**", institute=" ")],
        entries=reference_entries,
    )

    analysis_environments = [ChemicalEnvironment.Aqueous]

    analysed_results = DataSetResult.from_evaluator(
        reference_data_set=reference_data_set,
        estimated_data_set=estimated_data_set,
        analysis_environments=analysis_environments,
        statistic_types=[StatisticType.RMSE],
        bootstrap_iterations=1000,
    )

    assert len(analysed_results.result_entries) == len(estimated_properties)

    # The category-less entry holds the statistics over the full data set.
    full_statistics = next(
        iter(x for x in analysed_results.statistic_entries
             if x.category is None))

    assert full_statistics.property_type == "Density"
    assert full_statistics.n_components == 1
    assert full_statistics.statistic_type == StatisticType.RMSE
    # RMSE should be close to the sampling distribution's std deviation.
    assert numpy.isclose(full_statistics.value, expected_std, rtol=0.10)
def main():
    """Migrate the legacy ``propertyestimator`` v1 JSON data sets stored in
    ``raw_data`` into the current object model, writing JSON and CSV copies
    to ``raw_data_v2``."""
    os.makedirs("raw_data_v2", exist_ok=True)

    for data_set_name in [
            "curated_data_set",
            "gaff 1.81",
            "gaff 2.11",
            "parsley 1.0.0",
            "smirnoff99frosst 1.1.0",
    ]:
        with open(os.path.join("raw_data", f"{data_set_name}.json")) as file:
            raw_data_set = json.load(file)

        # Only the legacy propertyestimator serialization is supported.
        assert (raw_data_set["@type"] ==
                "propertyestimator.datasets.datasets.PhysicalPropertyDataSet")

        physical_properties = []

        for raw_data_set_entries in raw_data_set["properties"].values():
            for raw_data_set_entry in raw_data_set_entries:
                # Extract the substance this entry was measured for.
                substance = Substance()

                for raw_component in raw_data_set_entry["substance"][
                        "components"]:
                    component = Component(
                        smiles=raw_component["smiles"],
                        role=Component.Role[raw_component["role"]["value"]],
                    )

                    # Amounts are keyed by the component's SMILES pattern.
                    raw_amounts = raw_data_set_entry["substance"]["amounts"][
                        raw_component["smiles"]]

                    for raw_amount in raw_amounts["value"]:
                        if (raw_amount["@type"] ==
                                "propertyestimator.substances.Substance->MoleFraction"
                            ):
                            substance.add_component(
                                component, MoleFraction(raw_amount["value"]))
                        elif (raw_amount["@type"] ==
                              "propertyestimator.substances.Substance->ExactAmount"
                              ):
                            substance.add_component(
                                component, ExactAmount(raw_amount["value"]))
                        else:
                            # Unknown amount types abort the migration.
                            raise NotImplementedError()

                # Extract the source of the property
                if (raw_data_set_entry["source"]["@type"] ==
                        "propertyestimator.properties.properties.CalculationSource"
                    ):
                    source = CalculationSource(
                        fidelity=raw_data_set_entry["source"]["fidelity"])
                elif (raw_data_set_entry["source"]["@type"] ==
                      "propertyestimator.properties.properties.MeasurementSource"
                      ):
                    source = MeasurementSource(doi=correct_doi(
                        raw_data_set_entry["source"]["reference"]))
                else:
                    raise NotImplementedError()

                # Generate the new property object.
                # The concrete class is resolved from the entry's "@type".
                property_class = getattr(
                    properties, raw_data_set_entry["@type"].split(".")[-1])

                physical_property = property_class(
                    thermodynamic_state=ThermodynamicState(
                        temperature=(
                            raw_data_set_entry["thermodynamic_state"]
                            ["temperature"]["value"] *
                            unit.Unit(raw_data_set_entry["thermodynamic_state"]
                                      ["temperature"]["unit"])),
                        pressure=(
                            raw_data_set_entry["thermodynamic_state"]
                            ["pressure"]["value"] *
                            unit.Unit(raw_data_set_entry["thermodynamic_state"]
                                      ["pressure"]["unit"])),
                    ),
                    phase=PropertyPhase(raw_data_set_entry["phase"]),
                    substance=substance,
                    value=(raw_data_set_entry["value"]["value"] *
                           unit.Unit(raw_data_set_entry["value"]["unit"])),
                    # Measured entries carry no uncertainty here; only
                    # calculated ones retain the serialized uncertainty.
                    uncertainty=(
                        None if isinstance(source, MeasurementSource) else
                        (raw_data_set_entry["uncertainty"]["value"] *
                         unit.Unit(raw_data_set_entry["uncertainty"]["unit"])
                         )),
                    source=source,
                )
                physical_property.id = raw_data_set_entry["id"]

                physical_properties.append(physical_property)

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*physical_properties)

        # Persist both a formatted JSON and a CSV copy of the migrated set.
        data_set.json(os.path.join("raw_data_v2", f"{data_set_name}.json"),
                      format=True)
        data_set.to_pandas().to_csv(
            os.path.join("raw_data_v2", f"{data_set_name}.csv"))