def test_from_pandas():
    """Check that a data set survives a round trip through its pandas form.

    A data set holding three different property types is converted with
    ``to_pandas`` and rebuilt with ``from_pandas``; every property of the
    original set is then compared against the rebuilt property sharing its id.
    """
    state = ThermodynamicState(
        temperature=298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )

    expected_set = PhysicalPropertyDataSet()

    # The only property here that carries an uncertainty and a DOI source.
    density = Density(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("CO", "O"),
        value=1.0 * unit.kilogram / unit.meter**3,
        uncertainty=1.0 * unit.kilogram / unit.meter**3,
        source=MeasurementSource(doi="10.5281/zenodo.596537"),
    )
    # No uncertainty is given, and the phase is built from a string.
    enthalpy = EnthalpyOfVaporization(
        thermodynamic_state=state,
        phase=PropertyPhase.from_string("Liquid + Gas"),
        substance=Substance.from_components("C"),
        value=2.0 * unit.kilojoule / unit.mole,
        source=MeasurementSource(reference="2"),
    )
    # Dimensionless value, again without an uncertainty.
    dielectric = DielectricConstant(
        thermodynamic_state=state,
        phase=PropertyPhase.Liquid,
        substance=Substance.from_components("C"),
        value=3.0 * unit.dimensionless,
        source=MeasurementSource(reference="3"),
    )
    expected_set.add_properties(density, enthalpy, dielectric)

    # Round trip: data set -> pandas -> data set.
    rebuilt_set = PhysicalPropertyDataSet.from_pandas(expected_set.to_pandas())

    assert len(expected_set) == len(rebuilt_set)

    for expected in expected_set:
        # Pair each original property with the rebuilt one sharing its id.
        actual = next(
            candidate for candidate in rebuilt_set if candidate.id == expected.id
        )

        assert expected.thermodynamic_state == actual.thermodynamic_state
        assert expected.phase == actual.phase
        assert expected.substance == actual.substance

        assert numpy.isclose(expected.value, actual.value)

        # An undefined uncertainty is compared directly rather than numerically.
        if expected.uncertainty == UNDEFINED:
            assert expected.uncertainty == actual.uncertainty
        else:
            assert numpy.isclose(expected.uncertainty, actual.uncertainty)

        assert expected.source.doi == actual.source.doi
        assert expected.source.reference == actual.source.reference
def apply(cls, data_set, schema, n_processes=1):
    """Run every component of a curation workflow over a data set, in order.

    Parameters
    ----------
    data_set
        The data set to curate, given either as a ``PhysicalPropertyDataSet``
        or as its pandas representation.
    schema
        The workflow schema; its ``component_schemas`` are applied one after
        another in the order they are listed.
    n_processes
        The number of processes handed to each component's ``apply`` call.

    Returns
    -------
        The curated data, returned as a ``PhysicalPropertyDataSet`` when one
        was passed in and as a data frame otherwise.
    """
    registry = CurationComponent.components

    # Remember the input flavour so the same flavour can be handed back.
    returns_data_set = isinstance(data_set, PhysicalPropertyDataSet)

    frame = data_set.to_pandas() if returns_data_set else data_set
    # Work on a copy with missing values filled in as NaN.
    frame = frame.copy().fillna(value=numpy.nan)

    for component_schema in schema.component_schemas:
        # The component is looked up under its schema's class name with
        # "Schema" removed.
        component_class_name = component_schema.__class__.__name__.replace(
            "Schema", ""
        )
        component = registry[component_class_name]

        logger.info(f"Applying {component_class_name}")
        frame = component.apply(frame, component_schema, n_processes)
        logger.info(f"{component_class_name} applied")

        # Re-fill missing values as NaN after each component.
        frame = frame.fillna(value=numpy.nan)

    if returns_data_set:
        return PhysicalPropertyDataSet.from_pandas(frame)

    return frame
def apply(cls, data_set, schema, n_processes=1):
    """Run this curation component over a data set.

    Parameters
    ----------
    data_set
        The data to process, given either as a ``PhysicalPropertyDataSet``
        or as a data frame.
    schema
        The schema forwarded to ``cls._apply`` to control how the component
        is applied.
    n_processes
        The number of processes forwarded to ``cls._apply``.

    Returns
    -------
        The processed data, returned as a ``PhysicalPropertyDataSet`` when one
        was passed in and as a data frame otherwise.
    """
    # Remember the input flavour so the same flavour can be handed back.
    returns_data_set = isinstance(data_set, PhysicalPropertyDataSet)
    input_frame = data_set.to_pandas() if returns_data_set else data_set

    output_frame = cls._apply(input_frame, schema, n_processes)

    # Both lengths are measured after ``_apply`` has run.
    n_data_points, n_filtered = len(input_frame), len(output_frame)

    if n_filtered != n_data_points:
        if n_filtered < n_data_points:
            direction = "removed"
        else:
            direction = "added"

        logger.info(
            f"{abs(n_filtered - n_data_points)} data points were {direction} after "
            f"applying the {cls.__name__} component."
        )

    if returns_data_set:
        return PhysicalPropertyDataSet.from_pandas(output_frame)

    return output_frame
def data_set(data_frame: pandas.DataFrame) -> PhysicalPropertyDataSet:
    """Build a ``PhysicalPropertyDataSet`` from the given pandas data frame."""
    converted = PhysicalPropertyDataSet.from_pandas(data_frame)
    return converted