Пример #1
0
def test_from_pandas():
    """Check that a data set survives a round trip through its pandas form.

    A data set holding three properties is converted to a data frame with
    ``to_pandas``, rebuilt with ``PhysicalPropertyDataSet.from_pandas``, and
    every rebuilt property is compared with the original carrying the same id.
    """

    state = ThermodynamicState(
        temperature=298.15 * unit.kelvin, pressure=1.0 * unit.atmosphere
    )

    original_data_set = PhysicalPropertyDataSet()

    # One property with an uncertainty and a DOI source, and two without an
    # uncertainty whose sources are plain references.
    properties = [
        Density(
            thermodynamic_state=state,
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("CO", "O"),
            value=1.0 * unit.kilogram / unit.meter**3,
            uncertainty=1.0 * unit.kilogram / unit.meter**3,
            source=MeasurementSource(doi="10.5281/zenodo.596537"),
        ),
        EnthalpyOfVaporization(
            thermodynamic_state=state,
            phase=PropertyPhase.from_string("Liquid + Gas"),
            substance=Substance.from_components("C"),
            value=2.0 * unit.kilojoule / unit.mole,
            source=MeasurementSource(reference="2"),
        ),
        DielectricConstant(
            thermodynamic_state=state,
            phase=PropertyPhase.Liquid,
            substance=Substance.from_components("C"),
            value=3.0 * unit.dimensionless,
            source=MeasurementSource(reference="3"),
        ),
    ]
    original_data_set.add_properties(*properties)

    round_tripped = PhysicalPropertyDataSet.from_pandas(
        original_data_set.to_pandas()
    )
    assert len(original_data_set) == len(round_tripped)

    for expected in original_data_set:

        # Pair the properties up by id rather than by position.
        actual = next(
            candidate for candidate in round_tripped
            if candidate.id == expected.id
        )

        for attribute in ("thermodynamic_state", "phase", "substance"):
            assert getattr(expected, attribute) == getattr(actual, attribute)

        assert numpy.isclose(expected.value, actual.value)

        if expected.uncertainty == UNDEFINED:
            # An undefined uncertainty must come back undefined.
            assert expected.uncertainty == actual.uncertainty
        else:
            assert numpy.isclose(expected.uncertainty, actual.uncertainty)

        for attribute in ("doi", "reference"):
            assert (getattr(expected.source, attribute) ==
                    getattr(actual.source, attribute))
Пример #2
0
    def apply(cls, data_set, schema, n_processes=1):
        """Run every component named by a workflow schema over a data set, in
        order.

        Parameters
        ----------
        data_set
            The data to curate, given either as a ``PhysicalPropertyDataSet``
            object or as its pandas representation.
        schema
            The workflow schema whose ``component_schemas`` are applied one
            after the other.
        n_processes
            The number of processes passed on to each component.

        Returns
        -------
            The curated data, in the same form (data set object or data frame)
            as the input.
        """

        registry = CurationComponent.components

        # Remember the input form so the result can be returned the same way.
        given_data_set_object = isinstance(data_set, PhysicalPropertyDataSet)

        frame = data_set.to_pandas() if given_data_set_object else data_set
        # Copy the frame before handing it to the components, and fill missing
        # entries with NaN.
        frame = frame.copy().fillna(value=numpy.nan)

        for component_schema in schema.component_schemas:

            # A component is looked up by its schema's class name with
            # "Schema" removed.
            schema_class_name = component_schema.__class__.__name__
            component_class_name = schema_class_name.replace("Schema", "")
            component = registry[component_class_name]

            logger.info(f"Applying {component_class_name}")
            frame = component.apply(frame, component_schema, n_processes)
            logger.info(f"{component_class_name} applied")

            frame = frame.fillna(value=numpy.nan)

        if not given_data_set_object:
            return frame

        return PhysicalPropertyDataSet.from_pandas(frame)
Пример #3
0
    def apply(cls, data_set, schema, n_processes=1):
        """Apply this single curation component to a data set.

        Parameters
        ----------
        data_set
            The data to apply the component to, given either as a
            ``PhysicalPropertyDataSet`` object or as a data frame.
        schema
            The schema which defines how this component should be applied.
        n_processes
            The number of processes passed on to ``cls._apply``.

        Returns
        -------
            The data after the component has been applied, in the same form
            (data set object or data frame) as the input.
        """

        given_data_set_object = isinstance(data_set, PhysicalPropertyDataSet)
        input_frame = data_set.to_pandas() if given_data_set_object else data_set

        output_frame = cls._apply(input_frame, schema, n_processes)

        # Log how many rows the component added or removed, if any.
        change = len(output_frame) - len(input_frame)

        if change != 0:

            direction = "added" if change > 0 else "removed"

            logger.info(
                f"{abs(change)} data points were {direction} after "
                f"applying the {cls.__name__} component.")

        if not given_data_set_object:
            return output_frame

        return PhysicalPropertyDataSet.from_pandas(output_frame)
Пример #4
0
def data_set(data_frame: pandas.DataFrame) -> PhysicalPropertyDataSet:
    """Build a ``PhysicalPropertyDataSet`` from its pandas representation."""
    converted = PhysicalPropertyDataSet.from_pandas(data_frame)
    return converted