Example No. 1
import pandas

from openff.evaluator.datasets.utilities import reorder_data_frame


def test_reorder_data_frame():
    """Tests that the ``reorder_data_frame`` function behaves as expected
    for 1 and 2 component entries."""

    data_rows = [
        {
            "N Components": 1,
            "Component 1": "C",
            "Role 1": "Solvent",
            "Mole Fraction 1": 1.0,
            "Exact Amount 1": 1,
        },
        {
            "N Components": 2,
            "Component 1": "CC",
            "Role 1": "Solvent",
            "Mole Fraction 1": 0.25,
            "Exact Amount 1": 1,
            "Component 2": "CO",
            "Role 2": "Solute",
            "Mole Fraction 2": 0.75,
            "Exact Amount 2": 2,
        },
        {
            "N Components": 2,
            "Component 1": "CO",
            "Role 1": "Solute",
            "Mole Fraction 1": 0.75,
            "Exact Amount 1": 2,
            "Component 2": "CC",
            "Role 2": "Solvent",
            "Mole Fraction 2": 0.25,
            "Exact Amount 2": 1,
        },
    ]

    data_frame = pandas.DataFrame(data_rows)

    reordered_data_frame = reorder_data_frame(data_frame)
    assert len(reordered_data_frame) == 3

    assert reordered_data_frame.loc[0, "N Components"] == 1

    for index in [1, 2]:

        assert reordered_data_frame.loc[index, "N Components"] == 2
        assert reordered_data_frame.loc[index, "Component 1"] == "CC"
        assert reordered_data_frame.loc[index, "Role 1"] == "Solvent"
        assert reordered_data_frame.loc[index, "Mole Fraction 1"] == 0.25
        assert reordered_data_frame.loc[index, "Exact Amount 1"] == 1
        assert reordered_data_frame.loc[index, "Component 2"] == "CO"
        assert reordered_data_frame.loc[index, "Role 2"] == "Solute"
        assert reordered_data_frame.loc[index, "Mole Fraction 2"] == 0.75
        assert reordered_data_frame.loc[index, "Exact Amount 2"] == 2
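
For reference, the reordering behaviour this test pins down can be approximated with a short sketch. The function below is illustrative only, not the implementation of reorder_data_frame that ships in openff.evaluator.datasets.utilities: it sorts each row's components alphabetically by their SMILES pattern and carries the matching per-component columns along with them.

import pandas


def reorder_components_sketch(data_frame: pandas.DataFrame) -> pandas.DataFrame:
    """An illustrative stand-in for ``reorder_data_frame``: sort each row's
    components alphabetically, moving the per-component ``Role``,
    ``Mole Fraction`` and ``Exact Amount`` columns with them."""

    data_frame = data_frame.copy()
    per_component_headers = ["Component", "Role", "Mole Fraction", "Exact Amount"]

    for row_index, row in data_frame.iterrows():

        n_components = int(row["N Components"])

        if n_components < 2:
            continue

        # The alphabetical ordering of the original component indices.
        ordering = sorted(
            range(1, n_components + 1), key=lambda index: row[f"Component {index}"]
        )

        for new_index, old_index in enumerate(ordering, start=1):
            for header in per_component_headers:
                data_frame.loc[row_index, f"{header} {new_index}"] = row[
                    f"{header} {old_index}"
                ]

    return data_frame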
Example No. 2
    @classmethod
    def _apply(cls, data_frame: pandas.DataFrame,
               schema: SelectDataPointsSchema,
               n_processes) -> pandas.DataFrame:

        max_n_substances = data_frame["N Components"].max()
        component_headers = [
            f"Component {i + 1}" for i in range(max_n_substances)
        ]

        # Re-order the data frame so that the components are alphabetically sorted.
        # This will make it easier to find unique substances.
        ordered_data_frame = reorder_data_frame(data_frame)

        # Find all of the unique substances in the data frame.
        unique_substances = ordered_data_frame[
            component_headers].drop_duplicates()

        selected_data = []

        # Start to choose the state points for each unique substance.
        for _, unique_substance in unique_substances.iterrows():

            substance_data_frame = ordered_data_frame

            for index, component in enumerate(
                    unique_substance[component_headers]):

                if pandas.isnull(component):

                    substance_data_frame = substance_data_frame[
                        substance_data_frame[component_headers[index]].isna()]

                else:

                    substance_data_frame = substance_data_frame[
                        substance_data_frame[
                            component_headers[index]] == component]

            for target_state in schema.target_states:

                substance_selected_data = cls._select_substance_data_points(
                    substance_data_frame, target_state)

                if len(substance_selected_data) == 0:
                    continue

                selected_data.append(substance_selected_data)

        selected_data = pandas.concat(selected_data,
                                      ignore_index=True,
                                      sort=False)
        return selected_data
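
A hypothetical invocation of this component is sketched below. The import path and the TargetState / State field names are assumptions based on the openff-evaluator curation API and should be checked against the installed version.

import pandas

from openff.evaluator.datasets.curation.components.selection import (
    SelectDataPoints,
    SelectDataPointsSchema,
    State,
    TargetState,
)

# Placeholder input; any data frame in the evaluator's tabular format works.
data_frame = pandas.read_csv("binary-density-data.csv")

schema = SelectDataPointsSchema(
    target_states=[
        TargetState(
            # Select binary density measurements closest to ambient conditions.
            property_types=[("Density", 2)],
            states=[
                State(
                    temperature=298.15,
                    pressure=101.325,
                    mole_fractions=(0.5, 0.5),
                )
            ],
        )
    ]
)

# ``_apply`` above is reached through the public ``apply`` entry point.
selected_frame = SelectDataPoints.apply(data_frame, schema)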
Example No. 3
import logging
from typing import Union

import numpy

# Import path assumed: ``DataSet`` and ``DataSetCollection`` are the data set
# models defined by the surrounding package.
from nonbonded.library.models.datasets import DataSet, DataSetCollection

logger = logging.getLogger(__name__)


def reindex_data_set(
    data_set: "PhysicalPropertyDataSet",
    reference_set: Union[DataSet, DataSetCollection],
):
    """Attempts to change the unique id of estimated data points to match the
    unique id of their corresponding reference data points, based upon the state
    at which they were measured.

    **Note**: the data set will be modified in place.

    This method should **only** be used when attempting to convert previous results
    into the new framework, and not in general.

    Parameters
    ----------
    data_set: PhysicalPropertyDataSet
        The data set to re-index.
    reference_set: DataSet or DataSetCollection
        The data set(s) whose ids should be matched.
    """

    import pandas
    from openff.evaluator.datasets.utilities import reorder_data_frame

    if len(data_set) == 0:
        return

    estimated_data_frame = reorder_data_frame(data_set.to_pandas())

    if isinstance(reference_set, DataSet):
        reference_data_frame = reference_set.to_pandas()
    elif isinstance(reference_set, DataSetCollection):

        reference_data_frames = [
            x.to_pandas() for x in reference_set.data_sets
        ]

        reference_data_frame: pandas.DataFrame = pandas.concat(
            reference_data_frames, ignore_index=True, sort=False)
    else:
        raise NotImplementedError

    reference_data_frame = reorder_data_frame(reference_data_frame)

    minimum_n_components = estimated_data_frame["N Components"].min()
    maximum_n_components = estimated_data_frame["N Components"].max()

    id_mappings = []

    property_headers = [
        x for x in estimated_data_frame if x.find(" Value ") >= 0
    ]

    for n_components in range(minimum_n_components, maximum_n_components + 1):

        for property_header in property_headers:

            estimated_component_data = estimated_data_frame[
                estimated_data_frame["N Components"] == n_components]
            reference_component_data = reference_data_frame[
                reference_data_frame["N Components"] == n_components]

            estimated_component_data = estimated_component_data[
                estimated_component_data[property_header].notna()].copy()
            reference_component_data = reference_component_data[
                reference_component_data[property_header].notna()].copy()

            if len(estimated_component_data) == 0 or len(
                    reference_component_data) == 0:
                continue

            component_data_frames = [
                estimated_component_data, reference_component_data
            ]

            comparison_columns = {"Temperature (K)", "Pressure (kPa)", "Phase"}

            for component_data in component_data_frames:

                component_data["Temperature (K)"] = component_data[
                    "Temperature (K)"].round(1)
                component_data["Pressure (kPa)"] = (
                    component_data["Pressure (kPa)"].fillna(
                        value=numpy.nan).round(1))

                for index in range(n_components):

                    component_data[f"Mole Fraction {index + 1}"] = (
                        component_data[f"Mole Fraction {index + 1}"].fillna(
                            value=0.0).round(2))
                    component_data[
                        f"Exact Amount {index + 1}"] = component_data[
                            f"Exact Amount {index + 1}"].fillna(value=0)

                    comparison_columns.update([
                        f"Component {index + 1}",
                        f"Role {index + 1}",
                        f"Mole Fraction {index + 1}",
                        f"Exact Amount {index + 1}",
                    ])

            comparison_columns = [*comparison_columns]

            joined_frames = pandas.merge(
                estimated_component_data,
                reference_component_data,
                on=comparison_columns,
                suffixes=("_orig", "_new"),
            )

            joined_frames.drop_duplicates(subset=["Id_orig"], inplace=True)

            if len(joined_frames) != len(estimated_component_data):

                logger.warning(
                    f"{abs(len(joined_frames) - len(estimated_component_data))} "
                    f"properties could not be re-indexed.")

            id_mappings.append(joined_frames[["Id_orig", "Id_new"]])

    id_mappings_frame = pandas.concat(id_mappings,
                                      ignore_index=True,
                                      sort=False)

    id_mappings = {
        x["Id_orig"]: x["Id_new"]
        for _, x in id_mappings_frame.iterrows()
    }

    for physical_property in data_set:

        if physical_property.id not in id_mappings:
            logger.warning(f"{physical_property.id} could not be re-indexed.")
            continue

        physical_property.id = f"{id_mappings[physical_property.id]}"
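
A minimal usage sketch follows; the file names are placeholders, and PhysicalPropertyDataSet.from_json / DataSet.parse_file are assumed loaders for the two data set types.

from openff.evaluator.datasets import PhysicalPropertyDataSet

# Placeholder paths to previously estimated and reference data sets.
estimated_set = PhysicalPropertyDataSet.from_json("estimated-set.json")
reference_set = DataSet.parse_file("reference-set.json")

# The estimated set is modified in place; any ids which cannot be matched
# against the reference set are logged as warnings.
reindex_data_set(estimated_set, reference_set)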
Example No. 4
    @classmethod
    def _apply(
        cls, data_frame: pandas.DataFrame, schema: FilterDuplicatesSchema, n_processes
    ) -> pandas.DataFrame:

        if len(data_frame) == 0:
            return data_frame

        data_frame = data_frame.copy()
        # Re-order the component columns so that equivalent substances line up,
        # allowing duplicates to be detected by a simple column comparison.
        data_frame = reorder_data_frame(data_frame)

        minimum_n_components = data_frame["N Components"].min()
        maximum_n_components = data_frame["N Components"].max()

        filtered_data = []

        for n_components in range(minimum_n_components, maximum_n_components + 1):

            component_data = data_frame[
                data_frame["N Components"] == n_components
            ].copy()

            # Round the state columns to the precision at which two data points
            # should be considered duplicates of one another.
            component_data["Temperature (K)"] = component_data["Temperature (K)"].round(
                schema.temperature_precision
            )
            component_data["Pressure (kPa)"] = component_data["Pressure (kPa)"].round(
                schema.pressure_precision
            )

            subset_columns = ["Temperature (K)", "Pressure (kPa)", "Phase"]

            for index in range(n_components):

                component_data[f"Mole Fraction {index + 1}"] = component_data[
                    f"Mole Fraction {index + 1}"
                ].round(schema.mole_fraction_precision)

                subset_columns.extend(
                    [
                        f"Component {index + 1}",
                        f"Role {index + 1}",
                        f"Mole Fraction {index + 1}",
                        f"Exact Amount {index + 1}",
                    ]
                )

            subset_columns = [x for x in subset_columns if x in component_data]
            value_headers = [x for x in component_data if x.find(" Value ") >= 0]

            sorted_filtered_data = []

            for value_header in value_headers:

                uncertainty_header = value_header.replace("Value", "Uncertainty")

                property_data = component_data[component_data[value_header].notna()]

                # Sort by uncertainty (ascending, with NaNs placed last) so that
                # the entry retained by ``keep="last"`` is well defined.
                if uncertainty_header in component_data:
                    property_data = property_data.sort_values(uncertainty_header)

                property_data = property_data.drop_duplicates(
                    subset=subset_columns, keep="last"
                )

                sorted_filtered_data.append(property_data)

            sorted_filtered_data = pandas.concat(
                sorted_filtered_data, ignore_index=True, sort=False
            )

            filtered_data.append(sorted_filtered_data)

        filtered_data = pandas.concat(filtered_data, ignore_index=True, sort=False)
        return filtered_data
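
A hypothetical invocation, assuming the FilterDuplicates component lives alongside the other filtering components in openff.evaluator.datasets.curation.components.filtering; the precision fields are the ones consumed by the schema above.

import pandas

from openff.evaluator.datasets.curation.components.filtering import (
    FilterDuplicates,
    FilterDuplicatesSchema,
)

# Placeholder input frame.
data_frame = pandas.read_csv("raw-data.csv")

schema = FilterDuplicatesSchema(
    # Compare temperatures / pressures to 2 d.p. and mole fractions to 3 d.p.
    # when deciding whether two data points are duplicates.
    temperature_precision=2,
    pressure_precision=2,
    mole_fraction_precision=3,
)

unique_frame = FilterDuplicates.apply(data_frame, schema)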
Example No. 5
    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: FilterByPropertyTypesSchema,
        n_processes,
    ) -> pandas.DataFrame:

        property_headers = [
            header for header in data_frame if header.find(" Value ") >= 0
        ]

        # Removes the columns for properties which are not of interest.
        for header in property_headers:

            property_type = header.split(" ")[0]

            if property_type in schema.property_types:
                continue

            data_frame = data_frame.drop(header, axis=1)

            uncertainty_header = header.replace(" Value ", " Uncertainty ")

            if uncertainty_header in data_frame:
                data_frame = data_frame.drop(uncertainty_header, axis=1)

        # Drop any rows which do not contain any values for the property types of
        # interest.
        property_headers = [
            header
            for header in property_headers
            if header.split(" ")[0] in schema.property_types
        ]

        data_frame = data_frame.dropna(subset=property_headers, how="all")

        # Apply a more specific filter which only retains data points that
        # contain values for the specific property types, and which were
        # measured for the specified number of components.
        for property_type, n_components in schema.n_components.items():

            property_header = next(
                iter(x for x in property_headers if x.find(f"{property_type} ") == 0),
                None,
            )

            if property_header is None:
                continue

            data_frame = data_frame[
                data_frame[property_header].isna()
                | data_frame["N Components"].isin(n_components)
            ]

        # Apply the strict filter if requested
        if schema.strict:

            reordered_data_frame = reorder_data_frame(data_frame)

            # Build a dictionary of which properties should be present, partitioned
            # by the number of components they should have been measured for.
            property_types = defaultdict(list)

            if len(schema.n_components) > 0:

                for property_type, n_components in schema.n_components.items():

                    for n_component in n_components:
                        property_types[n_component].append(property_type)

                min_n_components = min(property_types)
                max_n_components = max(property_types)

            else:

                min_n_components = reordered_data_frame["N Components"].min()
                max_n_components = reordered_data_frame["N Components"].max()

                for n_components in range(min_n_components, max_n_components + 1):
                    property_types[n_components].extend(schema.property_types)

            substances_with_data = set()
            components_with_data = {}

            # For each N component find substances which have data points for
            # all of the specified property types.
            for n_components in range(min_n_components, max_n_components + 1):

                component_data = reordered_data_frame[
                    reordered_data_frame["N Components"] == n_components
                ]

                if n_components not in property_types or len(component_data) == 0:
                    continue

                n_component_headers = [
                    header
                    for header in property_headers
                    if header.split(" ")[0] in property_types[n_components]
                    and header in component_data
                ]

                if len(n_component_headers) != len(property_types[n_components]):
                    continue

                n_component_substances = set.intersection(
                    *[
                        data_frame_to_substances(
                            component_data[component_data[header].notna()]
                        )
                        for header in n_component_headers
                    ]
                )
                substances_with_data.update(n_component_substances)
                components_with_data[n_components] = {
                    component
                    for substance in n_component_substances
                    for component in substance
                }

            if len(schema.n_components) > 0:
                components_with_all_data = set.intersection(
                    *components_with_data.values()
                )

                # Filter out any smiles which don't appear in all of the N component
                # substances.
                data_frame = FilterBySmiles.apply(
                    data_frame,
                    FilterBySmilesSchema(smiles_to_include=[*components_with_all_data]),
                )

            # Filter out any substances which (within each N component) don't have
            # all of the specified data types.
            data_frame = FilterBySubstances.apply(
                data_frame,
                FilterBySubstancesSchema(substances_to_include=[*substances_with_data]),
            )

        data_frame = data_frame.dropna(axis=1, how="all")
        return data_frame
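
A hypothetical invocation of the filter is shown below; the import path is assumed to be the openff-evaluator filtering module, and the schema fields (property_types, n_components, strict) are the ones consumed above.

import pandas

from openff.evaluator.datasets.curation.components.filtering import (
    FilterByPropertyTypes,
    FilterByPropertyTypesSchema,
)

# Placeholder input frame.
data_frame = pandas.read_csv("curated-data.csv")

schema = FilterByPropertyTypesSchema(
    # Keep only density and enthalpy of mixing data, require both to have been
    # measured for binary mixtures, and apply the strict substance filter.
    property_types=["Density", "EnthalpyOfMixing"],
    n_components={"Density": [2], "EnthalpyOfMixing": [2]},
    strict=True,
)

filtered_frame = FilterByPropertyTypes.apply(data_frame, schema)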