def filter_data(data_directory, properties_of_interest, chemical_environments,
                output_directory):
    """Filter out any measurements which were made for components which
    do not contain the chemical environments of interest.

    Each requested data set is loaded, filtered by element content and
    by chemical environment, saved to ``output_directory`` and finally
    rendered to a PDF stored in the same directory.

    Parameters
    ----------
    data_directory: str
        The directory containing the unfiltered data.
    properties_of_interest: list of tuple of PropertyType and SubstanceType
        The types of properties to extract data for.
    chemical_environments: list of list of str
        A list of those chemical environments to filter by. Each inner
        list is forwarded as its own positional argument to
        ``filter_by_checkmol`` and corresponds to the chemical
        environments which should be matched by one of the components
        in the system.
    output_directory: str
        The directory to store the extracted data in.
    """
    # The only elements a substance may be composed of.
    allowed_elements = ("C", "H", "O", "N", "F", "Cl", "Br", "S")

    for property_type, substance_type in properties_of_interest:

        unfiltered_set = processing.load_processed_data_set(
            data_directory, property_type, substance_type
        )

        # Element filter first, then the requested chemical environments.
        element_filtered_set = filter_by_elements(unfiltered_set,
                                                  *allowed_elements)
        data_set = filter_by_checkmol(element_filtered_set,
                                      *chemical_environments)

        # Persist the filtered data set.
        processing.save_processed_data_set(
            output_directory, data_set, property_type, substance_type
        )

        # Save out a pdf of all smiles patterns (/ tuples of smiles patterns),
        # named after the snake case form of the property type.
        property_type = property_to_snake_case(property_type)
        file_name = f"{property_type}_{str(substance_type.value)}.pdf"

        data_frame_to_pdf(data_set, os.path.join(output_directory, file_name))
# Example #2
0
def main():
    """Merge the individual test set CSV files into one combined set.

    The combined data is written back into ``test_sets`` as a CSV file,
    a JSON data set and a PDF.
    """
    logging.basicConfig(level=logging.INFO)

    # The per-property test sets, read in the order they are listed.
    source_files = (
        "density_binary.csv",
        "enthalpy_of_mixing_binary.csv",
        "excess_molar_volume_binary.csv",
        "pure_set.csv",
    )
    data_frames = [
        pandas.read_csv(os.path.join("test_sets", source_file))
        for source_file in source_files
    ]

    full_data_frame = pandas.concat(data_frames, ignore_index=True, sort=False)
    full_data_set = data_set_from_data_frame(full_data_frame)

    # Export the merged set in each of the three formats.
    csv_path = os.path.join("test_sets", "full_set.csv")
    full_data_frame.to_csv(csv_path, index=False)

    json_path = os.path.join("test_sets", "full_set.json")
    full_data_set.json(json_path)

    pdf_path = os.path.join("test_sets", "full_set.pdf")
    data_frame_to_pdf(full_data_frame, pdf_path)
def filter_data(data_directory, property_type, substance_type,
                output_directory):
    """Filter a single processed data set and store the result.

    The data set is filtered by temperature, by a list of SMIRKS
    patterns and by stereochemistry (in that order). The filtered set
    is saved to ``output_directory`` and rendered to a PDF in the same
    directory.

    Parameters
    ----------
    data_directory: str
        The directory to load the processed data set from.
    property_type
        The type of property to load; forwarded to
        ``load_processed_data_set`` and ``save_processed_data_set``.
    substance_type
        The type of substance to load; its ``value`` is used in the name
        of the generated PDF.
    output_directory: str
        The directory to store the filtered data set and PDF in.
    """
    unfiltered_frame = load_processed_data_set(
        data_directory, property_type, substance_type
    )

    # Filter to be close to ambient.
    minimum_temperature = 290.0 * unit.kelvin
    maximum_temperature = 305 * unit.kelvin

    ambient_frame = filter_by_temperature(
        unfiltered_frame, minimum_temperature, maximum_temperature
    )

    # Filter out aromatics, long chain molecules (>= hept), alkenes,
    # ethers, 3 + 4 membered rings
    smirks_to_exclude = [
        "[#6a]",
        "[#6r3]",
        "[#6r4]",
        "[#6]=[#6]",
        "[#6]~[#6]~[#6]~[#6]~[#6]~[#6]~[#6]",
        "[#6H2]-[#8X2]-[#6H2]",
    ]
    smirks_filtered_frame = filter_by_smirks(
        ambient_frame, None, smirks_to_exclude
    )

    # Filter out any molecules with undefined stereochemistry
    data_frame = filter_undefined_stereochemistry(smirks_filtered_frame)

    # Save the filtered set.
    save_processed_data_set(output_directory, data_frame, property_type,
                            substance_type)

    # The PDF is named after the snake case form of the property type.
    property_type = property_to_snake_case(property_type)
    file_name = f"{property_type}_{str(substance_type.value)}.pdf"
    pdf_path = os.path.join(output_directory, file_name)

    data_frame_to_pdf(data_frame, pdf_path)
def main():
    """Select a pure substance test set and export it as JSON, CSV and PDF.

    For every property of interest the data of each environment is
    loaded, concatenated, filtered, and stripped of the training set
    components before being staged in a temporary directory. Data points
    are then selected for those components which have enthalpy of
    vaporization measurements, and the selection is written to
    ``test_sets``.
    """
    logging.basicConfig(level=logging.INFO)

    root_output_directory = "test_sets"
    os.makedirs(root_output_directory, exist_ok=True)

    # Define the types of property which are of interest.
    properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Pure),
    ]

    # Define the state we would ideally choose data points at. Every
    # property shares the same list of target states.
    ideal_states = [
        StatePoint(298.15 * unit.kelvin, 1.0 * unit.atmosphere, (1.0, )),
    ]
    target_states = dict.fromkeys(properties_of_interest, ideal_states)

    # Define the environments of interest.
    environments_of_interest = [
        "alcohol", "ester", "alkane", "ether", "ketone"
    ]

    # Load in the training substances so we can avoid selecting
    # them for the test set.
    training_smiles = load_training_components()

    environments_root = os.path.join(
        "..",
        "..",
        "..",
        "data_availability",
        "data_by_environments",
    )

    with TemporaryDirectory() as data_directory:

        # Apply the filters to the available data.
        for property_of_interest in properties_of_interest:

            data_frames = [
                load_processed_data_set(
                    os.path.join(
                        environments_root,
                        f"{environment}_{environment}",
                        "all_data",
                    ),
                    *property_of_interest,
                )
                for environment in environments_of_interest
            ]

            data_frame = pandas.concat(data_frames,
                                       ignore_index=True,
                                       sort=False)

            # NOTE(review): filter_data is called with only the data frame
            # and its return value is used - confirm the definition in
            # scope accepts this call.
            data_frame = filter_data(data_frame)

            # The training components must be *excluded* here. Passing
            # them positionally placed them in the slot which the other
            # filter helpers in this file use for the include list, so
            # the keywords are spelled out to make the intent explicit.
            data_frame = filter_by_smiles(data_frame,
                                          smiles_to_include=None,
                                          smiles_to_exclude=training_smiles)

            save_processed_data_set(data_directory, data_frame,
                                    *property_of_interest)

        # Determine which components have enthalpy of vaporization
        # measurements. These will be the compounds which will be
        # included in the pure test set.
        h_vap_data_frame = load_processed_data_set(data_directory,
                                                   EnthalpyOfVaporization,
                                                   SubstanceType.Pure)

        # Each chosen substance is expressed as a one component tuple.
        test_set_components = [
            (smiles, ) for smiles in set(h_vap_data_frame["Component 1"])
        ]

        # Select the data points while the staged data still exists.
        selected_data_set = select_data_points(
            data_directory=data_directory,
            chosen_substances=test_set_components,
            target_state_points=target_states,
        )

    selected_data_set.json(os.path.join(root_output_directory,
                                        "pure_set.json"))

    selected_data_frame = selected_data_set.to_pandas()
    selected_data_frame.to_csv(os.path.join(root_output_directory,
                                            "pure_set.csv"),
                               index=False)

    data_frame_to_pdf(selected_data_frame,
                      os.path.join(root_output_directory, "pure_set.pdf"))
# Example #5
0
def main():
    """Assemble the pure training set and export it with a report.

    Enthalpy of vaporization and density data are loaded from CSV and
    filtered using the training set smiles. For each substance a single
    density measurement, made at the state closest to the target state,
    is retained. The result is merged with the enthalpy of vaporization
    data and written out as JSON, CSV and PDF before a report is
    generated from the JSON file.
    """
    training_set_smiles = [
        # Ethers
        "C1COCCO1",
        "C1CCOCC1",
        "COC(C)(C)C",
        "CC(C)OC(C)C",
        "CCCCOCCCC",
        # Ketones
        "O=C1CCCC1",
        "CCCC(C)=O",
        "O=C1CCCCC1",
        "O=C1CCCCCC1",
        # Alcohols
        "CO",
        "CCO",
        "CCCO",
        "CCCCO",
        "CC(C)(C)O",
        "CC(C)O",
        "CC(C)CO",
        # Esters
        "CC(=O)O",
        "COC=O",
        "CCOC(C)=O",
        "CCOC(=O)CC(=O)OCC",
        "CCCCOC(C)=O",
        "CCCOC(C)=O",
        # Alkanes
        "C1CCCCC1",
        "CCCCCC",
        "CC1CCCCC1",
        "CCCCCCC",
        "CC(C)CC(C)(C)C",
        "CCCCCCCCCC",
    ]

    # Ensure the smiles patterns are standardized.
    smiles = [Component(x).smiles for x in training_set_smiles]

    def load_training_data(csv_path):
        """Read a CSV file, filter it by the training smiles and convert
        the remaining rows into a data set."""
        data_frame = filter_by_smiles(pandas.read_csv(csv_path),
                                      smiles_to_include=smiles,
                                      smiles_to_exclude=None)
        return data_set_from_data_frame(data_frame)

    # Load in the Hvap data
    h_vap_path = os.path.join(
        "..",
        "..",
        "..",
        "data_availability",
        "sourced_h_vap_data",
        "enthalpy_of_vaporization_pure.csv",
    )
    h_vap_data_set = load_training_data(h_vap_path)

    # Load in the density data
    density_path = os.path.join(
        "..",
        "..",
        "..",
        "..",
        "shared",
        "filtered_data",
        "density_pure.csv",
    )
    density_data_set = load_training_data(density_path)

    # Retain the density measurements which were made closest to 298.15K and 1 atm.
    target_state_point = StatePoint(
        temperature=298.15 * unit.kelvin,
        pressure=1.0 * unit.atmosphere,
        mole_fractions=(1.0, ),
    )
    distance_to_target = functools.partial(StatePoint.individual_distances,
                                           target_state_point)

    final_data_set = PhysicalPropertyDataSet()

    for substance in density_data_set.substances:

        # Group the measurements of this substance by their state point.
        properties_by_state = defaultdict(list)

        for measurement in density_data_set.properties_by_substance(substance):
            measured_state = StatePoint.from_physical_property(measurement)
            properties_by_state[measured_state].append(measurement)

        # Order the state points by their distance to the target state and
        # keep the first measurement recorded at the closest one.
        ordered_states = sorted(properties_by_state, key=distance_to_target)
        closest_state = ordered_states[0]

        final_data_set.add_properties(properties_by_state[closest_state][0])

    final_data_set.merge(h_vap_data_set)

    final_data_set.json("training_set.json", format=True)

    final_data_frame = final_data_set.to_pandas()
    final_data_frame.to_csv("training_set.csv", index=False)

    data_frame_to_pdf(final_data_frame, "training_set.pdf")

    generate_report("training_set.json")
def main():
    """Select the binary enthalpy of mixing / density training set.

    The chosen binary substances are rendered to a PDF and used to
    filter the common data. Data points are then selected at three
    target compositions and exported as JSON, CSV and PDF, after which a
    report is generated from the JSON file.
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO)

    smiles_map = {
        # Ethers
        "1,4-dioxane": "C1COCCO1",
        "oxane": "C1CCOCC1",
        "methyl tert butyl ether": "COC(C)(C)C",
        "diisopropyl ether": "CC(C)OC(C)C",
        "dibuytl ether": "CCCCOCCCC",
        # Ketones
        "cyclopentanone": "O=C1CCCC1",
        "2-pentanone": "CCCC(C)=O",
        "cyclohexanone": "O=C1CCCCC1",
        "cycloheptanone": "O=C1CCCCCC1",
        # Alcohols
        "methanol": "CO",
        "ethanol": "CCO",
        "propanol": "CCCO",
        "butanol": "CCCCO",
        "propan-2-ol": "CC(C)O",
        "2-Methylpropan-1-ol": "CC(C)CO",
        "2-Methylpropan-2-ol": "CC(C)(C)O",
        # Esters / acids
        "acetic acid": "CC(=O)O",
        "methyl formate": "COC=O",
        "ethyl acetate": "CCOC(C)=O",
        "propyl acetate": "CCCOC(C)=O",
        "butyl acetate": "CCCCOC(C)=O",
        "diethyl propanedioate": "CCOC(=O)CC(=O)OCC",
        # Alkanes
        "cyclohexane": "C1CCCCC1",
        "hexane": "CCCCCC",
        "methylcyclohexane": "CC1CCCCC1",
        "heptane": "CCCCCCC",
        "iso-octane": "CC(C)CC(C)(C)C",
        "decane": "CCCCCCCCCC",
    }

    # The binary mixtures of interest, given by the names of their components.
    substance_names = [
        # Ether - Alkane
        ("dibuytl ether", "iso-octane"),
        ("oxane", "heptane"),
        ("methyl tert butyl ether", "decane"),
        ("diisopropyl ether", "iso-octane"),
        ("diisopropyl ether", "heptane"),
        ("oxane", "hexane"),
        ("oxane", "cyclohexane"),
        # Alcohol - Alkane
        ("propanol", "cyclohexane"),
        ("propanol", "iso-octane"),
        ("propanol", "methylcyclohexane"),
        ("butanol", "iso-octane"),
        ("butanol", "hexane"),
        ("butanol", "methylcyclohexane"),
        ("butanol", "heptane"),
        ("ethanol", "iso-octane"),
        ("ethanol", "heptane"),
        # Ether - Ketone
        ("oxane", "cyclopentanone"),
        ("oxane", "cyclohexanone"),
        ("oxane", "2-pentanone"),
        ("1,4-dioxane", "cyclopentanone"),
        ("1,4-dioxane", "cyclohexanone"),
        ("1,4-dioxane", "2-pentanone"),
        ("1,4-dioxane", "cycloheptanone"),
        # Alcohol - Ester ( / acid)
        ("methanol", "methyl formate"),
        ("methanol", "diethyl propanedioate"),
        ("ethanol", "acetic acid"),
        ("ethanol", "ethyl acetate"),
        ("ethanol", "diethyl propanedioate"),
        ("butanol", "diethyl propanedioate"),
        ("propan-2-ol", "diethyl propanedioate"),
        ("2-Methylpropan-1-ol", "diethyl propanedioate"),
        ("2-Methylpropan-2-ol", "methyl formate"),
        ("2-Methylpropan-2-ol", "butyl acetate"),
    ]

    # Resolve each name to its smiles pattern and sort the components of
    # every pair so that each substance has a consistent ordering.
    substances = [
        tuple(sorted((smiles_map[first_name], smiles_map[second_name])))
        for first_name, second_name in substance_names
    ]

    smiles_to_pdf(substances, "all_substances.pdf")

    target_states = [
        StatePoint(298.15 * unit.kelvin, 1.0 * unit.atmosphere, (0.25, 0.75)),
        StatePoint(298.15 * unit.kelvin, 1.0 * unit.atmosphere, (0.50, 0.50)),
        StatePoint(298.15 * unit.kelvin, 1.0 * unit.atmosphere, (0.75, 0.25)),
    ]

    filtered_directory = "filtered_common_data"
    os.makedirs(filtered_directory, exist_ok=True)

    filter_common_data(filtered_directory, substances)

    output_directory = "training_sets"
    os.makedirs(output_directory, exist_ok=True)

    # The substances were already applied while filtering the common data,
    # so no further substance selection is requested here.
    selected_data_set = select_data_points(
        data_directory=os.path.join(filtered_directory, "h_mix_and_rho_x"),
        chosen_substances=None,
        target_state_points={
            (EnthalpyOfMixing, SubstanceType.Binary): target_states,
            (Density, SubstanceType.Binary): target_states,
        },
    )

    json_path = os.path.join(output_directory, "h_mix_rho_x_training_set.json")
    csv_path = os.path.join(output_directory, "h_mix_rho_x_training_set.csv")
    pdf_path = os.path.join(output_directory, "h_mix_rho_x_training_set.pdf")

    selected_data_set.json(json_path)

    selected_data_frame = selected_data_set.to_pandas()
    selected_data_frame.to_csv(csv_path, index=False)
    data_frame_to_pdf(selected_data_frame, pdf_path)

    generate_report(json_path)
def filter_common_data(output_directory, substances):
    """Filter the common enthalpy of mixing and binary density data.

    For each of the two properties, the common data of every environment
    mix is loaded, filtered with ``filter_by_substance_composition``
    using ``substances``, and restricted to rows whose
    ``"Mole Fraction 1"`` lies strictly between 0.10 and 0.90. The
    per-environment results are concatenated, saved to the
    ``h_mix_and_rho_x`` sub-directory of ``output_directory`` and
    rendered to a PDF in that same sub-directory.

    Parameters
    ----------
    output_directory: str
        The directory in which the ``h_mix_and_rho_x`` sub-directory is
        created.
    substances
        The substances forwarded to ``filter_by_substance_composition``.
    """
    target_directory = os.path.join(output_directory, "h_mix_and_rho_x")
    os.makedirs(target_directory, exist_ok=True)

    property_types = [
        (EnthalpyOfMixing, SubstanceType.Binary),
        (Density, SubstanceType.Binary),
    ]
    environment_mixes = [
        "alcohol_ester",
        "alcohol_alkane",
        "ether_alkane",
        "ether_ketone",
    ]

    for property_type, substance_type in property_types:

        filtered_frames = []

        for environment_mix in environment_mixes:

            source_directory = os.path.join(
                "..",
                "..",
                "..",
                "data_availability",
                "data_by_environments",
                environment_mix,
                "common_data",
                "h_mix_rho_x",
            )

            environment_frame = load_processed_data_set(
                source_directory, property_type, substance_type
            )
            environment_frame = filter_by_substance_composition(
                environment_frame, substances, None
            )

            # Discard compositions close to either pure component.
            mole_fraction = environment_frame["Mole Fraction 1"]
            in_range = (mole_fraction > 0.10) & (mole_fraction < 0.90)

            filtered_frames.append(environment_frame[in_range])

        full_data_frame = pandas.concat(filtered_frames)

        save_processed_data_set(target_directory, full_data_frame,
                                property_type, substance_type)

        pdf_name = property_to_file_name(property_type, substance_type) + ".pdf"
        data_frame_to_pdf(full_data_frame,
                          os.path.join(target_directory, pdf_name))
def main():
    """Partition the filtered data by its overlap with the training set.

    For each environment type and property, the rows are split according
    to whether their components appear in the training set. Pure data
    only yields a ``not_in_training`` partition; binary data additionally
    yields ``both_in_training`` and ``one_in_training`` partitions. Every
    partition is written as a CSV file and a PDF.

    Raises
    ------
    NotImplementedError
        If a substance type other than pure or binary is encountered.
    """
    root_output_directory = "partitioned_data"

    # Define the types of property which are of interest.
    properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Pure),
        (EnthalpyOfMixing, SubstanceType.Binary),
        (ExcessMolarVolume, SubstanceType.Binary),
        (Density, SubstanceType.Binary),
    ]

    # Define the types of mixture which are of interest
    environment_types = ["alcohol_alcohol", "alcohol_ester", "ester_ester"]

    # Find all of the substances which appeared in the training set
    training_smiles = find_training_smiles()

    for environment_type in environment_types:

        output_directory = os.path.join(root_output_directory, environment_type)
        os.makedirs(output_directory, exist_ok=True)

        for property_type, substance_type in properties_of_interest:

            full_data_frame = load_processed_data_set(
                os.path.join("filtered_data", environment_type),
                property_type,
                substance_type,
            )

            property_type = property_to_snake_case(property_type)
            file_name = f"{property_type}_{str(substance_type.value)}"

            # Build the (directory name, row mask) pairs to export, in the
            # order in which they should be written.
            if substance_type == SubstanceType.Binary:

                first_in_training = full_data_frame["Component 1"].isin(
                    training_smiles
                )
                second_in_training = full_data_frame["Component 2"].isin(
                    training_smiles
                )

                partitions = [
                    # Neither component appears in the training set.
                    ("not_in_training",
                     ~first_in_training & ~second_in_training),
                    # Both components appear in the training set.
                    ("both_in_training",
                     first_in_training & second_in_training),
                    # Exactly one component appears in the training set.
                    ("one_in_training",
                     (first_in_training & ~second_in_training)
                     | (~first_in_training & second_in_training)),
                ]

            elif substance_type == SubstanceType.Pure:

                # Pure data is only extracted when the single component
                # does not appear in the training set.
                partitions = [
                    ("not_in_training",
                     ~full_data_frame["Component 1"].isin(training_smiles)),
                ]

            else:

                raise NotImplementedError()

            for partition_name, row_mask in partitions:

                base_directory = os.path.join(output_directory, partition_name)
                os.makedirs(base_directory, exist_ok=True)

                data_frame = full_data_frame[row_mask]

                data_frame.to_csv(
                    os.path.join(base_directory, file_name + ".csv"),
                    index=False,
                )
                data_frame_to_pdf(
                    data_frame,
                    os.path.join(base_directory, file_name + ".pdf"),
                )
def main():
    """Select two binary training sets and export each of them.

    After filtering the common data, data points are selected for the
    chosen alcohol / ester (or acid) mixtures at three target
    compositions: once for enthalpy of mixing with excess molar volume,
    and once for enthalpy of mixing with binary density. Each selection
    is written to ``training_sets`` as JSON, CSV and PDF.
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO)

    substance_pairs = [
        # Methanol
        ("CO", "COC=O"),  # Methyl formate
        ("CO", "CCOC(=O)CC(=O)OCC"),  # Diethyl Malonate
        # Ethanol
        ("CCO", "CC(=O)O"),  # Acetic acid
        ("CCO", "CCOC(C)=O"),  # Ethyl acetate
        ("CCO", "CCOC(=O)CC(=O)OCC"),  # Diethyl Malonate
        # Butanol
        ("CCCCO", "CCOC(=O)CC(=O)OCC"),  # Diethyl Malonate
        # Isopropanol
        ("CC(C)O", "CCOC(=O)CC(=O)OCC"),  # Diethyl Malonate
        # Isobutanol
        ("CC(C)CO", "CCOC(=O)CC(=O)OCC"),  # Diethyl Malonate
        # Tert-butanol
        ("CC(C)(C)O", "COC=O"),  # Methyl formate
        ("CC(C)(C)O", "CCCCOC(C)=O"),  # Butyl acetate
    ]

    # Sort the components of every pair so each has a consistent ordering.
    substances = [tuple(sorted(pair)) for pair in substance_pairs]

    target_states = [
        StatePoint(298.15 * unit.kelvin, 1.0 * unit.atmosphere, (0.25, 0.75)),
        StatePoint(298.15 * unit.kelvin, 1.0 * unit.atmosphere, (0.50, 0.50)),
        StatePoint(298.15 * unit.kelvin, 1.0 * unit.atmosphere, (0.75, 0.25)),
    ]

    filtered_directory = "filtered_common_data"
    os.makedirs(filtered_directory, exist_ok=True)

    # NOTE(review): called with the directory only - confirm this matches
    # the filter_common_data definition which is in scope.
    filter_common_data(filtered_directory)

    output_directory = "training_sets"
    os.makedirs(output_directory, exist_ok=True)

    # Each entry: the filtered sub-directory to select from, the property
    # paired with the enthalpy of mixing, and the three output file names.
    selections = [
        (
            "h_mix_and_v_excess",
            ExcessMolarVolume,
            "h_mix_v_excess_training_set.json",
            "h_mix_v_excess_training_set.csv",
            "h_mix_v_excess_training_set.pdf",
        ),
        (
            "h_mix_and_binary_density",
            Density,
            "h_mix_density_training_set.json",
            "h_mix_density_training_set.csv",
            "h_mix_density_training_set.pdf",
        ),
    ]

    for sub_directory, paired_property, json_name, csv_name, pdf_name in selections:

        selected_data_set = select_data_points(
            data_directory=os.path.join(filtered_directory, sub_directory),
            chosen_substances=substances,
            target_state_points={
                (EnthalpyOfMixing, SubstanceType.Binary): target_states,
                (paired_property, SubstanceType.Binary): target_states,
            },
        )

        selected_data_set.json(os.path.join(output_directory, json_name))

        selected_data_frame = selected_data_set.to_pandas()
        selected_data_frame.to_csv(os.path.join(output_directory, csv_name),
                                   index=False)

        data_frame_to_pdf(selected_data_frame,
                          os.path.join(output_directory, pdf_name))