def filter_data(data_directory, properties_of_interest, chemical_environments, output_directory):
    """Retain only the measurements made for systems whose components
    contain the chemical environments of interest.

    For every requested property the processed data set is loaded,
    restricted to a fixed set of elements, filtered by chemical
    environment, and then stored (together with a pdf overview) in the
    output directory.

    Parameters
    ----------
    data_directory: str
        The directory containing the unfiltered data.
    properties_of_interest: list of tuple of PropertyType and SubstanceType
        The types of properties to extract data for.
    chemical_environments: list of list of str
        The chemical environments to filter by. Each inner list holds
        the environments which should be matched by one of the
        components in the system.
    output_directory: str
        The directory to store the extracted data in.
    """
    # Only substances composed of these elements are retained.
    allowed_elements = ("C", "H", "O", "N", "F", "Cl", "Br", "S")

    for property_type, substance_type in properties_of_interest:

        filtered_set = processing.load_processed_data_set(
            data_directory, property_type, substance_type
        )

        # Drop any substance containing an element outside the allowed set.
        filtered_set = filter_by_elements(filtered_set, *allowed_elements)

        # Keep only the substances which match the requested environments.
        filtered_set = filter_by_checkmol(filtered_set, *chemical_environments)

        # Store the filtered data set.
        processing.save_processed_data_set(
            output_directory, filtered_set, property_type, substance_type
        )

        # Store a pdf of all smiles patterns (/ tuples of smiles patterns).
        snake_case_name = property_to_snake_case(property_type)
        pdf_name = f"{snake_case_name}_{str(substance_type.value)}.pdf"

        data_frame_to_pdf(filtered_set, os.path.join(output_directory, pdf_name))
def main():
    """Merge the individual test sets stored in ``test_sets`` into one
    combined set, written out as csv, json and pdf files."""

    logging.basicConfig(level=logging.INFO)

    set_directory = "test_sets"

    # The individual sets, in the order in which they are concatenated.
    source_files = (
        "density_binary.csv",
        "enthalpy_of_mixing_binary.csv",
        "excess_molar_volume_binary.csv",
        "pure_set.csv",
    )

    combined_frame = pandas.concat(
        [
            pandas.read_csv(os.path.join(set_directory, source_file))
            for source_file in source_files
        ],
        ignore_index=True,
        sort=False,
    )
    combined_set = data_set_from_data_frame(combined_frame)

    combined_frame.to_csv(os.path.join(set_directory, "full_set.csv"), index=False)
    combined_set.json(os.path.join(set_directory, "full_set.json"))

    data_frame_to_pdf(combined_frame, os.path.join(set_directory, "full_set.pdf"))
def filter_data(data_directory, property_type, substance_type, output_directory):
    """Filter one processed data set by temperature, chemistry and
    stereochemistry, then store the result and a pdf overview of it.

    Parameters
    ----------
    data_directory: str
        The directory to load the processed data set from.
    property_type
        The type of property to load; passed to the load and save helpers.
    substance_type
        The type of substance to load; its ``value`` is used in the pdf
        file name.
    output_directory: str
        The directory to store the filtered data set and pdf in.
    """

    # SMIRKS patterns passed to ``filter_by_smirks`` for removal.
    excluded_smirks = [
        "[#6a]",  # aromatics
        "[#6r3]",  # 3 membered rings
        "[#6r4]",  # 4 membered rings
        "[#6]=[#6]",  # alkenes
        "[#6]~[#6]~[#6]~[#6]~[#6]~[#6]~[#6]",  # long chains (>= hept)
        "[#6H2]-[#8X2]-[#6H2]",  # ethers
    ]

    # Load in the data set.
    filtered_frame = load_processed_data_set(data_directory, property_type, substance_type)

    # Keep only the measurements made close to ambient temperature.
    minimum_temperature = 290.0 * unit.kelvin
    maximum_temperature = 305 * unit.kelvin

    filtered_frame = filter_by_temperature(
        filtered_frame, minimum_temperature, maximum_temperature
    )

    # Remove the unwanted chemistries. The patterns are passed in the
    # third position - presumably the 'exclude' slot; confirm against
    # the signature of `filter_by_smirks`.
    filtered_frame = filter_by_smirks(filtered_frame, None, excluded_smirks)

    # Remove any molecules with undefined stereochemistry.
    filtered_frame = filter_undefined_stereochemistry(filtered_frame)

    # Store the filtered set.
    save_processed_data_set(output_directory, filtered_frame, property_type, substance_type)

    snake_case_name = property_to_snake_case(property_type)
    pdf_name = f"{snake_case_name}_{str(substance_type.value)}.pdf"

    data_frame_to_pdf(filtered_frame, os.path.join(output_directory, pdf_name))
def main():
    """Select the pure test set.

    The pure density and enthalpy of vaporization data for each
    environment of interest is pooled, filtered, stripped of any
    training set substances and staged in a temporary directory. Data
    points are then selected for every component which has an enthalpy
    of vaporization measurement, and the selection is stored in
    ``test_sets`` as json, csv and pdf files.
    """

    logging.basicConfig(level=logging.INFO)

    root_output_directory = "test_sets"
    os.makedirs(root_output_directory, exist_ok=True)

    # Define the types of property which are of interest.
    properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Pure),
    ]

    # Define the state we would ideally choose data points at. Every
    # property type shares the same list of target states.
    ideal_states = [
        StatePoint(298.15 * unit.kelvin, 1.0 * unit.atmosphere, (1.0,)),
    ]
    target_states = dict.fromkeys(properties_of_interest, ideal_states)

    # Define the environments of interest.
    environments_of_interest = ["alcohol", "ester", "alkane", "ether", "ketone"]

    environment_root = os.path.join(
        "..", "..", "..", "data_availability", "data_by_environments"
    )

    # Load in the training substances so we can avoid selecting
    # them for the test set.
    training_smiles = load_training_components()

    with TemporaryDirectory() as data_directory:

        # Apply the filters to the available data.
        for property_of_interest in properties_of_interest:

            environment_frames = [
                load_processed_data_set(
                    os.path.join(
                        environment_root, f"{environment}_{environment}", "all_data"
                    ),
                    *property_of_interest,
                )
                for environment in environments_of_interest
            ]

            pooled_frame = pandas.concat(
                environment_frames, ignore_index=True, sort=False
            )

            # NOTE(review): `filter_data` is expected here to accept a data
            # frame and to return the filtered frame - confirm against the
            # definition which is in scope for this script.
            pooled_frame = filter_data(pooled_frame)
            pooled_frame = filter_by_smiles(pooled_frame, training_smiles, None)

            save_processed_data_set(data_directory, pooled_frame, *property_of_interest)

        # Determine which components have enthalpy of vaporization
        # measurements. These will be the compounds which will be
        # included in the pure test set.
        h_vap_data_frame = load_processed_data_set(
            data_directory, EnthalpyOfVaporization, SubstanceType.Pure
        )

        unique_components = set(h_vap_data_frame["Component 1"])
        test_set_components = [(component,) for component in unique_components]

        # Select the data points.
        selected_data_set = select_data_points(
            data_directory=data_directory,
            chosen_substances=test_set_components,
            target_state_points=target_states,
        )

        # Store the selection as json, csv and pdf.
        selected_data_set.json(os.path.join(root_output_directory, "pure_set.json"))

        selected_data_frame = selected_data_set.to_pandas()
        selected_data_frame.to_csv(
            os.path.join(root_output_directory, "pure_set.csv"), index=False
        )

        data_frame_to_pdf(
            selected_data_frame, os.path.join(root_output_directory, "pure_set.pdf")
        )
def main():
    """Build the pure training set.

    All enthalpy of vaporization measurements for the training
    substances are retained. For density, only a single measurement is
    kept per substance - the first one recorded at the state point which
    lies closest to 298.15 K and 1 atm. The combined set is stored as
    ``training_set.json`` / ``.csv`` / ``.pdf`` and a report is then
    generated from the json file.
    """

    training_set_smiles = [
        # Ethers
        "C1COCCO1",
        "C1CCOCC1",
        "COC(C)(C)C",
        "CC(C)OC(C)C",
        "CCCCOCCCC",
        # Ketones
        "O=C1CCCC1",
        "CCCC(C)=O",
        "O=C1CCCCC1",
        "O=C1CCCCCC1",
        # Alcohols
        "CO",
        "CCO",
        "CCCO",
        "CCCCO",
        "CC(C)(C)O",
        "CC(C)O",
        "CC(C)CO",
        # Esters
        "CC(=O)O",
        "COC=O",
        "CCOC(C)=O",
        "CCOC(=O)CC(=O)OCC",
        "CCCCOC(C)=O",
        "CCCOC(C)=O",
        # Alkanes
        "C1CCCCC1",
        "CCCCCC",
        "CC1CCCCC1",
        "CCCCCCC",
        "CC(C)CC(C)(C)C",
        "CCCCCCCCCC",
    ]

    # Ensure the smiles patterns are standardized.
    smiles = [Component(x).smiles for x in training_set_smiles]

    # Load in the Hvap data
    h_vap_data_frame = pandas.read_csv(
        os.path.join(
            "..",
            "..",
            "..",
            "data_availability",
            "sourced_h_vap_data",
            "enthalpy_of_vaporization_pure.csv",
        )
    )
    h_vap_data_frame = filter_by_smiles(
        h_vap_data_frame, smiles_to_include=smiles, smiles_to_exclude=None
    )

    h_vap_data_set = data_set_from_data_frame(h_vap_data_frame)

    # Load in the density data
    density_data_frame = pandas.read_csv(
        os.path.join(
            "..",
            "..",
            "..",
            "..",
            "shared",
            "filtered_data",
            "density_pure.csv",
        )
    )
    density_data_frame = filter_by_smiles(
        density_data_frame, smiles_to_include=smiles, smiles_to_exclude=None
    )

    density_data_set = data_set_from_data_frame(density_data_frame)

    # Retain the density measurements which were made closest to 298.15K and 1 atm.
    target_state_point = StatePoint(
        temperature=298.15 * unit.kelvin,
        pressure=1.0 * unit.atmosphere,
        mole_fractions=(1.0,),
    )

    # The distance key does not depend on the substance, so build it once.
    distance_to_target = functools.partial(
        StatePoint.individual_distances, target_state_point
    )

    final_data_set = PhysicalPropertyDataSet()

    for substance in density_data_set.substances:

        # Group the measurements of this substance by their state point.
        properties_per_state = defaultdict(list)

        for physical_property in density_data_set.properties_by_substance(substance):

            state_point = StatePoint.from_physical_property(physical_property)
            properties_per_state[state_point].append(physical_property)

        # Only the closest state is needed, so take the minimum rather than
        # sorting every state. On ties `min` returns the first state
        # encountered, which matches taking element 0 of a stable sort.
        closest_state_point = min(properties_per_state, key=distance_to_target)

        # Keep the first measurement recorded at the closest state.
        final_data_set.add_properties(properties_per_state[closest_state_point][0])

    final_data_set.merge(h_vap_data_set)

    final_data_set.json("training_set.json", format=True)

    final_data_frame = final_data_set.to_pandas()
    final_data_frame.to_csv("training_set.csv", index=False)

    data_frame_to_pdf(final_data_frame, "training_set.pdf")
    generate_report("training_set.json")
def main():
    """Select the binary enthalpy of mixing and density training set.

    The chosen binary substances are drawn to ``all_substances.pdf``,
    the common data is filtered into ``filtered_common_data``, and data
    points are then selected close to three target compositions at
    298.15 K and 1 atm. The selection is stored in ``training_sets`` as
    json, csv and pdf files, and a report is generated from the json
    file.
    """

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    smiles_map = {
        # Ethers
        "1,4-dioxane": "C1COCCO1",
        "oxane": "C1CCOCC1",
        "methyl tert butyl ether": "COC(C)(C)C",
        "diisopropyl ether": "CC(C)OC(C)C",
        "dibuytl ether": "CCCCOCCCC",  # (sic) - the key is used as spelt.
        # Ketones
        "cyclopentanone": "O=C1CCCC1",
        "2-pentanone": "CCCC(C)=O",
        "cyclohexanone": "O=C1CCCCC1",
        "cycloheptanone": "O=C1CCCCCC1",
        # Alcohols
        "methanol": "CO",
        "ethanol": "CCO",
        "propanol": "CCCO",
        "butanol": "CCCCO",
        "propan-2-ol": "CC(C)O",
        "2-Methylpropan-1-ol": "CC(C)CO",
        "2-Methylpropan-2-ol": "CC(C)(C)O",
        # Esters / acids
        "acetic acid": "CC(=O)O",
        "methyl formate": "COC=O",
        "ethyl acetate": "CCOC(C)=O",
        "propyl acetate": "CCCOC(C)=O",
        "butyl acetate": "CCCCOC(C)=O",
        "diethyl propanedioate": "CCOC(=O)CC(=O)OCC",
        # Alkanes
        "cyclohexane": "C1CCCCC1",
        "hexane": "CCCCCC",
        "methylcyclohexane": "CC1CCCCC1",
        "heptane": "CCCCCCC",
        "iso-octane": "CC(C)CC(C)(C)C",
        "decane": "CCCCCCCCCC",
    }

    # The binary systems of interest, named by their `smiles_map` keys.
    named_pairs = [
        # Ether - Alkane
        ("dibuytl ether", "iso-octane"),
        ("oxane", "heptane"),
        ("methyl tert butyl ether", "decane"),
        ("diisopropyl ether", "iso-octane"),
        ("diisopropyl ether", "heptane"),
        ("oxane", "hexane"),
        ("oxane", "cyclohexane"),
        # Alcohol - Alkane
        ("propanol", "cyclohexane"),
        ("propanol", "iso-octane"),
        ("propanol", "methylcyclohexane"),
        ("butanol", "iso-octane"),
        ("butanol", "hexane"),
        ("butanol", "methylcyclohexane"),
        ("butanol", "heptane"),
        ("ethanol", "iso-octane"),
        ("ethanol", "heptane"),
        # Ether - Ketone
        ("oxane", "cyclopentanone"),
        ("oxane", "cyclohexanone"),
        ("oxane", "2-pentanone"),
        ("1,4-dioxane", "cyclopentanone"),
        ("1,4-dioxane", "cyclohexanone"),
        ("1,4-dioxane", "2-pentanone"),
        ("1,4-dioxane", "cycloheptanone"),
        # Alcohol - Ester ( / acid)
        ("methanol", "methyl formate"),
        ("methanol", "diethyl propanedioate"),
        ("ethanol", "acetic acid"),
        ("ethanol", "ethyl acetate"),
        ("ethanol", "diethyl propanedioate"),
        ("butanol", "diethyl propanedioate"),
        ("propan-2-ol", "diethyl propanedioate"),
        ("2-Methylpropan-1-ol", "diethyl propanedioate"),
        ("2-Methylpropan-2-ol", "methyl formate"),
        ("2-Methylpropan-2-ol", "butyl acetate"),
    ]

    # Resolve each name to its smiles pattern and order each pair.
    substances = [
        tuple(sorted((smiles_map[first_name], smiles_map[second_name])))
        for first_name, second_name in named_pairs
    ]

    smiles_to_pdf(substances, "all_substances.pdf")

    # Target three compositions, all at 298.15 K and 1 atm.
    target_states = [
        StatePoint(298.15 * unit.kelvin, 1.0 * unit.atmosphere, mole_fractions)
        for mole_fractions in ((0.25, 0.75), (0.50, 0.50), (0.75, 0.25))
    ]

    filtered_directory = "filtered_common_data"
    os.makedirs(filtered_directory, exist_ok=True)

    filter_common_data(filtered_directory, substances)

    output_directory = "training_sets"
    os.makedirs(output_directory, exist_ok=True)

    training_set = select_data_points(
        data_directory=os.path.join(filtered_directory, "h_mix_and_rho_x"),
        chosen_substances=None,
        target_state_points={
            (EnthalpyOfMixing, SubstanceType.Binary): target_states,
            (Density, SubstanceType.Binary): target_states,
        },
    )

    json_path = os.path.join(output_directory, "h_mix_rho_x_training_set.json")
    training_set.json(json_path)

    training_frame = training_set.to_pandas()
    training_frame.to_csv(
        os.path.join(output_directory, "h_mix_rho_x_training_set.csv"), index=False
    )

    data_frame_to_pdf(
        training_frame,
        os.path.join(output_directory, "h_mix_rho_x_training_set.pdf"),
    )

    generate_report(json_path)
def filter_common_data(output_directory, substances):
    """Filter the common enthalpy of mixing and binary density data down
    to the chosen substances and to intermediate compositions.

    For each environment mix the common data is loaded, filtered by
    substance composition, and restricted to rows whose
    ``Mole Fraction 1`` lies strictly between 0.10 and 0.90. The
    per-environment results are concatenated and stored, together with
    a pdf overview, in the ``h_mix_and_rho_x`` sub-directory of
    ``output_directory``.

    Trimming the data ahead of selection reportedly helps the state
    selection method get closer to the target states.

    Parameters
    ----------
    output_directory: str
        The directory in which the ``h_mix_and_rho_x`` sub-directory
        is created.
    substances
        The substances passed to ``filter_by_substance_composition``.
    """

    common_directory = os.path.join(output_directory, "h_mix_and_rho_x")
    os.makedirs(common_directory, exist_ok=True)

    source_root = os.path.join(
        "..", "..", "..", "data_availability", "data_by_environments"
    )

    environment_mixes = (
        "alcohol_ester",
        "alcohol_alkane",
        "ether_alkane",
        "ether_ketone",
    )
    property_types = (
        (EnthalpyOfMixing, SubstanceType.Binary),
        (Density, SubstanceType.Binary),
    )

    for property_type, substance_type in property_types:

        filtered_frames = []

        for environment_mix in environment_mixes:

            frame = load_processed_data_set(
                os.path.join(source_root, environment_mix, "common_data", "h_mix_rho_x"),
                property_type,
                substance_type,
            )
            frame = filter_by_substance_composition(frame, substances, None)

            # Discard the compositions close to either pure component.
            mole_fraction = frame["Mole Fraction 1"]
            frame = frame[(mole_fraction > 0.10) & (mole_fraction < 0.90)]

            filtered_frames.append(frame)

        full_data_frame = pandas.concat(filtered_frames)

        save_processed_data_set(
            common_directory, full_data_frame, property_type, substance_type
        )

        pdf_name = property_to_file_name(property_type, substance_type) + ".pdf"
        data_frame_to_pdf(full_data_frame, os.path.join(common_directory, pdf_name))
def _save_partition(data_frame, output_directory, partition_name, file_name):
    """Store one partition of the data as csv and pdf files inside the
    ``partition_name`` sub-directory of ``output_directory``."""

    base_directory = os.path.join(output_directory, partition_name)
    os.makedirs(base_directory, exist_ok=True)

    data_frame.to_csv(os.path.join(base_directory, file_name + ".csv"), index=False)
    data_frame_to_pdf(data_frame, os.path.join(base_directory, file_name + ".pdf"))


def main():
    """Partition the filtered data by its overlap with the training set.

    For each environment type and property of interest the filtered
    data is split, based on whether its components appear in the
    training set, into:

    * ``not_in_training`` - no component appears in the training set.
    * ``both_in_training`` - both components appear (binary data only).
    * ``one_in_training`` - exactly one component appears (binary data only).

    Each partition is stored as csv and pdf files below
    ``partitioned_data/<environment type>/<partition>``.

    Raises
    ------
    NotImplementedError
        If a property of interest has a substance type which is neither
        pure nor binary.
    """

    root_output_directory = "partitioned_data"

    # Define the types of property which are of interest.
    properties_of_interest = [
        (Density, SubstanceType.Pure),
        (EnthalpyOfVaporization, SubstanceType.Pure),
        (EnthalpyOfMixing, SubstanceType.Binary),
        (ExcessMolarVolume, SubstanceType.Binary),
        (Density, SubstanceType.Binary),
    ]

    # Define the types of mixture which are of interest
    environment_types = ["alcohol_alcohol", "alcohol_ester", "ester_ester"]

    # Find all of the substances which appeared in the training set
    training_smiles = find_training_smiles()

    for environment_type in environment_types:

        output_directory = os.path.join(root_output_directory, environment_type)
        os.makedirs(output_directory, exist_ok=True)

        for property_type, substance_type in properties_of_interest:

            full_data_frame = load_processed_data_set(
                os.path.join("filtered_data", environment_type),
                property_type,
                substance_type,
            )

            property_type = property_to_snake_case(property_type)
            file_name = f"{property_type}_{str(substance_type.value)}"

            if substance_type not in (SubstanceType.Pure, SubstanceType.Binary):
                raise NotImplementedError()

            # Compute each membership mask once and reuse it for every
            # partition.
            first_in_training = full_data_frame["Component 1"].isin(training_smiles)

            if substance_type == SubstanceType.Pure:

                # Pure data only has the 'not in training' partition.
                _save_partition(
                    full_data_frame[~first_in_training],
                    output_directory,
                    "not_in_training",
                    file_name,
                )
                continue

            second_in_training = full_data_frame["Component 2"].isin(training_smiles)

            partition_masks = {
                # Neither component appears in the training set.
                "not_in_training": ~first_in_training & ~second_in_training,
                # Both components appear in the training set.
                "both_in_training": first_in_training & second_in_training,
                # Exactly one component appears in the training set.
                "one_in_training": (first_in_training & ~second_in_training)
                | (~first_in_training & second_in_training),
            }

            for partition_name, mask in partition_masks.items():

                _save_partition(
                    full_data_frame[mask], output_directory, partition_name, file_name
                )
def main():
    """Select the alcohol / ester binary training sets.

    Two sets are selected for the same substances and target states:
    enthalpy of mixing with excess molar volume, and enthalpy of mixing
    with binary density. Each set is stored in ``training_sets`` as
    json, csv and pdf files.
    """

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # The second components which are paired with the alcohols below.
    methyl_formate = "COC=O"
    diethyl_malonate = "CCOC(=O)CC(=O)OCC"
    acetic_acid = "CC(=O)O"
    ethyl_acetate = "CCOC(C)=O"
    butyl_acetate = "CCCCOC(C)=O"

    unordered_pairs = [
        # Methanol
        ("CO", methyl_formate),
        ("CO", diethyl_malonate),
        # Ethanol
        ("CCO", acetic_acid),
        ("CCO", ethyl_acetate),
        ("CCO", diethyl_malonate),
        # Butanol
        ("CCCCO", diethyl_malonate),
        # Isopropanol
        ("CC(C)O", diethyl_malonate),
        # Isobutanol
        ("CC(C)CO", diethyl_malonate),
        # Tert-butanol
        ("CC(C)(C)O", methyl_formate),
        ("CC(C)(C)O", butyl_acetate),
    ]

    substances = [tuple(sorted(pair)) for pair in unordered_pairs]

    # Target three compositions, all at 298.15 K and 1 atm.
    target_states = [
        StatePoint(298.15 * unit.kelvin, 1.0 * unit.atmosphere, mole_fractions)
        for mole_fractions in ((0.25, 0.75), (0.50, 0.50), (0.75, 0.25))
    ]

    filtered_directory = "filtered_common_data"
    os.makedirs(filtered_directory, exist_ok=True)

    # NOTE(review): `filter_common_data` is called with only the output
    # directory here - confirm against the definition which is in scope
    # for this script.
    filter_common_data(filtered_directory)

    output_directory = "training_sets"
    os.makedirs(output_directory, exist_ok=True)

    # (filtered sub-directory, property paired with the enthalpy of
    # mixing, stem of the output file names)
    selections = (
        ("h_mix_and_v_excess", ExcessMolarVolume, "h_mix_v_excess_training_set"),
        ("h_mix_and_binary_density", Density, "h_mix_density_training_set"),
    )

    for source_name, paired_property, file_stem in selections:

        training_set = select_data_points(
            data_directory=os.path.join(filtered_directory, source_name),
            chosen_substances=substances,
            target_state_points={
                (EnthalpyOfMixing, SubstanceType.Binary): target_states,
                (paired_property, SubstanceType.Binary): target_states,
            },
        )

        training_set.json(os.path.join(output_directory, file_stem + ".json"))

        training_frame = training_set.to_pandas()
        training_frame.to_csv(
            os.path.join(output_directory, file_stem + ".csv"), index=False
        )

        data_frame_to_pdf(
            training_frame, os.path.join(output_directory, file_stem + ".pdf")
        )