def find_training_smiles():
    """Collect the smiles of every component which appeared in the training set.

    The training set is read from the ``training_set.json`` file of the
    ``alcohol_ester`` / ``h_mix_rho_x_rho_pure_h_vap`` force balance target,
    located relative to the current working directory.

    Returns
    -------
    set
        The unique entries found across all of the smiles tuples of the
        training substances, i.e. the substance tuples flattened into a
        single set (presumably of ``str`` smiles patterns).
    """
    training_set_path = os.path.join(
        "..",
        "..",
        "..",
        "pure_mixture_optimisation",
        "force_balance",
        "alcohol_ester",
        "h_mix_rho_x_rho_pure_h_vap",
        "targets",
        "mixture_data",
        "training_set.json",
    )

    # Load the substances which were included in the training set.
    training_frame = PhysicalPropertyDataSet.from_json(training_set_path).to_pandas()
    smiles_tuples = data_frame_to_smiles_tuples(training_frame)

    # Flatten the per-substance tuples into one set of components.
    return {
        component for smiles_tuple in smiles_tuples for component in smiles_tuple
    }
def data_frame_to_pdf(data_frame, file_path, rows=10, columns=6):
    """Creates a PDF file containing images of the substances contained
    in a data frame.

    Nothing is written when the data frame contains no rows.

    Parameters
    ----------
    data_frame: pandas.DataFrame
        The data frame containing the different substances.
    file_path: str
        The file path to save the pdf to.
    rows: int
        The maximum number of rows of molecules to include per page.
    columns: int
        The maximum number of molecules to include per row.
    """
    # Only draw something when there is at least one row of data.
    if len(data_frame) > 0:
        substance_smiles = data_frame_to_smiles_tuples(data_frame)
        smiles_to_pdf(substance_smiles, file_path, rows, columns)
def main():
    """Summarise how many substances have data for each property combination.

    Every sub-directory of ``data_by_environments`` is treated as a pair of
    chemical environments named ``<environment 1>_<environment 2>``. For each
    pair, the number of substances with data for each property of interest,
    and for each pair of properties of interest, is counted from the data
    sets stored in that directory's ``all_data`` folder. The counts are
    written to ``summary.csv`` and ``summary.md`` in the current working
    directory, sorted by the ``Hmix(x) + rho(x)`` count in descending order.

    Raises
    ------
    ValueError
        If a directory name does not contain exactly one underscore.
    """
    root_data_directory = "data_by_environments"

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # Define the properties and environments we are interested in.
    environments_of_interest = [
        os.path.basename(x)
        for x in glob(os.path.join(root_data_directory, "*"))
    ]

    properties_of_interest = [
        (EnthalpyOfMixing, SubstanceType.Binary),
        (Density, SubstanceType.Binary),
        # (ExcessMolarVolume, SubstanceType.Binary),
    ]

    friendly_names = {
        (EnthalpyOfMixing, SubstanceType.Binary): "Hmix(x)",
        (Density, SubstanceType.Binary): "rho(x)",
        # (ExcessMolarVolume, SubstanceType.Binary): "Vexcess(x)",
    }

    # Consider each property on its own, followed by each pair of properties.
    property_combinations = [(x,) for x in properties_of_interest]
    property_combinations.extend(itertools.combinations(properties_of_interest, 2))

    # Build the column name of each combination up front so that it never
    # depends on how far the data loading loop below gets before stopping.
    combination_names = [
        " + ".join(friendly_names[x] for x in property_combination)
        for property_combination in property_combinations
    ]

    data_rows = []

    for environment_of_interest in environments_of_interest:

        environment_1, environment_2 = environment_of_interest.split("_")

        data_row = {
            "Environment 1": environment_1,
            "Environment 2": environment_2,
        }

        data_directory = os.path.join(
            root_data_directory, environment_of_interest, "all_data"
        )

        for property_combination, property_string in zip(
            property_combinations, combination_names
        ):

            # Find the set of substances which are common to all of the
            # specified property types.
            all_substance_smiles = []

            for property_tuple in property_combination:

                data_frame = load_processed_data_set(data_directory, *property_tuple)

                if len(data_frame) == 0:
                    # No data for one property means nothing can be common.
                    all_substance_smiles = []
                    break

                substance_smiles = set(data_frame_to_smiles_tuples(data_frame))
                all_substance_smiles.append(substance_smiles)

            common_substance_smiles = set()

            if len(all_substance_smiles) > 0:
                common_substance_smiles = set.intersection(*all_substance_smiles)

            data_row[property_string] = len(common_substance_smiles)

        data_rows.append(data_row)

    columns = ["Environment 1", "Environment 2", *combination_names]

    summary_frame = pandas.DataFrame(data=data_rows, columns=columns)
    # Every count is now always set, so this is only a safety net.
    summary_frame.fillna(0, inplace=True)
    summary_frame.sort_values(["Hmix(x) + rho(x)"], ascending=False, inplace=True)

    summary_frame.to_csv("summary.csv", index=False)

    with open("summary.md", "w") as file:
        # NOTE(review): newer pandas releases appear to expect `index=False`
        # here rather than `showindex` - confirm against the pinned version.
        summary_frame.to_markdown(file, showindex=False)
def _build_substance_data(data_directory, target_substances_per_property, smirks_to_exercise): """Loads all of the different data sets for each property type of interest and converts them into a single list of `SubstanceData` objects. Any substances which don't exercise at least one of the chemical environments of interest are ignored. Parameters ---------- data_directory: str The directory which contains the processed pandas data sets target_substances_per_property: dict of tuple of type and SubstanceType and int The target number of unique substances to choose for each type of property of interest. smirks_to_exercise: list of str A list of those smirks patterns which represent those chemical environments which we to aim to exercise. Returns ------- list of SubstanceData The loaded substance data. """ all_substance_tuples = defaultdict(set) all_smiles_patterns = set() for property_type, substance_type in target_substances_per_property: # Load the full data sets from the processed data file data_frame = load_processed_data_set(data_directory, property_type, substance_type) substance_tuples = data_frame_to_smiles_tuples(data_frame) for substance_tuple in substance_tuples: all_substance_tuples[substance_tuple].add( (property_type, substance_type)) substance_smiles = set(x for y in substance_tuples for x in y) all_smiles_patterns.update(substance_smiles) # Build the list of substances which we can choose from all_substance_data = [] for substance_tuple in all_substance_tuples: # Make sure that this smiles tuple does actually exercise at least one # of the chemical environments of interest. 
smiles_per_smirks = find_smirks_matches(tuple(smirks_to_exercise), *substance_tuple) all_exercised_smirks = set([ smirks for smirks, smiles in smiles_per_smirks.items() if len(smiles) > 0 ]) smirks_per_smiles = invert_dict_of_iterable(smiles_per_smirks) exercised_smirks_of_interest = set() for smiles_pattern in substance_tuple: if (smiles_pattern not in smirks_per_smiles or len(smirks_per_smiles[smiles_pattern]) == 0): continue exercised_smirks_of_interest.update( smirks_per_smiles[smiles_pattern]) if len(exercised_smirks_of_interest) == 0: continue substance_data = SubstanceData( substance_tuple=substance_tuple, smirks_exercised=all_exercised_smirks, property_types=all_substance_tuples[substance_tuple], ) all_substance_data.append(substance_data) return all_substance_data
def main():
    """Extract the data which is common to several property types.

    For every chemical environment directory found under
    ``data_by_environments``, and for each set of property types of interest,
    the substances which have data for all of the property types in the set
    are determined. Images of those substances are saved to a pdf file (only
    when at least one such substance exists) and the data sets, filtered to
    those substances, are saved into the environment's ``common_data``
    directory. A set of property types is skipped entirely when any one of
    its data sets is empty.
    """
    root_output_directory = "data_by_environments"

    # Set up logging
    logging.basicConfig(level=logging.INFO)

    h_mix = (EnthalpyOfMixing, SubstanceType.Binary)
    rho_x = (Density, SubstanceType.Binary)
    v_excess = (ExcessMolarVolume, SubstanceType.Binary)

    # Define the types of data to find.
    properties_of_interest = [
        [h_mix, rho_x],
        [h_mix, v_excess],
        [h_mix, rho_x, v_excess],
    ]

    # Define some shorter file names to use:
    type_to_file_name = {
        rho_x: "rho_x",
        h_mix: "h_mix",
        v_excess: "v_excess",
    }

    # Define which types of mixtures we are interested in, e.g.
    # alcohol-alcohol, alcohol-ester etc.
    environment_names = [
        os.path.basename(path) for path in glob("data_by_environments/*")
    ]

    for environment_name in environment_names:

        data_directory = os.path.join(
            "data_by_environments", environment_name, "all_data"
        )
        common_directory = os.path.join(
            root_output_directory, environment_name, "common_data"
        )

        os.makedirs(common_directory, exist_ok=True)

        for property_type_set in properties_of_interest:

            # Collect the substances measured for each of the property types.
            smiles_per_property = []

            for property_type, substance_type in property_type_set:

                data_frame = load_processed_data_set(
                    data_directory, property_type, substance_type
                )

                if len(data_frame) == 0:
                    # One empty data set means there can be no common data.
                    smiles_per_property.clear()
                    break

                smiles_per_property.append(
                    set(data_frame_to_smiles_tuples(data_frame))
                )

            if not smiles_per_property:
                continue

            # Keep only the substances common to all of the property types.
            common_substance_smiles = set.intersection(*smiles_per_property)

            # Save the common substances to a pdf file.
            file_name = "_".join(type_to_file_name[x] for x in property_type_set)
            file_path = os.path.join(common_directory, f"{file_name}.pdf")

            if common_substance_smiles:
                smiles_to_pdf(list(common_substance_smiles), file_path)

            # Output the common data to the `common_data` directory.
            output_directory = os.path.join(common_directory, file_name)

            for property_type, substance_type in property_type_set:

                data_frame = load_processed_data_set(
                    data_directory, property_type, substance_type
                )
                data_frame = filter_by_substance_composition(
                    data_frame, common_substance_smiles, None
                )

                save_processed_data_set(
                    output_directory, data_frame, property_type, substance_type
                )