class data_manipulation_tests(unittest.TestCase):
    """Smoke tests for salty's data aggregation and manipulation helpers.

    The two dev_model fixtures are built once, while the class body
    executes, and are shared by all test methods.
    """

    # property keys passed to salty.aggregate_data
    data = ['cpt']
    data2 = ['cpt', 'density', 'viscosity']
    # three [low, high] pairs, passed as data_ranges together with data2
    data_ranges = [[200, 1000], [900, 1300], [0, 2]]
    # state variable windows passed as T and P
    T = [298.1, 298.16]
    P = [101, 102]
    # same query twice: once with scale_center=False, once with the default
    devmodel1 = salty.aggregate_data(
        data2,
        T=T,
        P=P,
        impute=True,
        data_ranges=data_ranges,
        scale_center=False)
    devmodel = salty.aggregate_data(
        data2,
        T=T,
        P=P,
        impute=True,
        data_ranges=data_ranges)

    def test_1_aggregate_data(self):
        """Aggregate the single-property query and hand back the result."""
        return salty.aggregate_data(self.data, T=self.T, P=self.P)

    def test_2_devmodel_to_array(self):
        """Split the shared devmodel into train/test arrays (80 % train)."""
        arrays = salty.devmodel_to_array(self.devmodel, train_fraction=0.8)
        # unpacking enforces that exactly four arrays come back
        X_train, Y_train, X_test, Y_test = arrays
        return X_train, Y_train, X_test, Y_test

    def test_3_merge_duplicates(self):
        """Merge duplicate entries of the shared devmodel."""
        return salty.merge_duplicates(self.devmodel, keep_descriptors=True)

    def test_4_assign_category(self):
        """Run assign_category on the unscaled devmodel's Data."""
        return salty.assign_category(self.devmodel1.Data)

    def test_benchmark(self):
        """Feed each of the four tests above to salty.Benchmark.run."""
        cases = (
            self.test_1_aggregate_data,
            self.test_2_devmodel_to_array,
            self.test_3_merge_duplicates,
            self.test_4_assign_category,
        )
        for case in cases:
            salty.Benchmark.run(case)
def test_gaussian_pdf(self):
    """Compute KDEs with adl.gaussian_pdf from experimental cpt/density."""
    # grab experimental data inside narrow state variable ranges;
    # the MD simulation will be set to 101 kPa and 298 K
    T = [297, 316]
    P = [99, 102]
    cpt_bounds = [[207, 3000]]
    cpt_data = salty.aggregate_data(
        ["cpt"], T=T, P=P, data_ranges=cpt_bounds)
    dens_data = salty.aggregate_data(["density"], T=T, P=P)

    # calc KDEs from experimental data, heat capacity first
    cpt_values = cpt_data.Data["Heat capacity at constant pressure, J/K/mol"]
    adl.gaussian_pdf(cpt_values)
    dens_values = dens_data.Data["Specific density, kg/m<SUP>3</SUP>"]
    adl.gaussian_pdf(dens_values)
def test_calculate_minimum_distances(self):
    """Call adl.calculate_minimum_distances on a two-column data slice."""
    # narrow state variable ranges; the MD simulation will be set to
    # 101 kPa and 298 K
    T = [298.1, 298.16]
    P = [101, 102]
    aggregated = salty.aggregate_data(["cpt", "density"], T=T, P=P)
    # columns 2 and 3 of the de-duplicated frame are passed as the hull
    hull = salty.merge_duplicates(aggregated).iloc[:, 2:4]
    adl.calculate_minimum_distances(hull, 1200, 600)
class visualization_library_tests(unittest.TestCase):
    """Smoke test for vis.parity_plot using a tiny, one-epoch network.

    Data aggregation, the train/test split and the model fit all happen
    while the class body executes; the test methods only read the results.
    """

    data = ['cpt', 'density']
    T = [298.1, 298.16]
    P = [101, 102]
    devmodel = salty.aggregate_data(data, T=T, P=P)
    X_train, Y_train, X_test, Y_test = salty.devmodel_to_array(
        devmodel, train_fraction=0.8)

    # 75-unit relu layer -> 25 % dropout -> linear output sized to Y_train
    model = Sequential()
    model.add(Dense(75, activation='relu', input_dim=X_train.shape[1]))
    model.add(Dropout(0.25))
    model.add(Dense(Y_train.shape[1], activation='linear'))
    model.compile(
        optimizer="adam",
        loss="mean_squared_error",
        metrics=['mse'])
    model.fit(X_train, Y_train, epochs=1, verbose=False)

    def test_1_parity_plot(self):
        """Build a parity plot from the held-out split and return it."""
        return vis.parity_plot(
            self.X_test, self.Y_test, self.model, self.devmodel)

    def test_benchmark(self):
        """Feed the parity plot test to salty.Benchmark.run."""
        salty.Benchmark.run(self.test_1_parity_plot)
from gains.salt_generator import generate_solvent
import salty
from random import randint

# Properties handed to generate_solvent and to the experimental data query.
model_ID = ["cpt", "density"]
T = [298.1, 298.16]
P = [101, 102]
exp_data = ["cpt", "density"]

# Experimental data; columns 2 and 3 of the de-duplicated frame are passed
# on as the hull.
data = salty.aggregate_data(exp_data, T=T, P=P)
merged = salty.merge_duplicates(data)
to_hull = merged.iloc[:, 2:4]

target = [0, 0]
simplex_id = 1
token_id = randint(1000, 9999)  # random four-digit token for this run

# Keyword options for the search, gathered in one place.
search_options = {
    "heavy_atom_limit": 20,
    "sim_bounds": [0.55, 1],
    "hits": 10,
    "write_file": False,
    "hull": to_hull,
    "simplex": simplex_id,
    "exp_data": data,
    "verbose": 0,
    "gen_token": token_id,
    "hull_bounds": [0, 0.1],
    "inner_search": False,
    "parent_cap": 1,
    "mutation_cap": 1000,
}
generate_solvent(target, model_ID, **search_options)
def test_1_aggregate_data(self):
    """Aggregate data for self.data within self.T / self.P and return it."""
    return salty.aggregate_data(self.data, T=self.T, P=self.P)
def build_model_from_md(df, property_to_model, temperature=None,
                        pressure=None, output_ranges=None,
                        md_temperature=298.15, md_pressure=101.325):
    """
    creates new qspr models using md data

    Parameters
    ----------
    df : pandas DataFrame
        salt_log data from the genetic algorithm. Contains the headers
        'Salt Smiles' and 'MD Calculation'. Current support is only for
        cpt and density
    property_to_model : list of str, or str
        ['cpt'], ['density'] or ['cpt', 'density']. A bare string is
        treated as a one-element list
    temperature : array, optional
        temperature bounds on experimental data to add. Default
        298.1, 299 K
    pressure : array, optional
        pressure bounds on experimental data to add. Default
        101, 102 kPa
    output_ranges : array, optional
        property bounds on experimental data to add. Default
        200, 3000 (kg/m3 or kj/molK)
    md_temperature : float, optional
        temperature used to generate the md data. Default 298.15 K
    md_pressure : float, optional
        pressure used to generate the md data. Default 101.325 kPa

    Returns
    -------
    new_model : salty dev_model object
    new_MD_data_index : int
        number of MD rows; they are appended after the experimental rows

    Raises
    ------
    ValueError
        if property_to_model is not one of the supported combinations

    Summary
    -------
    Create 4 lists from df: cation/anion smiles, cpt, density
    Nans will be used for cation/anion name in the newmodel output
    """
    # Fresh lists per call instead of shared mutable defaults.
    if temperature is None:
        temperature = [298.1, 299]
    if pressure is None:
        pressure = [101, 102]
    if output_ranges is None:
        output_ranges = [[200, 3000]]
    if isinstance(property_to_model, str):
        property_to_model = [property_to_model]

    cpt_col = "Heat capacity at constant pressure, J/K/mol"
    density_col = "Specific density, kg/m<SUP>3</SUP>"
    if property_to_model == ["density"]:
        props, to_drop = [density_col], cpt_col
    elif property_to_model == ["cpt"]:
        props, to_drop = [cpt_col], density_col
    elif property_to_model == ["cpt", "density"]:
        props, to_drop = [cpt_col, density_col], None
    else:
        raise ValueError(
            "property_to_model must be ['cpt'], ['density'] or "
            "['cpt', 'density'], got {!r}".format(property_to_model))
    n_props = len(property_to_model)

    # Parse the MD log: the first decimal number of each calculation string
    # is taken as cpt, the second as density; the salt smiles is
    # "<cation>.<anion>". Iterating the column values directly keeps this
    # independent of the frame's index labels.
    number = re.compile(r"\d+\.\d+")
    cpt = []
    density = []
    cation_smi = []
    anion_smi = []
    for calculation, salt in zip(df["MD Calculation"], df["Salt Smiles"]):
        values = number.findall(calculation)
        cpt.append(float(values[0]))
        density.append(float(values[1]))
        ions = salt.split(".")
        cation_smi.append(ions[0])
        anion_smi.append(ions[1])

    # Descriptor names ship with the package; close the file when done.
    module_path = dirname(__file__)
    with open(join(module_path, 'data', 'Deslist'), 'r') as f:
        Deslist = [line.strip('\n\t') for line in f]
    calc = Calculator(Deslist)
    D = len(Deslist)
    n = df.shape[0]

    # Numeric block: cation descriptors, anion descriptors, T, P, cpt,
    # density. The four name/smiles columns are added to the frame below so
    # that strings are never written into a float column.
    X = np.zeros((n, 2 * D + 4))
    for i, (cat_smi, ani_smi) in enumerate(zip(cation_smi, anion_smi)):
        cation = Chem.MolFromSmiles(cat_smi)
        anion = Chem.MolFromSmiles(ani_smi)
        X[i, :D] = calc.CalcDescriptors(cation)
        X[i, D:2 * D] = calc.CalcDescriptors(anion)
    X[:, 2 * D] = md_temperature
    X[:, 2 * D + 1] = md_pressure
    X[:, 2 * D + 2] = cpt
    X[:, 2 * D + 3] = density

    cols_cat = [s + "-cation" for s in Deslist]
    cols_ani = [s + "-anion" for s in Deslist]
    # The heat capacity label must match the experimental column label
    # exactly (including the space after the comma); otherwise the MD
    # values end up in a separate column that is discarded below.
    numeric_cols = cols_cat + cols_ani + [
        "Temperature, K", "Pressure, kPa", cpt_col, density_col
    ]
    X = pd.DataFrame(X, columns=numeric_cols)
    X["name-anion"] = np.nan
    X["smiles-anion"] = anion_smi
    X["name-cation"] = np.nan
    X["smiles-cation"] = cation_smi

    # X is the df with the new simulation data
    new_MD_data_index = X.shape[0]

    # Experimental data, unscaled, followed by the MD rows.
    devmodel = salty.aggregate_data(property_to_model, T=temperature,
                                    P=pressure, data_ranges=output_ranges,
                                    scale_center=False)
    exp_cols = devmodel.Data.columns
    new_data = pd.concat([devmodel.Data, X])  # have to sort in future version
    if to_drop is not None:
        new_data.drop(columns=[to_drop], inplace=True)
    new_data = new_data[exp_cols]
    new_data.reset_index(inplace=True, drop=True)

    # Summary table: salt/ion counts plus the min-max of each reported
    # column.
    exp_data = props + ["Temperature, K", "Pressure, kPa"]
    unique_salts = new_data["smiles-cation"] + new_data["smiles-anion"]
    unique_cations = repr(new_data["smiles-cation"].unique())
    unique_anions = repr(new_data["smiles-anion"].unique())
    actual_data_ranges = [
        "{} - {}".format(str(new_data[col].min()), str(new_data[col].max()))
        for col in exp_data
    ]
    a = np.array([
        len(unique_salts.unique()), unique_cations, unique_anions,
        len(unique_salts)
    ])
    a = np.concatenate((a, actual_data_ranges))
    summary_index = [
        "Unique salts", "Cations", "Anions", "Total datapoints"
    ] + exp_data
    data_summary = pd.DataFrame(a, summary_index)

    # The last n_props numeric columns are treated as the modelled
    # properties: they are log-transformed, every other numeric column is
    # standardized.
    metaDf = new_data.select_dtypes(include=["object"])
    dataDf = new_data.select_dtypes(include=[np.number])
    num_cols = dataDf.columns.tolist()
    instance = StandardScaler()
    for i in range(1, n_props + 1):
        dataDf.iloc[:, -i] = dataDf.iloc[:, -i].apply(
            lambda x: log(float(x)))
    scaled_data = pd.DataFrame(
        instance.fit_transform(dataDf.iloc[:, :-n_props]),
        columns=num_cols[:-n_props])
    model_data = pd.concat(
        [scaled_data, dataDf.iloc[:, -n_props:], metaDf],
        axis=1)  # may have to sort in future version
    mean_std_of_coeffs = pd.DataFrame([instance.mean_, instance.scale_],
                                      columns=num_cols[:-n_props])
    new_model = salty.dev_model(mean_std_of_coeffs, data_summary, model_data)
    print(new_model.Data_summary)
    return new_model, new_MD_data_index
import gains as genetic
from gains.salt_generator import generate_solvent
import salty

# Properties used both for the experimental data query and for the search.
model_ID = ["cpt", "density"]

# select narrow state variable ranges;
# we will set MD simulation to 101 kPa and 298 K
T = [298.1, 298.16]
P = [101, 102]

# Columns 2 and 3 of the de-duplicated experimental frame are passed on as
# the hull.
data = salty.aggregate_data(model_ID, T=T, P=P)
merged = salty.merge_duplicates(data)
to_hull = merged.iloc[:, 2:4]

target = [800, 1200]
generate_solvent(
    target,
    model_ID,
    heavy_atom_limit=21,
    sim_bounds=[0, 1],
    hits=62,
    write_file=True,
    hull=to_hull,
)