def get_data(normalize_y, sample_size, *, radius=3, max_neighbor_num=10,
             training=0.9, test=0.1,
             data_dir="./Data/metal-alloy-db.v2/"):
    """Load a random sample of structures and split it into train/test sets.

    Reads ``00Total_DB.csv`` from *data_dir*, draws *sample_size* random
    rows, loads ``<DBname>.cif`` for every drawn row with
    ``IStructure.from_file`` and encodes each structure with
    ``structure_encoder``. The ``FormationEnergy`` column is the target.

    Args:
        normalize_y: If truthy, targets are standardised as
            ``(y - mean) / std`` using the mean and ``Series.std()`` of the
            drawn sample.
        sample_size: Number of rows passed to ``DataFrame.sample(n=...)``.
        radius: Forwarded unchanged to ``structure_encoder``.
        max_neighbor_num: Forwarded unchanged to ``structure_encoder``.
        training: Fraction of the sample used for the training split.
        test: Fraction of the sample used for the test split.
        data_dir: Directory holding ``00Total_DB.csv`` and the ``.cif`` files.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of numpy arrays. The
        ``y`` arrays have one column (shape ``(n, 1)``).

    Note:
        When ``normalize_y`` is set, the mean/std used are not returned, so
        callers cannot map predictions back (``y * std + mean``) from this
        function's output alone.
    """
    import os

    df = pd.read_csv(os.path.join(data_dir, "00Total_DB.csv"))
    # Random draw without a fixed seed: the rows (and therefore the split)
    # differ between calls.
    df = df.sample(n=sample_size)

    cif_paths = [
        os.path.join(data_dir, cif_name)
        for cif_name in df['DBname'] + ".cif"
    ]
    structures = [IStructure.from_file(path) for path in cif_paths]
    x_data = np.array([
        structure_encoder(structure, radius, max_neighbor_num)
        for structure in structures
    ])

    formation_energy = df['FormationEnergy']
    if normalize_y:
        targets = (
            (formation_energy - formation_energy.mean())
            / formation_energy.std()
        )
    else:
        targets = formation_energy
    # One target per row, as a column vector.
    y_data = np.array(targets).reshape(-1, 1)

    total = len(df)
    # int() truncates, so a few trailing rows may end up in neither split.
    n_train = int(float(total) * training)
    n_test = int(float(total) * test)

    x_train = x_data[:n_train]
    y_train = y_data[:n_train]
    x_test = x_data[n_train:n_train + n_test]
    y_test = y_data[n_train:n_train + n_test]
    return x_train, y_train, x_test, y_test
if __name__ == "__main__":
    # NOTE: every name bound in this block is a module-level global; the
    # names are kept as-is because later script code may read them.

    # Values handed to structure_encoder for every structure.
    radius, max_neighbor_num = 3, 10
    # Train / test fractions.
    training, test = 0.9, 0.1

    # Draw 500 random rows (use n=len(df) to shuffle the whole table instead).
    df = pd.read_csv("./Data/metal-alloy-db.v1/00Total_DB.csv").sample(n=500)

    # One .cif file per sampled row, named after its 'DBname' entry.
    cifs = "./Data/metal-alloy-db.v1/" + df['DBname'] + ".cif"
    structures = list(map(IStructure.from_file, cifs))
    encoded_structures = [
        structure_encoder(each, radius, max_neighbor_num)
        for each in structures
    ]

    # Stack the encodings and append a new axis at position 4.
    x_data = np.expand_dims(np.array(encoded_structures), axis=4)

    formation_energy = df['FormationEnergy']
    y_normalization = False
    if not y_normalization:
        # Raw targets, one single-element row per sample.
        y_data = [[energy] for energy in formation_energy]
    else:
        # Standardise the targets with the sample's own mean and std.
        mean = formation_energy.mean()
        std = formation_energy.std()
        norm_form_energy = (formation_energy - mean) / std

        def norm_back(val, mean, std):
            """Undo the standardisation: return ``val * std + mean``."""
            return val * std + mean

        y_data = [[energy] for energy in norm_form_energy]
''' formation_energy = df['FormationEnergy'] mean = formation_energy.mean() std = formation_energy.std() norm_form_energy = (df['FormationEnergy'] - mean) / std def norm_back(val, mean, std): return val * std + mean y_data = [[val] for val in norm_form_energy] y_data = np.array(y_data) print("Done") encoded_structures = [ structure_encoder(structure, radius, max_neighbor_num) for structure in structures ] x_atom_fea = [] x_nbr_fea_idx = [] x_nbr_fea = [] for i, each_structure in enumerate(encoded_structures): x_atom_fea.append(each_structure[0]) x_nbr_fea_idx.append(each_structure[1]) x_nbr_fea.append(each_structure[2]) x_atom_fea = np.array(x_atom_fea, dtype='float32') x_nbr_fea_idx = np.array(x_nbr_fea_idx, dtype='float32') x_nbr_fea = np.array(x_nbr_fea, dtype='float32') atom_fea = tf.placeholder(tf.float32, [None, None, 92])