def test_crystalgraph(self):
    """Graph conversion honours the cutoff and yields the expected keys."""
    converter = CrystalGraph(cutoff=4)
    graph = converter.convert(self.structures[0])
    self.assertEqual(converter.cutoff, 4)
    expected_keys = {"bond", "atom", "index1", "index2", "state"}
    self.assertSetEqual(expected_keys, set(graph))
    # A second converter with a wider cutoff is independent of the first.
    wide_converter = CrystalGraph(cutoff=6)
    self.assertEqual(wide_converter.cutoff, 6)
    wide_graph = wide_converter.convert(self.structures[0])
    self.assertListEqual(to_list(wide_graph["state"][0]), [0, 0])
    # Calling the converter object directly is equivalent to .convert().
    called_graph = converter(self.structures[0])
    np.testing.assert_almost_equal(graph["atom"], called_graph["atom"])
def test_crystalgraph(self):
    """Graph conversion honours the cutoff and yields the expected keys.

    Graph entries may be numpy arrays rather than plain lists, so the
    comparisons below go through ``list()`` / ``np.testing`` instead of
    calling ``assertListEqual`` on the raw values (the sibling version of
    this test does the same).
    """
    cg = CrystalGraph(cutoff=4)
    graph = cg.convert(self.structures[0])
    self.assertEqual(cg.cutoff, 4)
    keys = set(graph.keys())
    self.assertSetEqual({"bond", "atom", "index1", "index2", "state"}, keys)
    cg2 = CrystalGraph(cutoff=6)
    self.assertEqual(cg2.cutoff, 6)
    graph2 = cg2.convert(self.structures[0])
    # list() tolerates both a plain list and an ndarray state row.
    self.assertListEqual(list(graph2['state'][0]), [0, 0])
    graph3 = cg(self.structures[0])
    # Array-safe comparison: assertListEqual fails on ndarray inputs.
    np.testing.assert_almost_equal(graph['atom'], graph3['atom'])
def test_crystalgraph(self):
    """Default radius is 4, custom ``r`` is honoured, call == convert."""
    default_converter = CrystalGraph()
    graph = default_converter.convert(self.structures[0])
    self.assertEqual(default_converter.r, 4)
    self.assertSetEqual(
        {"distance", "node", "index1", "index2", "state"}, set(graph)
    )
    # A converter with a custom radius is independent of the default one.
    wide_converter = CrystalGraph(r=6)
    self.assertEqual(wide_converter.r, 6)
    wide_graph = wide_converter.convert(self.structures[0])
    self.assertListEqual(wide_graph['state'][0], [0, 0])
    # Calling the converter object directly is equivalent to .convert().
    direct_graph = default_converter(self.structures[0])
    self.assertListEqual(graph['node'], direct_graph['node'])
def test_check_dimension(self):
    """check_dimension rejects a graph whose bond-feature width does not
    match what the model expects."""
    # 20 Gaussian centres -> 20-dimensional bond features in the graph,
    # while the model below is built for 10-dimensional bond features.
    converter = CrystalGraph(
        bond_converter=GaussianDistance(np.linspace(0, 5, 20), 0.5)
    )
    structure = Structure(Lattice.cubic(3), ['Si'], [[0, 0, 0]])
    graph = converter.convert(structure)
    # NOTE(review): a CrystalGraph instance is passed as bond_converter
    # here, reproducing the original test exactly — presumably the model's
    # dimension check never invokes it; confirm against MEGNetModel.
    model = MEGNetModel(
        10, 2, nblocks=1, lr=1e-2,
        n1=4, n2=4, n3=4, npass=1, ntarget=1,
        graph_converter=CrystalGraph(bond_converter=converter),
    )
    with self.assertRaises(Exception) as context:
        model.check_dimension(graph)
    self.assertTrue('The data dimension for bond' in str(context.exception))
# Load all Materials Project structures, keyed by material id.
with open("mp.2019.04.01.json") as f:
    structure_data = {i["material_id"]: i["structure"] for i in json.load(f)}
print("All structures in mp.2019.04.01.json contain %d structures" % len(structure_data))

## Band gap data
with gzip.open("data_no_structs.json.gz", "rb") as f:
    bandgap_data = json.loads(f.read())

# Union of the mp-ids appearing under any fidelity: only these need graphs.
useful_ids = set.union(*[set(bandgap_data[i].keys()) for i in ALL_FIDELITIES ])  # mp ids that are used in training
print("Only %d structures are used" % len(useful_ids))
print("Calculating the graphs for all structures... this may take minutes.")
# Restrict to the useful ids, then convert each CIF string into a graph dict.
structure_data = {i: structure_data[i] for i in useful_ids}
structure_data = {
    i: crystal_graph.convert(Structure.from_str(j, fmt="cif"))
    for i, j in structure_data.items()
}

## Generate graphs with fidelity information
graphs = []
targets = []
material_ids = []
for fidelity_id, fidelity in enumerate(ALL_FIDELITIES):
    for mp_id in bandgap_data[fidelity]:
        # deepcopy: the same base graph is reused once per fidelity level.
        graph = deepcopy(structure_data[mp_id])
        # The fidelity information is included here by changing the state attributes
        # PBE: 0, GLLB-SC: 1, HSE: 2, SCAN: 3
        graph["state"] = [fidelity_id]
        # NOTE(review): graphs/targets/material_ids are initialised above but
        # never appended within this visible chunk — the loop body presumably
        # continues past this excerpt; confirm against the full script.
def test_get_flat_data(self):
    """get_flat_data flattens graphs + targets into six parallel lists."""
    converter = CrystalGraph(cutoff=4)
    graph_list = [converter.convert(structure) for structure in self.structures]
    flat = converter.get_flat_data(graph_list, [0.1, 0.2])
    # Two structures in, so each of the six components has length two.
    self.assertListEqual([len(component) for component in flat], [2] * 6)
def test_convert(self):
    """The converted graph's 'atom' entry holds each site's atomic number."""
    converter = CrystalGraph(cutoff=4)
    graph = converter.convert(self.structures[0])
    expected_numbers = [site.specie.Z for site in self.structures[0]]
    self.assertListEqual(graph['atom'], expected_numbers)
def train():
    """Run k-fold MEGNet training driven by command-line arguments.

    Reads either a precomputed graph file or raw structures (converting
    them to graphs), splits the data with KFold, optionally normalizes
    targets, then trains one model per requested fold — either from
    scratch or warm-started from a previous run's best checkpoint.
    Indices, graphs, and model checkpoints are written under
    ``output_path``.
    """
    # Parse args
    args = parse_args()
    radius = args.radius
    n_works = args.n_works
    warm_start = args.warm_start
    output_path = args.output_path
    graph_file = args.graph_file
    prop_col = args.property
    learning_rate = args.learning_rate
    embedding_file = args.embedding_file
    # Comma-separated fold ids to actually train (others are skipped below).
    k_folds = list(map(int, args.k_folds.split(",")))
    print("args is : {}".format(args))
    print("Local devices are : {}, \n\n Available gpus are : {}".format(
        device_lib.list_local_devices(),
        K.tensorflow_backend._get_available_gpus()))
    # prepare output path
    if not os.path.exists(output_path):
        os.makedirs(output_path, exist_ok=True)
    # Get a crystal graph with cutoff radius A.
    # NOTE(review): keyword is spelled "bond_convertor"/"graph_convertor"
    # throughout this function — presumably matching the installed megnet
    # version's API; confirm before renaming.
    cg = CrystalGraph(
        bond_convertor=GaussianDistance(np.linspace(0, radius + 1, 100), 0.5),
        cutoff=radius,
    )
    if graph_file is not None:
        # load graph data (already-converted graphs; None marks failures)
        with gzip.open(graph_file, "rb") as f:
            valid_graph_dict = pickle.load(f)
        idx_list = list(range(len(valid_graph_dict)))
        valid_idx_list = [
            idx for idx, graph in valid_graph_dict.items() if graph is not None
        ]
    else:
        # load structure data
        with gzip.open(args.input_file, "rb") as f:
            df = pd.DataFrame(pickle.load(f))[["structure", prop_col]]
        idx_list = list(range(len(df)))
        # load embedding data for transfer learning
        if embedding_file is not None:
            with open(embedding_file) as json_file:
                embedding_data = json.load(json_file)
        # Calculate and save valid graphs; structures that fail conversion
        # are recorded as None so the index space stays aligned with df.
        valid_idx_list = list()
        valid_graph_dict = dict()
        for idx in idx_list:
            try:
                graph = cg.convert(df["structure"].iloc[idx])
                if embedding_file is not None:
                    # Replace atomic numbers with pre-trained embeddings.
                    graph["atom"] = [embedding_data[i] for i in graph["atom"]]
                valid_graph_dict[idx] = {
                    "graph": graph,
                    "target": df[prop_col].iloc[idx],
                }
                valid_idx_list.append(idx)
            except RuntimeError:
                valid_graph_dict[idx] = None
        # Save graphs
        with gzip.open(os.path.join(output_path, "graphs.pkl.gzip"), "wb") as f:
            pickle.dump(valid_graph_dict, f)
    # Split data (fixed random_state so folds are reproducible across runs)
    kf = KFold(n_splits=args.cv, random_state=18012019, shuffle=True)
    for fold, (train_val_idx, test_idx) in enumerate(kf.split(idx_list)):
        print(fold)
        if fold not in k_folds:
            continue
        fold_output_path = os.path.join(output_path, "kfold_{}".format(fold))
        fold_model_path = os.path.join(fold_output_path, "model")
        if not os.path.exists(fold_model_path):
            os.makedirs(fold_model_path, exist_ok=True)
        # Carve a validation set (25%) out of the fold's train portion.
        train_idx, val_idx = train_test_split(train_val_idx,
                                              test_size=0.25,
                                              random_state=18012019,
                                              shuffle=True)
        # Calculate valid train/validation/test ids (drop failed conversions)
        # and save them so the exact split can be recovered later.
        valid_train_idx = sorted(list(set(train_idx) & (set(valid_idx_list))))
        valid_val_idx = sorted(list(set(val_idx) & (set(valid_idx_list))))
        valid_test_idx = sorted(list(set(test_idx) & (set(valid_idx_list))))
        np.save(os.path.join(fold_output_path, "train_idx.npy"), valid_train_idx)
        np.save(os.path.join(fold_output_path, "val_idx.npy"), valid_val_idx)
        np.save(os.path.join(fold_output_path, "test_idx.npy"), valid_test_idx)
        # Prepare training graphs
        train_graphs = [valid_graph_dict[i]["graph"] for i in valid_train_idx]
        train_targets = [
            valid_graph_dict[i]["target"] for i in valid_train_idx
        ]
        # Prepare validation graphs
        val_graphs = [valid_graph_dict[i]["graph"] for i in valid_val_idx]
        val_targets = [valid_graph_dict[i]["target"] for i in valid_val_idx]
        # Normalize targets or not; the scaler is fit on train only and
        # reused for validation (and passed to the trainer below).
        if args.normalize:
            y_scaler = StandardScaler()
            train_targets = y_scaler.fit_transform(
                np.array(train_targets).reshape(-1, 1)).ravel()
            val_targets = y_scaler.transform(
                np.array(val_targets).reshape((-1, 1))).ravel()
        else:
            y_scaler = None
        # Initialize model
        if warm_start is None:
            # Set up model from scratch with a default LR if none was given.
            if learning_rate is None:
                learning_rate = 1e-3
            model = MEGNetModel(
                100,
                2,
                nblocks=args.n_blocks,
                nvocal=95,
                npass=args.n_pass,
                lr=learning_rate,
                loss=args.loss,
                graph_convertor=cg,
                is_classification=True if args.type == "classification" else False,
                nfeat_node=None if embedding_file is None else 16,
            )
            initial_epoch = 0
        else:
            # Model file: collect checkpoints from the matching fold of the
            # warm-start run.
            model_list = [
                m_file for m_file in os.listdir(
                    os.path.join(warm_start, "kfold_{}".format(fold), "model"))
                if m_file.endswith(".hdf5")
            ]
            # Sort by the metric embedded in the filename so model_list[-1]
            # is the best checkpoint.
            # NOTE(review): reverse=False for classification (higher metric
            # last) vs reverse=True otherwise (lower error last) — presumably
            # intentional given the metric's direction; confirm the filename
            # convention ("..._<epoch>_<metric>.hdf5").
            if args.type == "classification":
                model_list.sort(
                    key=lambda m_file: float(
                        m_file.split("_")[3].replace(".hdf5", "")),
                    reverse=False,
                )
            else:
                model_list.sort(
                    key=lambda m_file: float(
                        m_file.split("_")[3].replace(".hdf5", "")),
                    reverse=True,
                )
            model_file = os.path.join(warm_start,
                                      "kfold_{}".format(fold),
                                      "model",
                                      model_list[-1])
            # Load model from file only to recover its learning rate when
            # the user did not supply one.
            if learning_rate is None:
                full_model = load_model(
                    model_file,
                    custom_objects={
                        "softplus2": softplus2,
                        "Set2Set": Set2Set,
                        "mean_squared_error_with_scale": mean_squared_error_with_scale,
                        "MEGNetLayer": MEGNetLayer,
                    },
                )
                learning_rate = K.get_value(full_model.optimizer.lr)
            # Set up model with the same architecture, then restore weights.
            model = MEGNetModel(
                100,
                2,
                nblocks=args.n_blocks,
                nvocal=95,
                npass=args.n_pass,
                lr=learning_rate,
                loss=args.loss,
                graph_convertor=cg,
                is_classification=True if args.type == "classification" else False,
                nfeat_node=None if embedding_file is None else 16,
            )
            model.load_weights(model_file)
            # Resume epoch counting from the checkpoint's epoch field.
            initial_epoch = int(model_list[-1].split("_")[2])
            print("warm start from : {}, \nlearning_rate is {}.".format(
                model_file, learning_rate))
        # Train
        model.train_from_graphs(
            train_graphs,
            train_targets,
            val_graphs,
            val_targets,
            batch_size=args.batch_size,
            epochs=args.max_epochs,
            verbose=2,
            initial_epoch=initial_epoch,
            use_multiprocessing=False if n_works <= 1 else True,
            workers=n_works,
            dirname=fold_model_path,
            y_scaler=y_scaler,
            save_best_only=args.save_best_only,
        )