Exemplo n.º 1
0
    def setUpClass(cls):
        """Create the shared fixtures: two constant-batch sequences and two small models.

        ``cls.train_gen_crystal`` / ``cls.train_gen_mol`` always return the same
        ``(inputs, targets)`` pair; ``cls.model`` and ``cls.model2`` differ only
        in ``ntarget`` (1 and 2).
        """
        cls.n_feature, cls.n_bond_features, cls.n_global_features = 3, 10, 2

        class Generator(Sequence):
            """Sequence of length 10 whose every item is the stored (x, y) pair."""

            def __init__(self, x, y):
                self.x, self.y = x, y

            def __len__(self):
                return 10

            def __getitem__(self, index):
                return self.x, self.y

        def random_block(rows, width):
            # One batch entry of normally distributed features.
            return np.random.normal(size=(1, rows, width))

        def index_arrays():
            # Fresh copies of the four integer index arrays used by both inputs.
            return [
                np.array([[0, 0, 1, 1, 2, 3]]),
                np.array([[1, 1, 0, 0, 3, 2]]),
                np.array([[0, 0, 1, 1]]),
                np.array([[0, 0, 0, 0, 1, 1]]),
            ]

        # Crystal-style input: the first entry is an integer array, not features.
        crystal_inputs = [
            np.array([1, 2, 3, 4]).reshape((1, -1)),
            random_block(6, cls.n_bond_features),
            random_block(2, cls.n_global_features),
        ] + index_arrays()
        crystal_targets = random_block(2, 1)
        cls.train_gen_crystal = Generator(crystal_inputs, crystal_targets)

        # Molecule-style input: the first entry is a random feature array.
        mol_inputs = [
            random_block(4, cls.n_feature),
            random_block(6, cls.n_bond_features),
            random_block(2, cls.n_global_features),
        ] + index_arrays()
        mol_targets = random_block(2, 1)
        cls.train_gen_mol = Generator(mol_inputs, mol_targets)

        def small_model(ntarget):
            # Each model gets its own converter instance, as in the original setup.
            converter = CrystalGraph(
                bond_converter=GaussianDistance(np.linspace(0, 5, 10), 0.5))
            return MEGNetModel(10, 2, nblocks=1, lr=1e-2,
                               n1=4, n2=4, n3=4, npass=1, ntarget=ntarget,
                               graph_converter=converter)

        cls.model = small_model(1)
        cls.model2 = small_model(2)
Exemplo n.º 2
0
 def setUpClass(cls):
     """Create an Na/Cl structure in space group Fm-3m and a minimal MEGNet model."""
     lattice = Lattice.cubic(5.69169)
     species = ['Na', 'Cl']
     frac_coords = [[0, 0, 0], [0, 0, 0.5]]
     cls.s = Structure.from_spacegroup('Fm-3m', lattice, species, frac_coords)

     # Deliberately tiny layer sizes and a single block/pass.
     layer_options = dict(nblocks=1, n1=4, n2=2, n3=2, npass=1)
     cls.dummy_model = MEGNetModel(100, 2, **layer_options)
Exemplo n.º 3
0
 def test_check_dimension(self):
     """check_dimension must raise for a graph whose bond features do not fit the model.

     The graph is built with 20 Gaussian centers while the model is created
     with 10 edge features, so the check is expected to fail.
     """
     gc = CrystalGraph(bond_converter=GaussianDistance(np.linspace(0, 5, 20), 0.5))
     s = Structure(Lattice.cubic(3), ['Si'], [[0, 0, 0]])
     graph = gc.convert(s)
     model = MEGNetModel(10, 2, nblocks=1, lr=1e-2,
                         n1=4, n2=4, n3=4, npass=1, ntarget=1,
                         graph_converter=CrystalGraph(bond_converter=gc),
                         )
     with self.assertRaises(Exception) as context:
         model.check_dimension(graph)
     # The message is checked only after the ``with`` block has captured the
     # exception. Inside the block this statement followed the raising call
     # and therefore never executed.
     self.assertIn('The data dimension for bond', str(context.exception))
Exemplo n.º 4
0
    def setUpClass(cls):
        """Create the shared fixtures: two endless batch generators and one small model."""
        cls.n_feature, cls.n_bond_features, cls.n_global_features = 3, 10, 2

        def generator(x, y):
            # Never exhausts: yields the same (x, y) pair on every iteration.
            while True:
                yield x, y

        def random_block(rows, width):
            # One batch entry of normally distributed features.
            return np.random.normal(size=(1, rows, width))

        def index_arrays():
            # Fresh copies of the four integer index arrays used by both inputs.
            return [
                np.array([[0, 0, 1, 1, 2, 3]]),
                np.array([[1, 1, 0, 0, 3, 2]]),
                np.array([[0, 0, 1, 1]]),
                np.array([[0, 0, 0, 0, 1, 1]]),
            ]

        # Crystal-style input: the first entry is an integer array, not features.
        crystal_inputs = [
            np.array([1, 2, 3, 4]).reshape((1, -1)),
            random_block(6, cls.n_bond_features),
            random_block(2, cls.n_global_features),
        ] + index_arrays()
        crystal_targets = random_block(2, 1)
        cls.train_gen_crystal = generator(crystal_inputs, crystal_targets)

        # Molecule-style input: the first entry is a random feature array.
        mol_inputs = [
            random_block(4, cls.n_feature),
            random_block(6, cls.n_bond_features),
            random_block(2, cls.n_global_features),
        ] + index_arrays()
        mol_targets = random_block(2, 1)
        cls.train_gen_mol = generator(mol_inputs, mol_targets)

        # This snippet uses the "convertor" keyword spelling; keep it as written.
        bond_expansion = GaussianDistance(np.linspace(0, 5, 10), 0.5)
        model_options = dict(
            nblocks=1,
            lr=1e-2,
            n1=4,
            n2=4,
            n3=4,
            npass=1,
            ntarget=1,
            graph_convertor=CrystalGraph(bond_convertor=bond_expansion),
        )
        cls.model = MEGNetModel(10, 2, **model_options)
Exemplo n.º 5
0
 def test_crystal_model_v2(self):
     """Train a tiny model on two copies of a Si cell and check the prediction shape."""
     silicon = Structure(Lattice.cubic(3), ['Si'], [[0, 0, 0]])
     converter = CrystalGraph()
     # nfeat_edge is None here; centers/width are passed to the model instead.
     hyper_params = dict(
         nfeat_edge=None,
         nfeat_global=2,
         nblocks=1,
         lr=1e-2,
         n1=4,
         n2=4,
         n3=4,
         npass=1,
         ntarget=1,
         graph_converter=converter,
         centers=np.linspace(0, 4, 10),
         width=0.5,
     )
     # Run inside a scratch directory so files written by training stay out of the tree.
     with ScratchDir('.'):
         trained = MEGNetModel(**hyper_params).train(
             [silicon, silicon], [0.1, 0.1], epochs=2)
         prediction = trained.predict_structure(silicon)
         self.assertTrue(prediction.shape == (1, ))
Exemplo n.º 6
0
def prepare_model_megnet(individuals, epochs, outfile, excl=None):
    """Train a MEGNet model on the individuals' total energies and save it.

    Args:
        individuals: iterable of objects providing ``get_init_structure()``
            (an object accepted by ``AseAtomsAdaptor.get_structure``) and an
            ``e_tot`` attribute used as the training target.
        epochs: number of epochs passed to ``MEGNetModel.train``.
        outfile: path passed to ``MEGNetModel.save_model``.
        excl: optional list of chemical symbols. An individual whose
            ``get_chemical_symbols()`` equals this list is left out of the
            training set. ``None`` or an empty list keeps every individual.
    """
    adapt = AseAtomsAdaptor()
    structures = []
    energies = []
    for ind in individuals:
        struct_ase = ind.get_init_structure()
        # Skip the excluded stoichiometry before doing the conversion work.
        if excl and struct_ase.get_chemical_symbols() == excl:
            continue
        structures.append(adapt.get_structure(struct_ase))
        energies.append(ind.e_tot)

    print(f"read data of {len(structures)} structures total")

    # standard values as taken from Megnet manual
    nfeat_bond = 100
    nfeat_global = 2
    r_cutoff = 5
    gaussian_centers = np.linspace(0, r_cutoff + 1, nfeat_bond)
    gaussian_width = 0.5
    distance_converter = GaussianDistance(gaussian_centers, gaussian_width)
    graph_converter = CrystalGraph(bond_converter=distance_converter, cutoff=r_cutoff)
    model = MEGNetModel(nfeat_bond, nfeat_global, graph_converter=graph_converter)

    # model training
    model.train(structures, energies, epochs=epochs)

    model.save_model(outfile)
Exemplo n.º 7
0
# Set GPU: expose only device "0" to this process through CUDA_VISIBLE_DEVICES.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# 2. Model construction
# Graph converter: bond distances are expanded on 100 Gaussian centers spanning
# 0..6 (width 0.5); the converter itself is given cutoff=5.0.
crystal_graph = CrystalGraph(bond_converter=GaussianDistance(
    centers=np.linspace(0, 6, 100), width=0.5),
                             cutoff=5.0)
# Model setup. nfeat_edge matches the 100 Gaussian centers above.
# nfeat_global is None while ngvocal/global_embedding_dim are set --
# presumably the global state is an integer fidelity label that gets
# embedded; TODO confirm against the MEGNetModel signature.
model = MEGNetModel(
    nfeat_edge=100,
    nfeat_global=None,
    ngvocal=len(TRAIN_FIDELITIES),
    global_embedding_dim=16,
    nblocks=3,
    nvocal=95,
    npass=2,
    graph_converter=crystal_graph,
    lr=1e-3,
)

# 3. Data loading and processing
# Load data

# Structure data for all Materials Project materials

# Fail early with an actionable message when the data file is missing from
# the current working directory.
if not os.path.isfile("mp.2019.04.01.json"):
    raise RuntimeError(
        "Please download the data first! Use runall.sh in this directory if needed."
    )
###### megnet example hyper-parameters
from megnet.models import MEGNetModel
from megnet.data.graph import GaussianDistance
from megnet.data.crystal import CrystalGraph
import numpy as np

# Number of bond features; also used as the number of Gaussian centers below.
nfeat_bond = 100
nfeat_global = 2
r_cutoff = 5
# The centers extend one unit beyond the cutoff (0 .. r_cutoff + 1).
gaussian_centers = np.linspace(0, r_cutoff + 1, nfeat_bond)
gaussian_width = 0.5
distance_converter = GaussianDistance(gaussian_centers, gaussian_width)
graph_converter = CrystalGraph(bond_converter=distance_converter,
                               cutoff=r_cutoff)
# The first positional argument (nfeat_bond) equals the number of centers.
model = MEGNetModel(nfeat_bond, nfeat_global, graph_converter=graph_converter)

#########################################


def cvt_fmt_graph(rows):
    structures = []
    props = []
    for row in rows:
        structures.append(
            pymatgen_io_ase.AseAtomsAdaptor.get_structure(row.toatoms()))
        props.append(row.data[predict_item] / 100)
        # props.append(abs(row.data[predict_item]/10))
    graphs_valid = []
    targets_valid = []
    structures_invalid = []
Exemplo n.º 9
0
            if abs(e) > cut_value:
                targets[it][i] = prdc
            # targets[i] = (model.predict_structure(structures[i]).ravel() + targets[i])/2
        logging.info('Data count: {dc}, std orig dft value: {std_orig}, std of model output: {std_model}'.format(
            dc=len(targets_lst), std_orig=np.std(targets_lst), std_model=np.std(prediction_lst)))
        logging.info('Data count: {dc}, Mean orig: {mean_orig}, Mean_model: {mean_model}'.format(
            dc=len(targets_lst), mean_orig=np.mean(targets_lst), mean_model=np.mean(prediction_lst)))
        f = open(dump_model_name + '_'+ it + '.txt', 'wb') # to store and analyze the error
        pickle.dump(error_lst, f)
        f.close()

# model = MEGNetModel(10, 2, nblocks=3, lr=1e-3,
#         n1=4, n2=4, n3=4, npass=1, ntarget=1,
#         graph_converter=CrystalGraph(bond_converter=GaussianDistance(np.linspace(0, 5, 10), 0.5)))

# Freshly initialised model with 10 edge features, matching the 10 Gaussian centers.
model = MEGNetModel(nfeat_edge=10, nfeat_global=2, graph_converter=CrystalGraph(bond_converter=GaussianDistance(np.linspace(0, 5, 10), 0.5)))
# Save the untrained model under a name derived from dump_model_name.
model.save_model(dump_model_name+'_1by1_init_randomly' + '.hdf5')
# NOTE(review): the letters look like keys of db_short_full_dict below -- confirm.
init_model_tag = 'EGPHS'

ep = 5000
# Early stopping on validation loss with patience 10, restoring the best weights.
callback = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Convert every test structure to model input once, using the model's own converter.
for s in test_structures:
    test_input.append(model.graph_converter.graph_to_input(model.graph_converter.convert(s)))

# Single-letter code -> dataset name.
db_short_full_dict = {'G': 'gllb-sc', 'H': 'hse', 'S': 'scan', 'P': 'pbe', 'E': 'E1'}

def construct_dataset_from_str(db_short_str):
    s = []
    t = []
    for i in range(len(db_short_str)):
Exemplo n.º 10
0
            e = (model.predict_structure(structures[i]).ravel() - targets[i])
            ME += e
            error_lst.append(e)
            if abs(e) > 0.5:
                targets[i] = model.predict_structure(structures[i]).ravel()
            # targets[i] = (model.predict_structure(structures[i]).ravel() + targets[i])/2
        ME /= sz
        f = open(str(sz) + 'txt', 'wb')
        pickle.dump(error_lst, f)
        f.close()
        # for i in range(idx, idx + sz):
        #     targets[i] += ME
        idx += sz

# Small single-block model; 10 edge features match the 10 Gaussian centers.
model = MEGNetModel(10, 2, nblocks=1, lr=1e-4,
        n1=4, n2=4, n3=4, npass=1, ntarget=1,
        graph_converter=CrystalGraph(bond_converter=GaussianDistance(np.linspace(0, 5, 10), 0.5)))


ep = 5000
# NOTE(review): `callback` is not passed to model.train in the code visible here -- confirm it is used elsewhere.
callback = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=50, restore_best_weights=True)

# Convert every test structure to model input once, using the model's own converter.
for s in test_structures:
    test_input.append(model.graph_converter.graph_to_input(model.graph_converter.convert(s)))

if training_mode == 0: # PBE -> HSE ... -> part EXP, one by one
    # Walk through structures/targets in consecutive slices whose lengths come
    # from data_size, training the same model on each slice in turn.
    idx = 0
    for i in range(len(data_size)):
        model.train(structures[idx:idx+data_size[i]], targets[idx:idx+data_size[i]], epochs=ep)
        idx += data_size[i]
        # Run the prediction helper after every slice.
        prediction(model)
Exemplo n.º 11
0
     return result

# === megnet start === #

from megnet.models import MEGNetModel
from megnet.data.graph import GaussianDistance
from megnet.data.crystal import CrystalGraph
from megnet.utils.preprocessing import StandardScaler

from megnet.callbacks import ReduceLRUponNan, ManualStop, XiaotongCB

import numpy as np

# Graph converter: 100 Gaussian centers on 0..5 (width 0.5), cutoff=4.
gc = CrystalGraph(bond_converter=GaussianDistance(
        np.linspace(0, 5, 100), 0.5), cutoff=4)
# Model trained with the custom loss `examine_loss`.
model = MEGNetModel(100, 2, graph_converter=gc, lr=1e-4, loss=examine_loss) # , metrics=[examine_loss])
INTENSIVE = False # U0 is an extensive quantity
# Fit the target scaler on the data and attach it to the model.
scaler = StandardScaler.from_training_data(structures, targets, is_intensive=INTENSIVE)
model.target_scaler = scaler

# callbacks = [ReduceLRUponNan(patience=500), ManualStop(), XiaotongCB()]

# change structures to megnet predictable structures
mp_strs = []

# Convert structures to graphs, then scale each target together with the
# number of entries under its graph's 'atom' key.
train_graphs, train_targets = model.get_all_graphs_targets(structures, targets)
train_nb_atoms = [len(i['atom']) for i in train_graphs]
train_targets = [model.target_scaler.transform(i, j) for i, j in zip(train_targets, train_nb_atoms)]


# NOTE(review): `s` is unused and `i` is not defined in the code visible here;
# this loop looks like the tail of a truncated snippet -- verify before reuse.
for s in structures:
    test_targets.append(t_exp[i])

# r = list(range(len(list(d['disordered_exp'].keys()))))
# for i in r:
#     s_exp_disordered[i].remove_oxidation_states()
#     test_structures.append(s_exp_disordered[i])
#     test_targets.append(t_exp_disordered[i])

# Load a previously saved model and inspect its first weight array.
model = MEGNetModel.from_file(old_model_name)
model.summary()
# presumably the first weight array is the atom embedding -- TODO confirm
embed = model.get_weights()[0]
print(model.get_weights()[0].shape)

# Fresh model with 10 edge features, matching the 10 Gaussian centers.
model_new = MEGNetModel(
    nfeat_edge=10,
    nfeat_global=2,
    graph_converter=CrystalGraph(
        bond_converter=GaussianDistance(np.linspace(0, 5, 10), 0.5)))
model_new.summary()
# model_new.set_weights(model.get_weights()[0:])
# prediction(model_new)

# Rebinds `model`: the model loaded from file above is no longer reachable
# through this name. This one uses CrystalGraphDisordered with 100 edge features.
model = MEGNetModel(
    nfeat_edge=100,
    nfeat_node=16,
    ngvocal=4,
    global_embedding_dim=16,
    graph_converter=CrystalGraphDisordered(
        bond_converter=GaussianDistance(np.linspace(0, 5, 100), 0.5)))
model.summary()
Exemplo n.º 13
0
def _build_megnet_model(args, graph_converter, learning_rate, embedding_file):
    """Create the MEGNetModel configuration shared by cold and warm starts."""
    return MEGNetModel(
        100,
        2,
        nblocks=args.n_blocks,
        nvocal=95,
        npass=args.n_pass,
        lr=learning_rate,
        loss=args.loss,
        graph_convertor=graph_converter,
        is_classification=args.type == "classification",
        # 16 node features are used only when an embedding file replaces the atom entries.
        nfeat_node=None if embedding_file is None else 16,
    )


def _select_checkpoint(model_dir, is_classification):
    """Return the name of the ``.hdf5`` file in *model_dir* to warm-start from."""
    checkpoints = [
        name for name in os.listdir(model_dir) if name.endswith(".hdf5")
    ]
    if not checkpoints:
        raise FileNotFoundError(
            "No .hdf5 checkpoint found in {}".format(model_dir))
    # The 4th "_"-separated field of the file name is parsed as a float score.
    # The last element after sorting is chosen: the highest score for
    # classification, the lowest otherwise.
    checkpoints.sort(
        key=lambda name: float(name.split("_")[3].replace(".hdf5", "")),
        reverse=not is_classification,
    )
    return checkpoints[-1]


def train():
    """Run k-fold MEGNet training driven by the command-line arguments.

    Steps, in order:
      1. Parse arguments with ``parse_args`` and create the output directory.
      2. Load pre-computed graphs from ``args.graph_file``, or convert the
         structures in ``args.input_file`` to graphs (optionally replacing
         atom entries with embeddings) and save them to ``graphs.pkl.gzip``.
      3. For every fold listed in ``args.k_folds``: split into train /
         validation / test indices, save the indices, optionally normalise
         the targets, build the model (cold start or warm start from the
         selected checkpoint) and call ``train_from_graphs``.

    Raises:
        FileNotFoundError: when a warm start is requested but the fold's
            model directory contains no ``.hdf5`` file.
    """
    # Parse args
    args = parse_args()
    radius = args.radius
    n_works = args.n_works
    warm_start = args.warm_start
    output_path = args.output_path
    graph_file = args.graph_file
    prop_col = args.property
    learning_rate = args.learning_rate
    embedding_file = args.embedding_file
    k_folds = [int(k) for k in args.k_folds.split(",")]
    is_classification = args.type == "classification"
    print("args is : {}".format(args))

    print("Local devices are : {}, \n\n Available gpus are : {}".format(
        device_lib.list_local_devices(),
        K.tensorflow_backend._get_available_gpus()))

    # prepare output path
    os.makedirs(output_path, exist_ok=True)

    # Get a crystal graph with cutoff radius A
    cg = CrystalGraph(
        bond_convertor=GaussianDistance(np.linspace(0, radius + 1, 100), 0.5),
        cutoff=radius,
    )

    # NOTE: pickle.load executes arbitrary code from the file; only point
    # graph_file / input_file at trusted data.
    if graph_file is not None:
        # load graph data
        with gzip.open(graph_file, "rb") as f:
            valid_graph_dict = pickle.load(f)
        idx_list = list(range(len(valid_graph_dict)))
        valid_idx_list = [
            idx for idx, graph in valid_graph_dict.items() if graph is not None
        ]
    else:
        # load structure data
        with gzip.open(args.input_file, "rb") as f:
            df = pd.DataFrame(pickle.load(f))[["structure", prop_col]]
        idx_list = list(range(len(df)))

        # load embedding data for transfer learning
        if embedding_file is not None:
            with open(embedding_file) as json_file:
                embedding_data = json.load(json_file)

        # Calculate and save valid graphs
        valid_idx_list = []
        valid_graph_dict = {}
        for idx in idx_list:
            try:
                graph = cg.convert(df["structure"].iloc[idx])
                if embedding_file is not None:
                    graph["atom"] = [embedding_data[i] for i in graph["atom"]]
                valid_graph_dict[idx] = {
                    "graph": graph,
                    "target": df[prop_col].iloc[idx],
                }
                valid_idx_list.append(idx)
            except RuntimeError:
                # Conversion failed: keep the slot so the index stays aligned.
                valid_graph_dict[idx] = None

        # Save graphs
        with gzip.open(os.path.join(output_path, "graphs.pkl.gzip"),
                       "wb") as f:
            pickle.dump(valid_graph_dict, f)

    # Built once; every fold intersects its index sets with it.
    valid_idx_set = set(valid_idx_list)

    # Split data
    kf = KFold(n_splits=args.cv, random_state=18012019, shuffle=True)
    for fold, (train_val_idx, test_idx) in enumerate(kf.split(idx_list)):
        print(fold)
        if fold not in k_folds:
            continue
        fold_output_path = os.path.join(output_path, "kfold_{}".format(fold))
        fold_model_path = os.path.join(fold_output_path, "model")
        os.makedirs(fold_model_path, exist_ok=True)

        train_idx, val_idx = train_test_split(train_val_idx,
                                              test_size=0.25,
                                              random_state=18012019,
                                              shuffle=True)

        # Calculate valid train validation test ids and save it
        valid_train_idx = sorted(set(train_idx) & valid_idx_set)
        valid_val_idx = sorted(set(val_idx) & valid_idx_set)
        valid_test_idx = sorted(set(test_idx) & valid_idx_set)
        np.save(os.path.join(fold_output_path, "train_idx.npy"),
                valid_train_idx)
        np.save(os.path.join(fold_output_path, "val_idx.npy"), valid_val_idx)
        np.save(os.path.join(fold_output_path, "test_idx.npy"), valid_test_idx)

        # Prepare training graphs
        train_graphs = [valid_graph_dict[i]["graph"] for i in valid_train_idx]
        train_targets = [
            valid_graph_dict[i]["target"] for i in valid_train_idx
        ]

        # Prepare validation graphs
        val_graphs = [valid_graph_dict[i]["graph"] for i in valid_val_idx]
        val_targets = [valid_graph_dict[i]["target"] for i in valid_val_idx]

        # Normalize targets or not
        if args.normalize:
            y_scaler = StandardScaler()
            train_targets = y_scaler.fit_transform(
                np.array(train_targets).reshape(-1, 1)).ravel()
            val_targets = y_scaler.transform(
                np.array(val_targets).reshape((-1, 1))).ravel()
        else:
            y_scaler = None

        # Resolve the learning rate per fold. Reusing the outer variable would
        # let a rate read from one fold's checkpoint leak into later folds.
        fold_lr = learning_rate

        # Initialize model
        if warm_start is None:
            if fold_lr is None:
                fold_lr = 1e-3
            model = _build_megnet_model(args, cg, fold_lr, embedding_file)
            initial_epoch = 0
        else:
            warm_model_dir = os.path.join(warm_start,
                                          "kfold_{}".format(fold), "model")
            checkpoint_name = _select_checkpoint(warm_model_dir,
                                                 is_classification)
            model_file = os.path.join(warm_model_dir, checkpoint_name)

            # No rate given on the command line: take it from the checkpoint's optimizer.
            if fold_lr is None:
                full_model = load_model(
                    model_file,
                    custom_objects={
                        "softplus2": softplus2,
                        "Set2Set": Set2Set,
                        "mean_squared_error_with_scale":
                        mean_squared_error_with_scale,
                        "MEGNetLayer": MEGNetLayer,
                    },
                )
                fold_lr = K.get_value(full_model.optimizer.lr)

            model = _build_megnet_model(args, cg, fold_lr, embedding_file)
            model.load_weights(model_file)
            # The 3rd "_"-separated field of the file name is the epoch to resume from.
            initial_epoch = int(checkpoint_name.split("_")[2])
            print("warm start from : {}, \nlearning_rate is {}.".format(
                model_file, fold_lr))

        # Train
        model.train_from_graphs(
            train_graphs,
            train_targets,
            val_graphs,
            val_targets,
            batch_size=args.batch_size,
            epochs=args.max_epochs,
            verbose=2,
            initial_epoch=initial_epoch,
            use_multiprocessing=n_works > 1,
            workers=n_works,
            dirname=fold_model_path,
            y_scaler=y_scaler,
            save_best_only=args.save_best_only,
        )
Exemplo n.º 14
0
def main() -> None:
    """Parse the command line, load model and data, then train and/or evaluate."""
    # --which and --epochs are mandatory only when training was requested.
    training_requested = "--train" in sys.argv

    cli = ArgumentParser()
    cli.add_argument(
        "--train",
        dest="do_train",
        action="store_true",
        help="Whether to train the model.",
    )
    cli.add_argument(
        "--eval",
        dest="do_eval",
        action="store_true",
        help="Whether to evaluate the model.",
    )
    cli.add_argument(
        "--which",
        dest="which",
        choices=["MEGNet", "VGP", "ProbNN"],
        required=training_requested,
        help=(
            "Which components to train: "
            "MEGNet -- Just the MEGNetModel; "
            "VGP -- Just the VGP part of the ProbNN; "
            "ProbNN -- The whole ProbNN."
        ),
    )
    cli.add_argument(
        "--epochs",
        "-n",
        dest="epochs",
        type=int,
        required=training_requested,
        help="Number of training epochs.",
    )
    cli.add_argument(
        "--inducing",
        "-i",
        dest="num_inducing",
        type=int,
        default=500,
        help="Number of inducing index points.",
    )
    options = cli.parse_args()

    do_train: bool = options.do_train
    do_eval: bool = options.do_eval
    which_model: str = options.which
    epochs: int = options.epochs
    num_inducing: int = options.num_inducing

    # Use the saved MEGNetModel when present, otherwise build a default one.
    try:
        meg_model: MEGNetModel = MEGNetModel.from_file(str(MEGNET_MODEL_DIR))
    except FileNotFoundError:
        meg_model = MEGNetModel(**default_megnet_config())

    # Load the data and split it by position: first part trains, the rest tests.
    df = download_data(PHONONS_URL, PHONONS_SAVE_DIR)
    structures, targets = df["structure"], df["last phdos peak"]
    num_data = len(structures)
    print(f"{num_data} datapoints loaded.")

    num_training = floor(num_data * TRAINING_RATIO)
    print(f"{num_training} training data, {num_data-num_training} test data.")
    train_structs, test_structs = structures[:num_training], structures[num_training:]
    train_targets, test_targets = targets[:num_training], targets[num_training:]

    if which_model == "MEGNet":
        if do_train:
            tf_callback = TensorBoard(MEGNET_LOGS / NOW, write_graph=False)
            meg_model.train(
                train_structs,
                train_targets,
                test_structs,
                test_targets,
                automatic_correction=False,
                dirname="meg_checkpoints",
                epochs=epochs,
                callbacks=[tf_callback],
                verbose=VERBOSITY,
            )
            meg_model.save_model(str(MEGNET_MODEL_DIR))
        if do_eval:
            train_mae = MAE(
                meg_model.predict_structures(train_structs).flatten(),
                None,
                train_targets,
            )
            metric_logger.info("MEGNet train MAE = %f", train_mae)

            test_mae = MAE(
                meg_model.predict_structures(test_structs).flatten(),
                None,
                test_targets,
            )
            metric_logger.info("MEGNet test MAE = %f", test_mae)
        return

    # Every other choice (including no --which at all) works on the ProbNN.
    try:
        prob_model: MEGNetProbModel = MEGNetProbModel.load(PROB_MODEL_DIR)
    except FileNotFoundError:
        prob_model = MEGNetProbModel(meg_model, num_inducing, metrics=["MAE"])

    if do_train:
        if which_model == "VGP":
            # Freeze the NN first, then unfreeze the VGP and Norm parts.
            prob_model.set_frozen("NN", recompile=False)
            prob_model.set_frozen(["VGP", "Norm"], freeze=False)
            log_root = VGP_LOGS
        else:
            prob_model.set_frozen(["VGP", "NN", "Norm"], freeze=False)
            log_root = FULL_MODEL_LOGS
        tf_callback = TensorBoard(log_root / NOW, write_graph=False)
        prob_model.train(
            train_structs,
            train_targets,
            epochs,
            test_structs,
            test_targets,
            callbacks=[tf_callback],
            verbose=VERBOSITY,
        )
        prob_model.save(PROB_MODEL_DIR)

    if do_eval:
        train_metrics = evaluate_uq_metrics(prob_model, train_structs, train_targets)
        log_metrics(train_metrics, "training")
        test_metrics = evaluate_uq_metrics(prob_model, test_structs, test_targets)
        log_metrics(test_metrics, "test")
Exemplo n.º 15
0
def megnet_input(prop, ZeroVals, bond, nfeat_global, cutoff, width, *fraction):
    """
    megnet_input(prop, ZeroVals, bond, nfeat_global, cutoff, width, *fraction)

    Extracts valid structures and targets and splits them into user specified
    datasets.

    Inputs:
    prop-                   Optical property of interest. The data is read
                            from the pickle file "<prop>_data.pkl".
    ZeroVals-               Exclude/Include zero optical property values.
                            ``False`` excludes them.
    bond-                   MEGNet feature bond (number of Gaussian centers).
    nfeat_global-           MEGNet feature global.
    cutoff-                 MEGNet radial cutoff (upper end of the centers).
    width-                  MEGNet gaussian width.
    *fraction-              Fraction of data to split into training and
                            validation sets. Passing an extra argument to
                            split data based on quantity is permissible.

    Outputs (all tuples start with model, activation inputs, valid
    structures, valid targets):
    - exactly one fraction pair summing to 1:
        (..., Xpool, ypool, Xtest, ytest)
    - exactly one fraction pair summing to less than 1:
        (..., Xpool, ypool, Xtest, ytest, Xtrain, ytrain, Xval, yval)
    - exactly one fraction pair summing to more than 1: None
    - zero or more than one fraction argument:
        the four leading items only, structures and targets as arrays.
    """
    logging.info("Get graph inputs to MEGNet ...")
    print("Bond features = ", bond)
    print("Global features = ", nfeat_global)
    print("Radial cutoff = ", cutoff)
    print("Gaussian width = ", width)
    gaussian_centers = np.linspace(0, cutoff, bond)
    distance_converter = GaussianDistance(gaussian_centers, width)
    graph_converter = CrystalGraph(bond_converter=distance_converter)
    model = MEGNetModel(bond, nfeat_global, graph_converter=graph_converter)

    datafile = "%s_data.pkl" % prop
    inputs = pd.read_pickle(datafile)
    print("\nNumber of input entries found for %s data = %s" %
          (prop, len(inputs)))
    # Equality (not truthiness) is kept so that non-bool flags such as None
    # continue to select the "include" branch.
    if ZeroVals == False:  # noqa: E712
        logging.info(
            "Excluding zero optical property values from the dataset ...")
        # dtype=int keeps the index array valid for np.delete even when no
        # zero values exist; an empty float array is rejected as an index.
        mask = np.array(
            [i for i, val in enumerate(inputs[prop]) if abs(val) == 0.],
            dtype=int)
        structures = np.delete(inputs["structure"].to_numpy(), mask)
        targets = np.delete(inputs[prop].to_numpy(), mask)
        print("Remaining number of entries = %s" % len(targets))
    else:
        logging.info("Zero optical property values will be included ...")
        structures = inputs["structure"].to_numpy()
        targets = inputs[prop].to_numpy()

    # Get the valid structures and targets i.e exclude isolated atoms
    logging.info("Obtaining valid structures and targets ...")
    valid_structures = []
    valid_targets = []
    activations_input_full = []
    for s, t in zip(structures, targets):
        try:
            activations_input_full.append(
                StructureGraph.get_input(graph_converter, s))
        except Exception:
            # Any conversion failure is treated as an invalid structure;
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print("Skipping structure with isolated atom ...")
            continue
        valid_structures.append(s)
        valid_targets.append(t)
    print("Number of invalid structures = %s" %
          (len(targets) - len(valid_targets)))
    print("\nTotal number of entries available for analysis = %s" %
          len(valid_targets))

    # No split requested (or more than one extra argument): return the
    # validated data only. Checked before touching fraction[0] so that a call
    # without any fraction does not fail.
    if len(fraction) != 1:
        return (model, activations_input_full, np.array(valid_structures),
                np.array(valid_targets))

    pool_frac = fraction[0][0]
    second_frac = fraction[0][1]
    total_frac = pool_frac + second_frac
    if not total_frac <= 1.:
        # Fractions summing to more than 1 are not handled; as before, the
        # function yields None in that case.
        return None

    # Data split is based on percentages
    pool_boundary = int(len(valid_targets) * pool_frac)
    Xpool = np.array(valid_structures[0:pool_boundary])
    ypool = np.array(valid_targets[0:pool_boundary])
    Xtest = np.array(valid_structures[pool_boundary:])
    ytest = np.array(valid_targets[pool_boundary:])

    if total_frac == 1.:
        # For train-test split and k-fold cross-validation
        test_frac = second_frac

        logging.info("The pool is the same as the training set ...")
        print("Requested pool: %s%%" % (pool_frac * 100))
        print("Requested test set: %s%%" % (test_frac * 100))

        logging.info("The pool is the same as the training set ...")
        print("Pool:", ypool.shape)
        print("Test set:", ytest.shape)
        return (model, activations_input_full, valid_structures,
                valid_targets, Xpool, ypool, Xtest, ytest)

    # For repeat active learning
    val_frac = second_frac
    val_boundary = int(pool_boundary * val_frac)
    # Split with an explicit index: slicing with -0 would put the whole pool
    # into the validation set when val_boundary is 0.
    split_at = len(ypool) - val_boundary
    Xtrain = Xpool[:split_at]
    ytrain = ypool[:split_at]

    Xval = Xpool[split_at:]
    yval = ypool[split_at:]
    print("Requested validation set: %s%% of pool" % (val_frac * 100))
    print("Training set:", ytrain.shape)
    print("Validation set:", yval.shape)
    print("Test set:", ytest.shape)
    return (model, activations_input_full, valid_structures,
            valid_targets, Xpool, ypool, Xtest, ytest, Xtrain, ytrain,
            Xval, yval)
           for i in qm9_ids]  # We are training U0 herea

# Positional split: the first 80 entries train, everything after them tests.
train_structures = structures[:80]
test_structures = structures[80:]
train_targets = targets[:80]
test_targets = targets[80:]

from megnet.models import MEGNetModel
from megnet.data.graph import GaussianDistance
from megnet.data.crystal import CrystalGraph
from megnet.utils.preprocessing import StandardScaler
import numpy as np

# Graph converter: 100 Gaussian centers on 0..5 (width 0.5), cutoff=4.
gc = CrystalGraph(bond_converter=GaussianDistance(np.linspace(0, 5, 100), 0.5),
                  cutoff=4)
# 100 edge features match the 100 Gaussian centers above.
model = MEGNetModel(100, 2, graph_converter=gc, lr=1e-3)

INTENSIVE = False  # U0 is an extensive quantity
# Fit the target scaler on the training split only and attach it to the model.
scaler = StandardScaler.from_training_data(train_structures,
                                           train_targets,
                                           is_intensive=INTENSIVE)
model.target_scaler = scaler

model.train(train_structures, train_targets, epochs=500, verbose=2)

# Predict each test structure on its own; keep the first element of the
# flattened prediction.
predicted_tests = []
for i in test_structures:
    predicted_tests.append(model.predict_structure(i).ravel()[0])

print(type(test_targets), type(predicted_tests))