Example #1
def get_dataset(_log, dbpath, dataset, dataset_properties=None):
    """
    Get a dataset from the configuration.

    Args:
        dbpath (str): path to the local database
        dataset (str): name of the dataset
        dataset_properties (list): properties of the dataset

    Returns:
        AtomsData object

    """
    dataset = dataset.upper()
    _log.info('Load {} dataset'.format(dataset))
    if dataset == 'QM9':
        return QM9(dbpath, properties=dataset_properties)
    elif dataset == 'ISO17':
        return get_iso17(dataset_properties=dataset_properties)
    elif dataset == 'ANI1':
        return get_ani1(dataset_properties=dataset_properties)
    elif dataset == 'MD17':
        return get_md17(dataset_properties=dataset_properties)
    elif dataset == 'MATPROJ':
        return get_matproj(dataset_properties=dataset_properties)
    elif dataset == 'CUSTOM':
        file, extension = os.path.splitext(dbpath)
        if extension == '.db':
            return AtomsData(dbpath, required_properties=dataset_properties)
        else:
            generate_db(db_path=file + '.db', file_path=dbpath)
            return AtomsData(file + '.db',
                             required_properties=dataset_properties)
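
A minimal usage sketch for the helper above, assuming it sits next to the SchNetPack dataset wrappers it dispatches to; the logger, path, and property names are placeholders.

import logging

# Hypothetical call: load QM9 from a local .db file with two target properties.
# Property names follow SchNetPack's QM9 naming and are illustrative only.
_log = logging.getLogger("dataset")
data = get_dataset(_log, dbpath="./qm9.db", dataset="qm9",
                   dataset_properties=["energy_U0", "gap"])
print(len(data))  # number of stored structures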
Example #2
    def __init__(self,
                 dbpath,
                 properties,
                 n_atom_basis=128,
                 n_layers=2,
                 n_filters=128,
                 n_interactions=3,
                 cutoff=5.0,
                 n_gaussians=25,
                 environment_provider=AseEnvironmentProvider,
                 frac=0.05,
                 E_lim=0.025,
                 F_lim=0.2,
                 S_lim=0.005,
                 shm=True):

        ### SchNet settings ###

        self.n_atom_basis = n_atom_basis
        self.n_layers = n_layers
        self.n_filters = n_filters
        self.n_interactions = n_interactions
        self.cutoff = cutoff
        self.n_gaussians = n_gaussians

        #######################

        self.i = 0
        self.frac = frac
        self.E_lim = E_lim
        self.F_lim = F_lim
        self.S_lim = S_lim

        if shm:
            dbcopy = '/dev/shm/' + uuid.uuid4().hex + '.db'
            shutil.copyfile(dbpath, dbcopy)

            self.dataset = AtomsData(dbcopy,
                                     load_only=properties,
                                     environment_provider=environment_provider(
                                         self.cutoff),
                                     centering_function=None)
        else:
            self.dataset = AtomsData(dbpath,
                                     load_only=properties,
                                     environment_provider=environment_provider(
                                         self.cutoff),
                                     centering_function=None)

        self.idx_rem = np.arange(len(self.dataset))
        np.random.shuffle(self.idx_rem)

        I = np.arange(round(self.frac * len(self.idx_rem)))

        self.idx_red = self.idx_rem[I]
        self.idx_rem = np.delete(self.idx_rem, I)
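
The last few lines above seed the initial "reduced" pool. A standalone sketch of the same index bookkeeping with made-up sizes, for illustration:

import numpy as np

# Hypothetical numbers: a database of 1000 structures with frac = 0.05.
frac = 0.05
idx_rem = np.arange(1000)
np.random.shuffle(idx_rem)

I = np.arange(round(frac * len(idx_rem)))  # positions 0..49 of the shuffled order
idx_red = idx_rem[I]                       # 50 structures seed the reduced set
idx_rem = np.delete(idx_rem, I)            # 950 structures remain as candidates
print(len(idx_red), len(idx_rem))          # -> 50 950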
Example #3
def evaluate_schnet(models: List[Union[TorchMessage, torch.nn.Module, Path]],
                    molecules: List[str],
                    property_name: str,
                    batch_size: int = 64,
                    device: str = 'cpu') -> np.ndarray:
    """Run inference for a machine learning model

    Args:
        models: List of models to evaluate. Either a SchNet model or
           the bytes corresponding to a serialized model
        molecules: XYZ-format structures of molecules to be evaluate
        property_name: Name of the property being predicted
        batch_size: Number of molecules to evaluate per batch
        device: Device on which to run the computation
    """

    # Make sure the models are converted to Torch models
    if isinstance(models[0], TorchMessage):
        models = [m.get_model(device) for m in models]
    elif isinstance(models[0], (Path, str)):
        models = [torch.load(m, map_location='cpu')
                  for m in models]  # Load to main memory first

    # Make the dataset
    with TemporaryDirectory() as td:
        # Convert the molecules to ase.Atoms objects
        atoms = [next(read_xyz(StringIO(x), slice(None))) for x in molecules]

        # Save the data to an ASE Atoms database
        run_file = os.path.join(td, 'run_data.db')
        db = AtomsData(run_file, available_properties=[])
        db.add_systems(atoms, [{} for _ in atoms])

        # Build the data loader
        loader = AtomsLoader(db, batch_size=batch_size)

        # Run the models
        y_preds = []
        for model in models:
            y_pred = []
            model.to(device)  # Move the model to the device
            for batch in loader:
                # Push the batch to the device
                batch = {k: v.to(device) for k, v in batch.items()}

                # Run it and save results
                pred = model(batch)
                y_pred.append(pred[property_name].detach().cpu().numpy())
            y_preds.append(np.squeeze(np.concatenate(y_pred)))

        return np.vstack(y_preds).T
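
A hedged usage sketch for evaluate_schnet, assuming one serialized SchNet model on disk; the file name, property key, and XYZ string are placeholders.

# Hypothetical input: one saved model evaluated on two copies of an H2 geometry.
xyz_h2 = """2
H2 molecule
H 0.000 0.000 0.000
H 0.000 0.000 0.740
"""
preds = evaluate_schnet(models=["best_model"],        # placeholder path
                        molecules=[xyz_h2, xyz_h2],
                        property_name="energy",       # placeholder output key
                        batch_size=2,
                        device="cpu")
print(preds.shape)  # (n_molecules, n_models) -> (2, 1)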
Example #4
def get_dataset(dbpath, dataset, dataset_properties=None):
    """
    Get a dataset from the configuration.

    Args:
        dbpath (str): path to the local database
        dataset (str): name of the dataset
        dataset_properties (list): properties of the dataset

    Returns:
        AtomsData object

    """
    dataset = dataset.upper()
    if dataset == 'QM9':
        return QM9(dbpath, properties=dataset_properties)
    elif dataset == 'ISO17':
        return get_iso17(dataset_properties=dataset_properties)
    elif dataset == 'ANI1':
        return get_ani1(dataset_properties=dataset_properties)
    elif dataset == 'MD17':
        return get_md17(dataset_properties=dataset_properties)
    elif dataset == 'MATPROJ':
        return get_matproj(dataset_properties=dataset_properties)
    elif dataset == 'CUSTOM':
        return AtomsData(dbpath, required_properties=dataset_properties)
    else:
        raise NotImplementedError
Example #5
def make_schnetpack_data(dataset,
                         dbpath,
                         properties,
                         xyz_col='xyz',
                         conformers=None,
                         overwrite=True):
    """Convert a Pandas dictionary to a SchNet database

    Args:
        dataset (pd.DataFrame): Dataset to convert
        dbpath (string): Path to database to be saved
        properties ([string]): List of properties to include in the dataset
        conformers (str): Name of column with conformers as xyz
        xyz_col (string): Name of the column with the XYZ data
        overwrite (True): Whether to overwrite the database
    """

    # If needed, delete the previous database
    if os.path.exists(dbpath) and overwrite:
        os.unlink(dbpath)

    # Convert all entries to ase.Atoms objects
    atoms = dataset[xyz_col].apply(lambda x: read_xyz(StringIO(x)).__next__())

    # Every requested property column, apart from the XYZ column, is stored as a property
    prop_cols = set(properties).difference([xyz_col])
    property_list = [
        dict(zip(prop_cols, [np.atleast_1d(row[p]) for p in prop_cols]))
        for i, row in dataset.iterrows()
    ]

    # Add conformers to the property list, but it isn't a required property when loading entries
    if conformers is not None:
        for d, c in zip(property_list, dataset[conformers]):
            d['conformers'] = np.atleast_1d(c)

    # Initialize the object
    db = AtomsData(dbpath,
                   required_properties=properties,
                   conformers=conformers is not None)

    # Add every system to the db object
    db.add_systems(atoms, property_list)
    return db
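
A small hedged usage sketch with a two-row DataFrame; the XYZ strings and the property column name are illustrative.

import pandas as pd

# Hypothetical frame: one XYZ column plus one scalar property per molecule.
xyz_h2 = "2\nH2\nH 0.0 0.0 0.0\nH 0.0 0.0 0.74\n"
df = pd.DataFrame({"xyz": [xyz_h2, xyz_h2],
                   "u0": [-1.17, -1.17]})        # placeholder property values
db = make_schnetpack_data(df, "demo.db", properties=["u0"])
print(len(db))  # 2 entries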
Example #6
def test_orca_parser(testdir, orca_log_path, target_orca_db_path):
    db_path = os.path.join(testdir, "test_orca_parser.db")

    all_properties = OrcaMainFileParser.properties + OrcaHessianFileParser.properties

    orca_parser = OrcaParser(db_path, properties=all_properties)
    orca_parser.file_extensions[Properties.hessian] = ".hess"
    orca_parser.parse_data([orca_log_path])

    db_target = AtomsData(target_orca_db_path)
    db_test = AtomsData(db_path)

    target_atoms, target_properties = db_target.get_properties(0)
    test_atoms, test_properties = db_test.get_properties(0)

    assert np.allclose(target_atoms.get_atomic_numbers(),
                       test_atoms.get_atomic_numbers())
    assert np.allclose(target_atoms.positions, test_atoms.positions)

    for p in target_properties:
        assert p in test_properties
        assert np.allclose(test_properties[p], target_properties[p])
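
The fixtures used above (testdir, orca_log_path, target_orca_db_path) are not part of this snippet; a hedged conftest.py sketch of what they might look like, with placeholder paths:

import os
import pytest

# Hypothetical fixtures; the real test data paths depend on the project layout.
@pytest.fixture
def testdir(tmpdir):
    return str(tmpdir)

@pytest.fixture
def orca_log_path():
    return os.path.join("tests", "data", "test_orca.log")        # placeholder

@pytest.fixture
def target_orca_db_path():
    return os.path.join("tests", "data", "test_orca_parser.db")  # placeholder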
Example #7
    R2Score('stress', 'stress')
]

fid = open('results.txt', 'w')

header = '             Dataset                Model  MAE energy  MAE forces  MAE stress RMSE energy RMSE forces RMSE stress   R2 energy   R2 forces   R2 stress\n'
header += '==================== ==================== =========== =========== =========== =========== =========== =========== =========== =========== ===========\n'

fid.write(header)
fid.flush()

for dataset_file in natsorted(os.listdir(datasets_path)):

    dataset = AtomsData(datasets_path + dataset_file,
                        load_only=properties,
                        environment_provider=OpenCLEnvironmentProvider(
                            cutoff, 0),
                        centering_function=None)

    loader = AtomsLoader(dataset,
                         batch_size=20,
                         num_workers=1,
                         pin_memory=True)

    for model_file in natsorted(os.listdir(models_path)):

        model = load_model(models_path + model_file)

        # Disable the creation of graph, which is not needed since we are only evaluating.
        model.output_modules[0].create_graph = False
Example #8
                            type=int)

    # Parse the arguments
    args = arg_parser.parse_args()
    run_params = args.__dict__

    # Determine the output directory
    test_dir = os.path.join(
        'networks',
        f'b{args.batch_size}_n{args.num_epochs}_S{args.random_seed}')
    os.makedirs(test_dir, exist_ok=True)
    with open(os.path.join(test_dir, 'config.json'), 'w') as fp:
        json.dump(run_params, fp)

    # Load in the training database and downsample it
    train_data = AtomsData('../datasets/train.db')
    sampled_idx = np.random.RandomState(args.random_seed).randint(
        len(train_data), size=(len(train_data), ))
    sampled_idx = [int(i) for i in sampled_idx]
    train_data = create_subset(train_data, sampled_idx)

    # Making the data loaders for use during training
    train_loader = AtomsLoader(train_data, args.batch_size, shuffle=True)
    test_data = AtomsData('../datasets/test.db')
    test_loader = AtomsLoader(test_data, args.batch_size)
    valid_data = AtomsData('../datasets/valid.db')
    valid_loader = AtomsLoader(valid_data, args.batch_size)

    # Make the model
    model = torch.load('../best_model', map_location=args.device)
    for module in model.modules():
Example #9
def train_schnet(
    model: Union[TorchMessage, torch.nn.Module, Path],
    database: Dict[str, float],
    num_epochs: int,
    reset_weights: bool = True,
    property_name: str = 'output',
    test_set: Optional[List[str]] = None,
    device: str = 'cpu',
    batch_size: int = 32,
    validation_split: float = 0.1,
    bootstrap: bool = False,
    random_state: int = 1,
    learning_rate: float = 1e-3,
    patience: int = None,
    timeout: float = None
) -> Union[Tuple[TorchMessage, pd.DataFrame], Tuple[TorchMessage, pd.DataFrame,
                                                    List[float]]]:
    """Train a SchNet model

    Args:
        model: Model to be retrained
        database: Mapping of XYZ format structure to property
        num_epochs: Number of training epochs
        property_name: Name of the property being predicted
        reset_weights: Whether to re-initialize weights before training rather than continuing from the previous weights
        test_set: Hold-out set. If provided, the function will also return the model's predictions on this set
        device: Device (e.g., 'cuda', 'cpu') used for training
        batch_size: Batch size during training
        validation_split: Fraction of the training set to use for the validation loss
        bootstrap: Whether to take a bootstrap sample of the training set before training
        random_state: Random seed used for generating validation set and bootstrap sampling
        learning_rate: Initial learning rate for optimizer
        patience: Patience until learning rate is lowered. Default: epochs / 8
        timeout: Maximum training time in seconds
    Returns:
        - model: Retrained model
        - history: Training history
        - test_pred: Predictions on ``test_set``, if provided
    """

    # Make sure the models are converted to Torch models
    if isinstance(model, TorchMessage):
        model = model.get_model(device)
    elif isinstance(model, (Path, str)):
        model = torch.load(model,
                           map_location='cpu')  # Load to main memory first

    # If desired, re-initialize weights
    if reset_weights:
        for module in model.modules():
            if hasattr(module, 'reset_parameters'):
                module.reset_parameters()

    # Separate the database into molecules and properties
    xyz, y = zip(*database.items())
    xyz = np.array(xyz)
    y = np.array(y)

    # Convert the xyz files to ase Atoms
    atoms = np.array([next(read_xyz(StringIO(x), slice(None))) for x in xyz])

    # Make the training and validation splits
    rng = np.random.RandomState(random_state)
    train_split = rng.rand(len(xyz)) > validation_split
    train_X = atoms[train_split]
    train_y = y[train_split]
    valid_X = atoms[~train_split]
    valid_y = y[~train_split]

    # Perform a bootstrap sample of the training data
    if bootstrap:
        sample = rng.choice(len(train_X), size=(len(train_X), ), replace=True)
        train_X = train_X[sample]
        train_y = train_y[sample]

    # Start the training process
    with TemporaryDirectory() as td:
        # Save the data to an ASE Atoms database
        train_file = os.path.join(td, 'train_data.db')
        db = AtomsData(train_file, available_properties=[property_name])
        db.add_systems(train_X, [{property_name: i} for i in train_y])
        train_loader = AtomsLoader(db, batch_size=batch_size, shuffle=True)

        valid_file = os.path.join(td, 'valid_data.db')
        db = AtomsData(valid_file, available_properties=[property_name])
        db.add_systems(valid_X, [{property_name: i} for i in valid_y])
        valid_loader = AtomsLoader(db, batch_size=batch_size)

        # Make the trainer
        opt = optim.Adam(model.parameters(), lr=learning_rate)

        loss = trn.build_mse_loss(['delta'])
        metrics = [spk.metrics.MeanSquaredError('delta')]
        if patience is None:
            patience = num_epochs // 8
        hooks = [
            trn.CSVHook(log_path=td, metrics=metrics),
            trn.ReduceLROnPlateauHook(opt,
                                      patience=patience,
                                      factor=0.8,
                                      min_lr=1e-6,
                                      stop_after_min=True)
        ]

        if timeout is not None:
            hooks.append(TimeoutHook(timeout))

        trainer = trn.Trainer(
            model_path=td,
            model=model,
            hooks=hooks,
            loss_fn=loss,
            optimizer=opt,
            train_loader=train_loader,
            validation_loader=valid_loader,
            checkpoint_interval=num_epochs + 1  # Turns off checkpointing
        )

        trainer.train(device, n_epochs=num_epochs)

        # Load in the best model
        model = torch.load(os.path.join(td, 'best_model'))

        # If desired, report the performance on a test set
        test_pred = None
        if test_set is not None:
            test_pred = evaluate_schnet([model],
                                        test_set,
                                        property_name=property_name,
                                        batch_size=batch_size,
                                        device=device)

        # Move the model off of the GPU to save memory
        if 'cuda' in device:
            model.to('cpu')

        # Load in the training results
        train_results = pd.read_csv(os.path.join(td, 'log.csv'))

        # Return the results
        if test_pred is None:
            return TorchMessage(model), train_results
        else:
            return TorchMessage(model), train_results, test_pred[:, 0].tolist()
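
A hedged driver sketch for the trainer above. The model path and XYZ strings are placeholders; note that the loss and metric inside train_schnet are built for a 'delta' key, so the property name is set to match.

# Hypothetical fine-tuning run on a tiny in-memory database.
xyz_h2 = "2\nH2\nH 0.0 0.0 0.0\nH 0.0 0.0 0.74\n"
xyz_h2_stretched = "2\nH2 stretched\nH 0.0 0.0 0.0\nH 0.0 0.0 0.90\n"
database = {xyz_h2: -1.17, xyz_h2_stretched: -1.10}  # XYZ string -> target value (placeholders)

message, history = train_schnet(model="best_model",   # placeholder path
                                database=database,
                                num_epochs=4,
                                property_name="delta",
                                batch_size=2,
                                device="cpu")
print(history.tail(1))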
Example #10
import os

import numpy as np
from ase.io import read
from schnetpack import AtomsData

### CREATES AN "EMPTY" PYTORCH DATASET TO PARSE THE DATA INTO

## TO BE DEFINED BY USER - defines file location for trajectory files and the database name
dirpath = 'INSERT_DIRECTORY_PATH_HERE'
dbname = "test.db"

## If a database of the same name already exists, it is removed as otherwise it will cause this script to fail
if os.path.isfile(os.path.join(dirpath, dbname)):
    os.remove(os.path.join(dirpath, dbname))

## Creates the schnetpack database (spk_db) in the given directory and defines the properties we are interested in
spk_db = AtomsData(
    os.path.join(dirpath, dbname), available_properties=[
        'energy', 'forces'
    ])  # note that {name}.db must not previously exist in said directory

### APPENDING PROPERTIES TO THE DATABASE

## Parses the energy and forces for every image of every trajectory file in the given directory into the previously defined database
for root, dirs, files in os.walk(dirpath):
    for name in files:
        ## Defines the trajectory and requests all images
        trajectory = read(os.path.join(dirpath, name + "@:"))

        ## Extracts the energies and forces for said trajectory file and puts them in a list of dictionaries
        property_list = [{
            "energy":
            np.array([atoms.get_potential_energy()], dtype=np.float32),
            "forces":
Example #11
class IterativeDatasetReduction():
    """docstring for IterativeDatasetReduction"""
    def __init__(self,
                 dbpath,
                 properties,
                 n_atom_basis=128,
                 n_layers=2,
                 n_filters=128,
                 n_interactions=3,
                 cutoff=5.0,
                 n_gaussians=25,
                 environment_provider=AseEnvironmentProvider,
                 frac=0.05,
                 E_lim=0.025,
                 F_lim=0.2,
                 S_lim=0.005,
                 shm=True):

        ### SchNet settings ###

        self.n_atom_basis = n_atom_basis
        self.n_layers = n_layers
        self.n_filters = n_filters
        self.n_interactions = n_interactions
        self.cutoff = cutoff
        self.n_gaussians = n_gaussians

        #######################

        self.i = 0
        self.frac = frac
        self.E_lim = E_lim
        self.F_lim = F_lim
        self.S_lim = S_lim

        if shm:
            dbcopy = '/dev/shm/' + uuid.uuid4().hex + '.db'
            shutil.copyfile(dbpath, dbcopy)

            self.dataset = AtomsData(dbcopy,
                                     load_only=properties,
                                     environment_provider=environment_provider(
                                         self.cutoff),
                                     centering_function=None)
        else:
            self.dataset = AtomsData(dbpath,
                                     load_only=properties,
                                     environment_provider=environment_provider(
                                         self.cutoff),
                                     centering_function=None)

        self.idx_rem = np.arange(len(self.dataset))
        np.random.shuffle(self.idx_rem)

        I = np.arange(round(self.frac * len(self.idx_rem)))

        self.idx_red = self.idx_rem[I]
        self.idx_rem = np.delete(self.idx_rem, I)

    def evaluate_fn(self, batch, result, fid=None):
        with torch.no_grad():

            N = torch.sum(batch['_atom_mask'], 1)

            E_err = torch.abs(batch['energy'] - result['energy']).view(-1) / N
            F_err = torch.sum(torch.abs(batch['forces'] - result['forces']),
                              (2, 1)) / N
            S_err = torch.mean(torch.abs(batch['stress'] - result['stress']),
                               (2, 1))

            if fid is not None:
                for e, f, s in zip(E_err, F_err, S_err):
                    fid.write('%f,%f,%f\n' % (e, f, s))

        return ((E_err > self.E_lim).byte() + (F_err > self.F_lim).byte() +
                (S_err > self.S_lim).byte() > 0)

    def train(self,
              n_epochs,
              lr,
              loss_fn,
              batch_size,
              num_workers,
              device,
              patience=100,
              threshold_ratio=0.0001):

        self.i += 1

        reduced = self.dataset.create_subset(self.idx_red)

        num_val = round(0.10 * len(reduced))
        train, val, test = train_test_split(data=reduced,
                                            num_train=len(reduced) - num_val,
                                            num_val=num_val)

        train_loader = AtomsLoader(train,
                                   batch_size=round(batch_size),
                                   num_workers=num_workers,
                                   shuffle=True,
                                   pin_memory=True)

        val_loader = AtomsLoader(val,
                                 batch_size=round(batch_size / 2),
                                 num_workers=num_workers,
                                 pin_memory=True)

        representation = SchNet(n_atom_basis=self.n_atom_basis,
                                n_filters=self.n_filters,
                                n_interactions=self.n_interactions,
                                cutoff=self.cutoff,
                                n_gaussians=self.n_gaussians)

        output_modules = Atomwise(representation.n_atom_basis,
                                  n_layers=self.n_layers,
                                  property='energy',
                                  derivative='forces',
                                  stress='stress',
                                  negative_dr=True,
                                  create_graph=True)

        model = AtomisticModel(representation, output_modules)

        optimizer = Adam(model.parameters(), lr=lr)

        hooks = [
            CSVHook('log_%i' % self.i, [
                MeanAbsoluteError('energy', 'energy'),
                MeanAbsoluteError('forces', 'forces', element_wise=True),
                MeanAbsoluteError('stress', 'stress'),
                R2Score('energy', 'energy'),
                R2Score('forces', 'forces', element_wise=True),
                R2Score('stress', 'stress')
            ],
                    every_n_epochs=1)
        ]

        hooks.append(EarlyStoppingHook(patience, threshold_ratio))

        trainer = Trainer('output_%i/' % self.i,
                          model,
                          loss_fn,
                          optimizer,
                          train_loader,
                          val_loader,
                          hooks=hooks,
                          keep_n_checkpoints=1,
                          checkpoint_interval=n_epochs)

        print('Running training!')
        print('    Reduced images: %i' % len(reduced))
        print('    Training images: %i' % len(train))
        print(' Validation images: %i' % len(val))
        print('')

        trainer.train(device, n_epochs)

    def evaluate(self, batch_size, num_workers, device, log_remaining=True):

        model = load_model('output_%i/best_model' % self.i,
                           map_location=device)
        model.output_modules[0].create_graph = False

        remaining = self.dataset.create_subset(self.idx_rem)

        loader = AtomsLoader(remaining,
                             batch_size=round(batch_size / 2),
                             num_workers=num_workers,
                             pin_memory=True)

        print('Running evaluation!')

        if log_remaining:
            fid = open('log_%i/remaining.csv' % self.i, 'w')
            fid.write('Energy (eV),Force (eV/Å),Stress (eV/ų)\n')
        else:
            fid = None

        passfail = []
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            result = model(batch)
            passfail += self.evaluate_fn(batch, result, fid).tolist()

        fid.close()

        I = np.where(passfail)[0]
        percentage = 100 * len(I) / len(self.idx_rem)

        if percentage > 5.0:
            np.random.shuffle(I)
            J = I[0:round(self.frac * len(I))]

            self.idx_red = np.append(self.idx_red, self.idx_rem[J])
            self.idx_rem = np.delete(self.idx_rem, J)
        else:
            J = np.array([], dtype=int)  # nothing passed the thresholds, so nothing is added this iteration

        print('            Failed images: %i' % len(I))
        print('             Added images: %i' % len(J))
        print('  Percentage of remaining: %5.2f' % percentage)
        print(' Reduced/Remaining images: %i/%i' %
              (len(self.idx_red), len(self.idx_rem)))
        print('')

    def reduce(self,
               n_epochs,
               lr,
               loss_fn,
               batch_size,
               num_workers,
               device,
               patience=100,
               threshold_ratio=0.0001,
               log_remaining=True):

        while True:
            self.train(n_epochs, lr, loss_fn, batch_size, num_workers, device)
            self.evaluate(batch_size, num_workers, device)
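
A hedged driver sketch for the class above; the database path, loss weights, and training hyperparameters are placeholders.

from schnetpack.train.loss import build_mse_loss

# Hypothetical reduction run: MSE loss over energy, forces, and stress with placeholder weights.
loss_fn = build_mse_loss(['energy', 'forces', 'stress'],
                         loss_tradeoff=[0.2, 0.6, 0.2])

reducer = IterativeDatasetReduction('bulk.db',                       # placeholder path
                                    properties=['energy', 'forces', 'stress'],
                                    frac=0.05,
                                    shm=False)
reducer.reduce(n_epochs=500, lr=1e-4, loss_fn=loss_fn,
               batch_size=16, num_workers=4, device='cuda')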
Example #12
def read_dataset(path,numberofgeoms,filename):

    atom_buffer = []
    property_buffer = []
    charge_buffer = []
    metadata = {}
    for geom in range(1,1+numberofgeoms):

        #Geometry and Atomtypes
        xyz_file = open(path+"/xyz-files/%07d.xyz"%geom,"r").readlines()
        charge = int(xyz_file[1].split()[2])
        natom = int(xyz_file[0].split()[0])
        E=[]
        R=np.zeros((natom,3))
        for iatom in range(natom):
            E.append(xyz_file[iatom+2].split()[0])
            for xyz in range(3):
               R[iatom][xyz] = float(xyz_file[iatom+2].split()[1+xyz])/Bohr
        atoms = Atoms(E,R)

        #Properties
        prop_file = open(path+"/properties/%07d"%geom,"r").readlines()
        singlets = 0
        doublets = 0
        triplets = 0
        quartets = 0
        _energy = False
        energy = np.zeros((1))
        _soc = False
        soc = np.zeros((1))
        _force = False
        force = np.zeros((1))
        _dipole = False
        dipole = np.zeros((1))
        _nac = False
        nac = np.zeros((1))
        _dyson = False
        property_matrix=False
        dyson = np.zeros((1))
        property_list=[]
        for line in prop_file:
            if line.startswith("Singlets"):
                singlets = int(line.split()[1])
            elif line.startswith("Doublets"):
                doublets = int(line.split()[1])
            elif line.startswith("Triplets"):
                triplets = int(line.split()[1])
            elif line.startswith("Quartets"):
                quartets = int(line.split()[1])
            elif line.startswith("Energy"):
                if int(line.split()[-1])==int(1):
                    _energy = True
                    property_list.append('energy')
            elif line.startswith("Dipole"):
                if int(line.split()[-1])==int(1):
                    _dipole = True
                    property_list.append('dipoles')
            elif line.startswith("SOC"):
                if int(line.split()[-1])==int(1):
                    _soc = True
                    property_list.append('socs')
            elif line.startswith("Grad"):
                if int(line.split()[-1])==int(1):
                    _force = True
                    property_list.append('forces')
                    property_list.append('has_forces')
            elif line.startswith("Given_grad"):
                has_force=[]
                if int(line.split()[-1])==int(1):
                    _has_forces = True
                    has_force.append(1)
                    property_list.append('has_forces')
                else:
                    has_force.append(0)
                has_force=np.array(has_force)
            elif line.startswith("NAC"):
                if int(line.split()[-1])==int(1):
                    _nac = True
                    property_list.append('nacs')
            elif line.startswith('DYSON'):
                if int(line.split()[-1])==int(1):
                    _dyson = True
                    property_list.append('dyson')
            else:
                continue
        nmstates = singlets + 2*doublets + 3*triplets + 4*quartets
        iline = -1
        for line in prop_file:
            iline+=1
            if line.startswith("! Energy"):
                n_energy = singlets + doublets + triplets + quartets
                #int(line.split()[2])
                energy = [] #np.zeros((n_energy))
                eline  = prop_file[iline+1].split()
                for i in range(singlets):
                    energy.append(float(eline[i]))
                for i in range(singlets,singlets+doublets):
                    energy.append(float(eline[i]))
                for i in range(singlets+2*doublets,singlets+2*doublets+triplets):
                    energy.append(float(eline[i]))
                for i in range(singlets+2*doublets+3*triplets,singlets+2*doublets+3*triplets+quartets):
                    energy.append(float(eline[i]))
                energy=np.array(energy)
            #dipole is read in as mu(1,1), mu(1,2), mu(1,3),...
            elif line.startswith("! Dipole"):
                n_dipole = int((singlets*(singlets+1))/2+(doublets*(doublets+1))/2+(triplets*(triplets+1))/2+(quartets*(quartets+1))/2)
                dipole = np.zeros((n_dipole,3))
                dline = prop_file[iline+1].split()
                for i in range(n_dipole):
                    for xyz in range(3):
                        dipole[i][xyz] = float(dline[i+n_dipole*xyz])
            elif line.startswith("! SpinOrbitCoupling"):
                n_soc = int(line.split()[2])
                soc = [] #np.zeros((n_soc))
                sline = prop_file[iline+1].split()
                for i in range(n_soc):
                     soc.append(float(sline[i]))
                soc=np.array(soc)
            elif line.startswith("! Gradient"):
                n_grad = int(line.split()[2])
                force = np.zeros((singlets+triplets+doublets+quartets,natom,3))
                index = -1
                gline = prop_file[iline+1].split()
                for istate in range(singlets+doublets):
                    for iatom in range(natom):
                        for xyz in range(3):
                            index+=1
                            force[istate][iatom][xyz] = -float(gline[index])
                index+=(natom*3*doublets)
                for istate in range(singlets+doublets,singlets+doublets+triplets):
                    for iatom in range(natom):
                        for xyz in range(3):
                            index+=1
                            force[istate][iatom][xyz] = -float(gline[index])
                index+=(2*natom*3*triplets)
                for istate in range(singlets+doublets+triplets,singlets+doublets+triplets+quartets):
                    for iatom in range(natom):
                        for xyz in range(3):
                            index+=1
                            force[istate][iatom][xyz] = -float(gline[index])
            #nonadiabatic couplings are also defined as vectors
            elif line.startswith("! Nonadiabatic coupling"):
                n_nac = int(int(line.split()[3])/3/natom)
                #dimension: nstates(coupled), natoms,xyz(3)
                nac = np.zeros((n_nac,natom,3))
                nacline = prop_file[iline+1].split()
                index=-1
                for i in range(n_nac):
                    for iatom in range(natom):
                        for xyz in range(3):
                            index+=1
                            nac[i][iatom][xyz] = float(nacline[index])
            elif line.startswith('! Dyson'):
                n_dyson = int(line.split()[-1])
                property_matrix = []
                sline = prop_file[iline+1].split()
                for i in range(n_dyson):
                    property_matrix.append(float(sline[i]))
                property_matrix=np.array(property_matrix)
            else:
                continue

        available_properties = { 'energy' : energy,
                        'socs'    : soc,
                        'forces'  : force,
                        'has_forces': has_force,
                        'nacs'    : nac,
                        'dipoles' : dipole,
                        'dyson'   : property_matrix }
        #Append list 
        charge_buffer.append(charge)
        atom_buffer.append(atoms)
        property_buffer.append(available_properties)
    #get schnet format
    metadata['n_singlets'] = int(singlets)
    metadata['n_doublets'] = int(doublets)
    metadata['n_triplets'] = int(triplets)
    metadata['n_quartets'] = int(quartets)
    states = ''
    for singlet in range(singlets):
      states += 'S '
    for dublet in range(2*doublets):
      states += 'D '
    for triplet in range(3*triplets):
      states += 'T '
    for quartet in range(4*quartets):
      states += 'Q '
    metadata['states'] = states
    reference = 'QC' # TODO put your method here
    phasecorrected = False
    metadata['phasecorrected'] = phasecorrected
    metadata['ReferenceMethod'] = reference
    spk_data = AtomsData(filename,available_properties=property_list)
    spk_data.add_systems(atom_buffer,property_buffer)
    #get metadata
    spk_data.set_metadata(metadata)
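
A hedged call sketch; the directory is expected to contain xyz-files/0000001.xyz, properties/0000001, and so on in the layout the parser above reads, and the names here are placeholders.

# Hypothetical call: parse 100 geometries and their properties into a new database.
read_dataset(path="training_set", numberofgeoms=100, filename="training_set.db")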
Example #13
#mse_loss = MeanSquaredError()
mse_loss = spk.train.loss.build_mse_loss

logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

# basic settings
model_dir = "psi4_model"  # directory that will be created for storing model
#os.makedirs(model_dir)
properties = ["energy", "forces"]  # properties used for training

# data preparation
logging.info("get dataset")
dataset = AtomsData(
    "psi4.db",
    available_properties=properties,
    #required_properties=properties,
    collect_triples=True)

train, val, test = spk.train_test_split(
    data=dataset,
    num_train=200,
    num_val=20,
    split_file=os.path.join(model_dir, "split.npz"),
)
train_loader = spk.AtomsLoader(train, batch_size=50)
val_loader = spk.AtomsLoader(val, batch_size=5)

# get statistics
#atomrefs = dataset.get_atomrefs(properties)
per_atom = dict(energy=True, forces=False)