def get_dataset(_log, dbpath, dataset, dataset_properties=None):
    """
    Get a dataset from the configuration.

    Args:
        _log: logger used for status messages
        dbpath (str): path to the local database
        dataset (str): name of the dataset
        dataset_properties (list): properties of the dataset

    Returns:
        AtomsData object
    """
    dataset = dataset.upper()
    _log.info('Load {} dataset'.format(dataset))
    if dataset == 'QM9':
        return QM9(dbpath, properties=dataset_properties)
    elif dataset == 'ISO17':
        return get_iso17(dataset_properties=dataset_properties)
    elif dataset == 'ANI1':
        return get_ani1(dataset_properties=dataset_properties)
    elif dataset == 'MD17':
        return get_md17(dataset_properties=dataset_properties)
    elif dataset == 'MATPROJ':
        return get_matproj(dataset_properties=dataset_properties)
    elif dataset == 'CUSTOM':
        file, extension = os.path.splitext(dbpath)
        if extension == '.db':
            return AtomsData(dbpath, required_properties=dataset_properties)
        else:
            generate_db(db_path=file + '.db', file_path=dbpath)
            return AtomsData(file + '.db', required_properties=dataset_properties)

def evaluate_schnet(models: List[Union[TorchMessage, torch.nn.Module, Path]],
                    molecules: List[str],
                    property_name: str,
                    batch_size: int = 64,
                    device: str = 'cpu') -> np.ndarray:
    """Run inference for a machine learning model

    Args:
        models: List of models to evaluate. Either a SchNet model or
            the bytes corresponding to a serialized model
        molecules: XYZ-format structures of the molecules to be evaluated
        property_name: Name of the property being predicted
        batch_size: Number of molecules to evaluate per batch
        device: Device on which to run the computation
    Returns:
        Array of predictions with one row per molecule and one column per model
    """

    # Make sure the models are converted to Torch models
    if isinstance(models[0], TorchMessage):
        models = [m.get_model(device) for m in models]
    elif isinstance(models[0], (Path, str)):
        models = [torch.load(m, map_location='cpu') for m in models]  # Load to main memory first

    # Make the dataset
    with TemporaryDirectory() as td:
        # Convert the molecules to ase.Atoms objects
        atoms = [next(read_xyz(StringIO(x), slice(None))) for x in molecules]

        # Save the data to an ASE Atoms database
        run_file = os.path.join(td, 'run_data.db')
        db = AtomsData(run_file, available_properties=[])
        db.add_systems(atoms, [{} for _ in atoms])

        # Build the data loader
        loader = AtomsLoader(db, batch_size=batch_size)

        # Run the models
        y_preds = []
        for model in models:
            y_pred = []
            model.to(device)  # Move the model to the device
            for batch in loader:
                # Push the batch to the device
                batch = {k: v.to(device) for k, v in batch.items()}

                # Run it and save the results
                pred = model(batch)
                y_pred.append(pred[property_name].detach().cpu().numpy())
            y_preds.append(np.squeeze(np.concatenate(y_pred)))

        return np.vstack(y_preds).T

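
# A minimal usage sketch for evaluate_schnet, assuming a serialized model file 'best_model'
# and two XYZ files on disk; the file names are hypothetical placeholders, and the property
# name must match whatever the model was trained to predict.
example_molecules = [open(p).read() for p in ('mol_0.xyz', 'mol_1.xyz')]
predictions = evaluate_schnet(['best_model'], example_molecules,
                              property_name='energy', batch_size=2, device='cpu')
print(predictions.shape)  # one row per molecule, one column per model
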
def get_dataset(dbpath, dataset, dataset_properties=None):
    """
    Get a dataset from the configuration.

    Args:
        dbpath (str): path to the local database
        dataset (str): name of the dataset
        dataset_properties (list): properties of the dataset

    Returns:
        AtomsData object
    """
    dataset = dataset.upper()
    if dataset == 'QM9':
        return QM9(dbpath, properties=dataset_properties)
    elif dataset == 'ISO17':
        return get_iso17(dataset_properties=dataset_properties)
    elif dataset == 'ANI1':
        return get_ani1(dataset_properties=dataset_properties)
    elif dataset == 'MD17':
        return get_md17(dataset_properties=dataset_properties)
    elif dataset == 'MATPROJ':
        return get_matproj(dataset_properties=dataset_properties)
    elif dataset == 'CUSTOM':
        return AtomsData(dbpath, required_properties=dataset_properties)
    else:
        raise NotImplementedError

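
# A minimal usage sketch for get_dataset, assuming SchNetPack's QM9 class is importable in
# this module (the function above already requires it) and that 'qm9.db' is a placeholder
# path where the QM9 database lives or should be created.
qm9_data = get_dataset('qm9.db', 'qm9', dataset_properties=[QM9.U0])
print('%i molecules available' % len(qm9_data))
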
def make_schnetpack_data(dataset, dbpath, properties, xyz_col='xyz', conformers=None,
                         overwrite=True):
    """Convert a Pandas DataFrame to a SchNet database

    Args:
        dataset (pd.DataFrame): Dataset to convert
        dbpath (string): Path to database to be saved
        properties ([string]): List of properties to include in the dataset
        conformers (str): Name of column with conformers as xyz
        xyz_col (string): Name of the column with the XYZ data
        overwrite (bool): Whether to overwrite the database
    Returns:
        AtomsData database holding the molecules and their properties
    """

    # If needed, delete the previous database
    if os.path.exists(dbpath) and overwrite:
        os.unlink(dbpath)

    # Convert all entries to ase.Atoms objects
    atoms = dataset[xyz_col].apply(lambda x: read_xyz(StringIO(x)).__next__())

    # Every column besides the XYZ data will be a property
    prop_cols = set(properties).difference([xyz_col])
    property_list = [
        dict(zip(prop_cols, [np.atleast_1d(row[p]) for p in prop_cols]))
        for i, row in dataset.iterrows()
    ]

    # Add conformers to the property list, but it isn't a required property when loading entries
    if conformers is not None:
        for d, c in zip(property_list, dataset[conformers]):
            d['conformers'] = np.atleast_1d(c)

    # Initialize the database
    db = AtomsData(dbpath, required_properties=properties, conformers=conformers is not None)

    # Add every system to the db object
    db.add_systems(atoms, property_list)
    return db

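
# A minimal usage sketch for make_schnetpack_data, assuming a tiny DataFrame with an 'xyz'
# column of XYZ strings and a scalar 'u0' column; the column names, values, and output path
# are illustrative only.
import pandas as pd

example = pd.DataFrame({
    'xyz': ['3\nwater\nO 0.000 0.000 0.117\nH 0.000 0.757 -0.471\nH 0.000 -0.757 -0.471\n'],
    'u0': [-76.4],
})
db = make_schnetpack_data(example, 'example.db', properties=['u0'])
print('%i entries written' % len(db))
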
def test_orca_parser(testdir, orca_log_path, target_orca_db_path):
    db_path = os.path.join(testdir, "test_orca_parser.db")

    all_properties = OrcaMainFileParser.properties + OrcaHessianFileParser.properties
    orca_parser = OrcaParser(db_path, properties=all_properties)
    orca_parser.file_extensions[Properties.hessian] = ".hess"
    orca_parser.parse_data([orca_log_path])

    db_target = AtomsData(target_orca_db_path)
    db_test = AtomsData(db_path)

    target_atoms, target_properties = db_target.get_properties(0)
    test_atoms, test_properties = db_test.get_properties(0)

    assert np.allclose(target_atoms.get_atomic_numbers(),
                       test_atoms.get_atomic_numbers())
    assert np.allclose(target_atoms.positions, test_atoms.positions)

    for p in target_properties:
        assert p in test_properties
        assert np.allclose(test_properties[p], target_properties[p])

    R2Score('stress', 'stress')
]

fid = open('results.txt', 'w')
header = ' Dataset Model MAE energy MAE forces MAE stress RMSE energy RMSE forces RMSE stress R2 energy R2 forces R2 stress\n'
header += '==================== ==================== =========== =========== =========== =========== =========== =========== =========== =========== ===========\n'
fid.write(header)
fid.flush()

for dataset_file in natsorted(os.listdir(datasets_path)):
    dataset = AtomsData(datasets_path + dataset_file,
                        load_only=properties,
                        environment_provider=OpenCLEnvironmentProvider(cutoff, 0),
                        centering_function=None)

    loader = AtomsLoader(dataset,
                         batch_size=20,
                         num_workers=1,
                         pin_memory=True)

    for model_file in natsorted(os.listdir(models_path)):
        model = load_model(models_path + model_file)

        # Disable graph creation, which is not needed since we are only evaluating
        model.output_modules[0].create_graph = False

                        type=int)

# Parse the arguments
args = arg_parser.parse_args()
run_params = args.__dict__

# Determine the output directory
test_dir = os.path.join('networks',
                        f'b{args.batch_size}_n{args.num_epochs}_S{args.random_seed}')
os.makedirs(test_dir, exist_ok=True)
with open(os.path.join(test_dir, 'config.json'), 'w') as fp:
    json.dump(run_params, fp)

# Load in the training database and downsample it
train_data = AtomsData('../datasets/train.db')
sampled_idx = np.random.RandomState(args.random_seed).randint(len(train_data),
                                                              size=(len(train_data),))
sampled_idx = [int(i) for i in sampled_idx]
train_data = create_subset(train_data, sampled_idx)

# Make the data loaders for use during training
train_loader = AtomsLoader(train_data, args.batch_size, shuffle=True)
test_data = AtomsData('../datasets/test.db')
test_loader = AtomsLoader(test_data, args.batch_size)
valid_data = AtomsData('../datasets/valid.db')
valid_loader = AtomsLoader(valid_data, args.batch_size)

# Make the model
model = torch.load('../best_model', map_location=args.device)
for module in model.modules():

def train_schnet(model: Union[TorchMessage, torch.nn.Module, Path],
                 database: Dict[str, float],
                 num_epochs: int,
                 reset_weights: bool = True,
                 property_name: str = 'output',
                 test_set: Optional[List[str]] = None,
                 device: str = 'cpu',
                 batch_size: int = 32,
                 validation_split: float = 0.1,
                 bootstrap: bool = False,
                 random_state: int = 1,
                 learning_rate: float = 1e-3,
                 patience: int = None,
                 timeout: float = None) \
        -> Union[Tuple[TorchMessage, pd.DataFrame],
                 Tuple[TorchMessage, pd.DataFrame, List[float]]]:
    """Train a SchNet model

    Args:
        model: Model to be retrained
        database: Mapping of XYZ-format structure to property
        num_epochs: Number of training epochs
        property_name: Name of the property being predicted
        reset_weights: Whether to re-initialize weights before training,
            or start training from the previous weights
        test_set: Hold-out set. If provided, the function also returns the
            predictions of the retrained model on these molecules
        device: Device (e.g., 'cuda', 'cpu') used for training
        batch_size: Batch size during training
        validation_split: Fraction of the training set to use for the validation loss
        bootstrap: Whether to take a bootstrap sample of the training set before training
        random_state: Random seed used for generating the validation set and bootstrap sampling
        learning_rate: Initial learning rate for the optimizer
        patience: Patience until the learning rate is lowered. Default: num_epochs / 8
        timeout: Maximum training time in seconds
    Returns:
        - model: Retrained model
        - history: Training history
        - test_pred: Predictions on ``test_set``, if provided
    """

    # Make sure the model is converted to a Torch model
    if isinstance(model, TorchMessage):
        model = model.get_model(device)
    elif isinstance(model, (Path, str)):
        model = torch.load(model, map_location='cpu')  # Load to main memory first

    # If desired, re-initialize weights
    if reset_weights:
        for module in model.modules():
            if hasattr(module, 'reset_parameters'):
                module.reset_parameters()

    # Separate the database into molecules and properties
    xyz, y = zip(*database.items())
    xyz = np.array(xyz)
    y = np.array(y)

    # Convert the XYZ strings to ase.Atoms objects
    atoms = np.array([next(read_xyz(StringIO(x), slice(None))) for x in xyz])

    # Make the training and validation splits
    rng = np.random.RandomState(random_state)
    train_split = rng.rand(len(xyz)) > validation_split
    train_X = atoms[train_split]
    train_y = y[train_split]
    valid_X = atoms[~train_split]
    valid_y = y[~train_split]

    # Perform a bootstrap sample of the training data
    if bootstrap:
        sample = rng.choice(len(train_X), size=(len(train_X),), replace=True)
        train_X = train_X[sample]
        train_y = train_y[sample]

    # Start the training process
    with TemporaryDirectory() as td:
        # Save the data to ASE Atoms databases
        train_file = os.path.join(td, 'train_data.db')
        db = AtomsData(train_file, available_properties=[property_name])
        db.add_systems(train_X, [{property_name: i} for i in train_y])
        train_loader = AtomsLoader(db, batch_size=batch_size, shuffle=True)

        valid_file = os.path.join(td, 'valid_data.db')
        db = AtomsData(valid_file, available_properties=[property_name])
        db.add_systems(valid_X, [{property_name: i} for i in valid_y])
        valid_loader = AtomsLoader(db, batch_size=batch_size)

        # Make the trainer
        opt = optim.Adam(model.parameters(), lr=learning_rate)

        # Build the loss and metrics for the property being trained
        loss = trn.build_mse_loss([property_name])
        metrics = [spk.metrics.MeanSquaredError(property_name)]
        if patience is None:
            patience = num_epochs // 8
        hooks = [
            trn.CSVHook(log_path=td, metrics=metrics),
            trn.ReduceLROnPlateauHook(opt, patience=patience, factor=0.8, min_lr=1e-6,
                                      stop_after_min=True)
        ]

        if timeout is not None:
            hooks.append(TimeoutHook(timeout))

        trainer = trn.Trainer(
            model_path=td,
            model=model,
            hooks=hooks,
            loss_fn=loss,
            optimizer=opt,
            train_loader=train_loader,
            validation_loader=valid_loader,
            checkpoint_interval=num_epochs + 1  # Turns off checkpointing
        )

        trainer.train(device, n_epochs=num_epochs)

        # Load in the best model
        model = torch.load(os.path.join(td, 'best_model'))

        # If desired, report the performance on a test set
        test_pred = None
        if test_set is not None:
            test_pred = evaluate_schnet([model], test_set,
                                        property_name=property_name,
                                        batch_size=batch_size, device=device)

        # Move the model off of the GPU to save memory
        if 'cuda' in device:
            model.to('cpu')

        # Load in the training results
        train_results = pd.read_csv(os.path.join(td, 'log.csv'))

        # Return the results
        if test_pred is None:
            return TorchMessage(model), train_results
        else:
            return TorchMessage(model), train_results, test_pred[:, 0].tolist()

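
# A minimal usage sketch for train_schnet, assuming an existing serialized SchNet model at
# 'starting_model' and a dict 'training_data' that maps XYZ strings to target values; both
# names are hypothetical placeholders for whatever the surrounding workflow provides.
model_msg, history = train_schnet('starting_model', training_data, num_epochs=8,
                                  property_name='energy', batch_size=4, device='cpu')
print(history.tail())  # per-epoch losses and metrics recorded by the CSVHook
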
import os

import numpy as np
from ase.io import read
from schnetpack import AtomsData

### CREATES AN "EMPTY" PYTORCH DATASET TO PARSE THE DATA INTO ##

## TO BE DEFINED BY USER - defines the location of the trajectory files and the database name
dirpath = 'INSERT_DIRECTORY_PATH_HERE'
dbname = "test.db"

## If a database of the same name already exists, it is removed, as it would otherwise cause this script to fail
if os.path.isfile(os.path.join(dirpath, dbname)):
    os.remove(os.path.join(dirpath, dbname))

## Creates the schnetpack database (spk_db) in the given directory and defines the properties we are interested in
spk_db = AtomsData(
    os.path.join(dirpath, dbname),
    available_properties=['energy', 'forces'])  # note that {name}.db must not previously exist in said directory

### APPENDING PROPERTIES TO THE DATABASE
## Parses the energy and forces of every image of every trajectory file in the given directory into the previously defined database
for root, dirs, files in os.walk(dirpath):
    for name in files:
        ## Skips the database itself so that only trajectory files are parsed
        if name == dbname:
            continue

        ## Defines the trajectory and requests all images
        trajectory = read(os.path.join(dirpath, name + "@:"))

        ## Extracts the energies and forces for said trajectory file and puts them in a list of dictionaries
        property_list = [{
            "energy": np.array([atoms.get_potential_energy()], dtype=np.float32),
            "forces": np.array(atoms.get_forces(), dtype=np.float32)
        } for atoms in trajectory]

class IterativeDatasetReduction():
    """Iteratively reduce a dataset: train a SchNet model on a small subset, evaluate it on
    the remaining images, and move a fraction of the failing images into the subset until
    only a few images remain outside the error limits."""

    def __init__(self,
                 dbpath,
                 properties,
                 n_atom_basis=128,
                 n_layers=2,
                 n_filters=128,
                 n_interactions=3,
                 cutoff=5.0,
                 n_gaussians=25,
                 environment_provider=AseEnvironmentProvider,
                 frac=0.05,
                 E_lim=0.025,
                 F_lim=0.2,
                 S_lim=0.005,
                 shm=True):
        ### SchNet settings ###
        self.n_atom_basis = n_atom_basis
        self.n_layers = n_layers
        self.n_filters = n_filters
        self.n_interactions = n_interactions
        self.cutoff = cutoff
        self.n_gaussians = n_gaussians
        #######################

        self.i = 0
        self.frac = frac
        self.E_lim = E_lim
        self.F_lim = F_lim
        self.S_lim = S_lim

        if shm:
            # Work on a copy of the database placed in shared memory
            dbcopy = '/dev/shm/' + uuid.uuid4().hex + '.db'
            shutil.copyfile(dbpath, dbcopy)
            self.dataset = AtomsData(dbcopy,
                                     load_only=properties,
                                     environment_provider=environment_provider(self.cutoff),
                                     centering_function=None)
        else:
            self.dataset = AtomsData(dbpath,
                                     load_only=properties,
                                     environment_provider=environment_provider(self.cutoff),
                                     centering_function=None)

        # Randomly split the indices into an initial reduced set and the remaining images
        self.idx_rem = np.arange(len(self.dataset))
        np.random.shuffle(self.idx_rem)
        I = np.arange(round(self.frac * len(self.idx_rem)))
        self.idx_red = self.idx_rem[I]
        self.idx_rem = np.delete(self.idx_rem, I)

    def evaluate_fn(self, batch, result, fid=None):
        with torch.no_grad():
            N = torch.sum(batch['_atom_mask'], 1)

            E_err = torch.abs(batch['energy'] - result['energy']).view(-1) / N
            F_err = torch.sum(torch.abs(batch['forces'] - result['forces']), (2, 1)) / N
            S_err = torch.mean(torch.abs(batch['stress'] - result['stress']), (2, 1))

            if fid is not None:
                for e, f, s in zip(E_err, F_err, S_err):
                    fid.write('%f,%f,%f\n' % (e, f, s))

            return ((E_err > self.E_lim).byte() +
                    (F_err > self.F_lim).byte() +
                    (S_err > self.S_lim).byte() > 0)

    def train(self,
              n_epochs,
              lr,
              loss_fn,
              batch_size,
              num_workers,
              device,
              patience=100,
              threshold_ratio=0.0001):
        self.i += 1

        reduced = self.dataset.create_subset(self.idx_red)

        num_val = round(0.10 * len(reduced))
        train, val, test = train_test_split(data=reduced,
                                            num_train=len(reduced) - num_val,
                                            num_val=num_val)

        train_loader = AtomsLoader(train,
                                   batch_size=round(batch_size),
                                   num_workers=num_workers,
                                   shuffle=True,
                                   pin_memory=True)
        val_loader = AtomsLoader(val,
                                 batch_size=round(batch_size / 2),
                                 num_workers=num_workers,
                                 pin_memory=True)

        representation = SchNet(n_atom_basis=self.n_atom_basis,
                                n_filters=self.n_filters,
                                n_interactions=self.n_interactions,
                                cutoff=self.cutoff,
                                n_gaussians=self.n_gaussians)

        output_modules = Atomwise(representation.n_atom_basis,
                                  n_layers=self.n_layers,
                                  property='energy',
                                  derivative='forces',
                                  stress='stress',
                                  negative_dr=True,
                                  create_graph=True)

        model = AtomisticModel(representation, output_modules)

        optimizer = Adam(model.parameters(), lr=lr)

        hooks = [
            CSVHook('log_%i' % self.i,
                    [
                        MeanAbsoluteError('energy', 'energy'),
                        MeanAbsoluteError('forces', 'forces', element_wise=True),
                        MeanAbsoluteError('stress', 'stress'),
                        R2Score('energy', 'energy'),
                        R2Score('forces', 'forces', element_wise=True),
                        R2Score('stress', 'stress')
                    ],
                    every_n_epochs=1)
        ]

        hooks.append(EarlyStoppingHook(patience, threshold_ratio))

        trainer = Trainer('output_%i/' % self.i,
                          model,
                          loss_fn,
                          optimizer,
                          train_loader,
                          val_loader,
                          hooks=hooks,
                          keep_n_checkpoints=1,
                          checkpoint_interval=n_epochs)

        print('Running training!')
        print('  Reduced images:    %i' % len(reduced))
        print('  Training images:   %i' % len(train))
        print('  Validation images: %i' % len(val))
        print('')

        trainer.train(device, n_epochs)

    def evaluate(self, batch_size, num_workers, device, log_remaining=True):
        model = load_model('output_%i/best_model' % self.i,
                           map_location=device)
        model.output_modules[0].create_graph = False

        remaining = self.dataset.create_subset(self.idx_rem)

        loader = AtomsLoader(remaining,
                             batch_size=round(batch_size / 2),
                             num_workers=num_workers,
                             pin_memory=True)

        print('Running evaluation!')

        if log_remaining:
            fid = open('log_%i/remaining.csv' % self.i, 'w')
            fid.write('Energy (eV),Force (eV/Å),Stress (eV/ų)\n')
        else:
            fid = None

        passfail = []
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            result = model(batch)
            passfail += self.evaluate_fn(batch, result, fid).tolist()

        if fid is not None:
            fid.close()

        I = np.where(passfail)[0]
        percentage = 100 * len(I) / len(self.idx_rem)

        if percentage > 5.0:
            # Move a random fraction of the failing images into the reduced set
            np.random.shuffle(I)
            J = I[0:round(self.frac * len(I))]
            self.idx_red = np.append(self.idx_red, self.idx_rem[J])
            self.idx_rem = np.delete(self.idx_rem, J)
        else:
            # Few enough failures: nothing is added in this iteration
            J = np.array([], dtype=int)

        print('  Failed images:             %i' % len(I))
        print('  Added images:              %i' % len(J))
        print('  Percentage of remaining:   %5.2f' % percentage)
        print('  Reduced/Remaining images:  %i/%i' % (len(self.idx_red), len(self.idx_rem)))
        print('')

        return percentage

    def reduce(self,
               n_epochs,
               lr,
               loss_fn,
               batch_size,
               num_workers,
               device,
               patience=100,
               threshold_ratio=0.0001,
               log_remaining=True):
        while True:
            self.train(n_epochs, lr, loss_fn, batch_size, num_workers, device,
                       patience, threshold_ratio)
            percentage = self.evaluate(batch_size, num_workers, device, log_remaining)

            # Stop once at most 5% of the remaining images fail the error limits
            if percentage <= 5.0:
                break

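
# A minimal usage sketch for IterativeDatasetReduction, assuming a database 'dataset.db' that
# provides 'energy', 'forces' and 'stress'; the path and the training hyperparameters are
# illustrative only.
from schnetpack.train.loss import build_mse_loss

loss_fn = build_mse_loss(['energy', 'forces', 'stress'])
reducer = IterativeDatasetReduction('dataset.db',
                                    properties=['energy', 'forces', 'stress'],
                                    frac=0.05,
                                    shm=False)
reducer.reduce(n_epochs=500, lr=1e-4, loss_fn=loss_fn,
               batch_size=16, num_workers=4, device='cuda')
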
def read_dataset(path, numberofgeoms, filename):
    atom_buffer = []
    property_buffer = []
    charge_buffer = []
    metadata = {}
    for geom in range(1, 1 + numberofgeoms):
        # Geometry and atom types
        xyz_file = open(path + "/xyz-files/%07d.xyz" % geom, "r").readlines()
        charge = int(xyz_file[1].split()[2])
        natom = int(xyz_file[0].split()[0])
        E = []
        R = np.zeros((natom, 3))
        for iatom in range(natom):
            E.append(xyz_file[iatom + 2].split()[0])
            for xyz in range(3):
                R[iatom][xyz] = float(xyz_file[iatom + 2].split()[1 + xyz]) / Bohr
        atoms = Atoms(E, R)

        # Properties
        prop_file = open(path + "/properties/%07d" % geom, "r").readlines()
        singlets = 0
        doublets = 0
        triplets = 0
        quartets = 0
        _energy = False
        energy = np.zeros((1))
        _soc = False
        soc = np.zeros((1))
        _force = False
        force = np.zeros((1))
        has_force = np.zeros((1))
        _dipole = False
        dipole = np.zeros((1))
        _nac = False
        nac = np.zeros((1))
        _dyson = False
        property_matrix = False
        dyson = np.zeros((1))
        property_list = []

        for line in prop_file:
            if line.startswith("Singlets"):
                singlets = int(line.split()[1])
            elif line.startswith("Doublets"):
                doublets = int(line.split()[1])
            elif line.startswith("Triplets"):
                triplets = int(line.split()[1])
            elif line.startswith("Quartets"):
                quartets = int(line.split()[1])
            elif line.startswith("Energy"):
                if int(line.split()[-1]) == 1:
                    _energy = True
                    property_list.append('energy')
            elif line.startswith("Dipole"):
                if int(line.split()[-1]) == 1:
                    _dipole = True
                    property_list.append('dipoles')
            elif line.startswith("SOC"):
                if int(line.split()[-1]) == 1:
                    _soc = True
                    property_list.append('socs')
            elif line.startswith("Grad"):
                if int(line.split()[-1]) == 1:
                    _force = True
                    property_list.append('forces')
                    property_list.append('has_forces')
            elif line.startswith("Given_grad"):
                has_force = []
                if int(line.split()[-1]) == 1:
                    _has_forces = True
                    has_force.append(1)
                    property_list.append('has_forces')
                else:
                    has_force.append(0)
                has_force = np.array(has_force)
            elif line.startswith("NAC"):
                if int(line.split()[-1]) == 1:
                    _nac = True
                    property_list.append('nacs')
            elif line.startswith('DYSON'):
                if int(line.split()[-1]) == 1:
                    _dyson = True
                    property_list.append('dyson')
            else:
                continue

        nmstates = singlets + 2 * doublets + 3 * triplets + 4 * quartets

        iline = -1
        for line in prop_file:
            iline += 1
            if line.startswith("! Energy"):
                n_energy = singlets + doublets + triplets + quartets
                energy = []
                eline = prop_file[iline + 1].split()
                for i in range(singlets):
                    energy.append(float(eline[i]))
                for i in range(singlets, singlets + doublets):
                    energy.append(float(eline[i]))
                for i in range(singlets + 2 * doublets, singlets + 2 * doublets + triplets):
                    energy.append(float(eline[i]))
                for i in range(singlets + 2 * doublets + 3 * triplets,
                               singlets + 2 * doublets + 3 * triplets + quartets):
                    energy.append(float(eline[i]))
                energy = np.array(energy)

            # The dipole is read in as mu(1,1), mu(1,2), mu(1,3), ...
            elif line.startswith("! Dipole"):
                n_dipole = int((singlets * (singlets + 1)) / 2
                               + (doublets * (doublets + 1)) / 2
                               + (triplets * (triplets + 1)) / 2
                               + (quartets * (quartets + 1)) / 2)
                dipole = np.zeros((n_dipole, 3))
                dline = prop_file[iline + 1].split()
                for i in range(n_dipole):
                    for xyz in range(3):
                        dipole[i][xyz] = float(dline[i + n_dipole * xyz])

            elif line.startswith("! SpinOrbitCoupling"):
                n_soc = int(line.split()[2])
                soc = []
                sline = prop_file[iline + 1].split()
                for i in range(n_soc):
                    soc.append(float(sline[i]))
                soc = np.array(soc)

            elif line.startswith("! Gradient"):
Gradient"): n_grad = int(line.split()[2]) force = np.zeros((singlets+triplets+doublets+quartets,natom,3)) index = -1 gline = prop_file[iline+1].split() for istate in range(singlets+doublets): for iatom in range(natom): for xyz in range(3): index+=1 force[istate][iatom][xyz] = -float(gline[index]) index+=(natom*3*doublets) for istate in range(singlets+doublets,singlets+doublets+triplets): for iatom in range(natom): for xyz in range(3): index+=1 force[istate][iatom][xyz] = -float(gline[index]) index+=(2*natom*3*triplets) for istate in range(singlets+doublets+triplets,singlets+doublets+triplets+quartets): for iatom in range(natom): for xyz in range(3): index+=1 force[istate][iatom][xyz] = -float(gline[index]) #nonadiabatic couplings are also defined as vectors elif line.startswith("! Nonadiabatic coupling"): n_nac = int(int(line.split()[3])/3/natom) #dimension: nstates(coupled), natoms,xyz(3) nac = np.zeros((n_nac,natom,3)) nacline = prop_file[iline+1].split() index=-1 for i in range(n_nac): for iatom in range(natom): for xyz in range(3): index+=1 nac[i][iatom][xyz] = float(nacline[index]) elif line.startswith('! Dyson'): n_dyson = int(line.split()[-1]) property_matrix = [] sline = prop_file[iline+1].split() for i in range(n_dyson): property_matrix.append(float(sline[i])) property_matrix=np.array(property_matrix) else: continue available_properties = { 'energy' : energy, 'socs' : soc, 'forces' : force, 'has_forces': has_force, 'nacs' : nac, 'dipoles' : dipole, 'dyson' : property_matrix } #Append list charge_buffer.append(charge) atom_buffer.append(atoms) property_buffer.append(available_properties) #get schnet format metadata['n_singlets'] = int(singlets) metadata['n_doublets'] = int(doublets) metadata['n_triplets'] = int(triplets) metadata['n_quartets'] = int(quartets) states = '' for singlet in range(singlets): states += 'S ' for dublet in range(2*doublets): states += 'D ' for triplet in range(3*triplets): states += 'T ' for quartet in range(4*quartets): states += 'Q ' metadata['states'] = states reference = 'QC' # TODO put your method here phasecorrected = False metadata['phasecorrected'] = phasecorrected metadata['ReferenceMethod'] = reference spk_data = AtomsData(filename,available_properties=property_list) spk_data.add_systems(atom_buffer,property_buffer) #get metadata spk_data.set_metadata(metadata)
# mse_loss = MeanSquaredError()
mse_loss = spk.train.loss.build_mse_loss

logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

# basic settings
model_dir = "psi4_model"  # directory that will be created for storing the model
os.makedirs(model_dir, exist_ok=True)
properties = ["energy", "forces"]  # properties used for training

# data preparation
logging.info("get dataset")
dataset = AtomsData(
    "psi4.db",
    available_properties=properties,
    # required_properties=properties,
    collect_triples=True)

train, val, test = spk.train_test_split(
    data=dataset,
    num_train=200,
    num_val=20,
    split_file=os.path.join(model_dir, "split.npz"),
)

train_loader = spk.AtomsLoader(train, batch_size=50)
val_loader = spk.AtomsLoader(val, batch_size=5)

# get statistics
# atomrefs = dataset.get_atomrefs(properties)
per_atom = dict(energy=True, forces=False)