def test_atomic_element(openff_methane: Molecule):
    """Check the encoding ``AtomicElement(["H", "C"])`` yields for methane.

    The feature must report a length of two, and applying it to
    ``openff_methane`` must give a ``(5, 2)`` array whose first row is
    ``[0, 1]`` and whose remaining rows are all ``[1, 0]``.
    """
    element_feature = AtomicElement(["H", "C"])
    assert len(element_feature) == 2

    one_hot = element_feature(openff_methane).numpy()
    assert one_hot.shape == (5, 2)

    # Work column-wise: column 0 / 1 follow the order of the element list
    # handed to the feature above.
    column_0 = one_hot[:, 0]
    column_1 = one_hot[:, 1]

    # Rows 1..4 must be encoded as [1, 0] ...
    assert numpy.allclose(column_0[1:], 1.0)
    assert numpy.allclose(column_1[1:], 0.0)

    # ... while row 0 must be encoded as [0, 1].
    assert numpy.isclose(column_0[0], 0.0)
    assert numpy.isclose(column_1[0], 1.0)
def test_setup(self, tmpdir, mock_data_store):
    """Check ``setup`` populates only the training data when only a training
    set path is supplied.

    A data module is built without validation / test paths and with caching
    disabled, then ``prepare_data`` and ``setup`` are run. Afterwards the
    first training dataset must be a ``DGLMoleculeDataset`` and both the
    validation and test data must be ``None``.
    """
    cache_path = os.path.join(tmpdir, "tmp.pkl")

    module = DGLMoleculeDataModule(
        atom_features=[AtomicElement(["Cl", "H"])],
        bond_features=[BondOrder()],
        partial_charge_method="am1bcc",
        bond_order_method="am1",
        train_set_path=mock_data_store,
        train_batch_size=None,
        output_path=cache_path,
        use_cached_data=False,
    )

    module.prepare_data()
    module.setup()

    first_train_set = module._train_data.datasets[0]
    assert isinstance(first_train_set, DGLMoleculeDataset)

    # No validation / test paths were given, so neither split should exist.
    assert module._val_data is None
    assert module._test_data is None
def test_prepare_cache(self, use_cached_data, expected_raises, tmpdir, mock_data_store):
    """Check how ``prepare_data`` reacts to an already existing output file.

    A pickle holding ``(None, None, None)`` is written to the module's output
    path before ``prepare_data`` is called. ``expected_raises`` is used as the
    context manager wrapping that call, so it decides whether an exception is
    expected for the given ``use_cached_data`` value (presumably both are
    supplied by a parametrize decorator outside this view).
    """
    cache_path = os.path.join(tmpdir, "tmp.pkl")

    module = DGLMoleculeDataModule(
        atom_features=[AtomicElement(["Cl", "H"])],
        bond_features=[BondOrder()],
        partial_charge_method="am1bcc",
        bond_order_method="am1",
        train_set_path=mock_data_store,
        train_batch_size=None,
        output_path=cache_path,
        use_cached_data=use_cached_data,
    )

    # Pre-populate the location the module will write to with a placeholder.
    placeholder = (None, None, None)

    with open(module._output_path, "wb") as cache_file:
        pickle.dump(placeholder, cache_file)

    with expected_raises:
        module.prepare_data()
def mock_data_module(self) -> DGLMoleculeDataModule:
    """Build a ``DGLMoleculeDataModule`` that points at placeholder files.

    The module is configured with element (``C``, ``H``, ``Cl``) and formal
    charge (``0``, ``1``) atom features, a bond order bond feature, the
    ``"am1bcc"`` / ``"am1"`` charge and bond order methods, and train / val /
    test batch sizes of 1 / 2 / 3. Cached data use is enabled and
    ``TestDGLMoleculeDataModule.mock_molecule_to_dgl`` is passed as the
    ``molecule_to_dgl`` argument.

    Returns:
        The configured data module.
    """
    atom_features = [AtomicElement(["C", "H", "Cl"]), AtomFormalCharge([0, 1])]
    bond_features = [BondOrder()]
    graph_builder = TestDGLMoleculeDataModule.mock_molecule_to_dgl

    return DGLMoleculeDataModule(
        atom_features=atom_features,
        bond_features=bond_features,
        partial_charge_method="am1bcc",
        bond_order_method="am1",
        train_set_path="train.sqlite",
        train_batch_size=1,
        val_set_path="val.sqlite",
        val_batch_size=2,
        test_set_path="test.sqlite",
        test_batch_size=3,
        output_path="tmp.pkl",
        use_cached_data=True,
        molecule_to_dgl=graph_builder,
    )
def test_prepare(self, tmpdir, mock_data_store):
    """Check ``prepare_data`` writes a pickle of featurized datasets.

    The train, validation and test paths all point at ``mock_data_store``.
    After ``prepare_data`` the output file must exist, and every object
    unpickled from it must be a ``ConcatDataset`` whose first member reports
    ``n_features == 2``.
    """
    cache_path = os.path.join(tmpdir, "tmp.pkl")

    module = DGLMoleculeDataModule(
        atom_features=[AtomicElement(["Cl", "H"])],
        bond_features=[BondOrder()],
        partial_charge_method="am1bcc",
        bond_order_method="am1",
        train_set_path=mock_data_store,
        train_batch_size=None,
        val_set_path=mock_data_store,
        test_set_path=mock_data_store,
        output_path=cache_path,
    )
    module.prepare_data()

    assert os.path.isfile(module._output_path)

    with open(module._output_path, "rb") as cache_file:
        stored_sets = pickle.load(cache_file)

    # Type check every stored dataset first ...
    for stored in stored_sets:
        assert isinstance(stored, ConcatDataset)

    # ... then confirm the feature count of each one's first member.
    for stored in stored_sets:
        assert stored.datasets[0].n_features == 2
def main():
    """Train and test a small ``am1-charges`` model on a few SMILES patterns.

    The script prints the torch seed, featurizes four training molecules and
    one test molecule with element / connectivity atom features and a bond
    order bond feature, builds a ``SAGEConv`` based
    ``DGLMoleculeLightningModel`` with a single ``"am1-charges"`` readout, and
    then runs ``fit`` followed by ``test`` with a PyTorch Lightning trainer.
    """
    print(torch.seed())

    # Define the atom / bond features of interest.
    atom_features = [
        AtomicElement(["C", "O", "H"]),
        AtomConnectivity(),
    ]
    bond_features = [
        BondOrder(),
    ]

    # Compute the total length of the input atomic feature vector
    n_atom_features = sum(len(feature) for feature in atom_features)

    # Load in the training and test data
    training_smiles = ["CO", "CCO", "CCCO", "CCCCO"]
    training_data = DGLMoleculeDataset.from_smiles(
        training_smiles,
        atom_features,
        bond_features,
        label_function,
    )
    # Each loader serves its whole data set as a single, unshuffled batch.
    training_loader = DGLMoleculeDataLoader(
        training_data, batch_size=len(training_smiles), shuffle=False
    )

    test_smiles = [
        "CCCCCCCCCO",
    ]
    test_loader = DGLMoleculeDataLoader(
        DGLMoleculeDataset.from_smiles(
            test_smiles,
            atom_features,
            bond_features,
            label_function,
        ),
        batch_size=len(test_smiles),
        shuffle=False,
    )

    # Define the model.
    n_gcn_layers = 5
    n_gcn_hidden_features = 128

    n_am1_layers = 2
    n_am1_hidden_features = 64

    learning_rate = 0.001

    model = DGLMoleculeLightningModel(
        convolution_module=ConvolutionModule(
            architecture="SAGEConv",
            in_feats=n_atom_features,
            hidden_feats=[n_gcn_hidden_features] * n_gcn_layers,
        ),
        readout_modules={
            # The keys of the readout modules should correspond to keys in the
            # label dictionary.
            "am1-charges": ReadoutModule(
                pooling_layer=PoolAtomFeatures(),
                readout_layers=SequentialLayers(
                    in_feats=n_gcn_hidden_features,
                    # The final layer has two outputs and no non-linearity.
                    hidden_feats=[n_am1_hidden_features] * n_am1_layers + [2],
                    activation=["ReLU"] * n_am1_layers + ["Identity"],
                ),
                postprocess_layer=ComputePartialCharges(),
            )
        },
        learning_rate=learning_rate,
    )

    print(model)

    # Train the model
    n_epochs = 100

    n_gpus = 1 if torch.cuda.is_available() else 0
    print(f"Using {n_gpus} GPUs")

    # min_epochs == max_epochs pins training to exactly ``n_epochs`` epochs.
    trainer = pl.Trainer(gpus=n_gpus, min_epochs=n_epochs, max_epochs=n_epochs)

    trainer.fit(model, train_dataloaders=training_loader)
    # Use the ``dataloaders`` keyword: the older ``test_dataloaders`` spelling
    # is deprecated and no longer accepted by newer PyTorch Lightning
    # releases, whereas ``dataloaders`` is available in every release that
    # accepts the ``train_dataloaders`` keyword used above.
    trainer.test(model, dataloaders=test_loader)
def main(
    train_set_path,
    train_batch_size,
    val_set_path,
    test_set_path,
    n_gcn_layers,
    n_gcn_hidden_features,
    n_am1_layers,
    n_am1_hidden_features,
    learning_rate,
    n_epochs,
):
    """Train and test an ``am1-charges`` model for one hyper-parameter choice.

    Args:
        train_set_path: Forwarded to the data module as ``train_set_path``.
        train_batch_size: Forwarded to the data module as
            ``train_batch_size``; also part of the logger version string.
        val_set_path: Forwarded to the data module as ``val_set_path``.
        test_set_path: Forwarded to the data module as ``test_set_path``.
        n_gcn_layers: Number of entries in the convolution module's
            ``hidden_feats`` list.
        n_gcn_hidden_features: Value of each convolution ``hidden_feats``
            entry, and the ``in_feats`` of the readout layers.
        n_am1_layers: Number of ``ReLU`` readout layers placed before the
            final two-output ``Identity`` layer.
        n_am1_hidden_features: Value of each of those readout
            ``hidden_feats`` entries.
        learning_rate: Forwarded to the lightning model.
        n_epochs: Used as both ``min_epochs`` and ``max_epochs`` of the
            trainer.
    """
    # Must stay the first statement so only the arguments are printed.
    pprint(locals())

    # pl.seed_everything(3992210414)  # h-parameter sweep v1

    # Define the features of interest.
    atom_features = [
        AtomicElement(["C", "O", "H", "N", "S", "F", "Br", "Cl", "I", "P"]),
        AtomConnectivity(),
        AtomAverageFormalCharge(),
    ]
    bond_features = [
        # BondIsInRing(),
        # BondOrder()
    ]

    # Load in the pre-processed training and test molecules and store them in
    # featurized graphs.
    data_module = DGLMoleculeDataModule(
        atom_features,
        bond_features,
        partial_charge_method="am1",
        bond_order_method=None,
        train_set_path=train_set_path,
        train_batch_size=train_batch_size,
        val_set_path=val_set_path,
        val_batch_size=None,
        test_set_path=test_set_path,
        test_batch_size=None,
        use_cached_data=True,
    )
    n_atom_features = data_module.n_atom_features

    # Define the model.
    model = DGLMoleculeLightningModel(
        convolution_module=ConvolutionModule(
            architecture="SAGEConv",
            in_feats=n_atom_features,
            hidden_feats=[n_gcn_hidden_features] * n_gcn_layers,
        ),
        readout_modules={
            "am1-charges": ReadoutModule(
                pooling_layer=PoolAtomFeatures(),
                readout_layers=SequentialLayers(
                    in_feats=n_gcn_hidden_features,
                    # The final layer has two outputs and no non-linearity.
                    hidden_feats=[n_am1_hidden_features] * n_am1_layers + [2],
                    activation=["ReLU"] * n_am1_layers + ["Identity"],
                ),
                postprocess_layer=ComputePartialCharges(),
            )
        },
        learning_rate=learning_rate,
    )
    print(model)

    # Train the model
    n_gpus = 1 if torch.cuda.is_available() else 0
    print(f"Using {n_gpus} GPUs")

    # One log version per hyper-parameter combination.
    logger = TensorBoardLogger(
        "lightning-logs",
        version=(
            f"{train_batch_size}-"
            f"{n_gcn_layers}-"
            f"{n_gcn_hidden_features}-"
            f"{n_am1_layers}-"
            f"{n_am1_hidden_features}-"
            f"{learning_rate}"
        ),
    )

    # min_epochs == max_epochs pins training to exactly ``n_epochs`` epochs.
    trainer = pl.Trainer(
        gpus=n_gpus, min_epochs=n_epochs, max_epochs=n_epochs, logger=logger
    )

    trainer.fit(model, datamodule=data_module)
    # Pass the data module by keyword, exactly as for ``fit`` above: the
    # second positional parameter of ``Trainer.test`` is the dataloaders slot,
    # not the data module slot.
    trainer.test(model, datamodule=data_module)