def test_default_featurizer(self):
    """Default MolGraphConvFeaturizer yields 30 node features and no self edges."""
    featurizer = MolGraphConvFeaturizer()
    mols = ["C1=CC=CN=C1", "O=C(NCc1cc(OC)c(O)cc1)CCCC/C=C/C(C)C"]
    graphs = featurizer.featurize(mols)
    assert len(graphs) == 2

    # Expected (num_nodes, num_edges) per molecule; the node feature
    # dimension is 30 for every atom with the default settings.
    expected = [(6, 12), (22, 44)]
    for graph, (n_nodes, n_edges) in zip(graphs, expected):
        assert graph.num_nodes == n_nodes
        assert graph.num_node_features == 30
        assert graph.num_edges == n_edges
def test_featurizer_with_use_partial_charge(self):
    """use_partial_charge=True adds one extra node feature (30 -> 31)."""
    featurizer = MolGraphConvFeaturizer(use_partial_charge=True)
    mols = ["C1=CC=CN=C1", "O=C(NCc1cc(OC)c(O)cc1)CCCC/C=C/C(C)C"]
    graphs = featurizer.featurize(mols)
    assert len(graphs) == 2

    # Expected (num_nodes, num_edges) per molecule; the partial-charge
    # channel bumps the node feature dimension to 31.
    expected = [(6, 12), (22, 44)]
    for graph, (n_nodes, n_edges) in zip(graphs, expected):
        assert graph.num_nodes == n_nodes
        assert graph.num_node_features == 31
        assert graph.num_edges == n_edges
def test_mpnn_classification():
    """Overfit MPNNModel on a tiny classification set, then smoke-test on BACE."""
    featurizer = MolGraphConvFeaturizer(use_edges=True)
    tasks, dataset, transformers, metric = get_dataset('classification',
                                                       featurizer=featurizer)

    model = MPNNModel(mode='classification',
                      n_tasks=len(tasks),
                      learning_rate=0.0005)

    # Overfit check: the model should memorize this small dataset.
    model.fit(dataset, nb_epoch=200)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean-roc_auc_score'] >= 0.85

    # One-epoch smoke test on a small MoleculeNet dataset with a tiny model.
    from deepchem.molnet import load_bace_classification
    tasks, all_dataset, transformers = load_bace_classification(
        featurizer=featurizer)
    train_set, _, _ = all_dataset
    model = MPNNModel(mode='classification',
                      n_tasks=len(tasks),
                      node_out_feats=2,
                      edge_hidden_feats=2,
                      num_step_message_passing=1,
                      num_step_set2set=1,
                      num_layer_set2set=1)
    model.fit(train_set, nb_epoch=1)
def test_attentivefp_classification():
    """Overfit AttentiveFPModel on a tiny classification set, then smoke-test on BACE."""
    featurizer = MolGraphConvFeaturizer(use_edges=True)
    tasks, dataset, transformers, metric = get_dataset('classification',
                                                       featurizer=featurizer)

    model = AttentiveFPModel(mode='classification',
                             n_tasks=len(tasks),
                             batch_size=10,
                             learning_rate=0.001)

    # Overfit check: the model should memorize this small dataset.
    model.fit(dataset, nb_epoch=100)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean-roc_auc_score'] >= 0.85

    # One-epoch smoke test on a small MoleculeNet dataset with a tiny model.
    from deepchem.molnet import load_bace_classification
    tasks, all_dataset, transformers = load_bace_classification(
        featurizer=featurizer)
    train_set, _, _ = all_dataset
    model = AttentiveFPModel(mode='classification',
                             n_tasks=len(tasks),
                             num_layers=1,
                             num_timesteps=1,
                             graph_feat_size=2)
    model.fit(train_set, nb_epoch=1)
def test_mpnn_regression():
    """Overfit MPNNModel on a tiny regression set, then smoke-test on Delaney."""
    featurizer = MolGraphConvFeaturizer(use_edges=True)
    tasks, dataset, transformers, metric = get_dataset('regression',
                                                       featurizer=featurizer)

    model = MPNNModel(mode='regression', n_tasks=len(tasks), batch_size=10)

    # Overfit check: the model should memorize this small dataset.
    model.fit(dataset, nb_epoch=400)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean_absolute_error'] < 0.5

    # One-epoch smoke test on a small MoleculeNet dataset with a tiny model.
    from deepchem.molnet import load_delaney
    tasks, all_dataset, transformers = load_delaney(featurizer=featurizer)
    train_set, _, _ = all_dataset
    model = MPNNModel(mode='regression',
                      n_tasks=len(tasks),
                      node_out_feats=2,
                      edge_hidden_feats=2,
                      num_step_message_passing=1,
                      num_step_set2set=1,
                      num_layer_set2set=1)
    model.fit(train_set, nb_epoch=1)
def test_gcn_regression():
    """Overfit GCNModel on a tiny regression set, then smoke-test on Delaney."""
    featurizer = MolGraphConvFeaturizer()
    tasks, dataset, transformers, metric = get_dataset('regression',
                                                       featurizer=featurizer)

    model = GCNModel(mode='regression',
                     n_tasks=len(tasks),
                     number_atom_features=30,
                     batch_size=10,
                     learning_rate=0.003)

    # Overfit check: the model should memorize this small dataset.
    model.fit(dataset, nb_epoch=300)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean_absolute_error'] < 0.5

    # One-epoch smoke test on a small MoleculeNet dataset with a tiny model.
    from deepchem.molnet import load_delaney
    tasks, all_dataset, transformers = load_delaney(featurizer=featurizer)
    train_set, _, _ = all_dataset
    model = dc.models.GCNModel(n_tasks=len(tasks),
                               graph_conv_layers=[2],
                               residual=False,
                               predictor_hidden_feats=2)
    model.fit(train_set, nb_epoch=1)
def test_gat_classification():
    """Overfit GATModel on a tiny classification set, then smoke-test on BACE."""
    featurizer = MolGraphConvFeaturizer()
    tasks, dataset, transformers, metric = get_dataset('classification',
                                                       featurizer=featurizer)

    model = GATModel(mode='classification',
                     n_tasks=len(tasks),
                     number_atom_features=30,
                     batch_size=10,
                     learning_rate=0.001)

    # Overfit check: the model should memorize this small dataset.
    model.fit(dataset, nb_epoch=100)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean-roc_auc_score'] >= 0.85

    # One-epoch smoke test on a small MoleculeNet dataset with a tiny model.
    from deepchem.molnet import load_bace_classification
    tasks, all_dataset, transformers = load_bace_classification(
        featurizer=featurizer)
    train_set, _, _ = all_dataset
    model = dc.models.GATModel(mode='classification',
                               n_tasks=len(tasks),
                               graph_attention_layers=[2],
                               n_attention_heads=1,
                               residual=False,
                               predictor_hidden_feats=2)
    model.fit(train_set, nb_epoch=1)
def test_gcn_reload():
    """A restored GCNModel must reproduce the original model's predictions."""
    featurizer = MolGraphConvFeaturizer()
    tasks, dataset, transformers, metric = get_dataset('classification',
                                                       featurizer=featurizer)

    model_dir = tempfile.mkdtemp()
    model = GCNModel(mode='classification',
                     n_tasks=len(tasks),
                     number_atom_features=30,
                     model_dir=model_dir,
                     batch_size=10,
                     learning_rate=0.0003)
    model.fit(dataset, nb_epoch=70)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean-roc_auc_score'] >= 0.85

    # Rebuild an identically-configured model and restore the trained weights.
    reloaded_model = GCNModel(mode='classification',
                              n_tasks=len(tasks),
                              number_atom_features=30,
                              model_dir=model_dir,
                              batch_size=10,
                              learning_rate=0.0003)
    reloaded_model.restore()

    # Both models must agree exactly on unseen molecules.
    pred_mols = ["CCCC", "CCCCCO", "CCCCC"]
    random_dataset = dc.data.NumpyDataset(featurizer(pred_mols))
    original_pred = model.predict(random_dataset)
    reload_pred = reloaded_model.predict(random_dataset)
    assert np.all(original_pred == reload_pred)
def test_attentivefp_regression():
    """Overfit AttentiveFPModel on a tiny regression set, then smoke-test on Delaney."""
    featurizer = MolGraphConvFeaturizer(use_edges=True)
    tasks, dataset, transformers, metric = get_dataset('regression',
                                                       featurizer=featurizer)

    model = AttentiveFPModel(mode='regression',
                             n_tasks=len(tasks),
                             batch_size=10)

    # Overfit check: the model should memorize this small dataset.
    model.fit(dataset, nb_epoch=100)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean_absolute_error'] < 0.5

    # One-epoch smoke test on a small MoleculeNet dataset with a tiny model.
    from deepchem.molnet import load_delaney
    tasks, all_dataset, transformers = load_delaney(featurizer=featurizer)
    train_set, _, _ = all_dataset
    model = AttentiveFPModel(mode='regression',
                             n_tasks=len(tasks),
                             num_layers=1,
                             num_timesteps=1,
                             graph_feat_size=2)
    model.fit(train_set, nb_epoch=1)
def test_featurizer_with_self_loop(self):
    """add_self_edges=True adds one self edge per atom to each graph."""
    featurizer = MolGraphConvFeaturizer(add_self_edges=True)
    mols = ["C1=CC=CN=C1", "O=C(NCc1cc(OC)c(O)cc1)CCCC/C=C/C(C)C"]
    graphs = featurizer.featurize(mols)
    assert len(graphs) == 2

    # Expected (num_nodes, num_edges without self loops) per molecule;
    # with self edges each atom contributes one additional edge.
    expected = [(6, 12), (22, 44)]
    for graph, (n_nodes, n_edges) in zip(graphs, expected):
        assert graph.num_nodes == n_nodes
        assert graph.num_node_features == 39
        assert graph.num_edges == n_edges + n_nodes
        assert graph.num_edge_features == 11
def test_attentivefp_regression():
    """AttentiveFPModel should overfit a tiny regression dataset."""
    featurizer = MolGraphConvFeaturizer(use_edges=True)
    tasks, dataset, transformers, metric = get_dataset('regression',
                                                       featurizer=featurizer)

    model = AttentiveFPModel(mode='regression',
                             n_tasks=len(tasks),
                             batch_size=10)

    # Overfit check: low training error proves the model can learn the data.
    model.fit(dataset, nb_epoch=100)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean_absolute_error'] < 0.5
def test_gat_regression():
    """GATModel should overfit a tiny regression dataset."""
    featurizer = MolGraphConvFeaturizer()
    tasks, dataset, transformers, metric = get_dataset('regression',
                                                       featurizer=featurizer)

    model = GATModel(mode='regression', n_tasks=len(tasks), batch_size=10)

    # Overfit check — GAT converges slowly, so allow more epochs.
    model.fit(dataset, nb_epoch=300)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean_absolute_error'] < 0.5
def test_attentivefp_classification():
    """AttentiveFPModel should overfit a tiny classification dataset."""
    featurizer = MolGraphConvFeaturizer(use_edges=True)
    tasks, dataset, transformers, metric = get_dataset('classification',
                                                       featurizer=featurizer)

    model = AttentiveFPModel(mode='classification',
                             n_tasks=len(tasks),
                             batch_size=10,
                             learning_rate=0.001)

    # Overfit check: high training ROC-AUC proves the model can learn the data.
    model.fit(dataset, nb_epoch=100)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean-roc_auc_score'] >= 0.85
def test_gcn_regression():
    """GCNModel should overfit a tiny regression dataset."""
    featurizer = MolGraphConvFeaturizer()
    tasks, dataset, transformers, metric = get_dataset('regression',
                                                       featurizer=featurizer)

    model = GCNModel(mode='regression',
                     n_tasks=len(tasks),
                     number_atom_features=30,
                     batch_size=10)

    # Overfit check: low training error proves the model can learn the data.
    model.fit(dataset, nb_epoch=100)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean_absolute_error'] < 0.5
def load_dataset(args):
    """Load the MoleculeNet dataset selected by ``args``.

    Parameters
    ----------
    args: dict
        Must contain 'featurizer' ('ECFP' or 'GC') and 'dataset'
        (one of the MoleculeNet names handled below).

    Returns
    -------
    tuple
        (args, tasks, all_dataset, transformers) as produced by the
        chosen MoleculeNet loader.

    Raises
    ------
    ValueError
        If 'featurizer' or 'dataset' is not one of the supported values.
    """
    # Every dataset is split by scaffold so test sets contain novel scaffolds.
    splitter = 'scaffold'

    if args['featurizer'] == 'ECFP':
        featurizer = 'ECFP'
    elif args['featurizer'] == 'GC':
        from deepchem.feat import MolGraphConvFeaturizer
        featurizer = MolGraphConvFeaturizer()
    else:
        # Previously an unknown featurizer fell through and triggered a
        # confusing NameError when `featurizer` was used below; fail fast
        # with the same error style as the dataset branch instead.
        raise ValueError('Unexpected featurizer: {}'.format(args['featurizer']))

    if args['dataset'] == 'BACE_classification':
        from deepchem.molnet import load_bace_classification
        tasks, all_dataset, transformers = load_bace_classification(
            featurizer=featurizer, splitter=splitter, reload=False)
    elif args['dataset'] == 'BBBP':
        from deepchem.molnet import load_bbbp
        tasks, all_dataset, transformers = load_bbbp(
            featurizer=featurizer, splitter=splitter, reload=False)
    elif args['dataset'] == 'BACE_regression':
        from deepchem.molnet import load_bace_regression
        tasks, all_dataset, transformers = load_bace_regression(
            featurizer=featurizer, splitter=splitter, reload=False)
    elif args['dataset'] == 'ClinTox':
        from deepchem.molnet import load_clintox
        tasks, all_dataset, transformers = load_clintox(
            featurizer=featurizer, splitter=splitter, reload=False)
    elif args['dataset'] == 'Delaney':
        from deepchem.molnet import load_delaney
        tasks, all_dataset, transformers = load_delaney(
            featurizer=featurizer, splitter=splitter, reload=False)
    elif args['dataset'] == 'HOPV':
        from deepchem.molnet import load_hopv
        tasks, all_dataset, transformers = load_hopv(
            featurizer=featurizer, splitter=splitter, reload=False)
    elif args['dataset'] == 'SIDER':
        from deepchem.molnet import load_sider
        tasks, all_dataset, transformers = load_sider(
            featurizer=featurizer, splitter=splitter, reload=False)
    elif args['dataset'] == 'Lipo':
        from deepchem.molnet import load_lipo
        tasks, all_dataset, transformers = load_lipo(
            featurizer=featurizer, splitter=splitter, reload=False)
    else:
        raise ValueError('Unexpected dataset: {}'.format(args['dataset']))

    return args, tasks, all_dataset, transformers
def test_gcn_classification():
    """GCNModel should overfit a tiny classification dataset."""
    featurizer = MolGraphConvFeaturizer()
    tasks, dataset, transformers, metric = get_dataset('classification',
                                                       featurizer=featurizer)

    model = GCNModel(mode='classification',
                     n_tasks=len(tasks),
                     number_atom_features=30,
                     batch_size=10,
                     learning_rate=0.0003)

    # Overfit check: high training ROC-AUC proves the model can learn the data.
    model.fit(dataset, nb_epoch=70)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean-roc_auc_score'] >= 0.85
def test_gat_classification():
    """GATModel should overfit a tiny classification dataset."""
    featurizer = MolGraphConvFeaturizer()
    tasks, dataset, transformers, metric = get_dataset('classification',
                                                       featurizer=featurizer)

    model = GATModel(mode='classification',
                     n_tasks=len(tasks),
                     batch_size=10,
                     learning_rate=0.001)

    # Overfit check — GAT converges slowly, so allow extra epochs.
    model.fit(dataset, nb_epoch=150)
    scores = model.evaluate(dataset, [metric], transformers)
    assert scores['mean-roc_auc_score'] >= 0.85
def test_gat_classification():
    """GATModel should overfit a tiny dataset with an L2 loss.

    NOTE(review): despite its name, this test exercises the *regression*
    dataset with ``losses.L2Loss``; consider renaming it to
    ``test_gat_regression`` so it does not shadow/duplicate the
    classification test.
    """
    featurizer = MolGraphConvFeaturizer()
    tasks, dataset, transformers, metric = get_dataset('regression',
                                                       featurizer=featurizer)

    model = GATModel(n_tasks=len(tasks),
                     loss=losses.L2Loss(),
                     batch_size=4,
                     learning_rate=0.001)

    # Overfit check.
    model.fit(dataset, nb_epoch=100)
    scores = model.evaluate(dataset, [metric], transformers)
    # TODO: confirm that this assertion threshold is appropriate.
    assert scores['mean_absolute_error'] < 1.0
def load_dataset(args):
    """Load the MoleculeNet dataset selected by ``args``.

    Parameters
    ----------
    args: dict
        Must contain 'featurizer' ('ECFP', 'GC', or 'AC') and 'dataset'
        (one of the MoleculeNet names handled below).

    Returns
    -------
    tuple
        (args, tasks, all_dataset, transformers) as produced by the
        chosen MoleculeNet loader.

    Raises
    ------
    ValueError
        If 'featurizer' or 'dataset' is not one of the supported values.
    """
    # Most datasets are split by scaffold so test sets contain novel
    # scaffolds; PDBbind below overrides this with a random split.
    splitter = 'scaffold'

    if args['featurizer'] == 'ECFP':
        featurizer = 'ECFP'
    elif args['featurizer'] == 'GC':
        from deepchem.feat import MolGraphConvFeaturizer
        featurizer = MolGraphConvFeaturizer()
    elif args['featurizer'] == 'AC':
        from deepchem.feat import AtomicConvFeaturizer
        featurizer = AtomicConvFeaturizer(frag1_num_atoms=100,
                                          frag2_num_atoms=1000,
                                          complex_num_atoms=1100,
                                          max_num_neighbors=12,
                                          neighbor_cutoff=4)
    else:
        # Previously an unknown featurizer fell through and triggered a
        # confusing NameError when `featurizer` was used below; fail fast
        # with the same error style as the dataset branch instead.
        raise ValueError('Unexpected featurizer: {}'.format(args['featurizer']))

    if args['dataset'] == 'BACE_classification':
        from deepchem.molnet import load_bace_classification
        tasks, all_dataset, transformers = load_bace_classification(
            featurizer=featurizer, splitter=splitter, reload=False)
    elif args['dataset'] == 'BBBP':
        from deepchem.molnet import load_bbbp
        tasks, all_dataset, transformers = load_bbbp(featurizer=featurizer,
                                                     splitter=splitter,
                                                     reload=False)
    elif args['dataset'] == 'BACE_regression':
        from deepchem.molnet import load_bace_regression
        tasks, all_dataset, transformers = load_bace_regression(
            featurizer=featurizer, splitter=splitter, reload=False)
    elif args['dataset'] == 'ClinTox':
        from deepchem.molnet import load_clintox
        tasks, all_dataset, transformers = load_clintox(featurizer=featurizer,
                                                        splitter=splitter,
                                                        reload=False)
    elif args['dataset'] == 'Delaney':
        from deepchem.molnet import load_delaney
        tasks, all_dataset, transformers = load_delaney(featurizer=featurizer,
                                                        splitter=splitter,
                                                        reload=False)
    elif args['dataset'] == 'HOPV':
        from deepchem.molnet import load_hopv
        tasks, all_dataset, transformers = load_hopv(featurizer=featurizer,
                                                     splitter=splitter,
                                                     reload=False)
    elif args['dataset'] == 'SIDER':
        from deepchem.molnet import load_sider
        tasks, all_dataset, transformers = load_sider(featurizer=featurizer,
                                                      splitter=splitter,
                                                      reload=False)
    elif args['dataset'] == 'Lipo':
        from deepchem.molnet import load_lipo
        tasks, all_dataset, transformers = load_lipo(featurizer=featurizer,
                                                     splitter=splitter,
                                                     reload=False)
    elif args['dataset'] == 'PDBbind':
        from deepchem.molnet import load_pdbbind
        tasks, all_dataset, transformers = load_pdbbind(
            featurizer=featurizer,
            save_dir='.',
            data_dir='.',
            splitter='random',
            pocket=True,
            set_name='core',  # refined
            reload=False)
    else:
        raise ValueError('Unexpected dataset: {}'.format(args['dataset']))

    return args, tasks, all_dataset, transformers