def test_graph_conv_atom_features(self):
    """Custom atom properties survive a GraphConvTensorGraph save/load round trip.

    Stores one random value per atom as an RDKit molecule property (under
    the "atom %08d <name>" key convention read by ConvMolFeaturizer),
    builds a regression target from those values, trains briefly, then
    checks a reloaded model reproduces the original predictions.
    """
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'Raw', num_tasks=1)
    atom_feature_name = 'feature'
    y = []
    for mol in dataset.X:
        atom_features = []
        for atom in mol.GetAtoms():
            val = np.random.normal()
            # ConvMolFeaturizer(atom_properties=...) reads per-atom values
            # stored on the molecule under this key format.
            mol.SetProp(
                "atom %08d %s" % (atom.GetIdx(), atom_feature_name), str(val))
            # Fix: sum the same value stored on the molecule. The original
            # appended a second, independent random draw, leaving `val`
            # unused and the target unrelated to the stored property.
            atom_features.append(val)
        y.append(np.sum(atom_features))
    featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
    X = featurizer.featurize(dataset.X)
    dataset = deepchem.data.NumpyDataset(X, np.array(y))
    batch_size = 50
    model = GraphConvTensorGraph(
        len(tasks),
        number_atom_features=featurizer.feature_length(),
        batch_size=batch_size,
        mode='regression')
    model.fit(dataset, nb_epoch=1)
    y_pred1 = model.predict(dataset)
    model.save()
    model2 = TensorGraph.load_from_dir(model.model_dir)
    y_pred2 = model2.predict(dataset)
    # Fix: use allclose rather than exact equality — predictions after a
    # save/load round trip may differ by floating-point noise (the sibling
    # test of this name already does this).
    self.assertTrue(np.allclose(y_pred1, y_pred2))
def test_graph_conv_atom_features():
    """GraphConvModel trains on features built from custom atom properties.

    Stores one random value per atom as an RDKit molecule property, builds
    a per-molecule regression target from those values, then fits a
    GraphConvModel for one epoch and runs prediction.
    """
    tasks, dataset, transformers, metric = get_dataset(
        'regression', 'Raw', num_tasks=1)
    atom_feature_name = 'feature'
    y = []
    for mol in dataset.X:
        atom_features = []
        for atom in mol.GetAtoms():
            val = np.random.normal()
            # Key format read by ConvMolFeaturizer(atom_properties=...).
            mol.SetProp("atom %08d %s" % (atom.GetIdx(), atom_feature_name),
                        str(val))
            # Fix: accumulate the same value stored on the molecule. The
            # original drew a second, unrelated random number, so `val` was
            # unused and the target had no link to the atom property.
            atom_features.append(val)
        y.append([np.sum(atom_features)])
    featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
    X = featurizer.featurize(dataset.X)
    dataset = dc.data.NumpyDataset(X, np.array(y))
    batch_size = 50
    model = GraphConvModel(
        len(tasks),
        number_atom_features=featurizer.feature_length(),
        batch_size=batch_size,
        mode='regression')
    model.fit(dataset, nb_epoch=1)
    y_pred1 = model.predict(dataset)
def test_mol_ordering():
    """Verify ConvMol.agglomerate_mols orders atoms by degree slice.

    Each atom's degree is prepended as column 0 of its feature vector so
    that, after agglomeration, every degree slice can be checked for both
    sorted membership and a uniform degree value.
    """
    mols = get_molecules()
    featurized = ConvMolFeaturizer().featurize(mols)
    # Prepend the per-atom degree as the first feature column.
    for fmol in featurized:
        degree_col = np.expand_dims(fmol.degree_list, axis=1)
        fmol.atom_features = np.concatenate([degree_col, fmol.atom_features],
                                            axis=1)
    merged = ConvMol.agglomerate_mols(featurized)
    # Within every degree slice, membership indices must be ascending.
    for start, end in merged.deg_slice.tolist():
        segment = np.array(merged.membership[start:end])
        assert np.all(np.array(sorted(segment.tolist())) == segment)
    # Column 0 of each slice must equal that slice's degree (0, 1, 2, ...).
    all_features = merged.get_atom_features()
    for degree, (start, end) in enumerate(merged.deg_slice.tolist()):
        assert np.all(all_features[start:end, 0] == degree)
def test_graph_conv_atom_features(self):
    """Custom atom properties flow through GraphConvModel and save/load.

    Stores one random value per atom as an RDKit molecule property, builds
    a regression target from those values, trains briefly, then verifies
    that a reloaded model yields (numerically close) identical predictions.
    """
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'Raw', num_tasks=1)
    atom_feature_name = 'feature'
    y = []
    for mol in dataset.X:
        atom_features = []
        for atom in mol.GetAtoms():
            val = np.random.normal()
            # Key format read by ConvMolFeaturizer(atom_properties=...).
            mol.SetProp("atom %08d %s" % (atom.GetIdx(), atom_feature_name),
                        str(val))
            # Fix: sum the same value stored on the molecule. The original
            # appended a second, independent random draw, leaving `val`
            # unused and the target unrelated to the stored property.
            atom_features.append(val)
        y.append([np.sum(atom_features)])
    featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
    X = featurizer.featurize(dataset.X)
    dataset = dc.data.NumpyDataset(X, np.array(y))
    batch_size = 50
    model = GraphConvModel(
        len(tasks),
        number_atom_features=featurizer.feature_length(),
        batch_size=batch_size,
        mode='regression')
    model.fit(dataset, nb_epoch=1)
    y_pred1 = model.predict(dataset)
    model.save()
    model2 = TensorGraph.load_from_dir(model.model_dir)
    y_pred2 = model2.predict(dataset)
    # allclose tolerates floating-point noise from the round trip.
    self.assertTrue(np.allclose(y_pred1, y_pred2))
def test_convmol_hashable(self):
    """ConvMolFeaturizer instances hash and compare by their configuration."""
    featurizer1 = ConvMolFeaturizer(atom_properties=['feature'])
    featurizer2 = ConvMolFeaturizer(atom_properties=['feature'])
    featurizer3 = ConvMolFeaturizer()
    d = set()
    d.add(featurizer1)
    d.add(featurizer2)
    d.add(featurizer3)
    # featurizer1 and featurizer2 are configured identically, so they
    # collapse to a single set entry; featurizer3 differs.
    self.assertEqual(2, len(d))
    featurizers = [featurizer1, featurizer2, featurizer3]
    for featurizer in featurizers:
        # Fix: look each featurizer up in the *set* `d`. The original
        # asserted `featurizer in featurizers` — membership in the very
        # list being iterated — which is trivially always true and never
        # exercised hash-based lookup.
        self.assertTrue(featurizer in d)
def preprocess(raw_data, feats="convmol"):
    """Preprocess molecule data for graph learning.

    Parameters
    ----------
    raw_data : dict
        Must contain 'labels' (integer class array), 'smiles' (list of
        SMILES strings) and 'adjs' (list of adjacency matrices).
    feats : str
        Featurization type: "weave" or "convmol".

    Returns
    -------
    dict with one-hot labels, degree-sorted node feature matrices, and
    normalized adjacency matrices.

    Raises
    ------
    ValueError
        If `feats` is not a recognized featurization type.
    """
    labels = raw_data['labels']
    smiles = raw_data['smiles']
    adjs = raw_data['adjs']
    num_classes = np.unique(labels).shape[0]
    # One-hot encode the labels via an identity-matrix lookup.
    labels_one_hot = np.eye(num_classes)[labels.reshape(-1)]
    if feats == "weave":
        featurizer = WeaveFeaturizer()
    elif feats == "convmol":
        featurizer = ConvMolFeaturizer()
    else:
        # Fix: fail fast with a clear message. The original fell through
        # with `featurizer` unbound, producing an opaque NameError below.
        raise ValueError("Unknown featurization type: %r" % feats)
    mol_objs = featurizer.featurize([MolFromSmiles(smile) for smile in smiles])
    # Sort each feature matrix by node degree (degree = row sum - 1, since
    # the adjacency matrices include self-loops here — confirm with caller).
    node_features = []
    for i, feat in enumerate(mol_objs):
        sortind = np.argsort(adjs[i].sum(axis=1) - 1)
        N = len(sortind)
        sortMatrix = np.eye(N)[sortind, :]
        node_features.append(np.matmul(sortMatrix.T, feat.get_atom_features()))
    # Normalize adjacency matrices.
    norm_adjs = [preprocess_adj(A) for A in adjs]
    return {
        'labels_one_hot': labels_one_hot,
        'node_features': node_features,
        'norm_adjs': norm_adjs
    }
def load_training_data(dataset_files, split_field='Fold', smiles_field='Smiles',
                       y_field='Value', id_field='Compound_No',
                       tempdir=op.join(MODEL_DIR, 'datatemp'), cv=True):
    """
    Given a list of datasets in csv format, read them and prepare them for
    DeepChem (split if needed, etc.)
    :param dataset_files: path to the csv files containing the training data for
    each task of interest
    :param split_field: column name in the csv giving the fold assignment for
    CV. Not used if cv=False
    :param smiles_field: column name in the csv giving the SMILES of the
    compounds
    :param y_field: column name in the csv giving the experimental value to
    learn
    :param cv: whether we are also splitting the data by split_field
    :return: list of tasks and the list of ConvMol datasets (one dataset per
    group in split_field)
    """
    ensure_dir(tempdir)
    df_trains = []
    for dataset_file in dataset_files:
        try:
            df_trains.append(pd.read_csv(dataset_file, sep=','))
        except IOError:  # no test split for example
            # Missing task file: substitute an empty frame so that column
            # alignment across tasks is preserved in the merge below.
            df = pd.DataFrame({id_field: [], y_field: [], smiles_field: []})
            df_trains.append(df)
    n_tasks = len(dataset_files)
    # Rename the y_field column so each task gets a unique target column.
    df_trains = [
        df_train.rename(index=str, columns={y_field: y_field + '_%i' % i})
        for i, df_train in enumerate(df_trains)
    ]
    # Merge the individual tasks based on Smiles (and the fold column in CV
    # mode, so fold assignments stay aligned after the outer join).
    if cv:
        df_train = reduce(
            lambda x, y: pd.merge(
                x, y, on=[id_field, smiles_field, split_field], how='outer'),
            df_trains)
    else:
        df_train = reduce(
            lambda x, y: pd.merge(
                x, y, on=[id_field, smiles_field], how='outer'),
            df_trains)
    # Save the merged train csv in a temporary place
    dataset_file = op.join(tempdir, 'data.csv')
    df_train.to_csv(dataset_file, na_rep=np.nan, index=False)
    # Featurization
    featurizer = ConvMolFeaturizer()
    # Fix: build the task names once from y_field and return these same
    # names. The original returned hard-coded 'Value_%i' names, which
    # disagreed with the loader's tasks whenever y_field != 'Value'.
    tasks = [y_field + '_%i' % i for i in range(n_tasks)]
    loader = CSVLoader(tasks=tasks, smiles_field=smiles_field,
                       featurizer=featurizer, id_field=id_field)
    dataset = loader.featurize(dataset_file, shard_size=8192, data_dir=tempdir)
    if cv:
        # Fold labels come from the first task's frame — assumes every task
        # shares the same fold assignments (TODO confirm).
        folds = np.unique(df_trains[0][split_field].tolist())
        # Separate in folds
        folds_datasets = []
        fold_dirs = []
        for f in folds:
            fold_dir = tempfile.mkdtemp(prefix=tempdir + '/')
            indices = np.flatnonzero(df_train[split_field] == f)
            folds_datasets.append(dataset.select(indices, select_dir=fold_dir))
            fold_dirs.append(fold_dir)
        return tasks, folds_datasets, fold_dirs
    return tasks, dataset
def amino_acid_embedding(self, name=None):
    """Build a graph-convolution embedding sub-network over the amino acids.

    Featurizes the SMILES in AminoAcid_SMILES into one agglomerated ConvMol,
    wraps its tensors as graph inputs, and stacks two GraphConv/GraphPool
    stages, a Dense layer, a GraphGather readout and an AminoAcidPad layer.

    :param name: optional prefix for layer names; auto-generated from
        self.module_count when omitted.
    :return: the final AminoAcidPad layer of the sub-network.
    """
    if name is None:  # fix: identity comparison with None, not ==
        name = 'AAEmbedding_' + str(self.module_count) + '_'
        self.module_count += 1
    feat = ConvMolFeaturizer()
    featurized_AA = [
        feat._featurize(Chem.MolFromSmiles(smile))
        for smile in AminoAcid_SMILES
    ]
    # All amino-acid graphs merged into one batch; max_deg=3 caps the
    # per-atom degree handled by the conv layers below.
    multiConvMol = ConvMol.agglomerate_mols(featurized_AA, max_deg=3)
    atom_features = TensorWrapper(
        tf.constant(multiConvMol.get_atom_features(), dtype=tf.float32),
        name=name + 'atom_features')
    degree_slice = TensorWrapper(
        tf.constant(multiConvMol.deg_slice, dtype=tf.int32),
        name=name + 'degree')
    membership = TensorWrapper(
        tf.constant(multiConvMol.membership, dtype=tf.int32),
        name=name + 'membership')
    deg_adjs = []
    for i in range(0, 3):
        # Adjacency lists start at degree 1 (index 0 is degree-0 atoms).
        deg_adjs.append(
            TensorWrapper(
                tf.constant(
                    multiConvMol.get_deg_adjacency_lists()[i + 1],
                    dtype=tf.int32),
                name=name + 'deg_' + str(i)))
    gc1 = GraphConv(
        64,
        max_deg=3,
        activation_fn=tf.nn.relu,
        in_layers=[atom_features, degree_slice, membership] + deg_adjs,
        name=name + 'gc1')
    batch_norm1 = BatchNorm(
        in_layers=[gc1, self.training_placeholder], name=name + 'bn1')
    gp1 = GraphPool(
        max_degree=3,
        in_layers=[batch_norm1, degree_slice, membership] + deg_adjs,
        name=name + 'gp1')
    gc2 = GraphConv(
        64,
        max_deg=3,
        activation_fn=tf.nn.relu,
        in_layers=[gp1, degree_slice, membership] + deg_adjs,
        name=name + 'gc2')
    batch_norm2 = BatchNorm(
        in_layers=[gc2, self.training_placeholder], name=name + 'bn2')
    gp2 = GraphPool(
        max_degree=3,
        in_layers=[batch_norm2, degree_slice, membership] + deg_adjs,
        name=name + 'gp2')
    # Fix: integer floor division — the channel count must be an int in
    # Python 3; `/` would pass a float to Dense.
    dense = Dense(
        out_channels=self.embedding_length // 2,
        activation_fn=tf.nn.relu,
        in_layers=[gp2],
        name=name + 'dense1')
    batch_norm3 = BatchNorm(
        in_layers=[dense, self.training_placeholder], name=name + 'bn3')
    # batch_size=21: presumably one graph per entry in AminoAcid_SMILES
    # (20 amino acids + 1 extra) — TODO confirm against that constant.
    readout = GraphGather(
        batch_size=21,
        activation_fn=tf.nn.tanh,
        in_layers=[batch_norm3, degree_slice, membership] + deg_adjs,
        name=name + 'gg')
    padding = AminoAcidPad(
        embedding_length=self.embedding_length,
        in_layers=[readout],
        name=name + 'pad')
    return padding
def load_inference_data(dataset_file, n_tasks, tempdir, smiles_field='Smiles',
                        id_field='Compound_No'):
    """
    :param dataset_file: path to the csv files containing the data we want to
    predict
    :param smiles_field: column name in the csv giving the SMILES of the
    compounds
    :param id_field: column name in the csv giving the identifier for the
    molecules
    :param tempdir: directory where ConvMol datasets will be temporarily stored
    :return: list of tasks and the ConvMol datasets
    """
    # Build the task names once and reuse them for the loader and the return.
    tasks = ['Value_%i' % i for i in range(n_tasks)]
    # Featurize the dataset for the graph-convolution architecture.
    loader = CSVLoader(
        tasks=tasks,
        smiles_field=smiles_field,
        featurizer=ConvMolFeaturizer(),
        id_field=id_field)
    dataset = loader.featurize(dataset_file, shard_size=8192, data_dir=tempdir)
    return tasks, dataset
def load_data(dataset_file, split_field=None, smiles_field='Smiles',
              y_field=None, id_field='Compound_No',
              tempdir=op.join(RESULTS_DIR, 'DCGraphConv', 'to_check'),
              cv=True):
    """
    :param dataset_file: path to the csv file containing the data to read
    :param split_field: column name in the csv giving the fold assignment for
    CV. Optional
    :param smiles_field: column name in the csv giving the SMILES of the
    compounds
    :param y_field: column name in the csv giving the experimental value to
    learn. Optional (in case of prospective predictions)
    :param id_field: column name in the csv giving the molecule id
    :param tempdir: where the shards and featurized data will be temporarily
    stored
    :param cv: bool. whether we are splitting by folds or not (in which case
    split_field is used as fold assignment indicator)
    :return: list of tasks and the list of ConvMol datasets (one dataset per
    group in split_field)
    """
    # No y_field means prospective prediction: featurize with no tasks.
    tasks = [] if y_field is None else [y_field]
    loader = CSVLoader(
        tasks=tasks,
        smiles_field=smiles_field,
        featurizer=ConvMolFeaturizer(),
        id_field=id_field)
    dataset = loader.featurize(dataset_file, shard_size=8192, data_dir=tempdir)
    # Guard clause: without CV (or a fold column) return the whole dataset.
    if split_field is None or not cv:
        return tasks, dataset
    # Otherwise carve the featurized dataset into one selection per fold.
    frame = pd.read_csv(dataset_file, sep=',')
    folds_datasets = []
    fold_dirs = []
    for fold_value in np.unique(frame[split_field].tolist()):
        fold_dir = tempfile.mkdtemp(prefix=tempdir + '/')
        selected = np.flatnonzero(frame[split_field] == fold_value)
        folds_datasets.append(dataset.select(selected, select_dir=fold_dir))
        fold_dirs.append(fold_dir)
    return tasks, folds_datasets, fold_dirs
def generate_conv_mol(self):
    """Featurize every SMILES in self.data into ConvMol objects and report shapes.

    Reads the "SMILE" column of self.data, parses each entry with RDKit,
    runs ConvMolFeaturizer over the molecule list, and prints summary
    information about the resulting array.
    """
    self.log.info(
        "Generating Molecular Convolutions, may take a few moments ...")
    mol_list = []
    for index, row in self.data.iterrows():
        mol = Chem.MolFromSmiles(row["SMILE"])
        # NOTE(review): the original called `Chem.AddHs(mol)` and discarded
        # the result. AddHs returns a *new* molecule and does not modify in
        # place, so the call was a no-op; it has been removed to keep
        # behavior identical. If explicit hydrogens are actually wanted,
        # use `mol = Chem.AddHs(mol)` instead.
        # AllChem.UFFOptimizeMolecule(mol, confId=id)
        mol_list.append(mol)
    conv_array = ConvMolFeaturizer().featurize(mol_list)
    print(
        "len(conv_array)",
        len(conv_array),
        "type(conv_array)",
        type(conv_array),
        "type(conv_array[0]",
        type(conv_array[0]),
        "get_atom_features",
        conv_array[0].get_atom_features().shape,
    )
from deepchem.feat import ConvMolFeaturizer #from tensorflow.keras.models import Sequential import os os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/' seed = 1 #reload(p) suffix="\\" #file="FinalCleaned.csv" file="WeiReactionsAndAllFps.pkl" filename="FinalAllcsv.csv" featurizer=ConvMolFeaturizer() #pCSV = p.processCSV(filename) #readOriginalDataSetAndCreatePickledSmilesOnly #pCSV.readPickleAndTranslateToMols() #pCSV = p.processCSV(filename) #pCSV = p.processCSV(file) #pCSV.readOriginalDataSetAndCreatePickledSmilesOnly(file) #df=pickle.load(open("WeiReactionsAndAllFps.pkl",'rb')) df=pd.read_pickle("./WeiReactionsNeuralFp.pkl")