Exemplo n.º 1
0
    def test_graph_conv_atom_features(self):
        """Check that custom per-atom properties flow through ConvMolFeaturizer
        into a GraphConvTensorGraph regression model, and that predictions
        survive a save/load round trip.
        """
        tasks, dataset, transformers, metric = self.get_dataset('regression',
                                                                'Raw',
                                                                num_tasks=1)

        atom_feature_name = 'feature'
        y = []
        for mol in dataset.X:
            atom_features = []
            for atom in mol.GetAtoms():
                val = np.random.normal()
                mol.SetProp(
                    "atom %08d %s" % (atom.GetIdx(), atom_feature_name),
                    str(val))
                # Bug fix: append the same value that was stored on the
                # molecule (the original drew a second, unrelated random
                # number), so the label really is the sum of the features.
                atom_features.append(val)
            y.append(np.sum(atom_features))

        featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
        X = featurizer.featurize(dataset.X)
        dataset = deepchem.data.NumpyDataset(X, np.array(y))
        batch_size = 50
        model = GraphConvTensorGraph(
            len(tasks),
            number_atom_features=featurizer.feature_length(),
            batch_size=batch_size,
            mode='regression')

        model.fit(dataset, nb_epoch=1)
        y_pred1 = model.predict(dataset)
        model.save()

        model2 = TensorGraph.load_from_dir(model.model_dir)
        y_pred2 = model2.predict(dataset)
        # Tolerance-based comparison (consistent with the other variants of
        # this test): float predictions need not be bitwise identical after
        # a save/load round trip.
        self.assertTrue(np.allclose(y_pred1, y_pred2))
Exemplo n.º 2
0
def test_graph_conv_atom_features():
    """Check that custom per-atom properties flow through ConvMolFeaturizer
    into a GraphConvModel regression model.
    """
    tasks, dataset, transformers, metric = get_dataset('regression',
                                                       'Raw',
                                                       num_tasks=1)

    atom_feature_name = 'feature'
    y = []
    for mol in dataset.X:
        atom_features = []
        for atom in mol.GetAtoms():
            val = np.random.normal()
            mol.SetProp("atom %08d %s" % (atom.GetIdx(), atom_feature_name),
                        str(val))
            # Bug fix: append the value that was actually stored on the
            # molecule (the original drew a second, unrelated random number),
            # so the label really is the sum of the custom atom features.
            atom_features.append(val)
        y.append([np.sum(atom_features)])

    featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
    X = featurizer.featurize(dataset.X)
    dataset = dc.data.NumpyDataset(X, np.array(y))
    batch_size = 50
    model = GraphConvModel(len(tasks),
                           number_atom_features=featurizer.feature_length(),
                           batch_size=batch_size,
                           mode='regression')

    model.fit(dataset, nb_epoch=1)
    y_pred1 = model.predict(dataset)
Exemplo n.º 3
0
def test_mol_ordering():
    """Check that ConvMol.agglomerate_mols groups atoms by degree, with
    membership indices sorted within each degree slice."""
    featurized = ConvMolFeaturizer().featurize(get_molecules())

    # Prepend each atom's degree as an extra feature column so that the
    # degree can be recovered after agglomeration.
    for fmol in featurized:
        degrees = np.expand_dims(fmol.degree_list, axis=1)
        fmol.atom_features = np.concatenate([degrees, fmol.atom_features],
                                            axis=1)

    merged = ConvMol.agglomerate_mols(featurized)

    # Within every degree slice, membership indices must be non-decreasing.
    for start, end in merged.deg_slice.tolist():
        members = np.array(merged.membership[start:end])
        assert np.all(np.sort(members) == members)

    # Each degree slice must contain only atoms of that degree (stored in
    # the prepended feature column).
    all_features = merged.get_atom_features()
    for degree, (start, end) in enumerate(merged.deg_slice.tolist()):
        assert np.all(all_features[start:end, 0] == degree)
Exemplo n.º 4
0
  def test_graph_conv_atom_features(self):
    """Check that custom per-atom properties flow through ConvMolFeaturizer
    into a GraphConvModel regression model, and that predictions survive a
    save/load round trip.
    """
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'Raw', num_tasks=1)

    atom_feature_name = 'feature'
    y = []
    for mol in dataset.X:
      atom_features = []
      for atom in mol.GetAtoms():
        val = np.random.normal()
        mol.SetProp("atom %08d %s" % (atom.GetIdx(), atom_feature_name),
                    str(val))
        # Bug fix: append the value that was actually stored on the molecule
        # (the original drew a second, unrelated random number), so the
        # label really is the sum of the custom atom features.
        atom_features.append(val)
      y.append([np.sum(atom_features)])

    featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
    X = featurizer.featurize(dataset.X)
    dataset = dc.data.NumpyDataset(X, np.array(y))
    batch_size = 50
    model = GraphConvModel(
        len(tasks),
        number_atom_features=featurizer.feature_length(),
        batch_size=batch_size,
        mode='regression')

    model.fit(dataset, nb_epoch=1)
    y_pred1 = model.predict(dataset)
    model.save()

    model2 = TensorGraph.load_from_dir(model.model_dir)
    y_pred2 = model2.predict(dataset)
    self.assertTrue(np.allclose(y_pred1, y_pred2))
Exemplo n.º 5
0
    def test_convmol_hashable(self):
        """ConvMolFeaturizer must implement __eq__/__hash__ so that equal
        featurizers collapse to one set entry and can be looked up in a set.
        """
        featurizer1 = ConvMolFeaturizer(atom_properties=['feature'])
        featurizer2 = ConvMolFeaturizer(atom_properties=['feature'])
        featurizer3 = ConvMolFeaturizer()

        d = set()
        d.add(featurizer1)
        d.add(featurizer2)
        d.add(featurizer3)

        # featurizer1 and featurizer2 are equal, so only two distinct entries.
        self.assertEqual(2, len(d))
        featurizers = [featurizer1, featurizer2, featurizer3]

        for featurizer in featurizers:
            # Bug fix: check membership in the SET (which exercises hashing
            # and equality), not in the list each featurizer was taken from —
            # that check was trivially always true.
            self.assertTrue(featurizer in d)
Exemplo n.º 6
0
def preprocess(raw_data, feats="convmol"):
    """
    Preprocess molecule data for graph-based models.

    :param raw_data: dict with keys 'labels', 'smiles' and 'adjs'
    :param feats: featurization scheme, either "weave" or "convmol"
    :return: dict with one-hot labels, degree-sorted node feature matrices
        and normalized adjacency matrices
    :raises ValueError: if feats is not a recognized featurization scheme
    """
    labels = raw_data['labels']
    smiles = raw_data['smiles']
    adjs = raw_data['adjs']
    num_classes = np.unique(labels).shape[0]

    #One hot labels
    labels_one_hot = np.eye(num_classes)[labels.reshape(-1)]

    if feats == "weave":
        featurizer = WeaveFeaturizer()
    elif feats == "convmol":
        featurizer = ConvMolFeaturizer()
    else:
        # Bug fix: fail early with a clear message instead of letting an
        # unbound `featurizer` raise a confusing NameError below.
        raise ValueError("Unknown featurization scheme: %r" % (feats,))

    mol_objs = featurizer.featurize([MolFromSmiles(smile) for smile in smiles])

    #Sort feature matrices by node degree so rows line up with the
    #degree-sorted ordering used downstream.
    node_features = []
    for i, feat in enumerate(mol_objs):
        sortind = np.argsort(adjs[i].sum(axis=1) - 1)
        N = len(sortind)
        sortMatrix = np.eye(N)[sortind, :]
        node_features.append(np.matmul(sortMatrix.T, feat.get_atom_features()))

    #Normalize Adjacency Mats
    norm_adjs = [preprocess_adj(A) for A in adjs]

    return {
        'labels_one_hot': labels_one_hot,
        'node_features': node_features,
        'norm_adjs': norm_adjs
    }
Exemplo n.º 7
0
def load_training_data(dataset_files, split_field='Fold', smiles_field='Smiles', y_field='Value',
                       id_field='Compound_No', tempdir=op.join(MODEL_DIR, 'datatemp'), cv=True):
    """
    Given a list of datasets in csv format, read them and prepare them for DeepChem (split if needed, etc.)
    :param dataset_files: path to the csv files containing the training data for each task of interest
    :param split_field: column name in the csv giving the fold assignment for CV. Not used if cv=False
    :param smiles_field: column name in the csv giving the SMILES of the compounds
    :param y_field: column name in the csv giving the experimental value to learn
    :param id_field: column name in the csv giving the molecule identifier
    :param tempdir: directory where the merged csv and featurized shards are temporarily stored
    :param cv: whether we are also splitting the data by split_field
    :return: list of tasks and the list of ConvMol datasets (one dataset per group in split_field),
        plus the fold directories when cv=True
    """
    ensure_dir(tempdir)

    # Read each task's csv; a missing file yields an empty frame so the task
    # still occupies a column slot in the merged data.
    df_trains = []
    for dataset_file in dataset_files:
        try:
            df_trains.append(pd.read_csv(dataset_file, sep=','))
        except IOError:  # no test split for example
            df = pd.DataFrame(
                {id_field: [], y_field: [], smiles_field: []})  # create an empty df for missing task
            df_trains.append(df)

    n_tasks = len(dataset_files)
    # Rename the y_field column so each task gets a distinct column (Value_0, Value_1, ...)
    df_trains = [df_train.rename(index=str, columns={y_field: y_field + '_%i' % i}) for i, df_train in
                 enumerate(df_trains)]

    # Merge the individual tasks based on Smiles (outer join keeps compounds
    # that are present in only some tasks; their missing values become NaN)
    if cv:
        df_train = reduce(lambda x, y: pd.merge(x, y, on=[id_field, smiles_field, split_field], how='outer'),
                          df_trains)
    else:
        df_train = reduce(lambda x, y: pd.merge(x, y, on=[id_field, smiles_field], how='outer'), df_trains)
    # Save the merged train csv in a temporary place
    dataset_file = op.join(tempdir, 'data.csv')
    df_train.to_csv(dataset_file, na_rep=np.nan, index=False)

    # Featurization: graph-convolution features computed from the SMILES column
    featurizer = ConvMolFeaturizer()
    loader = CSVLoader(tasks=[y_field + '_%i' % i for i in range(n_tasks)], smiles_field=smiles_field,
                       featurizer=featurizer, id_field=id_field)
    dataset = loader.featurize(dataset_file, shard_size=8192, data_dir=tempdir)

    if cv:
        # NOTE(review): fold labels are taken from the FIRST task's frame;
        # assumes all tasks share the same fold assignment — confirm upstream.
        folds = np.unique(df_trains[0][split_field].tolist())

        # Separate in folds: select the rows of each fold into its own
        # dataset stored under a fresh temporary directory.
        folds_datasets = []
        fold_dirs = []
        for f in folds:
            fold_dir = tempfile.mkdtemp(prefix=tempdir + '/')
            indices = np.flatnonzero(df_train[split_field] == f)
            folds_datasets.append(dataset.select(indices, select_dir=fold_dir))
            fold_dirs.append(fold_dir)

        return ['Value_%i' % i for i in range(n_tasks)], folds_datasets, fold_dirs

    return ['Value_%i' % i for i in range(n_tasks)], dataset
Exemplo n.º 8
0
  def amino_acid_embedding(self, name=None):
    """Build a graph-convolution embedding sub-network over the 21 amino-acid
    SMILES in AminoAcid_SMILES and return its padded readout layer.

    :param name: optional layer-name prefix; autogenerated from
        self.module_count when omitted
    :return: the AminoAcidPad layer closing the sub-network
    """
    # Bug fix: identity comparison with None (`== None` relies on __eq__
    # and is non-idiomatic).
    if name is None:
      name = 'AAEmbedding_'+str(self.module_count)+'_'
      self.module_count += 1
    feat = ConvMolFeaturizer()
    featurized_AA = [feat._featurize(Chem.MolFromSmiles(smile)) for smile in AminoAcid_SMILES]
    multiConvMol = ConvMol.agglomerate_mols(featurized_AA, max_deg=3)
    # Wrap the agglomerated graph tensors as constant inputs to the network.
    atom_features = TensorWrapper(tf.constant(multiConvMol.get_atom_features(), dtype=tf.float32), name=name+'atom_features')
    degree_slice = TensorWrapper(tf.constant(multiConvMol.deg_slice, dtype=tf.int32), name=name+'degree')
    membership = TensorWrapper(tf.constant(multiConvMol.membership, dtype=tf.int32), name=name+'membership')

    deg_adjs = []
    for i in range(0, 3):
      deg_adjs.append(TensorWrapper(tf.constant(multiConvMol.get_deg_adjacency_lists()[i+1], dtype=tf.int32), name=name+'deg_'+str(i)))

    # Two GraphConv/BatchNorm/GraphPool stages followed by a dense layer
    # and a per-molecule GraphGather readout.
    gc1 = GraphConv(
        64,
        max_deg=3,
        activation_fn=tf.nn.relu,
        in_layers=[atom_features, degree_slice, membership] + deg_adjs, name=name+'gc1')
    batch_norm1 = BatchNorm(in_layers=[gc1, self.training_placeholder], name=name+'bn1')
    gp1 = GraphPool(max_degree=3, in_layers=[batch_norm1, degree_slice, membership] + deg_adjs, name=name+'gp1')
    gc2 = GraphConv(
        64,
        max_deg=3,
        activation_fn=tf.nn.relu,
        in_layers=[gp1, degree_slice, membership] + deg_adjs, name=name+'gc2')
    batch_norm2 = BatchNorm(in_layers=[gc2, self.training_placeholder], name=name+'bn2')
    gp2 = GraphPool(max_degree=3, in_layers=[batch_norm2, degree_slice, membership] + deg_adjs, name=name+'gp2')
    # Bug fix: floor division — out_channels must be an integer; `/` yields
    # a float under Python 3.
    dense = Dense(out_channels=self.embedding_length // 2, activation_fn=tf.nn.relu, in_layers=[gp2], name=name+'dense1')
    batch_norm3 = BatchNorm(in_layers=[dense, self.training_placeholder], name=name+'bn3')
    readout = GraphGather(
        batch_size=21,
        activation_fn=tf.nn.tanh,
        in_layers=[batch_norm3, degree_slice, membership] + deg_adjs, name=name+'gg')
    padding = AminoAcidPad(
        embedding_length=self.embedding_length,
        in_layers=[readout], name=name+'pad')
    return padding
Exemplo n.º 9
0
def load_inference_data(dataset_file, n_tasks, tempdir, smiles_field='Smiles', id_field='Compound_No'):
    """
    Featurize a csv of compounds for graph-convolution inference.

    :param dataset_file: path to the csv files containing the data we want to predict
    :param n_tasks: number of prediction tasks (determines the task column names)
    :param tempdir: directory where ConvMol datasets will be temporarily stored
    :param smiles_field: column name in the csv giving the SMILES of the compounds
    :param id_field: column name in the csv giving the identifier for the molecules
    :return: list of tasks and the ConvMol datasets
    """
    task_names = ['Value_%i' % i for i in range(n_tasks)]

    # Featurize the dataset for Graph Convolutional architecture
    loader = CSVLoader(tasks=task_names,
                       smiles_field=smiles_field,
                       featurizer=ConvMolFeaturizer(),
                       id_field=id_field)
    dataset = loader.featurize(dataset_file, shard_size=8192, data_dir=tempdir)

    return task_names, dataset
Exemplo n.º 10
0
def load_data(dataset_file,
              split_field=None,
              smiles_field='Smiles',
              y_field=None,
              id_field='Compound_No',
              tempdir=op.join(RESULTS_DIR, 'DCGraphConv', 'to_check'),
              cv=True):
    """
    Read a csv of compounds and featurize it into ConvMol datasets,
    optionally split into cross-validation folds.

    :param dataset_file: path to the csv file containing the data to read
    :param split_field: column name in the csv giving the fold assignment for CV. Optional
    :param smiles_field: column name in the csv giving the SMILES of the compounds
    :param y_field: column name in the csv giving the experimental value to learn. Optional (in case of prospective
    predictions)
    :param id_field: column name in the csv giving the molecule id
    :param tempdir: where the shards and featurized data will be temporarily stored
    :param cv: bool. whether we are splitting by folds or not (in which case split_field is used as fold assignment
    indicator)
    :return: list of tasks and the list of ConvMol datasets (one dataset per group in split_field)
    """
    tasks = [] if y_field is None else [y_field]
    loader = CSVLoader(tasks=tasks,
                       smiles_field=smiles_field,
                       featurizer=ConvMolFeaturizer(),
                       id_field=id_field)
    dataset = loader.featurize(dataset_file, shard_size=8192, data_dir=tempdir)

    # Without cross-validation (or without a fold column) return the whole
    # featurized dataset.
    if not cv or split_field is None:
        return tasks, dataset

    # Split the featurized dataset into one sub-dataset per fold value,
    # each stored under a fresh temporary directory.
    annotations = pd.read_csv(dataset_file, sep=',')
    folds_datasets = []
    fold_dirs = []
    for fold in np.unique(annotations[split_field].tolist()):
        fold_dir = tempfile.mkdtemp(prefix=tempdir + '/')
        row_indices = np.flatnonzero(annotations[split_field] == fold)
        folds_datasets.append(dataset.select(row_indices, select_dir=fold_dir))
        fold_dirs.append(fold_dir)

    return tasks, folds_datasets, fold_dirs
Exemplo n.º 11
0
    def generate_conv_mol(self):
        """Parse the SMILES in self.data, add explicit hydrogens, and
        featurize the molecules with ConvMolFeaturizer, printing a summary
        of the resulting array.
        """
        self.log.info(
            "Generating Molecular Convolutions, may take a few moments ...")

        mol_list = []

        for index, row in self.data.iterrows():
            mol = Chem.MolFromSmiles(row["SMILE"])
            # Bug fix: Chem.AddHs returns a NEW molecule rather than
            # modifying in place; the original discarded the result, so
            # hydrogens were never actually added.
            mol = Chem.AddHs(mol)
            #            AllChem.UFFOptimizeMolecule(mol, confId=id)
            mol_list.append(mol)

        conv_array = ConvMolFeaturizer().featurize(mol_list)
        print(
            "len(conv_array)",
            len(conv_array),
            "type(conv_array)",
            type(conv_array),
            "type(conv_array[0]",
            type(conv_array[0]),
            "get_atom_features",
            conv_array[0].get_atom_features().shape,
        )
from deepchem.feat  import ConvMolFeaturizer


#from tensorflow.keras.models import Sequential
import os
# Make the Graphviz binaries discoverable on Windows (used for plotting).
# NOTE(review): hard-coded install path — confirm it matches the local setup.
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

# Seed value, presumably for reproducibility downstream — not applied to
# any RNG in this fragment; TODO confirm where it is used.
seed = 1

#reload(p)
# Windows path separator and input file names for this script.
suffix="\\"
#file="FinalCleaned.csv"
file="WeiReactionsAndAllFps.pkl"
filename="FinalAllcsv.csv"

# Graph-convolution featurizer shared by the processing steps below.
featurizer=ConvMolFeaturizer()

#pCSV = p.processCSV(filename)



#readOriginalDataSetAndCreatePickledSmilesOnly
#pCSV.readPickleAndTranslateToMols()


#pCSV = p.processCSV(filename)
#pCSV = p.processCSV(file)
#pCSV.readOriginalDataSetAndCreatePickledSmilesOnly(file)
#df=pickle.load(open("WeiReactionsAndAllFps.pkl",'rb'))
# Load the precomputed reaction/fingerprint frame from the working directory.
df=pd.read_pickle("./WeiReactionsNeuralFp.pkl")