Example #1
def predict_mols(self, mols):
    # Featurize the molecules as circular (Morgan) fingerprints of length
    # self.n_features, radius 2, with chirality information included.
    featurizer = CircularFingerprint(
        size=self.n_features, radius=2, chiral=True)
    # Stack two identical copies of the fingerprints along a new axis 1.
    features = np.expand_dims(featurizer.featurize(mols), axis=1)
    features = np.concatenate([features, features], axis=1)
    # Wrap the features in a dataset with no labels, weights, or ids.
    ds = NumpyDataset(features, None, None, None)
    return self.predict(ds)[0][:, 0]
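For reference, a minimal standalone sketch of the same featurization steps, assuming DeepChem and RDKit are installed and using 2048 as an illustrative stand-in for self.n_features:

import numpy as np
from rdkit import Chem
from deepchem.feat import CircularFingerprint
from deepchem.data import NumpyDataset

# Featurize two molecules and stack the fingerprints the way predict_mols does.
mols = [Chem.MolFromSmiles('CCO'), Chem.MolFromSmiles('c1ccccc1')]
featurizer = CircularFingerprint(size=2048, radius=2, chiral=True)
features = np.expand_dims(featurizer.featurize(mols), axis=1)  # shape (2, 1, 2048)
features = np.concatenate([features, features], axis=1)        # shape (2, 2, 2048)
ds = NumpyDataset(features, None, None, None)  # X only; no y, w, or ids
print(ds.X.shape)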
Example #2
def test_circularfingerprint_hashable(self):
    featurizer1 = CircularFingerprint()
    featurizer2 = CircularFingerprint()
    featurizer3 = CircularFingerprint(size=5)

    # Featurizers constructed with identical arguments should hash equally,
    # so the set collapses featurizer1 and featurizer2 into a single entry.
    d = set()
    d.add(featurizer1)
    d.add(featurizer2)
    d.add(featurizer3)

    self.assertEqual(2, len(d))
    featurizers = [featurizer1, featurizer2, featurizer3]

    # Membership is resolved through __hash__ and __eq__, so every
    # featurizer should be found in the set.
    for featurizer in featurizers:
        self.assertTrue(featurizer in d)
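The assertions above imply that CircularFingerprint defines __hash__ and __eq__ over its constructor arguments, so instances built with the same settings are interchangeable as set members or dictionary keys. A small sketch of that behavior outside the test harness:

from deepchem.feat import CircularFingerprint

# Two featurizers built with identical arguments compare equal and share a hash.
a = CircularFingerprint()
b = CircularFingerprint()
c = CircularFingerprint(size=5)
print(a == b, hash(a) == hash(b))  # expected: True True
print(len({a, b, c}))              # expected: 2, since a and b collapse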
Example #3
def test_circular_fingerprints_with_1024(self):
    """
    Test CircularFingerprint with a 1024-bit size.
    """
    featurizer = CircularFingerprint(size=1024)
    rval = featurizer([self.mol])
    assert rval.shape == (1, 1024)
Example #4
def test_circular_fingerprints(self):
    """
    Test CircularFingerprint with the default 2048-bit size.
    """
    featurizer = CircularFingerprint()
    rval = featurizer([self.mol])
    assert rval.shape == (1, 2048)
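The tests above call the featurizer on RDKit Mol objects. Recent DeepChem releases also accept SMILES strings in the same call; this is version-dependent, so treat the sketch below as an assumption to verify against your installed version:

from deepchem.feat import CircularFingerprint

featurizer = CircularFingerprint(size=1024)
fp = featurizer(['CCO'])  # SMILES input; accepted by recent DeepChem versions
print(fp.shape)           # expected: (1, 1024)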
Example #5
def test_sparse_circular_fingerprints(self):
    """
    Test CircularFingerprint with sparse encoding.
    """
    featurizer = CircularFingerprint(sparse=True)
    rval = featurizer([self.mol])
    assert rval.shape == (1,)
    assert isinstance(rval[0], dict)
    assert len(rval[0])
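With sparse=True the featurizer returns one dict per molecule instead of a fixed-length vector. The test only asserts that the dict is non-empty; in current DeepChem versions the keys are Morgan fragment (hash) ids and the values are occurrence counts, which the sketch below assumes:

from rdkit import Chem
from deepchem.feat import CircularFingerprint

featurizer = CircularFingerprint(sparse=True)
fp = featurizer([Chem.MolFromSmiles('CCO')])
print(fp.shape)                  # (1,)
print(list(fp[0].items())[:3])   # a few (fragment_id, count) pairs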
Example #6
def test_sparse_circular_fingerprints_with_smiles(self):
    """
    Test CircularFingerprint with sparse encoding and SMILES for each
    fragment.
    """
    featurizer = CircularFingerprint(sparse=True, smiles=True)
    rval = featurizer([self.mol])
    assert rval.shape == (1,)
    assert isinstance(rval[0], dict)
    assert len(rval[0])

    # Check for separate count and SMILES entries for each fragment.
    for fragment_id, value in rval[0].items():
        assert 'count' in value
        assert 'smiles' in value
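Adding smiles=True nests each entry one level deeper: per the assertions above, every fragment id maps to a dict holding a 'count' key and a 'smiles' key. A minimal sketch:

from rdkit import Chem
from deepchem.feat import CircularFingerprint

featurizer = CircularFingerprint(sparse=True, smiles=True)
fp = featurizer([Chem.MolFromSmiles('CCO')])
fragment_id, value = next(iter(fp[0].items()))
print(fragment_id, value['count'], value['smiles'])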
Example #7
def data_split(self, test=0.2, val=0, split="random", add_molecule_to_testset=None, scaler=None, random=None):
    """
        Take in a data frame, the target column name (exp).
    Returns a numpy array with the target variable,
    a numpy array (matrix) of feature variables,
    and a list of strings of the feature headers.

    :param self:
    :param test:
    :param val:
    :param split:
            Keywords: 'random', 'index', 'scaffold'. Default is 'random'
    :param add_molecule_to_testset:
    :param scaler:
            Keywords:  None, 'standard', 'minmax'. Default is None
    :param random:
    :return:
    """
    self.test_percent = test  # Fraction of data held out for testing
    self.val_percent = val  # Fraction of data held out for validation
    self.target_array = np.array(self.data[self.target_name])  # Target array instance
    scaler_method = scaler
    if scaler == "standard":  # Determine the data scaling method
        scaler_method = StandardScaler()
    elif scaler == "minmax":
        scaler_method = MinMaxScaler()

    self.scaler_method = scaler  # Record the scaler keyword on the instance

    if self.dataset in ['sider.csv', 'clintox.csv']:  # Drop specific columns for specific datasets
        features_df = __dropCol__(df=self.data, target_name=self.target_name, keepSmiles=True)
    else:
        features_df = __dropCol__(df=self.data, target_name=self.target_name)

    # if dropSmiles is not None:
    #     dropSmiles = [Chem.MolToSmiles(Chem.MolFromSmiles(i)) for i in dropSmiles]
    #     features_df = features_df[~features_df.isin(dropSmiles)]

    self.feature_list = list(features_df.columns)  # Save the list of feature column names
    self.feature_length = len(self.feature_list)  # Save the number of features
    self.feature_array = np.array(features_df)  # Save the feature array instance
    self.n_tot = self.feature_array.shape[0]  # Save the total number of data points

    molecules_array = np.array(self.data['smiles'])  # Grab molecules array

    self.train_percent = 1 - self.test_percent - self.val_percent  # Train percent

    temp_data = self.data

    canonMolToAdd = []
    if add_molecule_to_testset is not None:  # For specific molecules to add to testset
        for i in add_molecule_to_testset:
            to_mol = Chem.MolFromSmiles(i)
            if to_mol is not None:
                canonMolToAdd.append(Chem.MolToSmiles(to_mol))
        add_data = self.data[self.data['smiles'].isin(canonMolToAdd)]  # Rows for the forced test SMILES
        add_data_molecules = np.array(add_data['smiles'])  # Molecule array to be added
        add_features_array = np.array(__dropCol__(add_data, self.target_name))  # Feature array to be added
        add_target_array = np.array(add_data[self.target_name])  # Target array to be added
        temp_data = self.data[~self.data['smiles'].isin(canonMolToAdd)]  # Remaining data without the forced test molecules

    # Generate fingerprints so we can use DeepChem's splitting utilities
    # (scaffold splitting in particular requires featurized molecules).
    featurizer = CircularFingerprint(size=2048)

    # Load the csv into DeepChem
    loader = dc.data.CSVLoader(tasks=[self.target_name], smiles_field="smiles", featurizer=featurizer)

    dataset = loader.featurize(self.dataset)  # Featurize the dataset

    split_dict = {"random": dc.splits.RandomSplitter(), "scaffold": dc.splits.ScaffoldSplitter(),
                  "index": dc.splits.IndexSplitter()}  # Dictionary of different data splitting methods
    split_name_dict = {"random": "RandomSplit", "scaffold": "ScaffoldSplit", "index": "IndexSplit"}
    try:
        splitter = split_dict[split]
        self.split_method = split_name_dict[split]
    except KeyError:
        raise Exception("""Invalid splitting method. Please enter either "random", "scaffold", or "index".""")
    if val == 0:

        train_dataset, test_dataset = splitter.train_test_split(dataset, frac_train=round(1-test, 1),
                                                                seed=self.random_seed)
    else:
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset, frac_train=1-test-val,
                                                                                     frac_test=test, frac_valid=val,
                                                                                     seed=self.random_seed)
    # All training related data
    train_molecules = []
    for train_smiles in train_dataset.ids:
        train_to_mol = Chem.MolFromSmiles(train_smiles)
        if train_to_mol is not None:
            train_molecules.append(Chem.MolToSmiles(train_to_mol))

    train_molecules = list(OrderedDict.fromkeys(train_molecules))
    train_df = temp_data[temp_data['smiles'].isin(train_molecules)]
    train_df = train_df.drop_duplicates()  # Drop duplicates in Dataframe
    self.train_molecules = np.array(train_df['smiles'])
    self.train_features = np.array(__dropCol__(df=train_df, target_name=self.target_name))
    self.n_train = self.train_features.shape[0]
    self.train_target = np.array(train_df[self.target_name])

    # All testing related data
    test_molecules = []
    for test_smiles in test_dataset.ids:
        test_to_mol = Chem.MolFromSmiles(test_smiles)
        if test_to_mol is not None:
            test_molecules.append(Chem.MolToSmiles(test_to_mol))
    test_molecules = list(OrderedDict.fromkeys(test_molecules))  # Drop duplicates in list

    test_df = temp_data[temp_data['smiles'].isin(test_molecules)]
    test_df = test_df.drop_duplicates()  # Drop duplicates in Dataframe
    self.test_molecules = np.array(test_df['smiles'])
    self.test_features = np.array(__dropCol__(df=test_df, target_name=self.target_name))
    self.test_target = np.array(test_df[self.target_name])
    if add_molecule_to_testset is not None:  # If there are specific SMILES to add
        self.test_features = np.concatenate([self.test_features, add_features_array])
        self.test_molecules = np.concatenate([self.test_molecules, add_data_molecules])
        self.test_target = np.concatenate([self.test_target, add_target_array])
    self.n_test = self.test_features.shape[0]

    # All validating related data
    if val != 0:
        val_molecules = []
        for smiles in valid_dataset.ids:
            val_to_mol = Chem.MolFromSmiles(smiles)
            if val_to_mol is not None:
                val_molecules.append(Chem.MolToSmiles(val_to_mol))

        val_molecules = list(OrderedDict.fromkeys(val_molecules))
        val_df = temp_data[temp_data['smiles'].isin(val_molecules)]
        val_df = val_df.drop_duplicates()
        self.val_molecules = np.array(val_df['smiles'])
        self.val_features = np.array(__dropCol__(df=val_df, target_name=self.target_name))
        self.val_target = np.array(val_df[self.target_name])
        self.n_val = self.val_features.shape[0]

    if self.algorithm != "cnn" and scaler is not None:
        self.train_features = scaler_method.fit_transform(self.train_features)
        self.test_features = scaler_method.transform(self.test_features)
        if val > 0:
            self.val_features = scaler_method.transform(self.val_features)
    elif self.algorithm == "cnn" and scaler is not None:
        # Can scale data 1d, 2d and 3d data
        self.train_features = scaler_method.fit_transform(self.train_features.reshape(-1,
                                                                                    self.train_features.shape[
                                                                                        -1])).reshape(
            self.train_features.shape)

        self.test_features = scaler_method.transform(self.test_features.reshape(-1,
                                                                              self.test_features.shape[-1])).reshape(
            self.test_features.shape)
        if val > 0:
            self.val_features = scaler_method.transform(self.val_features.reshape(-1,
                                                                                self.val_features.shape[-1])).reshape(
                self.val_features.shape)
    ptrain = self.n_train / self.n_tot * 100
    ptest = self.n_test / self.n_tot * 100
    pval = self.n_val / self.n_tot * 100 if val > 0 else 0
    print(
        'Dataset of {} points is split into training ({:.1f}%), validation ({:.1f}%), and testing ({:.1f}%).'.format(
            self.n_tot, ptrain, pval, ptest))

    self.in_shape = self.feature_array.shape[1]
    # Logic to separate data into test/train/val sets
    def __fetch_set__(smiles):
        if smiles in self.test_molecules:
            return 'test'
        elif smiles in self.train_molecules:
            return 'train'
        else:
            return 'val'

    self.data['in_set'] = self.data['smiles'].apply(__fetch_set__)

    cols = list(self.data.columns)
    cols.remove('in_set')
    self.data = self.data[['in_set', *cols]]

    # Print the data shape distribution.

    print('Total Feature Shape:', self.feature_array.shape)
    print('Total Target Shape:', self.target_array.shape)
    print()
    print('Training Features Shape:', self.train_features.shape)
    print('Training Target Shape:', self.train_target.shape)
    print()
    print('Test Features Shape:', self.test_features.shape)
    print('Test Target Shape:', self.test_target.shape)
    print()
    if val > 0:
        print('Val Features Shape:', self.val_features.shape)
        print('Val Target Shape:', self.val_target.shape)
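To illustrate the splitter-selection pattern at the heart of data_split, here is a self-contained sketch on a toy dataset (values and fractions are illustrative; ScaffoldSplitter is omitted because it requires SMILES ids):

import numpy as np
import deepchem as dc

# Build a toy dataset and pick a splitter by keyword, as data_split does.
X = np.random.rand(10, 4)
y = np.random.rand(10, 1)
dataset = dc.data.NumpyDataset(X, y)

split_dict = {"random": dc.splits.RandomSplitter(),
              "index": dc.splits.IndexSplitter()}
splitter = split_dict["random"]
train, valid, test = splitter.train_valid_test_split(
    dataset, frac_train=0.7, frac_valid=0.1, frac_test=0.2, seed=42)
print(len(train), len(valid), len(test))  # expected: 7 1 2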