Пример #1
0
def test_ensemble_model():
    """Check that ``ensemble_model`` averages the predictions of its members.

    Two cases are covered:

    * an ensemble of a random forest and a neural network must predict the
      mean of the two members' predictions, and its ``score`` must equal
      ``r2_score`` computed on those predictions;
    * an ensemble of a single model must predict and score exactly like
      that model.
    """
    X = np.vstack((np.arange(30, 10, -2, dtype='float64'),
                   np.arange(100, 90, -1, dtype='float64'))).T

    Y = np.arange(10, dtype='float64')

    rf = regressors.randomforest(random_state=42)
    nn = regressors.neuralnetwork(solver='lbfgs', random_state=42)
    ensemble = ensemble_model((rf, nn))

    # We do not need to fit the underlying models separately: they change
    # when we fit the ensemble, and the assertions below rely on that.
    ensemble.fit(X, Y)

    pred = ensemble.predict(X)
    mean_pred = np.vstack((rf.predict(X), nn.predict(X))).mean(axis=0)
    assert_array_almost_equal(pred, mean_pred)
    assert_almost_equal(ensemble.score(X, Y), r2_score(Y, pred))

    # An ensemble of a single model should behave exactly like that model.
    # Use the same ``regressors`` namespace as above so the test does not
    # depend on a separately imported bare ``neuralnetwork`` name.
    nn = regressors.neuralnetwork(solver='lbfgs', random_state=42)
    ensemble = ensemble_model((nn, ))
    ensemble.fit(X, Y)
    assert_array_almost_equal(ensemble.predict(X), nn.predict(X))
    assert_almost_equal(ensemble.score(X, Y), nn.score(X, Y))
Пример #2
0
 def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
     """Set up the RF-Score scorer for the requested variant.

     Stores the arguments on the instance, builds a random forest
     (500 estimators, ``oob_score=True``), selects the descriptors
     according to ``version`` and passes both to the parent initializer
     together with a ``'rfscore_v<version>'`` score title.

     Parameters
     ----------
     protein : object, optional
         Passed unchanged to the descriptor generators.

     n_jobs : int (default=-1)
         Forwarded to the random forest as ``n_jobs``.

     version : int (default=1)
         Descriptor variant:
         1 - close contacts with a single cutoff of 12;
         2 - close contacts with cutoffs 0, 2, 4, 6, 8, 10, 12;
         3 - the version-1 close contacts combined with
         ``autodock_vina_descriptor`` through ``ensemble_descriptor``.

     spr : int (default=0)
         Stored as ``self.spr``; not otherwise used in this method.

     **kwargs
         Extra keyword arguments forwarded to ``randomforest``.

     Raises
     ------
     ValueError
         If ``version`` is not 1, 2 or 3.
     """
     self.protein = protein
     self.n_jobs = n_jobs
     self.version = version
     self.spr = spr
     model = randomforest(n_estimators=500,
                          oob_score=True,
                          n_jobs=n_jobs,
                          **kwargs)
     # Versions 1 and 3 share the same close-contacts setup; version 2 only
     # differs in using several cutoffs instead of one.
     if version in (1, 3):
         cutoff = 12
     elif version == 2:
         cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
     else:
         # Previously an unknown version fell through every branch and
         # failed later with an UnboundLocalError on ``descriptors``.
         raise ValueError('Unsupported RF-Score version: %r '
                          '(expected 1, 2 or 3)' % (version,))
     descriptors = close_contacts(protein,
                                  cutoff=cutoff,
                                  protein_types=protein_atomic_nums,
                                  ligand_types=ligand_atomic_nums)
     if version == 3:
         vina = autodock_vina_descriptor(protein)
         descriptors = ensemble_descriptor((vina, descriptors))
     super(rfscore, self).__init__(model,
                                   descriptors,
                                   score_title='rfscore_v%i' % self.version)
Пример #3
0
def test_regressors():
    """Fit each regressor on a tiny two-column data set and sanity-check it.

    For every regressor the test requires that the training-set predictions
    are within 1 of the targets, that ``score`` exceeds 0.9, and that a
    pickle round trip yields a model with the same predictions.
    """
    first_column = np.arange(30, 10, -2, dtype='float64')
    second_column = np.arange(100, 90, -1, dtype='float64')
    X = np.vstack((first_column, second_column)).T

    Y = np.arange(10, dtype='float64')

    # Seed the global NumPy RNG before any model is constructed.
    np.random.seed(42)

    models = (
        regressors.svm(C=10),
        regressors.randomforest(random_state=42),
        regressors.neuralnetwork(solver='lbfgs',
                                 random_state=42,
                                 hidden_layer_sizes=(20, 20)),
        regressors.mlr(),
    )

    for model in models:
        model.fit(X, Y)

        pred = model.predict(X)
        abs_errors = np.abs(pred.flatten() - Y)
        assert_true((abs_errors < 1).all())
        assert_greater(model.score(X, Y), 0.9)

        # A pickled-and-restored model must predict the same values.
        restored = pickle.loads(pickle.dumps(model))
        assert_array_almost_equal(pred, restored.predict(X))
Пример #4
0
 def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
     """Set up the RF-Score scorer for the requested variant.

     Stores the arguments on the instance, builds a random forest
     (500 estimators, ``oob_score=True``), selects the descriptors
     according to ``version`` and passes both to the parent initializer
     with the score title ``'rfscore'``.

     Parameters
     ----------
     protein : object, optional
         Passed unchanged to the descriptor generators.

     n_jobs : int (default=-1)
         Forwarded to the random forest as ``n_jobs``.

     version : int (default=1)
         Descriptor variant:
         1 - close contacts with a single cutoff of 12;
         2 - close contacts with cutoffs 0, 2, 4, 6, 8, 10, 12;
         3 - close contacts (cutoff 12) combined with
         ``autodock_vina_descriptor`` through ``ensemble_descriptor``.

     spr : int (default=0)
         Stored as ``self.spr``; not otherwise used in this method.

     **kwargs
         Extra keyword arguments forwarded to ``randomforest``.

     Raises
     ------
     ValueError
         If ``version`` is not 1, 2 or 3.
     """
     self.protein = protein
     self.n_jobs = n_jobs
     self.version = version
     self.spr = spr
     model = randomforest(n_estimators=500,
                          oob_score=True,
                          n_jobs=n_jobs,
                          **kwargs)
     if version == 1:
         descriptors = close_contacts(protein,
                                      cutoff=12,
                                      protein_types=protein_atomic_nums,
                                      ligand_types=ligand_atomic_nums)
     elif version == 2:
         descriptors = close_contacts(protein,
                                      cutoff=np.array([0, 2, 4, 6, 8, 10, 12]),
                                      protein_types=protein_atomic_nums,
                                      ligand_types=ligand_atomic_nums)
     elif version == 3:
         cc = close_contacts(protein,
                             cutoff=12,
                             protein_types=protein_atomic_nums,
                             ligand_types=ligand_atomic_nums)
         vina = autodock_vina_descriptor(protein)
         descriptors = ensemble_descriptor((vina, cc))
     else:
         # Previously an unknown version fell through every branch and
         # failed later with an UnboundLocalError on ``descriptors``.
         raise ValueError('Unsupported RF-Score version: %r '
                          '(expected 1, 2 or 3)' % (version,))
     super(rfscore, self).__init__(model, descriptors, score_title='rfscore')
Пример #5
0
 def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
     """Set up the RF-Score scorer for the requested variant.

     Stores the arguments on the instance, selects the descriptors and the
     forest's ``max_features`` value according to ``version``, builds the
     random forest and passes model and descriptors to the parent
     initializer together with a ``'rfscore_v<version>'`` score title.

     Parameters
     ----------
     protein : object, optional
         Passed unchanged to the descriptor generators.

     n_jobs : int (default=-1)
         Forwarded to the random forest as ``n_jobs``.

     version : int (default=1)
         Descriptor variant:
         1 - close contacts with a single cutoff of 12 (max_features 6);
         2 - close contacts with cutoffs 0, 2, 4, 6, 8, 10, 12
         (max_features 14);
         3 - the version-1 close contacts combined with six Vina partial
         scores from ``oddt_vina_descriptor`` (max_features 6).

     spr : int (default=0)
         Stored as ``self.spr``; not otherwise used in this method.

     **kwargs
         Extra keyword arguments forwarded to ``randomforest``.

     Raises
     ------
     ValueError
         If ``version`` is not 1, 2 or 3.
     """
     self.protein = protein
     self.n_jobs = n_jobs
     self.version = version
     self.spr = spr
     # Versions 1 and 3 share the close-contacts setup and mtry value.
     if version in (1, 3):
         cutoff = 12
         mtry = 6
     elif version == 2:
         cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
         mtry = 14
     else:
         # Previously an unknown version fell through every branch and
         # failed later with an UnboundLocalError on ``mtry``.
         raise ValueError('Unsupported RF-Score version: %r '
                          '(expected 1, 2 or 3)' % (version,))
     descriptors = close_contacts_descriptor(
         protein,
         cutoff=cutoff,
         protein_types=protein_atomic_nums,
         ligand_types=ligand_atomic_nums)
     if version == 3:
         vina_scores = [
             'vina_gauss1', 'vina_gauss2', 'vina_repulsion',
             'vina_hydrophobic', 'vina_hydrogen', 'vina_num_rotors'
         ]
         vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
         descriptors = ensemble_descriptor((vina, descriptors))
     model = randomforest(n_estimators=500,
                          oob_score=True,
                          n_jobs=n_jobs,
                          max_features=mtry,
                          bootstrap=True,
                          min_samples_split=6,
                          **kwargs)
     super(rfscore, self).__init__(model,
                                   descriptors,
                                   score_title='rfscore_v%i' % self.version)
Пример #6
0
    def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
        """Scoring function implementing RF-Score variants. It predicts the
        binding affinity (pKi/d) of ligand in a complex utilizing simple
        descriptors (close contacts of atoms <12A) with sophisticated
        machine-learning model (random forest). The third variant supplements
        those contacts with Vina partial scores. For further details see
        RF-Score publications v1[1]_, v2[2]_, v3[3]_.


        Parameters
        ----------
        protein : oddt.toolkit.Molecule object
            Receptor for the scored ligands

        n_jobs: int (default=-1)
            Number of cores to use for scoring and training. By default (-1)
            all cores are allocated.

        version: int (default=1)
            Scoring function variant. The default is the simplest one (v1).
            Supported values are 1, 2 and 3.

        spr: int (default=0)
            The minimum number of contacts in each pair of atom types in
            the training set for the column to be included in training.
            This is a way of removal of not frequent and empty contacts.

        **kwargs
            Extra keyword arguments forwarded to the random forest model.

        Raises
        ------
        ValueError
            If ``version`` is not 1, 2 or 3.

        References
        ----------
        .. [1] Ballester PJ, Mitchell JBO. A machine learning approach to
            predicting protein-ligand binding affinity with applications to
            molecular docking. Bioinformatics. 2010;26: 1169-1175.
            doi:10.1093/bioinformatics/btq112

        .. [2] Ballester PJ, Schreyer A, Blundell TL. Does a more precise
            chemical description of protein-ligand complexes lead to more
            accurate prediction of binding affinity? J Chem Inf Model. 2014;54:
            944-955. doi:10.1021/ci500091r

        .. [3] Li H, Leung K-S, Wong M-H, Ballester PJ. Improving AutoDock Vina
            Using Random Forest: The Growing Accuracy of Binding Affinity
            Prediction by the Effective Exploitation of Larger Data Sets.
            Mol Inform. WILEY-VCH Verlag; 2015;34: 115-126.
            doi:10.1002/minf.201400132

        """
        self.protein = protein
        self.n_jobs = n_jobs
        self.version = version
        self.spr = spr
        # ``mtry`` is handed to the forest below as ``max_features``.
        if version == 1:
            mtry = 6
            descriptors = close_contacts_descriptor(
                protein,
                cutoff=12,
                protein_types=protein_atomic_nums,
                ligand_types=ligand_atomic_nums)
        elif version == 2:
            mtry = 14
            descriptors = close_contacts_descriptor(
                protein,
                cutoff=np.array([0, 2, 4, 6, 8, 10, 12]),
                protein_types=protein_atomic_nums,
                ligand_types=ligand_atomic_nums)
        elif version == 3:
            mtry = 6
            cc = close_contacts_descriptor(
                protein,
                cutoff=12,
                protein_types=protein_atomic_nums,
                ligand_types=ligand_atomic_nums)
            vina_scores = ['vina_gauss1',
                           'vina_gauss2',
                           'vina_repulsion',
                           'vina_hydrophobic',
                           'vina_hydrogen',
                           'vina_num_rotors']
            vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
            descriptors = ensemble_descriptor((vina, cc))
        else:
            # Previously an unknown version fell through every branch and
            # failed later with an UnboundLocalError on ``mtry``.
            raise ValueError('Unsupported RF-Score version: %r '
                             '(expected 1, 2 or 3)' % (version,))
        model = randomforest(n_estimators=500,
                             oob_score=True,
                             n_jobs=n_jobs,
                             max_features=mtry,
                             bootstrap=True,
                             min_samples_split=6,
                             **kwargs)
        super(rfscore, self).__init__(model, descriptors,
                                      score_title='rfscore_v%i' % self.version)
Пример #7
0
 def __init__(self, protein=None, n_jobs=-1, **kwargs):
     """Set up the RF-Score scorer.

     Stores ``protein`` and ``n_jobs`` on the instance, builds a random
     forest (500 estimators, ``oob_score=True``; extra ``kwargs`` are
     forwarded to it) and a close-contacts descriptor generator, then
     passes both to the parent initializer with the title ``'rfscore'``.
     """
     self.protein = protein
     self.n_jobs = n_jobs
     forest = randomforest(
         n_estimators=500,
         oob_score=True,
         n_jobs=n_jobs,
         **kwargs
     )
     # NOTE(review): ``cutoff`` is not defined in this method; presumably a
     # module-level constant - verify against the enclosing module.
     contacts = close_contacts(
         protein,
         cutoff=cutoff,
         protein_types=protein_atomic_nums,
         ligand_types=ligand_atomic_nums
     )
     super(rfscore, self).__init__(forest, contacts, score_title='rfscore')
Пример #8
0
    assert cls.score(X, Y) == 1.0

    prob = cls.predict_proba(X)
    assert_array_almost_equal(prob, [[0, 1]] * 5 + [[1, 0]] * 5, decimal=1)
    log_prob = cls.predict_log_proba(X)
    assert_array_almost_equal(np.log(prob), log_prob)

    pickled = pickle.dumps(cls)
    reloaded = pickle.loads(pickled)
    prob_reloaded = reloaded.predict_proba(X)
    assert_array_almost_equal(prob, prob_reloaded)


@pytest.mark.parametrize('reg', [
    regressors.svm(C=10),
    regressors.randomforest(random_state=42),
    regressors.neuralnetwork(solver='lbfgs',
                             random_state=42,
                             hidden_layer_sizes=(20, 20)),
    regressors.mlr(),
])
def test_regressors(reg):
    """Parametrized regressor test, run once per model listed above.

    The part of the body visible here only prepares the two-column
    training matrix ``X``, the targets ``Y`` and seeds NumPy's global RNG.
    """
    first_column = np.arange(30, 10, -2, dtype='float64')
    second_column = np.arange(100, 90, -1, dtype='float64')
    X = np.vstack((first_column, second_column)).T

    Y = np.arange(10, dtype='float64')

    np.random.seed(42)
Пример #9
0
    def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
        """Scoring function implementing RF-Score variants. It predicts the
        binding affinity (pKi/d) of ligand in a complex utilizing simple
        descriptors (close contacts of atoms <12A) with sophisticated
        machine-learning model (random forest). The third variant supplements
        those contacts with Vina partial scores. For further details see
        RF-Score publications v1[1]_, v2[2]_, v3[3]_.


        Parameters
        ----------
        protein : oddt.toolkit.Molecule object
            Receptor for the scored ligands

        n_jobs: int (default=-1)
            Number of cores to use for scoring and training. By default (-1)
            all cores are allocated.

        version: int (default=1)
            Scoring function variant. The default is the simplest one (v1).
            Supported values are 1, 2 and 3.

        spr: int (default=0)
            The minimum number of contacts in each pair of atom types in
            the training set for the column to be included in training.
            This is a way of removal of not frequent and empty contacts.

        **kwargs
            Extra keyword arguments forwarded to the random forest model.

        Raises
        ------
        ValueError
            If ``version`` is not 1, 2 or 3.

        References
        ----------
        .. [1] Ballester PJ, Mitchell JBO. A machine learning approach to
            predicting protein-ligand binding affinity with applications to
            molecular docking. Bioinformatics. 2010;26: 1169-1175.
            doi:10.1093/bioinformatics/btq112

        .. [2] Ballester PJ, Schreyer A, Blundell TL. Does a more precise
            chemical description of protein-ligand complexes lead to more
            accurate prediction of binding affinity? J Chem Inf Model. 2014;54:
            944-955. doi:10.1021/ci500091r

        .. [3] Li H, Leung K-S, Wong M-H, Ballester PJ. Improving AutoDock Vina
            Using Random Forest: The Growing Accuracy of Binding Affinity
            Prediction by the Effective Exploitation of Larger Data Sets.
            Mol Inform. WILEY-VCH Verlag; 2015;34: 115-126.
            doi:10.1002/minf.201400132

        """
        self.protein = protein
        self.n_jobs = n_jobs
        self.version = version
        self.spr = spr
        # Versions 1 and 3 share the close-contacts setup and mtry value;
        # ``mtry`` is handed to the forest below as ``max_features``.
        if version in (1, 3):
            cutoff = 12
            mtry = 6
        elif version == 2:
            cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
            mtry = 14
        else:
            # Previously an unknown version fell through every branch and
            # failed later with an UnboundLocalError on ``mtry``.
            raise ValueError('Unsupported RF-Score version: %r '
                             '(expected 1, 2 or 3)' % (version,))
        descriptors = close_contacts_descriptor(
            protein,
            cutoff=cutoff,
            protein_types=protein_atomic_nums,
            ligand_types=ligand_atomic_nums)
        if version == 3:
            vina_scores = [
                'vina_gauss1', 'vina_gauss2', 'vina_repulsion',
                'vina_hydrophobic', 'vina_hydrogen', 'vina_num_rotors'
            ]
            vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
            descriptors = ensemble_descriptor((vina, descriptors))
        model = randomforest(n_estimators=500,
                             oob_score=True,
                             n_jobs=n_jobs,
                             max_features=mtry,
                             bootstrap=True,
                             min_samples_split=6,
                             **kwargs)
        super(rfscore, self).__init__(model,
                                      descriptors,
                                      score_title='rfscore_v%i' % self.version)