def run(self):
    """Train a multi-label naive Bayes target model and save it to disk.

    Reads the CSV at ``self.input().path`` (the columns ``molregno``,
    ``canonical_smiles`` and ``target_chembl_id`` are used), builds one row
    per molecule holding the list of its targets, computes a fingerprint
    for each molecule with ``computeFP``, fits
    ``OneVsRestClassifier(MultinomialNB())`` on the fingerprints and dumps
    the fitted model to ``self.output().path``.

    Rows whose ``ROMol`` or ``FP`` value is null are dropped before
    training (presumably unparsable SMILES / failed fingerprints --
    confirm against ``Chem.MolFromSmiles`` and ``computeFP``).
    """
    logger.info('MakeModel {}nM: Init'.format(self.value))
    data = pd.read_csv(self.input().path)

    # One row per molecule, indexed and ordered by molregno.
    mols = (
        data[['molregno', 'canonical_smiles']]
        .drop_duplicates('molregno')
        .set_index('molregno')
        .sort_index()
    )

    # Collect every target id of a molecule into a list. groupby() already
    # returns the groups ordered by key, so no explicit sort is needed, and
    # building the list directly avoids a join/split round trip.
    targets = (
        data.groupby('molregno')['target_chembl_id']
        .apply(list)
        .to_frame('targets')
    )

    # Parse the SMILES column-wise, then join molecules with their targets.
    mols['ROMol'] = mols['canonical_smiles'].apply(Chem.MolFromSmiles)
    dataset = pd.merge(mols, targets, left_index=True, right_index=True)
    # `.ix` was removed in pandas 1.0; `.loc` with a boolean mask is the
    # supported spelling. Copy because a column is added right below.
    dataset = dataset.loc[dataset['ROMol'].notnull()].copy()

    # Generate fingerprints and drop molecules without one.
    dataset['FP'] = dataset['ROMol'].apply(computeFP)
    dataset = dataset.loc[dataset['FP'].notnull()]
    logger.info('MakeModel {}nM: Data ready'.format(self.value))

    # Training data: fingerprint vectors and a binary label matrix.
    X = [f.fp for f in dataset['FP']]
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(dataset['targets'])

    # Train the model.
    morgan_bnb = OneVsRestClassifier(MultinomialNB())
    morgan_bnb.fit(X, y)
    # Store the label names on the model so they are saved with it.
    morgan_bnb.targets = mlb.classes_

    # Save the model.
    joblib.dump(morgan_bnb, self.output().path)
    logger.info('MakeModel {}nM: Done'.format(self.value))
# Example #2
0
# Script fragment: fit a one-vs-rest multinomial naive Bayes model on the
# fingerprints in `dataset` and pickle it. `dataset` is not defined in this
# fragment; it must already exist and provide the 'FP' and 'targets' columns.
# print() with a single parenthesised argument behaves the same on
# Python 2 and Python 3.
print('fps done')

from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

try:
    from sklearn.externals import joblib
except ImportError:
    # scikit-learn >= 0.23 no longer bundles joblib; fall back to the
    # standalone package (a dependency of scikit-learn itself).
    import joblib

# Feature vectors and per-molecule target lists.
X = [f.fp for f in dataset['FP']]
yy = list(dataset['targets'])

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(yy)  ## this is for newer versions of sklearn

morgan_bnb = OneVsRestClassifier(MultinomialNB())

print('model building')
morgan_bnb.fit(X, y)

# Store the label names on the model so they are saved with it.
morgan_bnb.targets = mlb.classes_

print(morgan_bnb.multilabel_)
print(morgan_bnb.targets)

joblib.dump(morgan_bnb, '../chembl_22/models/1uM/mNB_1uM_all.pkl')

print('done!')