def run(self):
    """Train a multi-label naive Bayes target model and save it.

    Reads the CSV at ``self.input().path``; the columns ``molregno``,
    ``canonical_smiles`` and ``target_chembl_id`` are used.  One row per
    molecule is built, holding its parsed structure, its fingerprint and
    the list of targets recorded for it.  A one-vs-rest ``MultinomialNB``
    is fitted on those rows and dumped to ``self.output().path`` with
    joblib.  The label order chosen by the binarizer is attached to the
    saved classifier as ``targets``.
    """
    logger.info('MakeModel {}nM: Init'.format(self.value))
    data = pd.read_csv(self.input().path)

    # get unique molecules and their smiles, indexed and ordered by molregno
    mols = (
        data[['molregno', 'canonical_smiles']]
        .drop_duplicates('molregno')
        .set_index('molregno')
        .sort_index()
    )

    # group targets by molregno: one list of target ids per molecule
    # (groupby already returns the groups ordered by key)
    targets = (
        data.groupby('molregno')['target_chembl_id']
        .apply(list)
        .to_frame('targets')
    )

    # parse the structures, then merge molecules with their targets
    mols['ROMol'] = mols['canonical_smiles'].map(Chem.MolFromSmiles)
    dataset = pd.merge(mols, targets, left_index=True, right_index=True)
    # Drop rows whose ROMol is null.  `.loc` replaces the removed `.ix`
    # indexer; `.copy()` keeps the column assignment below from writing
    # to a filtered slice.
    dataset = dataset.loc[dataset['ROMol'].notnull()].copy()

    # generate fingerprints and drop rows where none was produced
    dataset['FP'] = dataset['ROMol'].map(computeFP)
    dataset = dataset.loc[dataset['FP'].notnull()]
    logger.info('MakeModel {}nM: Data ready'.format(self.value))

    # generate the model's training data
    X = [f.fp for f in dataset['FP']]
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(dataset['targets'])

    # train the model
    morgan_bnb = OneVsRestClassifier(MultinomialNB())
    morgan_bnb.fit(X, y)
    # keep the label order next to the model so it is saved with it
    morgan_bnb.targets = mlb.classes_

    # save the model
    joblib.dump(morgan_bnb, self.output().path)
    logger.info('MakeModel {}nM: Done'.format(self.value))
# NOTE: `dataset` is not defined in this part of the script; it must already
# exist with an 'FP' column (objects exposing `.fp`) and a 'targets' column.
print('fps done')

from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# `sklearn.externals.joblib` no longer exists in recent scikit-learn
# releases; use the standalone package and fall back to the vendored copy
# only where the standalone one is missing.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Feature rows and per-molecule label lists, in the same row order.
X = [f.fp for f in dataset['FP']]
yy = list(dataset['targets'])

# Turn the label lists into a binary indicator matrix
# (this is the API for newer versions of sklearn).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(yy)

morgan_bnb = OneVsRestClassifier(MultinomialNB())
print('model building')
morgan_bnb.fit(X, y)
# Keep the label order next to the model so it is saved with it.
morgan_bnb.targets = mlb.classes_
print(morgan_bnb.multilabel_)
print(morgan_bnb.targets)

# The output path is relative to the current working directory.
joblib.dump(morgan_bnb, '../chembl_22/models/1uM/mNB_1uM_all.pkl')
print('done!')