def mols_having_best_feat(penalty='l1', c=1, num_folds=10):
    """
    Draws the molecules that contain the feature with the largest average logistic regression coefficient.

    The coefficients of each result returned by task3_res() are averaged over its folds, and
    those per-result averages are averaged again over all the results. The feature sitting at
    the argmax of that final vector is looked up in the 'lab' fingerprints, and every molecule
    containing it is passed to draw_in_a_grid_aligned_according_to_pattern together with the
    path <MALARIA_EXPS_ROOT>/logregs/Mols_having_best_fpt.png.

    Parameters
    ----------
    penalty: currently not used; kept so that existing callers keep working
    c: currently not used; kept so that existing callers keep working
      NOTE(review): penalty and c were presumably meant to select which results get averaged - confirm
    num_folds: number of folds whose coefficients are read from each result (folds 0..num_folds-1)
    """
    df_t3 = task3_res()

    # Report how many results we have for each cross-validation seed
    for cv_seed, group in df_t3.groupby(['cv_seed']):
        print(cv_seed, len(group))

    # One fold-averaged coefficient vector per result.
    # This reads df_t3.result (the whole frame, not a seed group), so it does not depend
    # on the loop above and a single pass is all that is needed.
    coefs = [
        np.mean(np.array([result.logreg_coefs(fold).ravel() for fold in range(num_folds)]), axis=0)
        for _, result in df_t3.result.items()
    ]

    # "Best" means largest (signed) average coefficient, not largest magnitude
    av_coefs = np.mean(np.array(coefs), axis=0)
    index_of_best = np.argmax(av_coefs)

    # Translate the column index back into the feature string
    mfm = MalariaFingerprintsManager(dset='lab')
    feat = mfm.i2s(index_of_best)
    print(feat)
    # Example of a feature string: 'n1c(S(C)(=O)=O)sc(N)c1S(c)(=O)=O'

    # Collect the molecules having the feature, together with their 0/1 labels
    molids = mfm.mols_with_feature(feat)
    mc = MalariaCatalog()
    mols = mc.molids2mols(molids)
    labels = mc.molids2labels(molids, as01=True)
    print(len(mols))

    draw_in_a_grid_aligned_according_to_pattern(mols,
                                                feat,
                                                op.join(MALARIA_EXPS_ROOT, 'logregs', 'Mols_having_best_fpt.png'),
                                                legends=molids,
                                                classes=labels)
def from_feat_back_to_mols_faster(dset, smi):
    """
    Retrieves the list of molecules that contain the given feature in the given dataset.

    Parameters
    ----------
    dset: the dataset identifier, passed as is to MalariaFingerprintsManager
    smi: the feature, as the string that the fingerprints manager maps to a column of X

    Returns
    -------
    A list of tuples (mol, molid, row_index, activity), one per entry stored in the
    column of the feature. It is a real list (not a lazy zip), so it can be measured
    with len(), indexed and iterated more than once.
    """
    # The non-folded version is easy
    mfm = MalariaFingerprintsManager(dset=dset)
    X = mfm.X()
    col = mfm.s2i(smi)  # the column where we have to look for in the X matrix

    # In coordinate format, the rows of the entries stored in a column can be read directly
    cooX = X.tocoo()
    indices_mols = cooX.row[cooX.col == col]

    # From row indices to molecule ids, and from those to labels and molecules
    molids = [mfm.i2m(i) for i in indices_mols]
    mc = MalariaCatalog()
    activities = [mc.label(molid, as01=True) for molid in molids]
    mols = mc.molids2mols(molids)

    # zip is lazy and single-use in python 3: materialize it to honor the "list" contract
    return list(zip(mols, molids, indices_mols, activities))