Exemplo n.º 1
0
def mols_having_best_feat(penalty='l1', c=1, num_folds=10):
    """
    Draw the molecules that contain the feature with the largest average coefficient.

    The results returned by ``task3_res()`` are averaged twice: first over the
    ``num_folds`` coefficient vectors of each result (``logreg_coefs(fold)``),
    then over all the results. The position of the maximum of that averaged
    vector is mapped back to a feature with the 'lab' fingerprints manager, and
    the molecules having that feature are drawn in a grid saved as
    'Mols_having_best_fpt.png' in the 'logregs' directory under MALARIA_EXPS_ROOT.

    The number of results per cv seed, the selected feature and the number of
    molecules found are printed along the way.

    :param penalty: not used by the body of this function.
    :param c: not used by the body of this function.
    :param num_folds: number of folds whose coefficients are averaged per result.
    """
    results = task3_res()

    # Report how many results are available for each cross-validation seed
    for seed, seed_rows in results.groupby(['cv_seed']):
        print(seed, len(seed_rows))

    # One averaged coefficient vector per result (mean over its folds)
    per_result_coefs = [
        np.array([res.logreg_coefs(fold).ravel() for fold in range(num_folds)]).mean(axis=0)
        for _, res in results.result.items()
    ]

    # Mean over all the results, then pick the position of the largest coefficient
    mean_coefs = np.array(per_result_coefs).mean(axis=0)
    best_column = np.argmax(mean_coefs)

    fingerprints = MalariaFingerprintsManager(dset='lab')
    feat = fingerprints.i2s(best_column)
    print(feat)
    # A feature can be hard-coded here instead, e.g.:
    # feat = 'n1c(S(C)(=O)=O)sc(N)c1S(c)(=O)=O'
    molids = fingerprints.mols_with_feature(feat)

    catalog = MalariaCatalog()
    mols = catalog.molids2mols(molids)
    labels = catalog.molids2labels(molids, as01=True)
    print(len(mols))

    dest_png = op.join(MALARIA_EXPS_ROOT, 'logregs', 'Mols_having_best_fpt.png')
    draw_in_a_grid_aligned_according_to_pattern(mols, feat, dest_png,
                                                legends=molids, classes=labels)
def from_feat_back_to_mols_faster(dset, smi):
    """
    Retrieves the list of molecules that contain the given feature in the given dataset.

    :param dset: dataset identifier, passed to MalariaFingerprintsManager.
    :param smi: the feature to look for (translated to a column of the
                fingerprint matrix with ``s2i``).
    :return: a list of tuples (mol, molid, row_index, activity), one per row of the
             fingerprint matrix that has a stored entry in the feature column.
             A real list is returned (not a lazy ``zip`` iterator), so the result
             can be iterated several times, indexed and measured with ``len``.
    """
    # The non-folded version is easy
    mfm = MalariaFingerprintsManager(dset=dset)
    X = mfm.X()
    col = mfm.s2i(smi)  # the column where we have to look for in the X matrix
    # In COO format every stored entry carries its (row, col) coordinates, so the
    # rows having the feature are those of the entries stored in that column.
    # NOTE(review): explicitly stored zeros, if any, would also be reported - confirm
    # that X never contains them.
    cooX = X.tocoo()
    indices_mols = cooX.row[cooX.col == col]
    molids = [mfm.i2m(i) for i in indices_mols]
    mc = MalariaCatalog()
    activities = [mc.label(molid, as01=True) for molid in molids]
    mols = mc.molids2mols(molids)
    # zip is lazy in Python 3: materialize it to honour the documented list contract
    return list(zip(mols, molids, indices_mols, activities))