예제 #1
0
    def save_submission(sub, outfile, select_top=500):
        """Write the best-ranked entries of a submission as ``molid,smiles,score`` lines.

        :param sub: object exposing ``.index`` (molecule ids) and ``.values``
                    (their scores); presumably a pandas Series - confirm with caller.
        :param outfile: path of the text file to (over)write.
        :param select_top: forwarded to ``rank_sort``; presumably the number of
                           best-scored entries to keep - confirm against ``rank_sort``.

        The smiles are looked up through ``mc``, which is taken from the
        enclosing scope.
        """
        scores = sub.values
        molids = sub.index.values
        all_smiles = mc.molids2smiless(sub.index)

        # Sort the three parallel sequences by score (reverse=True) and keep the top ones.
        _, (top_scores, top_molids, top_smiles) = rank_sort(
            scores,
            (scores, molids, all_smiles),
            reverse=True,
            select_top=select_top)

        # Save for submission. The builtin zip works on both Python 2 and 3,
        # whereas itertools.izip only exists on Python 2.
        with open(outfile, 'w') as writer:
            for molid, smi, score in zip(top_molids, top_smiles, top_scores):
                writer.write('%s,%s,%.6f\n' % (molid, smi, score))
예제 #2
0
def compute_confirmatory(deployers,
                         molids_provider,
                         outfile,
                         y_provider=None,
                         select_top=500,
                         mc=None):
    """Scores and rankings on plain-average for the labelled / ambiguous dataset.

    The score of each molecule is the NaN-ignoring mean of its row in the
    matrices returned by ``deployers`` for the 'lab' and 'amb' datasets
    (labelled rows first, then ambiguous).

    :param deployers: callable ``deployers(dset=...)`` returning a pair whose first
                      element is a 2D score matrix (one row per molecule); the
                      second element is ignored here.
    :param molids_provider: callable ``molids_provider(dset=...)`` returning the
                            molecule ids, in the same row order as ``deployers``.
    :param outfile: path of the text file receiving ``molid,smiles,score`` lines;
                    the unsorted scores are also pickled to the same path with the
                    extension replaced by ``.pkl``.
    :param y_provider: optional zero-argument callable returning the labels of the
                       labelled set; when given, the AUC of the averaged labelled
                       scores is reported through ``info``. When None (the
                       default) the report is skipped.
    :param select_top: forwarded to ``rank_sort``; presumably the number of
                       best-scored molecules written out - confirm against ``rank_sort``.
    :param mc: catalog providing ``molids2labels``, ``molids2pec50s`` and
               ``molids2smiless``; a new ``MalariaCatalog`` is created when None.
    :return: tuple ``(molids, scores)`` in the original (unsorted) order.
    """

    # Labelled
    Xlab, _ = deployers(dset='lab')
    if y_provider is not None:
        # The AUC can only be computed when labels are available; calling a
        # missing provider used to raise TypeError on the default argument.
        info('AUC after plain averaging (bagging like): %.3f' % roc_auc_score(y_provider(),
                                                                              np.nanmean(Xlab, axis=1)))
    # Ambiguous
    Xamb, _ = deployers(dset='amb')
    # All together
    X = np.vstack((Xlab, Xamb))

    # Scores are just plain averages
    scores = np.nanmean(X, axis=1)

    # Get the molids, smiles, labels, pec50 (same lab-then-amb order as X)
    lab_molids = molids_provider(dset='lab')
    amb_molids = molids_provider(dset='amb')
    molids = np.hstack((lab_molids, amb_molids))

    if mc is None:
        mc = MalariaCatalog()
    labels = mc.molids2labels(molids)
    pec50s = mc.molids2pec50s(molids)
    smiles = mc.molids2smiless(molids)

    # Rankings. Labels and pec50s are sorted alongside but are not written out.
    _, (sscores, smolids, _slabels, _spec50s, ssmiles) = \
        rank_sort(scores, (scores, molids, labels, pec50s, smiles),
                  reverse=True,
                  select_top=select_top)

    # N.B.
    # if analyzing ranking variability, use instead
    # scores2rankings()

    # Save for submission
    with open(outfile, 'w') as writer:
        for molid, smi, score in zip(smolids, ssmiles, sscores):
            writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    # Create and save a pandas series to allow further stacking
    s = Series(data=scores, index=molids)
    s.to_pickle(op.splitext(outfile)[0] + '.pkl')

    return molids, scores
예제 #3
0
def compute_heldout(dset,
                    deployers,
                    molids_provider,
                    outfile,
                    y_provider=None,
                    stacker=None,
                    select_top=None,
                    mc=None):
    """Predictions for the held-out sets.

    Each molecule of ``dset`` is scored either by the NaN-ignoring mean of its
    row in the matrix returned by ``deployers`` or, when ``stacker`` is given,
    by ``stacker.predict`` after fitting the stacker on the labelled ('lab') set.

    :param dset: dataset identifier forwarded to ``deployers`` and ``molids_provider``.
    :param deployers: callable ``deployers(dset=...)`` returning a pair whose first
                      element is a 2D score matrix (one row per molecule); the
                      second element is ignored here.
    :param molids_provider: callable ``molids_provider(dset=...)`` returning the
                            molecule ids, in the same row order as ``deployers``.
    :param outfile: path of the text file receiving ``molid,smiles,score`` lines;
                    the unsorted scores are also pickled to the same path with the
                    extension replaced by ``.pkl``.
    :param y_provider: zero-argument callable returning the labels of the labelled
                       set; required when ``stacker`` is given, unused otherwise.
    :param stacker: optional estimator exposing ``fit(X, y)`` and ``predict(X)``.
    :param select_top: forwarded to ``rank_sort``; presumably None keeps every
                       molecule - confirm against ``rank_sort``.
    :param mc: catalog providing ``molids2smiless``; a new ``MalariaCatalog`` is
               created when None.
    :return: tuple ``(molids, scores)`` in the original (unsorted) order.
    :raises TypeError: if ``stacker`` is given without ``y_provider``.
    """
    X, _ = deployers(dset=dset)

    # Stacking or averaging?
    if stacker is None:
        scores = np.nanmean(X, axis=1)
    else:
        if y_provider is None:
            # Same exception type as calling None, but with an actionable message.
            raise TypeError('y_provider is required when a stacker is given')
        Xlab, _ = deployers(dset='lab')
        y = y_provider()
        stacker.fit(Xlab, y)  # Careful: Xlab columns can be extremely collinear...
        # predict() is what has always been used here; the previously dead
        # alternative was stacker.predict_proba(X)[:, 1].
        scores = stacker.predict(X)

    # Get the molids, smiles
    if mc is None:
        mc = MalariaCatalog()
    molids = molids_provider(dset=dset)
    smiles = mc.molids2smiless(molids)

    # Rankings
    _, (sscores, smolids, ssmiles) = \
        rank_sort(scores, (scores, molids, smiles), reverse=True, select_top=select_top)

    # Save for submission. The builtin zip works on both Python 2 and 3,
    # whereas itertools.izip only exists on Python 2.
    with open(outfile, 'w') as writer:
        for molid, smi, score in zip(smolids, ssmiles, sscores):
            writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    # Create and save a pandas series to allow further stacking
    s = Series(data=scores, index=molids)
    s.to_pickle(op.splitext(outfile)[0] + '.pkl')

    return molids, scores
예제 #4
0
         (df.folder_seed < 1) &
         (df.folder_size == 0))

# Result objects of the rows of df selected by conds.
results_for_fs = df[conds].result

# One flattened coefficient vector per fold present in each selected result
# (presumably logistic regression coefficients, going by logreg_coefs - confirm).
importances = []
for res in results_for_fs:
    importances += [res.logreg_coefs(fold).ravel() for fold in res.present_folds()]

# Per-feature mean and standard deviation across all collected vectors.
mean_importance = np.mean(importances, axis=0)
std_importance = np.std(importances, axis=0)
# Feature identifiers are simply the column positions 0..n-1.
features = np.arange(len(mean_importance))

# Importance is in the absolute value, sign indicates positive/negative feature
# Sort feature ids and their statistics together, keyed on the mean coefficient.
# NOTE(review): the reports below assume rank_sort's default order is ascending
# (most negative mean first, most positive last) - confirm against rank_sort.
ranks, (sfeatures, smean_importance, sstd_importance) =\
    rank_sort(mean_importance, (features, mean_importance, std_importance))

# Negative: the first 10 entries of the sorted sequences
print('Super-negative features')
for f, mi, si in izip(sfeatures[:10], smean_importance[:10], sstd_importance[:10]):
    print('Feature: %d (%.2f +/- %.2f)' % (f, mi, si))

# Positives: the last 10 entries of the sorted sequences
print('Super-positive features')
for f, mi, si in izip(sfeatures[-10:], smean_importance[-10:], sstd_importance[-10:]):
    print('Feature: %d (%.2f +/- %.2f)' % (f, mi, si))

# Some preparations...
# Random generator with a fixed seed (52), fingerprints manager for the 'lab'
# dataset and a molecule catalog; none of them is used within the lines above.
rng = np.random.RandomState(52)
mfm = MalariaFingerprintsManager(dset='lab')
mc = MalariaCatalog()