Example #1
def all_subs(dset):
    info(dset)
    subs = set()
    with open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged')) as reader:
        for line in reader:
            subs.update(sub.split()[0] for sub in line.split('\t')[1:])  # TODO sort by frequency
    return subs
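A quick illustration of the '.merged' line format this function assumes (the sample line below is hypothetical): one molecule per line, a molid followed by tab-separated feature fields, each field starting with the substructure's canonical SMILES.

# Hypothetical '.merged' line: molid \t "<cansmiles> <count> [<center> <radius> ...]" \t ...
line = 'mol001\tccO 2 0 1 3 1\tcc(c)N 1 5 2\n'
subs = set()
subs.update(sub.split()[0] for sub in line.split('\t')[1:])
print(sorted(subs))  # ['cc(c)N', 'ccO']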
Example #2
def process(molid, smiles):
    if molid is _END_MOLID:
        writer.close()
        return
    try:
        mol = to_rdkit_mol(smiles)
        fpsinfo = {}
        # N.B. We won't actually use the rdkit hash, so we won't ask for nonzero values...
        # Is there a way of asking rdkit to give us this directly?
        AllChem.GetMorganFingerprint(mol, max_radius, bitInfo=fpsinfo, useFeatures=fcfp)
        counts = defaultdict(int)
        centers = defaultdict(list)
        for bit_descs in fpsinfo.values():
            for center, radius in bit_descs:
                cansmiles = explain_circular_substructure(mol, center, radius)
                counts[cansmiles] += 1
                centers[cansmiles].append((center, radius))
        if write_centers:
            features_strings = ['%s %d %s' % (cansmiles,
                                              count,
                                              ' '.join(['%d %d' % (c, r) for c, r in centers[cansmiles]]))
                                for cansmiles, count in counts.iteritems()]
        else:
            features_strings = ['%s %d' % (cansmiles, count) for cansmiles, count in counts.iteritems()]
        writer.write('%s\t%s\n' % (molid, '\t'.join(features_strings)))
    except Exception:
        info('Failed molecule %s: %s' % (molid, smiles))
        writer.write('%s\t*FAILED*\n' % molid)
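For reference, here is a standalone sketch of the standard rdkit recipe that explain_circular_substructure presumably wraps (it is project code): map each bitInfo (center, radius) pair back to a substructure SMILES.

from rdkit import Chem
from rdkit.Chem import AllChem

mol = Chem.MolFromSmiles('Cc1ccccc1O')
bit_info = {}
AllChem.GetMorganFingerprint(mol, 2, bitInfo=bit_info)
for bit, descs in bit_info.items():
    for center, radius in descs:
        if radius == 0:
            continue  # radius-0 environments are single atoms
        env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, center)
        submol = Chem.PathToSubmol(mol, env)
        print(bit, center, radius, Chem.MolToSmiles(submol))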
Example #3
def _molidsmiles_it_ecfp(output_file, start=0, step=46, fcfp=True, logeach=5000):
    """Q&D variant to allow Parallel work (cannot pickle closures or reuse iterators...)."""
    processor = _ecfp_writer(output_file=output_file, fcfp=fcfp)
    mols = read_smiles_ultraiterator()
    for molindex, (molid, smiles) in enumerate(islice(mols, start, None, step)):
        if logeach > 0 and molindex > 0 and not molindex % logeach:
            info('Molecule %d' % molindex)
        processor(molid, smiles)
    processor(_END_MOLID, None)
Example #4
def save_molids(data_dir, name, molids, overwrite=False):
    """Save molids in plain text in the data directory."""
    molids_file = op.join(data_dir, '%s.ids' % name)
    if not op.isfile(molids_file) or overwrite:
        info('Saving molids...')
        with open(molids_file, 'w') as writer:
            for molid in molids:
                writer.write(molid)
                writer.write('\n')
Example #5
def malaria_ecfp_parallel_results_iterator(prefix='', log=True):
    """Iterates over the files resulting from the computation of ecfps using the function ecfp."""
    weirdfps = glob(op.join(_MALARIA_ECFPS_PARALLEL_RESULTS_DIR, '%s*.weirdfps' % prefix))
    weirdfps = _sort_by_start(weirdfps)
    for fn in weirdfps:
        if log:
            info(fn)
        with gzip.open(fn) as reader:
            for line in reader:
                yield line
Example #6
def do_trees_submissions(do_confirmatory=True,
                         do_heldout=True,
                         do_screening=True):
    compute_submissions(prefix='trees',
                        dest_dir=MALARIA_TREES_EXPERIMENT_ROOT,
                        deployers=trees_deployers,
                        molids_provider=trees_molids,
                        y_provider=trees_y,
                        do_confirmatory=do_confirmatory,
                        do_heldout=do_heldout,
                        do_screening=do_screening)
    info('Submissions computed!')
Example #7
def catalog_malaria_mols(overwrite=False, checks=False):
    """Bootstrap the malaria catalogs."""

    to_catalog = (
        (op.join(MALARIA_DATA_ROOT, 'rdkit', 'mols', 'unl'), read_unlabelled_smiles),
        (op.join(MALARIA_DATA_ROOT, 'rdkit', 'mols', 'lab'), read_labelled_only_smiles),
        (op.join(MALARIA_DATA_ROOT, 'rdkit', 'mols', 'scr'), read_screening_smiles),
    )

    for path, molit in to_catalog:
        build_benchmark_check_rdkmols_catalog(path, molit=molit, checks=checks, overwrite=overwrite)

    info('ALL DONE')
Example #8
def compute_submissions(prefix,
                        dest_dir,
                        deployers,
                        molids_provider,
                        y_provider,
                        do_confirmatory=True,
                        do_heldout=True,
                        do_screening=True,
                        confirmatory_top=500,
                        scr_top=1000):

    info('Computing submissions for %s' % prefix)

    mc = MalariaCatalog()  # For performance, maybe this should be a singleton...

    if do_confirmatory:
        compute_confirmatory(deployers,
                             molids_provider,
                             outfile=op.join(dest_dir, '%s_hitSelection.txt' % prefix),
                             y_provider=y_provider,
                             select_top=confirmatory_top)

    def do_predict(dset, select_top=None):

        info('Computing predictions for %s: %s' % (prefix, dset))

        _, scores_averaged = compute_heldout(dset,
                                             deployers,
                                             molids_provider,
                                             op.join(dest_dir, '%s_%s-averaged.txt' % (prefix, dset)),
                                             y_provider=y_provider,
                                             mc=mc,
                                             select_top=select_top)

        _, scores_linr = compute_heldout(dset,
                                         deployers,
                                         molids_provider,
                                         op.join(dest_dir, '%s_%s-stacker=linr.txt' % (prefix, dset)),
                                         y_provider=y_provider,
                                         stacker=LinearRegression(),
                                         mc=mc,
                                         select_top=select_top)

        info('Computing kendall-tau (go take a nap if there are a lot of examples...)')
        info('%s:%s - Kendall-tau avg vs linr: %.2f' % (prefix, dset, kendalltau(scores_linr, scores_averaged)[0]))

    if do_heldout:
        do_predict('unl')

    if do_screening:
        do_predict('scr', select_top=scr_top)
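Note that scipy's kendalltau returns a (tau, p-value) pair, hence the [0] indexing in the log line above. A toy check:

from scipy.stats import kendalltau

tau, pvalue = kendalltau([1, 2, 3, 4], [1, 3, 2, 4])
print('tau=%.2f (p=%.3f)' % (tau, pvalue))  # tau=0.67 ...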
Example #9
def compute_confirmatory(deployers,
                         molids_provider,
                         outfile,
                         y_provider=None,
                         select_top=500,
                         mc=None):
    """Scores and rankings on plain-average for the labelled / ambiguous dataset."""

    # Labelled
    Xlab, f_names = deployers(dset='lab')
    info('AUC after plain averaging (bagging like): %.3f' % roc_auc_score(y_provider(),
                                                                          np.nanmean(Xlab, axis=1)))
    # Ambiguous
    Xamb, _ = deployers(dset='amb')
    # All together
    X = np.vstack((Xlab, Xamb))

    # Scores are just plain averages
    scores = np.nanmean(X, axis=1)

    # Get the molids, smiles, labels, pec50
    lab_molids = molids_provider(dset='lab')
    amb_molids = molids_provider(dset='amb')
    molids = np.hstack((lab_molids, amb_molids))

    if mc is None:
        mc = MalariaCatalog()
    labels = mc.molids2labels(molids)
    pec50s = mc.molids2pec50s(molids)
    smiles = mc.molids2smiless(molids)

    # Rankings
    ranks, (sscores, smolids, slabels, spec50s, ssmiles) = \
        rank_sort(scores, (scores, molids, labels, pec50s, smiles),
                  reverse=True,
                  select_top=select_top)

    # N.B.
    # if analyzing ranking variability, use instead
    # scores2rankings()

    # Save for submission
    with open(outfile, 'w') as writer:
        for molid, smiles, score in zip(smolids, ssmiles, sscores):
            writer.write('%s,%s,%.6f\n' % (molid, smiles, score))

    # Create and save a pandas series to allow further stacking
    s = Series(data=scores, index=molids)
    s.to_pickle(op.splitext(outfile)[0] + '.pkl')

    return molids, scores
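rank_sort is project code; a minimal numpy sketch of what it presumably does (sort several aligned arrays by descending score and keep the top entries) could look like this:

import numpy as np

def rank_sort_sketch(scores, arrays, reverse=True, select_top=None):
    # Indices ordered by score, descending when reverse=True
    order = np.argsort(-np.asarray(scores)) if reverse else np.argsort(scores)
    if select_top is not None:
        order = order[:select_top]
    return order, tuple(np.asarray(a)[order] for a in arrays)

ranks, (sscores, smolids) = rank_sort_sketch([0.2, 0.9, 0.5],
                                             ([0.2, 0.9, 0.5], ['a', 'b', 'c']),
                                             select_top=2)
print(list(smolids))  # ['b', 'c']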
Example #10
def process(molid, smiles):
    if molid is _END_MOLID:
        h5.close()
        return
    ne = len(molids)
    try:
        molids.resize((ne + 1,))
        molids[ne] = molid
        descs.resize((ne + 1, nf))  # resize first so the except branch can write the NaN row
        mol = to_rdkit_mol(smiles)
        descs[ne, :] = computer.compute(mol)[0]
    except Exception:
        info('Failed molecule %s: %s' % (molid, smiles))
        descs[ne, :] = [np.nan] * nf
Example #11
def _molidsmiles_it(start=0, step=46, mols=None, processor=None, logeach=500):
    """Iterates over (molid, smiles) pairs, processing every `step`-th molecule starting at `start`.
    This is useful for evenly splitting workloads between processors / machines.
    Parameters:
      - start: the index of the first pair to consider
      - step: how many molecules are skipped between consecutive pairs
      - mols: an iterator over (molid, smiles) pairs
      - processor: a function that gets called for each pair;
                   when the iterator is exhausted, (_END_MOLID, None) is sent.
      - logeach: log progress every `logeach` molecules (use 0 or a negative value to disable)
    """
    if mols is None:
        mols = read_smiles_ultraiterator()
    for molindex, (molid, smiles) in enumerate(islice(mols, start, None, step)):
        if logeach > 0 and molindex > 0 and not molindex % logeach:
            info('Molecule %d' % molindex)
        processor(molid, smiles)
    processor(_END_MOLID, None)
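The start/step interleaving is just itertools.islice; a toy run showing how three hypothetical workers split ten molecules without overlap:

from itertools import islice

mols = [('mol%d' % i, 'C' * (i + 1)) for i in range(10)]
for start in range(3):  # step=3: worker `start` takes molecules start, start+3, ...
    print(start, [molid for molid, _ in islice(iter(mols), start, None, 3)])
# 0 ['mol0', 'mol3', 'mol6', 'mol9']
# 1 ['mol1', 'mol4', 'mol7']
# 2 ['mol2', 'mol5', 'mol8']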
Example #12
def detect_duplicate_features(transductive=False, verbose=False):
    """Detects exactly duplicated features in the malaria dataset, returning groups of column indices.
    Here "duplicated" is pragmatically defined as "appearing in exactly the same molecules
    across the malaria dataset".
    """

    # TODO: this is really memory intensive, make streaming (over the columns...)
    # TODO: manage ambiguous...

    # Are there many singleton features collapsed?

    if transductive:
        Xlab = MalariaFingerprintsManager(dset='lab', keep_ambiguous=False).X()
        Xunl = MalariaFingerprintsManager(dset='unl', keep_ambiguous=True).X()
        Xscr = MalariaFingerprintsManager(dset='scr', keep_ambiguous=True).X()
        X = vstack((Xlab, Xunl, Xscr))
    else:
        X = MalariaFingerprintsManager(dset='lab', keep_ambiguous=False).X()

    info('MatrixMol Feature Duplicate detection')
    info('We are dealing with a matrix as big as %d molecules and %d features' % X.shape)

    ne, nf = X.shape
    X = X.tocsc()
    X.indices.flags.writeable = False  # Make the views from this array hashable
    groups = defaultdict(lambda: array('I'))
    for i in xrange(nf):
        xi = X.indices[X.indptr[i]:X.indptr[i + 1]]
        groups[xi.data].append(i)
        if verbose and i > 0 and not i % 1000000:
            info('%d of %d substructures hashed according to the molecules they appear in' % (i, nf))

    return groups.values()
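The same column-hashing trick on a toy CSC matrix (this sketch keys on tobytes() for hashability instead of the read-only .data buffer, which is a Python 2 detail):

import numpy as np
from collections import defaultdict
from scipy.sparse import csc_matrix

X = csc_matrix(np.array([[1, 0, 1, 0],
                         [0, 1, 0, 1],
                         [1, 0, 1, 0]]))
groups = defaultdict(list)
for j in range(X.shape[1]):
    rows = X.indices[X.indptr[j]:X.indptr[j + 1]]  # row support of column j
    groups[rows.tobytes()].append(j)
print([g for g in groups.values() if len(g) > 1])  # [[0, 2], [1, 3]]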
Example #13
def trees_deploy(dest_file=MALARIA_TREES_DEPLOYMENT_H5):
    """Generates predictions for unlabelled datasets."""

    df = trees_results_to_pandas()

    h5 = h5py.File(dest_file, 'w')

    # Choose a few good results (maybe apply diversity filters or ensemble selection or...)
    deployers = df[(df.model_num_trees == 6000)]

    info('Deploying %d tree ensembles' % len(deployers))

    for i, res in enumerate(deployers.result):
        f_name = '%s__%s' % (res.model_setup_id(), res.eval_setup_id())  # What about the data setup?
                                                                         # Here it works but in general not
                                                                         # Save it all...
                                                                         # (a new dataset with all the coords
                                                                         # and the result path)
        info(f_name)

        # Lab
        if '%s/lab' % f_name not in h5:
            h5['%s/lab' % f_name] = res.scores('lab')[:, 1].astype(np.float32)

        # Amb
        if '%s/amb' % f_name not in h5:
            h5['%s/amb' % f_name] = res.scores('amb')[:, 1].astype(np.float32)

        # Unl
        if '%s/unl' % f_name not in h5:
            h5['%s/unl' % f_name] = res.scores('unl')[:, 1].astype(np.float32)

        # Scr
        if '%s/scr' % f_name not in h5:
            h5['%s/scr' % f_name] = fix_streaming_scoring_bug_results(res.scores('scr')[:, 1].astype(np.float32))
            assert h5['%s/scr' % f_name].shape[0] == 5488144, 'Streaming rdkf bug striking back...'

    h5.close()
Example #14
def predict_malaria_unlabelled(model, h5, rf_amb=None, rf_scr=None, rf_unl=None, chunksize=0):
    """Use the model to cast predictions for the datasets, storing them where appropriate in the h5 file and
    allowing predicition on streams of the screening dataset.
    """

    # Ambiguous examples
    if rf_amb is not None:
        info('Scoring amb...')
        h5['amb'] = model.predict_proba(rf_amb.X()).astype(np.float32)
    # Unlabelled (competition) examples
    if rf_unl is not None:
        info('Scoring unl...')
        h5['unl'] = model.predict_proba(rf_unl.X()).astype(np.float32)
    # Screening examples
    if rf_scr is not None:
        info('Scoring scr...')
        if chunksize <= 0:
            h5['scr'] = model.predict_proba(rf_scr.X()).astype(np.float32)
        else:
            scr = h5.create_dataset('scr', shape=(rf_scr.ne_stream(), 2), dtype=np.float32)
            for i, x in enumerate(rf_scr.X_stream(chunksize=chunksize)):
                base = i * chunksize
                info('\t num_scr_examples: %d' % base)
                scr[base:base + chunksize] = model.predict_proba(x)
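A self-contained toy version of the chunked branch, preallocating the HDF5 dataset and filling it chunk by chunk (synthetic data; any sklearn classifier with predict_proba would do):

import h5py
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
model = LogisticRegression().fit(rng.rand(100, 5), rng.randint(0, 2, 100))
ne, chunksize = 1000, 300
with h5py.File('scores_demo.h5', 'w') as h5:
    scr = h5.create_dataset('scr', shape=(ne, 2), dtype=np.float32)
    for base in range(0, ne, chunksize):
        x = rng.rand(min(chunksize, ne - base), 5)  # stand-in for one streamed chunk
        scr[base:base + len(x)] = model.predict_proba(x)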
Example #15
    def do_predict(dset, select_top=None):

        info('Computing predictions for %s: %s' % (prefix, dset))

        _, scores_averaged = compute_heldout(dset,
                                             deployers,
                                             molids_provider,
                                             op.join(dest_dir, '%s_%s-averaged.txt' % (prefix, dset)),
                                             y_provider=y_provider,
                                             mc=mc,
                                             select_top=select_top)

        _, scores_linr = compute_heldout(dset,
                                         deployers,
                                         molids_provider,
                                         op.join(dest_dir, '%s_%s-stacker=linr.txt' % (prefix, dset)),
                                         y_provider=y_provider,
                                         stacker=LinearRegression(),
                                         mc=mc,
                                         select_top=select_top)

        info('Computing kendall-tau (go take a nap if there are a lot of examples...)')
        info('%s:%s - Kendall-tau avg vs linr: %.2f' % (prefix, dset, kendalltau(scores_linr, scores_averaged)[0]))
Example #16
def logreg_deploy(dest_file=MALARIA_LOGREGS_DEPLOYMENT_H5):
    """Generates predictions for unlabelled datasets."""

    df = logreg_results_to_pandas()

    h5 = h5py.File(dest_file, 'w')

    # Choose a few good results (maybe apply diversity filters or ensemble selection or...)
    deployment_cond_1 = (df.cv_seed < 5) & \
                        (df.num_present_folds == df.num_cv_folds) & \
                        (df.penalty == 'l1') & \
                        (df.C == 1) & \
                        (df.class_weight == 'auto') & \
                        (df.tol == 1E-4) & \
                        (df.folder_size < 1) & \
                        (df.folder_seed == -1) & \
                        (df.auc_mean > 0.92)

    deployment_cond_2 = (df.num_present_folds == df.num_cv_folds) & \
                        (df.penalty == 'l2') & \
                        (df.C == 5) & \
                        (df.class_weight == 'auto') & \
                        (df.tol == 1E-4) & \
                        (df.folder_size < 1) & \
                        (df.folder_seed == -1) & \
                        (df.auc_mean > 0.93)

    deployers = df[deployment_cond_1 | deployment_cond_2]

    info('Deploying %d logistic regressors' % len(deployers))

    # We will have 40 "features", one for each deployer
    # For lab it will just be the test scores
    # For amb, unl and scr it will be the average of the scores for each cv fold

    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)

    for i, res in enumerate(deployers.result):
        f_name = '%s__%s' % (res.model_setup_id(), res.eval_setup_id())  # What about the data setup?
                                                                         # Here it works but in general not
                                                                         # Save it all...
                                                                         # (a new dataset with all the coords
                                                                         # and the result path)
        info(f_name)

        # Lab
        if '%s/lab' % f_name not in h5:
            h5['%s/lab' % f_name] = res.scores()[:, 1].astype(np.float32)

        # Amb
        models = [res.fold_model(fold) for fold in res.present_folds()]
        if '%s/amb' % f_name not in h5:
            h5['%s/amb' % f_name] = np.nanmean([model.predict_proba(rf_amb.X())[:, 1]
                                                for model in models], axis=0).astype(np.float32)
        # Unl
        if '%s/unl' % f_name not in h5:
            h5['%s/unl' % f_name] = np.nanmean([model.predict_proba(rf_unl.X())[:, 1]
                                                for model in models], axis=0).astype(np.float32)
        # Scr
        if '%s/scr' % f_name not in h5:
            h5['%s/scr' % f_name] = np.nanmean([model.predict_proba(rf_scr.X())[:, 1]
                                                for model in models], axis=0).astype(np.float32)

    h5.close()
Example #17
def build_benchmark_check_rdkmols_catalog(mmapdir, molit=read_labelled_only_smiles, checks=False, overwrite=False):
    """Builds a memmapped catalog {molid->rdkbytes} from a (molid, smiles) iterator.
    tests it and compares to sequential recreation of the molecules from smiles.
    """

    # Build the catalog
    info('Building %s catalog...' % mmapdir)
    start = time()
    mmm = MemMappedMols(mmapdir)
    if not overwrite and mmm.has_catalog():
        info('Already computed, skipping.')
    else:
        mmm.save_from_smiles_iterator(molit())
    info('Time taken to build the memmapped file: %.2f seconds' % (time() - start))

    if not checks:
        return

    # Load the catalog
    mmms = MemMappedMols(mmapdir)

    # Lame benchmark - memmapped contiguous
    info('Benchmarking contiguous memmap reading')
    start = time()
    molcount = 0
    # noinspection PyTypeChecker
    for molid in mmms.molids():
        mmms.mol(molid)
        molcount += 1
    info('Time taken to read the memmapped %d mols (contiguous): %.2f seconds' % (molcount, time() - start))

    info('Benchmarking random memmap reading')
    start = time()
    molcount = 0
    for molid in set(mmms.molids()):
        mmms.mol(molid)
        molcount += 1
    info('Time taken to read the memmapped %d mols (random): %.2f seconds' % (molcount, time() - start))

    # Lame benchmark - from smiles
    info('Benchmarking reading from the original file')
    start = time()
    molcount = 0
    for _, smiles in molit():
        Chem.MolFromSmiles(smiles)
        molcount += 1
    info('Time taken to read the smiled %d mols: %.2f seconds' % (molcount, time() - start))

    # Exhaustive linear test that all mols are correctly stored
    info('Making sure that all is OKish')
    for molid, smiles in molit():
        emol = Chem.MolFromSmiles(smiles)
        if emol is None:
            if mmms.mol(molid) is not None:
                warning('Molecule %s with original smiles %s should not be parseable from the binary store' %
                        (molid, smiles))
        else:
            if mmms.mol(molid) is not None:
                if not Chem.MolToSmiles(emol) == Chem.MolToSmiles(mmms.mol(molid)):
                    warning('Molecule %s with original smiles %s does not reconstruct properly: \n\t(%s != %s)' %
                            (molid, smiles, Chem.MolToSmiles(emol), Chem.MolToSmiles(mmms.mol(molid))))
    info('All is OKish')
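For context, the {molid -> rdkbytes} round-trip the catalog relies on is rdkit's native binary serialization:

from rdkit import Chem

mol = Chem.MolFromSmiles('c1ccccc1O')
rdkbytes = mol.ToBinary()      # what the catalog stores per molid
restored = Chem.Mol(rdkbytes)  # reconstruct without reparsing the smiles
assert Chem.MolToSmiles(mol) == Chem.MolToSmiles(restored)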
Example #18
def clean_results_pre_infojson_bug_fix():
    results = ResultInDisk.collect_results_under_dir(MALARIA_LOGREGS_EXPERIMENT_ROOT,
                                                     factory=malaria_result_factory)
    bad_results = [res for res in results if not op.isfile(op.join(res.eval_dir, 'info.json'))]
    for res in bad_results:
        info('Bye %s' % res.eval_dir)
Example #19
def logreg_deploy(dest_file=None, with_bug=False):
    """
    Generates predictions for the competition unlabelled datasets, saving them in HDF5 files.

    Generates one prediction per molecule and cross-validation experiment:

      - For the labelled set, the prediction is given by the model of the
        run where the molecule was in the testing set.

      - For the other sets, the predictions are averages of all the models
        built during cross-validation. Note that at the time of submitting
        there was a bug that made these predictions be just those of the
        last fold (see the `with_bug` parameter).


    Parameters
    ----------
    dest_file : string or None, default None
      Path to the HDF5 to store the prediction values.
      There will be as many groups in there as deployed models.
      Each group will contain 4 datasets:
        - lab: predictions on the labelled dataset
        - amb: predictions on the ambiguously labelled compounds
        - unl: predictions in the held-out competition set
        - scr: predictions in the screening dataset

    with_bug : bool, default False
      If True, predictions will be generated as for the competition
      (taking only the last fold of each experiment into account).
      If False, predictions will be generated as initially intended
      (averaging all the folds for each experiment).
      This bug does not affect the labelled scores.

    Returns
    -------
    The path to the HDF5 file where the scores have been saved.

    Side effects
    ------------
    The HDF5 file is created.
    """

    if dest_file is None:
        dest_file = malaria_logreg_deployers_file(with_bug=with_bug)

    results = logreg_experiments_to_deploy().result

    info('Deploying %d logistic regression experiments (%d classifiers)' % (
        len(results),
        sum(len(result.present_folds()) for result in results)))

    # We will have a few "features" for each deployer
    # For lab it will just be the test scores
    # For amb, unl and scr it will be the average of the scores for each cv fold

    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)

    with h5py.File(dest_file, 'w') as h5:

        for i, res in enumerate(results):

            # Deployer id
            f_name = '%s__%s' % (res.model_setup_id(), res.eval_setup_id())

            # Lab
            if '%s/lab' % f_name not in h5:
                h5['%s/lab' % f_name] = res.scores()[:, 1].astype(np.float32)

            # Get result models
            models = [res.fold_model(fold, with_bug=with_bug) for fold in res.present_folds()]

            # Amb
            if '%s/amb' % f_name not in h5:
                h5['%s/amb' % f_name] = np.nanmean([model.predict_proba(rf_amb.X())[:, 1]
                                                    for model in models], axis=0).astype(np.float32)
            # Unl
            if '%s/unl' % f_name not in h5:
                h5['%s/unl' % f_name] = np.nanmean([model.predict_proba(rf_unl.X())[:, 1]
                                                    for model in models], axis=0).astype(np.float32)
            # Scr
            if '%s/scr' % f_name not in h5:
                h5['%s/scr' % f_name] = np.nanmean([model.predict_proba(rf_scr.X())[:, 1]
                                                    for model in models], axis=0).astype(np.float32)

    return dest_file
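Reading the scores back for stacking is then a matter of walking the HDF5 groups; a sketch (malaria_logreg_deployers_file is the project helper used above):

import h5py
import numpy as np

with h5py.File(malaria_logreg_deployers_file(with_bug=False), 'r') as h5:
    deployers = sorted(h5.keys())
    Xlab = np.array([h5['%s/lab' % d][:] for d in deployers]).T  # molecules x deployers
    print('%d deployers, lab matrix %r' % (len(deployers), Xlab.shape))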
Example #20
def fit_logregs(dest_dir=MALARIA_LOGREGS_EXPERIMENT_ROOT,
                # Logreg params
                logreg_penalty='l1',
                logreg_C=1.0,
                logreg_class_weight_auto=False,
                logreg_dual=False,
                logreg_tol=1e-4,
                logreg_fit_intercept=True,
                logreg_intercept_scaling=1,
                # CV params
                num_cv_folds=10,
                cv_seeds=(0,),
                save_unlabelled_predictions=False,
                save_fold_model=False,
                min_fold_auc=0.88,
                # Fingerprint folding params
                fingerprint_folder_seed=0,
                fingerprint_fold_size=1023,
                # Computational requirements params
                force=False,
                chunksize=1000000):
    """Logistic regression experiment using the liblinear wrapper in sklearn.
    Generates cross-val results
    """

    ### TODO Remove
    if logreg_tol < 1E-5:
        info('Ignoring long intolerant experiments')
        return

    info('Malaria logregs experiment')

    # Command line type inference is rotten...
    logreg_C = float(logreg_C)
    logreg_tol = float(logreg_tol)
    logreg_intercept_scaling = float(logreg_intercept_scaling)
    num_cv_folds = int(num_cv_folds)
    min_fold_auc = float(min_fold_auc)
    fingerprint_folder_seed = int(fingerprint_folder_seed)
    fingerprint_fold_size = int(fingerprint_fold_size)
    chunksize = int(chunksize)

    # Example providers
    folder = None if fingerprint_fold_size < 1 else MurmurFolder(seed=fingerprint_folder_seed,
                                                                 fold_size=fingerprint_fold_size)
    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(folder)
    info('Data description: %s' % rf_lab.configuration().id(full=True))

    # Experiment context: data
    data_id = rf_lab.configuration().id(full=True)
    data_dir = op.join(dest_dir, data_id)
    ensure_dir(data_dir)

    for cv_seed in cv_seeds:

        # Command line type inference is rotten...
        cv_seed = int(cv_seed)

        # Deterministic randomness
        my_rng = np.random.RandomState(seed=cv_seed)

        # Experiment context: model
        logreg_params = OrderedDict((
            ('penalty', logreg_penalty),
            ('C', logreg_C),
            ('class_weight', 'auto' if logreg_class_weight_auto else None),
            ('dual', logreg_dual),
            ('tol', logreg_tol),
            ('fit_intercept', logreg_fit_intercept),
            ('intercept_scaling', logreg_intercept_scaling),
            ('random_state', my_rng.randint(low=0, high=1000 ** 4)),
        ))
        model_setup = LogisticRegression(**logreg_params)
        model_id = 'skllogreg__%s' % '__'.join(['%s=%s' % (k, str(v)) for k, v in logreg_params.iteritems()])
        model_dir = op.join(data_dir, model_id)
        ensure_dir(model_dir)
        info('Model: %s' % model_id)

        # Experiment context: eval
        eval_id = 'cv__cv_seed=%d__num_folds=%d' % (cv_seed, num_cv_folds)
        eval_dir = op.join(model_dir, eval_id)
        ensure_dir(eval_dir)
        info('Eval: %d-fold cross validation (seed=%d)' % (num_cv_folds, cv_seed))

        # Already done?
        info_file = op.join(eval_dir, 'info.json')
        if op.isfile(info_file) and not force:
            info('\tAlready done, skipping...')
            continue  # move on to the next cv seed

        # Anytime we see this file, we know we need to stop
        stop_computing_file = op.join(eval_dir, 'STOP_BAD_FOLD')

        #---------
        #--------- Time to work!
        #---------

        # Save model config
        joblib.dump(model_setup, op.join(model_dir, 'model_setup.pkl'), compress=3)

        # Read labelled data in
        info('Reading data...')
        X, y = rf_lab.Xy()
        info('ne=%d; nf=%d' % rf_lab.X().shape)

        # Save molids... a bit too ad-hoc...
        save_molids(data_dir, 'lab', rf_lab.ids())
        if save_unlabelled_predictions:
            save_molids(data_dir, 'unl', rf_unl.ids())
            save_molids(data_dir, 'scr', rf_scr.ids())
            save_molids(data_dir, 'amb', rf_amb.ids())

        # Save folding information.
        # By now, all the folds have already been computed:
        #   - because we cached X
        #   - and in this case we are guaranteed that no new unfolded features will appear at test time
        if folder is not None:
            info('Saving the map folded_features -> unfolded_feature...')
            folded2unfolded_file = op.join(data_dir, 'folded2unfolded.h5')
            if not op.isfile(folded2unfolded_file):
                with h5py.File(folded2unfolded_file, 'w') as h5:
                    h5['f2u'] = folder.folded2unfolded()
            folder_light_file = op.join(data_dir, 'folder.pkl')
            if not op.isfile(folder_light_file):
                folder_light = copy(folder)  # Shallow copy
                folder_light.clear_cache()
                joblib.dump(folder_light, folder_light_file, compress=3)

        # Cross-val splitter
        cver = cv_splits(num_points=len(y),
                         Y=y,
                         num_folds=num_cv_folds,
                         rng=my_rng,
                         stratify=True)

        # Fit and classify
        for cv_fold_num in xrange(num_cv_folds):

            fold_info_file = op.join(eval_dir, 'fold=%d__info.json' % cv_fold_num)
            if op.isfile(fold_info_file):
                info('Fold %d already done, skipping' % cv_fold_num)
                continue

            if op.isfile(stop_computing_file):
                info('Bad fold detected, no more computations required')
                break

            # Split into train/test
            train_i, test_i = cver(cv_fold_num)
            Xtrain, ytrain = X[train_i, :], y[train_i]
            Xtest, ytest = X[test_i, :], y[test_i]

            # Copy the model...
            model = clone(model_setup)

            start = time()
            info('Training...')
            model.fit(Xtrain, ytrain)
            train_time = time() - start
            info('Model fitting has taken %.2f seconds' % train_time)

            if save_fold_model:
                info('Saving trained model')
                joblib.dump(model, op.join(eval_dir, 'fold=%d__fitmodel.pkl' % cv_fold_num), compress=3)

            info('Predicting and saving results...')
            with h5py.File(op.join(eval_dir, 'fold=%d__scores.h5' % cv_fold_num), 'w') as h5:

                start = time()

                # Test indices
                h5['test_indices'] = test_i

                # Model
                h5['logreg_coef'] = model.coef_
                h5['logreg_intercept'] = model.intercept_

                # Test examples
                info('Scoring test...')
                scores_test = model.predict_proba(Xtest)
                fold_auc = roc_auc_score(ytest, scores_test[:, 1])
                fold_enrichment5 = enrichment_at(ytest, scores_test[:, 1], percentage=0.05)
                info('Fold %d ROCAUC: %.3f' % (cv_fold_num, fold_auc))
                info('Fold %d Enrichment at 5%%: %.3f' % (cv_fold_num, fold_enrichment5))
                h5['test'] = scores_test.astype(np.float32)

                if save_unlabelled_predictions:
                    predict_malaria_unlabelled(model,
                                               h5,
                                               rf_amb=rf_amb,
                                               rf_scr=rf_scr,
                                               rf_unl=rf_unl,
                                               chunksize=chunksize)

                test_time = time() - start
                info('Predicting has taken %.2f seconds' % test_time)

                # Finally save meta-information for the fold
                metainfo = mlexp_info_helper(
                    title='malaria-logregs-cv',
                    data_setup=data_id,
                    model_setup=model_id,
                    exp_function=giveupthefunc(),
                )
                metainfo.update((
                    ('train_time', train_time),
                    ('test_time', test_time),
                    ('auc', fold_auc),
                    ('enrichment5', fold_enrichment5),
                ))
                with open(fold_info_file, 'w') as writer:
                    json.dump(metainfo, writer, indent=2, sort_keys=False)

                # One last thing, should we stop now?
                if fold_auc < min_fold_auc:
                    stop_message = 'The fold %d was bad (auc %.3f < %.3f), skipping the rest of the folds' % \
                                   (cv_fold_num, fold_auc, min_fold_auc)
                    info(stop_message)
                    with open(stop_computing_file, 'w') as writer:
                        writer.write(stop_message)

        # Summarize cross-val in the info file
        metainfo = mlexp_info_helper(
            title='malaria-logregs-cv',
            data_setup=data_id,
            model_setup=model_id,
            exp_function=giveupthefunc(),
        )
        metainfo.update((
            ('num_cv_folds', num_cv_folds),
            ('cv_seed', cv_seed),
        ))
        metainfo.update(logreg_params.items())
        with open(info_file, 'w') as writer:
            json.dump(metainfo, writer, indent=2, sort_keys=False)
Example #21
def fit(dest_dir=MALARIA_TREES_EXPERIMENT_ROOT,
        seeds=(0, 1, 2, 3, 4),
        num_treess=(10, 6000, 4000, 2000, 1000, 500, 20, 50, 100),
        save_trained_models=False,
        chunksize=200000,
        num_threads=None,
        force=False):

    """Generates OOB results."""

    info('Malaria trees experiment')

    # Guess the number of threads
    if num_threads is None:
        num_threads = cpu_count()
    info('Will use %d threads' % num_threads)

    # Example providers
    info('Reading data...')
    rf_lab = MalariaRDKFsExampleSet()
    X, y = rf_lab.Xy()
    rf_unl = MalariaRDKFsExampleSet(dset='unl', remove_ambiguous=False)
    rf_scr = MalariaRDKFsExampleSet(dset='scr', remove_ambiguous=False)
    rf_amb = MalariaRDKFsExampleSet(dset='amb')
    # A bit of logging
    info('Data description: %s' % rf_lab.configuration().id(nonids_too=True))
    info('ne=%d; nf=%d' % rf_lab.X().shape)

    # Experiment context: data
    data_id = rf_lab.configuration().id(nonids_too=True)  # TODO: bring hashing from oscail
    data_dir = op.join(dest_dir, data_id)
    ensure_dir(data_dir)

    # Save molids... a bit too ad-hoc...
    info('Saving molids...')

    save_molids(data_dir, 'lab', rf_lab.ids())
    save_molids(data_dir, 'unl', rf_unl.ids())
    save_molids(data_dir, 'scr', rf_scr.ids())
    save_molids(data_dir, 'amb', rf_amb.ids())

    # Main loop - TODO: robustify with try and continue
    for etc, seed, num_trees in product((True, False), seeds, num_treess):

        # Configure the model
        if etc:
            model = ExtraTreesClassifier(n_estimators=num_trees,
                                         n_jobs=num_threads,
                                         bootstrap=True,
                                         oob_score=True,
                                         random_state=seed)
        else:
            model = RandomForestClassifier(n_estimators=num_trees,
                                           n_jobs=num_threads,
                                           oob_score=True,
                                           random_state=seed)

        # Experiment context: model
        model_id = 'trees__etc=%r__num_trees=%d__seed=%d' % (etc, num_trees, seed)  # TODO: bring self-id from oscail
        model_dir = op.join(data_dir, model_id)
        ensure_dir(model_dir)
        info('Model: %s' % model_id)

        # Experiment context: eval
        eval_id = 'oob'
        eval_dir = op.join(model_dir, eval_id)
        ensure_dir(eval_dir)
        info('Eval: OOB (Out Of Bag)')

        # Already done?
        info_file = op.join(eval_dir, 'info.json')
        if op.isfile(info_file) and not force:
            info('\tAlready done, skipping...')
            continue

        # Save model config
        joblib.dump(model, op.join(model_dir, 'model_setup.pkl'), compress=3)

        # Train-full
        info('Training...')
        start = time()
        model.fit(X, y)
        train_time = time() - start  # This is also test-time, as per OOB=True

        # Save trained model? - yeah, let's do it under oob
        if save_trained_models:
            joblib.dump(model, op.join(eval_dir, 'model_trained.pkl'), compress=3)

        # OOB score, auc and enrichment
        oob_score = model.oob_score_
        oob_scores = model.oob_decision_function_
        oob_scores_not_missing = fill_missing_scores(oob_scores[:, 1])

        auc = roc_auc_score(y, oob_scores_not_missing)
        enrichment5 = enrichment_at(y, oob_scores_not_missing, percentage=0.05)

        info('OOB AUC: %.2f' % auc)
        info('OOB Enrichment at 5%%: %.2f' % enrichment5)
        info('OOB Accuracy: %.2f' % oob_score)

        # Save scores and importances
        info('Saving results...')
        with h5py.File(op.join(eval_dir, 'oob_auc=%.2f__scores.h5' % auc), 'w') as h5:

            start = time()

            # Feature importances
            h5['f_names'] = rf_lab.fnames()
            h5['f_importances'] = model.feature_importances_

            # Labelled (development) examples
            info('Scoring lab...')
            h5['lab'] = oob_scores.astype(np.float32)

            info('Scoring amb...')
            h5['amb'] = model.predict_proba(rf_amb.X()).astype(np.float32)

            # Unlabelled (competition) examples
            info('Scoring unl...')
            h5['unl'] = model.predict_proba(rf_unl.X()).astype(np.float32)

            # Unlabelled (screening) examples
            info('Scoring scr...')
            if chunksize <= 0:
                h5['scr'] = model.predict_proba(rf_scr.X()).astype(np.float32)
            else:
                scr = h5.create_dataset('scr', shape=(rf_scr.ne_stream(), 2), dtype=np.float32)
                for i, x in enumerate(rf_scr.X_stream(chunksize=chunksize)):
                    base = i * chunksize
                    info('\t num_scr_examples: %d' % base)
                    scr[base:base + chunksize] = model.predict_proba(x)

            test_time = time() - start

        # Finally save meta-information
        metainfo = mlexp_info_helper(
            title='malaria-trees-oob',
            data_setup=data_id,
            model_setup=model_id,
            exp_function=fit,
        )
        metainfo.update((
            ('train_time', train_time),
            ('test_time', test_time),
            ('oob_auc', auc),
            ('oob_enrichment5', enrichment5),
            ('oob_accuracy', oob_score),
        ))
        with open(info_file, 'w') as writer:
            json.dump(metainfo, writer, indent=2, sort_keys=False)
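Finally, a standalone sketch of the OOB mechanics used above, on synthetic data (note that oob_decision_function_ can contain NaNs when too few trees leave a sample out of bag, which is what fill_missing_scores guards against):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

X, y = make_classification(n_samples=500, random_state=0)
model = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=0).fit(X, y)
oob_scores = model.oob_decision_function_  # shape (n_samples, n_classes)
print('OOB accuracy: %.2f' % model.oob_score_)
print('OOB AUC: %.2f' % roc_auc_score(y, oob_scores[:, 1]))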