Example #1
def logreg_molids(dset='lab'):
    # No need to do this on a per-result basis because at the moment
    # we are guaranteed that these are the same across all evaluations.
    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)
    rf = (rf_lab if dset == 'lab' else
          rf_amb if dset == 'amb' else
          rf_unl if dset == 'unl' else
          rf_scr if dset == 'scr' else
          None)
    if rf is None:
        raise ValueError('Unknown dataset %s' % dset)
    return rf.ids()
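A more compact way to express the same dispatch, sketched below as a dict lookup; `malaria_logreg_fpt_providers` and the dataset names come from the function above, while the function name itself is made up for illustration.

def logreg_molids_dict(dset='lab'):
    # Table-driven variant of the chained conditional above
    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)
    providers = {'lab': rf_lab, 'amb': rf_amb, 'unl': rf_unl, 'scr': rf_scr}
    if dset not in providers:
        raise ValueError('Unknown dataset %s' % dset)
    return providers[dset].ids()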
Example #2
def logreg_results_to_pandas(common_molids_cache=False):
    """Collects all the results in disk and place them in record-format in a pandas dataframe.
    Allows convenient reporting, grouping and filtering of results.
    """
    results = ResultInDisk.collect_results_under_dir(MALARIA_LOGREGS_EXPERIMENT_ROOT,
                                                     factory=malaria_result_factory)

    # --- molids cache
    molids_cache = None
    if common_molids_cache:
        rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)
        # Labelled molids
        lab_molids = rf_lab.ids()
        amb_molids = rf_amb.ids()  # To prioritize confirmatory tests on labelled data
        # Unlabelled molids
        unl_molids = rf_unl.ids()
        scr_molids = rf_scr.ids()
        # Let's avoid the need to reread them...
        molids_cache = {
            'lab': lab_molids,
            'amb': amb_molids,
            'unl': unl_molids,
            'scr': scr_molids
        }

    results_dict_of_dicts = {}
    for result in results:
        if common_molids_cache:
            result.ids_cache = molids_cache    # dodgy, rework with a copying constructor
        rdict = copy(result.info())
        rdict['result'] = result
        rdict['class_weight'] = 'uniform' if rdict['class_weight'] is None else rdict['class_weight']
        # Some more ad-hoc keys for the model
        rdict['num_present_folds'] = result.num_present_folds()
        rdict['auc_mean'] = result.auc_mean()
        rdict['enrichement5_mean'] = result.enrichement5_mean()
        # Some more ad-hoc keys for the fingerprint folder
        folder = result.fingerprint_folder()
        rdict['folder_seed'] = int(folder.seed) if folder is not None else -1
        rdict['folder_size'] = int(folder.fold_size) if folder is not None else 0
        # Add this result to the data frame
        results_dict_of_dicts[result.root_key()] = rdict

    return DataFrame(results_dict_of_dicts).T
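A minimal usage sketch for the dataframe built above; the column names are exactly the keys set in the loop, while the thresholds are only illustrative.

df = logreg_results_to_pandas(common_molids_cache=True)
# Keep only fully evaluated experiments with a decent mean AUC
good = df[(df.num_present_folds == df.num_cv_folds) & (df.auc_mean > 0.9)]
print(good[['auc_mean', 'enrichement5_mean', 'folder_seed', 'folder_size']])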
Example #3
def trie_bench():

    #
    # Needs to be properly done, but we already know:
    #
    #   - In terms of disk space, no option beats vanilla gzipping
    #     of the plain text data.
    #
    #   - Marisa trie can be useful to avoid memory problems: it can be
    #     memory-mapped, which makes multiprocessing easy, and it dramatically
    #     reduces the size. It is also the fastest option for this dataset
    #     (informal benchmarks not included here).
    #
    #   - The smallest memory footprint is given by plain marisa. But that is
    #     only suitable if we let marisa decide the mapping to the column index
    #     and we do not mind these indices changing each time we update the
    #     feature collection. We could always keep an extra int array with the
    #     actual index of the column and update it whenever we rewrite the trie
    #     upon the arrival of new features (see the sketch after this function).
    #
    #   - Most probably a python dict is way faster at remapping, but that is
    #     also probably irrelevant here. We still need to measure how much
    #     space these consume and how other alternatives (e.g. sparsepp) fare.
    #

    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)

    def subs(n=None, return_index='simple'):
        substructures = rf_lab.mfm().substructures()
        if n is None:
            n = len(substructures)
        for i in tqdm(range(min(n, len(substructures)))):
            if return_index == 'simple':
                yield str(substructures[i]), i
            elif return_index == 'tuple':
                yield str(substructures[i]), (i,)
            elif return_index == 'no':
                yield str(substructures[i])
            else:
                raise ValueError('return_index must be one of ["simple", "tuple", "no"], it is %r' % return_index)

    # Uncompressed: 11697K. Can mmap.
    # The mapping to the index becomes arbitrary, so we either need to define
    # a format with an auxiliary mapping (int by marisa -> int by insertion)
    # and keep it constant across new additions, or we just do not use this.
    # Speed still needs to be measured.
    trie = marisa_trie.Trie(subs(return_index='no'))
    trie.save(op.expanduser('~/substructures.marisa'))

    trie = marisa_trie.Trie()
    trie.load(op.expanduser('~/substructures.marisa'))
    for k, v in trie.iteritems():
        print(k, v)

    fmt = 'I'
    trie = marisa_trie.RecordTrie(fmt, subs(return_index='tuple'))
    trie.save(op.expanduser('~/substructures.intMarisa'))

    trie = dawg.DAWG(subs(return_index='simple'))
    trie.save(op.expanduser('~/substructures.dawg'))

    trie = dawg.IntCompletionDAWG(subs(return_index='simple'))
    trie.save(op.expanduser('~/substructures.intCompletionDawg'))

    trie = dawg.IntDAWG(subs(return_index='simple'))
    trie.save(op.expanduser('~/substructures.intDawg'))

    trie = datrie.Trie(string.printable)
    for s, i in subs(return_index='simple'):
        trie[s] = i
    trie.save(op.expanduser('~/substructures.datrie'))

    trie = datrie.BaseTrie(string.printable)
    for s, i in subs(return_index='simple'):
        trie[s] = i
    trie.save(op.expanduser('~/substructures.basedatrie'))
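A sketch of the auxiliary-mapping idea mentioned in the comments of trie_bench: let marisa assign its own arbitrary ids and keep an extra int array translating them back to insertion (column) order. Only marisa_trie and numpy are assumed; the function name is made up.

import numpy as np
import marisa_trie

def trie_with_column_map(keys):
    # keys: substructure strings, in column (insertion) order
    keys = list(keys)
    trie = marisa_trie.Trie(keys)
    # marisa2col[marisa_id] -> original column index; both the trie and this
    # array must be rebuilt whenever new features arrive
    marisa2col = np.empty(len(keys), dtype=np.int64)
    for col, key in enumerate(keys):
        marisa2col[trie[key]] = col
    return trie, marisa2col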
Example #4
def logreg_deploy(dest_file=None, with_bug=False):
    """
    Generates predictions for the competition unlabelled datasets, saving them in HDF5 files.

    Generates one prediction per molecule and cross-validation experiment:

      - For the labelled set, the prediction is given by the model of the
        run where the molecule was in the testing set.

      - For the other sets, the predictions are averages of all the models
        built during cross-validation. Note that at the time of submitting
        there was a bug that made these predictions be just the one of the
        last fold (see `with_bug` parameter).


    Parameters
    ----------
    dest_file : string or None, default None
      Path to the HDF5 to store the prediction values.
      There will be as many groups in there as deployed models.
      Each group will contain 4 datasets:
        - lab: predictions on the labelled dataset
        - amb: predictions on the ambiguously labelled compounds
        - unl: predictions on the held-out competition set
        - scr: predictions on the screening dataset

    with_bug : bool, default False
      If True, predictions will be generated as for the competition
      (taking only the last fold of each experiment into account).
      If False, predictions will be generated as initially intended
      (averaging all the folds for each experiment).
      This bug does not affect the labelled scores.

    Returns
    -------
    The path to the HDF5 file where the scores have been saved.

    Side effects
    ------------
    The HDF5 file is created.
    """

    if dest_file is None:
        dest_file = malaria_logreg_deployers_file(with_bug=with_bug)

    results = logreg_experiments_to_deploy().result

    info('Deploying %d logistic regression experiments (%d classifiers)' % (
        len(results),
        sum(len(result.present_folds()) for result in results)))

    # We will have a few "features" for each deployer
    # For lab it will just be the test scores
    # For amb, unl and scr it will be the average of the scores over the cv folds

    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)

    with h5py.File(dest_file, 'w') as h5:

        for i, res in enumerate(results):

            # Deployer id
            f_name = '%s__%s' % (res.model_setup_id(), res.eval_setup_id())

            # Lab
            if '%s/lab' % f_name not in h5:
                h5['%s/lab' % f_name] = res.scores()[:, 1].astype(np.float32)

            # Get result models
            models = [res.fold_model(fold, with_bug=with_bug) for fold in res.present_folds()]

            # Amb
            if '%s/amb' % f_name not in h5:
                h5['%s/amb' % f_name] = np.nanmean([model.predict_proba(rf_amb.X())[:, 1]
                                                    for model in models], axis=0).astype(np.float32)
            # Unl
            if '%s/unl' % f_name not in h5:
                h5['%s/unl' % f_name] = np.nanmean([model.predict_proba(rf_unl.X())[:, 1]
                                                    for model in models], axis=0).astype(np.float32)
            # Scr
            if '%s/scr' % f_name not in h5:
                h5['%s/scr' % f_name] = np.nanmean([model.predict_proba(rf_scr.X())[:, 1]
                                                    for model in models], axis=0).astype(np.float32)

    return dest_file
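A standalone sketch of the with_bug semantics documented above; in the real code the flag is handled inside res.fold_model, so this helper only illustrates the documented effect. models would be the per-fold classifiers and X a feature matrix, as in the loop.

import numpy as np

def deployment_scores(models, X, with_bug=False):
    fold_scores = [model.predict_proba(X)[:, 1] for model in models]
    if with_bug:
        # As at submission time: only the last fold counts
        return fold_scores[-1].astype(np.float32)
    # As originally intended: average over all cross-validation folds
    return np.nanmean(fold_scores, axis=0).astype(np.float32)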
Example #5
def logreg_deploy(dest_file=MALARIA_LOGREGS_DEPLOYMENT_H5):
    """Generates predictions for unlabelled datasets."""

    df = logreg_results_to_pandas()

    h5 = h5py.File(dest_file, 'w')

    # Choose a few good results (maybe apply diversity filters or ensemble selection or...)
    deployment_cond_1 = ((df.cv_seed < 5) &
                         (df.num_present_folds == df.num_cv_folds) &
                         (df.penalty == 'l1') &
                         (df.C == 1) &
                         (df.class_weight == 'auto') &
                         (df.tol == 1E-4) &
                         (df.folder_size < 1) &
                         (df.folder_seed == -1) &
                         (df.auc_mean > 0.92))

    deployment_cond_2 = ((df.num_present_folds == df.num_cv_folds) &
                         (df.penalty == 'l2') &
                         (df.C == 5) &
                         (df.class_weight == 'auto') &
                         (df.tol == 1E-4) &
                         (df.folder_size < 1) &
                         (df.folder_seed == -1) &
                         (df.auc_mean > 0.93))

    deployers = df[deployment_cond_1 | deployment_cond_2]

    info('Deploying %d logistic regressors' % len(deployers))

    # We will have 40 "features", one for each deployer
    # For lab it will just be the test scores
    # For amb, unl and scr it will be the average of the scores over the cv folds

    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(None)

    for i, res in enumerate(deployers.result):
        # What about the data setup? Here it works, but in general it would not;
        # better save it all (a new dataset with all the coords and the result path).
        f_name = '%s__%s' % (res.model_setup_id(), res.eval_setup_id())
        print(f_name)

        # Lab
        if '%s/lab' % f_name not in h5:
            h5['%s/lab' % f_name] = res.scores()[:, 1].astype(np.float32)

        # Amb
        models = [res.fold_model(fold) for fold in res.present_folds()]
        if '%s/amb' % f_name not in h5:
            h5['%s/amb' % f_name] = np.nanmean([model.predict_proba(rf_amb.X())[:, 1]
                                                for model in models], axis=0).astype(np.float32)
        # Unl
        if '%s/unl' % f_name not in h5:
            h5['%s/unl' % f_name] = np.nanmean([model.predict_proba(rf_unl.X())[:, 1]
                                                for model in models], axis=0).astype(np.float32)
        # Scr
        if '%s/scr' % f_name not in h5:
            h5['%s/scr' % f_name] = np.nanmean([model.predict_proba(rf_scr.X())[:, 1]
                                                for model in models], axis=0).astype(np.float32)

    h5.close()
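A minimal read-back sketch for the file written above; the layout (one group per deployer, with lab/amb/unl/scr datasets) is exactly what the loop creates, and simple averaging is just one way to ensemble the deployers.

import h5py
import numpy as np

with h5py.File(MALARIA_LOGREGS_DEPLOYMENT_H5, 'r') as h5:
    # One row per deployer, one column per screening molecule
    scr_scores = np.array([h5['%s/scr' % name][:] for name in h5])
    consensus = np.nanmean(scr_scores, axis=0)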