Exemplo n.º 1
0
def fit_logregs(dest_dir=MALARIA_LOGREGS_EXPERIMENT_ROOT,
                # Logreg params
                logreg_penalty='l1',
                logreg_C=1.0,
                logreg_class_weight_auto=False,
                logreg_dual=False,
                logreg_tol=1e-4,
                logreg_fit_intercept=True,
                logreg_intercept_scaling=1,
                # CV params
                num_cv_folds=10,
                cv_seeds=(0,),
                save_unlabelled_predictions=False,
                save_fold_model=False,
                min_fold_auc=0.88,
                # Fingerprint folding params
                fingerprint_folder_seed=0,
                fingerprint_fold_size=1023,
                # Computational requirements params
                force=False,
                chunksize=1000000):
    """Logistic regression experiment using the liblinear wrapper in sklearn.

    Generates cross-validation results. For each seed in `cv_seeds` a directory
    tree `dest_dir/<data_id>/<model_id>/<eval_id>` is created and, per fold,
    an HDF5 file with the test indices, the model coefficients and the test
    scores plus a JSON file with timings, AUC and enrichment are written.
    Once the fold loop finishes (or is interrupted by a bad fold), a summary
    `info.json` is written in the evaluation directory.

    Parameters
    ----------
    dest_dir : path under which all the results are stored.
    logreg_penalty, logreg_C, logreg_dual, logreg_tol, logreg_fit_intercept,
    logreg_intercept_scaling : forwarded to `LogisticRegression` as
        `penalty`, `C`, `dual`, `tol`, `fit_intercept`, `intercept_scaling`.
    logreg_class_weight_auto : if true, `class_weight='auto'` is used,
        otherwise `class_weight=None`.
    num_cv_folds : number of cross-validation folds.
    cv_seeds : iterable of seeds; each seed drives both the CV splits and
        the model `random_state`.
    save_unlabelled_predictions : if true, the molids of the unl/scr/amb sets
        are saved and `predict_malaria_unlabelled` is called for each fold.
    save_fold_model : if true, each fitted fold model is pickled.
    min_fold_auc : if a fold AUC falls below this value, a `STOP_BAD_FOLD`
        file is written and the remaining folds are not computed.
    fingerprint_folder_seed, fingerprint_fold_size : configuration of the
        `MurmurFolder`; a fold size < 1 disables folding.
    force : if true, recompute even if `info.json` already exists
        (folds whose own info file exists are still skipped).
    chunksize : forwarded to `predict_malaria_unlabelled`.

    All the numeric parameters may also be given as strings (e.g. when they
    come from the command line); they are coerced to numbers here.
    """

    # Command line type inference is rotten...
    # Coerce *before* any numeric comparison, so that string values coming
    # from the command line are compared as numbers.
    logreg_C = float(logreg_C)
    logreg_tol = float(logreg_tol)
    logreg_intercept_scaling = float(logreg_intercept_scaling)
    num_cv_folds = int(num_cv_folds)
    min_fold_auc = float(min_fold_auc)
    fingerprint_folder_seed = int(fingerprint_folder_seed)
    fingerprint_fold_size = int(fingerprint_fold_size)
    chunksize = int(chunksize)

    ### TODO Remove
    if logreg_tol < 1E-5:
        info('Ignoring long intolerant experiments')
        return

    info('Malaria logregs experiment')

    # Example providers
    folder = None if fingerprint_fold_size < 1 else MurmurFolder(seed=fingerprint_folder_seed,
                                                                 fold_size=fingerprint_fold_size)
    rf_lab, rf_amb, rf_unl, rf_scr = malaria_logreg_fpt_providers(folder)
    info('Data description: %s' % rf_lab.configuration().id(full=True))

    # Experiment context: data
    data_id = rf_lab.configuration().id(full=True)
    data_dir = op.join(dest_dir, data_id)
    ensure_dir(data_dir)

    for cv_seed in cv_seeds:

        # Command line type inference is rotten...
        cv_seed = int(cv_seed)

        # Deterministic randomness
        my_rng = np.random.RandomState(seed=cv_seed)

        # Experiment context: model
        logreg_params = OrderedDict((
            ('penalty', logreg_penalty),
            ('C', logreg_C),
            ('class_weight', 'auto' if logreg_class_weight_auto else None),
            ('dual', logreg_dual),
            ('tol', logreg_tol),
            ('fit_intercept', logreg_fit_intercept),
            ('intercept_scaling', logreg_intercept_scaling),
            ('random_state', my_rng.randint(low=0, high=1000 ** 4)),
        ))
        model_setup = LogisticRegression(**logreg_params)
        # The model id doubles as directory name; parameter order is kept stable by the OrderedDict
        model_id = 'skllogreg__%s' % '__'.join(['%s=%s' % (k, str(v)) for k, v in logreg_params.items()])
        model_dir = op.join(data_dir, model_id)
        ensure_dir(model_dir)
        info('Model: %s' % model_id)

        # Experiment context: eval
        eval_id = 'cv__cv_seed=%d__num_folds=%d' % (cv_seed, num_cv_folds)
        eval_dir = op.join(model_dir, eval_id)
        ensure_dir(eval_dir)
        info('Eval: %d-fold cross validation (seed=%d)' % (num_cv_folds, cv_seed))

        # Already done?
        info_file = op.join(eval_dir, 'info.json')
        if op.isfile(info_file) and not force:
            info('\tAlready done, skipping...')
            # Skip only this seed: the remaining seeds might still need to be computed.
            # (Oh well, a lot have been done up to here... rework somehow)
            continue

        # Anytime we see this file, we know we need to stop
        stop_computing_file = op.join(eval_dir, 'STOP_BAD_FOLD')

        #---------
        #--------- Time to work!
        #---------

        # Save model config
        joblib.dump(model_setup, op.join(model_dir, 'model_setup.pkl'), compress=3)

        # Read labelled data in
        info('Reading data...')
        X, y = rf_lab.Xy()
        info('ne=%d; nf=%d' % rf_lab.X().shape)

        # Save molids... a bit too ad-hoc...
        save_molids(data_dir, 'lab', rf_lab.ids())
        if save_unlabelled_predictions:
            save_molids(data_dir, 'unl', rf_unl.ids())
            save_molids(data_dir, 'scr', rf_scr.ids())
            save_molids(data_dir, 'amb', rf_amb.ids())

        # Save folding information.
        # By now, all the folds have already been computed:
        #   - because we cached X
        #   - and in this case we are warranted that no new unfolded features will appear at test time
        if folder is not None:
            info('Saving the map folded_features -> unfolded_feature...')
            folded2unfolded_file = op.join(data_dir, 'folded2unfolded.h5')
            if not op.isfile(folded2unfolded_file):
                # Explicit write mode: the default mode of h5py.File depends on the h5py version
                with h5py.File(folded2unfolded_file, 'w') as h5:
                    h5['f2u'] = folder.folded2unfolded()
            folder_light_file = op.join(data_dir, 'folder.pkl')
            if not op.isfile(folder_light_file):
                folder_light = copy(folder)  # Shallow copy
                folder_light.clear_cache()
                joblib.dump(folder_light, folder_light_file, compress=3)

        # Cross-val splitter
        cver = cv_splits(num_points=len(y),
                         Y=y,
                         num_folds=num_cv_folds,
                         rng=my_rng,
                         stratify=True)

        # Fit and classify
        for cv_fold_num in range(num_cv_folds):

            fold_info_file = op.join(eval_dir, 'fold=%d__info.json' % cv_fold_num)
            if op.isfile(fold_info_file):
                info('Fold %d already done, skipping' % cv_fold_num)
                continue

            # The stop file can come from a previous fold of this run or from an earlier run
            if op.isfile(stop_computing_file):
                info('Bad fold detected, no more computations required')
                break

            # Split into train/test
            train_i, test_i = cver(cv_fold_num)
            Xtrain, ytrain = X[train_i, :], y[train_i]
            Xtest, ytest = X[test_i, :], y[test_i]

            # Copy the model...
            model = clone(model_setup)

            start = time()
            info('Training...')
            model.fit(Xtrain, ytrain)
            train_time = time() - start
            info('Model fitting has taken %.2f seconds' % train_time)

            if save_fold_model:
                info('Saving trained model')
                joblib.dump(model, op.join(eval_dir, 'fold=%d__fitmodel.pkl' % cv_fold_num), compress=3)

            info('Predicting and saving results...')
            with h5py.File(op.join(eval_dir, 'fold=%d__scores.h5' % cv_fold_num), 'w') as h5:

                start = time()

                # Test indices
                h5['test_indices'] = test_i

                # Model
                h5['logreg_coef'] = model.coef_
                h5['logreg_intercept'] = model.intercept_

                # Test examples
                info('Scoring test...')
                scores_test = model.predict_proba(Xtest)
                # Column 1 of predict_proba is used as the score for AUC and enrichment
                fold_auc = roc_auc_score(ytest, scores_test[:, 1])
                fold_enrichment5 = enrichment_at(ytest, scores_test[:, 1], percentage=0.05)
                info('Fold %d ROCAUC: %.3f' % (cv_fold_num, fold_auc))
                info('Fold %d Enrichment at 5%%: %.3f' % (cv_fold_num, fold_enrichment5))
                h5['test'] = scores_test.astype(np.float32)

                if save_unlabelled_predictions:
                    predict_malaria_unlabelled(model,
                                               h5,
                                               rf_amb=rf_amb,
                                               rf_scr=rf_scr,
                                               rf_unl=rf_unl,
                                               chunksize=chunksize)

                test_time = time() - start
                info('Predicting has taken %.2f seconds' % test_time)

                # Finally save meta-information for the fold
                # NOTE(review): the title looks copy-pasted from the trees experiment;
                # kept as is because downstream analysis may rely on it - confirm and rename.
                metainfo = mlexp_info_helper(
                    title='malaria-trees-oob',
                    data_setup=data_id,
                    model_setup=model_id,
                    exp_function=giveupthefunc(),
                )
                metainfo.update((
                    ('train_time', train_time),
                    ('test_time', test_time),
                    ('auc', fold_auc),
                    ('enrichment5', fold_enrichment5),
                ))
                with open(fold_info_file, 'w') as writer:
                    json.dump(metainfo, writer, indent=2, sort_keys=False)

                # One last thing, should we stop now?
                if fold_auc < min_fold_auc:
                    stop_message = 'The fold %d was bad (auc %.3f < %.3f), skipping the rest of the folds' % \
                                   (cv_fold_num, fold_auc, min_fold_auc)
                    info(stop_message)
                    # The marker file is picked up at the beginning of the next fold iteration
                    with open(stop_computing_file, 'w') as writer:
                        writer.write(stop_message)

        # Summarize cross-val in the info file
        # (written also when the fold loop was interrupted by a bad fold)
        # NOTE(review): same copy-pasted title as in the fold meta-information - confirm.
        metainfo = mlexp_info_helper(
            title='malaria-trees-oob',
            data_setup=data_id,
            model_setup=model_id,
            exp_function=giveupthefunc(),
        )
        metainfo.update((
            ('num_cv_folds', num_cv_folds),
            ('cv_seed', cv_seed),
        ))
        metainfo.update(logreg_params.items())
        with open(info_file, 'w') as writer:
            json.dump(metainfo, writer, indent=2, sort_keys=False)
Exemplo n.º 2
0
def fit(dest_dir=MALARIA_TREES_EXPERIMENT_ROOT,
        seeds=(0, 1, 2, 3, 4),
        num_treess=(10, 6000, 4000, 2000, 1000, 500, 20, 50, 100),
        save_trained_models=False,
        chunksize=200000,
        num_threads=None,
        force=False):
    """Tree ensembles experiment (ExtraTrees and RandomForest), generates OOB results.

    For each combination of (extra-trees or random forest, seed, number of trees)
    a model is trained on the whole labelled set with `oob_score=True`, and the
    following is saved under `dest_dir/<data_id>/<model_id>/oob`:
      - an HDF5 file with feature names and importances, the OOB scores for the
        labelled examples and the predicted probabilities for the amb, unl and
        scr datasets
      - an `info.json` file with timings, OOB AUC, enrichment and accuracy

    Parameters
    ----------
    dest_dir : path under which all the results are stored.
    seeds : iterable of values used as `random_state` of the models.
    num_treess : iterable of values used as `n_estimators` of the models.
    save_trained_models : if true, each trained model is pickled.
    chunksize : number of screening examples scored at once; if <= 0 the
        whole screening set is scored in a single call.
    num_threads : `n_jobs` for the models; if None, `cpu_count()` is used.
    force : if true, recompute even if `info.json` already exists.
    """

    info('Malaria trees experiment')

    # Guess the number of threads
    if num_threads is None:
        num_threads = cpu_count()
    info('Will use %d threads' % num_threads)

    # Example providers
    info('Reading data...')
    rf_lab = MalariaRDKFsExampleSet()
    X, y = rf_lab.Xy()
    rf_unl = MalariaRDKFsExampleSet(dset='unl', remove_ambiguous=False)
    rf_scr = MalariaRDKFsExampleSet(dset='scr', remove_ambiguous=False)
    rf_amb = MalariaRDKFsExampleSet(dset='amb')
    # A bit of logging
    info('Data description: %s' % rf_lab.configuration().id(nonids_too=True))
    info('ne=%d; nf=%d' % rf_lab.X().shape)

    # Experiment context: data
    data_id = rf_lab.configuration().id(nonids_too=True)  # TODO: bring hashing from oscail
    data_dir = op.join(dest_dir, data_id)
    ensure_dir(data_dir)

    # Save molids... a bit too ad-hoc...
    info('Saving molids...')

    save_molids(data_dir, 'lab', rf_lab.ids())
    save_molids(data_dir, 'unl', rf_unl.ids())
    save_molids(data_dir, 'scr', rf_scr.ids())
    save_molids(data_dir, 'amb', rf_amb.ids())

    # Main loop - TODO: robustify with try and continue
    for etc, seed, num_trees in product((True, False), seeds, num_treess):

        # Configure the model
        if etc:
            # bootstrap=True is requested explicitly here, together with oob_score=True
            model = ExtraTreesClassifier(n_estimators=num_trees,
                                         n_jobs=num_threads,
                                         bootstrap=True,
                                         oob_score=True,
                                         random_state=seed)
        else:
            model = RandomForestClassifier(n_estimators=num_trees,
                                           n_jobs=num_threads,
                                           oob_score=True,
                                           random_state=seed)

        # Experiment context: model
        model_id = 'trees__etc=%r__num_trees=%d__seed=%d' % (etc, num_trees, seed)  # TODO: bring self-id from oscail
        model_dir = op.join(data_dir, model_id)
        ensure_dir(model_dir)
        info('Model: %s' % model_id)

        # Experiment context: eval
        eval_id = 'oob'
        eval_dir = op.join(model_dir, eval_id)
        ensure_dir(eval_dir)
        info('Eval: OOB (Out Of Bag)')

        # Already done?
        info_file = op.join(eval_dir, 'info.json')
        if op.isfile(info_file) and not force:
            info('\tAlready done, skipping...')
            continue

        # Save model config
        joblib.dump(model, op.join(model_dir, 'model_setup.pkl'), compress=3)

        # Train-full
        info('Training...')
        start = time()
        model.fit(X, y)
        train_time = time() - start  # This is also test-time, as per OOB=True

        # Save trained model? - yeah, lets do it under oob
        if save_trained_models:
            joblib.dump(model, op.join(eval_dir, 'model_trained.pkl'), compress=3)

        # OOB score, auc and enrichment
        oob_score = model.oob_score_
        oob_scores = model.oob_decision_function_
        # Column 1 of the OOB decision function is used as the score;
        # fill_missing_scores is applied to it before computing the metrics
        oob_scores_not_missing = fill_missing_scores(oob_scores[:, 1])

        auc = roc_auc_score(y,  oob_scores_not_missing)
        enrichment5 = enrichment_at(y, oob_scores_not_missing, percentage=0.05)

        info('OOB AUC: %.2f' % auc)
        info('OOB Enrichment at 5%%: %.2f' % enrichment5)
        info('OOB Accuracy: %.2f' % oob_score)

        # Save scores and importances
        info('Saving results...')
        with h5py.File(op.join(eval_dir, 'oob_auc=%.2f__scores.h5' % auc), 'w') as h5:

            start = time()

            # Feature importances
            h5['f_names'] = rf_lab.fnames()
            h5['f_importances'] = model.feature_importances_

            # Labelled (development) examples
            info('Scoring lab...')
            h5['lab'] = oob_scores.astype(np.float32)

            info('Scoring amb...')
            h5['amb'] = model.predict_proba(rf_amb.X()).astype(np.float32)

            # Unlabelled (competition) examples
            info('Scoring unl...')
            h5['unl'] = model.predict_proba(rf_unl.X()).astype(np.float32)

            # Unlabelled (screening) examples
            info('Scoring scr...')
            if chunksize <= 0:
                # Probabilities must be stored as floats, like all the other score datasets
                # (casting them to an integer type would truncate them to 0/1)
                h5['scr'] = model.predict_proba(rf_scr.X()).astype(np.float32)
            else:
                scr = h5.create_dataset('scr', shape=(rf_scr.ne_stream(), 2), dtype=np.float32)
                for i, x in enumerate(rf_scr.X_stream(chunksize=chunksize)):
                    base = i * chunksize
                    info('\t num_scr_examples: %d' % base)
                    chunk_scores = model.predict_proba(x)
                    # Write exactly as many rows as were scored (the last chunk can be shorter)
                    scr[base:base + chunk_scores.shape[0]] = chunk_scores

            test_time = time() - start

        # Finally save meta-information
        metainfo = mlexp_info_helper(
            title='malaria-trees-oob',
            data_setup=data_id,
            model_setup=model_id,
            exp_function=fit,
        )
        metainfo.update((
            ('train_time', train_time),
            ('test_time', test_time),
            ('oob_auc', auc),
            ('oob_enrichment5', enrichment5),
            ('oob_accuracy', oob_score),
        ))
        with open(info_file, 'w') as writer:
            json.dump(metainfo, writer, indent=2, sort_keys=False)