def from_feat_back_to_mols(dset, smi):
    """
    Retrieves the list of molecules that contain the given feature in the given dataset.
    Note: this is extremely slow; prefer from_feat_back_to_mols_faster.
    """
    mols = []
    molids = []
    indices = []
    classes = []
    mfm = MalariaFingerprintsManager(dset=dset)
    mc = MalariaCatalog()
    with open(mfm.original_file, 'r') as reader:
        for i, line in enumerate(reader):
            info = line.split('\t')
            molid = info[0]
            ecfps = [feat.split()[0] for feat in info[1:]]
            if smi in ecfps:
                molids.append(molid)
                indices.append(i)
                classes.append(mc.label(molid, as01=True))
                mols.append(None)
    molids = np.array(molids)
    indices = np.array(indices)
    classes = np.array(classes)
    mols = np.array(mols)
    # Fill in the SMILES for each molid (only available for the labelled set; otherwise mols stays None).
    if dset == 'lab':
        for molid, _, _, _, _, smiles in read_labelled_smiles():
            if molid in molids:
                mols[molids == molid] = smiles
    return zip(mols, molids, indices, classes)
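
A hypothetical usage sketch (the query feature SMILES below is made up; the mols
entries are None unless dset == 'lab', where they hold the molecule SMILES):

for mol, molid, index, label in from_feat_back_to_mols('lab', 'c1ccccc1'):
    print(molid, index, label)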
Example 2
def mols_having_best_feat(penalty='l1', c=1, num_folds=10):

    df_t3 = task3_res()

    # Iterate over the results (one per cv seed, for the same num_cv_folds),
    # averaging the logistic regression coefficients over the folds.
    coefs = []
    for _, gr in df_t3.result.items():
        coefs.append(np.mean(np.array([gr.logreg_coefs(i).ravel() for i in range(num_folds)]), axis=0))

    av_coefs = np.mean(np.array(coefs), axis=0)
    index_of_best = np.argmax(av_coefs)
    mfm = MalariaFingerprintsManager(dset='lab')
    feat = mfm.i2s(index_of_best)
    print(feat)
    # feat = 'n1c(S(C)(=O)=O)sc(N)c1S(c)(=O)=O'
    molids = mfm.mols_with_feature(feat)
    mc = MalariaCatalog()
    mols = mc.molids2mols(molids)
    labels = mc.molids2labels(molids, as01=True)

    print(len(mols))
    draw_in_a_grid_aligned_according_to_pattern(mols, feat,
                                                op.join(MALARIA_EXPS_ROOT, 'logregs', 'Mols_having_best_fpt.png'),
                                                legends=molids, classes=labels)
Example 3
def compute_confirmatory(deployers,
                         molids_provider,
                         outfile,
                         y_provider=None,
                         select_top=500,
                         mc=None):
    """Scores and rankings on plain-average for the labelled / ambiguous dataset."""

    # Labelled
    Xlab, f_names = deployers(dset='lab')
    if y_provider is not None:
        info('AUC after plain averaging (bagging like): %.3f' %
             roc_auc_score(y_provider(), np.nanmean(Xlab, axis=1)))
    # Ambiguous
    Xamb, _ = deployers(dset='amb')
    # All together
    X = np.vstack((Xlab, Xamb))

    # Scores are just plain averages
    scores = np.nanmean(X, axis=1)

    # Get the molids, smiles, labels, pec50
    lab_molids = molids_provider(dset='lab')
    amb_molids = molids_provider(dset='amb')
    molids = np.hstack((lab_molids, amb_molids))

    if mc is None:
        mc = MalariaCatalog()
    labels = mc.molids2labels(molids)
    pec50s = mc.molids2pec50s(molids)
    smiles = mc.molids2smiless(molids)

    # Rankings
    ranks, (sscores, smolids, slabels, spec50s, ssmiles) = \
        rank_sort(scores, (scores, molids, labels, pec50s, smiles),
                  reverse=True,
                  select_top=select_top)

    # N.B.
    # if analyzing ranking variability, use instead
    # scores2rankings()

    # Save for submission
    with open(outfile, 'w') as writer:
        for molid, smi, score in zip(smolids, ssmiles, sscores):
            writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    # Create and save a pandas series to allow further stacking
    s = Series(data=scores, index=molids)
    s.to_pickle(op.splitext(outfile)[0] + '.pkl')

    return molids, scores
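
rank_sort is a project helper; a minimal numpy sketch of the behavior assumed above
(sort by score, optionally keep only the top entries, and reorder every companion
array accordingly) could look like this:

import numpy as np

def rank_sort_sketch(scores, arrays, reverse=False, select_top=None):
    # Rank by score; with reverse=True the highest score comes first.
    order = np.argsort(scores)
    if reverse:
        order = order[::-1]
    if select_top is not None:
        order = order[:select_top]
    return order, tuple(np.asarray(a)[order] for a in arrays)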
def from_feat_back_to_mols_faster(dset, smi):
    """
    Retrieves the list of molecules that contain the given feature in the given dataset.
    """
    # The non-folded version is easy
    mfm = MalariaFingerprintsManager(dset=dset)
    X = mfm.X()
    col = mfm.s2i(smi)  # the column where we have to look for in the X matrix
    cooX = X.tocoo()
    indices_mols = cooX.row[cooX.col == col]
    molids = [mfm.i2m(i) for i in indices_mols]
    mc = MalariaCatalog()
    activities = [mc.label(molid, as01=True) for molid in molids]
    mols = mc.molids2mols(molids)
    return zip(mols, molids, indices_mols, activities)
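
The column lookup above works because COO format exposes parallel row/col index
arrays; a self-contained demonstration of the trick:

import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.array([[1, 0, 2],
                         [0, 3, 0],
                         [4, 0, 5]]))
cooX = X.tocoo()
rows_with_feature = cooX.row[cooX.col == 2]  # rows with a nonzero in column 2
print(rows_with_feature)  # -> [0 2]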
Example 5
def compute_heldout(dset,
                    deployers,
                    molids_provider,
                    outfile,
                    y_provider=None,
                    stacker=None,
                    select_top=None,
                    mc=None):
    """Predictions for the held-out sets."""
    X, _ = deployers(dset=dset)

    # Stacking or averaging?
    if stacker is not None:
        Xlab, _ = deployers(dset='lab')
        y = y_provider()
        stacker.fit(Xlab, y)  # Careful: Xlab columns can be extremely collinear...
        # Prefer probabilities when the stacker provides them, else raw predictions.
        if hasattr(stacker, 'predict_proba'):
            scores = stacker.predict_proba(X)[:, 1]
        else:
            scores = stacker.predict(X)
    else:
        scores = np.nanmean(X, axis=1)

    # Get the molids, smiles
    if mc is None:
        mc = MalariaCatalog()
    molids = molids_provider(dset=dset)
    smiles = mc.molids2smiless(molids)

    # Rankings
    ranks, (sscores, smolids, ssmiles) = \
        rank_sort(scores, (scores, molids, smiles), reverse=True, select_top=select_top)

    # Save for submission
    with open(outfile, 'w') as writer:
        for molid, smi, score in zip(smolids, ssmiles, sscores):
            writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    # Create and save a pandas series to allow further stacking
    s = Series(data=scores, index=molids)
    s.to_pickle(op.splitext(outfile)[0] + '.pkl')

    return molids, scores
def mispredicted_compounds(folding_size=None):
    """
    At each fold, collect the list of mispredicted compounds and assemble them into one list of molids.
    """
    FOLDS = range(10)
    mfm = MalariaFingerprintsManager(dset='lab')
    mc = MalariaCatalog()
    mispredicted = []
    if folding_size is None:
        path = op.join(MALARIA_EXPS_ROOT, 'folding_rdkit', 'no_folding')
    else:
        path = op.join(MALARIA_EXPS_ROOT, 'folding_rdkit', 'fs=%i' % folding_size)
    for fold in FOLDS:
        with open(op.join(path, 'fold=%i' % fold, 'results.pkl'), 'rb') as reader:
            _, scores, test_indices, _, _, _ = pickle.load(reader)
            scores = scores >= 0.5    # dummy threshold
            molids_test = [mfm.i2m(i) for i in test_indices]
            classes_test = [mc.label(molid, as01=True) for molid in molids_test]
            for i, mol in enumerate(molids_test):
                if scores[i] != classes_test[i] and not np.isnan(classes_test[i]):
                    mispredicted.append(mol)
    return mispredicted
Example 7
def munge_ecfps():

    #####---Step 1: put all the per-worker results together in 3 files: lab, unl and scr.
    #####     - ECFPs and FCFPs for the same mol are together
    #####     - The order is the same as in the original file
    #####     - Optionally delete the worker files

    def parse_weirdfpformat_line(line):
        """Returns a tuple (molid, [cansmi, count, [(center, radius)]+]+)."""
        def _parse_weird_feature(feature):
            vals = feature.split()
            cansmi = vals[0]
            count = int(vals[1])
            if len(vals) > 2:
                a = iter(vals[2:])
                centers = [(center, radius) for center, radius in zip(a, a)]
                return cansmi, count, centers
            return cansmi, count, ()
        values = line.strip().split('\t')
        molid = values[0]
        if '*FAILED*' in values[1]:
            return molid, None
        return molid, [_parse_weird_feature(f) for f in values[1:]]
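
    # Assumed shape of one weirdfps line (tab-separated; the concrete values are
    # hypothetical, reconstructed from the parser above):
    #   MOLID123<TAB>c1ccccc1 2 0 2 5 2<TAB>CCO 1
    # would parse to:
    #   ('MOLID123', [('c1ccccc1', 2, [('0', '2'), ('5', '2')]), ('CCO', 1, ())])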

    def malaria_ecfp_parallel_results_iterator(prefix='', log=True):
        """Iterates over the files resulting from the computation of ecfps using the function ecfp."""
        weirdfps = glob(op.join(_MALARIA_ECFPS_PARALLEL_RESULTS_DIR, '%s*.weirdfps' % prefix))
        weirdfps = _sort_by_start(weirdfps)
        for fn in weirdfps:
            if log:
                info(fn)
            with gzip.open(fn, 'rt') as reader:
                for line in reader:
                    yield line

    class Chihuahua(object):
        """A data processor that takes weirdfp lines, hunk them in disk and then merge them sorted in a big file.
        It can be setup to be easy on memory usage (at the cost of doubling disk space usage).
        """
        def __init__(self, molid2i, root, prefix, data2molid, chunksize=10000):
            super(Chihuahua, self).__init__()
            self.chunksize = chunksize
            self.molid2i = molid2i
            self.num_mols = len(self.molid2i)
            self.data2molid = data2molid
            self.root = root
            self.prefix = prefix
            ensure_dir(self.root)  # the directory must exist before opening the chunk files
            self.temp_fns = [op.join(root, '%s-%d' % (prefix, base)) for base in range(0, self.num_mols, chunksize)]
            self.temp_files = [open(fn, 'w') for fn in self.temp_fns]

        def process(self, moldata):
            index = self.molid2i.get(self.data2molid(moldata), None)
            if index is None:
                return
            goes_to = index // self.chunksize
            self.temp_files[goes_to].write(moldata)
            if not moldata.endswith('\n'):
                self.temp_files[goes_to].write('\n')

        def done(self):
            # Sort in memory each chunk
            for tmp in self.temp_files:
                tmp.close()
            with open(op.join(self.root, self.prefix), 'w') as writer:
                for fn in self.temp_fns:
                    with open(fn) as reader:
                        lines = sorted(reader.readlines(), key=lambda line: self.molid2i[self.data2molid(line)])
                        for line in lines:
                            writer.write(line)
            for fn in self.temp_fns:
                os.remove(fn)

    mc = MalariaCatalog()

    labproc = Chihuahua(molid2i={molid: i for i, molid in enumerate(mc.lab())},
                        root=_MALARIA_ECFPS_DIR,
                        prefix='lab',
                        data2molid=lambda line: line[0:line.find('\t')],
                        chunksize=100000)

    unlproc = Chihuahua(molid2i={molid: i for i, molid in enumerate(mc.unl())},
                        root=_MALARIA_ECFPS_DIR,
                        prefix='unl',
                        data2molid=lambda line: line[0:line.find('\t')],
                        chunksize=100000)

    scrproc = Chihuahua(molid2i={molid: i for i, molid in enumerate(mc.scr())},
                        root=_MALARIA_ECFPS_DIR,
                        prefix='scr',
                        data2molid=lambda line: line[0:line.find('\t')],
                        chunksize=100000)

    _process_molecule_data(malaria_ecfp_parallel_results_iterator(), (labproc, unlproc, scrproc))

    #####---Step 2: recode ECFPs and FCFPs from the files of step 1. After this:
    #####  - ECFP and FCFP duplicates get merged.
    #####  - Each substructure in the dataset gets a unique int [0, ...) (column number).
    #####  - Each molid for which Morgan DID NOT FAIL gets a unique row number.

    def ecfps_recode(dset='lab'):
        """Merges ECFPs and FCFPs into a single line and gets rid of the centers information if present."""
        with open(op.join(_MALARIA_ECFPS_DIR, dset)) as reader, \
                open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged'), 'w') as writer:
            for ecfp in reader:
                fcfp = next(reader)
                molide, subse = parse_weirdfpformat_line(ecfp)
                molidf, subsf = parse_weirdfpformat_line(fcfp)
                assert molide == molidf
                if subse is not None:
                    uniques = set((sub, count) for sub, count, _ in subse + subsf)
                    writer.write('%s\t%s' % (molide, '\t'.join(['%s %d' % (sub, count) for sub, count in uniques])))
                    writer.write('\n')
    ecfps_recode('lab')
    ecfps_recode('unl')
    ecfps_recode('scr')

    def sub2i():
        """Generates a map {labelled_substructure -> column}
        This produces a unique assignment for all the features in the dataset, in three files:
          - lab: the indices for all features that appear in labelled
          - unl: the indices for features that do not appear in labelled but appear in unlabelld
          - scr: the indices for the features that appear in screening but not on labelled or unlabelled
        Of course, keep track of changes to the map as needed while creating models.

        Note that this keeps all the substructures in memory (which shoould be ok for any recent machine).
        """
        def all_subs(dset):
            info(dset)
            subs = set()
            with open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged')) as reader:
                for line in reader:
                    subs.update(sub.split()[0] for sub in line.split('\t')[1:])  # TODO sort by frequency
            return subs
        lab_subs = all_subs('lab')
        unl_subs = all_subs('unl')
        scr_subs = all_subs('scr')
        with open(op.join(_MALARIA_ECFPS_DIR, 'lab.merged.s2i'), 'w') as writer:
            for i, sub in enumerate(sorted(lab_subs)):
                writer.write('%s %d\n' % (sub, i))
        num_written = len(lab_subs)
        with open(op.join(_MALARIA_ECFPS_DIR, 'unl.merged.s2i'), 'w') as writer:
            new_subs = unl_subs - lab_subs
            for i, sub in enumerate(sorted(new_subs)):
                writer.write('%s %d\n' % (sub, i + num_written))
            num_written += len(new_subs)
        with open(op.join(_MALARIA_ECFPS_DIR, 'scr.merged.s2i'), 'w') as writer:
            new_subs = scr_subs - (unl_subs | lab_subs)
            for i, sub in enumerate(sorted(new_subs)):
                writer.write('%s %d\n' % (sub, i + num_written))
        with open(op.join(_MALARIA_ECFPS_DIR, 'trans.merged.s2i'), 'w') as writer:
            for sub in sorted(lab_subs & unl_subs | lab_subs & scr_subs):
                writer.write('%s\n' % sub)
    sub2i()

    def mol2i(dset='lab'):
        """Generates a map {molid -> row}.
        Molecules for which RDKit could not generate the fingerprints are not in this map,
        nor in the final sparse matrices.
        In any case we will need to keep track of changes on the map as we do, for example, cross-val.
        """
        with open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged.m2i'), 'w') as writer:
            with open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged')) as reader:
                for line in reader:
                    writer.write('%s\n' % line[0:line.find('\t')])
    mol2i('lab')
    mol2i('unl')
    mol2i('scr')

    #####---Step 3: write sparse matrices with the recoded information of step 2. After this:
    #####  - We get an h5 file for each dataset, with a sparse matrix in CSR format.
    #####  - Note that this is a memory-intensive procedure; it could be made lighter with 2 passes.

    def to_sparse_chihuahua(dset='lab', two_pass=False):
        """Generates sparse CSR matrices using as features only these in the labelled dataset,
        with the column index and the row index as computed previously.
        They get stored in a h5 file with the following datasets:
          - data
          - indices
          - indptr
          - shape
        """
        if two_pass:
            # First pass: shape and number of nonzeros
            # Second pass: h5 file with the proper sizes of indices, indptr and data, write on the fly
            raise NotImplementedError
        # mol2row, smiles2col
        m2i = {mol.strip(): i for i, mol in enumerate(open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged.m2i')))}
        s2i = {}
        with open(op.join(_MALARIA_ECFPS_DIR, 'lab.merged.s2i')) as reader:
            for line in reader:
                sub, i = line.strip().split()
                i = int(i)
                s2i[sub] = i
        rows = array('I')
        cols = array('I')
        data = array('I')
        # gather data
        with open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged')) as reader:
            for line in reader:
                values = line.split('\t')
                molid = values[0]
                row = m2i[molid]
                for fc in values[1:]:
                    sub, count = fc.split()
                    count = int(count)
                    col = s2i.get(sub, None)
                    if col is not None:
                        rows.append(row)
                        cols.append(col)
                        data.append(count)
        # save as CSR sparse matrix
        M = coo_matrix((data, (rows, cols)), shape=(len(m2i), len(s2i)), dtype=np.int32).tocsr()
        with h5py.File(op.join(_MALARIA_ECFPS_DIR, dset + '.sparse.h5'), 'w') as h5:
            h5['indices'] = M.indices
            h5['indptr'] = M.indptr
            h5['data'] = M.data  # N.B. the CSR-ordered data, not the raw COO array
            h5['shape'] = np.array(M.shape)
    to_sparse_chihuahua('lab')
    to_sparse_chihuahua('unl')
    to_sparse_chihuahua('scr')

    #####---Step 4: lame feature duplicate detection to partially tackle multicollinearity
    MalariaFingerprintsManager(zero_dupes='lab').X()
    MalariaFingerprintsManager(zero_dupes='all').X()
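
A minimal sketch (assuming only the four h5 datasets written in step 3) of how to
load one of the sparse files back into a scipy CSR matrix:

import h5py
from scipy.sparse import csr_matrix

def load_sparse_h5(path):
    with h5py.File(path, 'r') as h5:
        return csr_matrix((h5['data'][:], h5['indices'][:], h5['indptr'][:]),
                          shape=tuple(h5['shape'][:]))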
Example 8
def final_merged_submissions(calibrate=False, dest_dir=MALARIA_EXPS_ROOT):
    """Very ad-hoc merge of submissions obtained with trees and logistic regressors."""

    #####
    # 0 Preparations
    #####

    # Avoid circular imports
    from ccl_malaria.logregs_fit import MALARIA_LOGREGS_EXPERIMENT_ROOT
    from ccl_malaria.trees_fit import MALARIA_TREES_EXPERIMENT_ROOT

    mc = MalariaCatalog()

    def save_submission(sub, outfile, select_top=500):
        # Get the smiles
        smiles = mc.molids2smiless(sub.index)

        # Rankings
        ranks, (sscores, smolids, ssmiles) = \
            rank_sort(sub.values, (sub.values,
                                   sub.index.values,
                                   smiles), reverse=True, select_top=select_top)
        # Save for submission
        with open(outfile, 'w') as writer:
            for molid, smi, score in zip(smolids, ssmiles, sscores):
                writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    #####
    # 1 Robust merge using pandas
    #####
    def read_average_merge(root, prefix):
        hit = pd.read_pickle(op.join(root, '%s_hitSelection.pkl' % prefix))
        labels = mc.molids2labels(hit.index, as01=True)
        lab = hit[~np.isnan(labels)]
        amb = hit[np.isnan(labels)]
        unl = pd.read_pickle(op.join(root, '%s_unl-averaged.pkl' % prefix))
        scr = pd.read_pickle(op.join(root, '%s_scr-averaged.pkl' % prefix))
        return lab, amb, unl, scr
    tlab, tamb, tunl, tscr = read_average_merge(MALARIA_TREES_EXPERIMENT_ROOT, 'trees')
    llab, lamb, lunl, lscr = read_average_merge(MALARIA_LOGREGS_EXPERIMENT_ROOT, 'logreg')

    lab = DataFrame({'trees': tlab, 'logregs': llab})
    lab['labels'] = mc.molids2labels(lab.index, as01=True)
    assert np.sum(np.isnan(lab.labels)) == 0
    amb = DataFrame({'trees': tamb, 'logregs': lamb})
    unl = DataFrame({'trees': tunl, 'logregs': lunl})
    scr = DataFrame({'trees': tscr, 'logregs': lscr})

    # ATM we take it easy and just drop any NA
    lab.dropna(inplace=True)
    amb.dropna(inplace=True)
    unl.dropna(inplace=True)
    scr.dropna(inplace=True)

    #####
    # 2 Calibration on labelling - careful with overfitting for hitList, do it in cross-val fashion
    #####
    def calibrate_col(col):
        calibrator = IsotonicRegression(y_min=0, y_max=1)
        x = lab[~np.isnan(lab[col])][col].values
        y = lab[~np.isnan(lab[col])]['labels'].values
        calibrator.fit(x, y)
        lab[col] = calibrator.predict(lab[col].values)
        amb[col] = calibrator.predict(amb[col].values)
        unl[col] = calibrator.predict(unl[col].values)
        scr[col] = calibrator.predict(scr[col].values)
    if calibrate:
        calibrate_col('trees')
        calibrate_col('logregs')

    #####
    # 3 Average for the submission in lab-amb
    #####
    submission_lab = (lab.trees + lab.logregs) / 2
    submission_amb = (amb.trees + amb.logregs) / 2
    submission_hts = pd.concat((submission_lab, submission_amb))

    outfile = op.join(dest_dir, 'final-merged-%s-hitSelection.csv' % ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_hts, outfile)

    #####
    # 4 Average predictions for unlabelled
    #####
    submission_unl_avg = (unl.trees + unl.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-unl.csv' % ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_unl_avg, outfile, select_top=None)

    submission_scr_avg = (scr.trees + scr.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-scr.csv' % ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_scr_avg, outfile, select_top=1000)

    #####
    # 5 Stacked (linear regression) for unlabelled
    #####
    stacker = LinearRegression()
    stacker.fit(lab[['trees', 'logregs']], lab.labels)

    submission_unl_st = Series(data=stacker.predict(unl[['trees', 'logregs']]), index=unl.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-unl.csv' % ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_unl_st, outfile, select_top=None)

    submission_scr_st = Series(data=stacker.predict(scr[['trees', 'logregs']]), index=scr.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-scr.csv' % ('calibrated' if calibrate else 'nonCalibrated'))
    save_submission(submission_scr_st, outfile, select_top=1000)
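
A self-contained sketch of the isotonic calibration used in step 2 above (toy data;
out_of_bounds='clip' is an assumption to keep predictions defined outside the
training score range):

import numpy as np
from sklearn.isotonic import IsotonicRegression

raw_scores = np.array([0.1, 0.4, 0.35, 0.8])
labels = np.array([0, 0, 1, 1])
calibrator = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
calibrator.fit(raw_scores, labels)
print(calibrator.predict(np.array([0.2, 0.9])))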
Example 9
def merge_submissions(calibrate=False,
                      select_top_scr=None,
                      with_bug=False,
                      dest_dir=MALARIA_EXPS_ROOT):
    """Very ad-hoc merge of submissions obtained with trees and logistic regressors."""

    #####
    # 0 Preparations
    #####

    # Avoid circular imports
    from ccl_malaria.logregs_fit import MALARIA_LOGREGS_EXPERIMENT_ROOT
    from ccl_malaria.logregs_analysis import malaria_logreg_file_prefix
    from ccl_malaria.trees_fit import MALARIA_TREES_EXPERIMENT_ROOT

    mc = MalariaCatalog()

    ensure_dir(dest_dir)

    def save_submission(sub, outfile, select_top=500):
        # Get the smiles
        smiles = mc.molids2smiless(sub.index)

        # Rankings
        ranks, (sscores, smolids, ssmiles) = \
            rank_sort(sub.values, (sub.values,
                                   sub.index.values,
                                   smiles), reverse=True, select_top=select_top)
        # Save for submission
        with open(outfile, 'w') as writer:
            for molid, smi, score in zip(smolids, ssmiles, sscores):
                writer.write('%s,%s,%.6f\n' % (molid, smi, score))

    #####
    # 1 Robust merge using pandas
    #####
    def read_average_merge(root, prefix):
        hit = pd.read_pickle(op.join(root, '%s_hitSelection.pkl' % prefix))
        labels = mc.molids2labels(hit.index, as01=True)
        lab = hit[~np.isnan(labels)]
        amb = hit[np.isnan(labels)]
        unl = pd.read_pickle(op.join(root, '%s_unl-averaged.pkl' % prefix))
        scr = pd.read_pickle(op.join(root, '%s_scr-averaged.pkl' % prefix))
        return lab, amb, unl, scr

    tlab, tamb, tunl, tscr = read_average_merge(MALARIA_TREES_EXPERIMENT_ROOT, 'trees')
    llab, lamb, lunl, lscr = read_average_merge(MALARIA_LOGREGS_EXPERIMENT_ROOT,
                                                malaria_logreg_file_prefix(with_bug=with_bug))

    lab = DataFrame({'trees': tlab, 'logregs': llab})
    lab['labels'] = mc.molids2labels(lab.index, as01=True)
    assert np.sum(np.isnan(lab['labels'])) == 0
    amb = DataFrame({'trees': tamb, 'logregs': lamb})
    unl = DataFrame({'trees': tunl, 'logregs': lunl})
    scr = DataFrame({'trees': tscr, 'logregs': lscr})

    # ATM we take it easy and just drop any NA
    lab.dropna(inplace=True)
    amb.dropna(inplace=True)
    unl.dropna(inplace=True)
    scr.dropna(inplace=True)

    #####
    # 2 Calibration on labelling - careful with overfitting for hitList, do it in cross-val fashion
    #####
    def calibrate_col(col):
        # isotonic not the best here, and faces numerical issues
        calibrator = IsotonicRegression(y_min=0, y_max=1)
        x = lab[~np.isnan(lab[col])][col].values
        y = lab[~np.isnan(lab[col])]['labels'].values
        try:
            # Old sklearn
            calibrator.fit(x.reshape(-1, 1), y)
            lab[col] = calibrator.predict(lab[col].values.reshape(-1, 1))
            amb[col] = calibrator.predict(amb[col].values.reshape(-1, 1))
            unl[col] = calibrator.predict(unl[col].values.reshape(-1, 1))
            scr[col] = calibrator.predict(scr[col].values.reshape(-1, 1))
        except ValueError:
            # Newer sklearn
            calibrator.fit(x.ravel(), y)
            lab[col] = calibrator.predict(lab[col].values.ravel())
            amb[col] = calibrator.predict(amb[col].values.ravel())
            unl[col] = calibrator.predict(unl[col].values.ravel())
            scr[col] = calibrator.predict(scr[col].values.ravel())

    if calibrate:
        calibrate_col('trees')
        calibrate_col('logregs')

    #####
    # 3 Average for the submission in lab-amb
    #####
    submission_lab = (lab.trees + lab.logregs) / 2
    submission_amb = (amb.trees + amb.logregs) / 2
    submission_hts = pd.concat((submission_lab, submission_amb))

    submission_options = '%s-%s' % (
        'calibrated' if calibrate else 'nonCalibrated',
        'lastFold' if with_bug else 'averageFolds')

    outfile = op.join(dest_dir, 'final-merged-%s-hitSelection.csv' % submission_options)
    save_submission(submission_hts, outfile)

    #####
    # 4 Average predictions for unlabelled
    #####
    submission_unl_avg = (unl.trees + unl.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-unl.csv' % submission_options)
    save_submission(submission_unl_avg, outfile, select_top=None)

    submission_scr_avg = (scr.trees + scr.logregs) / 2
    outfile = op.join(dest_dir, 'final-%s-avg-scr.csv' % submission_options)
    save_submission(submission_scr_avg, outfile, select_top=select_top_scr)

    #####
    # 5 Stacked (linear regression) for unlabelled
    #####
    stacker = LinearRegression()
    stacker.fit(lab[['trees', 'logregs']], lab['labels'])

    def robust_predict(X):
        X = np.asarray(X)
        row_is_finite = np.all(np.isfinite(X), axis=1)
        scores = np.full(len(X), fill_value=np.nan)
        scores[row_is_finite] = stacker.predict(X[row_is_finite])
        return scores

    # noinspection PyArgumentList
    submission_unl_st = Series(data=robust_predict(unl[['trees', 'logregs']]), index=unl.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-unl.csv' % submission_options)
    save_submission(submission_unl_st, outfile, select_top=None)

    # noinspection PyArgumentList
    submission_scr_st = Series(data=robust_predict(scr[['trees', 'logregs']]), index=scr.index)
    outfile = op.join(dest_dir, 'final-%s-stacker=linr-scr.csv' % submission_options)
    save_submission(submission_scr_st, outfile, select_top=select_top_scr)