Пример #1
0
def molecules_coocurrences_df(dset='bcrp',
                              expids=None,
                              feats='ecfps1',
                              model='logreg3',
                              lso=True):
    """
    Returns a 4-tuple (coocurrences, molids, expids, folds) describing which molecules were tested together.

      - coocurrences is a boolean matrix with one row per (expid, fold) pair;
        in each row, the test indices of that fold are True
      - molids are the molecule ids as returned by ManysourcesResult.molids()
      - expids is an array with the expid of each row of coocurrences
      - folds is an array with the fold number of each row of coocurrences

    The result is cached in 'molecules_coocurrences_df.h5' under the results directory,
    keyed by (dset, feats, model, lso); it is only computed if that key is not in the cache.
    If expids is None, experiments 0..4095 are used. Experiments without cross-validation
    results (the cv object is None) are skipped.
    """
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'molecules_coocurrences_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)
    molecules_dataset_id = '/dset=%s/molecules' % dset
    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            coocurrences = []
            valid_expids = []
            fold_ids = []
            for expid in expids:
                print('%s %s %s %s %s' % (dset, expid, model, feats, lso))
                res = ManysourcesResult(expid=expid, dset=dset, feats=feats, model=model)
                try:
                    # The molecule ids are stored once per dataset, from the first result seen
                    if molecules_dataset_id not in h5:
                        h5[molecules_dataset_id] = res.molids()
                    cv = res.lsocv() if lso else res.crscv()
                    if cv is None:
                        continue
                    for fold_num, fold in enumerate(cv.folds()):
                        try:
                            c = np.zeros(len(res.molids()), dtype=int)
                            c[fold.test_indices()] = 1
                        except Exception as ex:
                            # Best effort: a broken fold is skipped, but no longer silently
                            print('Warning, could not process fold %s of expid %s: %s' % (fold_num, expid, ex))
                            continue
                        coocurrences.append(c)
                        valid_expids.append(expid)
                        fold_ids.append(fold_num)
                finally:
                    # Always release the result, also when skipping it or when something fails
                    res._close_h5()
            group = h5.require_group(group_id)
            group['coocurrences'] = np.array(coocurrences)
            group['expids'] = valid_expids
            group['folds'] = fold_ids
    with h5py.File(cache_file, 'r') as h5:
        molids = h5[molecules_dataset_id][:]
        coocurrences = h5[group_id]['coocurrences'][:].astype(bool)
        expids = h5[group_id]['expids'][:]
        folds = h5[group_id]['folds'][:]
        return coocurrences, molids, expids, folds
Пример #2
0
def merge_cache_scores(dset_id='bcrp',
                       expids=None,
                       feats='ecfps1',
                       model='logreg1',
                       lso=True,
                       calib=None):
    """
    Returns a 5-tuple (scores, expids, folds, molids, y) where:

      - scores is a num_mols x num_valid_expids matrix with the scores of each molecule in each (valid) experiment
      - expids is a num_valid_expids array of valid expids
      - folds is a num_mols x num_valid_expids matrix of fold assignment for each molecule in each (valid) experiment
      - molids is a num_mols array with the pertinent mol_ids
      - y is a num_mols array with the labels of the molecules

    Returns None if any of these cannot be read back from the cache.

    The result is cached in 'scores_df.h5' under the results directory, keyed by
    (dset_id, feats, model, lso, calib); it is only computed if that key is not in the cache.
    If expids is None, experiments 0..4095 are used. Experiments whose scores cannot be
    merged are skipped (these are the non-valid experiments).
    """
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'scores_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r/calib=%r' % (dset_id, feats, model, lso, calib)
    dset_feats_id = '/dset=%s/feats=%s' % (dset_id, feats)
    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            dset_feats_group = h5.require_group(dset_feats_id)
            scoress = []
            foldss = []
            correct_expids = []
            for expid in expids:
                print('%s %s %s %s' % (dset_id, expid, model, feats))
                res = ManysourcesResult(expid=expid, dset=dset_id, feats=feats, model=model)
                try:
                    cv = res.lsocv() if lso else res.crscv()
                    if cv is None:
                        continue
                    try:
                        scores, ys, folds = cv.merge_scores(calibration=calib)
                        if 'y' not in dset_feats_group:
                            dset_feats_group['y'] = np.array(ys, dtype=np.int32)
                        if 'molids' not in dset_feats_group:
                            dset_feats_group['molids'] = res.molids()
                    except Exception as ex:
                        # Best effort: the experiment is left out of the cache, but no longer silently
                        print('Warning, skipping expid %s: %s' % (expid, ex))
                        continue
                    scoress.append(scores)
                    foldss.append(folds)
                    correct_expids.append(expid)
                finally:
                    res._close_h5()  # TODO: make result a context manager...
            # The result group is only created once everything has been computed, so an
            # interrupted run does not leave an empty group that looks like a cache hit
            group = h5.require_group(group_id)
            group['scores'] = np.array(scoress).T
            group['folds'] = np.array(foldss).T
            group['expids'] = np.array(correct_expids, dtype=np.int32)

    # Read back only after the append handle has been closed
    with h5py.File(cache_file, 'r') as h5:
        group1 = h5[group_id]
        group2 = h5[dset_feats_id]
        try:
            return \
                group1['scores'][:], \
                group1['expids'][:], \
                group1['folds'][:], \
                group2['molids'][:], \
                group2['y'][:]
        except Exception:
            # e.g. no experiment was valid, so 'molids' and 'y' were never written
            return None
Пример #3
0
 def write():
     """
     Appends the squared losses and fold assignments of the experiments not yet in the cache.

     Uses these names from the enclosing scope: cache_path, result_coords, expids, dset, feats,
     model, lso, calibration and verbose.

     In the group result_coords of the HDF5 file at cache_path:

       - 'expids' has one (expid, row) pair per processed experiment, where row is the row of the
         experiment in 'losses' and 'folds', or -1 if its scores could not be merged
       - 'losses' has one row per successful experiment with (label - score) ** 2 for each molecule
       - 'folds' has one row per successful experiment with the fold assignment of each molecule
       - 'molids' has the molecule ids; written once, assuming they are the same for all experiments

     Experiments whose expid is already in 'expids' are skipped.
     """
     with h5py.File(cache_path, 'a') as h5:
         group = h5.require_group(result_coords)
         # 'expids' rows are (expid, row) pairs: only the first column tells what is already done
         if 'expids' in group:
             infile_expids = set(int(done) for done in group['expids'][:, 0])
         else:
             infile_expids = set()
         num_done = len(infile_expids)
         expidss = []
         losses = []
         foldss = []
         molids = None
         for expid in expids:
             if verbose:
                 print('%s %s' % (expid, lso))
             if expid in infile_expids:
                 if verbose:
                     print('\tAlready done, skipping...')
                 continue
             try:
                 # look for the results corresponding to the desired expid, lso
                 result = ManysourcesResult(expid=expid, dset=dset, feats=feats, model=model)
                 try:
                     cv = result.lsocv() if lso else result.crscv()
                     # Merge the "CV" scores to have one score per compound in the dataset
                     scores, labels, folds = cv.merge_scores(calibration=calibration)
                     # NOTE(review): a failure of this diagnostic also marks the experiment as failed
                     if verbose:
                         print(roc_auc_score(labels, scores, average='samples'))
                     loss = (labels - scores) ** 2
                     if molids is None:
                         molids = cv.molids()
                 finally:
                     result._close_h5()
             except Exception:
                 # We guess that this happens when the external set only contains one class, but we need to check
                 print('Warning, had troubles with %s %s' % (expid, lso))
                 expidss.append((expid, -1))
             else:
                 # Loss, folds and index are recorded together, so that they cannot get out of sync
                 expidss.append((expid, num_done + len(losses)))
                 losses.append(loss)
                 foldss.append(folds)

         def append_rows(name, rows, num_columns, dtype):
             # Writes rows from row num_done on, growing (or creating) the dataset as needed.
             # An existing dataset is opened and resized: require_dataset would reject it
             # because its shape differs from the new one.
             num_rows = num_done + len(rows)
             if name in group:
                 dataset = group[name]
                 dataset.resize((num_rows, num_columns))
             else:
                 dataset = group.create_dataset(name,
                                                shape=(num_rows, num_columns),
                                                dtype=dtype,
                                                maxshape=(None, num_columns))
             dataset[num_done:] = rows

         # write molids - N.B. assume same for all of them, which is reasonable
         if molids is not None and 'molids' not in group:
             group['molids'] = molids
         # write expids index
         if expidss:
             append_rows('expids', expidss, 2, np.int32)
         if losses:
             # write losses
             append_rows('losses', losses, len(molids), np.float64)
             # write folds (should be optional)
             append_rows('folds', foldss, len(molids), np.int32)