Example #1
def write():
    # note: cache_path, result_coords, expids, dset, feats, model, lso,
    # calibration and verbose come from the enclosing scope
    with h5py.File(cache_path, 'a') as h5:
        group = h5.require_group(result_coords)
        # expids already in the cache (first column of the stored (expid, row) pairs)
        infile_expids = set(group['expids'][:, 0]) if 'expids' in group else set()
        expidss = []
        oks = 0
        losses = []
        foldss = []
        molids = None
        for expid in expids:
            if verbose:
                print expid, lso
            if expid in infile_expids:
                if verbose:
                    print '\tAlready done, skipping...'
                continue
            try:
                # look for the results corresponding to the desired expid, lso
                res = ManysourcesResult(expid=expid, dset=dset, feats=feats, model=model).lsocv() if lso else \
                    ManysourcesResult(expid=expid, dset=dset, feats=feats, model=model).crscv()
                # Merge the "CV" scores to have one score per compound in the dataset
                scores, labels, folds = res.merge_scores(calibration=calibration)
                if verbose:
                    print roc_auc_score(labels, scores, average='samples')
                losses.append((labels - scores) ** 2)
                foldss.append(folds)
                if molids is None:
                    molids = res.molids()
                expidss.append((expid, len(infile_expids) + oks))
                oks += 1
            except:
                # We guess that this happens when the external set only contains one class, but we need to check
                print 'Warning, had troubles with', expid, lso
                expidss.append((expid, -1))  # -1 marks an expid that could not be merged
        # write molids - N.B. assume same for all of them, which is reasonable
        if 'molids' not in group:
            group['molids'] = molids
        # write expids index: rows of (expid, row index into losses; -1 if the merge failed)
        expids_dset = group.require_dataset('expids',
                                            shape=(len(infile_expids) + len(expidss), 2),
                                            dtype=np.int32,
                                            maxshape=(None, 2))
        expids_dset.resize((len(infile_expids) + len(expidss), 2))
        expids_dset[len(infile_expids):] = expidss
        # write losses (one row of per-compound squared errors per successful expid)
        losses_dset = group.require_dataset('losses',
                                            shape=(len(infile_expids) + len(losses), len(molids)),
                                            dtype=np.float64,
                                            maxshape=(None, len(molids)))
        losses_dset.resize((len(infile_expids) + len(losses), len(molids)))
        losses_dset[len(infile_expids):] = losses
        # write folds (should be optional)
        folds_dset = group.require_dataset('folds',
                                           shape=(len(infile_expids) + len(losses), len(molids)),
                                           dtype=np.int32,
                                           maxshape=(None, len(molids)))
        folds_dset.resize((len(infile_expids) + len(losses), len(molids)))
        folds_dset[len(infile_expids):] = foldss
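
# Hedged usage sketch (not in the original): a hypothetical read-back helper for the cache
# written by write(). It assumes cache_path points to the same HDF5 file and result_coords
# to the same group key used above, with the layout (molids, expids, losses, folds) as created there.
import h5py

def read_losses(cache_path, result_coords):
    with h5py.File(cache_path, 'r') as h5:
        group = h5[result_coords]
        molids = group['molids'][:]
        expids = group['expids'][:]  # rows of (expid, row_in_losses); -1 marks failed merges
        losses = group['losses'][:]  # per-compound squared errors, one row per successful expid
        folds = group['folds'][:]
        return molids, expids, losses, folds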
Example #2
def merge_cache_scores(dset_id='bcrp',
                       expids=None,
                       feats='ecfps1',
                       model='logreg1',
                       lso=True,
                       calib=None):
    """
    Returns a 5-tuple (scores, expids, folds, molids, y) where:

      - scores is a num_mols x num_valid_expids matrix with the scores of each molecule in each (valid) experiment
      - expids is a num_valid_expids array of valid expids
      - folds is a num_mols x num_valid_expids matrix of fold assignment for each molecule in each (valid) experiment
      - molids is a num_mols array with the pertinent mol_ids
      - y is a num_mols array with the labels of the molecules
    """
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'scores_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r/calib=%r' % (dset_id, feats, model, lso, calib)
    dset_feats_id = '/dset=%s/feats=%s' % (dset_id, feats)
    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            group = h5.require_group(group_id)
            dset_feats_group = h5[dset_feats_id]
            scoress = []
            foldss = []
            correct_expids = []
            for expid in expids:
                print dset_id, expid, model, feats
                res = ManysourcesResult(expid=expid, dset=dset_id, feats=feats, model=model)
                cv = res.lsocv() if lso else res.crscv()
                try:
                    scores, ys, folds = cv.merge_scores(calibration=calib)
                    if 'y' not in dset_feats_group:
                        dset_feats_group['y'] = np.array(ys, dtype=np.int32)
                    if 'molids' not in dset_feats_group:
                        dset_feats_group['molids'] = res.molids()
                    scoress.append(scores)
                    foldss.append(folds)
                    correct_expids.append(expid)
                except:
                    pass
                finally:
                    res._close_h5()  # TODO: make result a context manager...
            group['scores'] = np.array(scoress).T
            group['folds'] = np.array(foldss).T
            group['expids'] = np.array(correct_expids, dtype=np.int32)

    with h5py.File(cache_file, 'r') as h5:
        group1 = h5[group_id]
        group2 = h5[dset_feats_id]
        try:
            return \
                group1['scores'][:], \
                group1['expids'][:], \
                group1['folds'][:], \
                group2['molids'][:], \
                group2['y'][:]
        except:
            return None
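
# Hedged usage sketch (not in the original): consuming the 5-tuple returned by
# merge_cache_scores. The expid range is illustrative and assumes the corresponding
# results exist under MANYSOURCES_DATA_ROOT.
result = merge_cache_scores(dset_id='bcrp', expids=range(8), lso=True)
if result is not None:
    scores, expids, folds, molids, y = result
    mean_scores = scores.mean(axis=1)  # num_mols array: average score over the valid experiments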
Example #3
def molecules_coocurrences_df(dset='bcrp',
                              expids=None,
                              feats='ecfps1',
                              model='logreg3',
                              lso=True):
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'molecules_coocurrences_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)
    molecules_dataset_id = '/dset=%s/molecules' % dset
    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            coocurrences = []
            valid_expids = []
            fold_ids = []
            dset_name = dset
            dset = None
            for expid in expids:
                print dset_name, expid, model, feats, lso
                res = ManysourcesResult(expid=expid, dset=dset_name, feats=feats, model=model)
                if dset is None:
                    dset = res.ms_dset()
                if molecules_dataset_id not in h5:
                    molecules_as_in_the_matrix = res.molids()
                    h5[molecules_dataset_id] = molecules_as_in_the_matrix
                cv = res.lsocv() if lso else res.crscv()
                if cv is None:
                    continue
                for fold_num, fold in enumerate(cv.folds()):
                    try:
                        c = np.zeros(len(res.molids()), dtype=np.int)
                        c[fold.test_indices()] = 1
                        coocurrences.append(c)
                        valid_expids.append(expid)
                        fold_ids.append(fold_num)
                    except:
                        pass
                res._close_h5()
            group = h5.require_group(group_id)
            group['coocurrences'] = np.array(coocurrences)
            group['expids'] = valid_expids
            group['folds'] = fold_ids
    with h5py.File(cache_file, 'r') as h5:
        molids = h5[molecules_dataset_id][:]
        coocurrences = h5[group_id]['coocurrences'][:].astype(np.bool)
        expids = h5[group_id]['expids'][:]
        folds = h5[group_id]['folds'][:]
        return coocurrences, molids, expids, folds
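
# Hedged usage sketch (not in the original): the boolean matrix has one row per
# (expid, fold) and one column per molecule, so a column mean estimates how often each
# molecule ends up in a test fold. Parameters are illustrative and assume the cache exists.
coocurrences, molids, expids, folds = molecules_coocurrences_df(dset='bcrp', expids=range(8))
in_test_fraction = coocurrences.mean(axis=0)  # per-molecule fraction of folds in which it is held out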
Example #4
def source_only_features(dset='bcrp',
                         model='logreg3',
                         feats='ecfps1',
                         expids=range(20),
                         source='phenylquinazolines_Juvale_2012'):
    """"""
    dset = ManysourcesDataset(dset)
    sparsities = defaultdict(list)
    for expid in expids:
        res = ManysourcesResult(expid=expid, dset=dset.name, feats=feats, model=model)
        # models
        lso_models = [logistic_from_weights(weights, intercept) for weights, intercept, _ in res.lsocv().all_models()]
        crs_models = [logistic_from_weights(weights, intercept) for weights, intercept, _ in res.crscv().all_models()]
        # is sparsity the same?
        for lsom, crsm in zip(lso_models, crs_models):
            sparsities['sparsity_lso'].append(density(lsom.coef_[0, :]))
            sparsities['sparsity_crs'].append(density(crsm.coef_[0, :]))
    return pd.DataFrame(sparsities)
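
# Hedged usage sketch (not in the original): comparing the average coefficient density of
# the paired LSO and CRS models. The expid range is illustrative and assumes those results exist.
sparsities_df = source_only_features(dset='bcrp', expids=range(5))
print sparsities_df[['sparsity_lso', 'sparsity_crs']].mean()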
Example #5
def sources_coocurrences_df(dset='bcrp',
                            expids=None,
                            feats='ecfps1',
                            model='logreg3',
                            lso=True):
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'sources_coocurrences_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)
    sources_dataset_id = '/dset=%s/sources' % dset
    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            coocurrences = []
            valid_expids = []
            fold_ids = []
            dset_name = dset
            dset = None
            for expid in expids:
                print dset_name, expid, model, feats, lso
                res = ManysourcesResult(expid=expid, dset=dset_name, feats=feats, model=model)
                if dset is None:
                    dset = res.ms_dset()
                if sources_dataset_id not in h5:
                    sources_as_in_the_matrix = dset.mols().i2sources_order()
                    h5[sources_dataset_id] = sources_as_in_the_matrix
                if lso is False:
                    print '\tWARNING: source coocurrences do not make much sense without LSO splitting'
                cv = res.lsocv() if lso else res.crscv()
                if cv is None:
                    continue
                for fold_num, fold in enumerate(cv.folds()):
                    try:
                        coocurrences.append(fold.sources_coocurrences(dset=dset))
                        valid_expids.append(expid)
                        fold_ids.append(fold_num)
                    except:
                        pass
                res._close_h5()
            group = h5.require_group(group_id)
            group['coocurrences'] = np.array(coocurrences)
            group['expids'] = valid_expids
            group['folds'] = fold_ids
    with h5py.File(cache_file, 'r') as h5:
        sources = h5[sources_dataset_id][:]
        coocurrences = h5[group_id]['coocurrences'][:].astype(np.bool)
        expids = h5[group_id]['expids'][:]
        folds = h5[group_id]['folds'][:]
        return coocurrences, sources, expids, folds
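
# Hedged usage sketch (not in the original): since each row flags the sources held out
# together in one LSO fold, multiplying the matrix by its transpose counts, for every pair
# of sources, how often they were left out in the same fold. Parameters are illustrative.
import numpy as np
coocurrences, sources, expids, folds = sources_coocurrences_df(dset='bcrp', expids=range(8))
counts = coocurrences.astype(np.int64)
pair_counts = counts.T.dot(counts)  # pair_counts[i, j]: folds where sources[i] and sources[j] are held out together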
Example #6
def logreg_weights(dset='bcrp',
                   expids=None,
                   feats='ecfps1',
                   model='logreg3',
                   lso=True,
                   eps=1E-6):
    """
    Parameters
    ----------
    dset: string, default 'bcrp'
      The dataset id

    expids: int list, default None
      The experiment ids; if None, use from 0 to 4096

    feats: string, default 'ecfps1'
      The id of the feature set

    model: string, default 'logreg3'
      The id of the model used

    lso: boolean, default True
      Whether the experiment corresponds to a LSO or CRS partitioning scheme

    eps: float, default 0.000001
      Little number to be considered 0

    Returns
    -------
    A four-tuple (matrix, intercepts, expids, folds)
      matrix: csr sparse matrix (num_folds x num_features)
      intercepts: numpy array (num_folds)
      expids: num_folds experiment ids
      folds: num_folds array of fold_ids within each experiment

      Each row of the matrix corresponds to the tuple (expid, fold).

    :rtype: (scipy.sparse.csr_matrix, np.array, np.array, np.array)
    """
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'logreg_weights_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)
    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            is_sparse = eps is not None
            if not is_sparse:
                raise NotImplementedError()
            else:
                row = 0
                rows = array('I')
                cols = array('I')
                vals = array('d')
                intercepts = array('d')
                correct_expids = array('I')
                correct_folds = array('I')
                num_feats = None  # inferred from the first successful fold; all folds must agree
                for expid in expids:
                    print dset, expid, model, feats, lso
                    res = ManysourcesResult(expid=expid, dset=dset, feats=feats, model=model)
                    cv = res.lsocv() if lso else res.crscv()
                    if cv is None:
                        continue
                    for fold_num, fold in enumerate(cv.folds()):
                        try:
                            coef, intercept = fold.model_data()
                            coef = coef[0]  # coefficients are stored as a (1, num_features) array
                            if num_feats is None:
                                num_feats = len(coef)
                            else:
                                assert num_feats == len(coef), 'number of features differs between folds'
                            intercept = intercept[()]  # read the stored intercept array out of the h5 dataset
                            non_zero = np.where(np.abs(coef) > eps)[0]
                            density = float(len(non_zero)) / len(coef)
                            if density > 0.35:
                                print '\tWARNING: model not very sparse, density %.2f' % density
                            cols.extend(non_zero)
                            rows.extend([row] * len(non_zero))
                            vals.extend(coef[non_zero])
                            correct_expids.append(expid)
                            correct_folds.append(fold_num)
                            intercepts.append(intercept[0])
                            row += 1
                        except:
                            pass
                    res._close_h5()
                group = h5.require_group(group_id)
                matrix = coo_matrix((vals, (rows, cols))).tocsr()
                group['indices'] = matrix.indices
                group['indptr'] = matrix.indptr
                group['data'] = matrix.data
                group['shape'] = (matrix.shape[0], num_feats)  # coo_matrix infers ncols from the data, so store the true feature count
                group['expids'] = correct_expids
                group['folds'] = correct_folds
                group['intercepts'] = intercepts
    with h5py.File(cache_file, 'r') as h5:
        group = h5[group_id]
        matrix = csr_matrix((group['data'][:], group['indices'][:], group['indptr'][:]),
                            shape=group['shape'][:])
        return matrix, group['intercepts'][:], group['expids'][:], group['folds'][:]
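
# Hedged usage sketch (not in the original): each row of the returned CSR matrix holds the
# weights of one (expid, fold) model, so the per-column count of stored (non-zero) entries
# shows how consistently each feature is selected. Parameters are illustrative.
weights, intercepts, expids, folds = logreg_weights(dset='bcrp', expids=range(8))
selection_counts = weights.getnnz(axis=0)  # per-feature count of folds with a non-zero weight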