示例#1
0
def sources_coocurrences_df(dset='bcrp',
                            expids=None,
                            feats='ecfps1',
                            model='logreg3',
                            lso=True):
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'sources_coocurrences_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)
    sources_dataset_id = '/dset=%s/sources' % dset
    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            coocurrences = []
            valid_expids = []
            fold_ids = []
            dset_name = dset
            dset = None
            for expid in expids:
                print dset_name, expid, model, feats, lso
                res = ManysourcesResult(expid=expid, dset=dset_name, feats=feats, model=model)
                if dset is None:
                    dset = res.ms_dset()
                if sources_dataset_id not in h5:
                    sources_as_in_the_matrix = dset.mols().i2sources_order()
                    h5[sources_dataset_id] = sources_as_in_the_matrix
                if lso is False:
                    print '\tWARNING: source coocurrences do not make much sense without LSO splitting'
                cv = res.lsocv() if lso else res.crscv()
                if cv is None:
                    continue
                for fold_num, fold in enumerate(cv.folds()):
                    try:
                        coocurrences.append(fold.sources_coocurrences(dset=dset))
                        valid_expids.append(expid)
                        fold_ids.append(fold_num)
                    except:
                        pass
                res._close_h5()
            group = h5.require_group(group_id)
            group['coocurrences'] = np.array(coocurrences)
            group['expids'] = valid_expids
            group['folds'] = fold_ids
    with h5py.File(cache_file, 'r') as h5:
        sources = h5[sources_dataset_id][:]
        coocurrences = h5[group_id]['coocurrences'][:].astype(np.bool)
        expids = h5[group_id]['expids'][:]
        folds = h5[group_id]['folds'][:]
        return coocurrences, sources, expids, folds
示例#2
0
def molecules_coocurrences_df(dset='bcrp',
                              expids=None,
                              feats='ecfps1',
                              model='logreg3',
                              lso=True):
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'molecules_coocurrences_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)
    molecules_dataset_id = '/dset=%s/molecules' % dset
    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            coocurrences = []
            valid_expids = []
            fold_ids = []
            dset_name = dset
            dset = None
            for expid in expids:
                print dset_name, expid, model, feats, lso
                res = ManysourcesResult(expid=expid, dset=dset_name, feats=feats, model=model)
                if dset is None:
                    dset = res.ms_dset()
                if molecules_dataset_id not in h5:
                    molecules_as_in_the_matrix = res.molids()
                    h5[molecules_dataset_id] = molecules_as_in_the_matrix
                cv = res.lsocv() if lso else res.crscv()
                if cv is None:
                    continue
                for fold_num, fold in enumerate(cv.folds()):
                    try:
                        c = np.zeros(len(res.molids()), dtype=np.int)
                        c[fold.test_indices()] = 1
                        coocurrences.append(c)
                        valid_expids.append(expid)
                        fold_ids.append(fold_num)
                    except:
                        pass
                res._close_h5()
            group = h5.require_group(group_id)
            group['coocurrences'] = np.array(coocurrences)
            group['expids'] = valid_expids
            group['folds'] = fold_ids
    with h5py.File(cache_file, 'r') as h5:
        molids = h5[molecules_dataset_id][:]
        coocurrences = h5[group_id]['coocurrences'][:].astype(np.bool)
        expids = h5[group_id]['expids'][:]
        folds = h5[group_id]['folds'][:]
        return coocurrences, molids, expids, folds