def _collect_sources_coocurrences(h5, sources_dataset_id, dset_name, expids, feats, model, lso):
    """Computes one source coocurrence row per fold of each requested experiment.

    As a side effect, stores the sources order under `sources_dataset_id` in `h5` if it is not there yet.

    Returns three parallel lists: the coocurrence rows, the expid of each row and the fold number of each row.
    """
    coocurrences = []
    valid_expids = []
    fold_ids = []
    ms_dset = None  # taken from the first result, then reused for all the experiments
    for expid in expids:
        # Single-argument print(...) outputs the same text as a python 2 print statement
        # and also works as the python 3 print function.
        print('%s %s %s %s %s' % (dset_name, expid, model, feats, lso))
        res = ManysourcesResult(expid=expid, dset=dset_name, feats=feats, model=model)
        try:
            if ms_dset is None:
                ms_dset = res.ms_dset()
            if sources_dataset_id not in h5:
                h5[sources_dataset_id] = ms_dset.mols().i2sources_order()
            if lso is False:
                print('\tWARNING: source coocurrences do not make much sense without LSO splitting')
            cv = res.lsocv() if lso else res.crscv()
            if cv is None:
                continue
            for fold_num, fold in enumerate(cv.folds()):
                # Best effort: a fold that cannot be processed is reported and skipped.
                # Only Exception is caught, so Ctrl-C and SystemExit are not swallowed.
                try:
                    fold_coocurrences = fold.sources_coocurrences(dset=ms_dset)
                except Exception as ex:
                    print('\tWARNING: skipping expid=%s fold=%s (%r)' % (expid, fold_num, ex))
                    continue
                coocurrences.append(fold_coocurrences)
                valid_expids.append(expid)
                fold_ids.append(fold_num)
        finally:
            # Also runs when cv is None or on error, so the result is always closed.
            # NOTE(review): assumes _close_h5() is safe to call even if the result
            # never opened its file - confirm against ManysourcesResult.
            res._close_h5()
    return coocurrences, valid_expids, fold_ids


def sources_coocurrences_df(dset='bcrp', expids=None, feats='ecfps1', model='logreg3', lso=True):
    """Returns the per-fold source coocurrences for a dataset, features, model and splitting scheme.

    Results are cached in the HDF5 file "<MANYSOURCES_DATA_ROOT>/results/sources_coocurrences_df.h5",
    in a group keyed by (dset, feats, model, lso). The cache key does not include `expids`: once the
    group exists, the stored results are returned and the `expids` argument is ignored.

    Parameters
    ----------
    dset: name of the dataset, passed to ManysourcesResult
    expids: iterable of experiment ids to process; None means range(4096)
    feats: name of the features, passed to ManysourcesResult
    model: name of the model, passed to ManysourcesResult
    lso: if True use the folds from res.lsocv(), otherwise the ones from res.crscv()

    Returns
    -------
    A tuple (coocurrences, sources, expids, folds) of arrays read from the cache:
      - coocurrences: boolean array with one row per processed fold, as given by
        fold.sources_coocurrences (presumably one column per source - confirm)
      - sources: the sources as returned by dset.mols().i2sources_order()
      - expids: the experiment id of each row in coocurrences
      - folds: the fold number, within its experiment, of each row in coocurrences
    """
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'sources_coocurrences_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)
    sources_dataset_id = '/dset=%s/sources' % dset

    # Populate the cache if this combination has not been computed yet
    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            coocurrences, valid_expids, fold_ids = _collect_sources_coocurrences(
                h5, sources_dataset_id, dset, expids, feats, model, lso)
            group = h5.require_group(group_id)
            group['coocurrences'] = np.array(coocurrences)
            group['expids'] = valid_expids
            group['folds'] = fold_ids

    # Always answer from the cache
    with h5py.File(cache_file, 'r') as h5:
        group = h5[group_id]
        sources = h5[sources_dataset_id][:]
        # Builtin bool instead of np.bool: the alias was removed in NumPy 1.24
        coocurrences = group['coocurrences'][:].astype(bool)
        expids = group['expids'][:]
        folds = group['folds'][:]
        return coocurrences, sources, expids, folds
def _collect_molecules_coocurrences(h5, molecules_dataset_id, dset_name, expids, feats, model, lso):
    """Computes one test-set membership row per fold of each requested experiment.

    As a side effect, stores the molecule ids under `molecules_dataset_id` in `h5` if they are not there yet.

    Returns three parallel lists: the membership rows, the expid of each row and the fold number of each row.
    """
    coocurrences = []
    valid_expids = []
    fold_ids = []
    ms_dset = None
    for expid in expids:
        # Single-argument print(...) outputs the same text as a python 2 print statement
        # and also works as the python 3 print function.
        print('%s %s %s %s %s' % (dset_name, expid, model, feats, lso))
        res = ManysourcesResult(expid=expid, dset=dset_name, feats=feats, model=model)
        try:
            if ms_dset is None:
                # NOTE(review): the dataset is not used anywhere else in this function;
                # the call is kept in case loading it has side effects - confirm and remove.
                ms_dset = res.ms_dset()
            if molecules_dataset_id not in h5:
                h5[molecules_dataset_id] = res.molids()
            cv = res.lsocv() if lso else res.crscv()
            if cv is None:
                continue
            for fold_num, fold in enumerate(cv.folds()):
                # Best effort: a fold that cannot be processed is reported and skipped.
                # Only Exception is caught, so Ctrl-C and SystemExit are not swallowed.
                # res.molids() stays inside the try so a failure there skips the fold too.
                try:
                    # Builtin int instead of np.int: the alias was removed in NumPy 1.24
                    in_test = np.zeros(len(res.molids()), dtype=int)
                    in_test[fold.test_indices()] = 1
                except Exception as ex:
                    print('\tWARNING: skipping expid=%s fold=%s (%r)' % (expid, fold_num, ex))
                    continue
                coocurrences.append(in_test)
                valid_expids.append(expid)
                fold_ids.append(fold_num)
        finally:
            # Also runs when cv is None or on error, so the result is always closed.
            # NOTE(review): assumes _close_h5() is safe to call even if the result
            # never opened its file - confirm against ManysourcesResult.
            res._close_h5()
    return coocurrences, valid_expids, fold_ids


def molecules_coocurrences_df(dset='bcrp', expids=None, feats='ecfps1', model='logreg3', lso=True):
    """Returns, for each fold, which molecules were in the test set.

    Results are cached in the HDF5 file "<MANYSOURCES_DATA_ROOT>/results/molecules_coocurrences_df.h5",
    in a group keyed by (dset, feats, model, lso). The cache key does not include `expids`: once the
    group exists, the stored results are returned and the `expids` argument is ignored.

    Parameters
    ----------
    dset: name of the dataset, passed to ManysourcesResult
    expids: iterable of experiment ids to process; None means range(4096)
    feats: name of the features, passed to ManysourcesResult
    model: name of the model, passed to ManysourcesResult
    lso: if True use the folds from res.lsocv(), otherwise the ones from res.crscv()

    Returns
    -------
    A tuple (coocurrences, molids, expids, folds) of arrays read from the cache:
      - coocurrences: boolean array with one row per processed fold and one column per molecule;
        a cell is True when the molecule is selected by fold.test_indices() for that fold
      - molids: the molecule ids as returned by res.molids() for the first processed result
      - expids: the experiment id of each row in coocurrences
      - folds: the fold number, within its experiment, of each row in coocurrences
    """
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'molecules_coocurrences_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)
    molecules_dataset_id = '/dset=%s/molecules' % dset

    # Populate the cache if this combination has not been computed yet
    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            coocurrences, valid_expids, fold_ids = _collect_molecules_coocurrences(
                h5, molecules_dataset_id, dset, expids, feats, model, lso)
            group = h5.require_group(group_id)
            group['coocurrences'] = np.array(coocurrences)
            group['expids'] = valid_expids
            group['folds'] = fold_ids

    # Always answer from the cache
    with h5py.File(cache_file, 'r') as h5:
        group = h5[group_id]
        molids = h5[molecules_dataset_id][:]
        # Builtin bool instead of np.bool: the alias was removed in NumPy 1.24
        coocurrences = group['coocurrences'][:].astype(bool)
        expids = group['expids'][:]
        folds = group['folds'][:]
        return coocurrences, molids, expids, folds