def merge_cache_scores(dset_id='bcrp', expids=None, feats='ecfps1', model='logreg1', lso=True, calib=None):
    """
    Returns a 5-tuple (scores, expids, folds, molids, y) where:
      - scores is a num_mols x num_valid_expids matrix with the scores of each molecule in each (valid) experiment
      - expids is a num_valid_expids array of valid expids
      - folds is a num_mols x num_valid_expids matrix of fold assignment for each molecule in each (valid) experiment
      - molids is a num_mols array with the pertinent mol_ids
      - y is a num_mols array with the labels of the molecules
    """
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'scores_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r/calib=%r' % (dset_id, feats, model, lso, calib)
    dset_feats_id = '/dset=%s/feats=%s' % (dset_id, feats)

    with h5py.File(cache_file, 'a') as h5:
        # Populate the cache if this combination has not been merged yet
        if group_id not in h5:
            group = h5.require_group(group_id)
            dset_feats_group = h5.require_group(dset_feats_id)  # shared per (dset, feats) data
            scoress = []
            foldss = []
            correct_expids = []
            for expid in expids:
                print dset_id, expid, model, feats
                res = ManysourcesResult(expid=expid, dset=dset_id, feats=feats, model=model)
                cv = res.lsocv() if lso else res.crscv()
                try:
                    scores, ys, folds = cv.merge_scores(calibration=calib)
                    if 'y' not in dset_feats_group:
                        dset_feats_group['y'] = np.array(ys, dtype=np.int32)
                    if 'molids' not in dset_feats_group:
                        dset_feats_group['molids'] = res.molids()
                    scoress.append(scores)
                    foldss.append(folds)
                    correct_expids.append(expid)
                except Exception:
                    pass  # skip experiments whose scores cannot be merged
                finally:
                    res._close_h5()  # TODO: make result a context manager...
            group['scores'] = np.array(scoress).T
            group['folds'] = np.array(foldss).T
            group['expids'] = np.array(correct_expids, dtype=np.int32)

    with h5py.File(cache_file, 'r') as h5:
        group1 = h5[group_id]
        group2 = h5[dset_feats_id]
        try:
            return \
                group1['scores'][:], \
                group1['expids'][:], \
                group1['folds'][:], \
                group2['molids'][:], \
                group2['y'][:]
        except Exception:
            return None

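# Usage sketch (illustrative, not part of the library): summarise the cached scores
# returned by merge_cache_scores. The helper name `mean_scores_per_mol` is hypothetical;
# it relies only on the documented 5-tuple, on the module-level numpy import (`np`),
# and on the fact that merge_cache_scores may return None if the cache cannot be read.
def mean_scores_per_mol(dset_id='bcrp', expids=None, feats='ecfps1', model='logreg1', lso=True):
    cached = merge_cache_scores(dset_id=dset_id, expids=expids, feats=feats, model=model, lso=lso)
    if cached is None:
        return None
    scores, expids, folds, molids, y = cached
    # scores is num_mols x num_valid_expids; average each molecule over the valid experiments
    return molids, scores.mean(axis=1), y
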
def sources_coocurrences_df(dset='bcrp', expids=None, feats='ecfps1', model='logreg3', lso=True):
    """
    Caches and returns per-fold source co-occurrence information.

    Returns a 4-tuple (coocurrences, sources, expids, folds) where:
      - coocurrences is a boolean matrix with one row per (expid, fold) pair and one column per source,
        marking the sources that co-occur in that fold (meaningful for LSO splits)
      - sources is the array of source ids, in matrix column order
      - expids is the experiment id of each row
      - folds is the fold number (within its experiment) of each row
    """
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'sources_coocurrences_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)
    sources_dataset_id = '/dset=%s/sources' % dset

    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            coocurrences = []
            valid_expids = []
            fold_ids = []
            dset_name = dset
            dset = None
            for expid in expids:
                print dset_name, expid, model, feats, lso
                res = ManysourcesResult(expid=expid, dset=dset_name, feats=feats, model=model)
                if dset is None:
                    dset = res.ms_dset()
                if sources_dataset_id not in h5:
                    sources_as_in_the_matrix = dset.mols().i2sources_order()
                    h5[sources_dataset_id] = sources_as_in_the_matrix
                if lso is False:
                    print '\tWARNING: source coocurrences do not make much sense without LSO splitting'
                cv = res.lsocv() if lso else res.crscv()
                if cv is None:
                    continue
                for fold_num, fold in enumerate(cv.folds()):
                    try:
                        coocurrences.append(fold.sources_coocurrences(dset=dset))
                        valid_expids.append(expid)
                        fold_ids.append(fold_num)
                    except Exception:
                        pass
                res._close_h5()
            group = h5.require_group(group_id)
            group['coocurrences'] = np.array(coocurrences)
            group['expids'] = valid_expids
            group['folds'] = fold_ids

    with h5py.File(cache_file, 'r') as h5:
        sources = h5[sources_dataset_id][:]
        coocurrences = h5[group_id]['coocurrences'][:].astype(np.bool)
        expids = h5[group_id]['expids'][:]
        folds = h5[group_id]['folds'][:]
        return coocurrences, sources, expids, folds

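# Usage sketch (illustrative): count how often each pair of sources ends up in the same
# fold, using the boolean matrix from sources_coocurrences_df. `source_coocurrence_counts`
# is a hypothetical helper; it assumes only the documented return values and the
# module-level numpy import (`np`).
def source_coocurrence_counts(dset='bcrp', expids=None, feats='ecfps1', model='logreg3'):
    coocurrences, sources, expids, folds = sources_coocurrences_df(dset=dset, expids=expids,
                                                                   feats=feats, model=model, lso=True)
    indicator = coocurrences.astype(np.int64)  # num_folds x num_sources
    counts = np.dot(indicator.T, indicator)    # num_sources x num_sources pair counts
    return sources, counts
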
def molecules_coocurrences_df(dset='bcrp', expids=None, feats='ecfps1', model='logreg3', lso=True):
    """
    Caches and returns, for each (expid, fold) pair, which molecules were in the test set.

    Returns a 4-tuple (coocurrences, molids, expids, folds) where:
      - coocurrences is a boolean matrix with one row per (expid, fold) pair and one column per molecule;
        True means the molecule was in that fold's test set
      - molids is the array of molecule ids, in matrix column order
      - expids is the experiment id of each row
      - folds is the fold number (within its experiment) of each row
    """
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'molecules_coocurrences_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)
    molecules_dataset_id = '/dset=%s/molecules' % dset

    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            coocurrences = []
            valid_expids = []
            fold_ids = []
            dset_name = dset
            dset = None
            for expid in expids:
                print dset_name, expid, model, feats, lso
                res = ManysourcesResult(expid=expid, dset=dset_name, feats=feats, model=model)
                if dset is None:
                    dset = res.ms_dset()
                if molecules_dataset_id not in h5:
                    molecules_as_in_the_matrix = res.molids()
                    h5[molecules_dataset_id] = molecules_as_in_the_matrix
                cv = res.lsocv() if lso else res.crscv()
                if cv is None:
                    continue
                for fold_num, fold in enumerate(cv.folds()):
                    try:
                        # Mark the molecules that fall in this fold's test set
                        c = np.zeros(len(res.molids()), dtype=np.int)
                        c[fold.test_indices()] = 1
                        coocurrences.append(c)
                        valid_expids.append(expid)
                        fold_ids.append(fold_num)
                    except Exception:
                        pass
                res._close_h5()
            group = h5.require_group(group_id)
            group['coocurrences'] = np.array(coocurrences)
            group['expids'] = valid_expids
            group['folds'] = fold_ids

    with h5py.File(cache_file, 'r') as h5:
        molids = h5[molecules_dataset_id][:]
        coocurrences = h5[group_id]['coocurrences'][:].astype(np.bool)
        expids = h5[group_id]['expids'][:]
        folds = h5[group_id]['folds'][:]
        return coocurrences, molids, expids, folds

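# Usage sketch (illustrative): how often each molecule falls in a test fold, from the
# boolean matrix returned by molecules_coocurrences_df. `molecule_test_frequency` is a
# hypothetical helper relying only on the documented return values and the module-level
# numpy import (`np`).
def molecule_test_frequency(dset='bcrp', expids=None, feats='ecfps1', model='logreg3', lso=True):
    in_test, molids, expids, folds = molecules_coocurrences_df(dset=dset, expids=expids,
                                                               feats=feats, model=model, lso=lso)
    # Fraction of (expid, fold) pairs in which each molecule was in the test set
    return molids, in_test.mean(axis=0)
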
def logreg_weights(dset='bcrp', expids=None, feats='ecfps1', model='logreg3', lso=True, eps=1E-6):
    """
    Parameters
    ----------
    dset: string, default 'bcrp'
        The dataset id
    expids: int list, default None
        The experiment ids; if None, use range(4096) (i.e. 0 to 4095)
    feats: string, default 'ecfps1'
        The id of the feature set
    model: string, default 'logreg3'
        The id of the model used
    lso: boolean, default True
        Whether the experiment corresponds to a LSO or CRS partitioning scheme
    eps: float, default 1E-6
        Small number below which a weight is considered 0

    Returns
    -------
    A four-tuple (matrix, intercepts, expids, folds)
      matrix: csr sparse matrix (num_folds x num_features)
      intercepts: numpy array (num_folds)
      expids: num_folds experiment ids
      folds: num_folds array of fold_ids within each experiment
    Each row of the matrix corresponds to the tuple (expid, fold).

    :rtype: (scipy.sparse.csr_matrix, np.array, np.array, np.array)
    """
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'logreg_weights_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)

    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            is_sparse = eps is not None
            if not is_sparse:
                raise NotImplementedError()
            else:
                row = 0
                rows = array('I')
                cols = array('I')
                vals = array('d')
                intercepts = array('d')
                correct_expids = array('I')
                correct_folds = array('I')
                num_feats = None  # real nasty
                for expid in expids:
                    print dset, expid, model, feats, lso
                    res = ManysourcesResult(expid=expid, dset=dset, feats=feats, model=model)
                    cv = res.lsocv() if lso else res.crscv()
                    if cv is None:
                        continue
                    for fold_num, fold in enumerate(cv.folds()):
                        try:
                            coef, intercept = fold.model_data()
                            coef = coef[0]  # Lame
                            if num_feats is None:
                                num_feats = len(coef)
                            else:
                                assert num_feats == len(coef), 'nastiness is all around me, and so the feelin is gross'
                            intercept = intercept[()]  # Lame
                            # Keep only weights with magnitude above eps (sparse storage)
                            non_zero = np.where(np.abs(coef) > eps)[0]
                            density = float(len(non_zero)) / len(coef)
                            if density > 0.35:
                                print '\tWARNING: not very sparse, density=%.2f' % density
                            cols.extend(non_zero)
                            rows.extend([row] * len(non_zero))
                            vals.extend(coef[non_zero])
                            correct_expids.append(expid)
                            correct_folds.append(fold_num)
                            intercepts.append(intercept[0])
                            row += 1
                        except Exception:
                            pass
                    res._close_h5()
                group = h5.require_group(group_id)
                matrix = coo_matrix((vals, (rows, cols))).tocsr()
                group['indices'] = matrix.indices
                group['indptr'] = matrix.indptr
                group['data'] = matrix.data
                group['shape'] = (matrix.shape[0], num_feats)  # nasty nasty
                group['expids'] = correct_expids
                group['folds'] = correct_folds
                group['intercepts'] = intercepts

    with h5py.File(cache_file, 'r') as h5:
        group = h5[group_id]
        matrix = csr_matrix((group['data'][:], group['indices'][:], group['indptr'][:]),
                            shape=group['shape'][:])
        return matrix, group['intercepts'][:], group['expids'][:], group['folds'][:]

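# Usage sketch (illustrative): how often each feature gets a non-negligible logistic
# regression weight across (expid, fold) pairs, using the sparse matrix returned by
# logreg_weights. `feature_selection_frequency` is a hypothetical helper; it assumes
# the module-level numpy import (`np`) and the documented csr layout (one row per fold).
def feature_selection_frequency(dset='bcrp', expids=None, feats='ecfps1', model='logreg3', lso=True):
    weights, intercepts, expids, folds = logreg_weights(dset=dset, expids=expids,
                                                        feats=feats, model=model, lso=lso)
    num_folds = weights.shape[0]
    # Stored entries are exactly the weights with |w| > eps, so count them per column
    nnz_per_feature = np.asarray((weights != 0).sum(axis=0)).ravel()
    return nnz_per_feature / float(num_folds)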