def write():
    with h5py.File(cache_path, 'a') as h5:
        group = h5.require_group(result_coords)
        # the expids dataset stores (expid, row-index-or-minus-1) pairs; column 0 holds the expid
        infile_expids = set(group['expids'][:, 0]) if 'expids' in group else set()
        expidss = []
        oks = 0
        losses = []
        foldss = []
        molids = None
        for expid in expids:
            if verbose:
                print expid, lso
            if expid in infile_expids:
                if verbose:
                    print '\tAlready done, skipping...'
                continue
            try:
                # look for the results corresponding to the desired expid, lso
                res = ManysourcesResult(expid=expid, dset=dset, feats=feats, model=model).lsocv() if lso else \
                    ManysourcesResult(expid=expid, dset=dset, feats=feats, model=model).crscv()
                # Merge the "CV" scores to have one score per compound in the dataset
                scores, labels, folds = res.merge_scores(calibration=calibration)
                if verbose:
                    print roc_auc_score(labels, scores, average='samples')
                losses.append((labels - scores) ** 2)
                foldss.append(folds)
                if molids is None:
                    molids = res.molids()
                expidss.append((expid, len(infile_expids) + oks))
                oks += 1
            except:
                # We guess that this happens when the external set only contains one class, but we need to check
                print 'Warning, had troubles with', expid, lso
                expidss.append((expid, -1))
        # write molids - N.B. assume same for all of them, which is reasonable
        if 'molids' not in group:
            group['molids'] = molids
        # write expids index
        expids_dset = group.require_dataset('expids',
                                            shape=(len(infile_expids) + len(expidss), 2),
                                            dtype=np.int32,
                                            maxshape=(None, 2))
        expids_dset.resize((len(infile_expids) + len(expidss), 2))
        expids_dset[len(infile_expids):] = expidss
        # write losses
        losses_dset = group.require_dataset('losses',
                                            shape=(len(infile_expids) + len(losses), len(molids)),
                                            dtype=np.float64,
                                            maxshape=(None, len(molids)))
        losses_dset.resize((len(infile_expids) + len(losses), len(molids)))
        losses_dset[len(infile_expids):] = losses
        # write folds (should be optional)
        folds_dset = group.require_dataset('folds',
                                           shape=(len(infile_expids) + len(losses), len(molids)),
                                           dtype=np.int32,
                                           maxshape=(None, len(molids)))
        folds_dset.resize((len(infile_expids) + len(losses), len(molids)))
        folds_dset[len(infile_expids):] = foldss
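# The sketch below is a hypothetical read-back helper for the cache written by write() above.
# The helper name read_losses_cache and its two parameters are assumptions that mirror the
# closure variables cache_path and result_coords used by write(); it simply returns the four
# datasets that write() stores in each group.
def read_losses_cache(cache_path, result_coords):
    """Returns (molids, expids, losses, folds) as stored by write() under result_coords."""
    with h5py.File(cache_path, 'r') as h5:
        group = h5[result_coords]
        return (group['molids'][:],   # one id per compound, shared by all experiments
                group['expids'][:],   # (expid, row in losses) pairs, -1 marking failed expids
                group['losses'][:],   # per-experiment squared error for each compound
                group['folds'][:])    # per-experiment fold assignment for each compound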
def merge_cache_scores(dset_id='bcrp', expids=None, feats='ecfps1', model='logreg1', lso=True, calib=None):
    """
    Returns a 5-tuple (scores, expids, folds, molids, y) where:
      - scores is a num_mols x num_valid_expids matrix with the scores of each molecule in each (valid) experiment
      - expids is a num_valid_expids array of valid expids
      - folds is a num_mols x num_valid_expids matrix of fold assignment for each molecule in each (valid) experiment
      - molids is a num_mols array with the pertinent mol_ids
      - y is a num_mols array with the labels of the molecules
    """
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'scores_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r/calib=%r' % (dset_id, feats, model, lso, calib)
    dset_feats_id = '/dset=%s/feats=%s' % (dset_id, feats)
    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            group = h5.require_group(group_id)
            dset_feats_group = h5[dset_feats_id]
            scoress = []
            foldss = []
            correct_expids = []
            for expid in expids:
                print dset_id, expid, model, feats
                res = ManysourcesResult(expid=expid, dset=dset_id, feats=feats, model=model)
                cv = res.lsocv() if lso else res.crscv()
                try:
                    scores, ys, folds = cv.merge_scores(calibration=calib)
                    if 'y' not in dset_feats_group:
                        dset_feats_group['y'] = np.array(ys, dtype=np.int32)
                    if 'molids' not in dset_feats_group:
                        dset_feats_group['molids'] = res.molids()
                    scoress.append(scores)
                    foldss.append(folds)
                    correct_expids.append(expid)
                except:
                    pass
                finally:
                    res._close_h5()  # TODO: make result a context manager...
            group['scores'] = np.array(scoress).T
            group['folds'] = np.array(foldss).T
            group['expids'] = np.array(correct_expids, dtype=np.int32)
    with h5py.File(cache_file, 'r') as h5:
        group1 = h5[group_id]
        group2 = h5[dset_feats_id]
        try:
            return \
                group1['scores'][:], \
                group1['expids'][:], \
                group1['folds'][:], \
                group2['molids'][:], \
                group2['y'][:]
        except:
            return None
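# Hypothetical usage sketch for merge_cache_scores. It assumes the score caches for the
# 'bcrp' dataset exist under MANYSOURCES_DATA_ROOT and uses an arbitrary subset of expids;
# _example_mean_scores is not part of the original API.
def _example_mean_scores():
    cached = merge_cache_scores(dset_id='bcrp', expids=range(64), model='logreg1', lso=True)
    if cached is None:
        return None
    scores, expids, folds, molids, y = cached
    # average the per-experiment scores to get one consensus score per molecule
    return pd.Series(scores.mean(axis=1), index=molids)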
def molecules_coocurrences_df(dset='bcrp', expids=None, feats='ecfps1', model='logreg3', lso=True):
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'molecules_coocurrences_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)
    molecules_dataset_id = '/dset=%s/molecules' % dset
    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            coocurrences = []
            valid_expids = []
            fold_ids = []
            dset_name = dset
            dset = None
            for expid in expids:
                print dset_name, expid, model, feats, lso
                res = ManysourcesResult(expid=expid, dset=dset_name, feats=feats, model=model)
                if dset is None:
                    dset = res.ms_dset()
                if molecules_dataset_id not in h5:
                    molecules_as_in_the_matrix = res.molids()
                    h5[molecules_dataset_id] = molecules_as_in_the_matrix
                cv = res.lsocv() if lso else res.crscv()
                if cv is None:
                    continue
                for fold_num, fold in enumerate(cv.folds()):
                    try:
                        c = np.zeros(len(res.molids()), dtype=np.int)
                        c[fold.test_indices()] = 1
                        coocurrences.append(c)
                        valid_expids.append(expid)
                        fold_ids.append(fold_num)
                    except:
                        pass
                res._close_h5()
            group = h5.require_group(group_id)
            group['coocurrences'] = np.array(coocurrences)
            group['expids'] = valid_expids
            group['folds'] = fold_ids
    with h5py.File(cache_file, 'r') as h5:
        molids = h5[molecules_dataset_id][:]
        coocurrences = h5[group_id]['coocurrences'][:].astype(np.bool)
        expids = h5[group_id]['expids'][:]
        folds = h5[group_id]['folds'][:]
        return coocurrences, molids, expids, folds
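# Hypothetical sketch built on molecules_coocurrences_df: it counts, for each pair of molecules,
# in how many folds both were left out (in the test set) together. It assumes the 'bcrp' caches
# exist; a small expid subset keeps the boolean matrix manageable. The helper name is an assumption.
def _example_molecule_coocurrence_counts():
    in_test, molids, expids, folds = molecules_coocurrences_df(dset='bcrp', expids=range(32))
    # in_test is (num_folds x num_mols); the dot product of its transpose with itself
    # gives a num_mols x num_mols matrix of joint left-out counts
    counts = np.dot(in_test.T.astype(np.int32), in_test.astype(np.int32))
    return pd.DataFrame(counts, index=molids, columns=molids)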
def source_only_features(dset='bcrp', model='logreg3', feats='ecfps1', expids=range(20),
                         source='phenylquinazolines_Juvale_2012'):
    """
    Compares the weight density of LSO vs CRS logistic regression models across the given
    experiments, returning a dataframe with one row per model pair.
    """
    dset = ManysourcesDataset(dset)
    sparsities = defaultdict(list)
    for expid in expids:
        res = ManysourcesResult(expid=expid, dset=dset.name, feats=feats, model=model)
        # models
        lso_models = [logistic_from_weights(weights, intercept)
                      for weights, intercept, _ in res.lsocv().all_models()]
        crs_models = [logistic_from_weights(weights, intercept)
                      for weights, intercept, _ in res.crscv().all_models()]
        # is sparsity the same?
        for lsom, crsm in zip(lso_models, crs_models):
            sparsities['sparsity_lso'].append(density(lsom.coef_[0, :]))
            sparsities['sparsity_crs'].append(density(crsm.coef_[0, :]))
    return pd.DataFrame(sparsities)
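# Hypothetical usage sketch for source_only_features: summarises the gap in weight density
# between the LSO and CRS models (assumes results for the first 20 expids of 'bcrp' are cached;
# the helper name is an assumption).
def _example_density_gap():
    df = source_only_features(dset='bcrp', expids=range(20))
    # positive values mean the LSO models are denser than their CRS counterparts
    return (df['sparsity_lso'] - df['sparsity_crs']).describe()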
def sources_coocurrences_df(dset='bcrp', expids=None, feats='ecfps1', model='logreg3', lso=True):
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'sources_coocurrences_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)
    sources_dataset_id = '/dset=%s/sources' % dset
    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            coocurrences = []
            valid_expids = []
            fold_ids = []
            dset_name = dset
            dset = None
            for expid in expids:
                print dset_name, expid, model, feats, lso
                res = ManysourcesResult(expid=expid, dset=dset_name, feats=feats, model=model)
                if dset is None:
                    dset = res.ms_dset()
                if sources_dataset_id not in h5:
                    sources_as_in_the_matrix = dset.mols().i2sources_order()
                    h5[sources_dataset_id] = sources_as_in_the_matrix
                if lso is False:
                    print '\tWARNING: source coocurrences do not make much sense without LSO splitting'
                cv = res.lsocv() if lso else res.crscv()
                if cv is None:
                    continue
                for fold_num, fold in enumerate(cv.folds()):
                    try:
                        coocurrences.append(fold.sources_coocurrences(dset=dset))
                        valid_expids.append(expid)
                        fold_ids.append(fold_num)
                    except:
                        pass
                res._close_h5()
            group = h5.require_group(group_id)
            group['coocurrences'] = np.array(coocurrences)
            group['expids'] = valid_expids
            group['folds'] = fold_ids
    with h5py.File(cache_file, 'r') as h5:
        sources = h5[sources_dataset_id][:]
        coocurrences = h5[group_id]['coocurrences'][:].astype(np.bool)
        expids = h5[group_id]['expids'][:]
        folds = h5[group_id]['folds'][:]
        return coocurrences, sources, expids, folds
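# Hypothetical sketch built on sources_coocurrences_df: measures how often each source ends up
# in the left-out set across the cached LSO folds (assumes the 'bcrp' caches exist; the helper
# name is an assumption).
def _example_source_leaveout_frequency():
    in_test, sources, expids, folds = sources_coocurrences_df(dset='bcrp', expids=range(32))
    # fraction of folds in which each source was part of the left-out set
    return pd.Series(in_test.mean(axis=0), index=sources)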
def logreg_weights(dset='bcrp', expids=None, feats='ecfps1', model='logreg3', lso=True, eps=1E-6):
    """
    Collects the logistic regression weights of all (experiment, fold) models into a sparse matrix.

    Parameters
    ----------
    dset: string, default 'bcrp'
        The dataset id.
    expids: int list, default None
        The experiment ids; if None, use range(4096).
    feats: string, default 'ecfps1'
        The id of the feature set.
    model: string, default 'logreg3'
        The id of the model used.
    lso: boolean, default True
        Whether the experiment corresponds to a LSO or CRS partitioning scheme.
    eps: float, default 1E-6
        Small threshold below which a weight is considered zero.

    Returns
    -------
    A four-tuple (matrix, intercepts, expids, folds):
      - matrix: csr sparse matrix (num_folds x num_features)
      - intercepts: numpy array (num_folds)
      - expids: num_folds experiment ids
      - folds: num_folds array of fold_ids within each experiment
    Each row of the matrix corresponds to the tuple (expid, fold).

    :rtype: (scipy.sparse.csr_matrix, np.array, np.array, np.array)
    """
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'logreg_weights_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)
    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            is_sparse = eps is not None
            if not is_sparse:
                raise NotImplementedError()
            else:
                row = 0
                rows = array('I')
                cols = array('I')
                vals = array('d')
                intercepts = array('d')
                correct_expids = array('I')
                correct_folds = array('I')
                num_feats = None  # real nasty
                for expid in expids:
                    print dset, expid, model, feats, lso
                    res = ManysourcesResult(expid=expid, dset=dset, feats=feats, model=model)
                    cv = res.lsocv() if lso else res.crscv()
                    if cv is None:
                        continue
                    for fold_num, fold in enumerate(cv.folds()):
                        try:
                            coef, intercept = fold.model_data()
                            coef = coef[0]  # Lame
                            if num_feats is None:
                                num_feats = len(coef)
                            else:
                                assert num_feats == len(coef), 'nastiness is all around me, and so the feelin is gross'
                            intercept = intercept[()]  # Lame
                            non_zero = np.where(np.abs(coef) > eps)[0]
                            density = float(len(non_zero)) / len(coef)
                            if density > 0.35:
                                print '\tWARNING: density %.2f' % density
                            cols.extend(non_zero)
                            rows.extend([row] * len(non_zero))
                            vals.extend(coef[non_zero])
                            correct_expids.append(expid)
                            correct_folds.append(fold_num)
                            intercepts.append(intercept[0])
                            row += 1
                        except:
                            pass
                    res._close_h5()
                group = h5.require_group(group_id)
                matrix = coo_matrix((vals, (rows, cols))).tocsr()
                group['indices'] = matrix.indices
                group['indptr'] = matrix.indptr
                group['data'] = matrix.data
                group['shape'] = (matrix.shape[0], num_feats)  # nasty nasty
                group['expids'] = correct_expids
                group['folds'] = correct_folds
                group['intercepts'] = intercepts
    with h5py.File(cache_file, 'r') as h5:
        group = h5[group_id]
        matrix = csr_matrix((group['data'][:], group['indices'][:], group['indptr'][:]),
                            shape=group['shape'][:])
        return matrix, group['intercepts'][:], group['expids'][:], group['folds'][:]
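# Hypothetical sketch built on logreg_weights: counts in how many (expid, fold) models each
# feature received a weight larger than eps, a simple proxy for feature-selection stability.
# Assumes the 'bcrp' weight caches exist; the helper name is an assumption.
def _example_feature_stability():
    weights, intercepts, expids, folds = logreg_weights(dset='bcrp', expids=range(32))
    # only weights with |coef| > eps are stored, so the sparsity pattern is the selection pattern
    binary = weights.copy()
    binary.data = np.ones_like(binary.data)
    return np.asarray(binary.sum(axis=0)).ravel()  # one count per feature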