def molecules_coocurrences_df(dset='bcrp', expids=None, feats='ecfps1', model='logreg3', lso=True):
    """Return the cached test-fold membership matrix of the molecules of a dataset.

    The first time a (dset, feats, model, lso) combination is requested, every
    experiment in `expids` is loaded and one row is built per cross-validation
    fold, with ones at the fold's test indices and zeros elsewhere. The rows
    are stored in the HDF5 cache file; later calls only read them back, so
    `expids` is ignored once the cache entry exists.

    Parameters
    ----------
    dset : dataset name, used to locate the results and as part of the cache key
    expids : iterable of experiment ids to scan; None means range(4096)
    feats : features name, part of the cache key
    model : model name, part of the cache key
    lso : if True use `res.lsocv()`, otherwise `res.crscv()`

    Returns
    -------
    A 4-tuple (coocurrences, molids, expids, folds) where:
      - coocurrences is a boolean matrix with one row per (experiment, fold)
      - molids is the array of molecule ids stored for the dataset
      - expids is the experiment id of each row
      - folds is the fold number of each row
    """
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'molecules_coocurrences_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r' % (dset, feats, model, lso)
    molecules_dataset_id = '/dset=%s/molecules' % dset

    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            coocurrences = []
            valid_expids = []
            fold_ids = []
            loaded_dset = None
            for expid in expids:
                print('%s %s %s %s %s' % (dset, expid, model, feats, lso))
                res = ManysourcesResult(expid=expid, dset=dset, feats=feats, model=model)
                try:
                    if loaded_dset is None:
                        # NOTE(review): the returned dataset is not used below; the call
                        # is kept from the original in case it matters - confirm.
                        loaded_dset = res.ms_dset()
                    if molecules_dataset_id not in h5:
                        # Store the molecule ids once per dataset
                        h5[molecules_dataset_id] = res.molids()
                    cv = res.lsocv() if lso else res.crscv()
                    if cv is None:
                        continue  # the finally clause still closes the result
                    for fold_num, fold in enumerate(cv.folds()):
                        try:
                            in_test = np.zeros(len(res.molids()), dtype=int)
                            in_test[fold.test_indices()] = 1
                        except Exception as ex:
                            # Best effort: skip the fold, but say so
                            print('Warning, skipping fold %d of expid %s: %r' % (fold_num, expid, ex))
                            continue
                        coocurrences.append(in_test)
                        valid_expids.append(expid)
                        fold_ids.append(fold_num)
                finally:
                    res._close_h5()
            group = h5.require_group(group_id)
            group['coocurrences'] = np.array(coocurrences)
            group['expids'] = valid_expids
            group['folds'] = fold_ids

    with h5py.File(cache_file, 'r') as h5:
        molids = h5[molecules_dataset_id][:]
        coocurrences = h5[group_id]['coocurrences'][:].astype(bool)
        expids = h5[group_id]['expids'][:]
        folds = h5[group_id]['folds'][:]
        return coocurrences, molids, expids, folds
def merge_cache_scores(dset_id='bcrp', expids=None, feats='ecfps1', model='logreg1', lso=True, calib=None):
    """Merge the per-experiment scores into matrices, caching them in an HDF5 file.

    The first time a (dset_id, feats, model, lso, calib) combination is
    requested, the scores of every experiment in `expids` are merged with
    `cv.merge_scores(calibration=calib)`; experiments for which this fails
    are skipped with a warning. Later calls read the cached result, so
    `expids` is ignored once the cache entry exists.

    Parameters
    ----------
    dset_id : dataset name, used to locate the results and as part of the cache key
    expids : iterable of experiment ids to scan; None means range(4096)
    feats : features name, part of the cache key
    model : model name, part of the cache key
    lso : if True use `res.lsocv()`, otherwise `res.crscv()`
    calib : passed to `merge_scores` as `calibration`; part of the cache key

    Returns a 5-tuple (scores, expids, folds, molids, y) where:
      - scores is a num_mols x num_valid_expids matrix with the scores of each molecule in each (valid) experiment
      - expids is a num_valid_expids array of valid expids
      - folds is a num_mols x num_valid_expids matrix of fold assignment for each molecule in each (valid) experiment
      - molids is a num_mols array with the pertinent mol_ids
      - y is a num_mols array with the labels of the molecules
    or None if any of these datasets is missing from the cache file.
    """
    cache_file = op.join(MANYSOURCES_DATA_ROOT, 'results', 'scores_df.h5')
    if expids is None:
        expids = range(4096)
    group_id = '/dset=%s/feats=%s/model=%s/lso=%r/calib=%r' % (dset_id, feats, model, lso, calib)
    dset_feats_id = '/dset=%s/feats=%s' % (dset_id, feats)

    with h5py.File(cache_file, 'a') as h5:
        if group_id not in h5:
            dset_feats_group = h5.require_group(dset_feats_id)
            scoress = []
            foldss = []
            correct_expids = []
            for expid in expids:
                print('%s %s %s %s' % (dset_id, expid, model, feats))
                res = ManysourcesResult(expid=expid, dset=dset_id, feats=feats, model=model)
                try:
                    cv = res.lsocv() if lso else res.crscv()
                    try:
                        scores, ys, folds = cv.merge_scores(calibration=calib)
                        if 'y' not in dset_feats_group:
                            dset_feats_group['y'] = np.array(ys, dtype=np.int32)
                        if 'molids' not in dset_feats_group:
                            dset_feats_group['molids'] = res.molids()
                    except Exception as ex:
                        # Best effort: the experiment is simply not "valid"
                        print('Warning, skipping expid %s: %r' % (expid, ex))
                    else:
                        scoress.append(scores)
                        foldss.append(folds)
                        correct_expids.append(expid)
                finally:
                    res._close_h5()  # TODO: make result a context manager...
            # Create the result group only once everything has been computed, so
            # an interrupted run does not leave an empty group that later calls
            # would mistake for a finished cache entry.
            group = h5.require_group(group_id)
            group['scores'] = np.array(scoress).T
            group['folds'] = np.array(foldss).T
            group['expids'] = np.array(correct_expids, dtype=np.int32)

    with h5py.File(cache_file, 'r') as h5:
        group1 = h5[group_id]
        group2 = h5[dset_feats_id]
        try:
            return (group1['scores'][:],
                    group1['expids'][:],
                    group1['folds'][:],
                    group2['molids'][:],
                    group2['y'][:])
        except KeyError:
            # Some dataset is missing, e.g. no experiment could be merged
            return None
def write():
    """Compute the per-molecule squared losses of the experiments not yet in the cache and append them.

    All inputs (cache_path, result_coords, expids, dset, feats, model, lso,
    calibration, verbose) are names this function does not define; they are
    presumably provided by the enclosing scope.

    The following is kept under the group `result_coords` of the cache file:
      - 'molids': the molecule ids, taken from the first successful experiment
      - 'expids': an int32 (n, 2) index of (expid, row) pairs, where row is the
        row of the experiment in 'losses' and 'folds', or -1 if it failed
      - 'losses': float64, one row of squared (label - score) per successful experiment
      - 'folds': int32, one row of fold assignments per successful experiment
    Experiments already present in 'expids' (failed ones included) are skipped.
    """

    def _store_rows(group, name, first_row, rows, num_cols, dtype):
        # Create or grow the resizable 2D dataset `name`, then write `rows` from `first_row` on.
        num_rows = first_row + len(rows)
        if name in group:
            # require_dataset rejects an existing dataset whose shape differs
            # from the requested one, so grow it explicitly instead.
            dataset = group[name]
            dataset.resize((num_rows, num_cols))
        else:
            dataset = group.create_dataset(name,
                                           shape=(num_rows, num_cols),
                                           dtype=dtype,
                                           maxshape=(None, num_cols))
        dataset[first_row:] = rows

    with h5py.File(cache_path, 'a') as h5:
        group = h5.require_group(result_coords)
        if 'expids' in group:
            stored_index = group['expids'][:]
            # The first column holds the expid; rows of a 2D array are not hashable
            infile_expids = set(int(stored) for stored in stored_index[:, 0])
            num_infile = len(stored_index)
        else:
            infile_expids = set()
            num_infile = 0

        expidss = []
        losses = []
        foldss = []
        molids = None
        for expid in expids:
            if verbose:
                print('%s %s' % (expid, lso))
            if expid in infile_expids:
                if verbose:
                    print('\tAlready done, skipping...')
                continue
            try:
                # look for the results corresponding to the desired expid, lso
                result = ManysourcesResult(expid=expid, dset=dset, feats=feats, model=model)
                cv = result.lsocv() if lso else result.crscv()
                # Merge the "CV" scores to have one score per compound in the dataset
                scores, labels, folds = cv.merge_scores(calibration=calibration)
                if verbose:
                    # NOTE(review): if this raises, the experiment is recorded as
                    # failed only when verbose is on - confirm that is intended.
                    print(roc_auc_score(labels, scores, average='samples'))
                loss = (labels - scores) ** 2
                if molids is None:
                    molids = cv.molids()
            except Exception:
                # We guess that this happens when the external set only contains one class, but we need to check
                print('Warning, had troubles with %s %s' % (expid, lso))
                expidss.append((expid, -1))
            else:
                # Append only after everything succeeded, so losses, folds and
                # the index stay aligned.
                expidss.append((expid, num_infile + len(losses)))
                losses.append(loss)
                foldss.append(folds)

        # write molids - N.B. assume same for all of them, which is reasonable
        if molids is not None and 'molids' not in group:
            group['molids'] = molids
        if losses:
            num_mols = len(molids)
            # write losses
            _store_rows(group, 'losses', num_infile, losses, num_mols, np.float64)
            # write folds (should be optional)
            _store_rows(group, 'folds', num_infile, foldss, num_mols, np.int32)
        if expidss:
            # write expids index last, so it never points to rows that were not written
            _store_rows(group, 'expids', num_infile, expidss, 2, np.int32)