def get_X_source(molid, expids, dset, feats, model, lso=True):
    """
    Given a molid and an experiment coordinate, retrieve the matrix of
    cooccurrences for the folds in which the source of that molid was in test.

    :param molid: identifier of the molecule; its source (as reported by
                  ``molid2source``) decides which rows are kept
    :param expids: row indexer applied *after* the per-source filtering, i.e.
                   it addresses rows of the already filtered matrix
    :param dset: dataset identifier, used to build the ManysourcesDataset and
                 forwarded to ``sources_coocurrences_df``
    :param feats: forwarded to ``sources_coocurrences_df``
    :param model: forwarded to ``sources_coocurrences_df``
    :param lso: forwarded to ``sources_coocurrences_df`` (default True)

    :return: the selected cooccurrences as an integer numpy array
    :raises IndexError: if the source of ``molid`` is not present in the
                        sources returned by ``sources_coocurrences_df``
    """
    mols = ManysourcesDataset(dset).mols()
    cooc, sources, _, _ = sources_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    source_of_molid = mols.molid2source(molid)
    # Position of the molecule's source in `sources`; the same position is
    # used below as the column of `cooc` describing that source.
    index_of_source = np.where(sources == source_of_molid)[0][0]
    # The column on which we put the condition.
    # NOTE(review): presumably a boolean column, so that it acts as a row mask
    # below -- confirm against sources_coocurrences_df.
    source_in_test = cooc[:, index_of_source]
    rows_with_source = cooc[source_in_test]
    # Filter out the rows where we had troubles validating the model.
    X = rows_with_source[expids, :]
    # `np.int` was only an alias of the builtin `int` and has been removed in
    # NumPy 1.24; the builtin yields exactly the same dtype.
    return np.array(X, dtype=int)
def generate_df_results_source(molid, importances, dset, feats, model, calibration, lso):
    """
    Build a per-source summary dataframe for a molecule.

    Every source returned by ``sources_coocurrences_df`` gets one row, except
    the source the molecule itself belongs to, which is skipped.

    :param molid: identifier of the molecule under study
    :param importances: sequence indexed like the sources returned by
                        ``sources_coocurrences_df``; one importance per source
    :param dset: dataset identifier
    :param feats: forwarded to ``sources_coocurrences_df`` and ``read_losses``
    :param model: forwarded to ``sources_coocurrences_df`` and ``read_losses``
    :param calibration: forwarded to ``read_losses``
    :param lso: forwarded to ``sources_coocurrences_df`` and ``read_losses``

    :return: a pandas DataFrame indexed by 'source' with the columns
             'relabsimportance' (absolute importance divided by the sum of the
             absolute importances of the rows kept), 'importance' and
             'cooc_loss' (the value of ``average_loss_source`` for that source)
    """
    # The folds returned by both helpers are not needed here.
    cooccurrences, sources, expids, _ = sources_coocurrences_df(dset=dset, feats=feats,
                                                                model=model, lso=lso)
    df_losses, _ = read_losses(dset=dset, feats=feats, model=model,
                               calibration=calibration, lso=lso, also_folds=True)
    # The molecule's own source does not change between iterations: look it up once.
    source_of_molid = ManysourcesDataset(dset).mols().molid2source(molid)
    rows = {}
    for src in sources:
        if src == source_of_molid:
            continue
        # First position of `src` in `sources` selects its importance.
        src_index = np.where(sources == src)[0][0]
        rows[src] = {
            'importance': importances[src_index],
            'cooc_loss': average_loss_source(molid, src, cooccurrences, sources,
                                             expids, df_losses, dset),
        }
    df = pd.DataFrame.from_dict(rows, orient='index')
    df.index.names = ['source']
    abs_importance = df.importance.abs()
    df['relabsimportance'] = abs_importance / abs_importance.sum()
    return df[['relabsimportance', 'importance', 'cooc_loss']]
def scoocs(self):
    """
    Returns a multilevel-indexed dataframe of sources coocurrences in test
    for each partition train/test.

    The dataframe from this palindromic function has:
      - a sorted index with two levels (expid, fold)
      - a sorted column index, one column per source
      - boolean values

    It would look like this

    |----------------------------------------|
    |      index      |         data         |
    |-----------------|----------------------|
    | expid  | fold   | src1  | src2  | ...  |
    |----------------------------------------|
    |   0    |   0    | False | False | ...  |
    |   1    |   0    | True  | False | ...  |
    |  ...   |  ...   |  ...  |  ...  | ...  |
    |----------------------------------------|

    The coocurrences are obtained from ``sources_coocurrences_df`` using this
    object's ``expids``, ``dset_id``, ``feats``, ``model`` and ``lso``.

    :rtype: pandas.DataFrame
    """
    scoocs, sources, expids, folds = sources_coocurrences_df(
        expids=self.expids,
        dset=self.dset_id,
        feats=self.feats,
        model=self.model,
        lso=self.lso
    )
    # Name the levels at construction time: assigning `.name` on
    # `index.levels[i]` does not name the MultiIndex levels and raises a
    # RuntimeError on pandas >= 1.0.
    index = MultiIndex.from_arrays(arrays=(expids, folds), names=('expid', 'fold'))
    mcooc_df = pd.DataFrame(data=scoocs, index=index, columns=sources)
    # Sort rows by (expid, fold), then columns by source.
    return mcooc_df.sort_index(axis=0).sort_index(axis=1)