def generate_df_results(molid, importances, dset, feats, model, calibration, lso):
    """
    Builds the table relating every other molecule to the target molecule `molid`.

    Parameters:
      - molid: the target molecule; it is left out of the table
      - importances: one value per molecule, looked up by the position of the molecule
        in the molids array returned by molecules_coocurrences_df
      - dset, feats, model, calibration, lso: experiment coordinates, forwarded to
        molecules_coocurrences_df and read_losses

    Returns a pandas DataFrame indexed by 'molid' with the columns
    'relabsimportance', 'importance', 'smiles' and 'cooc_loss', where
    relabsimportance = |importance| / sum(|importance|) over the rows of the table.
    """
    coocs, molids, expids, _ = molecules_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    losses, _ = read_losses(dset=dset,
                            feats=feats,
                            model=model,
                            calibration=calibration,
                            lso=lso,
                            also_folds=True)
    mols = ManysourcesDataset(dset).mols()

    rows = defaultdict(dict)
    for candidate in molids:
        if candidate == molid:
            continue  # the target molecule does not explain itself
        # First position of the candidate in molids, used to pick its importance
        position = np.where(molids == candidate)[0][0]
        rows[candidate] = {
            'importance': importances[position],
            'cooc_loss': average_loss(molid, candidate, coocs, molids, expids, losses),
            'smiles': MolToSmiles(mols.molid2mol(candidate)),
        }

    table = pd.DataFrame.from_dict(rows, orient='index')
    table.index.names = ['molid']
    abs_importance = table.importance.abs()
    table['relabsimportance'] = abs_importance / abs_importance.sum()
    return table[['relabsimportance', 'importance', 'smiles', 'cooc_loss']]
def get_X_source(molid, expids, dset, feats, model, lso=True):
    """
    Given a molid and an experiment coordinate, retrieves the matrix of source cooccurrences
    for the folds when the source of that particular molid was in test.

    Parameters:
      - molid: the target molecule; only its source (MRDK.molid2source) is used here
      - expids: row selector applied to the matrix *after* it has been restricted to the
        folds where the source of molid is in test
      - dset, feats, model, lso: experiment coordinates, forwarded to sources_coocurrences_df

    Returns a numpy array of integers with one column per source.
    """
    MRDK = ManysourcesDataset(dset).mols()
    cooc, sources, _, _ = sources_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    source_of_molid = MRDK.molid2source(molid)
    index_of_source = np.where(sources == source_of_molid)[0][0]
    # The column on which we put the condition
    # NOTE(review): presumably cooc is a boolean (folds x sources) array, so that this column
    # works as a row mask below - confirm against sources_coocurrences_df
    col = cooc[:, index_of_source]
    interesting_Cooc = cooc[col]  # the matrix X
    # Filter out the rows where we had troubles validating the model
    X = interesting_Cooc[expids, :]
    # np.int was a plain alias of the builtin int and no longer exists in recent numpy
    return np.array(X, dtype=int)
def average_loss_source(molid, source, scoocs, sources, expids, losses_df, dset):
    """
    Averages losses over the experiments selected by a (target molecule, source) pair.

    Parameters:
      - molid: the target molid (the one for which we have built the model)
      - source: the source for which we know its importance for molid
      - scoocs: source cooccurrences, with one column per entry of `sources`
      - sources: array of source names, used to find the columns of scoocs
      - expids: experiment ids, aligned with the rows of scoocs
      - losses_df: losses DataFrame, addressed as .loc[expids, molids]
      - dset: dataset id, used to map molecules to sources and back

    The experiments kept are those where the column of the source of molid is set and
    the column of `source` is not (target in test, `source` in train).
    Returns the mean over molecules of the per-molecule mean loss in those experiments.

    NOTE(review): the losses averaged are those of the molecules that belong to `source`,
    not the loss of molid, and `source` is required to be in train; the previous
    documentation talked about the loss of the target when the source is in test.
    Confirm which one is intended.
    """
    mols = ManysourcesDataset(dset).mols()
    target_source = mols.molid2source(molid)

    # Columns of the two sources in the cooccurrences matrix
    # FIXME: this should be taken from hub / molecules
    target_column = np.where(sources == target_source)[0][0]
    source_molids = mols.source2molids(source)
    source_column = np.where(sources == source)[0][0]

    # Row masks over the experiments
    target_in_test = scoocs[:, target_column]
    source_in_train = ~scoocs[:, source_column]  # FIXME: this must be parameterizable
    selected_expids = expids[target_in_test & source_in_train]

    selected_losses = losses_df.loc[selected_expids, source_molids]
    per_molecule_mean = selected_losses.mean()
    return per_molecule_mean.mean()
def generate_df_results_source(molid, importances, dset, feats, model, calibration, lso):
    """
    Builds the table relating every other source to the target molecule `molid`.

    Parameters:
      - molid: the target molecule; its own source is left out of the table
      - importances: one value per source, looked up by the position of the source
        in the sources array returned by sources_coocurrences_df
      - dset, feats, model, calibration, lso: experiment coordinates, forwarded to
        sources_coocurrences_df and read_losses

    Returns a pandas DataFrame indexed by 'source' with the columns
    'relabsimportance', 'importance' and 'cooc_loss', where
    relabsimportance = |importance| / sum(|importance|) over the rows of the table.
    """
    cooccurrences, sources, expids, _ = sources_coocurrences_df(dset=dset, feats=feats, model=model, lso=lso)
    df_losses, _ = read_losses(dset=dset, feats=feats, model=model, calibration=calibration, lso=lso,
                               also_folds=True)
    MRDK = ManysourcesDataset(dset).mols()
    # The source of the target does not depend on the loop variable: look it up only once
    source_of_molid = MRDK.molid2source(molid)
    dico_for_df = defaultdict(dict)
    for src in sources:
        if src == source_of_molid:
            continue
        dico_for_df[src] = {
            'importance': importances[np.where(sources == src)[0][0]],
            'cooc_loss': average_loss_source(molid, src, cooccurrences, sources, expids, df_losses, dset),
        }
    df = pd.DataFrame.from_dict(dico_for_df, orient='index')
    df.index.names = ['source']
    df['relabsimportance'] = df.importance.abs() / df.importance.abs().sum()
    return df[['relabsimportance', 'importance', 'cooc_loss']]
self._molids.append(molid) except Exception, _: warning('Could not compute unfolded fingerprint for molecule %s' % molid) self._failed_moldids.append(molid) def fingerprints(self): i2s = [smiles for smiles, _ in sorted(self._s2i.items(), key=itemgetter(1))] csr = coo_matrix((self._vals, (self._rows, self._cols)), shape=(len(self._molids), len(self._s2i))).tocsr() return UnfoldedFingerprints(self._molids, i2s, csr, failed_molids=self._failed_moldids) if __name__ == '__main__': from manysources.datasets import ManysourcesDataset, MANYSOURCES_MOLECULES for dset in MANYSOURCES_MOLECULES.keys(): print dset dset = ManysourcesDataset(dset) ufp = dset.ecfps() molids, X, Y = dset.ecfps_molidsXY() # This should find "duplicate" features dupe_columns = find_sparse_dupes(X, by_rows=False) for group in dupe_columns: if len(group) > 1: print 'Duplicated features: %s' % ' '.join(map(str, group)) print '\t%s' % ' '.join(ufp.substructures(list(group))) # This should remove "duplicate" features # nnz_before = X.nnz # X = zero_dupes(X, by_rows=False) # print 'Before: %d; After: %d' % (nnz_before, X.nnz) # This should find duplicate rows dupe_rows = find_sparse_dupes(X, by_rows=True) # Groups of duplicates... for group in dupe_rows:
def _sort_by_relabsimportance(df):
    """Returns df sorted by decreasing 'relabsimportance', whatever the pandas version."""
    # DataFrame.sort is gone from recent pandas, DataFrame.sort_values is missing from old ones
    if hasattr(df, 'sort_values'):
        return df.sort_values('relabsimportance', ascending=False)
    return df.sort('relabsimportance', ascending=False)


def do_for_one_molid(calibration, dset, feats, lso, model, molid, results_dir, rm_factory, by_source=False):
    """
    Fits the regression model that explains the losses of one molecule and saves its results.

    Parameters:
      - calibration, dset, feats, lso, model: experiment coordinates, forwarded to the
        data-retrieval helpers
      - molid: the target molecule
      - results_dir: directory under which the 'r2=<rsquared>__<molid>' directory is created
      - rm_factory: passed as model_factory to build_and_validate_regression_model
      - by_source: if True the explanatory variables are sources (get_X_source),
        otherwise they are molecules (get_X)

    Files written to the molecule directory:
      - model_trained_rsquare=<rsquared>.pkl: the trained model
      - smiles.txt: the smiles of the target molecule
      - results_df.pkl / results_df.html, or results_df_bysource.pkl / results_df_bysource.html:
        the influence table and its top 20 rows
      - y_dist.png: the distribution of the losses
      - verde_que_te_quiero_verde.png: target + most influential molecules (only if not by_source)

    Returns nothing.
    """
    print(molid)

    MRDK = ManysourcesDataset(dset).mols()  # FIXME: this is read on each job, so once per molecule ATM...

    # Train and evaluate the model
    y, expids = get_y(molid, dset, feats, model, calibration, lso)
    if not by_source:
        X = get_X(molid, expids, dset, feats, model, lso)
    else:
        # lso is not forwarded: it makes no sense to run by source on LSO=False
        X = get_X_source(molid, expids, dset, feats, model)
    X = ~X  # coocurrences in train, less sparse, but better interpretation unless we tweak well the numbers...
    rsquared, feat_weights, trained_model = build_and_validate_regression_model(X, y, model_factory=rm_factory)
    rsquared = float(rsquared)

    # REMOVE moldir shows r2
    moldir = op.join(results_dir, 'r2=%.2f__%s' % (rsquared, molid))
    ensure_dir(moldir)

    # Save the model
    pd.to_pickle(trained_model, op.join(moldir, 'model_trained_rsquare=%.2f.pkl' % rsquared))

    # Save the smiles
    smiles = MolToSmiles(MRDK.molid2mol(molid))
    with open(op.join(moldir, 'smiles.txt'), 'w') as writer:
        writer.write(smiles)

    # Save the molecule-influence table
    if not by_source:
        df = generate_df_results(molid, feat_weights, dset, feats, model, calibration, lso)
        pd.to_pickle(df, op.join(moldir, 'results_df.pkl'))
        # Add the target itself (after pickling): the huge relabsimportance makes it the first
        # row once sorted; its 'importance' holds r2 and its 'cooc_loss' the mean loss
        df.loc[molid] = (1E16, rsquared, smiles, np.mean(y))  # FIXME
        df['label'] = [MRDK.molid2label(each_molid) for each_molid in df.index]
        df = df[['label', 'relabsimportance', 'importance', 'smiles', 'cooc_loss']]
        df = _sort_by_relabsimportance(df)
        df.head(20).to_html(op.join(moldir, 'results_df.html'))
    else:
        df = generate_df_results_source(molid, feat_weights, dset, feats, model, calibration, lso)
        pd.to_pickle(df, op.join(moldir, 'results_df_bysource.pkl'))
        df = _sort_by_relabsimportance(df)
        df.head(20).to_html(op.join(moldir, 'results_df_bysource.html'))

    # Plot the distribution of losses (y)
    plt.figure()
    seaborn.distplot(y, bins=40)
    plt.xlim((-0.05, 1.05))
    plt.title('molid=%s, r2=%.2f' % (molid, rsquared))
    plt.savefig(op.join(moldir, 'y_dist.png'), bbox_inches='tight')
    plt.close()

    # --- WIP gridspec with chemdeco pics and things like that
    if not by_source:
        show_top = 4
        half = show_top // 2  # integer division: this is used as a slice bound
        gs = gridspec.GridSpec(show_top, 2)
        fig = plt.figure(figsize=(24, 16))

        # Plot the molecule itself (upper half of the left column)
        ax_mol = fig.add_subplot(gs[0:half, 0])
        ax_mol.grid(False)
        ax_mol.get_xaxis().set_ticks([])
        ax_mol.get_yaxis().set_ticks([])
        mol = MRDK.molid2mol(molid)
        AllChem.Compute2DCoords(mol)
        ax_mol.imshow(artdeco2(rdkit2im(mol, size=(400, 400)),
                               color='red' if df.loc[molid, 'label'] == 'INHIBITOR' else 'green',
                               chorrada=5))

        # Plot the distribution of losses (lower half of the left column)
        # The slice must be open-ended: an explicit stop of 0 would select no rows at all
        ax_distr = fig.add_subplot(gs[half:, 0])
        seaborn.distplot(y, bins=40, ax=ax_distr)

        # Plot the top (we should align all to a common scaffold and maybe highlight substructures that matter)
        # The first row of the sorted table is the target itself, so it is skipped
        for rank, (inf_molid, row) in enumerate(df.iloc[1:show_top + 1].iterrows()):
            ax_influential_mol = fig.add_subplot(gs[rank, 1])
            ax_influential_mol.grid(False)
            ax_influential_mol.get_xaxis().set_ticks([])
            ax_influential_mol.get_yaxis().set_ticks([])
            mol_color = 'red' if row['label'] == 'INHIBITOR' else 'green'
            good_or_bad_color = 'red' if row['importance'] > 0 else 'green'
            # add decos
            mol = MRDK.molid2mol(inf_molid)
            AllChem.Compute2DCoords(mol)
            image = rdkit2im(mol)
            image = artdeco1(image, decos=(('black', good_or_bad_color),))
            image = artdeco2(image, color=mol_color)
            ax_influential_mol.imshow(image)
            ax_influential_mol.set_title('%s, inf=%.4f, cooc_loss=%.4f' %
                                         (inf_molid, row['importance'], row['cooc_loss']))
            # FIXME: cooc_loss also with stddev and standard error

        fig.suptitle('%s, r2=%.2f, cooc_loss=%.4f +/- %.4f' % (molid, rsquared, float(np.mean(y)), float(np.std(y))))
        plt.savefig(op.join(moldir, 'verde_que_te_quiero_verde.png'), bbox_inches='tight')
        plt.close()