def substructs_weights_one_source(source, model="logreg3", feats="ecfps1", dset="bcrp", num_expids=4096):
    """
    Collect the substructure weights of the LSO models in which `source` is in test / in train.

    We now use the Hub. When the source is in train it shows up in several folds of the
    same expid, so the weights of those folds are averaged into one vector per expid.

    Parameters
    ----------
    source: name of the column to select in the table returned by `hub.scoocs()`
    model, feats, dset: forwarded to `Hub` (`dset` is also the key into PROBLEMATIC_EXPIDS)
    num_expids: experiments 0..num_expids-1 are requested from the Hub

    Returns
    -------
    A 3-tuple:
      - the list of expids for which the source was in train (problematic expids excluded)
      - an array with one averaged weight vector per expid of that list, in the same order
      - a dense array with the weights of each model for which the source was in test
    """
    hub = Hub(dset_id=dset, lso=True, model=model, feats=feats, expids=tuple(range(num_expids)))
    source_coocs = hub.scoocs()
    problematic = PROBLEMATIC_EXPIDS[dset]

    # (expid, foldnum) pairs of the source in test; a set gives O(1) membership tests below
    in_test = set(
        (expid, foldnum)
        for expid, foldnum in source_coocs[source_coocs[source]].index
        if expid not in problematic
    )

    # {expid: [foldnums]} of the source in train
    indices_in_train_dict = defaultdict(list)
    for expid, foldnum in source_coocs[source_coocs[source] == False].index:  # noqa: E712
        if expid not in problematic:
            indices_in_train_dict[expid].append(foldnum)
    # Freeze the order once so the returned expids and the averaged weights stay aligned
    train_expids = list(indices_in_train_dict)
    train_folds = dict((expid, set(folds)) for expid, folds in indices_in_train_dict.items())

    # Get the corresponding weights: a single pass over the model rows classifies each row,
    # instead of re-scanning all the rows once per expid.
    weights, _, model_expids, model_foldnums = hub.logreg_models()
    rows_in_test = []
    rows_in_train = defaultdict(list)
    for row, (expid, foldnum) in enumerate(izip(model_expids, model_foldnums)):
        if (expid, foldnum) in in_test:
            rows_in_test.append(row)
        if foldnum in train_folds.get(expid, ()):
            rows_in_train[expid].append(row)

    weights_in_test = weights[rows_in_test, :].todense()

    # For train, we get several foldnums per expid and we want to average those weights
    importances_source_in_lso = []
    for expid in train_expids:
        # .get() keeps an empty row selection for expids without any model row,
        # without inserting new keys in the defaultdict
        w = weights[rows_in_train.get(expid, []), :]
        importances_source_in_lso.append(np.squeeze(np.asarray(w.tocsc().mean(axis=0))))

    return train_expids, np.array(importances_source_in_lso), np.asarray(weights_in_test)
def _plot_substructs_mosaic(substructs, dest_file, images_per_row=4, tile_size=200):
    """
    Plot each substructure with `plot_smarts` and tile the resulting PNGs into a single image.

    The individual pictures go to a fresh temporary directory that is always removed
    afterwards, so no unrelated PNG can end up in the mosaic and no user file gets deleted.

    Parameters
    ----------
    substructs: iterable of substructures, each passed as is to `plot_smarts`
    dest_file: path where the mosaic is saved
    images_per_row: number of tiles per mosaic row
    tile_size: side, in pixels, of each (square) tile
    """
    import shutil
    import tempfile
    from PIL import Image

    tmp_dir = tempfile.mkdtemp(prefix="substructs-")
    try:
        for substr in substructs:
            # assumes plot_smarts writes a PNG into the directory it is given -- TODO confirm
            plot_smarts(substr, tmp_dir)
        ims = [Image.open(f) for f in glob.glob(op.join(tmp_dir, "*.png"))]
        num_lines = int(math.ceil(float(len(ims)) / images_per_row))
        mosaic = Image.new("RGB", (images_per_row * tile_size, num_lines * tile_size), color="white")
        for i, im in enumerate(ims):
            im.thumbnail((tile_size, tile_size), Image.ANTIALIAS)
            # // keeps the row index an integer regardless of the division semantics in use
            mosaic.paste(im, (tile_size * (i % images_per_row), tile_size * (i // images_per_row)))
        mosaic.save(dest_file)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)


def positive_negative_substructs(
    model="logreg3", feats="ecfps1", dset="bcrp", lso=True, num_expids=4096, top_interesting=20
):
    """
    Given a dataset, collect all weights for all substructures across all expids, then average
    them and check the extremes: positive weights mean a substructure that is likely to occur in
    inhibitors, negative weights mean substructures more likely to occur in non-inhibitors.
    Are we learning something?

    The `top_interesting` substructures at each extreme are plotted into
    <MANYSOURCES_ROOT>/data/results/<dset>/substructs_max_{negative,positive}_weights_lso.png

    Returns
    -------
    (top_inactives, top_inhibitors): two lists of (average weight, substructure) pairs, both
    sorted by increasing weight (so the most positive substructure is the last of top_inhibitors)
    """
    # NOTE(review): expids is given as an int here while substructs_weights_one_source passes
    # tuple(range(num_expids)) -- confirm Hub accepts both.
    hub = Hub(dset_id=dset, expids=num_expids, lso=lso, model=model, feats=feats)
    weights, _, _, _ = hub.logreg_models()
    # Average over the rows (one row per model), leaving one value per column
    average_weights = np.asarray(weights.mean(axis=0))[0]
    i2s = ManysourcesDataset(dset).ecfps(no_dupes=True).i2s
    order = np.argsort(average_weights)
    ordered_substructures = i2s[order]
    ordered_importances = average_weights[order]
    # list() so the pairs can be both iterated here and returned, whatever zip gives back
    top_inactives = list(zip(ordered_importances[0:top_interesting], ordered_substructures[0:top_interesting]))
    top_inhibitors = list(zip(ordered_importances[-top_interesting:], ordered_substructures[-top_interesting:]))

    # Let's plot them!
    # NOTE(review): the file names say "lso" even when lso=False; kept so existing outputs keep their names.
    results_dir = op.join(MANYSOURCES_ROOT, "data", "results", dset)
    _plot_substructs_mosaic(
        [substr for _, substr in top_inactives],
        op.join(results_dir, "substructs_max_negative_weights_lso.png"),
    )
    _plot_substructs_mosaic(
        [substr for _, substr in top_inhibitors],
        op.join(results_dir, "substructs_max_positive_weights_lso.png"),
    )

    return top_inactives, top_inhibitors
# NOTE(review): the print below is the tail of a definition that starts before this chunk;
# its indentation is reconstructed (assumed to sit inside a function body) -- confirm.
    print '\t%s' % '\n\t'.join(X.columns[influential])


# Script entry point: for each molecule and each of the two hubs (lso=True and lso=False),
# regress the loss with a LinearRegression and print the r2 plus the most influential molecules.
if __name__ == '__main__':

    # Hand-picked molecule ids; not used by the loop below, which goes over all the molids.
    MOLIDS = [
        's=Matsson_2009__n=Bromosulfalein',
        's=Zembruski_2011__n=103268452',
        's=Patel_2011__n=19',
        's=Ochoa-Puentes_2011__n=131273183',
        's=Jin_2006__n=Ginsenoside Rg1',
        's=Matsson_2007__n=Timolol',
    ]

    # Same dataset and experiment ids, with and without lso
    hub_lso = Hub(dset_id='bcrp', lso=True, expids=range(40000))
    hub_csr = Hub(dset_id='bcrp', lso=False, expids=range(40000))

    # All (molecule, hub) combinations; molids are sorted to make the output order deterministic
    for molid, hub in product(sorted(hub_csr.mols().molids()), (hub_lso, hub_csr)):
        # print molid, hub.lso
        # rfr_the_loss(hub, molid)
        molid, most_influential, r2, _ = regress_the_loss(hub, molid, regressor=LinearRegression())
        print molid, hub.lso, r2
        # One line per (influential molecule id, coefficient) pair
        for infmolid, coeff in most_influential:
            print '\t %.4f %s' % (coeff, infmolid)
        print '-' * 80

    # Other molecule ids of interest (in other datasets), kept for reference:
    # MOLID = 'CHEMBL1951453'  # hERG
    # MOLID = 'NOCAS_M43'  # mutagenicity, FAILS with BAD SMELL
    # MOLID = '74-83-9'  # mutagenicity
    # MOLID = 'Bromocriptine'  # pgp-cruciani, BSEP HIT!!!