예제 #1
0
def substructs_weights_one_source(source, model="logreg3", feats="ecfps1", dset="bcrp", num_expids=4096):
    """
    Given a source, what are the weights of all the substructures when this source is in train / in test for LSO for
    all requested expids. We now use the Hub. For the cases where the source is in train, it happens many times per
    expid so we take the average.

    Parameters
    ----------
    source: column key into the table returned by `hub.scoocs()`; that column is used as a boolean mask over
            the table's (expid, foldnum) index.
    model, feats, dset: forwarded to `Hub` (as `model`, `feats` and `dset_id`).
    num_expids: experiments `0 .. num_expids - 1` are requested from the Hub.

    Experiments listed in `PROBLEMATIC_EXPIDS[dset]` are ignored.

    Returns
    -------
    A 3-tuple:
      - the expids for which the source is in train in at least one fold (keys of an {expid: [foldnums]} dict),
      - an array with, for each of those expids (same order), the mean over the matching model rows of the weights,
      - a dense array with the weight rows of the (expid, foldnum) models where the source is in test.
    """
    requested_expids = tuple(range(num_expids))
    hub = Hub(dset_id=dset, lso=True, model=model, feats=feats, expids=requested_expids)
    source_coocs = hub.scoocs()
    problematic = PROBLEMATIC_EXPIDS[dset]

    # (expid, foldnum) pairs where the source is in test; a set so that the per-row lookup below is O(1)
    pairs_in_test = {
        (expid, foldnum)
        for (expid, foldnum) in source_coocs[source_coocs[source]].index
        if expid not in problematic
    }

    # (expid, foldnum) pairs where the source is in train, as a dictionary of {expid: [foldnums]}
    # (`== False` is an element-wise comparison on the column, not an identity check)
    indices_in_train_dict = defaultdict(list)
    for expid, foldnum in source_coocs[source_coocs[source] == False].index:  # noqa: E712
        if expid not in problematic:
            indices_in_train_dict[expid].append(foldnum)
    folds_in_train = {expid: set(folds) for expid, folds in indices_in_train_dict.items()}

    # get corresponding weights; a single pass over the models collects the rows for both cases
    weights, _, model_expids, model_foldnums = hub.logreg_models()
    rows_out = []
    rows_in = {}
    for row, (expid, foldnum) in enumerate(izip(model_expids, model_foldnums)):
        if (expid, foldnum) in pairs_in_test:
            rows_out.append(row)
        train_folds = folds_in_train.get(expid)
        if train_folds is not None and foldnum in train_folds:
            rows_in.setdefault(expid, []).append(row)

    weights_in_test = weights[rows_out, :].todense()

    # For train, we get several foldnums per expid and we want to average those weights.
    # Iterate the very same keys object that is returned so that rows and expids stay aligned.
    train_expids = indices_in_train_dict.keys()
    importances_source_in_lso = []
    for expid_in in train_expids:
        w = weights[rows_in.get(expid_in, []), :]
        importances_source_in_lso.append(np.squeeze(np.asarray(w.tocsc().mean(axis=0))))

    return train_expids, np.array(importances_source_in_lso), np.asarray(weights_in_test)
예제 #2
0
def _save_substructs_grid(substructs, dest_png, images_per_row=4, cell_size=200):
    """
    Plot each substructure with `plot_smarts`, tile the resulting PNGs on a white canvas and save it to `dest_png`.

    The individual plots are written to a private temporary directory that is removed afterwards, so no
    unrelated PNG files are picked up or deleted.
    """
    import shutil
    import tempfile
    from PIL import Image

    # ANTIALIAS is the historical alias of LANCZOS and is gone from recent Pillow releases
    resample = getattr(Image, "LANCZOS", None)
    if resample is None:
        resample = Image.ANTIALIAS

    workdir = tempfile.mkdtemp(prefix="substructs_")
    try:
        for substr in substructs:
            # presumably writes one PNG for the substructure into workdir -- the glob below relies on it
            plot_smarts(substr, workdir)
        # sorted so that the tile order does not depend on the directory listing order
        ims = [Image.open(f) for f in sorted(glob.glob(op.join(workdir, "*.png")))]
        num_lines = int(math.ceil(float(len(ims)) / images_per_row))
        canvas = Image.new("RGB", (images_per_row * cell_size, num_lines * cell_size), color="white")
        for i, im in enumerate(ims):
            im.thumbnail((cell_size, cell_size), resample)
            # floor division: the paste offsets must be integers
            canvas.paste(im, (cell_size * (i % images_per_row), cell_size * (i // images_per_row)))
        canvas.save(dest_png)
    finally:
        shutil.rmtree(workdir, ignore_errors=True)


def positive_negative_substructs(
    model="logreg3", feats="ecfps1", dset="bcrp", lso=True, num_expids=4096, top_interesting=20
):
    """
    Given a dataset, collect all weights for all substructures across all expids, then average them and check the
    extremes: positive weights mean a substructure that is likely to occur in inhibitors, negative weights mean
    substructures more likely to occur in non-inhibitors. Are we learning something?

    The `top_interesting` substructures at each extreme are plotted and saved as two image grids under
    MANYSOURCES_ROOT/data/results/<dset>/.

    Returns
    -------
    (top_inactives, top_inhibitors): two lists of (average weight, substructure) tuples, the first with the
    lowest average weights and the second with the highest, both in ascending weight order.
    """
    # NOTE(review): expids is passed as an int here while other callers in this file pass a sequence of
    # expids -- confirm that Hub accepts both.
    hub = Hub(dset_id=dset, expids=num_expids, lso=lso, model=model, feats=feats)
    weights, _, _, _ = hub.logreg_models()
    average_weights = np.asarray(weights.mean(axis=0))[0]
    i2s = ManysourcesDataset(dset).ecfps(no_dupes=True).i2s
    order = np.argsort(average_weights)
    ordered_substructures = i2s[order]
    ordered_importances = average_weights[order]
    # Materialized as lists: they are iterated for plotting and then returned to the caller
    top_inactives = list(zip(ordered_importances[0:top_interesting], ordered_substructures[0:top_interesting]))
    top_inhibitors = list(zip(ordered_importances[-top_interesting:], ordered_substructures[-top_interesting:]))

    # Let's plot them!
    # NOTE: the file names say "lso" regardless of the value of the `lso` argument.
    results_dir = op.join(MANYSOURCES_ROOT, "data", "results", dset)
    _save_substructs_grid(
        [substr for _, substr in top_inactives],
        op.join(results_dir, "substructs_max_negative_weights_lso.png"),
    )
    _save_substructs_grid(
        [substr for _, substr in top_inhibitors],
        op.join(results_dir, "substructs_max_positive_weights_lso.png"),
    )
    return top_inactives, top_inhibitors
예제 #3
0
    print '\t%s' % '\n\t'.join(X.columns[influential])


if __name__ == '__main__':

    # Some molecule ids of interest.
    # NOTE(review): MOLIDS is not used by the loop below, which goes over every molid in hub_csr.
    MOLIDS = [
        's=Matsson_2009__n=Bromosulfalein',
        's=Zembruski_2011__n=103268452',
        's=Patel_2011__n=19',
        's=Ochoa-Puentes_2011__n=131273183',
        's=Jin_2006__n=Ginsenoside Rg1',
        's=Matsson_2007__n=Timolol',
    ]

    # One hub per setting (lso=True / lso=False), both over expids 0..39999
    hub_lso = Hub(dset_id='bcrp', lso=True, expids=range(40000))
    hub_csr = Hub(dset_id='bcrp', lso=False, expids=range(40000))

    # For each molecule, regress the loss with each hub and report r2 and the most influential molecules
    for molid, hub in product(sorted(hub_csr.mols().molids()), (hub_lso, hub_csr)):
        # print molid, hub.lso
        # rfr_the_loss(hub, molid)
        molid, most_influential, r2, _ = regress_the_loss(hub, molid, regressor=LinearRegression())
        # Single-argument print calls: same output under Python 2, and valid syntax under Python 3
        print('%s %s %s' % (molid, hub.lso, r2))
        for infmolid, coeff in most_influential:
            print('\t %.4f %s' % (coeff, infmolid))
        print('-' * 80)


# MOLID = 'CHEMBL1951453'            # hERG
# MOLID = 'NOCAS_M43'                # mutagenicity, FAILS with BAD SMELL
# MOLID = '74-83-9'                  # mutagenicity
# MOLID = 'Bromocriptine'            # pgp-cruciani, BSEP HIT!!!