Python Hub 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: manysources.hub

클래스/타입: Hub

hotexamples.com에서의 예제들: 3

Python Hub - 3개의 예제를 찾았습니다. 오픈소스 프로젝트에서 추출한 Python manysources.hub.Hub의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 예제를 평가하면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

logreg_models(2)

mols(1)

scoocs(1)

예제 #1

파일 보기

파일: substructures.py 프로젝트: sdvillal/manysources

def substructs_weights_one_source(source, model="logreg3", feats="ecfps1", dset="bcrp", num_expids=4096):
    """
    Collect the logistic regression substructure weights for one source under LSO.

    For every requested expid, the models are split by whether `source` was in the test fold or in the
    training folds. A source is in train for several folds of the same expid, so those weight rows are
    averaged per expid. Expids listed in PROBLEMATIC_EXPIDS[dset] are ignored.

    Parameters:
      - source: column key of the source in the co-occurrence table returned by Hub.scoocs()
      - model, feats, dset: forwarded to Hub as `model`, `feats` and `dset_id`
      - num_expids: experiments 0 .. num_expids - 1 are requested from the Hub

    Returns a tuple (train_expids, train_weights, test_weights):
      - train_expids: list of expids in which the source was in train at least once
      - train_weights: numpy array, one averaged weight row per entry of train_expids (same order)
      - test_weights: dense numpy array, one weight row per model in which the source was in test
    """
    hub = Hub(dset_id=dset, lso=True, model=model, feats=feats, expids=tuple(range(num_expids)))
    source_coocs = hub.scoocs()
    problematic = PROBLEMATIC_EXPIDS[dset]

    # (expid, foldnum) pairs where the source is in test.
    # Kept in a set so the per-row membership test below is O(1) instead of a list scan.
    test_pairs = {
        (expid, foldnum)
        for expid, foldnum in source_coocs[source_coocs[source]].index
        if expid not in problematic
    }

    # {expid: set of foldnums} where the source is in train.
    # `== False` is an element-wise comparison on the column, so it cannot be replaced by `is False` / `not`.
    train_folds = defaultdict(set)
    for expid, foldnum in source_coocs[source_coocs[source] == False].index:  # noqa: E712
        if expid not in problematic:
            train_folds[expid].add(foldnum)

    # Single pass over the fitted models: pick the rows for "in test" and group the rows for "in train"
    # by expid (the original re-scanned every row once per train expid).
    weights, _, expids, foldnums = hub.logreg_models()
    test_rows = []
    train_rows = defaultdict(list)
    for row, (expid, foldnum) in enumerate(izip(expids, foldnums)):
        if (expid, foldnum) in test_pairs:
            test_rows.append(row)
        # .get() avoids inserting empty entries into the defaultdict for unknown expids
        if foldnum in train_folds.get(expid, ()):
            train_rows[expid].append(row)

    # Average the weights over all the train folds of each expid.
    # train_expids fixes the iteration order so that it matches the rows of the returned array.
    train_expids = list(train_folds)
    importances_source_in_lso = []
    for expid in train_expids:
        fold_weights = weights[train_rows.get(expid, []), :]
        averaged = fold_weights.tocsc().mean(axis=0)
        importances_source_in_lso.append(np.squeeze(np.asarray(averaged)))

    weights_in_test = weights[test_rows, :].todense()

    return train_expids, np.array(importances_source_in_lso), np.asarray(weights_in_test)

예제 #2

파일 보기

파일: substructures.py 프로젝트: sdvillal/manysources

def _tile_substructure_plots(substructures, dest_file, columns=4, tile_size=200):
    """
    Plot each substructure and tile the resulting PNGs into one white-background image saved at dest_file.

    The individual plots are written to a private temporary directory which is always removed afterwards,
    so no unrelated PNG files are picked up or deleted.

    Parameters:
      - substructures: iterable of substructures, each passed as is to plot_smarts
      - dest_file: path of the collage image to write
      - columns: number of tiles per row
      - tile_size: side, in pixels, of each (square) tile
    """
    import shutil
    import tempfile

    from PIL import Image

    scratch_dir = tempfile.mkdtemp(prefix="manysources_substructs_")
    try:
        # plot_smarts is expected to write one PNG per call into scratch_dir
        # (inferred from the glob below -- confirm against its implementation)
        for substr in substructures:
            plot_smarts(substr, scratch_dir)
        ims = [Image.open(f) for f in glob.glob(op.join(scratch_dir, "*.png"))]
        num_rows = int(math.ceil(len(ims) / float(columns)))
        collage = Image.new("RGB", (columns * tile_size, num_rows * tile_size), color="white")
        for i, im in enumerate(ims):
            # NOTE(review): Image.ANTIALIAS is kept from the original; recent Pillow releases dropped this
            # alias in favour of Image.LANCZOS -- confirm against the PIL version this project pins.
            im.thumbnail((tile_size, tile_size), Image.ANTIALIAS)
            # floor division keeps the paste offsets integral regardless of the division semantics in use
            collage.paste(im, (tile_size * (i % columns), tile_size * (i // columns)))
        collage.save(dest_file)
    finally:
        shutil.rmtree(scratch_dir, ignore_errors=True)


def positive_negative_substructs(
    model="logreg3", feats="ecfps1", dset="bcrp", lso=True, num_expids=4096, top_interesting=20
):
    """
    Given a dataset, collect all weights for all substructures across all expids, then average them and check the
    extremes: positive weights mean a substructure that is likely to occur in inhibitors, negative weights mean
    substructures more likely to occur in non-inhibitors. Are we learning something?

    Two collages are saved under MANYSOURCES_ROOT/data/results/<dset>/: one for the `top_interesting` most
    negative average weights and one for the `top_interesting` most positive ones.

    Parameters:
      - model, feats, dset, lso: forwarded to Hub as `model`, `feats`, `dset_id` and `lso`
      - num_expids: forwarded to Hub as `expids`
      - top_interesting: how many substructures to keep at each extreme

    Returns a tuple (top_inactives, top_inhibitors); each is a list of (average weight, substructure) pairs
    sorted by increasing weight.
    """
    # NOTE(review): num_expids is passed to Hub as is (an int, not a range of expids) -- confirm Hub accepts it.
    hub = Hub(dset_id=dset, expids=num_expids, lso=lso, model=model, feats=feats)
    weights, _, _, _ = hub.logreg_models()
    average_weights = np.asarray(weights.mean(axis=0))[0]
    i2s = ManysourcesDataset(dset).ecfps(no_dupes=True).i2s

    order = np.argsort(average_weights)
    ordered_substructures = i2s[order]
    ordered_importances = average_weights[order]

    # Materialize the pairs: they are both plotted here and returned to the caller.
    top_inactives = list(zip(ordered_importances[0:top_interesting], ordered_substructures[0:top_interesting]))
    # An explicit start index is used because [-top_interesting:] selects everything when top_interesting is 0.
    tail_start = max(len(order) - top_interesting, 0)
    top_inhibitors = list(zip(ordered_importances[tail_start:], ordered_substructures[tail_start:]))

    # Let's plot them!
    # NOTE(review): the file names say "lso" even when lso=False; kept unchanged for compatibility.
    results_dir = op.join(MANYSOURCES_ROOT, "data", "results", dset)
    _tile_substructure_plots(
        [substr for _, substr in top_inactives],
        op.join(results_dir, "substructs_max_negative_weights_lso.png"),
    )
    _tile_substructure_plots(
        [substr for _, substr in top_inhibitors],
        op.join(results_dir, "substructs_max_positive_weights_lso.png"),
    )
    return top_inactives, top_inhibitors

예제 #3

파일 보기

파일: loss_regression.py 프로젝트: sdvillal/manysources

    print '\t%s' % '\n\t'.join(X.columns[influential])


if __name__ == '__main__':

    # Molecule ids of interest; not referenced by the loop below, which goes over every molecule in the hub.
    MOLIDS = [
        's=Matsson_2009__n=Bromosulfalein',
        's=Zembruski_2011__n=103268452',
        's=Patel_2011__n=19',
        's=Ochoa-Puentes_2011__n=131273183',
        's=Jin_2006__n=Ginsenoside Rg1',
        's=Matsson_2007__n=Timolol',
    ]

    # Same dataset and expids, once with lso=True and once with lso=False
    hub_lso = Hub(dset_id='bcrp', lso=True, expids=range(40000))
    hub_csr = Hub(dset_id='bcrp', lso=False, expids=range(40000))

    # Regress the loss of every molecule (in sorted order) against both hubs and report the outcome.
    # NOTE: an alternative call, rfr_the_loss(hub, molid), was left disabled at this point in the original.
    for molid, hub in product(sorted(hub_csr.mols().molids()), (hub_lso, hub_csr)):
        fitted_molid, most_influential, r2, _ = regress_the_loss(hub, molid, regressor=LinearRegression())
        # Single-argument print calls give the same output under Python 2 and are valid Python 3 syntax
        print('%s %s %s' % (fitted_molid, hub.lso, r2))
        for infmolid, coeff in most_influential:
            print('\t %.4f %s' % (coeff, infmolid))
        print('-' * 80)


# MOLID = 'CHEMBL1951453'            # hERG
# MOLID = 'NOCAS_M43'                # mutagenicity, FAILS with BAD SMELL
# MOLID = '74-83-9'                  # mutagenicity
# MOLID = 'Bromocriptine'            # pgp-cruciani, BSEP HIT!!!