Пример #1
0
    # Hold out part of `isect` as test data. Only the first split produced by
    # the shuffled KFold is consumed; fold1[1] holds that split's test indices.
    fold1 = next(KFold(n_splits=test_n_splits, shuffle=True).split(isect))
    # Build the index set once so every membership check below is O(1) rather
    # than a linear scan of the index array for each element of `isect`.
    held_out = set(map(int, fold1[1]))
    tests = np.array([v for i, v in enumerate(isect) if i in held_out])
    np.save(Path(f'data/test/scop40_{n_splits}fold_sf{sf_sunid}_testdata_{test_n_splits}fold.npy'), tests)
    # select domain sids only in scop40 for training data
    # (everything in `samples` that was not held out above)
    train = np.setdiff1d(samples, tests)
    np.save(Path(f'data/train/scop40_{n_splits}fold_trainingdata.npy'), train)
    # {sf: [sid]} list for making alignment pairs in the same superfamily
    # NOTE(review): keys here are `sccs` values, whereas the else-branch keys
    # its mapping by `sunid` — confirm that consumers expect this difference.
    hie = {}
    for i in train:
        dom = scop100_hie.getDomainBySid(i)
        if not dom:
            # The lookup returned nothing for this sid, so it cannot be placed
            # in a superfamily and is left out of the mapping.
            # TODO: find out why some training sids have no domain entry.
            continue
        sf = dom.getAscendent('sf').sccs
        hie.setdefault(sf, []).append(i)
    # Use a context manager so the pickle file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with Path(f'data/train/scop40_{n_splits}fold_sf{sf_sunid}_hie.pkl').open('wb') as hie_file:
        pickle.dump(hie, hie_file)
else:
    # Training data: every entry of scop40 that is not part of test_data.
    train_members = [x for x in scop40 if x not in test_data]
    train = np.array(train_members)
    train_file = Path(f'data/train/scop40_{n_splits}fold.npy')
    np.save(train_file, train)
    # Test data: test_data itself, stored as an array under data/test.
    test = np.array(test_data)
    test_file = Path(f'data/test/scop40_{n_splits}fold.npy')
    np.save(test_file, test)
    # Map each superfamily's sunid to the training sids found among its 'px'
    # descendants. Superfamilies with no overlap get an empty list.
    hie = {}
    for sf in scop100_hie.getRoot().getDescendents('sf'):
        isect = np.intersect1d(train, np.array([x.sid for x in sf.getDescendents('px')]))
        # tolist() turns the array into a plain Python list before pickling.
        hie[sf.sunid] = isect.tolist()
    # Use a context manager so the pickle file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with Path(f'data/train/scop40_{n_splits}fold_hie.pkl').open('wb') as hie_file:
        pickle.dump(hie, hie_file)