fold1 = next(KFold(n_splits=test_n_splits, shuffle=True).split(isect)) tests = np.array([v for i, v in enumerate(isect) if i in fold1[1]]) np.save(Path(f'data/test/scop40_{n_splits}fold_sf{sf_sunid}_testdata_{test_n_splits}fold.npy'), tests) # select domain sids only in scop40 for training data train = np.setdiff1d(samples, tests) np.save(Path(f'data/train/scop40_{n_splits}fold_trainingdata.npy'), train) # {sf: [sid]} list for making alignment pairs in the same superfamily hie = {} for i in train: dom = scop100_hie.getDomainBySid(i) if dom: sf = dom.getAscendent('sf').sccs else: # FIX: Why nothing? continue if sf in hie: hie[sf].append(i) else: hie[sf] = [i] pickle.dump(hie, Path(f'data/train/scop40_{n_splits}fold_sf{sf_sunid}_hie.pkl').open('wb')) else: train = np.array([x for x in scop40 if x not in test_data]) np.save(Path(f'data/train/scop40_{n_splits}fold.npy'), train) test = np.array(test_data) np.save(Path(f'data/test/scop40_{n_splits}fold.npy'), test) hie = {} for sf in scop100_hie.getRoot().getDescendents('sf'): isect = np.intersect1d(train, np.array([x.sid for x in sf.getDescendents('px')])) hie[sf.sunid] = isect.tolist() pickle.dump(hie, Path(f'data/train/scop40_{n_splits}fold_hie.pkl').open('wb'))