def HIV_CV(DB, data_type, list_ID, list_y, list_SMILES, dict_id2smile, n_folds):
    """Build cross-validation folds for the HIV (binary-label) dataset.

    Parameters
    ----------
    DB : str
        Dataset name; cached matrices and the fold summary live under
        'data/<DB>/'.
    data_type : str
        'kernel'   -> per-class hierarchical clustering on a molecule kernel,
        'features' -> per-class k-means on molecule features,
        'standard' -> label-stratified shuffled K-fold.
    list_ID, dict_id2smile
        Unused here; kept so all *_CV functions share one signature.
    list_y : np.ndarray
        Binary labels (0/1), one per sample.
    list_SMILES : list
        SMILES strings used to (re)build the kernel/feature matrix on a
        cache miss.
    n_folds : int
        Number of folds.

    Returns
    -------
    list of np.ndarray
        One array of sample indices per fold.
    """
    if data_type == 'kernel':
        # Load the molecule kernel from cache, or build and cache it.
        if not os.path.isfile('data/' + DB + '/' + DB + '_K.npy'):
            K = mol_build_K(list_SMILES)
            np.save('data/' + DB + '/' + DB + '_K', K)
        else:
            K = np.load('data/' + DB + '/' + DB + '_K.npy')
        list_assignment = np.zeros(K.shape[0])
        # Cluster each class separately so every fold contains both labels.
        for y in [0, 1]:
            indices = np.where(list_y == y)[0]
            K_local = K[indices, :]
            K_local = K_local[:, indices]
            local_assignment = Khierarchical_cluster(K_local, n_folds)
            list_assignment[indices] = local_assignment
    elif data_type == 'features':
        # Load the molecule feature matrix from cache, or build and cache it.
        if not os.path.isfile('data/' + DB + '/' + DB + '_X.npy'):
            X = mol_build_X(list_SMILES)
            np.save('data/' + DB + '/' + DB + '_X', X)
        else:
            X = np.load('data/' + DB + '/' + DB + '_X.npy')
        list_assignment = np.zeros(X.shape[0])
        for y in [0, 1]:
            indices = np.where(list_y == y)[0]
            X_local = X[indices, :]
            local_assignment = Xkmeans_cluster(X_local, n_folds)
            list_assignment[indices] = local_assignment
    elif data_type == 'standard':
        if not os.path.isfile('data/' + DB + '/' + DB + '_X.npy'):
            X = mol_build_X(list_SMILES)
            np.save('data/' + DB + '/' + DB + '_X', X)
        else:
            X = np.load('data/' + DB + '/' + DB + '_X.npy')
        list_assignment = np.zeros(X.shape[0])
        # Stratified folds with a fixed seed for reproducibility.
        skf = model_selection.StratifiedKFold(n_folds, shuffle=True,
                                              random_state=92)
        skf.get_n_splits(X, list_y)
        for ifold, (train_index, test_index) in enumerate(skf.split(X, list_y)):
            list_assignment[test_index] = ifold
    c = collections.Counter(list_assignment)
    print(c)
    folds = [np.where(list_assignment == cle)[0] for cle in list(c.keys())]
    # FIX: use a context manager — the original opened this file and never
    # closed it, leaking the handle and risking unflushed output.
    with open('data/' + DB + '/' + DB + '_folds.txt', 'w') as fo:
        for ifold in range(n_folds):
            fo.write("ifold" + str(ifold) + '\n')
            fo.write(str(collections.Counter(list_y[folds[ifold]])) + '\n')
            print(ifold, collections.Counter(list_y[folds[ifold]]))
            fo.write('\n')
    return folds
def SecondaryStructure_CV(DB, data_type, list_ID, list_y, list_FASTA,
                          dict_id2fasta, n_folds):
    """Build cross-validation folds for the secondary-structure dataset.

    Parameters
    ----------
    DB : str
        Dataset name; cached matrices and the fold summary live under
        'data/<DB>/'.
    data_type : str
        'kernel'   -> hierarchical clustering on a precomputed kernel,
        'features' -> k-means on a precomputed feature matrix,
        'standard' -> plain shuffled K-fold over sample indices.
    list_ID : list
        Sample identifiers; only its length is used ('standard' mode).
    list_y : sequence
        Per-sample sequences of labels (each entry is itself iterable —
        labels are flattened per fold before counting).
    list_FASTA, dict_id2fasta
        Unused here; kept so all *_CV functions share one signature.
    n_folds : int
        Number of folds.

    Returns
    -------
    list of np.ndarray
        One array of sample indices per fold.
    """
    if data_type == 'kernel':
        # Kernel must already exist on disk; it is not rebuilt here.
        if not os.path.isfile('data/' + DB + '/' + DB + '_K.npy'):
            print('data/' + DB + '/' + DB + '_K.npy', 'does not exist')
        else:
            K = np.load('data/' + DB + '/' + DB + '_K.npy')
            list_assignment = Khierarchical_cluster(K, n_folds)
    elif data_type == 'features':
        # Feature matrix must already exist on disk; it is not rebuilt here.
        if not os.path.isfile('data/' + DB + '/' + DB + '_X.npy'):
            print('data/' + DB + '/' + DB + '_X.npy', 'does not exist')
        else:
            X = np.load('data/' + DB + '/' + DB + '_X.npy')
            list_assignment = Xkmeans_cluster(X, n_folds)
    elif data_type == 'standard':
        # Dummy X: KFold only needs the number of samples.
        X = np.zeros((len(list_ID), 1))
        list_assignment = np.zeros(X.shape[0])
        skf = model_selection.KFold(n_folds, shuffle=True, random_state=92)
        skf.get_n_splits(X)
        for ifold, (train_index, test_index) in enumerate(skf.split(X)):
            list_assignment[test_index] = ifold
    # FIX: removed a live `import pdb; pdb.Pdb().set_trace()` left here from
    # debugging — it dropped every call into an interactive debugger.
    c = collections.Counter(list_assignment)
    folds = [np.where(list_assignment == cle)[0] for cle in list(c.keys())]
    with open('data/' + DB + '/' + DB + '_folds.txt', 'w') as fo:
        for ifold in range(n_folds):
            # Labels are per-position: flatten each sample's label sequence
            # before counting. Built once, written and printed identically.
            summary = "ifold " + str(ifold) + '\t' + str(
                collections.Counter(
                    [el for ll in list_y[folds[ifold]] for el in ll]))
            fo.write(summary)
            fo.write('\n')
            print(summary)
    return folds
def AtomizationEnergy_CV(DB, data_type, list_ID, list_y, list_SMILES,
                         dict_id2smile, n_folds):
    """Split the atomization-energy dataset into n_folds CV folds.

    Fold assignment depends on `data_type`: hierarchical clustering on a
    molecule kernel ('kernel'), k-means on molecule features ('features'),
    or a shuffled KFold with a fixed seed ('standard'). Matrices are
    cached as .npy files under 'data/<DB>/'. A fold-size summary is
    written to 'data/<DB>/<DB>_folds.txt'.

    Returns a list of index arrays, one per fold.
    """
    kernel_path = 'data/' + DB + '/' + DB + '_K.npy'
    feature_path = 'data/' + DB + '/' + DB + '_X.npy'
    if data_type == 'kernel':
        # Reuse the cached kernel when present, otherwise build and cache it.
        if os.path.isfile(kernel_path):
            K = np.load(kernel_path)
        else:
            K = mol_build_K(list_SMILES)
            np.save('data/' + DB + '/' + DB + '_K', K)
        list_assignment = Khierarchical_cluster(K, n_folds)
    elif data_type == 'features':
        if os.path.isfile(feature_path):
            X = np.load(feature_path)
        else:
            X = mol_build_X(list_SMILES)
            np.save('data/' + DB + '/' + DB + '_X', X)
        list_assignment = Xkmeans_cluster(X, n_folds)
    elif data_type == 'standard':
        if os.path.isfile(feature_path):
            X = np.load(feature_path)
        else:
            X = mol_build_X(list_SMILES)
            np.save('data/' + DB + '/' + DB + '_X', X)
        list_assignment = np.zeros(X.shape[0])
        # Seeded shuffle so fold membership is reproducible across runs.
        splitter = model_selection.KFold(n_folds, shuffle=True,
                                         random_state=92)
        splitter.get_n_splits(X)
        for fold_id, (_, test_index) in enumerate(splitter.split(X)):
            list_assignment[test_index] = fold_id
    fold_sizes = collections.Counter(list_assignment)
    print(fold_sizes)
    folds = [np.where(list_assignment == key)[0] for key in fold_sizes]
    summary_file = open('data/' + DB + '/' + DB + '_folds.txt', 'w')
    summary_file.write(str(fold_sizes) + '\n')
    summary_file.close()
    return folds
def PCBA_CV(DB, data_type, list_ID, list_y, list_SMILES, dict_id2smile,
            n_folds):
    """Build cross-validation folds for the PCBA family of datasets.

    Parameters
    ----------
    DB : str
        Dataset name ('PCBA', 'PCBA10', 'PCBA100', or a single-assay
        variant); cached data and the fold summary live under 'data/<DB>/'.
    data_type : str
        'kernel'   -> clustering on a molecule kernel (global for PCBA,
                      per-class otherwise),
        'features' -> k-means on molecule features (same global/per-class
                      split),
        'standard' -> KFold for the multi-label PCBA* sets, otherwise a
                      label-stratified KFold.
    list_ID, list_y
        NOTE(review): in 'standard' mode both are re-loaded from pickled
        files on disk, overwriting the arguments — presumably because the
        caller passes placeholders there; confirm against call sites.
    list_SMILES : list
        SMILES strings used to (re)build the kernel/feature matrix on a
        cache miss.
    dict_id2smile
        Unused here; kept so all *_CV functions share one signature.
    n_folds : int
        Number of folds.

    Returns
    -------
    list of np.ndarray
        One array of sample indices per fold.
    """
    if data_type == 'kernel':
        if not os.path.isfile('data/' + DB + '/' + DB + '_K.npy'):
            K = mol_build_K(list_SMILES)
            np.save('data/' + DB + '/' + DB + '_K', K)
        else:
            K = np.load('data/' + DB + '/' + DB + '_K.npy')
        if DB == 'PCBA':
            # Multi-label full set: cluster globally (labels can't be
            # stratified per class here).
            list_assignment = Khierarchical_cluster(K, n_folds)
        else:
            # Binary single-assay variant: cluster each class separately.
            list_assignment = np.zeros(K.shape[0])
            for y in [0, 1]:
                indices = np.where(list_y == y)[0]
                K_local = K[indices, :]
                K_local = K_local[:, indices]
                local_assignment = Khierarchical_cluster(K_local, n_folds)
                list_assignment[indices] = local_assignment
    elif data_type == 'features':
        if not os.path.isfile('data/' + DB + '/' + DB + '_X.npy'):
            X = mol_build_X(list_SMILES)
            np.save('data/' + DB + '/' + DB + '_X', X)
        else:
            X = np.load('data/' + DB + '/' + DB + '_X.npy')
        if DB == 'PCBA':
            list_assignment = Xkmeans_cluster(X, n_folds)
        else:
            list_assignment = np.zeros(X.shape[0])
            for y in [0, 1]:
                indices = np.where(list_y == y)[0]
                X_local = X[indices, :]
                local_assignment = Xkmeans_cluster(X_local, n_folds)
                list_assignment[indices] = local_assignment
    elif data_type == 'standard':
        # FIX: the original leaked both pickle file handles
        # (pickle.load(open(...))); use context managers instead.
        with open('data/' + DB + '/' + DB + '_list_ID.data', 'rb') as f:
            list_ID = pickle.load(f)
        with open('data/' + DB + '/' + DB + '_list_y.data', 'rb') as f:
            list_y = np.array(pickle.load(f))
        # Dummy X: the splitters only need the number of samples.
        X = np.zeros((len(list_ID), 1))
        list_assignment = np.zeros(X.shape[0])
        if DB not in ['PCBA', 'PCBA10', 'PCBA100']:
            # Binary labels: stratify so folds keep the class balance.
            skf = model_selection.StratifiedKFold(n_folds, shuffle=True,
                                                  random_state=92)
            skf.get_n_splits(X, list_y)
            for ifold, (train_index, test_index) in \
                    enumerate(skf.split(X, list_y)):
                list_assignment[test_index] = ifold
        else:
            # Multi-label matrices can't be stratified: plain KFold.
            skf = model_selection.KFold(n_folds, shuffle=True,
                                        random_state=92)
            skf.get_n_splits(X)
            for ifold, (train_index, test_index) in enumerate(skf.split(X)):
                list_assignment[test_index] = ifold
    c = collections.Counter(list_assignment)
    print(c)
    folds = [np.where(list_assignment == cle)[0] for cle in list(c.keys())]
    # FIX: use a context manager — the original never closed this file.
    with open('data/' + DB + '/' + DB + '_folds.txt', 'w') as fo:
        for ifold in range(n_folds):
            fo.write("ifold" + str(ifold) + '\n')
            if DB in ['PCBA', 'PCBA10', 'PCBA100']:
                # Multi-label: report per-class label counts in each fold.
                for iclass in range(list_y.shape[1]):
                    fo.write("iclass " + str(iclass) + ' ' +
                             str(collections.Counter(
                                 list_y[folds[ifold], iclass])) + '\n')
                    print("iclass " + str(iclass) + ' ' +
                          str(collections.Counter(
                              list_y[folds[ifold], iclass])))
            else:
                fo.write(str(collections.Counter(list_y[folds[ifold]])) +
                         '\n')
                print(ifold, collections.Counter(list_y[folds[ifold]]))
            fo.write('\n')
    return folds