def main(args):
    BSZ, RUNS, LR, N_EPOCH = args.batch_size, args.runs, args.lr, args.epochs
    DATAFOLDER = Path(args.datafolder)
    assert DATAFOLDER.exists()
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    OUTPUT = str(out_dir / DATAFOLDER.resolve().stem)  # file prefix for checkpoints

    # load data
    NCLASS = len(args.target_name)
    OUTPUT += '_'.join(args.target_name)
    train_data, norm_fn, rstr_fn = construct_dataset(
        DATAFOLDER / 'train.csv', args.target_name, args.sample, args.use_tqdm)
    val_smiles, val_y = load_csv(DATAFOLDER / 'val.csv', args.target_name, args.sample)
    test_smiles, test_y = load_csv(DATAFOLDER / 'test.csv', args.target_name, args.sample)
    if len(args.target_name) == 1:
        val_y, test_y = np.squeeze(val_y), np.squeeze(test_y)
    val_data = MolData(val_smiles, norm_fn(val_y), args.use_tqdm)
    test_data = MolData(test_smiles, norm_fn(test_y), args.use_tqdm)
    if args.fp_method == FP_METHODS[0]:
        raise NotImplementedError

    res = []
    for run in range(RUNS):
        train_loader = DataLoader(train_data, batch_size=BSZ, shuffle=True,
                                  drop_last=True, pin_memory=True)
        valid_loader = DataLoader(val_data, batch_size=BSZ, shuffle=False, pin_memory=True)
        test_loader = DataLoader(test_data, batch_size=BSZ, shuffle=False)
        # net = QSAR(hid_dim=128, n_class=NCLASS)
        net = create_net(hid_dim=128, n_class=NCLASS, pre_trained=args.fine_tune)
        model_path = f"{OUTPUT}_run{run}"  # per-run suffix so runs don't overwrite each other
        net = net.fit(train_loader, valid_loader, epochs=N_EPOCH,
                      path=model_path, criterion=nn.MSELoss(), lr=LR)
        score = net.predict(test_loader)
        gt = test_y
        prd = rstr_fn(score)  # undo the target normalization before scoring
        res.append(mse(gt, prd, 0))
        print(f"mse_{DATAFOLDER.stem}_RUN_{run}: {mse(gt, prd)}")
        print(f"mse_percls_{DATAFOLDER.stem}_RUN_{run}: {mse(gt, prd, 0)}")
    avg_mse, std_mse = np.asarray(res).mean(), np.asarray(res).std()
    return avg_mse, std_mse
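# The `mse` helper used above is not defined in this file. A minimal sketch of
# what it is assumed to do (overall MSE by default, one value per target
# column when axis=0 is passed):
def mse(gt, prd, axis=None):
    """Mean squared error; axis=0 yields a per-target vector."""
    gt, prd = np.asarray(gt), np.asarray(prd)
    return ((gt - prd) ** 2).mean(axis=axis)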
def main(reg=False, is_extra=True):
    pair = ['TARGET_CHEMBLID', 'CMPD_CHEMBLID', 'PCHEMBL_VALUE',
            'CANONICAL_SMILES', 'ACTIVITY_COMMENT', 'STANDARD_TYPE', 'RELATION']
    df = pd.read_csv('data/AR_ALL.csv')
    cmps = df.set_index(pair[1])[pair[3]].drop_duplicates()
    df = df[pair].set_index(pair[0:2])
    # average replicate measurements per (target, compound) pair
    df['PCHEMBL_VALUE'] = df.groupby(pair[0:2]).mean()
    numery = df[pair[2:4]].dropna().drop_duplicates()
    # records without a numeric pChEMBL value: flag as inactive (3.99)
    comments = df[df.ACTIVITY_COMMENT.str.contains('Not Active') == True]
    inhibits = df[(df.STANDARD_TYPE == 'Inhibition') & df.RELATION.isin(['<', '<='])]
    relations = df[df.STANDARD_TYPE.isin(['EC50', 'IC50', 'Kd', 'Ki'])
                   & df.RELATION.isin(['>', '>='])]
    binary = pd.concat([comments, inhibits, relations], axis=0)
    binary = binary[~binary.index.isin(numery.index)]
    binary['PCHEMBL_VALUE'] = 3.99
    binary = binary[pair[2:4]].dropna().drop_duplicates()
    df = pd.concat([numery, binary])  # DataFrame.append is deprecated
    df = df[pair[2]].unstack(pair[0])
    df = df.sample(len(df))  # shuffle rows
    if reg:
        test = binary[pair[2]].sample(len(binary)).unstack(pair[0])
    else:
        df = (df > 6.5).astype(float)  # binarize activity at pChEMBL 6.5
        test = df.sample(len(df) // 8)
        df = df.drop(test.index)
    data = df if is_extra else numery.sample(len(numery))
    indep_set = MolData(cmps.loc[test.index], test.values)
    indep_loader = DataLoader(indep_set, batch_size=BATCH_SIZE)
    folds = KFold(5).split(data)
    cvs = np.zeros(data.shape)
    inds = np.zeros(test.shape)
    out = 'output/gcn%s' % ('_' + subset if subset else '')  # `subset` is a module-level global
    for i, (train_idx, valid_idx) in enumerate(folds):
        trained, valided = data.iloc[train_idx], data.iloc[valid_idx]
        train_set = MolData(cmps.loc[trained.index], trained.values)
        valid_set = MolData(cmps.loc[valided.index], valided.values)
        train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
        valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE)
        net = QSAR(hid_dim=128, n_class=data.shape[1]).to(util.dev)
        net = net.fit(train_loader, valid_loader, epochs=N_EPOCH, path='%s_%d' % (out, i))
        print('Loss on validation set: %f' % net.evaluate(valid_loader))
        print('Loss on independent set: %f' % net.evaluate(indep_loader))
        cvs[valid_idx] = net.predict(valid_loader)  # index with fold positions, not the DataFrame
        inds += net.predict(indep_loader)
    inds /= 5  # average the independent-set predictions over the five folds
    data_score, test_score = pd.DataFrame(), pd.DataFrame()
    data_score['LABEL'] = data.stack()
    test_score['LABEL'] = test.stack()
    data_score['SCORE'] = pd.DataFrame(cvs, index=data.index, columns=data.columns).stack()
    test_score['SCORE'] = pd.DataFrame(inds, index=test.index, columns=test.columns).stack()
    data_score.to_csv(out + '.cv.txt')
    test_score.to_csv(out + '.ind.txt')
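# The stacked LABEL/SCORE files written above can be read back for evaluation.
# A sketch, assuming scikit-learn is available; ROC AUC only makes sense for
# the binarized labels produced by the reg=False branch:
def score_file_auc(path):
    from sklearn.metrics import roc_auc_score
    scores = pd.read_csv(path, index_col=[0, 1])  # (compound, target) MultiIndex
    scores = scores.dropna()
    return roc_auc_score(scores['LABEL'], scores['SCORE'])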
def construct_dataset(data_file, target_name, sample=None, use_tqdm=None):
    x, y = load_csv(data_file, target_name, sample)
    if len(target_name) == 1:
        # PyTorch MSELoss treats a single target differently from multi-target,
        # so drop the trailing singleton dimension
        y = np.squeeze(y)
    norm_fn, rstr_fn = normalize_array(y)
    target = norm_fn(y)
    data = MolData(x, target, use_tqdm=use_tqdm)
    return data, norm_fn, rstr_fn
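# `normalize_array` is assumed (from its use here and in the evaluation script
# below) to return a pair of closures: one that standardizes an array with the
# statistics of the fitted data, and one that undoes it. A minimal sketch:
def normalize_array(y):
    mean, std = np.mean(y), np.std(y)
    norm_fn = lambda a: (a - mean) / std   # standardize
    rstr_fn = lambda a: a * std + mean     # restore original scale
    return norm_fn, rstr_fn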
def build_data_net(args, target):
    if args.fp_method == FP_METHODS[0]:
        # CFP: fixed circular (Morgan) fingerprints fed to an MLP
        data = SmileData(SMILES, target, fp_len=FP_LEN, radius=4)
        net = lambda: MLP(hid_dim=FP_LEN, n_class=1)
        return data, net
    elif args.fp_method == FP_METHODS[1]:
        # NFP: learned neural fingerprints with the graph-convolutional QSAR net
        net = lambda: QSAR(hid_dim=128, n_class=1)
        data = MolData(SMILES, target)
        return data, net
    else:
        raise NotImplementedError(args.fp_method)
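# Illustrative only: a hypothetical helper showing how the (data, net-factory)
# pair returned by build_data_net is meant to be consumed. Returning a lambda
# lets each run start from freshly initialized weights. The fit() call mirrors
# the training loops above; reusing one loader for both train and validation
# is just to keep the sketch short.
def demo_build_data_net(args, target):
    data, make_net = build_data_net(args, target)
    loader = DataLoader(data, batch_size=args.batch_size, shuffle=True)
    net = make_net()  # construct a fresh network from the factory
    return net.fit(loader, loader, epochs=args.epochs, path='output/demo')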
INPUT = Path(args.input_file)
if not INPUT.exists():
    raise FileNotFoundError(INPUT)
SMILES, TARGET, KEYS = load_multiclass_csv(INPUT, target_name=args.target_name)
print(f"column names of {INPUT.stem} with {len(KEYS)} columns: {KEYS.tolist()}")
NCLASS = len(KEYS)
print(f"NCLASS: {NCLASS}")
net = try_load_net(args.model)
train_idx, valid_idx, test_idx = \
    split_train_valid_test(len(TARGET), seed=args.split_seed)
print(f"split_sig: {SHUFFLE_SIG}")  # SHUFFLE_SIG is a module-level global
# normalize with statistics from train+valid only, so the test set stays unseen
norm_func, restore_func = normalize_array(
    np.concatenate([TARGET[train_idx], TARGET[valid_idx]], axis=0))
target = norm_func(TARGET)
test_data = MolData(SMILES[test_idx], target[test_idx], use_tqdm=args.tqdm)
test_loader = DataLoader(test_data, batch_size=BSZ, shuffle=False)
score = net.predict(test_loader)
gt = TARGET[test_idx]
prd = restore_func(score)
res_r2, res_cor, res_mae, res_mse = [], [], [], []
if len(prd.shape) == 1:
    # single-target predictions come back 1-D; add a class axis
    prd = np.expand_dims(prd, 1)
for idx, k in enumerate(KEYS):
    print(f"idx, k, {idx}, {k}, {prd.shape}, {gt.shape}")
    gt_i, prd_i = gt[:, idx], prd[:, idx]
    res_r2.append(r2_score(gt_i, prd_i))
                alpha=0.3)
    plt.xlabel("circular fingerprint distance")
    plt.ylabel("neural fingerprint distance")
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.title("Correlation = {:.4f}".format(np.corrcoef(res[0], res[1])[0, 1]))
    plt.savefig(filename, dpi=300, bbox_inches='tight')


if __name__ == '__main__':
    # Load data
    DATAFILE = Path('./dataset/solubility/delaney-processed.csv')
    df = pd.read_csv(DATAFILE)
    target = df['measured log solubility in mols per litre'].values
    # normalize_array returns (norm_fn, restore_fn); apply it to get the normalized target
    norm_fn, restore = normalize_array(target)
    target = norm_fn(target)
    data = MolData(df['smiles'], target)
    print(type(df['smiles'][0]), df['smiles'][0])
    tmp = df['smiles'][0]
    print(get_circular_fp(tmp))
    exit()  # debugging early exit; remove to run the sweep below

    # Plot with random weights and a 2048-bit fingerprint, as in Figure 3 (left)
    gcn_act = ['sigmoid', 'relu', 'tanh']
    gop_act = ['sigmoid', 'tanh', 'softmax']
    large_weights = [(-1e7, 1e7), (0, 1e7), (-1e3, 1e3), (-10, 10)]
    max_degs = [1, 6]
    res = {}
    for a1, a2, bnds, rd in its.product(gcn_act, gop_act, large_weights, max_degs):
        SEED, FP_LEN = 7, 1 << 11  # FP_LEN = 2048
        net = QSAR(hid_dim=FP_LEN,