def get_score(src, trg, task='logp04'):
    """Score a (source, target) molecule pair for one optimization task.

    Args:
        src: source SMILES string.
        trg: target (generated) SMILES string.
        task: one of 'logp04' (penalized-logP improvement), 'qed', 'drd2'.

    Returns:
        float score for the requested task; 0 for 'drd2' when scoring fails.

    Raises:
        ValueError: if `task` is not one of the supported names.

    NOTE(review): a later definition of `get_score` in this file shadows
    this one and additionally applies a similarity gate.
    """
    if task == 'logp04':
        # Improvement of penalized logP of the target over the source.
        logp_improvement = penalized_logp(trg) - penalized_logp(src)
        # return logp_improvement * (0.6+min(sim, 0.4))
        return logp_improvement
    elif task == 'qed':
        return qed(trg)
    elif task == 'drd2':
        # drd2() can fail on unparsable SMILES; treat that as score 0 instead
        # of aborting the whole evaluation run.
        try:
            val = drd2(trg)
        except Exception as e:  # was bare `except:`, which also caught KeyboardInterrupt
            print("drd2 scoring failed for %r: %s" % (trg, e))
            val = 0
        return val
    else:
        # Was `assert 'wrong task: %s' % task` -- a non-empty string is always
        # truthy, so the assert could never fire and invalid tasks silently
        # returned None. Raise explicitly instead.
        raise ValueError('wrong task: %s' % task)
def get_scores(src, trg):
    """Return (similarity, logp improvement, QED, DRD2) for a molecule pair.

    A similarity of 1.0 or above (i.e. the output merely reproduces the
    input) is reported as 0 so that copying is not rewarded.
    """
    tanimoto = similarity(src, trg)
    # if sim < 0.4 or sim >= 1.0:
    sim = 0 if tanimoto >= 1.0 else tanimoto
    delta_logp = penalized_logp(trg) - penalized_logp(src)
    return sim, delta_logp, qed(trg), drd2(trg)
def get_score(src, trg, task='logp04'):
    """Similarity-gated score for a (source, target) molecule pair.

    Pairs that are too similar (>= 1.0, i.e. the input was copied) or not
    similar enough (< 0.4) score 0; otherwise the task-specific property of
    the target is returned.

    Args:
        src: source SMILES string.
        trg: target (generated) SMILES string.
        task: one of 'logp04', 'qed', 'drd2'.

    Returns:
        float score for the requested task (0 when the similarity gate fails
        or 'drd2' scoring raises).

    Raises:
        ValueError: if `task` is not one of the supported names.
    """
    sim = similarity(src, trg)
    if sim >= 1.0:
        # No credit for reproducing the source molecule verbatim.
        return 0
    if task == 'logp04':
        logp_improvement = penalized_logp(trg) - penalized_logp(src)
        # return logp_improvement * (0.6+min(sim, 0.4))
        # Preserves the original `sim < 0.4 and sim >= 0` gate exactly
        # (a negative similarity, if similarity() can produce one, passes).
        if 0 <= sim < 0.4:
            return 0
        return logp_improvement
    elif task == 'qed':
        return 0 if sim < 0.4 else qed(trg)
    elif task == 'drd2':
        if sim < 0.4:
            return 0
        # drd2() can fail on unparsable SMILES; score 0 instead of crashing.
        try:
            return drd2(trg)
        except Exception as e:  # was bare `except:`
            print("drd2 scoring failed for %r: %s" % (trg, e))
            return 0
    else:
        # Was an always-true `assert 'wrong task: %s' % task`; raise instead.
        raise ValueError('wrong task: %s' % task)
def evaluate(test_smiles_list, model, moltokenizer, log_diff_list, qed_diff_list, drd2_diff_list):
    """Generate property-conditioned variants for each input SMILES and score them.

    For every source molecule, builds the full grid of desired property
    targets (logP x QED x DRD2 shifts), runs the model once over the batch,
    then scores each decoded output against its source.

    Args:
        test_smiles_list: list of source SMILES strings.
        model: trained model with a `predict([x, px, py], ...)` interface.
        moltokenizer: tokenizer with `encode`/`decode` for SMILES.
        log_diff_list: desired penalized-logP shifts.
        qed_diff_list: desired QED shifts.
        drd2_diff_list: desired DRD2 shifts.

    Returns:
        (score_plogp_all, score_qed_all, score_drd2_all, valid_smiles_list),
        one entry per (molecule, property-target) combination.

    NOTE(review): reads the module-level `args` (args.eb, args.p) and, in the
    multiprocess branch, `worker_wrapper` and `Timer` -- defined elsewhere in
    this file; confirm before reuse.
    """
    smiles_list = []
    ids_list = []
    property_x_list = []
    property_desired_list = []
    # NOTE(review): n_grid is never used below; presumably documents the
    # 3x3x1 default grid size. Candidate for removal.
    n_grid = 3 * 3 * 1
    for idx, smiles in enumerate(test_smiles_list):
        ids = moltokenizer.encode(smiles)
        # Current property values of the source molecule.
        logp_val = penalized_logp(smiles)
        qed_val = qed(smiles)
        drd2_val = drd2(smiles)
        property_x = np.expand_dims(np.array([logp_val, qed_val, drd2_val]), axis=0)
        # Cartesian grid of desired property targets; each combination becomes
        # one conditioned generation request (same source repeated).
        for log_diff in log_diff_list:
            logp_improvement = logp_val + log_diff
            for qed_diff in qed_diff_list:
                qed_desired = qed_val + qed_diff
                for drd2_diff in drd2_diff_list:
                    drd2_desired = drd2_val + drd2_diff
                    property_desired = np.expand_dims(np.array(
                        [logp_improvement, qed_desired, drd2_desired]), axis=0)
                    smiles_list.append(smiles)
                    ids_list.append(ids)
                    property_x_list.append(property_x)
                    property_desired_list.append(property_desired)
    # Pad token-id sequences to a rectangular int64 batch.
    x = np.array(
        tf.keras.preprocessing.sequence.pad_sequences(ids_list,
                                                      dtype="int64",
                                                      padding="post"))
    px = np.concatenate(property_x_list, axis=0)
    py = np.concatenate(property_desired_list, axis=0)
    # Debug peeks at two requests; assumes at least 11 grid entries exist.
    print(px[2, :])
    print(py[2, :])
    print(px[10, :])
    print(py[10, :])
    outputs, _ = model.predict([x, px, py], batch_size=args.eb, verbose=1)
    if args.p == 1:
        # Single-process scoring path.
        valid_smiles_list = []
        score_plogp_all = []
        score_qed_all = []
        score_drd2_all = []
        for idx, output in tqdm.tqdm(enumerate(outputs)):
            smiles_x = smiles_list[idx]
            smiles_y = moltokenizer.decode(output)
            sim = similarity(smiles_x, smiles_y)
            if sim < 0.4:
                # Below the similarity gate: all scores zero, no valid SMILES.
                score_plogp_all.append(0)
                score_qed_all.append(0)
                score_drd2_all.append(0)
                valid_smiles_list.append('')
            else:
                score_plogp = get_score(smiles_x, smiles_y, task='logp04')
                score_qed = get_score(smiles_x, smiles_y, task='qed')
                score_drd2 = get_score(smiles_x, smiles_y, task='drd2')
                score_plogp_all.append(score_plogp)
                score_qed_all.append(score_qed)
                score_drd2_all.append(score_drd2)
                valid_smiles_list.append(smiles_y)
    else:
        # Multi-process scoring path: split outputs into args.p contiguous
        # batches and score them in a process pool.
        with Timer("score multi calculation..."):
            n = len(outputs)
            n_proc = args.p
            batch = math.ceil(n / (n_proc))
            with Pool(processes=n_proc) as pool:
                # Each worker gets (outputs, smiles_list, n, proc_id,
                # start, end); the three ranges are zipped in lockstep to
                # produce per-worker slice bounds.
                r = pool.map_async(
                    worker_wrapper,
                    zip(repeat(outputs), repeat(smiles_list), repeat(n),
                        range(1, n_proc + 1), range(0, n, batch),
                        range(batch, batch * n_proc + 1, batch)))
                r.wait()
            # Reassemble per-worker partial results in submission order.
            score_all = []
            valid_smiles_list = []
            for partial_score, valid_smiles_p_list in r.get():
                score_all += partial_score
                valid_smiles_list += valid_smiles_p_list
            score_all = np.array(score_all)
            # Columns: [plogp, qed, drd2] per scored pair.
            score_plogp_all = score_all[:, 0]
            score_qed_all = score_all[:, 1]
            score_drd2_all = score_all[:, 2]
    return score_plogp_all, score_qed_all, score_drd2_all, valid_smiles_list
) moltokenizer = Moltokenizer(params["vocab_file"]) epoch_num = 270 # drd2 test 0.61, V0011 # epoch_num = 210 tst_model = get_model(params, epoch_num) selected_smiles = 'COC1=CC=C(C=C1)C(=O)N1CCCC1=O' log_diff_list = [0.0] qed_diff_list = [-0.1, -0.05, 0.0, 0.05, 0.1] drd2_diff_list = np.array(range(6, 10)) / 10.0 org_plp = penalized_logp(selected_smiles) org_qed = qed(selected_smiles) org_drd2 = drd2(selected_smiles) score_plogp_all, score_qed_all, score_drd2_all, valid_smiles_list = \ evaluate([selected_smiles], tst_model, moltokenizer, log_diff_list, qed_diff_list, drd2_diff_list) new_plp = score_plogp_all[np.argmax(score_drd2_all)] new_qed = score_qed_all[np.argmax(score_drd2_all)] new_drd2 = score_drd2_all[np.argmax(score_drd2_all)] new_smiles = valid_smiles_list[np.argmax(score_drd2_all)] print(org_plp, org_qed, org_drd2) print(new_plp, new_qed, new_drd2) print("improvemend:", new_drd2 - org_drd2) print("diff:", (new_plp - org_plp)**2 + (new_qed - org_qed)**2) print(new_smiles)