def plot_tsne(func):
    """Save a t-SNE scatter of ChEMBL molecules coloured by an objective.

    Pairwise OT distances between the molecules' SMILES strings (molecular-mass
    weighting, total-mass normalisation, ``bond_frac`` structural penalty) are
    embedded with t-SNE on a precomputed metric. Each point is coloured by the
    value of the objective named ``func``. The figure is written to ``VIS_DIR``
    as an EPS file named after the plot title with spaces replaced by
    underscores.

    Args:
        func: objective name, resolved through ``get_objective_by_name``.
    """
    n_mols = 250

    # SMILES for the distance matrix come from the ``as_mols=True`` load.
    mols = get_chembl(max_size=n_mols, as_mols=True)
    smile_strings = [m.smiles for m in mols]

    title = f"{func} ot-dist"
    distance_computer = OTChemDistanceComputer(
        mass_assignment_method='molecular_mass',
        normalisation_method='total_mass',
        struct_pen_method='bond_frac')
    # The computer returns a sequence of matrices; only the first is embedded.
    distances_mat = distance_computer(smile_strings, smile_strings)[0]

    # Alternative distance sources that were tried here before: the
    # element-wise inverse of the 'similarity_kernel' matrix, and the squared
    # difference of numeric fingerprints.

    # init='random' is spelled out because PCA initialisation cannot be used
    # with a precomputed distance matrix.
    tsne = TSNE(metric='precomputed', init='random')
    points_to_plot = tsne.fit_transform(distances_mat)

    # NOTE(review): the colours are computed from a second load (without
    # as_mols); this assumes both loads return the same molecules in the same
    # order -- confirm against get_chembl.
    mols = get_chembl(max_size=n_mols)
    func_ = get_objective_by_name(func)
    prop_list = [func_(mol) for mol in mols]

    plt.title(title, fontsize=22)
    plt.scatter(points_to_plot[:, 0], points_to_plot[:, 1],
                c=prop_list, cmap=plt.cm.Spectral, s=15, alpha=0.8)
    plt.xticks([])
    plt.yticks([])
    out_path = os.path.join(VIS_DIR, title.replace(" ", "_") + '.eps')
    plt.savefig(out_path, format='eps', dpi=1000)
def compute_sa_score_datasets():
    """Print mean and standard deviation of the "sascore" objective.

    The objective is evaluated on 50 entries loaded from ChEMBL and then on 50
    entries loaded from ZINC250; one summary line is printed per data set.
    """
    sas = get_objective_by_name("sascore")
    reports = (
        (get_chembl, "ChEMBL: {:.3f} +- std {:.3f}"),
        (get_zinc250, "ZINC: {:.3f} +- std {:.3f}"),
    )
    for load_dataset, line_format in reports:
        scores = [sas(entry) for entry in load_dataset(max_size=50)]
        print(line_format.format(np.mean(scores), np.std(scores)))
def compute_novel_percentage(mol_list):
    """Return the fraction of ``mol_list`` entries found in neither data set.

    Each entry is looked up among the ``.smiles`` values of the ChEMBL and
    ZINC250 loads (both requested with ``max_size=-1``).

    Args:
        mol_list: sized iterable of hashable entries to look up. Presumably
            SMILES strings, since they are compared with ``.smiles`` values --
            confirm with callers.

    Returns:
        float: ``1 - n_found / len(mol_list)``, where ``n_found`` counts the
        entries present in either data set.

    Raises:
        ZeroDivisionError: if ``mol_list`` is empty.
    """
    # One set gives constant-time membership tests instead of scanning two
    # lists for every query.
    known_smiles = {m.smiles for m in get_chembl(max_size=-1)}
    known_smiles.update(m.smiles for m in get_zinc250(max_size=-1))

    n_mols = len(mol_list)
    n_in_data = sum(1 for mol in tqdm(mol_list) if mol in known_smiles)
    return 1 - n_in_data / n_mols
def make_pairwise(func, n_mols, to_randomize=True):
    """Plot OT distance against property difference for molecule pairs.

    One subplot is drawn per distance matrix returned by a default
    ``OTChemDistanceComputer``. Every unordered pair ``(i, j)`` with ``i <= j``
    contributes one point: x is the distance, y the absolute difference of the
    two property values. For more than 12 molecules the points are scattered;
    otherwise each pair is printed and drawn as a text label (``*`` for the
    diagonal). The figure is saved into ``VIS_DIR`` with a timestamped name.

    NOTE(review): another function named ``make_pairwise`` is defined later in
    this file and rebinds the name -- confirm which one callers expect.
    NOTE(review): statement nesting was reconstructed from a flattened source;
    in particular ``ax.set_xlim`` is placed in the text-label branch -- confirm.

    Args:
        func: ``'prop'`` to use the values from ``get_chembl_prop``, otherwise
            an objective name for ``get_objective_by_name``.
        n_mols: number of molecules to compare.
        to_randomize: when true, five times as many molecules are requested
            before shuffling and truncating to ``n_mols``.
    """
    if func == 'prop':
        smile_strings, smiles_to_prop = get_chembl_prop(n_mols=n_mols)
        prop_list = [smiles_to_prop[smiles] for smiles in smile_strings]
    else:
        if to_randomize:
            n_mols_to_get = 5 * n_mols
        else:
            n_mols_to_get = n_mols
        mols = get_chembl(n_mols=n_mols_to_get)
        np.random.shuffle(mols)
        mols = mols[:n_mols]
        smile_strings = [mol.to_smiles() for mol in mols]
        objective = get_objective_by_name(func)
        prop_list = [objective(mol) for mol in mols]

    dist_computer = OTChemDistanceComputer()  # default computer
    dists = dist_computer(smile_strings, smile_strings)

    # Four subplots per row, never fewer than two rows.
    num_rows = max(2, int(np.ceil(dist_computer.get_num_distances() / 4.0)))
    print(num_rows)
    _, ax_grid = plt.subplots(num_rows, 4, figsize=(15, 15))
    flat_axes = itertools.chain.from_iterable(ax_grid)

    for ind, (ax, distmat) in enumerate(zip(flat_axes, dists)):
        # Same (i, j) order as a nested loop with j starting at i.
        pairs = list(
            itertools.combinations_with_replacement(range(n_mols), 2))
        xs = [distmat[i, j] for i, j in pairs]
        ys = [np.abs(prop_list[i] - prop_list[j]) for i, j in pairs]

        ax.set_title(f'Distance {ind}')  # TODO: parameters of distance
        if n_mols > 12:
            ax.scatter(xs, ys, s=1, alpha=0.6)
            continue

        for xval, yval, pval in zip(xs, ys, pairs):
            print(xval, yval, pval)
            first, second = pval
            if first == second:
                label = '*'
                ax.text(xval, yval, label, fontsize=14)
            else:
                label = '(%d, %d)' % (first, second)
                ax.text(xval, yval, label)
        ax.set_xlim((0.0, max(xs) * 1.25))

    stamp = datetime.now().strftime('%m%d%H%M%S')
    file_name = "dist_vs_value_%d_%s_%s" % (n_mols, func, stamp)
    plt.savefig(os.path.join(VIS_DIR, file_name))
    print(smile_strings, len(smile_strings))
def test(N=100):
    """Time single-pair OT distance computations on ChEMBL molecules.

    Every pair ``(i, j)`` with ``j < i`` among the loaded molecules is passed
    to a default ``OTChemDistanceComputer`` as two one-element SMILES lists.
    The elapsed time, as measured by ``time()``, is grouped by the summed atom
    count of the pair.

    Args:
        N: maximum number of molecules to load and compare.

    Returns:
        defaultdict: maps ``natoms[i] + natoms[j]`` to the mean elapsed time of
        all pairs with that combined atom count.
    """
    dist_computer = OTChemDistanceComputer()
    mols = get_chembl(max_size=N, as_mols=True)
    # The loader may return fewer than N molecules; never index past them.
    n_used = min(N, len(mols))

    natoms = [mol.to_rdkit().GetNumAtoms() for mol in mols]
    # Convert once up front so the timed region covers only the distance call.
    smiles = [mol.to_smiles() for mol in mols]

    times = defaultdict(list)
    for i in range(n_used):
        for j in range(i):
            t0 = time()
            dist_computer([smiles[i]], [smiles[j]])
            time_elapsed = time() - t0
            times[natoms[i] + natoms[j]].append(time_elapsed)

    # Replace each list of samples by its mean (the key set is unchanged).
    for k, res_lst in times.items():
        times[k] = np.mean(res_lst)
    return times
def make_pairwise_kernel(kernel_name, func, **kwargs):
    """Scatter a kernel-derived distance against objective differences.

    Up to 100 ChEMBL molecules are loaded, the kernel built by
    ``mol_kern_factory(kernel_name, **kwargs)`` is evaluated on them, and for
    every ordered pair ``(i, j)`` one point is plotted: x is the pair distance,
    y is the absolute difference of the objective values. The plot uses a log
    x-axis limited to ``[11, 80]`` and is saved as
    ``<kernel_name>_<func>.eps`` in ``VIS_DIR``.

    The pair distance depends on the name ``mode``:
      * ``"inverse_sim"``   -- ``1 / K[i, j]``
      * ``"scaled_kernel"`` -- ``1 / K[i, j] / sqrt(K[i, i] * K[j, j])``
      * ``"fps_distance"``  -- squared difference of numeric fingerprints

    NOTE(review): ``mode`` is neither a parameter nor a local of this function;
    it is looked up in the enclosing module scope. Confirm it is defined there,
    otherwise the call fails with NameError.

    Args:
        kernel_name: kernel identifier passed to ``mol_kern_factory``.
        func: objective name passed to ``get_objective_by_name``.
        **kwargs: forwarded unchanged to ``mol_kern_factory``.

    Raises:
        ValueError: if ``mode`` is not one of the three names listed above.
    """
    # Fail before any expensive work instead of inside the pair loop.
    if mode not in ("inverse_sim", "scaled_kernel", "fps_distance"):
        raise ValueError(f"unknown mode: {mode!r}")

    n_mols = 100
    mols = get_chembl(max_size=n_mols)
    # The loader may return fewer molecules than requested.
    n_used = min(n_mols, len(mols))

    func_ = get_objective_by_name(func)
    kernel = mol_kern_factory(kernel_name, **kwargs)
    kern_mat = kernel(mols, mols)
    prop_list = [func_(mol) for mol in mols]

    if mode == "inverse_sim":
        def pair_distance(i, j):
            return 1 / kern_mat[i, j]
    elif mode == "scaled_kernel":
        def pair_distance(i, j):
            dist = 1 / kern_mat[i, j]
            dist /= np.sqrt(kern_mat[i, i] * kern_mat[j, j])
            return dist
    else:  # "fps_distance"
        # One fingerprint per molecule, not two per pair.
        fingerprints = [mols[i].to_fingerprint(ftype='numeric')
                        for i in range(n_used)]

        def pair_distance(i, j):
            return np.sum((fingerprints[i] - fingerprints[j]) ** 2)

    xs, ys = [], []
    for i in range(n_used):
        for j in range(n_used):
            xs.append(pair_distance(i, j))
            ys.append(np.abs(prop_list[i] - prop_list[j]))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(xs, ys, s=2, alpha=0.6)
    ax.set_xscale('log')
    ax.set_xlim([11, 80])
    ax.set_xticks([])
    ax.set_yticks([])
    fig.savefig(os.path.join(VIS_DIR, f"{kernel_name}_{func}.eps"),
                format='eps', dpi=1000)
    # Close rather than clear so repeated calls do not accumulate figures.
    plt.close(fig)
def gen_gp_test_data():
    """Build three train/test splits of ChEMBL molecules labelled by SAScore.

    ``3 * 100`` molecules are requested from ``get_chembl`` and each one is
    scored with ``SAScore``. The data is cut into three consecutive groups of
    100; within every group the first 80 entries form the training part and
    the remainder the test part.

    Returns:
        list: three tuples ``(X_train, Y_train, X_test, Y_test)``. The ``X``
        parts are slices of the loaded molecule sequence and the ``Y`` parts
        are slices of a numpy array of scores.
    """
    n_all = 100
    n_groups = 3
    n_train = int(n_all * 0.8)

    mols = get_chembl(n_all * n_groups)
    # Labels must exist before slicing so each group gets its own scores.
    ys = np.array([SAScore(m) for m in mols])

    datasets = []
    for group in range(n_groups):
        start, stop = group * n_all, (group + 1) * n_all
        group_mols, group_ys = mols[start:stop], ys[start:stop]
        datasets.append((
            group_mols[:n_train], group_ys[:n_train],
            group_mols[n_train:], group_ys[n_train:],
        ))
    return datasets
def make_tsne(func, as_subplots=False):
    """Plot t-SNE embeddings coloured by a property for several OT distances.

    Four ``OTChemDistanceComputer`` configurations are compared: equal or
    molecular-mass assignment, each with no or total-mass normalisation, all
    with the ``bond_frac`` structural penalty. For every configuration the
    first returned distance matrix is embedded with t-SNE and the points are
    coloured by the property values.

    Args:
        func: ``'prop'`` to colour by the values from ``get_chembl_prop``,
            otherwise an objective name for ``get_objective_by_name``.
        as_subplots: when true, draw all four embeddings in one 2x2 figure
            saved as ``tsne_vis_<func>.eps``; otherwise save one EPS file per
            configuration. Files are written to ``VIS_DIR``.
    """
    n_mols = 200
    configurations = [
        ('equal', 'none', 'Equal mass assign, no norm'),
        ('equal', 'total_mass', 'Equal mass assign, total mass norm'),
        ('molecular_mass', 'none', 'Mol mass assign, no norm'),
        ('molecular_mass', 'total_mass', 'Mol mass assign, total mass norm'),
    ]
    dist_computers = [
        OTChemDistanceComputer(mass_assignment_method=mass,
                               normalisation_method=norm,
                               struct_pen_method='bond_frac')
        for mass, norm, _ in configurations
    ]
    titles = [title for _, _, title in configurations]

    # Load the data exactly once, in the branch that actually uses it.
    if func == 'prop':
        smile_strings, smiles_to_prop = get_chembl_prop(n_mols=n_mols)
        prop_list = [smiles_to_prop[sm] for sm in smile_strings]
    else:
        mols = get_chembl(max_size=n_mols)
        smile_strings = [mol.to_smiles() for mol in mols]
        func_ = get_objective_by_name(func)
        prop_list = [func_(mol) for mol in mols]

    # The 2x2 grid is only needed (and only created) in subplot mode.
    grid_fig = None
    if as_subplots:
        grid_fig, ll_ax = plt.subplots(2, 2, figsize=(15, 15))
        axes = list(itertools.chain.from_iterable(ll_ax))
    else:
        axes = [None] * len(dist_computers)

    for ax, dist_computer, title in zip(axes, dist_computers, titles):
        distances_mat = dist_computer(smile_strings, smile_strings)[0]

        # init='random' is spelled out because PCA initialisation cannot be
        # used with a precomputed distance matrix.
        tsne = TSNE(metric='precomputed', init='random')
        points_to_plot = tsne.fit_transform(distances_mat)

        if not as_subplots:
            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)

        ax.set_title(title)
        ax.scatter(points_to_plot[:, 0], points_to_plot[:, 1],
                   c=prop_list, cmap=plt.cm.Spectral, s=9, alpha=0.8)
        ax.set_xticks([])
        ax.set_yticks([])

        if not as_subplots:
            # NOTE(review): the file name embeds str(dist_computer); unless
            # the class defines a stable __str__/__repr__ the name will vary
            # between runs -- confirm.
            fig.savefig(
                os.path.join(VIS_DIR, f'tsne_vis_{func}_{dist_computer}.eps'),
                format='eps', dpi=1000)
            # Close so repeated calls do not accumulate open figures.
            plt.close(fig)

    if as_subplots:
        grid_fig.savefig(os.path.join(VIS_DIR, f'tsne_vis_{func}.eps'),
                         format='eps', dpi=1000)
        plt.close(grid_fig)
def make_pairwise(func, as_subplots=False):
    """Plot OT distance against property difference for four OT variants.

    Four ``OTChemDistanceComputer`` configurations are compared: equal or
    molecular-mass assignment, each with no or total-mass normalisation, all
    with the ``bond_frac`` structural penalty. For every configuration and
    every ordered molecule pair ``(i, j)`` one point is drawn: x is the entry
    of the first returned distance matrix, y the absolute difference of the
    two property values.

    NOTE(review): an earlier function with the same name exists in this file;
    this later definition is the one bound to ``make_pairwise`` after import
    -- confirm that is intended.

    Args:
        func: ``'prop'`` to use the values from ``get_chembl_prop``, otherwise
            an objective name for ``get_objective_by_name``.
        as_subplots: when true, draw all four scatters in one 2x2 figure saved
            as ``dist_vs_value_<func>.eps``; otherwise save one cropped PDF
            per configuration, ``dist_vs_value_<func>_<k>.pdf`` with ``k``
            counted from 1. Files are written to ``VIS_DIR``.
    """
    n_mols = 100
    if func == 'prop':
        smile_strings, smiles_to_prop = get_chembl_prop(n_mols=n_mols)
        prop_list = [smiles_to_prop[sm] for sm in smile_strings]
    else:
        mols = get_chembl(max_size=n_mols)
        smile_strings = [mol.to_smiles() for mol in mols]
        func_ = get_objective_by_name(func)
        prop_list = [func_(mol) for mol in mols]

    configurations = [
        ('equal', 'none', 'Unit weight, Unnormalized'),
        ('equal', 'total_mass', 'Unit weight, Normalized'),
        ('molecular_mass', 'none', 'Molecular mass weight, Unnormalized'),
        ('molecular_mass', 'total_mass', 'Molecular mass weight, Normalized'),
    ]
    dist_computers = [
        OTChemDistanceComputer(mass_assignment_method=mass,
                               normalisation_method=norm,
                               struct_pen_method='bond_frac')
        for mass, norm, _ in configurations
    ]
    titles = [title for _, _, title in configurations]

    # The loader may return fewer molecules than requested.
    n_used = min(n_mols, len(smile_strings))
    index_pairs = list(itertools.product(range(n_used), repeat=2))
    # Property differences do not depend on the distance computer.
    ys = [np.abs(prop_list[i] - prop_list[j]) for i, j in index_pairs]

    # The 2x2 grid is only needed (and only created) in subplot mode.
    grid_fig = None
    if as_subplots:
        grid_fig, ll_ax = plt.subplots(2, 2, figsize=(15, 15))
        axes = list(itertools.chain.from_iterable(ll_ax))
    else:
        axes = [None] * len(dist_computers)

    for ind, (ax, dist_computer, title) in enumerate(
            zip(axes, dist_computers, titles)):
        distmat = dist_computer(smile_strings, smile_strings)[0]
        xs = [distmat[i, j] for i, j in index_pairs]

        if as_subplots:
            ax.set_title(title)
            ax.scatter(xs, ys, s=2, alpha=0.6)
            ax.set_xticks([])
            ax.set_yticks([])
            continue

        # Save each configuration as its own figure.
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        ax.set_title(title, fontsize=22)
        ax.scatter(xs, ys, s=2, alpha=0.6)
        ax.set_xscale('log')
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xlim([None, 1.03 * max(xs)])
        ax.set_xlabel("OT-distance, log scale", fontsize=20)
        if ind == 0:
            # Only the first panel carries the y-axis label.
            ax.set_ylabel("Difference in SA score", fontsize=20)

        # Crop to the axes box (in inches) plus fixed margins for the labels
        # and the title.
        extent = ax.get_window_extent().transformed(
            fig.dpi_scale_trans.inverted())
        extent.x0 -= 0.5
        extent.x1 += 0.1
        extent.y0 -= 0.6
        extent.y1 += 0.7
        fig.savefig(
            os.path.join(VIS_DIR, f"dist_vs_value_{func}_{ind+1}.pdf"),
            bbox_inches=extent, pad_inches=0)
        # Close so repeated calls do not accumulate open figures.
        plt.close(fig)

    if as_subplots:
        grid_fig.savefig(os.path.join(VIS_DIR, f"dist_vs_value_{func}.eps"),
                         format='eps', dpi=1000)
        plt.close(grid_fig)