def training_all():
    """Stub: load the training inputs (properties and the fchl18 kernel).

    NOTE(review): depends on a module-level ``args`` object; both loaded
    arrays are unused before ``return`` — this looks unfinished. TODO confirm.
    """
    # properties
    properties = misc.load_npy(args.scratch + "properties")

    # fchls
    kernel = misc.load_npy(args.scratch + "kernel." + "fchl18")

    return
def get_avg_repr(idx, scr="_tmp_ensemble_/", **kwargs):
    """Compute the Boltzmann-weighted average representation of ensemble `idx`.

    Reads `<scr><idx>.energies` and `<scr><idx>.sdf`, generates one
    representation per conformer and averages them with Boltzmann weights
    exp(-E) / sum(exp(-E)).

    :param idx: ensemble index; also the row written in kwargs["array"].
    :param scr: scratch directory holding per-ensemble files.
    :param kwargs: forwarded to `xyzs_to_representations`; may additionally
        hold "array", a shared (n_ensembles, rep_size) output buffer.
    :return: (idx, avgrep) when no output array is given, otherwise None.
    """
    # Pull the shared result buffer out first so it is NOT forwarded to the
    # representation generator (it is an output target, not a parameter).
    results = kwargs.pop("array", None)

    energies = misc.load_npy(scr + str(idx) + ".energies")
    molobjs = cheminfo.read_sdffile(scr + str(idx) + ".sdf")
    molobjs = list(molobjs)

    xyzs = molobjs_to_xyzs(molobjs)
    reprs = xyzs_to_representations(*xyzs, **kwargs)

    # Boltzmann factors; shifting by the minimum energy avoids overflow in
    # exp() and cancels out after normalization.
    factors = np.exp(-(energies - np.min(energies)))
    factors /= np.sum(factors)

    # Weighted average over conformer representations (rows of `reprs`).
    avgrep = np.dot(factors, reprs)

    print(idx, avgrep.shape)

    if results is not None:
        results[idx, :] = avgrep
    else:
        return idx, avgrep
def main():
    """Benchmark a Joback group-contribution estimate of the melting point.

    Reads properties and structures from --scratch, estimates Tm per molecule
    with thermo's Joback model, prints the RMSE over the molecules Joback
    could parameterize, and saves a scatter plot colored by heavy-atom count.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir", default="_tmp_")
    parser.add_argument('-j', '--procs', action='store', help='pararallize', metavar="int", default=0, type=int)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Read properties
    properties = misc.load_npy(args.scratch + "properties")
    molecules = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")
    molecules = list(molecules)

    heavy_atoms = []
    predictions = []
    errors = []  # 1 = Joback failed for this molecule, 0 = estimate made

    for mol, prop in zip(molecules, properties):

        smi = cheminfo.molobj_to_smiles(mol, remove_hs=True)
        J = thermo.joback.Joback(smi)
        status = J.status

        # Heavy-atom count (drop hydrogens, atomic number 1).
        atoms, coord = cheminfo.molobj_to_xyz(mol)
        idx = np.where(atoms != 1)
        atoms = atoms[idx]
        heavy_atoms.append(len(atoms))

        # Joback reports unmatched fragments through its status string.
        if "Did not match all atoms present" in status:
            errors.append(1)
            predictions.append(float("nan"))
            continue

        try:
            estimate = J.estimate()
        except TypeError:
            errors.append(1)
            predictions.append(float("nan"))
            continue

        errors.append(0)
        # Melting point is the benchmarked property ("Tb" was read but unused).
        predictions.append(estimate["Tm"])

    errors = np.array(errors, dtype=int)
    idx_success, = np.where(errors == 0)

    heavy_atoms = np.array(heavy_atoms)
    predictions = np.array(predictions)
    properties = np.array(properties)

    # Keep only the molecules Joback could handle.
    predictions = predictions[idx_success]
    properties = properties[idx_success]
    heavy_atoms = heavy_atoms[idx_success]

    print("total", errors.shape[0], "filter", idx_success.shape[0])
    print()
    print(rmse(properties, predictions))

    plt.plot(properties, properties, "-k")
    plt.scatter(properties, predictions, s=0.95, alpha=0.8, c=heavy_atoms)
    plt.xlabel("True")
    plt.ylabel("Predicted")
    plt.savefig("_fig_joback")
    plt.clf()

    return
def dump_distances_and_kernels(scr, name, procs=0):
    """Generate and save the kernel or distance matrix for one representation.

    :param scr: scratch directory with precomputed "repr.<name>" files.
    :param name: representation name: "fchl18", "fchl19", one of the
        coordinate-based names ("cm", "slatm", "bob"), or one of the
        molecule-based fingerprints ("morgan", "rdkitfp").
    :param procs: if non-zero, sets OMP_NUM_THREADS for threaded kernels.
    """
    # TODO Properties should be read by scr!!

    representation_names_coordbased = ["cm", "slatm", "bob"]
    representation_names_molbased = ["morgan", "rdkitfp"]

    if procs != 0:
        os.environ["OMP_NUM_THREADS"] = str(procs)

    # Prepare fchl kernels
    # BUGFIX: was `name == "fclh18"` (typo), which made this branch
    # unreachable and sent "fchl18" to the unknown-representation error.
    if name == "fchl18":
        print("Generating fchl18 kernel")
        start = time.time()
        reps = misc.load_npy(scr + "repr." + "fchl18")
        print("shape:", reps.shape)
        sigmas, kernels = get_fchl18_kernels(reps, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl18." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl18", kernels)
        # Release the large arrays eagerly.
        reps = None
        del reps
        kernels = None
        del kernels

    elif name == "fchl19":
        print("Generating fchl19 kernel")
        reps = misc.load_npy(scr + "repr." + "fchl19")
        print("shape:", reps.shape)
        atoms = misc.load_obj(scr + "atoms")
        start = time.time()
        sigmas, kernels = get_fchl19_kernels(reps, atoms, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl19." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl19", kernels)

    elif name in representation_names_coordbased:
        print("Distance", name)
        representations = misc.load_npy(scr + "repr." + name)
        print(representations.shape)
        dist = generate_l2_distances(representations)
        misc.save_npy(scr + "dist." + name, dist)
        dist = None
        del dist

    elif name in representation_names_molbased:
        print("Generating fingerprint kernel", name)
        representations_fp = misc.load_npy(scr + "repr." + name)
        # np.float was removed in NumPy 1.24; builtin float is equivalent.
        representations_fp = np.asarray(representations_fp, dtype=float)

        # .get avoids a KeyError when procs == 0 and the env var is unset.
        print(os.environ.get("OMP_NUM_THREADS"))

        # DISTANCE (commented-out Jaccard-kernel experiments removed)
        print("make dist")
        dist = generate_l2_distances(representations_fp)
        print("save dist")
        misc.save_npy(scr + "dist." + name, dist)
        print("saved")
        print(dist.shape)

    else:
        print("error: unknown representation", name)
        quit()

    return
def main():
    """Baseline: fit the property against a simple geometric descriptor.

    Loads properties and 3D structures from --scratch, computes per-molecule
    descriptors (heavy-atom count, convex-hull volume, mean pairwise atomic
    distance), fits a degree-3 polynomial of property vs. mean distance,
    prints the RMSE and saves a scatter plot.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir", default="_tmp_")
    parser.add_argument('-j', '--procs', action='store', help='pararallize', metavar="int", default=0, type=int)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    properties = misc.load_npy(args.scratch + "properties")
    molecules = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    heavy_atoms = []
    distances = []
    volumes = []

    for mol in molecules:

        # atoms = cheminfo.molobj_to_atoms(mol)
        atoms, coord = cheminfo.molobj_to_xyz(mol)

        # Heavy atoms only (drop hydrogens, atomic number 1).
        # NOTE(review): `coord` is NOT filtered — the hull and distances
        # below still include hydrogen positions; confirm this is intended.
        idx = np.where(atoms != 1)
        atoms = atoms[idx]
        N = len(atoms)
        heavy_atoms.append(N)

        # "QJ" joggles the input so qhull tolerates degenerate geometries.
        hull = ConvexHull(coord, qhull_options="QJ")
        vol = hull.volume
        volumes.append(vol)

        # Mean pairwise interatomic distance.
        avgdist = distance.pdist(coord)
        avgdist = np.mean(avgdist)
        distances.append(avgdist)

    heavy_atoms = np.array(heavy_atoms)
    volumes = np.array(volumes)
    distances = np.array(distances)

    # Descriptor used for the fit below.
    representation = distances

    # Degree-3 polynomial fit (the original "linear fit" label was inaccurate).
    p = np.polyfit(representation, properties, 3)
    p = np.poly1d(p)
    results = p(representation)

    rmse_error = rmse(results, properties)
    print(rmse_error)

    plt.scatter(representation, properties, c=heavy_atoms, s=0.8)
    x_prop = np.linspace(min(representation), max(representation), 80)
    plt.plot(x_prop, p(x_prop), "k-")
    plt.savefig("i_can_member_it")
    plt.clf()

    return
def dump_distances_and_kernels(scr):
    """Precompute properties, L2 distance matrices and the fingerprint kernel.

    Parses `<scr>properties.csv` and caches it as npy, builds L2 distance
    matrices for the coordinate-based representations, and computes the
    fingerprint kernel. The fchl18/fchl19 sections are currently disabled
    with `if False:` toggles.
    """
    # TODO Properties should be read by scr!!

    # properties
    print("Saving properties")
    with open(scr + 'properties.csv', 'r') as f:
        properties = f.readlines()
    # First whitespace-separated column holds the property value.
    properties = [x.split()[0] for x in properties]
    properties = [float(x) for x in properties]
    properties = np.array(properties)
    print("properties", properties.shape)
    misc.save_npy(scr + "properties", properties)

    # Prepare distances
    representation_names = ["cm", "bob", "slatm"]  # + ["avgslatm"]

    for name in representation_names:
        print("Distance", name)
        representations = misc.load_npy(scr + "repr." + name)
        print(representations.shape)
        dist = generate_l2_distances(representations)
        misc.save_npy(scr + "dist." + name, dist)
        # Release the (large) matrix before the next representation.
        dist = None
        del dist

    # Prepare fchl kernels
    if False:  # disabled: fchl18 kernel generation
        print("Generating fchl18 kernel")
        start = time.time()
        reps = misc.load_npy(scr + "repr." + "fchl18")
        print("shape:", reps.shape)
        sigmas, kernels = get_fchl18_kernels(reps, return_sigmas=True)
        end = time.time()
        print("time:", end-start)
        misc.save_npy(scr + "fchl18." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl18", kernels)
        reps = None
        del reps
        kernels = None
        del kernels

    if False:  # disabled: fchl19 kernel generation
        print("Generating fchl19 kernel")
        reps = misc.load_npy(scr + "repr." + "fchl19")
        print("shape:", reps.shape)
        atoms = misc.load_obj(scr + "atoms")
        start = time.time()
        sigmas, kernels = get_fchl19_kernels(reps, atoms, return_sigmas=True)
        end = time.time()
        print("time:", end-start)
        misc.save_npy(scr + "fchl19." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl19", kernels)

    if True:
        print("Generating fingerprint kernel")
        representations_fp = misc.load_obj(scr + "repr.fp")
        kernel = get_fp_kernel(representations_fp)
        misc.save_npy(scr + "kernel.fp", kernel)

    return
def dump_kernel_scores(scr, names=None):
    """Cross-validate kernel/distance models and dump scores and parameters.

    For each requested model, runs learning-curve cross validation over a
    grid of regularization (and sigma) values, saving `score.<name>` (npy)
    and `parameters.<name>` (json) into `scr`.

    :param scr: scratch directory with precomputed kernels/distances.
    :param names: model names to score (default: empty list).

    NOTE(review): the two kernel-scoring loops below begin with a bare
    ``break``, so their bodies are dead code — only the distance-based
    models at the bottom are actually scored. Presumably a deliberate
    disable; confirm before re-enabling.
    """
    # BUGFIX: was a mutable default argument (names=[]).
    if names is None:
        names = []

    # Predefined reg grid: 1e-1, 1e-3, 1e-5 and no regularization.
    l2regs = [10**-x for x in range(1, 6, 2)] + [0.0]
    n_l2regs = len(l2regs)

    # Define n_training
    # n_trains=[2**x for x in range(4, 12)]
    n_trains=[2**x for x in range(4, 17)]
    n_trains = np.array(n_trains, dtype=int)
    n_items = misc.load_txt(scr + "n_items")
    # Keep only training sizes that fit into a 4/5 training split.
    n_train_idx, = np.where(n_trains < n_items*4.0/5.0)
    n_trains = n_trains[n_train_idx]
    n_trains = list(n_trains) # + [-1]
    print("Assume total items", n_items, "N train", "{:5.1f}".format(np.floor(n_items*4/5)), "N test", "{:5.1f}".format(np.ceil(n_items*1/5)))
    print("Training:", list(n_trains))
    misc.save_npy(scr + "n_train", n_trains)

    # Load properties; fall back to parsing properties.csv (median of the
    # per-row values, first column skipped) and caching it as npy.
    try:
        properties = misc.load_npy(scr + "properties")
    except Exception:  # was a bare except; kept broad to preserve fallback
        with open(scr + "properties.csv", 'r') as f:
            lines = f.readlines()
        properties = []
        for line in lines:
            values = [float(x) for x in line.split()]
            values = values[1:]
            value = np.median(values)
            properties.append(value)
        properties = np.array(properties)
        misc.save_npy(scr + "properties", properties)

    print(n_items, "==", len(properties))
    assert n_items == len(properties)

    # Load done kernel -- DISABLED: the leading break skips this section.
    this_names = ["rdkitfp", "morgan"]
    for name in names:
        break
        if name not in this_names:
            continue
        print("scoring", name)
        now = time.time()
        print("load kernel", name)
        kernel = misc.load_npy(scr + "kernel." + name)
        n_len = kernel.shape[0]
        diaidx = np.diag_indices(n_len)

        def scan_kernels(debug=True):
            # Yield the kernel once per l2reg, mutating the diagonal in
            # place and undoing the previous reg at each step.
            kernel[diaidx] += l2regs[0]
            yield kernel
            # for i in tqdm.tqdm(range(1, n_l2regs), ncols=47, ascii=True, desc=name):
            for i in range(1, n_l2regs):
                kernel[diaidx] += -l2regs[i-1] +l2regs[i]
                yield kernel

        generator = functools.partial(tqdm, scan_kernels(), ncols=75, ascii=True, desc=name+ " kernels", total=n_l2regs)
        print("scan kernels", name)
        idx_winners, scores = cross_validation(generator(), properties, training_points=n_trains)
        misc.save_npy(scr + "score."+name, scores)
        scores = np.around(np.mean(scores, axis=1), decimals=2)

        # Save parameters
        winner_parameters = {}
        for ni, index in enumerate(idx_winners):
            n = n_trains[ni]
            l2reg = l2regs[index]
            parameters = {
                "reg": l2reg,
            }
            winner_parameters[str(n)] = parameters

        nower = time.time()
        print("time: {:10.1f}s".format(nower-now))
        print(name, list(scores))
        misc.save_json(scr + "parameters."+name, winner_parameters)
        print("saved")

        kernel = None
        del kernel

    # Load multi kernels (reg search) -- DISABLED: leading break, as above.
    this_names = ["fchl19", "fchl18"]
    for name in names:
        break
        kernels = misc.load_npy(scr + "kernels." + name)
        n_l2regs = len(l2regs)
        n_kernels = kernels.shape[0]
        n_len = kernels[0].shape[0]
        diaidx = np.diag_indices(n_len)

        def scan_kernels():
            # Per sigma-kernel, scan all l2regs via in-place diagonal updates.
            for kernel in kernels:
                kernel[diaidx] += l2regs[0]
                yield kernel
                for i in range(1, n_l2regs):
                    kernel[diaidx] += -l2regs[i-1] +l2regs[i]
                    yield kernel

        idx_winners, scores = cross_validation(scan_kernels(), properties, training_points=n_trains)
        misc.save_npy(scr + "score."+name, scores)
        scores = np.around(np.mean(scores, axis=1), decimals=2)

        # Clean
        kernels = None
        del kernels

        # Save parameters
        winner_parameters = {}
        for ni, index in enumerate(idx_winners):
            # convert linear index to multi-dimensions
            idx_parameters = np.unravel_index([index], (n_kernels, n_l2regs))
            i, j = idx_parameters
            i = int(i[0])
            j = int(j[0])
            n = n_trains[ni]
            sigma = i
            l2reg = l2regs[j]
            parameters = {
                "sigma": sigma,
                "reg": l2reg,
            }
            winner_parameters[str(n)] = parameters

        misc.save_json(scr + "parameters."+name, winner_parameters)
        print(name, scores)

    # Load distance kernels -- this is the live scoring path.
    models = []

    parameters = {
        "name": "rdkitfp",
        "sigma": [2**x for x in range(1, 12, 2)],
        # "sigma": [2**x for x in np.arange(20, 40, 0.5)],
        # "lambda": l2regs,
        # "lambda": [10.0**-x for x in np.arange(1, 10, 1)]
        "lambda": [10.0**-6],
    }
    models.append(parameters)

    parameters = {
        "name": "slatm",
        "sigma": [2**x for x in range(1, 12, 2)],
        # "sigma": [2**x for x in np.arange(20, 40, 0.5)],
        # "lambda": l2regs,
        # "lambda": [10.0**-x for x in np.arange(1, 10, 1)]
        "lambda": [10.0**-6],
    }
    models.append(parameters)

    parameters = {
        "name": "cm",
        "sigma": [2**x for x in range(1, 12, 2)],
        "lambda": l2regs,
    }
    models.append(parameters)

    parameters = {
        "name": "bob",
        "sigma": [2**x for x in range(1, 12, 2)],
        "lambda": l2regs,
    }
    models.append(parameters)

    parameters = {
        "name": "avgslatm",
        "sigma": [2**x for x in range(1, 20, 2)],
        "lambda": l2regs,
    }
    # models.append(parameters)

    for model in models:

        name = model["name"]
        if name not in names:
            continue

        print("scoring", name)

        parameters = model
        n_sigma = len(parameters["sigma"])
        n_lambda = len(parameters["lambda"])
        print("parameter range")
        print("sigma", min(parameters["sigma"]), max(parameters["sigma"]))

        dist = misc.load_npy(scr + "dist." + name)
        kernels = get_kernels_l2distance(dist, parameters)

        # Cross validate
        idx_winners, scores = cross_validation(kernels, properties, training_points=n_trains)

        # Save scores
        misc.save_npy(scr + "score."+name, scores)
        scores = np.around(np.mean(scores, axis=1), decimals=2)

        # Save parameters
        winner_parameters = {}
        for ni, index in enumerate(idx_winners):
            # convert linear index to multi-dimensions
            idx_parameters = np.unravel_index([index], (n_sigma, n_lambda))
            i, j = idx_parameters
            i = int(i[0])
            j = int(j[0])
            n = n_trains[ni]
            sigma = parameters["sigma"][i]
            l2reg = parameters["lambda"][j]
            this_parameters = {
                "sigma": str(sigma),
                "reg": str(l2reg),
            }
            winner_parameters[str(n)] = this_parameters

        print(name, scores)
        misc.save_json(scr + "parameters."+name, winner_parameters)

    # NOTE(review): quit() kills the interpreter here, so the return below
    # is unreachable — presumably a debugging leftover.
    quit()

    return
def main():
    """Learning curve for an OLS (feature-based) baseline.

    Extracts (or loads cached) features, runs 5-fold CV over the predefined
    training-set sizes and saves the per-fold RMSE curves to `score.ols`.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir", default="_tmp_")
    # BUGFIX: type=int added — np.random.seed() rejects the string argparse
    # would otherwise pass through from the command line.
    parser.add_argument('--randomseed', action='store', help='random seed', metavar="int", default=1, type=int)
    parser.add_argument('-j', '--procs', action='store', help='pararallize', type=int, metavar="int", default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    # Get features (cached as a pickled DataFrame after the first run).
    filename = "repr.ols"
    if os.path.exists(args.scratch + filename + ".pkl"):
        features = misc.load_obj(args.scratch + filename)
    else:
        features = extract_features(properties, molobjs, procs=args.procs)
        features = pd.DataFrame(features)
        features = features.fillna(0)
        misc.save_obj(args.scratch + filename, features)

    n_items = len(features)
    X = np.arange(n_items)

    assert len(properties) == n_items

    # Train
    n_splits = 5
    n_train = misc.load_npy(args.scratch + "n_train")

    fold_five = sklearn.model_selection.KFold(n_splits=n_splits, random_state=45, shuffle=True)

    scores = []
    for i, (idxs_train, idxs_test) in enumerate(fold_five.split(X)):

        # un-ordered idxs_train; reseed per fold for reproducibility
        np.random.seed(45 + i)
        np.random.shuffle(idxs_train)

        learning_curve = []
        for n in n_train:
            idxs = idxs_train[:n]

            # signed difference on the held-out fold
            sign_diff = fit_model(features, idxs, idxs_test)

            # rmse
            diff = sign_diff**2
            rmse_test = np.sqrt(diff.mean())

            # save
            learning_curve.append(rmse_test)

        scores.append(learning_curve)

    # rows = training sizes, columns = folds
    scores = np.array(scores)
    scores = scores.T
    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.ols", scores)

    return
def main():
    """Learning curve for a random-forest-regression baseline.

    L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir", default="_tmp_")
    # BUGFIX: type=int added — np.random.seed() rejects a string seed.
    parser.add_argument('--randomseed', action='store', help='random seed', metavar="int", default=1, type=int)
    parser.add_argument('-j', '--procs', action='store', help='pararallize', type=int, metavar="int", default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    # Fingerprint features, loaded from cache when available.
    X = []
    try:
        X = misc.load_npy(args.scratch + "repr.rdkitfp")
        print("loaded")
    except Exception:  # was a bare except; fall back to recomputing
        for molobj in molobjs:
            bitmap = fingerprints.get_rdkitfp(molobj)
            X.append(bitmap)
        X = np.asarray(X)

    y = properties

    # load predefined training points
    n_train = misc.load_npy(args.scratch + "n_train")

    # CV
    idxs = np.array(list(range(len(properties))), dtype=int)

    scores = []
    for idxs_train, idxs_test in cv.cross_view(idxs):

        learning_curve = []
        for n in n_train:
            # NOTE: rebinds `idxs` to the first-n training subset; harmless
            # since cross_view has already produced the folds.
            idxs = idxs_train[:n]
            clf = get_best_rfr(X[idxs], y[idxs])

            # predictions on the held-out fold
            predictions = clf.predict(X[idxs_test])
            diff = predictions-y[idxs_test]
            diff = diff**2
            rmse_test = np.sqrt(diff.mean())
            learning_curve.append(rmse_test)
            print(n, rmse_test)

        scores.append(learning_curve)

    # rows = training sizes, columns = folds
    scores = np.array(scores)
    scores = scores.T
    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.rfr", scores)
def main():
    """Learning curve for the null model (predicts the training-set mean)."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir", default="_tmp_")
    # type=int added for consistency: np.random.seed() rejects string seeds.
    parser.add_argument('--randomseed', action='store', help='random seed', metavar="int", default=1, type=int)
    parser.add_argument('-j', '--procs', action='store', help='pararallize', type=int, metavar="int", default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    # np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    # molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    n_items = len(properties)
    X = np.arange(n_items)

    # Train
    n_splits = 5
    n_train = misc.load_npy(args.scratch + "n_train")

    fold_five = sklearn.model_selection.KFold(n_splits=n_splits, random_state=45, shuffle=True)

    scores = []
    for i, (idxs_train, idxs_test) in enumerate(fold_five.split(X)):

        # un-ordered idxs_train; reseed per fold for reproducibility
        np.random.seed(45 + i)
        np.random.shuffle(idxs_train)

        learning_curve = []
        for n in n_train:
            idxs = idxs_train[:n]
            train = properties[idxs]

            # The "model" is simply the mean of the training targets.
            model = train.mean()
            test = properties[idxs_test]

            # predict
            sign_diff = model - test

            # rmse
            diff = sign_diff**2
            rmse_test = np.sqrt(diff.mean())

            # save
            learning_curve.append(rmse_test)

        scores.append(learning_curve)

    # rows = training sizes, columns = folds
    scores = np.array(scores)
    scores = scores.T
    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.null", scores)

    return
def plot_errors(scr): fig, axes = plt.subplots(1, 1, figsize=(8, 4)) # fig, axes = plt.subplots(1, 1, figsize=(4,4)) ax = axes # n_trains=[2**x for x in range(4, 4+7)] try: n_trains = misc.load_npy(scr + "n_train") except FileNotFoundError: n_trains = misc.load_txt(scr + "n_train") print(n_trains) names = ["cm", "bob", "fchl18", "fchl19", "fp", "slatm"] names = glob.glob(scr + "score.*") fix_name = lambda x: x.replace(scr, "").replace(".npy", "").replace( "score.", "") names = [fix_name(x) for x in names] lines = [] last_points = [] y_min = np.inf y_max = -np.inf for name in names: scores = misc.load_npy(scr + "score." + name) mean = scores.mean(axis=1) std = scores.std(axis=1) if "ols" in name: view, = np.where(n_trains > 250) x_mean = n_trains[view] mean = mean[view] std = std[view] else: valid_scores, = np.where(mean < 200) x_mean = n_trains[valid_scores] mean = mean[valid_scores] std = std[valid_scores] line = ax.errorbar( x_mean, mean, std, fmt='-o', # color="k", capsize=3, lw=1, markersize=4, label=name.upper()) lines.append(line) last_points.append(mean[-1]) max_mean = max(mean) + max(std) if max_mean > y_max: y_max = max_mean min_mean = min(mean) - max(std) if min_mean < y_min: y_min = min_mean print(name, list(mean)) y_min = np.floor(y_min) y_min = int(np.floor(y_min / 10.0)) * 10 y_max = int(np.ceil(y_max) / 10.0) * 10 ykeys = [] y_min = 40 print("y", y_min, y_max) diff = y_max - y_min if diff < 50: y_min -= 40 if y_min < 0.0: y_min = 50 if y_max > 120: y_max = 120 # ykeys = np.arange(y_min, y_max, 30) # y_max = 100 y_min = 30 ykeys = np.geomspace(y_min, y_max, num=5) ykeys = [int(np.ceil(y) / 5.0) * 5 for y in ykeys] # ykeys = [40 +10*x for x in range(0, 12, 2)] xkeys = n_trains print("x", n_trains) views.learning_curve_error(ax, xkeys, ykeys, x_range=(10, max(n_trains) * 1.3), y_range=(y_min * 0.95, y_max * 1.12)) views.legend_colorcoded(ax, lines, names) # learning legends # idxs = np.argsort(last_points) # idxs = np.flip(idxs, axis=0) # offset = 0.06 
# # for n, idx in enumerate(idxs): # # name = names[idx] # point = last_points[idx] # color = plt.getp(lines[idx][0], 'color') # # ax.text(0.8, 0.46-offset*n, name.upper(), # fontweight='bold', # color=color, # transform=ax.transAxes) # # help(ax.grid) # ax.grid( linestyle='-', linewidth=.5, axis="x") # ax.grid(True) ax.set_xlabel('Training set size', fontweight='medium', fontsize=11) ax.set_ylabel('RMSE [Kelvin]', fontweight='medium', fontsize=11) plt.savefig(scr + "learning_curves.png", bbox_inches="tight") plt.savefig(scr + "learning_curves.pdf", bbox_inches="tight") print(scr + "learning_curves.png") return
def generate_conformer_representation(scr="_tmp_ensemble_/", procs=0):
    """Build Boltzmann-averaged SLATM representations for every ensemble.

    For each conformer ensemble in `scr`, computes the energy-weighted
    average representation via `get_avg_repr` and saves the stacked result
    as `repr.avgslatm`.

    :param scr: scratch directory with per-ensemble .energies/.sdf files.
    :param procs: 0 = serial; otherwise the number of worker processes.
    """
    name = "slatm"
    mbtypes = misc.load_npy(scr + "slatm.mbtypes")

    # TODO Calculate max_size
    mol_atoms = misc.load_obj(scr + "atoms")
    max_atoms = max(len(atoms) for atoms in mol_atoms)

    kwargs = {
        "name": name,
        "mbtypes": mbtypes,
        "debug": False,
        "max_atoms": max_atoms,
    }

    # TODO: the ensemble count is hardcoded; derive it from `scr` contents.
    # n_total = 1285
    n_total = 3456
    idxs = range(n_total)

    avgreps = [0]*n_total

    if procs == 0:
        # Serial path.
        for idx in idxs:
            idx, avgrep = get_avg_repr(idx, **kwargs)
            avgreps[idx] = avgrep
    else:
        # Probe one ensemble to learn the representation size.
        idx, rep = get_avg_repr(0, **kwargs)
        rep_size = rep.shape[0]
        print("rep size", rep_size)

        # Shared output array the workers write into row-by-row.
        m = MyManager()
        m.start()
        results = m.np_zeros((n_total, rep_size))

        # BUGFIX: use the requested worker count (was hardcoded to Pool(32),
        # flagged by the original "TODO Hardcoded" comment).
        pool = Pool(procs)

        kwargs["array"] = results
        func = partial(get_avg_repr, **kwargs)
        pool.map(func, idxs)

        avgreps = results

    avgreps = np.array(avgreps)
    misc.save_npy(scr + "repr.avgslatm", avgreps)

    return