def get_conformations(line, scr="_tmp_ensemble_/", **kwargs):
    """Generate conformer energies for one (index, molobj) work item.

    line -- tuple (im, molecule): integer item index and an RDKit molobj.
    scr -- scratch directory (with trailing slash) the outputs go into.
    kwargs -- accepted for pool-worker compatibility; unused here.

    Side effects: writes "<im>.energies" (npy) and "<im>.sdf" into scr and
    prints a one-line mean/std summary of the conformer energies.
    """
    im, molecule = line

    energies = generate_conformers(molecule)
    misc.save_npy(scr + str(im) + ".energies", energies)

    txtsdf = cheminfo.molobj_to_sdfstr(molecule)

    # BUGFIX(idiom): use a context manager so the handle is closed even if
    # the write raises, instead of a manual open/close pair.
    with open(scr + str(im) + ".sdf", 'w') as fsdf:
        fsdf.write(txtsdf)

    print(im, "{:} {:5.2f} {:5.2f}".format("smi", energies.mean(), energies.std()))

    return
def dump_distances_and_kernels(scr, name, procs=0):
    """Compute and save the kernel or distance matrix for one representation.

    scr -- scratch directory (with trailing slash) holding "repr.<name>" files.
    name -- one of "fchl18", "fchl19", "cm", "slatm", "bob", "rdkitfp",
        "morgan".
    procs -- if non-zero, exported as OMP_NUM_THREADS for the kernel routines.

    Side effects: writes "kernels.<name>" or "dist.<name>" (plus sigma files
    for the fchl kernels) into scr. An unknown name terminates the process
    via quit() (kept from the original behavior).
    """
    # TODO Properties should be read by scr!!

    representation_names_coordbased = ["cm", "slatm", "bob"]
    representation_names_molbased = ["morgan", "rdkitfp"]

    if procs != 0:
        os.environ["OMP_NUM_THREADS"] = str(procs)

    # Prepare fchl kernels
    if name == "fchl18":
        # BUGFIX: this branch previously compared against the misspelling
        # "fclh18", so requesting "fchl18" fell through to the error branch.
        print("Generating fchl18 kernel")
        start = time.time()
        reps = misc.load_npy(scr + "repr." + "fchl18")
        print("shape:", reps.shape)
        sigmas, kernels = get_fchl18_kernels(reps, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl18." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl18", kernels)
        # release the large arrays before returning
        reps = None
        del reps
        kernels = None
        del kernels

    elif name == "fchl19":
        print("Generating fchl19 kernel")
        reps = misc.load_npy(scr + "repr." + "fchl19")
        print("shape:", reps.shape)
        atoms = misc.load_obj(scr + "atoms")
        start = time.time()
        sigmas, kernels = get_fchl19_kernels(reps, atoms, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl19." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl19", kernels)

    elif name in representation_names_coordbased:
        print("Distance", name)
        representations = misc.load_npy(scr + "repr." + name)
        print(representations.shape)
        dist = generate_l2_distances(representations)
        misc.save_npy(scr + "dist." + name, dist)
        dist = None
        del dist

    elif name in representation_names_molbased:
        print("Generating fingerprint kernel", name)
        representations_fp = misc.load_npy(scr + "repr." + name)
        # BUGFIX: np.float was removed in NumPy 1.24; builtin float is the
        # documented replacement and is what the alias always meant.
        representations_fp = np.asarray(representations_fp, dtype=float)

        print(os.environ["OMP_NUM_THREADS"])

        # DISTANCE (the Jaccard-kernel variants that used to live here were
        # commented out; removed as dead code)
        print("make dist")
        dist = generate_l2_distances(representations_fp)
        print("save dist")
        misc.save_npy(scr + "dist." + name, dist)
        print("saved")
        print(dist.shape)

    else:
        print("error: unknown representation", name)
        quit()

    return
def dump_distances_and_kernels(scr):
    """Dump properties, L2 distance matrices and the fingerprint kernel.

    Reads "properties.csv" and "repr.*" files from the scratch directory scr
    and writes "properties", "dist.<name>" and "kernel.fp" npy files back.

    NOTE(review): if this lives in the same module as the (scr, name, procs)
    variant, this definition shadows it — confirm which one callers expect.
    """
    # TODO Properties should be read by scr!!

    # properties: first column of properties.csv
    print("Saving properties")
    with open(scr + 'properties.csv', 'r') as f:
        properties = f.readlines()
    properties = [x.split()[0] for x in properties]
    properties = [float(x) for x in properties]
    properties = np.array(properties)
    print("properties", properties.shape)
    misc.save_npy(scr + "properties", properties)

    # Prepare distances
    representation_names = ["cm", "bob", "slatm"]  # + ["avgslatm"]
    for name in representation_names:
        print("Distance", name)
        representations = misc.load_npy(scr + "repr." + name)
        print(representations.shape)
        dist = generate_l2_distances(representations)
        misc.save_npy(scr + "dist." + name, dist)
        # release the matrix before loading the next representation
        dist = None
        del dist

    # CLEANUP: two unreachable `if False:` blocks that generated the
    # fchl18/fchl19 kernels were removed here (dead code); the per-name
    # variant of this function provides those code paths.

    # Fingerprint kernel (was wrapped in a no-op `if True:`)
    print("Generating fingerprint kernel")
    representations_fp = misc.load_obj(scr + "repr.fp")
    kernel = get_fp_kernel(representations_fp)
    misc.save_npy(scr + "kernel.fp", kernel)

    return
def dump_kernel_scores(scr, names=[]):
    """Cross-validate saved kernels/distances and dump per-training-size
    scores plus the winning hyper-parameters for each representation.

    NOTE(review): mutable default argument `names=[]` — harmless here since
    it is only iterated, never mutated, but worth fixing.

    scr -- scratch directory (with trailing slash).
    names -- representation names to score.

    Side effects: writes "n_train", "score.<name>" (npy) and
    "parameters.<name>" (json) into scr; ends with quit().
    """
    # Predefined reg
    # regularization grid: 1e-1, 1e-3, 1e-5 plus exactly zero
    l2regs = [10**-x for x in range(1, 6, 2)] + [0.0]
    n_l2regs = len(l2regs)

    # Define n_training
    # n_trains=[2**x for x in range(4, 12)]
    n_trains=[2**x for x in range(4, 17)]
    n_trains = np.array(n_trains, dtype=int)
    n_items = misc.load_txt(scr + "n_items")
    # keep only training sizes that fit in a 4/5 train split
    n_train_idx, = np.where(n_trains < n_items*4.0/5.0)
    n_trains = n_trains[n_train_idx]
    n_trains = list(n_trains) # + [-1]
    print("Assume total items", n_items, "N train", "{:5.1f}".format(np.floor(n_items*4/5)), "N test", "{:5.1f}".format(np.ceil(n_items*1/5)))
    print("Training:", list(n_trains))
    misc.save_npy(scr + "n_train", n_trains)

    # Load properties
    try:
        properties = misc.load_npy(scr + "properties")
    except:
        # NOTE(review): bare except — presumably meant to catch only a
        # missing "properties" npy file; it also hides real errors.
        # Fallback: per row of properties.csv, take the median of all
        # values after the leading identifier column.
        with open(scr + "properties.csv", 'r') as f:
            lines = f.readlines()
        properties = []
        for line in lines:
            values = [float(x) for x in line.split()]
            values = values[1:]
            value = np.median(values)
            properties.append(value)
        properties = np.array(properties)
        misc.save_npy(scr + "properties", properties)

    print(n_items, "==", len(properties))
    assert n_items == len(properties)

    # Load done kernel
    this_names = ["rdkitfp", "morgan"]
    for name in names:
        # NOTE(review): this immediate `break` disables the whole
        # fingerprint-kernel scoring loop below — looks deliberate
        # (dead code kept around); confirm before re-enabling.
        break
        if name not in this_names:
            continue

        print("scoring", name)
        now = time.time()
        print("load kernel", name)
        kernel = misc.load_npy(scr + "kernel." + name)
        n_len = kernel.shape[0]
        diaidx = np.diag_indices(n_len)

        def scan_kernels(debug=True):
            # Yields the same kernel object once per l2reg, mutating the
            # diagonal in place: first add l2regs[0], then for each step
            # swap the previous reg for the next one (delta update).
            kernel[diaidx] += l2regs[0]
            yield kernel
            # for i in tqdm.tqdm(range(1, n_l2regs), ncols=47, ascii=True, desc=name):
            for i in range(1, n_l2regs):
                kernel[diaidx] += -l2regs[i-1] +l2regs[i]
                yield kernel

        # wrap the generator in a tqdm progress bar
        generator = functools.partial(tqdm, scan_kernels(), ncols=75, ascii=True, desc=name+ " kernels", total=n_l2regs)

        print("scan kernels", name)
        idx_winners, scores = cross_validation(generator(), properties, training_points=n_trains)
        misc.save_npy(scr + "score."+name, scores)
        scores = np.around(np.mean(scores, axis=1), decimals=2)

        # Save parameters (winning l2reg per training size)
        winner_parameters = {}
        for ni, index in enumerate(idx_winners):
            n = n_trains[ni]
            l2reg = l2regs[index]
            parameters = {
                "reg": l2reg,
            }
            winner_parameters[str(n)] = parameters

        nower = time.time()
        print("time: {:10.1f}s".format(nower-now))
        print(name, list(scores))
        misc.save_json(scr + "parameters."+name, winner_parameters)
        print("saved")

        kernel = None
        del kernel

    # Load multi kernels (reg search)
    this_names = ["fchl19", "fchl18"]
    for name in names:
        # NOTE(review): disabled the same way as the loop above.
        break
        kernels = misc.load_npy(scr + "kernels." + name)

        n_l2regs = len(l2regs)
        n_kernels = kernels.shape[0]
        n_len = kernels[0].shape[0]
        diaidx = np.diag_indices(n_len)

        def scan_kernels():
            # For each sigma-kernel, sweep every l2reg on the diagonal
            # (same in-place delta update as above).
            for kernel in kernels:
                kernel[diaidx] += l2regs[0]
                yield kernel
                for i in range(1, n_l2regs):
                    kernel[diaidx] += -l2regs[i-1] +l2regs[i]
                    yield kernel

        idx_winners, scores = cross_validation(scan_kernels(), properties, training_points=n_trains)
        misc.save_npy(scr + "score."+name, scores)
        scores = np.around(np.mean(scores, axis=1), decimals=2)

        # Clean
        kernels = None
        del kernels

        # Save parameters
        winner_parameters = {}
        for ni, index in enumerate(idx_winners):
            # convert linear index to multi-dimensions
            idx_parameters = np.unravel_index([index], (n_kernels, n_l2regs))
            i, j = idx_parameters
            i = int(i[0])
            j = int(j[0])
            n = n_trains[ni]
            # NOTE(review): "sigma" stored here is the kernel index i, not a
            # sigma value — confirm downstream readers expect an index.
            sigma = i
            l2reg = l2regs[j]
            parameters = {
                "sigma": sigma,
                "reg": l2reg,
            }
            winner_parameters[str(n)] = parameters

        misc.save_json(scr + "parameters."+name, winner_parameters)
        print(name, scores)

    # Load distance kernels
    # Per-representation (sigma, lambda) grids for kernels built from the
    # precomputed L2 distance matrices.
    models = []

    parameters = {
        "name": "rdkitfp",
        "sigma": [2**x for x in range(1, 12, 2)],
        # "sigma": [2**x for x in np.arange(20, 40, 0.5)],
        # "lambda": l2regs,
        # "lambda": [10.0**-x for x in np.arange(1, 10, 1)]
        "lambda": [10.0**-6],
    }
    models.append(parameters)

    parameters = {
        "name": "slatm",
        "sigma": [2**x for x in range(1, 12, 2)],
        # "sigma": [2**x for x in np.arange(20, 40, 0.5)],
        # "lambda": l2regs,
        # "lambda": [10.0**-x for x in np.arange(1, 10, 1)]
        "lambda": [10.0**-6],
    }
    models.append(parameters)

    parameters = {
        "name": "cm",
        "sigma": [2**x for x in range(1, 12, 2)],
        "lambda": l2regs,
    }
    models.append(parameters)

    parameters = {
        "name": "bob",
        "sigma": [2**x for x in range(1, 12, 2)],
        "lambda": l2regs,
    }
    models.append(parameters)

    parameters = {
        "name": "avgslatm",
        "sigma": [2**x for x in range(1, 20, 2)],
        "lambda": l2regs,
    }
    # models.append(parameters)

    for model in models:
        name = model["name"]
        if name not in names:
            continue

        print("scoring", name)
        parameters = model
        n_sigma = len(parameters["sigma"])
        n_lambda = len(parameters["lambda"])
        print("parameter range")
        print("sigma", min(parameters["sigma"]), max(parameters["sigma"]))
        dist = misc.load_npy(scr + "dist." + name)
        kernels = get_kernels_l2distance(dist, parameters)

        # Cross validate
        idx_winners, scores = cross_validation(kernels, properties, training_points=n_trains)

        # Save scores
        misc.save_npy(scr + "score."+name, scores)
        scores = np.around(np.mean(scores, axis=1), decimals=2)

        # Save parameters
        winner_parameters = {}
        for ni, index in enumerate(idx_winners):
            # convert linear index to multi-dimensions
            idx_parameters = np.unravel_index([index], (n_sigma, n_lambda))
            i, j = idx_parameters
            i = int(i[0])
            j = int(j[0])
            n = n_trains[ni]
            sigma = parameters["sigma"][i]
            l2reg = parameters["lambda"][j]
            this_parameters = {
                "sigma": str(sigma),
                "reg": str(l2reg),
            }
            winner_parameters[str(n)] = this_parameters

        print(name, scores)
        misc.save_json(scr + "parameters."+name, winner_parameters)

    # NOTE(review): quit() terminates the process here, so the return below
    # is never reached — presumably a deliberate debugging stop.
    quit()
    return
def main():
    """Fit an OLS-style model on cached features and dump a learning curve
    of held-out RMSEs to "score.ols" in the scratch directory.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir", default="_tmp_")
    # NOTE(review): --randomseed has no type=int, so a CLI-supplied value
    # arrives as str and np.random.seed(str) would raise — only the int
    # default works; confirm intended.
    parser.add_argument('--randomseed', action='store', help='random seed', metavar="int", default=1)
    parser.add_argument('-j', '--procs', action='store', help='pararallize', type=int, metavar="int", default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    # Get features (cached as a pickle on first run, reloaded afterwards)
    filename = "repr.ols"
    if os.path.exists(args.scratch + filename + ".pkl"):
        features = misc.load_obj(args.scratch + filename)
    else:
        features = extract_features(properties, molobjs, procs=args.procs)
        features = pd.DataFrame(features)
        features = features.fillna(0)
        misc.save_obj(args.scratch + filename, features)

    n_items = len(features)
    X = np.arange(n_items)

    assert len(properties) == n_items

    # Train
    n_splits = 5
    n_train = misc.load_npy(args.scratch + "n_train")

    fold_five = sklearn.model_selection.KFold(n_splits=n_splits, random_state=45, shuffle=True)

    scores = []
    for i, (idxs_train, idxs_test) in enumerate(fold_five.split(X)):
        # un-ordered idxs_train
        # reseed per fold so the size-n prefixes taken below are
        # deterministic random subsets of the training fold
        np.random.seed(45 + i)
        np.random.shuffle(idxs_train)

        learning_curve = []
        for n in n_train:
            idxs = idxs_train[:n]
            # signed difference (fit_model returns prediction - truth on the
            # test fold — presumably; verify against its definition)
            sign_diff = fit_model(features, idxs, idxs_test)
            # rmse
            diff = sign_diff**2
            rmse_test = np.sqrt(diff.mean())
            # save
            learning_curve.append(rmse_test)

        scores.append(learning_curve)

    # rows = training sizes, columns = folds
    scores = np.array(scores)
    scores = scores.T
    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.ols", scores)

    return
def main():
    """Learning curves for a random-forest regressor on RDKit fingerprints.

    L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.

    Reads properties and structures from --scratch, loads (or computes and
    caches in memory) the fingerprint matrix, cross-validates get_best_rfr
    over the predefined training sizes and saves the per-fold test RMSE
    curves to "score.rfr".
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir", default="_tmp_")
    parser.add_argument('--randomseed', action='store', help='random seed', metavar="int", default=1)
    parser.add_argument('-j', '--procs', action='store', help='pararallize', type=int, metavar="int", default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    # Representations: prefer the cached npy; otherwise compute fingerprints
    # from the molobjs.
    try:
        X = misc.load_npy(args.scratch + "repr.rdkitfp")
        print("loaded")
    except Exception:
        # BUGFIX: was a bare `except:`; Exception keeps the best-effort
        # fallback but no longer swallows KeyboardInterrupt/SystemExit.
        X = np.asarray([fingerprints.get_rdkitfp(molobj) for molobj in molobjs])

    y = properties

    # load predefined training points
    n_train = misc.load_npy(args.scratch + "n_train")

    # CV
    idxs = np.arange(len(properties), dtype=int)

    scores = []
    for idxs_train, idxs_test in cv.cross_view(idxs):

        learning_curve = []
        for n in n_train:
            idxs = idxs_train[:n]
            clf = get_best_rfr(X[idxs], y[idxs])

            # predictions on the held-out fold
            predictions = clf.predict(X[idxs_test])
            diff = predictions-y[idxs_test]
            diff = diff**2
            rmse_test = np.sqrt(diff.mean())
            learning_curve.append(rmse_test)
            print(n, rmse_test)

        scores.append(learning_curve)

    # rows = training sizes, columns = folds
    scores = np.array(scores)
    scores = scores.T
    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.rfr", scores)

    return
def main():
    """Null-model baseline: predict the mean of the training targets.

    Produces the same learning-curve layout as the real models and stores
    the per-fold test RMSEs in "score.null" under the scratch directory.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir", default="_tmp_")
    parser.add_argument('--randomseed', action='store', help='random seed', metavar="int", default=1)
    parser.add_argument('-j', '--procs', action='store', help='pararallize', type=int, metavar="int", default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    # np.random.seed(args.randomseed)

    # Target values; molecular structures are not needed for this baseline
    y = misc.load_npy(args.scratch + "properties")
    item_idxs = np.arange(len(y))

    # Train
    n_splits = 5
    train_sizes = misc.load_npy(args.scratch + "n_train")

    splitter = sklearn.model_selection.KFold(n_splits=n_splits, random_state=45, shuffle=True)

    all_curves = []
    for fold, (train_idx, test_idx) in enumerate(splitter.split(item_idxs)):
        # deterministic per-fold shuffle so size-n prefixes are random subsets
        np.random.seed(45 + fold)
        np.random.shuffle(train_idx)

        test_values = y[test_idx]

        curve = []
        for n in train_sizes:
            # the "model" is simply the mean of the first n training targets
            prediction = y[train_idx[:n]].mean()
            residual = prediction - test_values
            curve.append(np.sqrt((residual**2).mean()))

        all_curves.append(curve)

    # rows = training sizes, columns = folds
    scores = np.array(all_curves).T
    print(np.mean(scores, axis=1))
    misc.save_npy(args.scratch + "score.null", scores)

    return
def main():
    """Read molecules from an SDF file and dump their representations.

    Writes "atoms", "n_items" and one "repr.<name>" file per requested
    representation into the scratch directory, then terminates the process
    via quit().
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir", default="_tmp_")
    parser.add_argument('--conformers', action='store_true', help='')
    parser.add_argument('--sdf', action='store', help='', metavar="file")
    parser.add_argument('-j', '--procs', action='store', help='pararallize', metavar="int", default=0, type=int)
    parser.add_argument('-r', '--representations', action='store', help='', metavar="STR", nargs="+")
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # -j -1 means "use every available core"
    if args.procs == -1:
        args.procs = int(os.cpu_count())
        print("set procs", args.procs)

    representation_names_coordbased = [
        "cm",
        "fchl18",
        "fchl19",
        "slatm",
        "bob"
    ]
    representation_names_molbased = ["morgan", "rdkitfp"]

    if args.representations is None:
        # default set when -r is not given
        # representation_names = ["cm", "fchl18", "fchl19", "slatm", "bob"]
        # representation_names = ["fchl18"]
        # representation_names = ["bob"]
        representation_names = ["slatm", "bob", "cm", "rdkitfp", "morgan"]
    else:
        representation_names = args.representations

    # materialize the generator; molobjs are reused for mol-based reprs below
    molobjs = cheminfo.read_sdffile(args.sdf)
    molobjs = [mol for mol in molobjs]

    xyzs = molobjs_to_xyzs(molobjs)
    mol_atoms, mol_coords = xyzs
    misc.save_obj(args.scratch + "atoms", mol_atoms)

    # Print unique atoms
    unique_atoms = []
    for atoms in mol_atoms:
        unique_atoms += list(np.unique(atoms))
    unique_atoms = np.array(unique_atoms)
    unique_atoms = unique_atoms.flatten()
    unique_atoms = np.unique(unique_atoms)

    # Calculate max_size (largest atom count over all molecules)
    max_atoms = [len(atoms) for atoms in mol_atoms]
    max_atoms = max(max_atoms)

    n_items = len(mol_coords)

    print("total mols:", n_items)
    print("atom types:", unique_atoms)
    print("max atoms: ", max_atoms)
    print()
    print("representations:", representation_names)
    print()

    misc.save_txt(args.scratch + "n_items", n_items)

    # Gas phase
    # coordinate-based representations
    for name in representation_names:
        if name not in representation_names_coordbased:
            continue
        representations = xyzs_to_representations(mol_atoms, mol_coords, name=name, scr=args.scratch, max_atoms=max_atoms, procs=args.procs)
        # ndarray results are saved as npy, anything else is pickled
        if isinstance(representations, (np.ndarray, np.generic)):
            misc.save_npy(args.scratch + "repr." + name, representations)
        else:
            misc.save_obj(args.scratch + "repr." + name, representations)
        # free before computing the next representation
        representations = None
        del representations

    # molecule-based (fingerprint) representations
    for name in representation_names:
        if name not in representation_names_molbased:
            continue
        representations = molobjs_to_representations(molobjs, name=name, procs=args.procs)
        if isinstance(representations, (np.ndarray, np.generic)):
            misc.save_npy(args.scratch + "repr." + name, representations)
        else:
            misc.save_obj(args.scratch + "repr." + name, representations)
        representations = None
        del representations

    # NOTE(review): quit() stops the process here, so the conformer branch
    # and the return below are unreachable — presumably disabled on purpose.
    quit()

    # Ensemble
    # if args.conformers:
    #     generate_conformer_representation(scr=args.scratch, procs=args.procs)

    return
def generate_conformer_representation(scr="_tmp_ensemble_/", procs=0, n_total=3456):
    """Compute per-molecule averaged conformer SLATM representations.

    scr -- ensemble scratch directory containing "slatm.mbtypes" and "atoms".
    procs -- 0 computes serially; any other value runs a multiprocessing
        pool of `procs` workers. BUGFIX: the pool size was hard-coded to 32
        (flagged "TODO Hardcoded") and silently ignored this parameter.
    n_total -- number of molecules/conformer sets to process; generalized
        from a hard-coded constant (default keeps the previous behavior).

    Writes the (n_total, rep_size) matrix to "repr.avgslatm" in scr.
    """
    name = "slatm"
    mbtypes = misc.load_npy(scr + "slatm.mbtypes")

    # max_size: largest atom count over all molecules
    mol_atoms = misc.load_obj(scr + "atoms")
    max_atoms = max(len(atoms) for atoms in mol_atoms)

    kwargs = {
        "name": name,
        "mbtypes": mbtypes,
        "debug": False,
        "max_atoms": max_atoms,
    }

    idxs = range(n_total)

    if procs == 0:
        # serial path: collect results positionally
        avgreps = [0] * n_total
        for idx in idxs:
            idx, avgrep = get_avg_repr(idx, **kwargs)
            avgreps[idx] = avgrep
    else:
        # probe one item to learn the representation size
        idx, rep = get_avg_repr(0, **kwargs)
        rep_size = rep.shape[0]
        print("rep size", rep_size)

        # manager-backed shared array that pool workers write into
        m = MyManager()
        m.start()
        results = m.np_zeros((n_total, rep_size))

        # BUGFIX: honor the requested worker count instead of Pool(32)
        pool = Pool(procs)

        kwargs["array"] = results
        func = partial(get_avg_repr, **kwargs)
        pool.map(func, idxs)

        avgreps = results

    avgreps = np.array(avgreps)
    misc.save_npy(scr + "repr.avgslatm", avgreps)

    return