def main():
    """Run the GuacaMol distribution-learning benchmark with the graph-based MCTS sampler."""
    arg_parser = argparse.ArgumentParser()
    add = arg_parser.add_argument
    add('--smiles_file',
        help='Location of the ChEMBL dataset to use for the distribution benchmarks.',
        default='data/guacamol_v1_all.smiles')
    add('--pickle_directory',
        help='Directory containing pickle files with the distribution statistics',
        default=None)
    add('--seed', type=int, default=0)
    add('--n_jobs', type=int, default=-1)
    add('--generations', type=int, default=1000)
    add('--population_size', type=int, default=100)
    add('--num_sims', type=int, default=40)
    add('--max_children', type=int, default=25)
    add('--max_atoms', type=int, default=60)
    add('--init_smiles', type=str, default='CC')
    add('--random_start', action='store_true')
    add('--output_dir', type=str, default=None)
    add('--suite', default='v2')
    opts = arg_parser.parse_args()

    # Fall back to the directory of this script for any unset paths.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    if opts.output_dir is None:
        opts.output_dir = script_dir
    if opts.pickle_directory is None:
        opts.pickle_directory = script_dir

    np.random.seed(opts.seed)
    setup_default_logger()

    # Persist the command-line configuration next to the results.
    params_path = os.path.join(opts.output_dir, 'distribution_learning_params.json')
    with open(params_path, 'w') as jf:
        json.dump(vars(opts), jf, sort_keys=True, indent=4)

    sampler = GB_MCTS_Sampler(pickle_directory=opts.pickle_directory,
                              n_jobs=opts.n_jobs,
                              random_start=opts.random_start,
                              num_sims=opts.num_sims,
                              max_children=opts.max_children,
                              init_smiles=opts.init_smiles,
                              max_atoms=opts.max_atoms,
                              generations=opts.generations,
                              population_size=opts.population_size)

    results_path = os.path.join(opts.output_dir, 'distribution_learning_results.json')
    assess_distribution_learning(sampler,
                                 json_output_file=results_path,
                                 chembl_training_file=opts.smiles_file,
                                 benchmark_version=opts.suite)
def main():
    """Run the GuacaMol goal-directed benchmark with the graph-based MCTS generator.

    The generation budget is derived from the total oracle budget: each
    generation spends one oracle call per population member per child rollout.
    """
    # NOTE: `max_oracle_num` is expected at module level — TODO confirm.
    population_size = 100  ### each generation for each mol in population, one oracle call.
    max_children = 10
    generations_num = int(max_oracle_num / population_size / max_children)

    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add('--pickle_directory',
        help='Directory containing pickle files with the distribution statistics',
        default=None)
    add('--seed', type=int, default=0)
    add('--n_jobs', type=int, default=-1)
    add('--generations', type=int, default=generations_num)
    add('--population_size', type=int, default=population_size)
    add('--num_sims', type=int, default=40)
    add('--max_children', type=int, default=max_children)  ### 25 -> 5
    add('--max_atoms', type=int, default=60)
    add('--init_smiles', type=str, default='CC')
    add('--output_dir', type=str, default=None)
    add('--patience', type=int, default=5)
    add('--suite', default='v3')
    args = parser.parse_args()

    # Any unset directory defaults to the directory of this script.
    fallback_dir = os.path.dirname(os.path.realpath(__file__))
    if args.output_dir is None:
        args.output_dir = fallback_dir
    if args.pickle_directory is None:
        args.pickle_directory = fallback_dir

    np.random.seed(args.seed)
    setup_default_logger()

    # Persist the command-line configuration next to the results.
    with open(os.path.join(args.output_dir, 'goal_directed_params.json'), 'w') as jf:
        json.dump(vars(args), jf, sort_keys=True, indent=4)

    optimiser = GB_MCTS_Generator(pickle_directory=args.pickle_directory,
                                  n_jobs=args.n_jobs,
                                  num_sims=args.num_sims,
                                  max_children=args.max_children,
                                  init_smiles=args.init_smiles,
                                  max_atoms=args.max_atoms,
                                  patience=args.patience,
                                  generations=args.generations,
                                  population_size=args.population_size)

    results_file = os.path.join(args.output_dir, 'goal_directed_results.json')
    assess_goal_directed_generation(optimiser,
                                    json_output_file=results_file,
                                    benchmark_version=args.suite)
def main(config):
    """Evaluate an ORGAN generator on the GuacaMol distribution-learning suite."""
    setup_default_logger()
    set_seed(config.seed)

    results_file = os.path.join(config.output_dir, 'distribution_learning_results.json')
    assess_distribution_learning(OrganGenerator(config),
                                 chembl_training_file=config.dist_file,
                                 json_output_file=results_file,
                                 benchmark_version=config.suite)
def entry_point():
    """Run the CReM generator against the GuacaMol goal-directed benchmarks."""
    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add('--smiles_file', type=str)
    add('--db_fname', type=str)
    add('--selection_size', type=int, default=10)
    add('--radius', type=int, default=3)
    add('--replacements', type=int, default=1000)
    add('--min_size', type=int, default=0)
    add('--max_size', type=int, default=10)
    add('--min_inc', type=int, default=-7)
    add('--max_inc', type=int, default=7)
    add('--generations', type=int, default=1000)
    add('--ncpu', type=int, default=1)
    add('--seed', type=int, default=42)
    add('--output_dir', type=str, default=None)
    add('--suite', default='v2')
    args = parser.parse_args()

    np.random.seed(args.seed)
    setup_default_logger()

    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))

    # Record the run configuration alongside the results.
    with open(os.path.join(args.output_dir, 'goal_directed_params.json'), 'w') as jf:
        json.dump(vars(args), jf, sort_keys=True, indent=4)

    optimiser = CREM_Generator(smi_file=args.smiles_file,
                               selection_size=args.selection_size,
                               db_fname=args.db_fname,
                               radius=args.radius,
                               min_size=args.min_size,
                               max_size=args.max_size,
                               min_inc=args.min_inc,
                               max_inc=args.max_inc,
                               replacements=args.replacements,
                               generations=args.generations,
                               ncpu=args.ncpu,
                               random_start=True,
                               output_dir=args.output_dir)

    results_file = os.path.join(args.output_dir, 'goal_directed_results.json')
    assess_goal_directed_generation(optimiser,
                                    json_output_file=results_file,
                                    benchmark_version=args.suite)
def main():
    """Compute distribution statistics for MCTS from a SMILES file and pickle them.

    Parses ``--smiles_file`` and ``--output_dir``, derives size/ring/pair
    statistics via ``StatsCalculator``, and writes each result as a pickle
    file into the output directory.

    Fix: the original passed ``open(..., 'wb')`` handles directly to
    ``pickle.dump``, leaking six file descriptors; each file is now closed
    deterministically via a context manager.
    """
    setup_default_logger()

    parser = argparse.ArgumentParser(
        description='Generate pickle files for the statistics of a training set for MCTS',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--smiles_file',
        default='data/moses.smiles',
        help='Full path to SMILES file from which to generate the distribution statistics'
    )
    parser.add_argument('--output_dir',
                        default=None,
                        help='Output directory for the pickle files')
    args = parser.parse_args()

    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))

    def _dump(obj, filename):
        # Serialize one statistics object, guaranteeing the handle is closed.
        with open(os.path.join(args.output_dir, filename), 'wb') as fh:
            pickle.dump(obj, fh)

    logger.info('Generating probabilities for MCTS...')
    t0 = time()

    stats = StatsCalculator(args.smiles_file)

    # Compute in the original order (StatsCalculator may do expensive work lazily).
    size_stats = stats.size_statistics()
    rxn_smarts_rings = stats.rxn_smarts_rings()
    rxn_smarts_make_rings = stats.rxn_smarts_make_rings()
    p_rings = stats.ring_probs()

    _dump(size_stats, 'size_stats.p')
    _dump(p_rings, 'p_ring.p')
    _dump(rxn_smarts_rings, 'rs_ring.p')
    _dump(rxn_smarts_make_rings, 'rs_make_ring.p')

    p = stats.pair_probs()
    rxn_smarts = stats.rxn_smarts()
    _dump(p, 'p1.p')
    _dump(rxn_smarts, 'r_s1.p')

    print(
        f'Total time: {str(datetime.timedelta(seconds=int(time() - t0)))} secs'
    )
def main(config):
    """Evaluate a VAE generator on the GuacaMol distribution-learning suite."""
    setup_default_logger()
    set_seed(config.seed)

    # Default the output location to the directory containing this script.
    if config.output_dir is None:
        config.output_dir = os.path.dirname(os.path.realpath(__file__))

    results_file = os.path.join(config.output_dir, 'distribution_learning_results.json')
    assess_distribution_learning(VaeGenerator(config),
                                 chembl_training_file=config.dist_file,
                                 json_output_file=results_file,
                                 benchmark_version=config.suite)
def main():
    """Run PPO-based goal-directed generation on the GuacaMol benchmarks.

    Fix: ``--clip_param`` was declared ``type=int`` although its default (0.2)
    and its role as the PPO clipping range are fractional — passing a value on
    the command line either crashed (``int('0.2')``) or silently truncated it.
    It is now parsed as ``float``; the default is unchanged.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_jobs', type=int, default=-1)
    parser.add_argument('--episode_size', type=int, default=8192)
    parser.add_argument('--batch_size', type=int, default=1024)
    parser.add_argument('--entropy_weight', type=int, default=1)
    parser.add_argument('--kl_div_weight', type=int, default=10)
    parser.add_argument('--output_dir', default=None)
    # was type=int: rejected/truncated fractional clipping ranges
    parser.add_argument('--clip_param', type=float, default=0.2)
    parser.add_argument('--num_epochs', type=int, default=20)
    parser.add_argument('--model_path', default=None)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--suite', default='v3')
    args = parser.parse_args()

    np.random.seed(args.seed)
    setup_default_logger()

    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))

    if args.model_path is None:
        # Default to the bundled pretrained model next to this script.
        dir_path = os.path.dirname(os.path.realpath(__file__))
        args.model_path = os.path.join(dir_path, 'pretrained_model',
                                       'model_final_0.473.pt')

    # save command line args
    with open(os.path.join(args.output_dir, 'goal_directed_params.json'), 'w') as jf:
        json.dump(vars(args), jf, sort_keys=True, indent=4)

    optimiser = PPODirectedGenerator(pretrained_model_path=args.model_path,
                                     num_epochs=args.num_epochs,
                                     episode_size=args.episode_size,
                                     batch_size=args.batch_size,
                                     entropy_weight=args.entropy_weight,
                                     kl_div_weight=args.kl_div_weight,
                                     clip_param=args.clip_param)

    json_file_path = os.path.join(args.output_dir, 'goal_directed_results.json')
    assess_goal_directed_generation(optimiser,
                                    json_output_file=json_file_path,
                                    benchmark_version=args.suite)
def main():
    """Run graph-based MCTS goal-directed generation on the GuacaMol benchmarks.

    Fix: ``--random_start`` used ``type=bool``, which in argparse converts ANY
    non-empty string — including ``"False"`` — to ``True``. It is now a
    ``store_true`` flag (default still False), matching the sibling entry
    points in this file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--pickle_directory',
                        help='Directory containing pickle files with the distribution statistics',
                        default=None)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--n_jobs', type=int, default=-1)
    parser.add_argument('--generations', type=int, default=1000)
    parser.add_argument('--population_size', type=int, default=100)
    parser.add_argument('--num_sims', type=int, default=40)
    parser.add_argument('--max_children', type=int, default=25)
    parser.add_argument('--max_atoms', type=int, default=60)
    parser.add_argument('--init_smiles', type=str, default='CC')
    # was type=bool: '--random_start False' wrongly parsed as True
    parser.add_argument('--random_start', action='store_true')
    parser.add_argument('--output_dir', type=str, default=None)
    parser.add_argument('--patience', type=int, default=5)
    args = parser.parse_args()

    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))
    if args.pickle_directory is None:
        args.pickle_directory = os.path.dirname(os.path.realpath(__file__))

    np.random.seed(args.seed)
    setup_default_logger()

    # save command line args
    with open(os.path.join(args.output_dir, 'goal_directed_params.json'), 'w') as jf:
        json.dump(vars(args), jf, sort_keys=True, indent=4)

    optimiser = GB_MCTS_Generator(pickle_directory=args.pickle_directory,
                                  n_jobs=args.n_jobs,
                                  random_start=args.random_start,
                                  num_sims=args.num_sims,
                                  max_children=args.max_children,
                                  init_smiles=args.init_smiles,
                                  max_atoms=args.max_atoms,
                                  patience=args.patience,
                                  generations=args.generations,
                                  population_size=args.population_size)

    json_file_path = os.path.join(args.output_dir, 'goal_directed_results.json')
    assess_goal_directed_generation(optimiser, json_output_file=json_file_path)
def main():
    """Run the ChemGE (grammatical-evolution) goal-directed GuacaMol benchmark."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--smiles_file', default='data/guacamol_v1_all.smiles')
    # Integer hyper-parameters, declared in help-output order.
    for flag, default in (('--seed', 42),
                          ('--population_size', 100),
                          ('--n_mutations', 200),
                          ('--gene_size', 300),
                          ('--generations', 1000),
                          ('--n_jobs', -1)):
        parser.add_argument(flag, type=int, default=default)
    parser.add_argument('--random_start', action='store_true')
    parser.add_argument('--output_dir', type=str, default=None)
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--suite', default='v1')
    args = parser.parse_args()

    np.random.seed(args.seed)
    setup_default_logger()

    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))

    # Record the run configuration alongside the results.
    with open(os.path.join(args.output_dir, 'goal_directed_params.json'), 'w') as jf:
        json.dump(vars(args), jf, sort_keys=True, indent=4)

    optimiser = ChemGEGenerator(smi_file=args.smiles_file,
                                population_size=args.population_size,
                                n_mutations=args.n_mutations,
                                gene_size=args.gene_size,
                                generations=args.generations,
                                n_jobs=args.n_jobs,
                                random_start=args.random_start,
                                patience=args.patience)

    results_file = os.path.join(args.output_dir, 'goal_directed_results.json')
    assess_goal_directed_generation(optimiser,
                                    json_output_file=results_file,
                                    benchmark_version=args.suite)
def main(config):
    """Train an AAE model on a SMILES corpus and save config, vocab and weights."""
    setup_default_logger()
    set_seed(config.seed)

    training_smiles = read_smiles(config.train_load)
    vocabulary = CharVocab.from_data(training_smiles)

    # Persist the run configuration and vocabulary before training starts.
    torch.save(config, config.config_save)
    torch.save(vocabulary, config.vocab_save)

    model = AAE(vocabulary, config).to(torch.device(config.device))
    AAETrainer(config).fit(model, training_smiles)

    # Move weights to CPU so the checkpoint loads without the training device.
    torch.save(model.to('cpu').state_dict(), config.model_save)
import argparse
import logging
import os
from pathlib import Path

import torch

from guacamol.assess_distribution_learning import assess_distribution_learning
from guacamol.utils.helpers import setup_default_logger

from .rnn_utils import load_rnn_model, set_random_seed
from .smiles_rnn_generator import SmilesRnnGenerator

# Script entry: distribution-learning benchmark for a SMILES RNN.
# NOTE(review): `os`, `Path`, `load_rnn_model`, `SmilesRnnGenerator`,
# `assess_distribution_learning` and the --model_path/--output_dir/--dist_file
# arguments are unused in the visible span — presumably consumed by a
# continuation of this script outside this chunk; confirm against the full file.
if __name__ == '__main__':
    setup_default_logger()
    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(description='Distribution learning benchmark for SMILES RNN',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--seed', default=42, type=int, help='Random seed')
    parser.add_argument('--model_path', default=None, help='Full path to SMILES RNN model')
    parser.add_argument('--output_dir', default=None, help='Output directory')
    parser.add_argument('--dist_file', default='data/guacamol_v1_all.smiles', help='Distribution file')
    args = parser.parse_args()

    # Prefer the GPU when one is visible to torch.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info(f'device:\t{device}')

    set_random_seed(args.seed, device)
def main():
    """
    Get Chembl-23.

    Preprocessing steps:

    1) filter SMILES shorter than 5 and longer than 200 chars and those with forbidden symbols
    2) canonicalize, neutralize, only permit smiles shorter than 100 chars
    3) shuffle, write files, check if they are consistently hashed.
    """
    setup_default_logger()

    argparser = get_argparser()
    args = argparser.parse_args()

    # Set constants
    np.random.seed(1337)
    neutralization_rxns = initialise_neutralisation_reactions()
    smiles_dict = AllowedSmilesCharDictionary()

    print("Preprocessing ChEMBL molecules...")

    # CHEMBL_FILE_NAME / CHEMBL_URL / TANIMOTO_CUTOFF and the *_HASH constants
    # are module-level — defined outside this span.
    chembl_file = os.path.join(args.destination, CHEMBL_FILE_NAME)

    # Holdout set shipped inside the guacamol package; its canonical SMILES and
    # ECFP4 fingerprints are used below to exclude near-duplicates.
    data = (
        pkgutil.get_data("guacamol.data", "holdout_set_gcm_v1.smiles").decode("utf-8").splitlines()
    )
    holdout_mols = [i.split(" ")[0] for i in data]
    holdout_set = set(canonicalize_list(holdout_mols, False))
    holdout_fps = get_fingerprints_from_smileslist(holdout_set)

    # Download Chembl23 if needed.
    download_if_not_present(chembl_file, uri=CHEMBL_URL)
    raw_smiles = get_raw_smiles(
        chembl_file, smiles_char_dict=smiles_dict, open_fn=gzip.open, extract_fn=extract_chembl
    )

    # NOTE(review): prefix says "chembl24" while the docstring says Chembl-23 —
    # confirm which release CHEMBL_URL actually points at.
    file_prefix = "chembl24_canon"

    print(
        f"and standardizing {len(raw_smiles)} molecules using {args.n_jobs} cores, "
        f"and excluding molecules based on ECFP4 similarity of > {TANIMOTO_CUTOFF} to the holdout set."
    )

    # Process all the SMILES in parallel
    runner = Parallel(n_jobs=args.n_jobs, verbose=2)

    joblist = (
        delayed(filter_and_canonicalize)(
            smiles_str, holdout_set, holdout_fps, neutralization_rxns, TANIMOTO_CUTOFF, False
        )
        for smiles_str in raw_smiles
    )

    output = runner(joblist)

    # Put all nonzero molecules in a list, remove duplicates, sort and shuffle
    all_good_mols = sorted(list(set([item[0] for item in output if item])))
    np.random.shuffle(all_good_mols)
    print(f"Ended up with {len(all_good_mols)} molecules. Preparing splits...")

    # Split into train-dev-test
    # Check whether the md5-hashes of the generated smiles files match
    # the precomputed hashes, this ensures everyone works with the same splits.
    # 5% validation, 15% test, remaining 80% train.
    VALID_SIZE = int(0.05 * len(all_good_mols))
    TEST_SIZE = int(0.15 * len(all_good_mols))

    dev_set = all_good_mols[0:VALID_SIZE]
    dev_path = os.path.join(args.destination, f"{file_prefix}_dev-valid.smiles")
    write_smiles(dev_set, dev_path)

    test_set = all_good_mols[VALID_SIZE : VALID_SIZE + TEST_SIZE]
    test_path = os.path.join(args.destination, f"{file_prefix}_test.smiles")
    write_smiles(test_set, test_path)

    train_set = all_good_mols[VALID_SIZE + TEST_SIZE :]
    train_path = os.path.join(args.destination, f"{file_prefix}_train.smiles")
    write_smiles(train_set, train_path)

    # check the hashes
    valid_hashes = [
        compare_hash(train_path, TRAIN_HASH),
        compare_hash(dev_path, VALID_HASH),
        compare_hash(test_path, TEST_HASH),
    ]

    if not all(valid_hashes):
        raise SystemExit(f"Invalid hashes for the dataset files")

    print("Dataset generation successful. You are ready to go.")