def run(self, json_path=None):
    '''
    Run benchmark

    Parameters
    ----------
    json_path : str, optional
        If `json_path` is given, the results are saved to the path.
    '''
    results = _evaluate_goal_directed_benchmarks(self.optimizer, self.benchmark_list)
    if json_path is not None:
        if os.path.splitext(json_path)[1] != '.json':
            raise ValueError('json_path must have extension .json')
        from collections import OrderedDict
        import json
        from guacamol.utils.data import get_time_string
        benchmark_results = OrderedDict()
        benchmark_results['guacamol_version'] = guacamol.__version__
        benchmark_results['benchmark_suite_version'] = self.benchmark_suite_name
        benchmark_results['timestamp'] = get_time_string()
        benchmark_results['results'] = [vars(result) for result in results]
        logger.info(f'Save results to file {json_path}')
        with open(json_path, 'wt') as f:
            f.write(json.dumps(benchmark_results, indent=4))
    return results

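# Usage sketch (illustrative, not from the source): `MyGoalDirectedSuite` and
# `my_optimizer` are hypothetical placeholders for the class that owns this
# run() method and exposes self.optimizer and self.benchmark_list.
suite = MyGoalDirectedSuite(optimizer=my_optimizer)
# json_path must end in '.json', otherwise run() raises ValueError.
results = suite.run(json_path='goal_directed_results.json')
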
def assess_goal_directed_generation(
        goal_directed_molecule_generator: GoalDirectedGenerator,
        json_output_file='output_goal_directed.json',
        benchmark_version='v3') -> None:
    """
    Assesses a goal-directed molecule generator for de novo molecule design.

    Args:
        goal_directed_molecule_generator: Model to evaluate
        json_output_file: Name of the file where to save the results in JSON format
        benchmark_version: which benchmark suite to execute
    """
    logger.info(f'Benchmarking goal-directed molecule generation, version {benchmark_version}')
    benchmarks = goal_directed_benchmark_suite(version_name=benchmark_version)

    results = _evaluate_goal_directed_benchmarks(
        goal_directed_molecule_generator=goal_directed_molecule_generator,
        benchmarks=benchmarks)

    benchmark_results: Dict[str, Any] = OrderedDict()
    benchmark_results['guacamol_version'] = guacamol.__version__
    benchmark_results['benchmark_suite_version'] = benchmark_version
    benchmark_results['timestamp'] = get_time_string()
    benchmark_results['results'] = [vars(result) for result in results]

    logger.info(f'Save results to file {json_output_file}')
    with open(json_output_file, 'wt') as f:
        f.write(json.dumps(benchmark_results, indent=4))

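# Usage sketch (illustrative): a trivial GoalDirectedGenerator implementation
# and a call to the function above. `generate_optimized_molecules` is the
# abstract method real models must provide; the dummy logic here ignores the
# scoring function and always returns benzene.
from typing import List, Optional

from guacamol.goal_directed_generator import GoalDirectedGenerator
from guacamol.scoring_function import ScoringFunction


class DummyGoalDirectedGenerator(GoalDirectedGenerator):
    """Illustrative generator that ignores the scoring function."""

    def generate_optimized_molecules(self, scoring_function: ScoringFunction,
                                     number_molecules: int,
                                     starting_population: Optional[List[str]] = None
                                     ) -> List[str]:
        # Return the same benzene SMILES for every requested molecule.
        return ['c1ccccc1'] * number_molecules


assess_goal_directed_generation(DummyGoalDirectedGenerator(),
                                json_output_file='output_goal_directed.json')
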
def _assess_distribution_learning(model: DistributionMatchingGenerator,
                                  chembl_training_file: str,
                                  json_output_file: str,
                                  benchmark_version: str,
                                  number_samples: int) -> None:
    """
    Internal equivalent to assess_distribution_learning, but allows for a flexible number of samples.
    To call directly only for testing.
    """
    logger.info(f'Benchmarking distribution learning, version {benchmark_version}')
    benchmarks = distribution_learning_benchmark_suite(chembl_file_path=chembl_training_file,
                                                       version_name=benchmark_version,
                                                       number_samples=number_samples)

    results = _evaluate_distribution_learning_benchmarks(model=model, benchmarks=benchmarks)

    benchmark_results: Dict[str, Any] = OrderedDict()
    benchmark_results['guacamol_version'] = guacamol.__version__
    benchmark_results['benchmark_suite_version'] = benchmark_version
    benchmark_results['timestamp'] = get_time_string()
    benchmark_results['samples'] = model.generate(100)
    benchmark_results['results'] = [vars(result) for result in results]

    logger.info(f'Save results to file {json_output_file}')
    with open(json_output_file, 'wt') as f:
        f.write(json.dumps(benchmark_results, indent=4))

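# Test-style usage sketch (illustrative): a trivial DistributionMatchingGenerator
# and a direct call with a small sample count. 'chembl_training.smiles' is a
# placeholder path and 'v1' assumes that suite version is available.
from typing import List

from guacamol.distribution_matching_generator import DistributionMatchingGenerator


class DummyDistributionGenerator(DistributionMatchingGenerator):
    """Illustrative generator that repeats one ethanol SMILES."""

    def generate(self, number_samples: int) -> List[str]:
        return ['CCO'] * number_samples


_assess_distribution_learning(model=DummyDistributionGenerator(),
                              chembl_training_file='chembl_training.smiles',
                              json_output_file='output_distribution.json',
                              benchmark_version='v1',
                              number_samples=100)
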
def WriteGuacaMolBenchmarkResult(benchmark_result, output_json_path):
    """Serialize a single benchmark result to JSON, with version and timestamp metadata."""
    results: Dict[str, Any] = OrderedDict()
    results["guacamol_version"] = guacamol.__version__
    results["timestamp"] = get_time_string()
    results["result"] = vars(benchmark_result)
    with open(output_json_path, "w") as file:
        file.write(json.dumps(results, indent=4))

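# Usage sketch (illustrative): a real `benchmark_result` would come from one of
# the evaluators above; a SimpleNamespace with made-up fields stands in here,
# since vars() only needs an object with a __dict__.
from types import SimpleNamespace

fake_result = SimpleNamespace(benchmark_name='logP (target value)', score=0.42)
WriteGuacaMolBenchmarkResult(fake_result, 'single_result.json')
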
def get_argparser():
    timestring = get_time_string()
    parser = argparse.ArgumentParser(description='Data Preparation for GuacaMol',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-o', '--destination', default='.',
                        help='Download and Output location')
    parser.add_argument('-i', '--input', required=True,
                        help='Filename of input smiles file')
    parser.add_argument('--output_prefix', default=timestring,
                        help='Prefix of the output file')
    parser.add_argument('--n_jobs', default=4, type=int,
                        help='Number of cores to use')
    parser.add_argument('--seed', default=9325, type=int,
                        help='Random number seed')
    parser.add_argument('--with_hydrogens', action='store_true', default=False,
                        help='Whether to add hydrogen nodes to the graph.')
    return parser

def get_argparser():
    timestring = get_time_string()
    parser = argparse.ArgumentParser(description='Data Preparation for GuacaMol',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-o', '--destination', default='.',
                        help='Download and Output location')
    parser.add_argument('-i', '--input', default=None,
                        help='Filename of input smiles file')
    parser.add_argument('--output_prefix', default=timestring,
                        help='Prefix of the output file')
    parser.add_argument('--n_jobs', default=8, type=int,
                        help='Number of cores to use')
    parser.add_argument('--tanimoto_cutoff', default=0.323, type=float,
                        help='Remove molecules too similar to the holdout set')
    parser.add_argument('--chembl', action='store_true',
                        help='Specify to download and process molecules from chembl')
    return parser

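# Either argparser variant is used the same way; a short sketch parsing an
# explicit argument list ('molecules.smiles' is a placeholder filename).
parser = get_argparser()
args = parser.parse_args(['-i', 'molecules.smiles', '--n_jobs', '4'])
print(args.input, args.n_jobs, args.tanimoto_cutoff)
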
def config_logger(model_dir):
    from guacamol.utils.data import get_time_string
    timestring = get_time_string()
    fh = logging.FileHandler(model_dir / '{}-train.log'.format(timestring))
    fh.setFormatter(logging.Formatter(logging.BASIC_FORMAT))

    # configure root logger
    sh = logging.StreamHandler()
    sh.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
    logging.getLogger().addHandler(sh)

    for name in ('gan', 'guacamol', 'tensorflow', 'tensorpack'):
        logger = logging.getLogger(name)
        logger.addHandler(fh)
        logger.setLevel(logging.INFO)
        if name == 'tensorflow':
            # avoid double logging
            logger.propagate = False
    fh.setLevel(logging.DEBUG)

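# Usage sketch (illustrative): config_logger expects a pathlib.Path-like
# directory, since it builds the log-file path with the '/' operator.
# 'models/run-01' is a placeholder directory.
import logging
from pathlib import Path

model_dir = Path('models/run-01')
model_dir.mkdir(parents=True, exist_ok=True)
config_logger(model_dir)
logging.getLogger('guacamol').info('logging configured')
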
def assess_distribution_learning(model: DistributionMatchingGenerator,
                                 training_file_path: str,
                                 json_output_file: str,
                                 number_samples: int) -> None:
    """
    Assesses a distribution-matching model against validity, uniqueness,
    novelty, and KL-divergence benchmarks.
    """
    LOG.info('Benchmarking distribution learning')
    benchmarks = [
        ValidityBenchmark(number_samples=number_samples),
        UniquenessBenchmark(number_samples=number_samples),
        novelty_benchmark(training_set_file=training_file_path, number_samples=number_samples),
        kldiv_benchmark(training_set_file=training_file_path, number_samples=number_samples),
    ]

    results = _evaluate_distribution_learning_benchmarks(model=model, benchmarks=benchmarks)

    benchmark_results = OrderedDict()
    benchmark_results['guacamol_version'] = guacamol.__version__
    benchmark_results['timestamp'] = get_time_string()
    benchmark_results['results'] = [vars(result) for result in results]

    LOG.info('Save results to file %s', json_output_file)
    with open(json_output_file, 'wt') as f:
        f.write(json.dumps(benchmark_results, indent=4))

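# Calling sketch (illustrative), reusing the DummyDistributionGenerator
# defined in the earlier sketch; 'train.smiles' is a placeholder path.
assess_distribution_learning(model=DummyDistributionGenerator(),
                             training_file_path='train.smiles',
                             json_output_file='output_distribution.json',
                             number_samples=10000)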