def print_header():
    import horovod.tensorflow as hvd

    if hvd.rank() == 0:
        text = """
          _           _   _                     _
         (_)         | | | |                   | |
 ___  ___ _ _ __ ___ | | | |__   ___ _ __   ___| |__
/ __|/ __| | '_ ` _ \| | | '_ \ / _ \ '_ \ / __| '_ \\
\__ \ (__| | | | | | | | | |_) |  __/ | | | (__| | | |
|___/\___|_|_| |_| |_|_| |_.__/ \___|_| |_|\___|_| |_|
        """
        sys.stdout.write(text)
        sys.stdout.write("\n\n")

        LOGGER.info('Version: %s', sciml_bench.__version__)

        from mpi4py import MPI

        # Broadcast this node's name and local Horovod process count from rank 0
        data = (MPI.Get_processor_name(), hvd.local_size())
        _comm = MPI.COMM_WORLD
        data = _comm.bcast(data, root=0)
        data = [data] if not isinstance(data, list) else data

        for node_name, local_size in data:
            # Pluralize "process" per node, based on that node's local worker count
            plurality = 'es' if local_size > 1 else ''
            LOGGER.info('%s has %s process%s', node_name, local_size, plurality)
def run_benchmark(benchmark, **params):
    benchmark_name = benchmark.name

    now = datetime.now()
    folder = now.strftime("%Y-%m-%d-%H%M")

    params['data_dir'] = Path(params['data_dir']) / benchmark_name
    params['model_dir'] = str(Path(params['model_dir']) / benchmark_name / folder)
    params['metrics'] = list(benchmark.metrics)
    params['batch_size'] = benchmark.batch_size

    # Create the model directory if it does not yet exist
    Path(params['model_dir']).mkdir(parents=True, exist_ok=True)

    if not isinstance(benchmark, TensorflowKerasMixin):
        raise RuntimeError("Expected benchmark to be a tensorflow model but it was not!")

    LOGGER.debug('Benchmark %s', benchmark.name)
    LOGGER.debug('Loss %s', benchmark.loss)
    LOGGER.debug('Batch size %s', benchmark.batch_size)
    LOGGER.debug('Optimizer %s', benchmark.optimizer)
    LOGGER.debug('Epochs %s', benchmark.epochs)

    runner = TensorflowKerasBenchmarkRunner(benchmark, output_dir=params['model_dir'])
    runner.run(**params)
def create_benchmark(name):
    if name not in BENCHMARK_REGISTRY or len(BENCHMARK_REGISTRY[name]) == 0:
        raise RuntimeError("Benchmark {} does not exist in registry!".format(name))

    benchmark_cls = BENCHMARK_REGISTRY[name][-1]
    LOGGER.debug('Benchmark implementation is {}'.format(benchmark_cls.__name__))

    benchmark = benchmark_cls()
    return benchmark
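
# Illustrative sketch (not part of the original module): shows how create_benchmark()
# above resolves a name, assuming BENCHMARK_REGISTRY maps each benchmark name to a
# list of implementations and the most recent registration wins. The class
# `ExampleBenchmark` and the key 'example_benchmark' are hypothetical.
def _example_register_and_create():
    class ExampleBenchmark:
        name = 'example_benchmark'

    # Later registrations are appended, so create_benchmark picks the newest one ([-1])
    BENCHMARK_REGISTRY.setdefault('example_benchmark', []).append(ExampleBenchmark)
    return create_benchmark('example_benchmark')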
def download(benchmark_names):
    register()

    config = load_config()
    data_dir = Path(config.get('data_dir')).expanduser()

    for name in benchmark_names:
        if name not in BENCHMARKS and name != 'all':
            LOGGER.error('No benchmark with name {}'.format(name))
            sys.exit(1)

    if 'all' in benchmark_names:
        benchmark_names = BENCHMARKS.keys()

    for name in benchmark_names:
        sync_datasets(name, data_dir)
def predict(self, lr_warmup=3, **params):
    if self._model is None:
        raise RuntimeError("Model has not been built!\n"
                           "Please call benchmark.build() first to compile the model!")

    # Add hooks for Horovod
    hooks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
    ]

    if hvd.rank() == 0:
        # These hooks only need to be called by one instance,
        # therefore we only add them on rank == 0
        tracker_hook = TrackingCallback(self._output_dir, params['global_batch_size'], self._log_batch)
        hooks.append(tracker_hook)

    LOGGER.info('Begin Predict...')

    model_dir = Path(self._output_dir)
    weights_file = model_dir / 'final_weights.h5'

    # Edge case: the user is trying to run inference without having trained.
    # See if we can find a pre-trained model from another run.
    # If not, throw an error as we're in an inconsistent state.
    if not weights_file.exists():
        LOGGER.info('Searching for pre-trained models')

        weight_files = model_dir.parent.glob('**/*final_weights.h5')
        weight_files = list(sorted(weight_files))

        if len(weight_files) == 0:
            raise RuntimeError("No pre-trained model exists! Please train a model before running inference!")

        weights_file = weight_files[-1]

    LOGGER.info('Using weights file: {}'.format(str(weights_file)))
    self._model.load_weights(str(weights_file))

    dataset = self.benchmark.validation_data_loader_.to_dataset()

    verbose = 1 if params.get('verbosity', 0) > 1 and hvd.rank() == 0 else 0

    LOGGER.debug('Evaluate Start')
    self._model.evaluate(dataset, callbacks=hooks, verbose=verbose)
    LOGGER.debug('Evaluate End')
def register_all_objects(module_dir=None):
    from sciml_bench.core.bench_logger import LOGGER

    if module_dir is None:
        module_dir = Path(__file__).parent.absolute()
    else:
        module_dir = Path(module_dir).expanduser()

    LOGGER.debug('Importing modules from {}'.format(module_dir))

    _benchmark_modules = module_dir.glob('**/*.py')
    for module_name in _benchmark_modules:
        module_path = str(module_name)
        LOGGER.debug(module_path)

        try:
            spec = importlib.util.spec_from_file_location(
                module_path.replace('/', '.'), module_path)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
        except ModuleNotFoundError:
            LOGGER.debug(
                'Skipping module {} due to module not found error'.format(
                    module_path))
def set_environment_variables(cpu_only=False, use_amp=False, **kwargs):
    # Optimization flags
    os.environ['CUDA_CACHE_DISABLE'] = '0'
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ADJUST_HUE_FUSED'] = 'data'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = 'data'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = 'data'
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'

    if cpu_only:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    if use_amp:
        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'

    if kwargs['verbosity'] >= 3 and kwargs['log_level'] == 'debug':
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '-1'

    # Try to import Tensorflow to check for any issues
    try:
        import tensorflow as tf
        devices = tf.config.list_physical_devices('GPU')
        if len(devices) == 0 and not cpu_only:
            LOGGER.warning('No available GPUs could be detected. This could be because no GPU exists, '
                           'or because of a mismatch between the CUDA runtime version and the compute '
                           'capability of the system hardware. Check that the CUDA drivers are correctly '
                           'installed on your system. Set verbosity = 3 and check the output of the '
                           'Tensorflow logs.')
            # sys.exit(1)
    except Exception as e:
        LOGGER.debug(traceback.format_exc())
        LOGGER.critical('Fatal issue importing Tensorflow: %s', e)
        sys.exit(1)
def build(self, log_batch=False, **params):
    self._log_batch = log_batch
    self._model = self.benchmark.model(input_shape=self.benchmark.data_loader_.input_shape, **params)

    # Scale the learning rate by the number of Horovod workers (linear scaling rule)
    opt = self.benchmark.optimizer_
    opt_cfg = opt.get_config()
    opt_cfg['learning_rate'] *= hvd.size()
    opt = opt.from_config(opt_cfg)
    opt = hvd.DistributedOptimizer(opt)

    loss = self.benchmark.loss_
    LOGGER.debug(loss.__name__)

    metrics = self.benchmark.metrics
    self._model.compile(loss=loss,
                        optimizer=opt,
                        metrics=metrics,
                        experimental_run_tf_function=False)

    if hvd.rank() == 0:
        model_dir = Path(self._output_dir)
        model_dir.mkdir(parents=True, exist_ok=True)
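
# Illustrative sketch (not part of the original module): the linear learning-rate
# scaling applied in build() above, shown for a plain Keras optimizer without Horovod.
# The base rate of 0.001 and the worker count of 4 are hypothetical example values.
def _example_lr_scaling(num_workers=4):
    import tensorflow as tf

    opt = tf.keras.optimizers.Adam(learning_rate=0.001)
    cfg = opt.get_config()
    cfg['learning_rate'] *= num_workers  # 0.001 -> 0.004 for 4 workers
    return opt.from_config(cfg)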
def train(self, **params):
    verbose = 1 if params.get('verbosity', 0) > 1 and hvd.rank() == 0 else 0

    if self._model is None:
        raise RuntimeError("Model has not been built!\n"
                           "Please call benchmark.build() first to compile the model!")

    # Add hooks for Horovod
    hooks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
    ]

    if hvd.rank() == 0:
        # These hooks only need to be called by one instance,
        # therefore we only add them on rank == 0
        tracker_hook = TrackingCallback(self._output_dir, params['global_batch_size'], self._log_batch)
        hooks.append(tracker_hook)

        # Add hook for capturing metrics vs. epoch
        log_file = Path(self._output_dir).joinpath('training.log')
        log_file = str(log_file)
        csv_logger = tf.keras.callbacks.CSVLogger(log_file)
        hooks.append(csv_logger)

    LOGGER.info('Begin Training...')
    LOGGER.info('Training for {} epochs'.format(self.benchmark.epochs))

    dataset = self.benchmark.data_loader_.to_dataset()

    LOGGER.debug('Fitting Start')
    self._model.fit(dataset,
                    epochs=self.benchmark.epochs,
                    callbacks=hooks,
                    verbose=verbose,
                    **self.benchmark.fit_params)
    LOGGER.debug('Fitting End')

    if hvd.rank() == 0:
        model_dir = Path(self._output_dir)
        weights_file = str(model_dir / 'final_weights.h5')
        self._model.save_weights(weights_file)
def run(self, log_interval=0.5, **params):
    params = self.setup(**params)
    self.build(**params)

    if hvd.rank() == 0:
        db = TrackingClient(Path(self._output_dir) / 'logs.json')
        db.log_param('params', params)

    LOGGER.info('Number of Replicas: {}'.format(params['num_replicas']))
    LOGGER.info('Global Batch Size: {}'.format(params['global_batch_size']))
    LOGGER.info('Replica Batch Size: {}'.format(params['batch_size']))

    if 'train' in params['exec_mode']:
        with NodeLogger(self._output_dir, name=self._node_name, prefix='train', interval=log_interval):
            self.train(**params)

    if 'predict' in params['exec_mode']:
        with NodeLogger(self._output_dir, name=self._node_name, prefix='predict', interval=log_interval):
            self.predict(**params)
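
# Illustrative sketch (not part of the original module): the relationship between the
# replica batch size and the global batch size reported in run() above, assuming the
# usual data-parallel convention. The values 32 and 4 are hypothetical examples.
def _example_global_batch_size(batch_size=32, num_replicas=4):
    return batch_size * num_replicas  # e.g. 32 * 4 == 128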
def sync_datasets(benchmark_name, data_dir):
    # Grab the latest DB file from the STFC S3 store
    LOGGER.info('Downloading Dataset Database at {}'.format(DB_URI))
    download_file(DB_URI, DB_FILE_NAME)

    conn = sqlite3.connect(DB_FILE_NAME)
    exports_db = pd.read_sql("select * from exports", con=conn)

    # Clean up the DB file
    os.remove(DB_FILE_NAME)

    # Parse files in buckets
    bucket_name = exports_db.loc[exports_db.detail == benchmark_name].bucket.values[0]
    dataset_uri = ''.join([STFC_S3_URI, bucket_name])
    LOGGER.info('Dataset uri {}'.format(dataset_uri))

    response = requests.get(dataset_uri)
    tree = BeautifulSoup(response.content, 'lxml')

    bucket_contents = tree.findAll('contents')
    bucket_contents = [{item.name: item.text for item in c} for c in bucket_contents]
    bucket_contents = pd.DataFrame(bucket_contents)
    bucket_contents['name'] = bucket_contents.key.str.split('/').map(
        lambda s: s[0])
    bucket_contents = bucket_contents.loc[benchmark_name == bucket_contents.name]

    # Download data from the bucket
    start_time = time.time()
    LOGGER.info('Downloading data for {}'.format(benchmark_name))

    for index, row in bucket_contents.iterrows():
        file_name = Path(row.key)
        file_uri = '/'.join([dataset_uri, str(file_name)])

        # Check the destination path under data_dir, not the key relative to the CWD
        dest_file = Path(data_dir) / file_name
        if dest_file.exists():
            LOGGER.info('{} already downloaded'.format(file_name.name))
            continue

        LOGGER.info('Downloading {}'.format(file_name.name))
        dest_file.parent.mkdir(parents=True, exist_ok=True)
        download_file(file_uri, dest_file)

    end_time = time.time()
    LOGGER.info('Total Download Time (s): {}'.format(end_time - start_time))
def run(benchmark_names, skip, **params):
    # Load configuration for benchmarks
    config = load_config()
    params['model_dir'] = params['model_dir'] if params['model_dir'] is not None else config['model_dir']
    params['data_dir'] = params['data_dir'] if params['data_dir'] is not None else config['data_dir']
    config.update(params)

    LOGGER.setLevel(params.get('log_level').upper())

    if params.get('verbosity') < 2:
        LOGGER.setLevel(logging.WARNING)

    if params.get('verbosity') == 0:
        LOGGER.setLevel(logging.CRITICAL)

    set_environment_variables(**params)

    if params.get('verbosity') >= 2:
        print_header()

    register()

    for name in benchmark_names:
        if name not in BENCHMARKS and name != 'all':
            LOGGER.error('No benchmark with name {}'.format(name))
            sys.exit(1)

    model_dir = Path(config['model_dir']).expanduser()
    data_dir = Path(config['data_dir']).expanduser()

    if not data_dir.exists():
        LOGGER.error("Data directory {} does not exist!".format(data_dir))
        sys.exit(1)

    LOGGER.info('Model directory is: %s', str(model_dir))
    LOGGER.info('Data directory is: %s', str(data_dir))

    # If no benchmarks are specified, or 'all' is given, run everything
    if len(benchmark_names) == 0 or 'all' in benchmark_names:
        benchmark_names = BENCHMARKS.keys()

    # Log which benchmarks we will run
    LOGGER.info('Selected the following benchmarks:')
    for name in benchmark_names:
        LOGGER.info('{}'.format(str(name)))

    # Run all requested benchmarks
    for name in benchmark_names:
        LOGGER.info('Running %s benchmark', name)

        benchmark_data_dir = data_dir / name
        if not benchmark_data_dir.exists():
            LOGGER.error('Data directory {} does not exist! Is the data for benchmark {} downloaded?'.format(str(benchmark_data_dir), name))
            if skip:
                LOGGER.error('Skipping benchmark {}'.format(name))
                continue
            else:
                sys.exit(1)

        cfg = dict(config[name]) if name in config else {}
        cfg.update(config)
        cfg['data_dir'] = benchmark_data_dir

        benchmark = BENCHMARKS[name](**cfg)

        try:
            run_benchmark(benchmark, **cfg)
        except Exception as e:
            LOGGER.debug(traceback.format_exc())
            LOGGER.error('Failed to run benchmark {} due to unhandled exception.\n{}'.format(name, e))
            if skip:
                LOGGER.info('Skipping benchmark {}'.format(name))
                continue
            else:
                sys.exit(1)