Example #1
def print_header():
    import horovod.tensorflow as hvd
    if hvd.rank() == 0:
        text = """

               _           _   _                     _
              (_)         | | | |                   | |
      ___  ___ _ _ __ ___ | | | |__   ___ _ __   ___| |__
     / __|/ __| | '_ ` _ \| | | '_ \ / _ \ '_ \ / __| '_ \\
     \__ \ (__| | | | | | | | | |_) |  __/ | | | (__| | | |
     |___/\___|_|_| |_| |_|_| |_.__/ \___|_| |_|\___|_| |_|



        """
        sys.stdout.write(text)
        sys.stdout.write("\n\n")

    LOGGER.info('Version: %s', sciml_bench.__version__)

    from mpi4py import MPI
    data = (MPI.Get_processor_name(), hvd.local_size())
    _comm = MPI.COMM_WORLD
    data = _comm.bcast(data, root=0)

    data = [data] if not isinstance(data, list) else data

    for node_name, local_size in data:
        plurality = 'es' if local_size > 1 else ''
        LOGGER.info('%s has %s process%s', node_name, local_size, plurality)
Example #2
def run_benchmark(benchmark, **params):
    benchmark_name = benchmark.name

    now = datetime.now()
    folder = now.strftime("%Y-%m-%d-%H%M")

    params['data_dir'] = Path(params['data_dir']) / benchmark_name
    params['model_dir'] = str(Path(params['model_dir']).joinpath(benchmark_name).joinpath(folder))
    params['metrics'] = list(benchmark.metrics)
    params['batch_size'] = benchmark.batch_size

    # create the model directory if it does not yet exist
    Path(params['model_dir']).mkdir(parents=True, exist_ok=True)

    if not isinstance(benchmark, TensorflowKerasMixin):
        raise RuntimeError("Expected benchmark to be a tensorflow model but it was not!")

    LOGGER.debug('Benchmark %s', benchmark.name)
    LOGGER.debug('Loss %s', benchmark.loss)
    LOGGER.debug('Batch size %s', benchmark.batch_size)
    LOGGER.debug('Optimizer %s', benchmark.optimizer)
    LOGGER.debug('Epochs %s', benchmark.epochs)

    runner = TensorflowKerasBenchmarkRunner(benchmark, output_dir=params['model_dir'])
    runner.run(**params)
Example #3
def create_benchmark(name):
    if name not in BENCHMARK_REGISTRY or len(BENCHMARK_REGISTRY[name]) == 0:
        raise RuntimeError("Benchmark {} does not exist in registry!".format(name))

    benchmark_cls = BENCHMARK_REGISTRY[name][-1]
    LOGGER.debug('Benchmark implementation is {}'.format(benchmark_cls.__name__))
    benchmark = benchmark_cls()
    return benchmark
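
A hypothetical usage sketch (not from the source): it assumes that register(), as called in Examples #4 and #12, populates BENCHMARK_REGISTRY, and that 'some_benchmark' is a placeholder name rather than a real registry entry.

# Hypothetical usage: register() and the benchmark name are assumptions here.
register()

try:
    benchmark = create_benchmark('some_benchmark')
except RuntimeError as err:
    # Raised when the name is missing from BENCHMARK_REGISTRY
    print(err)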
Example #4
def download(benchmark_names):
    register()
    config = load_config()
    data_dir = Path(config.get('data_dir')).expanduser()

    for name in benchmark_names:
        if name not in BENCHMARKS and name != 'all':
            LOGGER.error('No benchmark with name {}'.format(name))
            sys.exit(1)

    if 'all' in benchmark_names:
        benchmark_names = BENCHMARKS.keys()

    for name in benchmark_names:
        sync_datasets(name, data_dir)
Example #5
    def predict(self, lr_warmup=3, **params):
        if self._model is None:
            raise RuntimeError("Model has not been built!\n \
                    Please call benchmark.build() first to compile the model!")

        # Add hooks for Horovod
        hooks = [
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            hvd.callbacks.MetricAverageCallback(),
        ]

        if hvd.rank() == 0:
            # These hooks only need to be called by one instance.
            # Therefore we only need to add them on rank 0
            tracker_hook = TrackingCallback(self._output_dir, params['global_batch_size'], self._log_batch)
            hooks.append(tracker_hook)

        LOGGER.info('Begin Predict...')

        model_dir = Path(self._output_dir)
        weights_file = model_dir / 'final_weights.h5'

        # Edge case: user is trying to run inference but not training
        # See if we can find a pre-trained model from another run
        # If not then throw an error as we're in an inconsistent state.
        if not weights_file.exists():
            LOGGER.info('Searching for pre-trained models')

            weight_files = model_dir.parent.glob('**/*final_weights.h5')
            weight_files = list(sorted(weight_files))
            if len(weight_files) == 0:
                raise RuntimeError("No pre-trained model exists! Please train a model before running inference!")
            weights_file = weight_files[-1]

        LOGGER.info('Using weights file: {}'.format(str(weights_file)))
        self._model.load_weights(str(weights_file))

        dataset = self.benchmark.validation_data_loader_.to_dataset()
        verbose = 1 if params.get('verbosity', 0) > 1 and hvd.rank() == 0 else 0

        LOGGER.debug('Evaluate Start')
        self._model.evaluate(dataset, callbacks=hooks, verbose=verbose)
        LOGGER.debug('Evaluate End')
Example #6
def register_all_objects(module_dir=None):
    from sciml_bench.core.bench_logger import LOGGER

    if module_dir is None:
        module_dir = Path(__file__).parent.absolute()
    else:
        module_dir = Path(module_dir).expanduser()

    LOGGER.debug('Importing modules from {}'.format(module_dir))

    _benchmark_modules = module_dir.glob('**/*.py')
    for module_name in _benchmark_modules:
        module_path = module_name
        module_path = str(module_path)
        LOGGER.debug(module_path)

        try:
            spec = importlib.util.spec_from_file_location(
                module_path.replace('/', '.'), module_path)
            foo = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(foo)
        except ModuleNotFoundError:
            LOGGER.debug(
                'Skipping module {} due to module not found error'.format(
                    module_path))
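
For reference, a minimal self-contained sketch of the same importlib-based dynamic loading used above; the helper name import_module_from_path is illustrative only.

import importlib.util
from pathlib import Path

def import_module_from_path(path):
    # Mirror the spec_from_file_location / module_from_spec / exec_module
    # sequence used by register_all_objects() above.
    path = Path(path)
    spec = importlib.util.spec_from_file_location(path.stem, str(path))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module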
Example #7
def set_environment_variables(cpu_only=False, use_amp=False, **kwargs):
    # Optimization flags
    os.environ['CUDA_CACHE_DISABLE'] = '0'

    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'

    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'

    os.environ['TF_ADJUST_HUE_FUSED'] = 'data'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = 'data'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = 'data'

    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'

    if cpu_only:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    if use_amp:
        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'

    if kwargs['verbosity'] >= 3 and kwargs['log_level'] == 'debug':
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '-1'

    # Try and import tensorflow to check for any issues
    try:
        import tensorflow as tf
        devices = tf.config.list_physical_devices('GPU')
        if len(devices) == 0 and not cpu_only:
            LOGGER.warning('No available GPUs could be detected. This could be because no GPU exists or could be due to a mismatch between CUDA runtime version and the compute capability of the system hardware. Check that the CUDA drivers are correctly installed on your system. Set verbosity = 3 and check the output of the Tensorflow logs.')
        #     sys.exit(1)
    except Exception as e:
        LOGGER.debug(traceback.format_exc())
        LOGGER.critical('Fatal issue importing Tensorflow: %s', e)
        sys.exit(1)
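
A hypothetical invocation (not from the source): the function indexes kwargs['verbosity'] and kwargs['log_level'] directly, so both must always be supplied; the values below are placeholders.

# Hypothetical call with placeholder values; 'verbosity' and 'log_level'
# are required because the function reads them unconditionally.
set_environment_variables(cpu_only=False,
                          use_amp=True,
                          verbosity=1,
                          log_level='info')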
Example #8
    def build(self, log_batch=False, **params):
        self._log_batch = log_batch

        self._model = self.benchmark.model(input_shape=self.benchmark.data_loader_.input_shape, **params)

        opt = self.benchmark.optimizer_
        opt_cfg = opt.get_config()
        opt_cfg['learning_rate'] *= hvd.size()
        opt = opt.from_config(opt_cfg)
        opt = hvd.DistributedOptimizer(opt)

        loss = self.benchmark.loss_
        LOGGER.debug(loss.__name__)
        metrics = self.benchmark.metrics

        self._model.compile(loss=loss,
                    optimizer=opt,
                    metrics=metrics,
                    experimental_run_tf_function=False)

        if hvd.rank() == 0:
            model_dir = Path(self._output_dir)
            model_dir.mkdir(parents=True, exist_ok=True)
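
A minimal standalone sketch of the learning-rate scaling pattern used in build(), assuming Horovod's Keras bindings (horovod.tensorflow.keras) and a stock Keras optimizer; it illustrates the technique and is not part of the benchmark runner.

import horovod.tensorflow.keras as hvd
import tensorflow as tf

hvd.init()

# Scale the base learning rate by the number of workers, then wrap the
# optimizer so gradients are averaged across workers, as build() does above.
base_opt = tf.keras.optimizers.Adam()
cfg = base_opt.get_config()
cfg['learning_rate'] *= hvd.size()
opt = hvd.DistributedOptimizer(base_opt.from_config(cfg))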
Example #9
    def train(self, **params):
        verbose = 1 if params.get('verbosity', 0) > 1 and hvd.rank() == 0 else 0

        if self._model is None:
            raise RuntimeError("Model has not been built!\n \
                    Please call benchmark.build() first to compile the model!")

        # Add hooks for Horovod
        hooks = [
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            hvd.callbacks.MetricAverageCallback(),
        ]

        if hvd.rank() == 0:
            # These hooks only need to be called by one instance.
            # Therefore we only need to add them on rank 0
            tracker_hook = TrackingCallback(self._output_dir, params['global_batch_size'], self._log_batch)
            hooks.append(tracker_hook)

        # Add hook for capturing metrics vs. epoch
        log_file = Path(self._output_dir).joinpath('training.log')
        log_file = str(log_file)
        csv_logger = tf.keras.callbacks.CSVLogger(log_file)
        hooks.append(csv_logger)

        LOGGER.info('Begin Training...')
        LOGGER.info('Training for {} epochs'.format(self.benchmark.epochs))

        dataset = self.benchmark.data_loader_.to_dataset()

        LOGGER.debug('Fitting Start')

        self._model.fit(dataset,
                epochs=self.benchmark.epochs,
                callbacks=hooks,
                verbose=verbose, **self.benchmark.fit_params)

        LOGGER.debug('Fitting End')

        if hvd.rank() == 0:
            model_dir = Path(self._output_dir)
            weights_file = str(model_dir / 'final_weights.h5')
            self._model.save_weights(weights_file)
Example #10
    def run(self, log_interval=0.5, **params):

        params = self.setup(**params)
        self.build(**params)

        if hvd.rank() == 0:
            db = TrackingClient(Path(self._output_dir) / 'logs.json')
            db.log_param('params', params)

        LOGGER.info('Number of Replicas: {}'.format(params['num_replicas']))
        LOGGER.info('Global Batch Size: {}'.format(params['global_batch_size']))
        LOGGER.info('Replica Batch Size: {}'.format(params['batch_size']))

        if 'train' in params['exec_mode']:
            with NodeLogger(self._output_dir, name=self._node_name, prefix='train', interval=log_interval):
                self.train(**params)

        if 'predict' in params['exec_mode']:
            with NodeLogger(self._output_dir, name=self._node_name, prefix='predict', interval=log_interval):
                self.predict(**params)
Example #11
def sync_datasets(benchmark_name, data_dir):
    # Grab latest DB file from STFC s3
    LOGGER.info('Downloading Dataset Database at {}'.format(DB_URI))
    download_file(DB_URI, DB_FILE_NAME)
    conn = sqlite3.connect(DB_FILE_NAME)
    exports_db = pd.read_sql("select * from exports", con=conn)

    # clean up db file
    os.remove(DB_FILE_NAME)

    # parse files in buckets
    bucket_name = exports_db.loc[exports_db.detail ==
                                 benchmark_name].bucket.values[0]
    dataset_uri = ''.join([STFC_S3_URI, bucket_name])

    LOGGER.info('Dataset uri {}'.format(dataset_uri))
    response = requests.get(dataset_uri)

    tree = BeautifulSoup(response.content, 'lxml')

    bucket_contents = tree.findAll('contents')
    bucket_contents = [{item.name: item.text
                        for item in c} for c in bucket_contents]
    bucket_contents = pd.DataFrame(bucket_contents)

    bucket_contents['name'] = bucket_contents.key.str.split('/').map(
        lambda s: s[0])
    bucket_contents = bucket_contents.loc[benchmark_name ==
                                          bucket_contents.name]

    # download data from bucket
    start_time = time.time()
    LOGGER.info('Downloading data for {}'.format(benchmark_name))

    for index, row in bucket_contents.iterrows():
        file_name = Path(row.key)
        dest_file = Path(data_dir) / file_name

        # Skip files that have already been downloaded to the data directory
        if dest_file.exists():
            LOGGER.info('{} already downloaded'.format(file_name.name))
            continue

        LOGGER.info('Downloading {}'.format(file_name.name))

        file_uri = '/'.join([dataset_uri, str(file_name)])

        dest_file.parent.mkdir(parents=True, exist_ok=True)
        download_file(file_uri, dest_file)

    end_time = time.time()

    LOGGER.info('Total Download Time (s): {}'.format(end_time - start_time))
Example #12
def run(benchmark_names, skip, **params):
    # Load configuration for benchmarks
    config = load_config()

    params['model_dir'] = params['model_dir'] if params['model_dir'] is not None else config['model_dir']
    params['data_dir'] = params['data_dir'] if params['data_dir'] is not None else config['data_dir']

    config.update(params)

    LOGGER.setLevel(params.get('log_level').upper())
    if params.get('verbosity') < 2:
        LOGGER.setLevel(logging.WARNING)
    if params.get('verbosity') == 0:
        LOGGER.setLevel(logging.CRITICAL)

    set_environment_variables(**params)

    if params.get('verbosity') >= 2:
        print_header()

    register()

    for name in benchmark_names:
        if name not in BENCHMARKS and name != 'all':
            LOGGER.error('No benchmark with name {}'.format(name))
            sys.exit(1)

    model_dir = Path(config['model_dir']).expanduser()
    data_dir = Path(config['data_dir']).expanduser()

    if not data_dir.exists():
        LOGGER.error("Data directory {} does not exist!".format(data_dir))
        sys.exit(1)

    LOGGER.info('Model directory is: %s', str(model_dir))
    LOGGER.info('Data directory is: %s', str(data_dir))

    # If no benchmarks are specified, or 'all' is requested, run everything
    if len(benchmark_names) == 0 or 'all' in benchmark_names:
        benchmark_names = BENCHMARKS.keys()

    # Log which benchmarks we will run
    LOGGER.info('Selected the following benchmarks:')
    for name in benchmark_names:
        LOGGER.info('{}'.format(str(name)))

    # Ok, run all requested benchmarks
    for name in benchmark_names:

        LOGGER.info('Running %s benchmark', name)

        benchmark_data_dir = data_dir / name

        if not benchmark_data_dir.exists():
            LOGGER.error('Data directory {} does not exist! Is the data for benchmark {} downloaded?'.format(str(benchmark_data_dir), name))

            if skip:
                LOGGER.error('Skipping benchmark {}'.format(name))
                continue
            else:
                sys.exit(1)

        cfg = dict(config[name]) if name in config else {}
        cfg.update(config)
        cfg['data_dir'] = benchmark_data_dir

        benchmark = BENCHMARKS[name](**cfg)

        try:
            run_benchmark(benchmark, **cfg)
        except Exception as e:
            LOGGER.debug(traceback.format_exc())
            LOGGER.error('Failed to run benchmark {} due to unhandled exception.\n{}'.format(name, e))

            if skip:
                LOGGER.info('Skipping benchmark {}'.format(name))
                continue
            else:
                sys.exit(1)