def test_HostLogger(mocker, tmpdir):
    """Smoke-test HostLogger: sample for ~1s, then verify the recorded metrics."""
    host_logger = HostLogger(tmpdir, interval=0.1)
    host_logger.start()
    time.sleep(1)
    host_logger.cancel()

    log_file = tmpdir / 'host.json'
    assert log_file.exists()

    client = TrackingClient(log_file)
    entries = client.get_metric('host_log')
    # With a 0.1s interval over ~1s we expect multiple samples.
    assert len(entries) > 1

    top_level_keys = ('execution_mode', 'name', 'cpu', 'memory', 'disk', 'net')
    memory_keys = ('free', 'used', 'available', 'utilization')
    for entry in entries:
        payload = entry['data']
        for key in top_level_keys:
            assert key in payload
        assert 'percent' in payload['cpu']
        for key in memory_keys:
            assert key in payload['memory']
def test_DeviceLogger(mocker, tmpdir):
    """Smoke-test DeviceLogger: sample for ~1s, then verify recorded GPU metrics."""
    device_logger = DeviceLogger(tmpdir, interval=0.1)
    device_logger.start()
    time.sleep(1)
    device_logger.cancel()

    log_file = tmpdir / 'devices.json'
    assert log_file.exists()

    client = TrackingClient(log_file)
    entries = client.get_metric('device_log')
    assert len(entries) > 1

    # No GPUs present: the logger records empty payloads and there is
    # nothing further to verify.
    if entries[0] == {}:
        return

    for entry in entries:
        payload = entry['data']
        gpu = payload['gpu_0']
        assert 'execution_mode' in payload
        assert 'name' in payload
        for key in ('utilization', 'power', 'memory'):
            assert key in gpu
        for key in ('free', 'used'):
            assert key in gpu['memory']
def test_TrackingCallback_multiple_epochs(tmpdir):
    """Fitting for 5 epochs must record 5 epoch logs and exactly one train log.

    Each epoch log carries a duration, loss, and samples/sec; the single
    train log carries the overall duration and samples/sec.
    """
    loader = FakeDataLoader((10, 10), (1, ))
    model = fake_model_fn((10, 10))
    model.compile(loss='binary_crossentropy')

    hook = TrackingCallback(tmpdir, batch_size=1, warmup_steps=0)
    model.fit(loader.to_dataset(batch_size=1), callbacks=[hook], epochs=5)

    # NOTE: the old `expected_calls` zip/chain construction was dead code
    # (never asserted against anything) and has been removed; the counts
    # below verify the same expectation directly from the log database.
    db = TrackingClient(tmpdir / 'logs.json')
    assert len(db.get_metric('epoch_log')) == 5
    assert len(db.get_metric('train_log')) == 1

    epoch_logs = db.get_metric('epoch_log')
    assert isinstance(epoch_logs, list)
    for log in epoch_logs:
        data = log['data']
        assert 'duration' in data
        assert 'loss' in data
        assert 'samples_per_sec' in data

    train_data = db.get_metric('train_log')[0]['data']
    assert 'duration' in train_data
    assert 'samples_per_sec' in train_data
def test_TrackingCallback(tmpdir):
    """A single fit() call must record exactly one train log entry."""
    loader = FakeDataLoader((10, 10), (1, ))
    model = fake_model_fn((10, 10))
    model.compile(loss='binary_crossentropy')

    callback = TrackingCallback(tmpdir, batch_size=1)
    model.fit(loader.to_dataset(batch_size=1), callbacks=[callback])

    client = TrackingClient(tmpdir / 'logs.json')
    assert len(client.get_metric('train_log')) == 1
def test_log_metrics_non_JSON_type(tmpdir):
    """Metrics containing a non-JSON-serializable value (a set) must still be stored."""
    db_name = 'my-benchmark.json'
    client = TrackingClient(tmpdir / db_name)
    client.log_metric('log', {'loss': {1, 2, 3}, 'acc': 0.99}, step=1)

    db_path = Path(tmpdir / db_name).with_suffix('.json')
    assert db_path.exists()

    with TinyDB(db_path) as db:
        assert db.count(Query().type == 'metric') == 1
def create_report(folder):
    """Render a static HTML report (report.html) summarising a benchmark run.

    The report contains the model parameters and metrics from logs.json,
    followed by per-node host and device sections built from the
    '<node>_host.json' / '<node>_devices.json' files in *folder*.
    """
    client = TrackingClient(folder / 'logs.json')
    params = client.get_params()[0]['data']

    # Write the report next to the log files.
    output_file(folder / "report.html")

    widgets = [
        Div(text=""" <h2> Model Parameters </h2> """),
        create_table(params),
        Div(text=""" <h2> Model Metrics </h2> """),
        create_metrics_explorer(client),
    ]

    # Pair up per-node log files; names look like '<node>_host.json'.
    host_logs = {f.name.split('_')[0]: f for f in folder.glob('*_host.json')}
    device_logs = {f.name.split('_')[0]: f for f in folder.glob('*_devices.json')}

    for node_name, host_file in host_logs.items():
        # As in the original, a node with a host log but no devices log
        # raises KeyError here.
        sections = (('Host', host_file), ('Devices', device_logs[node_name]))
        for section, log_file in sections:
            widgets.append(Div(text="<h2>{} {}</h2>".format(node_name, section)))
            node_client = TrackingClient(log_file)
            tags = node_client.get_tags()[0]
            widgets.append(create_table(tags['data']))
            widgets.append(create_metrics_explorer(node_client))

    show(layout(widgets))
def test_create_benchmark(tmpdir):
    """Logging two metric steps must create the DB file holding two records."""
    db_name = 'my-benchmark.json'
    client = TrackingClient(tmpdir / db_name)
    for step in (1, 2):
        client.log_metric('log', {'loss': 1}, step=step)

    db_path = Path(tmpdir / db_name).with_suffix('.json')
    assert db_path.exists()

    with TinyDB(db_path) as db:
        assert db.count(Query().name == 'log') == 2
def test_create_tracking_client_no_folder(tmpdir):
    """TrackingClient must create missing parent folders for its DB file."""
    target_dir = tmpdir / 'test-folder'
    file_name = 'my-benchmark.json'

    TrackingClient(target_dir / file_name)

    db_path = Path(target_dir / file_name).with_suffix('.json')
    assert db_path.exists()
def run(self, log_interval=0.5, **params):
    """Set up, build, and execute the benchmark's configured phases.

    :param log_interval: sampling interval (seconds) for the NodeLogger
        wrapping each phase.
    :param params: benchmark parameters, passed through setup/build and
        each executed phase.
    """
    params = self.setup(**params)
    self.build(**params)

    # Only the root rank persists the run parameters.
    if hvd.rank() == 0:
        db = TrackingClient(Path(self._output_dir) / 'logs.json')
        db.log_param('params', params)

    for label, key in (('Number of Replicas', 'num_replicas'),
                       ('Global Batch Size', 'global_batch_size'),
                       ('Replica Batch Size', 'batch_size')):
        LOGGER.info('{}: {}'.format(label, params[key]))

    # Run each requested phase under system-metric logging: train first,
    # then predict, matching the original order.
    for mode in ('train', 'predict'):
        if mode in params['exec_mode']:
            with NodeLogger(self._output_dir, name=self._node_name,
                            prefix=mode, interval=log_interval):
                getattr(self, mode)(**params)
def __init__(self, benchmark, output_dir):
    """Store the benchmark and output folder; on local rank 0, record
    host and device specs as tags in per-node JSON tracking files.

    :param benchmark: the benchmark object this runner will execute.
    :param output_dir: folder for all log/tracking files (created if missing).
    """
    self._benchmark = benchmark
    self._output_dir = output_dir
    Path(self._output_dir).mkdir(parents=True, exist_ok=True)

    host_spec = HostSpec()
    self._node_name = host_spec.node_name

    # Log system information if on local rank 0
    if hvd.local_rank() == 0:
        # Log host information
        file_name = '{}_host.json'.format(self._node_name)
        db = TrackingClient(Path(self._output_dir) / file_name)

        host_info = {
            'name': host_spec.name,
            'node_name': host_spec.node_name,
            # NOTE(review): 'ip' is assigned node_name, duplicating the key
            # above — looks like a copy-paste slip; confirm whether HostSpec
            # exposes a real IP-address attribute to use here instead.
            'ip': host_spec.node_name,
            'num_cores': host_spec.num_cores,
            'release': host_spec.release,
            'system': host_spec.system,
            'cpu_info': host_spec.cpu_info,
        }

        db.log_tag('host_info', host_info)

        # Log device information
        device_specs = DeviceSpecs()

        file_name = '{}_devices.json'.format(self._node_name)
        db = TrackingClient(Path(self._output_dir) / file_name)

        device_info = {}
        device_info['gpu_count'] = device_specs.device_count

        # One 'gpu_<i>' entry per detected device.
        device_info.update({'gpu_{}'.format(i): device_specs.get_device_info(i)
                            for i in range(device_specs.device_count)})

        db.log_tag('device_info', device_info)