def test_HostLogger(mocker, tmpdir):
    """Run a HostLogger for a moment and check every record it wrote.

    The logger is created with ``interval=0.1`` and left running while the
    test sleeps for one second, so more than one ``host_log`` record is
    expected in ``host.json``. The ``mocker`` fixture is not used here.
    """
    # Required top-level sections, each paired with the keys that must exist
    # inside it. Checked in exactly this order.
    required = (
        ('execution_mode', ()),
        ('name', ()),
        ('cpu', ('percent',)),
        ('memory', ('free', 'used', 'available', 'utilization')),
        ('disk', ()),
        ('net', ()),
    )

    host_logger = HostLogger(tmpdir, interval=0.1)
    host_logger.start()
    time.sleep(1)
    host_logger.cancel()

    assert (tmpdir / 'host.json').exists()

    db = TrackingClient(tmpdir / 'host.json')
    records = db.get_metric('host_log')
    assert len(records) > 1

    for record in records:
        payload = record['data']
        for section, fields in required:
            assert section in payload
            for field in fields:
                assert field in payload[section]
def test_DeviceLogger(mocker, tmpdir):
    """Run a DeviceLogger for a moment and check every record it wrote.

    The logger is created with ``interval=0.1`` and left running while the
    test sleeps for one second, so more than one ``device_log`` record is
    expected in ``devices.json``. The ``mocker`` fixture is not used here.
    """
    device_logger = DeviceLogger(tmpdir, interval=0.1)
    device_logger.start()
    time.sleep(1)
    device_logger.cancel()

    assert (tmpdir / 'devices.json').exists()

    db = TrackingClient(tmpdir / 'devices.json')
    records = db.get_metric('device_log')
    assert len(records) > 1

    # An empty first record ends the test without inspecting anything else.
    # NOTE(review): the loop below reads record['data'], so a whole record
    # equal to {} may never occur; presumably the intent was to skip when no
    # device is present. Confirm against TrackingClient's record layout.
    if records[0] == {}:
        return

    for record in records:
        payload = record['data']
        gpu = payload['gpu_0']
        for key in ('execution_mode', 'name'):
            assert key in payload
        for key in ('utilization', 'power', 'memory'):
            assert key in gpu
        for key in ('free', 'used'):
            assert key in gpu['memory']
def test_TrackingCallback_multiple_epochs(tmpdir):
    """Fit a fake model for several epochs and check the records written.

    A ``TrackingCallback`` is attached to ``model.fit``; afterwards
    ``logs.json`` must hold one ``epoch_log`` record per epoch and a single
    ``train_log`` record, each carrying the expected keys.
    """
    num_epochs = 5

    loader = FakeDataLoader((10, 10), (1, ))
    model = fake_model_fn((10, 10))
    model.compile(loss='binary_crossentropy')
    hook = TrackingCallback(tmpdir, batch_size=1, warmup_steps=0)
    model.fit(loader.to_dataset(batch_size=1), callbacks=[hook],
              epochs=num_epochs)

    db = TrackingClient(tmpdir / 'logs.json')

    # Fetch each metric once and reuse the result for all checks.
    epoch_logs = db.get_metric('epoch_log')
    train_logs = db.get_metric('train_log')

    assert isinstance(epoch_logs, list)
    assert len(epoch_logs) == num_epochs
    assert len(train_logs) == 1

    for record in epoch_logs:
        data = record['data']
        assert 'duration' in data
        assert 'loss' in data
        assert 'samples_per_sec' in data

    train_data = train_logs[0]['data']
    assert 'duration' in train_data
    assert 'samples_per_sec' in train_data
def test_TrackingCallback(tmpdir):
    """Fit a fake model once with a TrackingCallback and expect one train_log record."""
    input_shape = (10, 10)
    data = FakeDataLoader(input_shape, (1, ))
    net = fake_model_fn(input_shape)
    net.compile(loss='binary_crossentropy')

    callback = TrackingCallback(tmpdir, batch_size=1)
    net.fit(data.to_dataset(batch_size=1), callbacks=[callback])

    tracker = TrackingClient(tmpdir / 'logs.json')
    train_records = tracker.get_metric('train_log')
    assert len(train_records) == 1
def test_log_metrics_non_JSON_type(tmpdir):
    """Log a metric whose payload contains a set and expect one stored metric row."""
    file_name = 'my-benchmark.json'
    # A set is the non-JSON value this test is named after.
    payload = {'loss': {1, 2, 3}, 'acc': 0.99}

    tracker = TrackingClient(tmpdir / file_name)
    tracker.log_metric('log', payload, step=1)

    db_path = Path(tmpdir / file_name).with_suffix('.json')
    assert db_path.exists()

    with TinyDB(db_path) as db:
        metric_rows = db.count(Query().type == 'metric')
        assert metric_rows == 1
示例#6
0
def _logs_by_node(folder, suffix):
    """Return a ``{node_name: path}`` dict for the ``*<suffix>`` files in ``folder``.

    The node name is the file name with ``suffix`` cut off the end, so node
    names that themselves contain underscores are kept whole and distinct.
    """
    return {path.name[:-len(suffix)]: path
            for path in folder.glob('*' + suffix)}


def _node_section(heading, log_file):
    """Build the heading, tags table and metrics explorer for one node log file."""
    client = TrackingClient(log_file)
    tags = client.get_tags()[0]
    return [
        Div(text=heading),
        create_table(tags['data']),
        create_metrics_explorer(client),
    ]


def create_report(folder):
    """Assemble the benchmark report for ``folder`` and pass it to ``show``.

    The output target is ``report.html`` inside ``folder``. The report holds
    the model parameters and metrics read from ``logs.json``, followed by a
    host section and a devices section for every ``<node>_host.json`` file
    found in ``folder``.

    Args:
        folder: benchmark output directory; must support ``/`` and ``glob``
            (e.g. a ``pathlib.Path``).

    Raises:
        KeyError: if a ``<node>_host.json`` file has no matching
            ``<node>_devices.json`` file.
    """
    client = TrackingClient(folder / 'logs.json')
    params = client.get_params()[0]
    params = params['data']
    # output to static HTML file
    output_file(folder / "report.html")

    widgets = []

    param_header = Div(text="""
    <h2> Model Parameters </h2>
    """)
    widgets.append(param_header)
    widgets.append(create_table(params))

    widgets.append(Div(text="""
    <h2> Model Metrics </h2>
    """))
    widgets.append(create_metrics_explorer(client))

    host_logs = _logs_by_node(folder, '_host.json')
    device_logs = _logs_by_node(folder, '_devices.json')

    # Pair the files up front so a node without a devices log fails before
    # any per-node widget is built.
    node_logs = [(node_name, host_log, device_logs[node_name])
                 for node_name, host_log in host_logs.items()]

    for node_name, host_log, devices_log in node_logs:
        widgets.extend(_node_section(
            "<h2>{} Host</h2>".format(node_name), host_log))
        widgets.extend(_node_section(
            "<h2>{} Devices</h2>".format(node_name), devices_log))

    show(layout(widgets))
def test_create_benchmark(tmpdir):
    """Log the same metric at two steps and expect two 'log' rows in the file."""
    file_name = 'my-benchmark.json'
    tracker = TrackingClient(tmpdir / file_name)
    for step in (1, 2):
        tracker.log_metric('log', {'loss': 1}, step=step)

    db_path = Path(tmpdir / file_name).with_suffix('.json')
    assert db_path.exists()

    with TinyDB(db_path) as db:
        stored = db.count(Query().name == 'log')
        assert stored == 2
def test_create_tracking_client_no_folder(tmpdir):
    """Create a TrackingClient under a sub-folder the test never created and expect the file to exist."""
    target_dir = tmpdir / 'test-folder'
    file_name = 'my-benchmark.json'

    TrackingClient(target_dir / file_name)

    db_path = Path(target_dir / file_name).with_suffix('.json')
    assert db_path.exists()
示例#9
0
    def run(self, log_interval=0.5, **params):
        """Set up and build the benchmark, then run each requested execution mode.

        ``params`` is passed through ``self.setup`` and the result is used for
        ``self.build``, for logging, and for the train/predict calls. Each
        mode found in ``params['exec_mode']`` runs inside a ``NodeLogger``
        created with ``interval=log_interval`` and the mode name as prefix.
        """
        params = self.setup(**params)
        self.build(**params)

        # Only global rank 0 writes the run parameters to logs.json.
        if hvd.rank() == 0:
            tracker = TrackingClient(Path(self._output_dir) / 'logs.json')
            tracker.log_param('params', params)

        summary = (
            ('Number of Replicas: {}', 'num_replicas'),
            ('Global Batch Size: {}', 'global_batch_size'),
            ('Replica Batch Size: {}', 'batch_size'),
        )
        for template, key in summary:
            LOGGER.info(template.format(params[key]))

        # The mode name doubles as the NodeLogger prefix and as the name of
        # the method to call; training always comes before prediction.
        for mode in ('train', 'predict'):
            if mode not in params['exec_mode']:
                continue
            with NodeLogger(self._output_dir, name=self._node_name, prefix=mode, interval=log_interval):
                getattr(self, mode)(**params)
示例#10
0
    def __init__(self, benchmark, output_dir):
        """Store the benchmark, create the output directory and record system tags.

        The output directory is created if needed. On local rank 0, host
        information is written as a ``host_info`` tag to ``<node>_host.json``
        and device information as a ``device_info`` tag to
        ``<node>_devices.json``, both inside ``output_dir``.
        """
        self._benchmark = benchmark
        self._output_dir = output_dir

        Path(self._output_dir).mkdir(parents=True, exist_ok=True)

        host_spec = HostSpec()
        self._node_name = host_spec.node_name

        # System information is logged on local rank 0 only.
        if hvd.local_rank() != 0:
            return

        out_dir = Path(self._output_dir)

        # Host information
        host_db = TrackingClient(out_dir / '{}_host.json'.format(self._node_name))
        host_db.log_tag('host_info', {
            'name': host_spec.name,
            'node_name': host_spec.node_name,
            # NOTE(review): 'ip' is filled from node_name, the same value as
            # the entry above; confirm whether an address field was intended.
            'ip': host_spec.node_name,
            'num_cores': host_spec.num_cores,
            'release': host_spec.release,
            'system': host_spec.system,
            'cpu_info': host_spec.cpu_info,
        })

        # Device information
        device_specs = DeviceSpecs()
        devices_db = TrackingClient(out_dir / '{}_devices.json'.format(self._node_name))

        # 'gpu_count' goes first, followed by one 'gpu_<i>' entry per device.
        device_info = {'gpu_count': device_specs.device_count}
        for index in range(device_specs.device_count):
            device_info['gpu_{}'.format(index)] = device_specs.get_device_info(index)

        devices_db.log_tag('device_info', device_info)