Exemplo n.º 1
0
def test_gpu_memory_profiler_gluon():
    """Check that the GPU memory profile names every Gluon parameter.

    Runs one forward/backward pass of a small hybridized network on the GPU
    with the profiler running, dumps the profile, and then verifies in
    ``gpu_memory_profile-pid_<pid>.csv`` that

    * every parameter has an ``in_arg`` and an ``arg_grad`` entry whose
      requested size equals ``4 * number of elements`` (the factor 4 is
      hard-coded; presumably float32 parameters -- confirm), and
    * no allocation is attributed to an unknown scope.
    """
    enable_profiler(profile_filename='test_profiler.json',
                    run=True,
                    continuous_dump=True)
    profiler.set_state('run')

    model = nn.HybridSequential()
    model.add(nn.Dense(128, activation='tanh'))
    model.add(nn.Dropout(0.5))
    model.add(nn.Dense(64, activation='tanh'), nn.Dense(32, in_units=64))
    model.add(nn.Activation('relu'))
    model.initialize(ctx=mx.gpu())
    model.hybridize()

    # NOTE(review): never read below; kept in case creating the symbol is
    # relevant to the profile -- confirm and remove.
    inputs = mx.sym.var('data')

    with mx.autograd.record():
        out = model(mx.nd.zeros((16, 10), ctx=mx.gpu()))
    out.backward()
    mx.nd.waitall()
    profiler.set_state('stop')
    profiler.dump(True)

    # Parse the CSV exactly once.  Re-iterating a DictReader after seek(0)
    # would replay the header line as a data row and re-read the file for
    # every parameter.
    with open('gpu_memory_profile-pid_%d.csv' % (os.getpid()),
              mode='r') as csv_file:
        rows = list(csv.DictReader(csv_file))

    for row in rows:
        print(",".join(list(row.values())))

    # We are only checking for weight parameters here, also making sure that
    # there are no unknown entries in the memory profile.
    for scope in ['in_arg', 'arg_grad']:
        for param in model.collect_params().values():
            expected_arg_name = "%s:%s:" % (model.name, scope) + param.name
            expected_arg_size = str(4 * np.prod(param.shape))
            # Only the first row carrying the expected name is inspected.
            entry = next((row for row in rows
                          if row['Attribute Name'] == expected_arg_name),
                         None)
            assert entry is not None, \
                "Entry for attr_name={} has not been found" \
                .format(expected_arg_name)
            assert entry['Requested Size'] == expected_arg_size, \
                "requested size={} is not equal to the expected size={}" \
                .format(entry['Requested Size'], expected_arg_size)

    # Make sure that there is no unknown allocation entry.
    for row in rows:
        assert row['Attribute Name'] not in ("<unk>:unknown", "<unk>:"), \
            "Unknown allocation entry has been encountered"
Exemplo n.º 2
0
def test_continuous_profile_and_instant_marker():
    """Dump the profile repeatedly while emitting one instant marker per round.

    After every round the on-disk profile must be at least as large as it was
    after the previous round, and the final aggregate dump string must be
    non-empty.
    """
    file_name = 'test_continuous_profile_and_instant_marker.json'
    enable_profiler(file_name, True, True, True)
    python_domain = profiler.Domain('PythonDomain::test_continuous_profile')

    previous_size = 0
    for iteration in range(5):
        marker = profiler.Marker(python_domain,
                                 "StartIteration-%d" % iteration)
        marker.mark('process')
        test_profile_event(False)
        test_profile_counter(False)
        profiler.dump(False)
        # The dump file must never shrink between rounds.
        current_size = os.path.getsize(file_name)
        assert current_size >= previous_size
        previous_size = current_size

    profiler.dump(False)
    aggregate_text = profiler.dumps()
    assert len(aggregate_text) > 0
    profiler.set_state('stop')
Exemplo n.º 3
0
def test_continuous_profile_and_instant_marker():
    """Dump the profile repeatedly while emitting one instant marker per round.

    After every round the size of ``test_profile.json`` must be at least what
    it was after the previous round, and the final aggregate dump string
    (which is also printed) must be non-empty.
    """
    enable_profiler(True, True, True)
    python_domain = profiler.Domain('PythonDomain::test_continuous_profile')

    previous_size = 0
    for iteration in range(5):
        marker = profiler.Marker(python_domain,
                                 "StartIteration-%d" % iteration)
        marker.mark('process')
        print("{}...".format(iteration))
        test_profile_event(False)
        test_profile_counter(False)
        profiler.dump(False)
        # The dump file must never shrink between rounds.
        current_size = os.path.getsize("test_profile.json")
        assert current_size >= previous_size
        previous_size = current_size

    profiler.dump(False)
    aggregate_text = profiler.dumps()
    assert len(aggregate_text) > 0
    print(aggregate_text)
    profiler.set_state('stop')
Exemplo n.º 4
0
def test_aggregate_duplication():
    """Verify that aggregate stats count each operator call exactly once.

    Issues one ``sqrt`` and two scalar additions, then checks the JSON
    aggregate dump reports ``Count`` 1 for ``sqrt`` and 2 for
    ``_plus_scalar`` under ``Time`` -> ``operator``.
    """
    file_name = 'test_aggregate_duplication.json'
    enable_profiler(profile_filename=file_name,
                    run=True,
                    continuous_dump=True,
                    aggregate_stats=True)

    inp = mx.nd.zeros(shape=(100, 100))
    root = mx.nd.sqrt(inp)
    for _ in range(2):
        inp = inp + 1
    mx.nd.waitall()
    profiler.dump(False)

    stats = json.loads(profiler.dumps(format='json'))
    assert 'Time' in stats and 'operator' in stats['Time']
    operator_stats = stats['Time']['operator']
    for op_name in ('sqrt', '_plus_scalar'):
        assert op_name in operator_stats \
            and 'Count' in operator_stats[op_name]

    # sqrt was issued once, the scalar addition twice
    assert operator_stats['sqrt']['Count'] == 1
    assert operator_stats['_plus_scalar']['Count'] == 2
    profiler.set_state('stop')
Exemplo n.º 5
0
def test_profiler():
    """Profile a window of iterations of a symbolic 4096x4096 ``dot``.

    Runs ``iter_num`` forward passes on ``mx.cpu(1)``.  The profiler is
    switched to ``'run'`` at the start of iteration ``begin_profiling_iter``
    and to ``'stop'`` at the start of iteration ``end_profiling_iter``; the
    processor time spent in between is printed and the profile is dumped.
    """
    iter_num = 5
    begin_profiling_iter = 2
    end_profiling_iter = 4

    enable_profiler('test_profiler.json', False, False)

    A = mx.sym.Variable('A')
    B = mx.sym.Variable('B')
    C = mx.symbol.dot(A, B)

    executor = C.simple_bind(mx.cpu(1),
                             'write',
                             A=(4096, 4096),
                             B=(4096, 4096))

    a = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
    b = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))

    a.copyto(executor.arg_dict['A'])
    b.copyto(executor.arg_dict['B'])

    print("execution begin")
    for i in range(iter_num):
        print("Iteration {}/{}".format(i + 1, iter_num))
        if i == begin_profiling_iter:
            # time.clock() was removed in Python 3.8; process_time() is the
            # documented replacement for measuring processor time.
            t0 = time.process_time()
            profiler.set_state('run')
        if i == end_profiling_iter:
            t1 = time.process_time()
            profiler.set_state('stop')
        executor.forward()
        c = executor.outputs[0]
        c.wait_to_read()
    print("execution end")
    duration = t1 - t0
    # Only the iterations inside [begin, end) were timed, so average over
    # those rather than over every iteration of the loop.
    profiled_iters = end_profiling_iter - begin_profiling_iter
    print('duration: {0}s'.format(duration))
    print('          {0}ms/operator'.format(duration * 1000 / profiled_iters))
    profiler.dump(True)
    profiler.set_state('stop')
Exemplo n.º 6
0
def test_profiler():
    """Profile a window of iterations of a symbolic 4096x4096 ``dot``.

    Runs ``iter_num`` forward passes on ``mx.cpu(1)``.  The profiler is
    switched to ``'run'`` at the start of iteration ``begin_profiling_iter``
    and to ``'stop'`` at the start of iteration ``end_profiling_iter``; the
    processor time spent in between is printed and the profile is dumped.
    """
    iter_num = 5
    begin_profiling_iter = 2
    end_profiling_iter = 4

    enable_profiler(False, False)

    A = mx.sym.Variable('A')
    B = mx.sym.Variable('B')
    C = mx.symbol.dot(A, B)

    executor = C.simple_bind(mx.cpu(1), 'write', A=(4096, 4096), B=(4096, 4096))

    a = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
    b = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))

    a.copyto(executor.arg_dict['A'])
    b.copyto(executor.arg_dict['B'])

    print("execution begin")
    for i in range(iter_num):
        print("Iteration {}/{}".format(i + 1, iter_num))
        if i == begin_profiling_iter:
            # time.clock() was removed in Python 3.8; process_time() is the
            # documented replacement for measuring processor time.
            t0 = time.process_time()
            profiler.set_state('run')
        if i == end_profiling_iter:
            t1 = time.process_time()
            profiler.set_state('stop')
        executor.forward()
        c = executor.outputs[0]
        c.wait_to_read()
    print("execution end")
    duration = t1 - t0
    # Only the iterations inside [begin, end) were timed, so average over
    # those rather than over every iteration of the loop.
    profiled_iters = end_profiling_iter - begin_profiling_iter
    print('duration: {0}s'.format(duration))
    print('          {0}ms/operator'.format(duration * 1000 / profiled_iters))
    profiler.dump(True)
    profiler.set_state('stop')
Exemplo n.º 7
0
def test_profiler():
    """Profile a window of iterations of a symbolic 4096x4096 ``dot``.

    Five forward passes run on ``mx.cpu(1)``; the profiler is set to
    ``'run'`` at the start of iteration 2 and to ``'stop'`` at the start of
    iteration 4, after which the profile is dumped.
    """
    iter_num = 5
    begin_profiling_iter = 2
    end_profiling_iter = 4

    enable_profiler('test_profiler.json', False, False)

    lhs_sym = mx.sym.Variable('A')
    rhs_sym = mx.sym.Variable('B')
    product_sym = mx.symbol.dot(lhs_sym, rhs_sym)

    shape = (4096, 4096)
    executor = product_sym._simple_bind(mx.cpu(1), 'write', A=shape, B=shape)

    # Draw both random operands first, then copy them into the bound arguments.
    lhs_value = mx.random.uniform(-1.0, 1.0, shape=shape)
    rhs_value = mx.random.uniform(-1.0, 1.0, shape=shape)
    for arg_name, value in (('A', lhs_value), ('B', rhs_value)):
        value.copyto(executor.arg_dict[arg_name])

    for step in range(iter_num):
        # The two boundaries are distinct iterations, so at most one fires.
        if step == begin_profiling_iter:
            started_at = time.process_time()
            profiler.set_state('run')
        elif step == end_profiling_iter:
            stopped_at = time.process_time()
            profiler.set_state('stop')
        executor.forward()
        result = executor.outputs[0]
        result.wait_to_read()

    elapsed = stopped_at - started_at
    profiler.dump(True)
    profiler.set_state('stop')
Exemplo n.º 8
0
def test_profile_create_domain_dept():
    """Create a profiler domain after configuring with ``profile_symbolic``.

    Presumably exercises a deprecated configuration path (the ``_dept``
    suffix) -- confirm.  The test passes if configuring, starting, creating
    the domain, dumping and stopping all complete without raising.
    """
    config = {
        'profile_symbolic': True,
        'filename': 'test_profile_create_domain_dept.json',
    }
    profiler.set_config(**config)
    profiler.set_state('run')
    # Keep a reference so the domain object stays alive across the dump.
    python_domain = profiler.Domain(name='PythonDomain')
    profiler.dump()
    profiler.set_state('stop')
Exemplo n.º 9
0
def test_gpu_memory_profiler_symbolic():
    """Check the GPU memory profile of a symbolic ``dot`` under profiler scopes.

    Builds ``dot(A, B)`` inside the ``tensordot`` scope, creates the random
    inputs inside the ``init`` scope, runs one forward and one backward pass
    on the GPU, and then verifies that ``gpu_memory_profile-pid_<pid>.csv``
    contains every expected (attribute name, requested size) pair and no
    allocation attributed to an unknown scope.
    """
    enable_profiler('test_profiler.json')
    profiler.set_state('run')

    with profiler.scope("tensordot"):
        A = mx.sym.Variable('A')
        B = mx.sym.Variable('B')
        C = mx.symbol.dot(A, B, name='dot')

    executor = C._simple_bind(mx.gpu(),
                              'write',
                              A=(1024, 2048),
                              B=(2048, 4096))

    with profiler.scope("init"):
        a = mx.random.uniform(-1.0, 1.0, shape=(1024, 2048))
        b = mx.random.uniform(-1.0, 1.0, shape=(2048, 4096))

    a.copyto(executor.arg_dict['A'])
    b.copyto(executor.arg_dict['B'])

    executor.forward()
    executor.backward()
    c = executor.outputs[0]
    mx.nd.waitall()
    profiler.set_state('stop')
    profiler.dump(True)

    # Sizes are 4 * element count (factor hard-coded; presumably float32).
    expected_alloc_entries = [{
        'Attribute Name': 'tensordot:in_arg:A',
        'Requested Size': str(4 * a.size)
    }, {
        'Attribute Name': 'tensordot:in_arg:B',
        'Requested Size': str(4 * b.size)
    }, {
        'Attribute Name': 'tensordot:dot',
        'Requested Size': str(4 * c.size)
    }, {
        'Attribute Name': 'init:_random_uniform',
        'Requested Size': str(4 * a.size)
    }, {
        'Attribute Name': 'init:_random_uniform',
        'Requested Size': str(4 * b.size)
    }]

    # Sample gpu_memory_profile.csv:
    # "Attribute Name","Requested Size","Device","Actual Size","Reuse?"
    # init:_random_uniform,33554432,0,33554432,1
    # init:_random_uniform,8388608,0,8388608,1
    # resource:temp_space (sample_op.h +365),8,0,4096,0
    # symbol:arg_grad:unknown,8388608,0,8388608,0
    # symbol:arg_grad:unknown,33554432,0,33554432,0
    # tensordot:dot,16777216,0,16777216,0
    # tensordot:dot_backward,33554432,0,33554432,0
    # tensordot:dot_backward,8388608,0,8388608,0
    # tensordot:dot_head_grad,16777216,0,16777216,0
    # tensordot:in_arg:A,8388608,0,8388608,0
    # tensordot:in_arg:B,33554432,0,33554432,0

    # Parse the CSV exactly once.  Re-iterating a DictReader after seek(0)
    # would replay the header line as a data row and re-read the file for
    # every expected entry.
    with open('gpu_memory_profile-pid_%d.csv' % (os.getpid()),
              mode='r') as csv_file:
        rows = list(csv.DictReader(csv_file))

    # TODO: Remove this print statement later on.
    for row in rows:
        print(",".join(list(row.values())))

    for expected_alloc_entry in expected_alloc_entries:
        # Several rows may share a name, so name and size must match together.
        entry_found = any(
            row['Attribute Name'] == expected_alloc_entry['Attribute Name'] and
            row['Requested Size'] == expected_alloc_entry['Requested Size']
            for row in rows)
        assert entry_found, \
            "Entry for (attr_name={}, alloc_size={}) has not been found" \
            .format(expected_alloc_entry['Attribute Name'],
                    expected_alloc_entry['Requested Size'])

    # Make sure that there is no unknown allocation entry.
    for row in rows:
        assert row['Attribute Name'] not in ("<unk>:unknown", "<unk>:"), \
            "Unknown allocation entry has been encountered"
Exemplo n.º 10
0
def test_gpu_memory_profiler_gluon():
    """Check that the GPU memory profile names every Gluon parameter.

    Runs one forward/backward pass of a small hybridized network on the GPU
    with the profiler running, dumps the profile, and then verifies in
    ``gpu_memory_profile-pid_<pid>.csv`` that every parameter has an
    ``in_arg`` entry under its own profiler scope with a requested size of
    ``4 * number of elements`` (factor hard-coded; presumably float32), and
    that no allocation is attributed to an unknown scope.
    """
    enable_profiler(profile_filename='test_profiler.json')
    profiler.set_state('run')

    model = nn.HybridSequential()
    model.add(nn.Dense(128, activation='tanh'))
    model.add(nn.Dropout(0.5))
    model.add(nn.Dense(64, activation='tanh'), nn.Dense(32, in_units=64))
    model.add(nn.Activation('relu'))
    model.initialize(ctx=mx.gpu())
    model.hybridize()

    # NOTE(review): never read below; kept in case creating the symbol is
    # relevant to the profile -- confirm and remove.
    inputs = mx.sym.var('data')

    with mx.autograd.record():
        out = model(mx.nd.zeros((16, 10), ctx=mx.gpu()))
    out.backward()
    mx.nd.waitall()
    profiler.set_state('stop')
    profiler.dump(True)

    # Sample gpu_memory_profile.csv:
    # "Attribute Name","Requested Size","Device","Actual Size","Reuse?"
    # <unk>:in_arg:data,640,0,4096,0
    # hybridsequential:activation0:hybridsequential_activation0_fwd,2048,0,4096,0
    # hybridsequential:activation0:hybridsequential_activation0_fwd_backward,8192,0,8192,0
    # hybridsequential:activation0:hybridsequential_activation0_fwd_head_grad,2048,0,4096,0
    # hybridsequential:dense0:activation0:hybridsequential_dense0_activation0_fwd,8192,0,8192,0
    # hybridsequential:dense0:arg_grad:bias,512,0,4096,0
    # hybridsequential:dense0:arg_grad:weight,5120,0,8192,0
    # hybridsequential:dense0:hybridsequential_dense0_fwd,8192,0,8192,0
    # hybridsequential:dense0:in_arg:bias,512,0,4096,0
    # hybridsequential:dense0:in_arg:weight,5120,0,8192,0
    # hybridsequential:dense1:activation0:hybridsequential_dense1_activation0_fwd,4096,0,4096,0
    # hybridsequential:dense1:arg_grad:bias,256,0,4096,0
    # hybridsequential:dense1:arg_grad:weight,32768,0,32768,0
    # hybridsequential:dense1:hybridsequential_dense1_fwd,4096,0,4096,0
    # hybridsequential:dense1:in_arg:bias,256,0,4096,0
    # hybridsequential:dense1:in_arg:weight,32768,0,32768,0
    # hybridsequential:dense2:arg_grad:bias,128,0,4096,0
    # hybridsequential:dense2:arg_grad:weight,8192,0,8192,0
    # hybridsequential:dense2:hybridsequential_dense2_fwd_backward,4096,0,4096,1
    # hybridsequential:dense2:in_arg:bias,128,0,4096,0
    # hybridsequential:dense2:in_arg:weight,8192,0,8192,0
    # hybridsequential:dropout0:hybridsequential_dropout0_fwd,8192,0,8192,0
    # hybridsequential:dropout0:hybridsequential_dropout0_fwd,8192,0,8192,0
    # resource:cudnn_dropout_state (dropout-inl.h +256),1474560,0,1474560,0
    # resource:temp_space (fully_connected-inl.h +316),15360,0,16384,0

    # Parse the CSV exactly once.  Re-iterating a DictReader after seek(0)
    # would replay the header line as a data row and re-read the file for
    # every parameter.
    with open('gpu_memory_profile-pid_%d.csv' % (os.getpid()),
              mode='r') as csv_file:
        rows = list(csv.DictReader(csv_file))

    # TODO: Remove this print statement later on.
    for row in rows:
        print(",".join(list(row.values())))

    # We are only checking for weight parameters here, also making sure that
    # there are no unknown entries in the memory profile.
    for param in model.collect_params().values():
        expected_arg_name = "%sin_arg:" % param.var().attr('__profiler_scope__') + \
                            param.name
        expected_arg_size = str(4 * np.prod(param.shape))
        entry_found = any(
            row['Attribute Name'] == expected_arg_name and
            row['Requested Size'] == expected_arg_size
            for row in rows)
        assert entry_found, \
            "Entry for (attr_name={}, alloc_size={}) has not been found" \
            .format(expected_arg_name,
                    expected_arg_size)

    # Make sure that there is no unknown allocation entry.
    for row in rows:
        assert row['Attribute Name'] not in ("<unk>:unknown", "<unk>:"), \
            "Unknown allocation entry has been encountered"
Exemplo n.º 11
0
def train(
    args,
    model,
    train_sampler,
    valid_samplers=None,
    rank=0,
    rel_parts=None,
    barrier=None,
):
    """Run the single-process MXNet training loop for ``args.max_step`` steps.

    Each step draws a positive/negative graph pair from ``train_sampler``,
    runs ``model.forward`` under autograd, back-propagates the loss and calls
    ``model.update``.  Averaged log values and the elapsed time are printed
    every ``args.log_interval`` steps; when ``args.valid`` is set and
    ``valid_samplers`` is given, ``test`` is run every ``args.eval_interval``
    steps (for steps greater than 1).  If the module-level ``mxprofiler``
    flag is truthy, the MXNet profiler is started at step 1 and dumped at the
    end.  ``barrier`` is accepted but not used by this function.
    """
    assert args.num_proc <= 1, "MXNet KGE does not support multi-process now"
    assert (args.rel_part == False
            ), "No need for relation partition in single process for MXNet KGE"

    for name in vars(args):
        logging.info("{:20}:{}".format(name, getattr(args, name)))

    # -1 means "no GPU configured".
    gpu_id = -1
    if len(args.gpu) > 0:
        if args.mix_cpu_gpu and args.num_proc > 1:
            gpu_id = args.gpu[rank % len(args.gpu)]
        else:
            gpu_id = args.gpu[0]

    if args.strict_rel_part:
        model.prepare_relation(mx.gpu(gpu_id))

    if mxprofiler:
        from mxnet import profiler

        profiler_config = dict(
            profile_all=True,
            aggregate_stats=True,
            continuous_dump=True,
            filename="profile_output.json",
        )
        profiler.set_config(**profiler_config)

    logs = []
    start = time.time()
    for step in range(0, args.max_step):
        pos_g, neg_g = next(train_sampler)
        args.step = step
        if step == 1 and mxprofiler:
            profiler.set_state("run")

        with mx.autograd.record():
            loss, log = model.forward(pos_g, neg_g, gpu_id)
        loss.backward()
        logs.append(log)
        model.update(gpu_id)

        if step % args.log_interval == 0:
            # Report the mean of every logged quantity since the last report.
            for metric in logs[0].keys():
                average = sum(entry[metric] for entry in logs) / len(logs)
                print("[Train]({}/{}) average {}: {}".format(
                    step, args.max_step, metric, average))
            logs = []
            print(time.time() - start)
            start = time.time()

        should_validate = (args.valid and step % args.eval_interval == 0
                           and step > 1 and valid_samplers is not None)
        if should_validate:
            start = time.time()
            test(args, model, valid_samplers, mode="Valid")
            print("test:", time.time() - start)

    if args.strict_rel_part:
        model.writeback_relation(rank, rel_parts)

    if mxprofiler:
        nd.waitall()
        profiler.set_state("stop")
        profiler.dump()
        print(profiler.dumps())

    # drop any remaining cached log entries
    logs = []
Exemplo n.º 12
0
def test_gpu_memory_profiler_gluon():
    """Check that the GPU memory profile names every Gluon parameter.

    Runs one forward/backward pass of a small hybridized network (prefix
    ``net_``) on the GPU with the profiler running, dumps the profile, and
    then verifies in ``gpu_memory_profile-pid_<pid>.csv`` that

    * every parameter has a ``net:in_arg:<key>`` and a ``net:arg_grad:<key>``
      entry whose requested size equals ``4 * number of elements`` (factor
      hard-coded; presumably float32 -- confirm), and
    * no allocation is attributed to an unknown scope.
    """
    enable_profiler(profile_filename='test_profiler.json',
                    run=True,
                    continuous_dump=True)
    profiler.set_state('run')

    model = nn.HybridSequential(prefix='net_')
    with model.name_scope():
        model.add(nn.Dense(128, activation='tanh'))
        model.add(nn.Dropout(0.5))
        model.add(nn.Dense(64, activation='tanh'), nn.Dense(32, in_units=64))
        model.add(nn.Activation('relu'))
    model.initialize(ctx=mx.gpu())
    model.hybridize()

    # NOTE(review): never read below; kept in case creating the symbol is
    # relevant to the profile -- confirm and remove.
    inputs = mx.sym.var('data')

    with mx.autograd.record():
        out = model(mx.nd.zeros((16, 10), ctx=mx.gpu()))
    out.backward()
    mx.nd.waitall()
    profiler.set_state('stop')
    profiler.dump(True)

    # Sample gpu_memory_profiler.csv
    # "Attribute Name","Requested Size","Device","Actual Size","Reuse?"
    # "<unk>:in_arg:data","640","0","4096","0"
    # "net:arg_grad:net_dense0_bias","512","0","4096","0"
    # "net:arg_grad:net_dense0_weight","5120","0","8192","0"
    # "net:arg_grad:net_dense1_bias","256","0","4096","0"
    # "net:arg_grad:net_dense1_weight","32768","0","32768","0"
    # "net:arg_grad:net_dense2_bias","128","0","4096","0"
    # "net:arg_grad:net_dense2_weight","8192","0","8192","0"
    # "net:dense0:net_dense0_fwd","8192","0","8192","0"
    # "net:dense0:tanh:net_dense0_tanh_fwd","8192","0","8192","0"
    # "net:dense1:net_dense1_fwd","4096","0","4096","0"
    # "net:dense1:tanh:net_dense1_tanh_fwd","4096","0","4096","0"
    # "net:dense2:net_dense2_fwd","2048","0","4096","0"
    # "net:dense2:net_dense2_fwd_backward","4096","0","4096","0"
    # "net:dropout0:net_dropout0_fwd","8192","0","8192","0"
    # "net:dropout0:net_dropout0_fwd","8192","0","8192","0"
    # "net:in_arg:net_dense0_bias","512","0","4096","0"
    # "net:in_arg:net_dense0_weight","5120","0","8192","0"
    # "net:in_arg:net_dense1_bias","256","0","4096","0"
    # "net:in_arg:net_dense1_weight","32768","0","32768","0"
    # "net:in_arg:net_dense2_bias","128","0","4096","0"
    # "net:in_arg:net_dense2_weight","8192","0","8192","0"
    # "net:relu0:net_relu0_fwd","2048","0","4096","0"
    # "net:relu0:net_relu0_fwd_backward","8192","0","8192","0"
    # "net:relu0:net_relu0_fwd_head_grad","2048","0","4096","0"
    # "resource:cudnn_dropout_state (dropout-inl.h +258)","1671168","0","1671168","0"
    # "resource:temp_space (fully_connected-inl.h +316)","34816","0","36864","0"

    # Parse the CSV exactly once.  Re-iterating a DictReader after seek(0)
    # would replay the header line as a data row and re-read the file for
    # every parameter.
    with open('gpu_memory_profile-pid_%d.csv' % (os.getpid()),
              mode='r') as csv_file:
        rows = list(csv.DictReader(csv_file))

    # We are only checking for weight parameters here, also making sure that
    # there are no unknown entries in the memory profile.
    for scope in ['in_arg', 'arg_grad']:
        for key, param in model.collect_params().items():
            expected_arg_name = "net:%s:" % scope + key
            expected_arg_size = str(4 * np.prod(param.shape))
            # Only the first row carrying the expected name is inspected.
            entry = next((row for row in rows
                          if row['Attribute Name'] == expected_arg_name),
                         None)
            assert entry is not None, \
                "Entry for attr_name={} has not been found" \
                .format(expected_arg_name)
            assert entry['Requested Size'] == expected_arg_size, \
                "requested size={} is not equal to the expected size={}" \
                .format(entry['Requested Size'], expected_arg_size)

    # Make sure that there is no unknown allocation entry.
    for row in rows:
        assert row['Attribute Name'] not in ("<unk>:unknown", "<unk>:"), \
            "Unknown allocation entry has been encountered"
Exemplo n.º 13
0
def test_gpu_memory_profiler_symbolic():
    """Check the GPU memory profile of a symbolic ``dot`` under a profiler scope.

    Builds ``dot(A, B)`` inside the ``tensordot`` scope, binds it on the GPU,
    runs ``iter_num`` forward passes, and then verifies that
    ``gpu_memory_profile-pid_<pid>.csv`` has an entry with the expected
    requested size for the inputs, their gradients, the output and its head
    gradient.
    """
    iter_num = 5

    enable_profiler('test_profiler.json', False, False)
    profiler.set_state('run')

    with profiler.Scope("tensordot"):
        A = mx.sym.Variable('A')
        B = mx.sym.Variable('B')
        C = mx.symbol.dot(A, B, name='dot')

    executor = C.simple_bind(mx.gpu(), 'write', A=(4096, 4096), B=(4096, 4096))

    a = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
    b = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))

    a.copyto(executor.arg_dict['A'])
    b.copyto(executor.arg_dict['B'])

    for _ in range(iter_num):
        executor.forward()
        c = executor.outputs[0]
        mx.nd.waitall()
    profiler.set_state('stop')
    profiler.dump(True)

    # Sizes are 4 * element count (factor hard-coded; presumably float32).
    expected_alloc_entries = [{
        'Attribute Name': 'tensordot:in_arg:A',
        'Requested Size': str(4 * a.size)
    }, {
        'Attribute Name': 'tensordot:in_arg:B',
        'Requested Size': str(4 * b.size)
    }, {
        'Attribute Name': 'tensordot:arg_grad:A',
        'Requested Size': str(4 * a.size)
    }, {
        'Attribute Name': 'tensordot:arg_grad:B',
        'Requested Size': str(4 * b.size)
    }, {
        'Attribute Name': 'tensordot:dot',
        'Requested Size': str(4 * c.size)
    }, {
        'Attribute Name': 'tensordot:dot_head_grad',
        'Requested Size': str(4 * c.size)
    }]

    # Sample gpu_memory_profile.csv:
    # "Attribute Name","Requested Size","Device","Actual Size","Reuse?"
    # "tensordot:arg_grad:A","67108864","0","67108864","0"
    # "tensordot:arg_grad:B","67108864","0","67108864","0"
    # "tensordot:dot","67108864","0","67108864","0"
    # "tensordot:dot_head_grad","67108864","0","67108864","0"
    # "tensordot:in_arg:A","67108864","0","67108864","0"
    # "tensordot:in_arg:B","67108864","0","67108864","0"

    # Parse the CSV exactly once.  Re-iterating a DictReader after seek(0)
    # would replay the header line as a data row and re-read the file for
    # every expected entry.
    with open('gpu_memory_profile-pid_%d.csv' % (os.getpid()),
              mode='r') as csv_file:
        rows = list(csv.DictReader(csv_file))

    for expected_alloc_entry in expected_alloc_entries:
        expected_name = expected_alloc_entry['Attribute Name']
        expected_size = expected_alloc_entry['Requested Size']
        # Only the first row carrying the expected name is inspected.
        entry = next((row for row in rows
                      if row['Attribute Name'] == expected_name), None)
        assert entry is not None, \
            "Entry for attr_name={} has not been found" \
            .format(expected_name)
        assert entry['Requested Size'] == expected_size, \
            "requested size={} is not equal to the expected size={}" \
            .format(entry['Requested Size'], expected_size)
Exemplo n.º 14
0
def custom_operator_profiling_multiple_custom_ops(seed, mode, file_name):
    """Profile two separately registered custom operators and check the stats.

    ``mode`` selects how the operators are invoked: ``'imperative'`` calls
    ``mx.nd.Custom`` directly, ``'symbolic'`` binds and runs one executor per
    operator; any other value invokes neither.  ``seed`` is not used by this
    function.  The JSON aggregate statistics are passed to
    ``check_custom_operator_profiling_multiple_custom_ops_output``.
    """
    class MyAdd(mx.operator.CustomOp):
        # Forward adds one to the input; backward passes the gradient through.
        def forward(self, is_train, req, in_data, out_data, aux):
            self.assign(out_data[0], req[0], in_data[0] + 1)

        def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
            self.assign(in_grad[0], req[0], out_grad[0])

    class _MyAddPropBase(mx.operator.CustomOpProp):
        # Shared property definition for both registered operator names.
        def __init__(self):
            super(_MyAddPropBase, self).__init__(need_top_grad=True)

        def list_arguments(self):
            return ['data']

        def list_outputs(self):
            return ['output']

        def infer_shape(self, in_shape):
            # returned as (inputs, outputs, aux)
            data_shape = in_shape[0]
            return [data_shape], [data_shape], []

        def create_operator(self, ctx, shapes, dtypes):
            return MyAdd()

    @mx.operator.register('MyAdd1')
    class MyAdd1Prop(_MyAddPropBase):
        pass

    @mx.operator.register('MyAdd2')
    class MyAdd2Prop(_MyAddPropBase):
        pass

    enable_profiler(profile_filename=file_name,
                    run=True,
                    continuous_dump=True,
                    aggregate_stats=True)
    # start from empty aggregate stats
    profiler.dumps(reset=True)

    inp = mx.nd.zeros(shape=(100, 100))
    if mode == 'imperative':
        first_out = mx.nd.Custom(inp, op_type='MyAdd1')
        second_out = mx.nd.Custom(inp, op_type='MyAdd2')
    elif mode == 'symbolic':
        data_sym = mx.symbol.Variable('a')
        first_sym = mx.symbol.Custom(data=data_sym, op_type='MyAdd1')
        second_sym = mx.symbol.Custom(data=data_sym, op_type='MyAdd2')
        first_exec = first_sym.bind(mx.cpu(), {'a': inp})
        second_exec = second_sym.bind(mx.cpu(), {'a': inp})
        first_out = first_exec.forward()
        second_out = second_exec.forward()
    mx.nd.waitall()

    profiler.dump(False)
    stats_json = profiler.dumps(format='json')
    check_custom_operator_profiling_multiple_custom_ops_output(stats_json)
    profiler.set_state('stop')
Exemplo n.º 15
0
       # print(key, aux_params[key])
        param_size += aux_params[key].size * 4
    print("Parameter size", param_size / 1024 / 1024, " MB")

    repeat_times = 10
    profiler.set_state('run')
    # profiler.pause()
    # train 5 epochs, i.e. going over the data iter one pass
    start = time.time()
    for epoch in range(5):
        train_data.reset()
        metric.reset()
        for i, batch in enumerate(train_data):
            if i == 1:
                profiler.resume()
            mod.forward(batch, is_train=True)       # compute predictions
            mod.update_metric(metric, batch.label)  # accumulate prediction accuracy
            mod.backward()                          # compute gradients
            mod.update()                            # update parameters
            if i == repeat_times: # benchmark 100 iterations
                break

        # print('Epoch %d, Training %s' % (epoch, metric.get()))
        mx.nd.waitall()
        profiler.set_state('stop')
        profiler.dump()
        end = time.time()
        time_per_img = (end - start) * 1.0 / batch_size / repeat_times
        print("batch\tthreshold\tthread number\ttime per image\tmemory (GB)")
        print("%d\t%d\t%s\t%s\t%f" %(batch_size, threshold, os.environ["MXNET_CPU_WORKER_NTHREADS"], time_per_img,  cpuStats()))
Exemplo n.º 16
0
def main():
    """Train the ``SKT_Lite`` network on CIFAR-10 from command-line options.

    Reads the options returned by ``parse_args()``, builds the cosine
    learning-rate schedule, the augmentation pipelines and the output
    directories, runs ``train`` and finally closes the summary writer.

    Side effects:
        * creates ``./<model_name>`` and makes it the current working
          directory, so every relative path used afterwards resolves there;
        * configures the root logger (stream handler, plus a file handler
          when ``opt.logging_dir`` is set);
        * writes history/loss plots, TensorBoard events and, when saving is
          enabled, parameter checkpoints;
        * when ``opt.profile_mode`` is set, profiles the second training
          iteration and dumps the profile after training.

    Note: ``opt.model`` is only used to name output files; the network that
    is trained is always ``SKT_Lite()``.
    """
    import matplotlib
    # Select the non-interactive backend before pyplot is imported.
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    opt = parse_args()
    batch_size = opt.batch_size
    classes = 10

    num_gpus = opt.num_gpus
    # opt.batch_size is per device; scale it to the global batch size.
    batch_size *= max(1, num_gpus)
    context = [mx.gpu(i)
               for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = opt.num_workers

    # 50000 is presumably the CIFAR-10 training-set size, which makes this
    # the number of optimizer steps per epoch.
    steps_per_epoch = 50000 // batch_size
    lr_sch = lr_scheduler.CosineScheduler(steps_per_epoch * opt.num_epochs,
                                          base_lr=opt.lr,
                                          warmup_steps=5 * steps_per_epoch,
                                          final_lr=1e-5)

    model_name = opt.model
    net = SKT_Lite()
    if opt.mixup:
        model_name += '_mixup'
    if opt.amp:
        model_name += '_amp'

    makedirs('./' + model_name)
    os.chdir('./' + model_name)
    # os.path.join yields the same '.\\tb\\<model>' on Windows as the former
    # hard-coded literal, and a real nested directory on other platforms.
    sw = SummaryWriter(
        logdir=os.path.join('.', 'tb', model_name), flush_secs=5, verbose=False)
    makedirs(opt.save_plot_dir)

    if opt.resume_from:
        net.load_parameters(opt.resume_from, ctx=context)
    optimizer = 'nag'

    # Checkpointing is enabled only when both a directory and a period are
    # given; otherwise both are reset to falsy values.
    save_period = opt.save_period
    if opt.save_dir and save_period:
        save_dir = opt.save_dir
        makedirs(save_dir)
    else:
        save_dir = ''
        save_period = 0

    plot_name = opt.save_plot_dir

    logging_handlers = [logging.StreamHandler()]
    if opt.logging_dir:
        logging_dir = opt.logging_dir
        makedirs(logging_dir)
        logging_handlers.append(logging.FileHandler(
            '%s/train_cifar10_%s.log' % (logging_dir, model_name)))

    logging.basicConfig(level=logging.INFO, handlers=logging_handlers)
    logging.info(opt)

    if opt.amp:
        amp.init()

    if opt.profile_mode:
        profiler.set_config(profile_all=True,
                            aggregate_stats=True,
                            continuous_dump=True,
                            filename='%s_profile.json' % model_name)

    transform_train = transforms.Compose([
        gcv_transforms.RandomCrop(32, pad=4),
        CutOut(8),
        transforms.RandomFlipLeftRight(),
        transforms.Resize(32),
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465],
                             [0.2023, 0.1994, 0.2010])
    ])

    transform_test = transforms.Compose([
        transforms.Resize(32),
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465],
                             [0.2023, 0.1994, 0.2010])
    ])

    def label_transform(label, classes):
        """Return a one-hot ``(len(label), classes)`` array on label's context."""
        ind = label.astype('int')
        res = nd.zeros((ind.shape[0], classes), ctx=label.context)
        res[nd.arange(ind.shape[0], ctx=label.context), ind] = 1
        return res

    def test(ctx, val_data):
        """Evaluate ``net`` on ``val_data``.

        Returns a ``(metric_name, accuracy, mean_loss)`` tuple, where the
        loss is averaged over the number of samples actually seen.
        """
        metric = mx.metric.Accuracy()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        test_loss = 0
        num_samples = 0
        for batch in val_data:
            data = gluon.utils.split_and_load(
                batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(
                batch[1], ctx_list=ctx, batch_axis=0)
            outputs = [net(X) for X in data]
            loss = [loss_fn(yhat, y) for yhat, y in zip(outputs, label)]
            metric.update(label, outputs)
            test_loss += sum([l.sum().asscalar() for l in loss])
            num_samples += batch[0].shape[0]
        # The validation loader keeps its (possibly partial) last batch, so
        # normalise by the real sample count, not batch_size * len(loader).
        test_loss /= max(num_samples, 1)
        name, val_acc = metric.get()
        return name, val_acc, test_loss

    def train(epochs, ctx):
        """Run ``epochs`` training epochs of ``net`` on the contexts ``ctx``.

        After every epoch the validation set is evaluated once, history and
        loss plots are refreshed, scalars are sent to the summary writer and
        checkpoints are written when saving is enabled.
        """
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        root = os.path.join('..', 'datasets', 'cifar-10')
        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer = gluon.Trainer(net.collect_params(), optimizer,
                                {'learning_rate': opt.lr, 'wd': opt.wd,
                                 'momentum': opt.momentum, 'lr_scheduler': lr_sch})
        if opt.amp:
            amp.init_trainer(trainer)
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        # Mixup produces dense (soft) labels, so sparse labels are only
        # usable when mixup is off.
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=not opt.mixup)
        train_history = TrainingHistory(['training-error', 'validation-error'])
        loss_history = TrainingHistory(['training-loss', 'validation-loss'])

        iteration = 0
        best_val_score = 0

        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1

            for batch in train_data:
                # Only the second iteration overall is profiled.
                profile_step = (epoch == 0 and iteration == 1
                                and opt.profile_mode)
                if profile_step:
                    profiler.set_state('run')
                lam = np.random.beta(alpha, alpha)
                # Mixup is switched off for the last 20 epochs.
                if epoch >= epochs - 20 or not opt.mixup:
                    lam = 1

                data_1 = gluon.utils.split_and_load(
                    batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(
                    batch[1], ctx_list=ctx, batch_axis=0)

                if not opt.mixup:
                    data = data_1
                    label = label_1
                else:
                    # Blend every sample with the batch taken in reverse order.
                    data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                    label = []
                    for Y in label_1:
                        y1 = label_transform(Y, classes)
                        y2 = label_transform(Y[::-1], classes)
                        label.append(lam*y1 + (1-lam)*y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                if opt.amp:
                    with ag.record():
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                else:
                    for l in loss:
                        l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                metric.update(label_1, output_softmax)
                sw.add_scalar(tag='lr', value=trainer.learning_rate,
                              global_step=iteration)
                if profile_step:
                    nd.waitall()
                    profiler.set_state('stop')
                iteration += 1

            # The training loader discards the last partial batch, so every
            # batch holds exactly batch_size samples.
            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            _, train_acc = metric.get()
            # One validation pass per epoch provides both accuracy and loss.
            name, val_acc, val_loss = test(ctx, val_data)

            # With mixup the RMSE against the soft labels is plotted as the
            # training curve; otherwise the plain training error is used.
            train_history.update(
                [acc if opt.mixup else 1-train_acc, 1-val_acc])
            plt.cla()
            train_history.plot(save_path='%s/%s_history.png' %
                               (plot_name, model_name))

            if val_acc > best_val_score:
                best_val_score = val_acc
                # An empty save_dir means checkpointing is disabled; without
                # this guard the '%s/...' pattern would point at the
                # filesystem root.
                if save_dir:
                    net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                        (save_dir, best_val_score, model_name, epoch))

            current_lr = trainer.learning_rate
            loss_history.update([train_loss, val_loss])
            plt.cla()
            loss_history.plot(save_path='%s/%s_loss.png' %
                              (plot_name, model_name), y_lim=(0, 2), legend_loc='best')
            logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n     val_acc=%f val_loss=%f lr=%f time: %f' %
                         (epoch, train_loss, train_acc, acc, val_acc, val_loss, current_lr, time.time()-tic))
            sw._add_scalars(tag='Acc',
                            scalar_dict={'train_acc': train_acc, 'test_acc': val_acc}, global_step=epoch)
            sw._add_scalars(tag='Loss',
                            scalar_dict={'train_loss': train_loss, 'test_loss': val_loss}, global_step=epoch)
            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))
        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs-1))

    if opt.mode == 'hybrid':
        net.hybridize()
    try:
        train(opt.num_epochs, context)
        if opt.profile_mode:
            profiler.dump(finished=False)
    finally:
        # Close the event file even when training is interrupted.
        sw.close()
Exemplo n.º 17
0
    def train(epochs, ctx):
        """Train ``net`` for ``epochs`` epochs on the context(s) ``ctx``.

        Parameters
        ----------
        epochs : int
            Number of passes over the CIFAR-10 training loader.
        ctx : mx.Context or list of mx.Context
            Device(s) to train on; a single context is wrapped in a list.

        The function initialises or loads the parameters depending on
        ``config.train_cfg.param_init``, then for every epoch trains,
        evaluates the validation set once, refreshes the plots, emits the
        progress signals and appends a row to the CSV log. It returns early
        (``None``) as soon as ``check_flag()[1]`` is truthy, and sleeps in
        5-second steps while ``check_flag()[0]`` is truthy.
        """
        if isinstance(ctx, mx.Context):
            ctx = [ctx]

        if config.train_cfg.param_init:
            init_func = getattr(mx.init, config.train_cfg.init)
            net.initialize(init_func(), ctx=ctx, force_reinit=True)
        else:
            net.load_parameters(config.train_cfg.param_file, ctx=ctx)

        summary(net, stat_name, nd.uniform(
            shape=(1, 3, imgsize, imgsize), ctx=ctx[0]))
        net.hybridize()

        root = config.dir_cfg.dataset
        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer_arg = {'learning_rate': config.lr_cfg.lr,
                       'wd': config.lr_cfg.wd, 'lr_scheduler': lr_sch}
        # SECURITY NOTE(review): eval() executes arbitrary code taken from the
        # configuration. If the config can come from an untrusted source,
        # parse it with ast.literal_eval (or a stricter schema) instead.
        extra_arg = eval(config.lr_cfg.extra_arg)
        trainer_arg.update(extra_arg)
        trainer = gluon.Trainer(net.collect_params(), optimizer, trainer_arg)
        if config.train_cfg.amp:
            amp.init_trainer(trainer)
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        # Mixup produces dense (soft) labels, so sparse labels are only
        # usable when mixup is off.
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=not config.data_cfg.mixup)
        train_history = TrainingHistory(['training-error', 'validation-error'])
        loss_history = TrainingHistory(['training-loss', 'validation-loss'])

        iteration = 0
        best_val_score = 0

        sig_state.emit(1)
        sig_pgbar.emit(0)
        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1
            for batch in train_data:
                # Profiling and graph export happen on the second iteration.
                second_iteration = epoch == 0 and iteration == 1
                if second_iteration and config.save_cfg.profiler:
                    profiler.set_state('run')
                    # NOTE(review): this binds a local name that is never
                    # read in this function; if an outer flag was intended
                    # it needs a nonlocal/global declaration - confirm.
                    is_profiler_run = True
                if second_iteration and config.save_cfg.tensorboard:
                    sw.add_graph(net)
                lam = np.random.beta(alpha, alpha)
                # Mixup is switched off for the last 20 epochs.
                if epoch >= epochs - 20 or not config.data_cfg.mixup:
                    lam = 1

                data_1 = gluon.utils.split_and_load(
                    batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(
                    batch[1], ctx_list=ctx, batch_axis=0)

                if not config.data_cfg.mixup:
                    data = data_1
                    label = label_1
                else:
                    # Blend every sample with the batch taken in reverse order.
                    data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                    label = []
                    for Y in label_1:
                        y1 = label_transform(Y, classes)
                        y2 = label_transform(Y[::-1], classes)
                        label.append(lam*y1 + (1-lam)*y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                if config.train_cfg.amp:
                    with ag.record():
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                else:
                    for l in loss:
                        l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                metric.update(label_1, output_softmax)
                if config.save_cfg.tensorboard:
                    sw.add_scalar(tag='lr', value=trainer.learning_rate,
                                  global_step=iteration)
                if second_iteration and config.save_cfg.profiler:
                    nd.waitall()
                    profiler.set_state('stop')
                    profiler.dump()
                iteration += 1
                sig_pgbar.emit(iteration)
                # check_flag() is polled for (pause, stop) requests:
                # index 0 pauses training, index 1 aborts it.
                if check_flag()[0]:
                    sig_state.emit(2)
                while(check_flag()[0] or check_flag()[1]):
                    if check_flag()[1]:
                        print('stop')
                        return
                    else:
                        time.sleep(5)
                        print('pausing')

            # Validation time is deliberately excluded from epoch_time.
            epoch_time = time.time() - tic
            # The training loader discards the last partial batch, so every
            # batch holds exactly batch_size samples.
            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            _, train_acc = metric.get()
            # One validation pass per epoch provides both accuracy and loss.
            name, val_acc, val_loss = test(ctx, val_data)

            train_history.update([1-train_acc, 1-val_acc])
            plt.cla()
            train_history.plot(save_path='%s/%s_history.png' %
                               (plot_name, model_name))

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                    (save_dir, best_val_score, model_name, epoch))

            current_lr = trainer.learning_rate

            logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n     val_acc=%f val_loss=%f lr=%f time: %f' %
                         (epoch, train_loss, train_acc, acc, val_acc, val_loss, current_lr, epoch_time))
            loss_history.update([train_loss, val_loss])
            plt.cla()
            loss_history.plot(save_path='%s/%s_loss.png' %
                              (plot_name, model_name), y_lim=(0, 2), legend_loc='best')
            if config.save_cfg.tensorboard:
                sw._add_scalars(tag='Acc',
                                scalar_dict={'train_acc': train_acc, 'test_acc': val_acc}, global_step=epoch)
                sw._add_scalars(tag='Loss',
                                scalar_dict={'train_loss': train_loss, 'test_loss': val_loss}, global_step=epoch)

            epoch_row = [epoch, train_loss, train_acc,
                         val_loss, val_acc, current_lr, epoch_time]
            sig_table.emit(epoch_row)
            csv_writer.writerow(epoch_row)
            # Flush so the log survives an interrupted run.
            csv_file.flush()

            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))
        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs-1))
Exemplo n.º 18
0
def test_custom_operator_profiling_multiple_custom_ops_imperative(seed=None,
        mode='imperative', file_name=None):
    """Check that profiler stats attribute ``_plus_scalar`` to the right op.

    Two custom operators (``MyAdd1`` and ``MyAdd2``) that both add a scalar
    are registered and run next to a plain ``+ 1``; the aggregated JSON
    stats must then list ``_plus_scalar`` separately for each custom
    operator and for the regular ``operator`` domain.

    Parameters
    ----------
    seed : unused
        Accepted for signature compatibility; not read by this function.
    mode : str
        ``'imperative'`` runs the ops through ``mx.nd``, ``'symbolic'``
        binds and forwards symbols; any other value runs no operator.
    file_name : str or None
        Profile output file; a default name is used when ``None``.
    """
    class MyAdd(mx.operator.CustomOp):
        def forward(self, is_train, req, in_data, out_data, aux):
            self.assign(out_data[0], req[0], in_data[0] + 1)

        def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
            self.assign(in_grad[0], req[0], out_grad[0])

    @mx.operator.register('MyAdd1')
    class MyAdd1Prop(mx.operator.CustomOpProp):
        def __init__(self):
            super(MyAdd1Prop, self).__init__(need_top_grad=True)

        def list_arguments(self):
            return ['data']

        def list_outputs(self):
            return ['output']

        def infer_shape(self, in_shape):
            # inputs, outputs, aux
            return [in_shape[0]], [in_shape[0]], []

        def create_operator(self, ctx, shapes, dtypes):
            return MyAdd()

    @mx.operator.register('MyAdd2')
    class MyAdd2Prop(mx.operator.CustomOpProp):
        def __init__(self):
            super(MyAdd2Prop, self).__init__(need_top_grad=True)

        def list_arguments(self):
            return ['data']

        def list_outputs(self):
            return ['output']

        def infer_shape(self, in_shape):
            # inputs, outputs, aux
            return [in_shape[0]], [in_shape[0]], []

        def create_operator(self, ctx, shapes, dtypes):
            return MyAdd()

    if file_name is None:
        file_name = 'test_custom_operator_profiling_multiple_custom_ops_imperative.json'
    enable_profiler(profile_filename=file_name, run=True, continuous_dump=True,
                    aggregate_stats=True)
    try:
        inp = mx.nd.zeros(shape=(100, 100))
        if mode == 'imperative':
            x = inp + 1
            y = mx.nd.Custom(inp, op_type='MyAdd1')
            z = mx.nd.Custom(inp, op_type='MyAdd2')
        elif mode == 'symbolic':
            a = mx.symbol.Variable('a')
            b = a + 1
            c = mx.symbol.Custom(data=a, op_type='MyAdd1')
            d = mx.symbol.Custom(data=a, op_type='MyAdd2')
            b.bind(mx.cpu(), {'a': inp}).forward()
            c.bind(mx.cpu(), {'a': inp}).forward()
            d.bind(mx.cpu(), {'a': inp}).forward()
        mx.nd.waitall()
        profiler.dump(False)
        debug_str = profiler.dumps(format='json')
        target_dict = json.loads(debug_str)

        # We are calling _plus_scalar within MyAdd1 and MyAdd2 and outside
        # both the custom operators, so in aggregate stats we should have
        # three different kinds of _plus_scalar under domains
        # "Custom Operator" and "operator".
        assert 'Time' in target_dict
        time_stats = target_dict['Time']
        assert 'Custom Operator' in time_stats
        custom_stats = time_stats['Custom Operator']
        for expected in ('MyAdd1::pure_python', 'MyAdd2::pure_python',
                         'MyAdd1::_plus_scalar', 'MyAdd2::_plus_scalar'):
            assert expected in custom_stats, \
                "{} is missing from the 'Custom Operator' stats".format(expected)
        assert '_plus_scalar' not in custom_stats
        assert 'operator' in time_stats
        assert '_plus_scalar' in time_stats['operator']
    finally:
        # Always stop the profiler so a failing assertion does not leave it
        # running for whatever test executes next.
        profiler.set_state('stop')
Exemplo n.º 19
0
def main():
    """Train the ``Model`` network on the SGF dataset and report accuracy.

    Steps: prepare the dataset under ``/storage/data``, build and hybridize
    the model, initialise it on ``GPU_COUNT`` GPUs, train for ``EPOCHS``
    epochs with Adam while logging accuracy/speed to MXBoard every
    ``PRINT_N`` batches (a checkpoint is written at the same cadence), then
    evaluate on the test iterator.

    Side effects: deletes and recreates ``./logs``, creates
    ``./checkpoints``, and profiles the first epoch into
    ``profile_output.json``.
    """
    data_p = Path('/storage/data/').resolve()
    checkpoint_p = Path('./checkpoints/').resolve()
    checkpoint_p.mkdir(parents=True, exist_ok=True)
    logs_p = Path('./logs/').resolve()
    # Start from an empty log directory so old event files do not mix in.
    shutil.rmtree(logs_p, ignore_errors=True)
    encoder = SevenPlaneEncoder((19, 19))
    builder = SGFDatasetBuilder(data_p, encoder=encoder)
    builder.download_and_prepare()
    train_itr = builder.train_dataset(batch_size=BATCH_SIZE,
                                      max_worker=cpu_count(),
                                      factor=FACTOR)
    test_itr = builder.test_dataset(batch_size=BATCH_SIZE,
                                    max_worker=cpu_count(),
                                    factor=FACTOR)
    # build model
    betago = Model()
    # Converting to half-precision floating point (FP16) is left disabled.
    # NOTE: all NVIDIA GPUs with compute capability 6.1 have a low-rate FP16
    #       performance, i.e. FP16 is not the fast path on these GPUs; data
    #       passed to split_and_load() would have to be float16 too.
    # hybridize for speed
    betago.hybridize(static_alloc=True, static_shape=True)
    # print graph
    shape = (1, ) + encoder.shape()
    mx.viz.print_summary(betago(mx.sym.var('data')), shape={'data': shape})
    # pin GPUs
    ctx = [mx.gpu(i) for i in range(GPU_COUNT)]
    # optimizer
    opt_params = {
        'learning_rate': 0.001,
        'beta1': 0.9,
        'beta2': 0.999,
        'epsilon': 1e-08
    }
    opt = mx.optimizer.create('adam', **opt_params)
    # initialize parameters
    # MXNet initializes the weight matrices uniformly by drawing from [-0.07,0.07], bias parameters are all set to 0
    # 'Xavier': initializer is designed to keep the scale of gradients roughly the same in all layers
    betago.initialize(mx.init.Xavier(magnitude=2.3),
                      ctx=ctx,
                      force_reinit=True)
    # fetch and broadcast parameters
    params = betago.collect_params()
    # trainer
    trainer = Trainer(params=params, optimizer=opt, kvstore='device')
    # loss function
    loss_fn = SoftmaxCrossEntropyLoss()
    # use accuracy as the evaluation metric
    metric = Accuracy()
    with mxb.SummaryWriter(logdir='./logs') as sw:
        profiler.set_config(profile_all=True,
                            aggregate_stats=True,
                            continuous_dump=True,
                            filename='profile_output.json')
        start = time.perf_counter()
        # train
        for e in range(EPOCHS):
            # only the first epoch is profiled
            if 0 == e:
                profiler.set_state('run')
            tick = time.time()
            # Fallbacks so the epoch summary below still works when the
            # iterator yields no batch at all.
            tick_0 = tick
            num_batches = 0
            # reset the train data iterator.
            train_itr.reset()
            # loop over the train data iterator
            for i, batch in enumerate(train_itr):
                if 0 == i:
                    # epoch throughput is measured from the first batch on
                    tick_0 = time.time()
                num_batches = i + 1
                # splits train data into multiple slices along batch_axis
                # copy each slice into a context
                data = split_and_load(batch.data[0],
                                      ctx_list=ctx,
                                      batch_axis=0,
                                      even_split=False)
                # splits train label into multiple slices along batch_axis
                # copy each slice into a context
                label = split_and_load(batch.label[0],
                                       ctx_list=ctx,
                                       batch_axis=0,
                                       even_split=False)
                outputs = []
                losses = []
                # inside training scope
                with ag.record():
                    for x, y in zip(data, label):
                        z = betago(x)
                        # computes softmax cross entropy loss
                        l = loss_fn(z, y)
                        outputs.append(z)
                        losses.append(l)
                # backpropagate the error for one iteration
                for l in losses:
                    l.backward()
                # make one step of parameter update.
                # trainer needs to know the batch size of data
                # to normalize the gradient by 1/batch_size
                trainer.step(BATCH_SIZE)
                # updates internal evaluation
                metric.update(label, outputs)
                # Print batch metrics
                if 0 == i % PRINT_N and 0 < i:
                    # checkpointing
                    betago.save_parameters(
                        str(checkpoint_p.joinpath(
                            'betago-{}.params'.format(e))))
                    # Read the metric and the clock once so the logged and
                    # the printed values agree.
                    accuracy = metric.get()[1]
                    speed = BATCH_SIZE * (PRINT_N) / (time.time() - tick)
                    sw.add_scalar(tag='Accuracy',
                                  value={'naive': accuracy},
                                  global_step=i - PRINT_N)
                    sw.add_scalar(tag='Speed',
                                  value={'naive': speed},
                                  global_step=i - PRINT_N)
                    print(
                        'epoch[{}] batch [{}], accuracy {:.4f}, samples/sec: {:.4f}'
                        .format(e, i, accuracy, speed))
                    tick = time.time()
            if 0 == e:
                # Operators run asynchronously; wait for them so the profile
                # covers the whole epoch before the profiler is stopped.
                mx.nd.waitall()
                profiler.set_state('stop')
                profiler.dump()
            # gets the evaluation result
            epoch_seconds = time.time() - tick_0
            epoch_speed = (BATCH_SIZE * num_batches / epoch_seconds
                           if epoch_seconds > 0 else 0.0)
            print('epoch [{}], accuracy {:.4f}, samples/sec: {:.4f}'.format(
                e,
                metric.get()[1],
                epoch_speed))
            # reset evaluation result to initial state
            metric.reset()

    elapsed = time.perf_counter() - start
    print('elapsed: {:0.3f}'.format(elapsed))
    # use Accuracy as the evaluation metric
    metric = Accuracy()
    for batch in test_itr:
        # even_split=False mirrors the training loop, so a batch whose size
        # is not a multiple of the GPU count does not raise.
        data = split_and_load(batch.data[0],
                              ctx_list=ctx,
                              batch_axis=0,
                              even_split=False)
        label = split_and_load(batch.label[0],
                               ctx_list=ctx,
                               batch_axis=0,
                               even_split=False)
        outputs = [betago(x) for x in data]
        metric.update(label, outputs)
    print('validation %s=%f' % metric.get())
Exemplo n.º 20
0
 def _save_profile(self):
     if self._profile:
         print(profiler.dumps())
         profiler.dump()