Example #1
def make_batch_script(trainer_params, model_params, script_params):

    # Create LBANN objects
    trainer = lbann.Trainer(mini_batch_size=trainer_params.mini_batch_size)
    model = make_model(**model_params)
    reader = make_data_reader()

    # Optimizer with learning rate schedule
    # Note: Rough approximation of
    #   embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5)
    # with embed_dim=512 and warmup=4000.
    opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9)
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[1],
            amt=2,
        ))
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[2, 4, 8, 12],
            amt=0.75,
        ))

    # Checkpoint after every epoch
    trainer.callbacks.append(
        lbann.CallbackCheckpoint(
            checkpoint_dir=os.path.join(script_params['work_dir'],
                                        'checkpoint'),
            checkpoint_epochs=1,
        ))

    # Dump weights after every epoch
    model.callbacks.append(
        lbann.CallbackDumpWeights(
            basename=os.path.join(script_params['work_dir'], 'weights'),
            epoch_interval=1,
        ))

    # Create Protobuf file
    protobuf_file = os.path.join(script_params['work_dir'],
                                 'experiment.prototext')
    lbann.proto.save_prototext(
        protobuf_file,
        trainer=trainer,
        model=model,
        data_reader=reader,
        optimizer=opt,
    )

    # Create batch script
    script = lbann.contrib.launcher.make_batch_script(**script_params)
    script.add_command('echo "Started training at $(date)"')
    script.add_parallel_command([
        lbann.lbann_exe(),
        f'--prototext={protobuf_file}',
    ])
    script.add_command('status=$?')
    script.add_command('echo "Finished training at $(date)"')
    script.add_command('exit ${status}')
    return script
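
The two CallbackDropFixedLearningRate callbacks above only coarsely
approximate the inverse-square-root ("Noam") schedule cited in the comment.
For reference, a minimal sketch of the exact formula with embed_dim=512 and
warmup=4000 as in the comment (noam_lr is an illustrative name, not an
LBANN API):

def noam_lr(step, embed_dim=512, warmup=4000):
    # embed_dim^-0.5 * min(step^-0.5, step * warmup^-1.5), for step >= 1
    return embed_dim ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# Rises roughly linearly to ~7.0e-4 at step 4000, then decays as step^-0.5.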
Example #2
def make_batch_script(trainer_params, model_params, script_params):

    # Locate the inference executable ('lbann_inf'), which sits next to
    # the main LBANN binary
    from os.path import abspath, dirname, join
    lbann_exe = abspath(lbann.lbann_exe())
    lbann_exe = join(dirname(lbann_exe), 'lbann_inf')

    # Create LBANN objects
    trainer = lbann.Trainer(mini_batch_size=trainer_params['mini_batch_size'])
    model = make_model(**model_params)
    # model.eval()
    reader = make_data_reader()

    # Optimizer with learning rate schedule
    # Note: Rough approximation of
    #   embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5)
    # with embed_dim=512 and warmup=4000.
    # opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9)
    opt = lbann.NoOptimizer()
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[1],
            amt=2,
        ))
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[2, 4, 8, 12],
            amt=0.75,
        ))

    # Checkpoint after every epoch
    # trainer.callbacks.append(
    #     lbann.CallbackCheckpoint(
    #         checkpoint_dir=os.path.join(script_params['work_dir'], 'checkpoint'),
    #         checkpoint_epochs=1,
    #     )
    # )

    # Dump weights after every epoch
    # model.callbacks.append(
    #     lbann.CallbackDumpWeights(
    #         basename=os.path.join(script_params['work_dir'], 'weights'),
    #         epoch_interval=1,
    #     )
    # )

    status = lbann.contrib.launcher.run(
        trainer,
        model,
        reader,
        opt,
        lbann_exe,
        nodes=script_params['nodes'],
        procs_per_node=script_params['procs_per_node'],
        time_limit=30,
        setup_only=False,
        batch_job=False,
    )
    # **kwargs)

    print(status)
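
The executable-path manipulation at the top of this example is a general
pattern: derive a sibling binary's path from the main LBANN executable. A
self-contained sketch (sibling_exe and the example path are illustrative,
not part of LBANN):

from os.path import abspath, dirname, join

def sibling_exe(exe_path, name):
    # Return the path of a binary living in the same directory as exe_path.
    return join(dirname(abspath(exe_path)), name)

print(sibling_exe('/opt/lbann/bin/lbann', 'lbann_inf'))
# -> /opt/lbann/bin/lbann_inf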
Example #3
def set_up_experiment(args,
                      input_,
                      probs,
                      labels):
    # Set up objective function
    cross_entropy = lbann.CrossEntropy([probs, labels])
    layers = list(lbann.traverse_layer_graph(input_))
    weights = set()
    for l in layers:
        weights.update(l.weights)
    # scale = weight decay
    l2_reg = lbann.L2WeightRegularization(weights=weights, scale=1e-4)
    objective_function = lbann.ObjectiveFunction([cross_entropy, l2_reg])

    # Set up model
    top1 = lbann.CategoricalAccuracy([probs, labels])
    top5 = lbann.TopKCategoricalAccuracy([probs, labels], k=5)
    metrics = [lbann.Metric(top1, name='top-1 accuracy', unit='%'),
               lbann.Metric(top5, name='top-5 accuracy', unit='%')]
    callbacks = [lbann.CallbackPrint(),
                 lbann.CallbackTimer(),
                 lbann.CallbackDropFixedLearningRate(
                     drop_epoch=[30, 60], amt=0.1)]
    model = lbann.Model(args.mini_batch_size,
                        args.num_epochs,
                        layers=layers,
                        weights=weights,
                        objective_function=objective_function,
                        metrics=metrics,
                        callbacks=callbacks)

    # Load data reader from prototext
    data_reader_proto = lbann.lbann_pb2.LbannPB()
    from google.protobuf import text_format as txtf
    with open(args.data_reader, 'r') as f:
        txtf.Merge(f.read(), data_reader_proto)
    data_reader_proto = data_reader_proto.data_reader

    # Set up optimizer
    if args.optimizer == 'sgd':
        print('Creating sgd optimizer')
        optimizer = lbann.optimizer.SGD(
            learn_rate=args.optimizer_learning_rate,
            momentum=0.9,
            nesterov=True
        )
    else:
        optimizer = lbann.contrib.args.create_optimizer(args)

    # Save prototext to args.prototext
    if args.prototext:
        lbann.proto.save_prototext(args.prototext,
                                   model=model,
                                   optimizer=optimizer,
                                   data_reader=data_reader_proto)

    return model, data_reader_proto, optimizer
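
This function expects an args namespace carrying at least mini_batch_size,
num_epochs, data_reader, optimizer, optimizer_learning_rate, and prototext.
A minimal argparse sketch that would satisfy it (flag names are inferred,
not taken from the original script, and lbann.contrib.args.create_optimizer
may expect further arguments):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--mini-batch-size', type=int, default=256)
parser.add_argument('--num-epochs', type=int, default=90)
parser.add_argument('--data-reader', required=True)
parser.add_argument('--optimizer', default='sgd')
parser.add_argument('--optimizer-learning-rate', type=float, default=0.1)
parser.add_argument('--prototext', default=None)
args = parser.parse_args()  # argparse maps '--num-epochs' to args.num_epochs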
Example #4
def set_up_experiment(args, input_, probs, labels):
    # Set up objective function
    cross_entropy = lbann.CrossEntropy([probs, labels])
    layers = list(lbann.traverse_layer_graph(input_))
    l2_reg_weights = set()
    for l in layers:
        if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected:
            l2_reg_weights.update(l.weights)
    # scale = weight decay
    l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4)
    objective_function = lbann.ObjectiveFunction([cross_entropy, l2_reg])

    # Set up model
    top1 = lbann.CategoricalAccuracy([probs, labels])
    top5 = lbann.TopKCategoricalAccuracy([probs, labels], k=5)
    metrics = [
        lbann.Metric(top1, name='top-1 accuracy', unit='%'),
        lbann.Metric(top5, name='top-5 accuracy', unit='%')
    ]
    callbacks = [
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
        lbann.CallbackDropFixedLearningRate(drop_epoch=[30, 60], amt=0.1)
    ]
    model = lbann.Model(args.num_epochs,
                        layers=layers,
                        objective_function=objective_function,
                        metrics=metrics,
                        callbacks=callbacks)

    # Set up data reader
    data_reader = data.imagenet.make_data_reader(num_classes=args.num_classes)

    # Set up optimizer
    if args.optimizer == 'sgd':
        print('Creating sgd optimizer')
        optimizer = lbann.optimizer.SGD(
            learn_rate=args.optimizer_learning_rate,
            momentum=0.9,
            nesterov=True)
    else:
        optimizer = lbann.contrib.args.create_optimizer(args)

    # Setup trainer
    trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size)

    return trainer, model, data_reader, optimizer
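
Unlike Example #3, this variant applies L2 regularization only to
Convolution and FullyConnected weights (leaving, e.g., batch-norm
parameters undecayed) and returns a Trainer alongside the other objects. A
hedged sketch of how the returned tuple is typically consumed, assuming the
lbann.contrib.launcher.run signature used in Example #2 (resource values
are illustrative):

trainer, model, data_reader, optimizer = set_up_experiment(
    args, input_, probs, labels)
lbann.contrib.launcher.run(trainer, model, data_reader, optimizer,
                           nodes=1, procs_per_node=2)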
Example #5
def setup(data_reader_file,
          name='classifier',
          num_labels=200,
          mini_batch_size=128,
          num_epochs=1000,
          learning_rate=0.1,
          bn_statistics_group_size=2,
          fc_data_layout='model_parallel',
          warmup_epochs=50,
          learning_rate_drop_interval=50,
          learning_rate_drop_factor=0.25,
          checkpoint_interval=None):

    # Setup input data
    input = lbann.Input(target_mode='classification')
    images = lbann.Identity(input)
    labels = lbann.Identity(input)

    # Classification network
    head_cnn = modules.ResNet(bn_statistics_group_size=bn_statistics_group_size)
    class_fc = lbann.modules.FullyConnectedModule(num_labels,
                                                  activation=lbann.Softmax,
                                                  name=f'{name}_fc',
                                                  data_layout=fc_data_layout)
    x = head_cnn(images)
    probs = class_fc(x)

    # Setup objective function
    cross_entropy = lbann.CrossEntropy([probs, labels])
    l2_reg_weights = set()
    for l in lbann.traverse_layer_graph(input):
        if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected:
            l2_reg_weights.update(l.weights)
    l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=0.0002)
    obj = lbann.ObjectiveFunction([cross_entropy, l2_reg])

    # Setup model
    metrics = [lbann.Metric(lbann.CategoricalAccuracy([probs, labels]),
                            name='accuracy', unit='%')]
    callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()]
    if checkpoint_interval:
        callbacks.append(
            lbann.CallbackCheckpoint(
                checkpoint_dir='ckpt',
                checkpoint_epochs=5
            )
        )

    # Learning rate schedules
    if warmup_epochs:
        callbacks.append(
            lbann.CallbackLinearGrowthLearningRate(
                target=learning_rate * mini_batch_size / 128,
                num_epochs=warmup_epochs
            )
        )
    if learning_rate_drop_factor:
        callbacks.append(
            lbann.CallbackDropFixedLearningRate(
                drop_epoch=list(range(0, num_epochs, learning_rate_drop_interval)),
                amt=learning_rate_drop_factor)
        )

    # Construct model
    model = lbann.Model(num_epochs,
                        layers=lbann.traverse_layer_graph(input),
                        objective_function=obj,
                        metrics=metrics,
                        callbacks=callbacks)

    # Setup optimizer
    # opt = lbann.Adam(learn_rate=learning_rate, beta1=0.9, beta2=0.999, eps=1e-8)
    opt = lbann.SGD(learn_rate=learning_rate, momentum=0.9)

    # Load data reader from prototext
    data_reader_proto = lbann.lbann_pb2.LbannPB()
    import google.protobuf.text_format
    with open(data_reader_file, 'r') as f:
        google.protobuf.text_format.Merge(f.read(), data_reader_proto)
    data_reader_proto = data_reader_proto.data_reader
    for reader_proto in data_reader_proto.reader:
        reader_proto.python.module_dir = os.path.dirname(os.path.realpath(__file__))

    # Return experiment objects
    return model, data_reader_proto, opt
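
The warmup target learning_rate * mini_batch_size / 128 is the linear
learning-rate scaling rule with a base batch size of 128: with the defaults
(learning_rate=0.1, mini_batch_size=128) the warmup plateaus at 0.1,
whereas mini_batch_size=512 would warm up to 0.1 * 512 / 128 = 0.4 before
the scheduled 0.25x drops every learning_rate_drop_interval epochs take
over.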
Example #6
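# NOTE: this snippet assumes layers, cross_entropy, top1, top5, and args
# are defined earlier in the source file.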
l2_reg_weights = set()
for l in layers:
    if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected:
        l2_reg_weights.update(l.weights)
l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4)
obj = lbann.ObjectiveFunction([cross_entropy, l2_reg])

# Setup model
metrics = [
    lbann.Metric(top1, name='top-1 accuracy', unit='%'),
    lbann.Metric(top5, name='top-5 accuracy', unit='%')
]
callbacks = [
    lbann.CallbackPrint(),
    lbann.CallbackTimer(),
    lbann.CallbackDropFixedLearningRate(drop_epoch=[30, 60, 80], amt=0.1)
]
if args.warmup:
    callbacks.append(
        lbann.CallbackLinearGrowthLearningRate(target=0.1 *
                                               args.mini_batch_size / 256,
                                               num_epochs=5))
model = lbann.Model(args.num_epochs,
                    layers=layers,
                    objective_function=obj,
                    metrics=metrics,
                    callbacks=callbacks)

# Setup optimizer
opt = lbann.contrib.args.create_optimizer(args)
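
The 0.1 * args.mini_batch_size / 256 warmup target with 10x drops at
epochs 30, 60, and 80 matches the widely used large-minibatch ImageNet
recipe: a base learning rate of 0.1 per 256 samples, scaled linearly with
batch size and warmed up over the first 5 epochs.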
Example #7
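# NOTE: this snippet assumes layers, cross_entropy, top1, top5, and args
# are defined earlier in the source file.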
# Setup objective function
weights = set()
for l in layers:
    weights.update(l.weights)
l2_reg = lbann.L2WeightRegularization(weights=weights, scale=5e-4)
obj = lbann.ObjectiveFunction([cross_entropy, l2_reg])

# Setup model
metrics = [
    lbann.Metric(top1, name='top-1 accuracy', unit='%'),
    lbann.Metric(top5, name='top-5 accuracy', unit='%')
]
callbacks = [
    lbann.CallbackPrint(),
    lbann.CallbackTimer(),
    lbann.CallbackDropFixedLearningRate(drop_epoch=[20, 40, 60], amt=0.1)
]
model = lbann.Model(args.mini_batch_size,
                    args.num_epochs,
                    layers=layers,
                    weights=weights,
                    objective_function=obj,
                    metrics=metrics,
                    callbacks=callbacks)

# Setup optimizer
opt = lbann.contrib.args.create_optimizer(args)

# Load data reader from prototext
data_reader_proto = lbann.lbann_pb2.LbannPB()
with open(args.data_reader, 'r') as f:
    txtf.Merge(f.read(), data_reader_proto)
data_reader_proto = data_reader_proto.data_reader
Example #8
def setup(num_patches=3,
          mini_batch_size=512,
          num_epochs=75,
          learning_rate=0.005,
          bn_statistics_group_size=2,
          fc_data_layout='model_parallel',
          warmup=True,
          checkpoint_interval=None):

    # Data dimensions
    patch_dims = patch_generator.patch_dims
    num_labels = patch_generator.num_labels(num_patches)

    # Extract tensors from data sample
    input = lbann.Input()
    slice_points = [0]
    for _ in range(num_patches):
        patch_size = functools.reduce(operator.mul, patch_dims)
        slice_points.append(slice_points[-1] + patch_size)
    slice_points.append(slice_points[-1] + num_labels)
    sample = lbann.Slice(input, slice_points=str_list(slice_points))
    patches = [
        lbann.Reshape(sample, dims=str_list(patch_dims))
        for _ in range(num_patches)
    ]
    labels = lbann.Identity(sample)

    # Siamese network
    head_cnn = modules.ResNet(
        bn_statistics_group_size=bn_statistics_group_size)
    heads = [head_cnn(patch) for patch in patches]
    heads_concat = lbann.Concatenation(heads)

    # Classification network
    class_fc1 = modules.FcBnRelu(
        4096,
        statistics_group_size=bn_statistics_group_size,
        name='siamese_class_fc1',
        data_layout=fc_data_layout)
    class_fc2 = modules.FcBnRelu(
        4096,
        statistics_group_size=bn_statistics_group_size,
        name='siamese_class_fc2',
        data_layout=fc_data_layout)
    class_fc3 = lbann.modules.FullyConnectedModule(num_labels,
                                                   activation=lbann.Softmax,
                                                   name='siamese_class_fc3',
                                                   data_layout=fc_data_layout)
    x = class_fc1(heads_concat)
    x = class_fc2(x)
    probs = class_fc3(x)

    # Setup objective function
    cross_entropy = lbann.CrossEntropy([probs, labels])
    l2_reg_weights = set()
    for l in lbann.traverse_layer_graph(input):
        if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected:
            l2_reg_weights.update(l.weights)
    l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=0.0002)
    obj = lbann.ObjectiveFunction([cross_entropy, l2_reg])

    # Setup model
    metrics = [
        lbann.Metric(lbann.CategoricalAccuracy([probs, labels]),
                     name='accuracy',
                     unit='%')
    ]
    callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()]
    if checkpoint_interval:
        callbacks.append(
            lbann.CallbackCheckpoint(checkpoint_dir='ckpt',
                                     checkpoint_epochs=5))

    # Learning rate schedules
    if warmup:
        callbacks.append(
            lbann.CallbackLinearGrowthLearningRate(target=learning_rate *
                                                   mini_batch_size / 128,
                                                   num_epochs=5))
    callbacks.append(
        lbann.CallbackDropFixedLearningRate(drop_epoch=list(range(0, 100, 15)),
                                            amt=0.25))

    # Construct model
    model = lbann.Model(num_epochs,
                        layers=lbann.traverse_layer_graph(input),
                        objective_function=obj,
                        metrics=metrics,
                        callbacks=callbacks)

    # Setup optimizer
    opt = lbann.SGD(learn_rate=learning_rate, momentum=0.9)
    # opt = lbann.Adam(learn_rate=learning_rate, beta1=0.9, beta2=0.999, eps=1e-8)

    # Setup data reader
    data_reader = make_data_reader(num_patches)

    # Return experiment objects
    return model, data_reader, opt
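
To make the Slice indexing concrete, a self-contained sketch of the
slice-point computation above (the patch_dims and num_labels values are
hypothetical stand-ins for what patch_generator provides):

import functools, operator

patch_dims = (3, 96, 96)   # hypothetical
num_patches = 3
num_labels = 6             # hypothetical
slice_points = [0]
for _ in range(num_patches):
    patch_size = functools.reduce(operator.mul, patch_dims)  # 27648
    slice_points.append(slice_points[-1] + patch_size)
slice_points.append(slice_points[-1] + num_labels)
print(slice_points)  # [0, 27648, 55296, 82944, 82950]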