Example #1
    def __init__(self,
                 epochs,
                 layers=[],
                 weights=[],
                 objective_function=None,
                 metrics=[],
                 callbacks=[],
                 summary_dir=None):

        # Scalar fields
        self.epochs = epochs
        self.summary_dir = summary_dir
        # Get connected layers
        self.layers = list(lbann.core.layer.traverse_layer_graph(layers))

        # Get weights associated with layers
        self.weights = set(make_iterable(weights))
        for l in self.layers:
            self.weights.update(l.weights)

        # Construct objective function if needed
        obj_type = lbann.core.objective_function.ObjectiveFunction
        if isinstance(objective_function, obj_type):
            self.objective_function = objective_function
        elif objective_function is None:
            self.objective_function = obj_type()
        else:
            self.objective_function = obj_type(objective_function)

        # Metrics and callbacks
        self.metrics = make_iterable(metrics)
        self.callbacks = make_iterable(callbacks)
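
Every example in this collection leans on the helper `make_iterable`, whose definition is not shown on this page. A minimal sketch of the assumed behavior (pass non-string iterables through, wrap everything else in a 1-tuple; LBANN's actual helper may differ in detail):

import collections.abc

def make_iterable(obj):
    """Return `obj` if it is a non-string iterable, else the 1-tuple (obj,).

    Strings count as scalars so that a single name or command is not
    iterated character by character.
    """
    if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str):
        return obj
    return (obj,)

This is what lets callers pass either `weights=w` or `weights=[w1, w2]` through a single code path, as the constructor above does.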
Example #2
    def __init__(self, mini_batch_size, epochs,
                 layers=[], weights=[], objective_function=None,
                 metrics=[], callbacks=[]):

        # Scalar fields
        self.mini_batch_size = mini_batch_size
        self.epochs = epochs
        self.block_size = 256           # TODO: Make configurable
        self.num_parallel_readers = 0   # TODO: Make configurable
        self.procs_per_trainer = 0      # TODO: Make configurable

        # Get connected layers
        self.layers = list(lbann.layer.traverse_layer_graph(layers))

        # Get weights associated with layers
        self.weights = set(make_iterable(weights))
        for l in self.layers:
            self.weights.update(l.weights)

        # Construct objective function if needed
        obj_type = lbann.objective_function.ObjectiveFunction
        if isinstance(objective_function, obj_type):
            self.objective_function = objective_function
        elif objective_function is None:
            self.objective_function = obj_type()
        else:
            self.objective_function = obj_type(objective_function)

        # Metrics and callbacks
        self.metrics = make_iterable(metrics)
        self.callbacks = make_iterable(callbacks)
Example #3
    def add_parallel_command(self,
                             command,
                             work_dir=None,
                             nodes=None,
                             procs_per_node=None,
                             reservation=None,
                             launcher=None,
                             launcher_args=None):
        """Add command to be executed in parallel.

        The command is launched with jsrun. Parallel processes are
        distributed evenly amongst the compute nodes.

        Args:
            command (`str` or `Iterable` of `str`s): Command to be
                executed in parallel.
            work_dir (str, optional): Working directory.
            nodes (int, optional): Number of compute nodes.
            procs_per_node (int, optional): Number of parallel
                processes per compute node.
            reservation (str, optional): Scheduler advance reservation.
            launcher (str, optional): jsrun executable.
            launcher_args (`Iterable` of `str`s, optional):
                Command-line arguments to jsrun.

        """

        # Use default values if needed
        if work_dir is None:
            work_dir = self.work_dir
        if nodes is None:
            nodes = self.nodes
        if procs_per_node is None:
            procs_per_node = self.procs_per_node
        if reservation is None:
            reservation = self.reservation
        if launcher is None:
            launcher = self.launcher
        if launcher_args is None:
            launcher_args = self.launcher_args

        # Construct jsrun invocation
        args = [launcher]
        args.extend(make_iterable(launcher_args))
        args.append(f'--chdir {work_dir}')
        args.extend([
            f'--nrs {nodes}',
            '--rs_per_host 1',
            f'--tasks_per_rs {procs_per_node}',
            '--launch_distribution packed',
            '--cpu_per_rs ALL_CPUS',
            '--gpu_per_rs ALL_GPUS',
        ])
        args.extend(make_iterable(command))
        self.add_command(args)
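
To make the assembly concrete: `add_command` (see Example #6) joins the argument list with spaces, so a hypothetical call with nodes=2, procs_per_node=4, work_dir='/tmp/exp', and command='hostname' produces one script line (values illustrative, not from the source):

args = ['jsrun', '--chdir /tmp/exp', '--nrs 2', '--rs_per_host 1',
        '--tasks_per_rs 4', '--launch_distribution packed',
        '--cpu_per_rs ALL_CPUS', '--gpu_per_rs ALL_GPUS', 'hostname']
print(' '.join(args))
# jsrun --chdir /tmp/exp --nrs 2 --rs_per_host 1 --tasks_per_rs 4 ... hostname

One resource set per host (`--rs_per_host 1`) with `procs_per_node` tasks each is what distributes the processes evenly across the nodes.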
Example #4
    def export_proto(self):
        """Construct and return a protobuf message."""

        # Construct Protobuf message
        if base_has_export_proto:
            proto = base_class.export_proto(self)
            message = getattr(proto, base_field_name)
            message.SetInParent()
        else:
            # TODO (trb 08/01/2019): This list would have to be
            # updated any time another _pb2 file is created. It might
            # be better to have this as a global `frozenset`
            # (ndryden's suggestion) that gets maintained
            # elsewhere. But this code either works or doesn't get
            # executed now, so I vote delaying this fix until a need
            # arises.
            proto_modules = [
                callbacks_pb2, layers_pb2, metrics_pb2, model_pb2,
                objective_functions_pb2, operators_pb2, optimizers_pb2,
                training_algorithm_pb2, weights_pb2
            ]
            proto_type = None
            while proto_type is None:
                proto_type = getattr(proto_modules.pop(), message_name, None)
            proto = proto_type()
            message = proto

        # Set message
        for field_name in field_names:
            val = getattr(self, field_name)
            if val is not None:
                try:
                    field = getattr(message, field_name)
                    field_descriptor = field_descriptors[field_name]
                    if field_descriptor.message_type in _protobuf_type_wrappers:
                        field.SetInParent()
                        field.value = val
                    elif field_descriptor.label == google.protobuf.descriptor.FieldDescriptor.LABEL_REPEATED:
                        iterable_val = make_iterable(val)
                        if field_descriptor.type == field_descriptor.TYPE_MESSAGE:
                            field.extend(
                                [x.export_proto() for x in iterable_val])
                        else:
                            field.extend(iterable_val)
                    elif isinstance(val, google.protobuf.message.Message):
                        getattr(message, field_name).MergeFrom(val)
                    elif callable(getattr(val, "export_proto", None)):
                        # 'val' is (hopefully) an LBANN class
                        # representation of a protobuf message.
                        getattr(message,
                                field_name).MergeFrom(val.export_proto())
                    else:
                        setattr(message, field_name, val)
                except Exception as e:
                    raise TypeError('{} is invalid type for {}.{}'.format(
                        type(val).__name__, self.__class__.__name__,
                        field_name)) from e

        # Return Protobuf message
        return proto
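
The `SetInParent()` calls above deal with protobuf presence semantics: an empty submessage is otherwise indistinguishable from an absent one. A standalone illustration using the well-known `struct_pb2` types, chosen here only because they ship with protobuf (the LBANN messages use their own generated modules):

from google.protobuf import struct_pb2

v = struct_pb2.Value()
print(v.HasField('struct_value'))  # False: submessage absent
v.struct_value.SetInParent()       # mark the empty submessage as present
print(v.HasField('struct_value'))  # True: presence recorded without any data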
Example #5
    def add_parallel_command(self,
                             command,
                             work_dir=None,
                             nodes=None,
                             procs_per_node=None,
                             launcher=None,
                             launcher_args=None):
        """Add command to be executed in parallel.

        The command is launched with mpiexec. Parallel processes are
        distributed evenly amongst the compute nodes.

        Args:
            command (`str` or `Iterable` of `str`s): Command to be
                executed in parallel.
            work_dir (str, optional): Working directory.
            nodes (int, optional): Number of compute nodes.
            procs_per_node (int, optional): Number of parallel
                processes per compute node.
            launcher (str, optional): mpiexec executable.
            launcher_args (`Iterable` of `str`s, optional):
                Command-line arguments to mpiexec.

        """

        # Use default values if needed
        if work_dir is None:
            work_dir = self.work_dir
        if nodes is None:
            nodes = self.nodes
        if procs_per_node is None:
            procs_per_node = self.procs_per_node
        if launcher is None:
            launcher = self.launcher
        if launcher_args is None:
            launcher_args = self.launcher_args

        # Construct mpiexec invocation
        args = [launcher]
        args.extend(make_iterable(launcher_args))
        args.extend([
            f'-n {nodes*procs_per_node}',
            f'--map-by ppr:{procs_per_node}:node', f'-wdir {work_dir}'
        ])
        args.extend(make_iterable(command))
        self.add_command(args)
Example #6
    def add_command(self, command):
        """Add executable command to script.

        Args:
            command (`str` or `Iterable` of `str`s): Program
                invocation or sequence of program arguments.

        """
        self.add_body_line(' '.join(make_iterable(command)))
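
Because of `make_iterable`, the command may be a single string or a sequence of arguments; both calls below append the same script line (a usage sketch, with `script` standing for any object exposing this method):

script.add_command('echo "Started at $(date)"')
script.add_command(['echo', '"Started at $(date)"'])  # joined with spaces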
Example #7
    def __init__(self, size, bias=True,
                 weights=[], name=None, data_layout='data_parallel'):
        """Initialize LSTM cell.

        Args:
            size (int): Size of output tensor.
            bias (bool): Whether to apply biases after linearity.
            weights (`Weights` or iterator of `Weights`): Weights in
                fully-connected layer. There are at most two - a
                matrix ((4*size) x (input_size+size) dimensions) and a
                bias (4*size entries). If weights are not provided,
                the matrix and bias will be initialized in a similar
                manner as PyTorch (uniform random values from
                [-1/sqrt(size), 1/sqrt(size)]).
            name (str): Default name is in the form 'lstmcell<index>'.
            data_layout (str): Data layout.

        """
        super().__init__()
        LSTMCell.global_count += 1
        self.step = 0
        self.size = size
        self.name = (name
                     if name
                     else 'lstmcell{0}'.format(LSTMCell.global_count))
        self.data_layout = data_layout

        # Initial state
        self.last_output = lbann.Constant(value=0.0, num_neurons=str(size),
                                          name=self.name + '_init_output',
                                          data_layout=self.data_layout)
        self.last_cell = lbann.Constant(value=0.0, num_neurons=str(size),
                                        name=self.name + '_init_cell',
                                        data_layout=self.data_layout)

        # Weights
        self.weights = list(make_iterable(weights))
        if len(self.weights) > 2:
            raise ValueError('`LSTMCell` has at most two weights, '
                             'but got {0}'.format(len(self.weights)))
        if len(self.weights) == 0:
            self.weights.append(
                lbann.Weights(initializer=lbann.UniformInitializer(min=-1/sqrt(self.size),
                                                                   max=1/sqrt(self.size)),
                              name=self.name+'_matrix'))
        if len(self.weights) == 1:
            self.weights.append(
                lbann.Weights(initializer=lbann.UniformInitializer(min=-1/sqrt(self.size),
                                                                   max=1/sqrt(self.size)),
                              name=self.name+'_bias'))

        # Linearity
        self.fc = FullyConnectedModule(4*size, bias=bias,
                                       weights=self.weights,
                                       name=self.name + '_fc',
                                       data_layout=self.data_layout)
Example #8
    def __init__(self,
                 size,
                 bias=False,
                 weights=[],
                 activation=None,
                 transpose=False,
                 name=None,
                 parallel_strategy={}):
        """Initalize channelwise fully connected module

    Args:
        size (int or list): Dimension of the output tensor
        bias (bool): Whether to apply bias after linearity.
        transpose (bool): Whether to apply transpose of weights
                matrix.
        weights (`Weights` or iterator of `Weights`): Weights in
                fully-connected layer. There are at most two: the
                matrix and the bias. If weights are not provided, the
                matrix will be initialized with He normal
                initialization and the bias with zeros.
        activation (type): Layer class for activation function.
        name (str): Default name is in the form 'channelwisefc<index>'.
        parallel_strategy (dict): Data partitioning scheme.
    """
        super().__init__()
        ChannelwiseFullyConnectedModule.global_count += 1
        self.instance = 0
        self.size = size
        self.bias = bias
        self.transpose = transpose
        self.parallel_strategy = parallel_strategy
        self.name = (name if name else 'channelwisefc{0}'.format(
            ChannelwiseFullyConnectedModule.global_count))
        self.data_layout = 'data_parallel'

        self.weights = list(make_iterable(weights))
        if len(self.weights) > 2:
            raise ValueError('`ChannelwiseFullyConnectedModule` has '
                             'at most two weights, '
                             'but got {0}'.format(len(self.weights)))
        if len(self.weights) == 0:
            self.weights.append(
                lbann.Weights(initializer=lbann.HeNormalInitializer(),
                              name=self.name + '_matrix'))
        if self.bias and len(self.weights) == 1:
            self.weights.append(
                lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0),
                              name=self.name + '_bias'))
        self.activation = None
        if activation:
            if isinstance(activation, type):
                self.activation = activation
            else:
                self.activation = type(activation)
            if not issubclass(self.activation, lbann.Layer):
                raise ValueError('activation must be a layer')
Example #9
    def __init__(self, terms=[]):
        """Create an objective function with layer terms and regularization.

        `terms` should be a sequence of `ObjectiveFunctionTerm`s and
        `Layer`s.

        """
        self.terms = []
        for t in make_iterable(terms):
            self.add_term(t)
Example #10
    def __init__(self, size, bias=True, weights=[], activation=None,
                 name=None, data_layout='data_parallel'):
        """Initialize fully-connected module.

        Args:
            size (int): Size of output tensor.
            activation (type): Layer class for activation function.
            bias (bool): Whether to apply bias after linearity.
            weights (`Weights` or iterator of `Weights`): Weights in
                fully-connected layer. There are at most two: the
                matrix and the bias. If weights are not provided, the
                matrix will be initialized with He normal
                initialization and the bias with zeros.
            name (str): Default name is in the form 'fcmodule<index>'.
            data_layout (str): Data layout.

        """
        super().__init__()
        FullyConnectedModule.global_count += 1
        self.instance = 0
        self.size = size
        self.bias = bias
        self.name = (name
                     if name
                     else 'fcmodule{0}'.format(FullyConnectedModule.global_count))
        self.data_layout = data_layout

        # Initialize weights
        # Note: If weights are not provided, matrix weights are
        # initialized with He normal scheme and bias weights are
        # initialized with zeros.
        self.weights = list(make_iterable(weights))
        if len(self.weights) > 2:
            raise ValueError('`FullyConnectedModule` has '
                             'at most two weights, '
                             'but got {0}'.format(len(self.weights)))
        if len(self.weights) == 0:
            self.weights.append(
                lbann.Weights(initializer=lbann.HeNormalInitializer(),
                              name=self.name+'_matrix'))
        if len(self.weights) == 1:
            self.weights.append(
                lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0),
                              name=self.name+'_bias'))

        # Initialize activation layer
        self.activation = None
        if activation:
            if isinstance(activation, type):
                self.activation = activation
            else:
                self.activation = type(activation)
            if not issubclass(self.activation, lbann.Layer):
                raise ValueError('activation must be a layer')
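
A usage sketch, assuming the module follows the usual LBANN `Module` convention of being callable on a layer (the input layer below is a hypothetical stand-in):

x = lbann.Input()                  # hypothetical input layer
fc = FullyConnectedModule(128, activation=lbann.Relu, name='encoder_fc')
y = fc(x)                          # applies linearity, bias, then Relu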
Example #11
def run(
    trainer,
    model,
    data_reader,
    optimizer,
    lbann_exe=lbann.lbann_exe(),
    lbann_args=[],
    overwrite_script=False,
    setup_only=False,
    batch_job=False,
    *args,
    **kwargs,
):
    """Run LBANN with system-specific optimizations.

    This is intended to match the behavior of `lbann.run`, with
    defaults and optimizations for the current system. See that
    function for a full list of options.

    """

    # Create batch script generator
    script = make_batch_script(*args, **kwargs)

    # Batch script prints start time
    script.add_command('echo "Started at $(date)"')

    # Batch script invokes LBANN
    lbann_command = [lbann_exe]
    lbann_command.extend(make_iterable(lbann_args))
    prototext_file = os.path.join(script.work_dir, 'experiment.prototext')
    lbann.proto.save_prototext(prototext_file,
                               trainer=trainer,
                               model=model,
                               data_reader=data_reader,
                               optimizer=optimizer)
    lbann_command.append('--prototext={}'.format(prototext_file))
    script.add_parallel_command(lbann_command)
    script.add_command('status=$?')

    # Batch script prints finish time and returns status
    script.add_command('echo "Finished at $(date)"')
    script.add_command('exit ${status}')

    # Write, run, or submit batch script
    status = 0
    if setup_only:
        script.write(overwrite=overwrite_script)
    elif batch_job:
        status = script.submit(overwrite=overwrite_script)
    else:
        status = script.run(overwrite=overwrite_script)
    return status
Example #12
    def __init__(self,
                 parents=[],
                 children=[],
                 weights=[],
                 name=None,
                 data_layout='data_parallel',
                 hint_layer=None):
        """Constructor.

        Args:
            parents (Iterable of Layer, optional): Sources of input
                tensors.
            children (Iterable of Layer, optional): Destinations of
                output tensors.
            weights (Iterable of Weights, optional): Trainable
                parameters.
            name (str, optional): Unique identifier (default is
                'layer<index>').
            data_layout (str, optional): Data distribution scheme.
            hint_layer (Layer, optional): Hint for output dimensions.

        """
        Layer.global_count += 1
        self.parents = []
        self.children = []
        self.weights = []
        self.name = name if name else 'layer{0}'.format(Layer.global_count)
        self.data_layout = data_layout
        self.hint_layer = hint_layer

        # Initialize parents, children, and weights
        for l in make_iterable(parents):
            self.add_parent(l)
        for l in make_iterable(children):
            self.add_child(l)
        for w in make_iterable(weights):
            self.add_weights(w)
Example #13
    def __init__(self, num_channels, size, bias=True, weights=[], name=None):
        """Initialize GRU cell.

        Args:
            num_channels (int): Number of rows (channels) in the input
                matrix to which the GRU is applied.
            size (int): Size of output tensor.
            bias (bool): Whether to apply biases after linearity.
            weights (`Weights` or iterator of `Weights`): Weights in
                fully-connected layer. There are at most four: two
                matrices (with dimensions (3*size) x input_size and
                (3*size) x size) and two biases (each with 3*size
                entries). If weights are not provided, the matrices
                and biases will be initialized in a similar manner as
                PyTorch (uniform random values from
                [-1/sqrt(size), 1/sqrt(size)]).
            name (str): Default name is in the form 'gru<index>'.

        """

        super().__init__()
        ChannelwiseGRU.global_count += 1
        self.step = 0
        self.size = size
        self.num_channels = num_channels
        self.name = (name if name else f'gru{ChannelwiseGRU.global_count}')
        self.data_layout = 'data_parallel'
        scale = 1 / math.sqrt(self.size)

        self.weights = list(make_iterable(weights))

        weight_name = ['_ih_matrix', '_ih_bias', '_hh_matrix', '_hh_bias']
        for i in range(4):
            if (len(self.weights) == i):
                self.weights.append(
                    lbann.Weights(initializer=lbann.UniformInitializer(
                        min=-scale, max=scale),
                                  name=self.name + weight_name[i]))

        self.ih_fc = ChannelwiseFullyConnectedModule(3 * size,
                                                     bias=bias,
                                                     weights=self.weights[:2],
                                                     name=self.name + '_ih_fc')
        self.hh_fc = ChannelwiseFullyConnectedModule(3 * size,
                                                     bias=bias,
                                                     weights=self.weights[2:],
                                                     name=self.name + '_hh_fc')
        self.ones = lbann.Constant(value=1.0,
                                   num_neurons=str_list([num_channels, size]),
                                   name=self.name + '_ones')
Example #14
 def __init__(self,
              mini_batch_size,
              name=None,
              procs_per_trainer=None,
              num_parallel_readers=None,
              random_seed=None,
              callbacks=[]):
     self.name = name
     self.procs_per_trainer = procs_per_trainer
     self.num_parallel_readers = num_parallel_readers
     self.random_seed = random_seed
     self.mini_batch_size = mini_batch_size
     self.hydrogen_block_size = None
     # Callbacks
     self.callbacks = make_iterable(callbacks)
Example #15
 def __init__(self,
              mini_batch_size,
              name=None,
              num_parallel_readers=None,
              random_seed=None,
              serialize_io=None,
              training_algo=None,
              callbacks=[]):
     self.name = name
     self.num_parallel_readers = num_parallel_readers
     self.random_seed = random_seed
     self.serialize_io = serialize_io
     self.mini_batch_size = mini_batch_size
     self.hydrogen_block_size = None
     self.training_algo = training_algo
     # Callbacks
     self.callbacks = make_iterable(callbacks)
Example #16
        def __init__(self, strategy: str = "checkpoint_binary",
                     weights_names: list[str] = [],
                     exchange_hyperparameters: bool = False,
                     checkpoint_dir: str = None):
            """Construct a new exchange strategy.

            Args:
                strategy:
                  Which strategy to use (default: "checkpoint_binary").
                weights_names:
                  A list of weights names that should be exchanged.
                exchange_hyperparameters:
                  If True, exchange all optimizer state. Only applies to
                  the "sendrecv_weights" strategy.
                checkpoint_dir: A path to a directory for storing the
                  checkpoint files. Only applies to "checkpoint_file".
            """
            self.strategy = strategy
            self.exchange_hyperparameters = exchange_hyperparameters
            self.weights_names = make_iterable(weights_names)
            self.checkpoint_dir = checkpoint_dir
Example #17
def traverse_layer_graph(layers):
    """Topologically ordered traversal of layer graph.

    All layers that are connected to `layers` will be traversed. The
    layer graph is assumed to be acyclic. No checks are made for
    cycles and strange things may happen if one exists.

    Args:
        layers (Layer or Iterator of Layer): Node(s) in layer graph.

    Yields:
        Layer: Node in layer graph, in a topological order.

    """

    # DFS to find root nodes in layer graph
    roots = []
    visited = set()
    stack = list(make_iterable(layers))
    while stack:
        l = stack.pop()
        if l not in visited:
            visited.add(l)
            stack.extend(l.parents)
            stack.extend(l.children)
            if not l.parents:
                roots.append(l)

    # DFS to traverse layer graph in topological order
    visited = set()
    stack = roots
    while stack:
        l = stack.pop()
        if (l not in visited and all([(p in visited) for p in l.parents])):
            visited.add(l)
            stack.extend(l.children)
            yield l
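
A toy demonstration of the ordering, using stand-in node objects (only the `parents` and `children` attributes matter to the traversal; assumes `traverse_layer_graph` is importable as above):

class Node:
    def __init__(self, name):
        self.name, self.parents, self.children = name, [], []
    def __repr__(self):
        return self.name

a, b, c, d = Node('a'), Node('b'), Node('c'), Node('d')
for parent, child in [(a, b), (a, c), (b, d), (c, d)]:
    parent.children.append(child)
    child.parents.append(parent)

print(list(traverse_layer_graph(d)))  # [a, c, b, d]: parents always precede children

Passing any single node (here `d`) is enough: the first DFS walks both directions to find the roots of the connected component.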
Example #18
 def __init__(self, weights=[], scale=1.0):
     self.scale = scale
     self.weights = list(make_iterable(weights))
Example #19
    def __init__(self,
                 num_dims,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 weights=[],
                 activation=None,
                 name=None,
                 transpose=False,
                 parallel_strategy={}):
        """Initialize convolution module.

        Args:
            num_dims (int): Number of dimensions.
            out_channels (int): Number of output channels, i.e. number
                of filters.
            kernel_size (int): Size of convolution kernel.
            stride (int): Convolution stride.
            padding (int): Convolution padding.
            dilation (int): Convolution dilation.
            groups (int): Number of convolution groups.
            bias (bool): Whether to apply channel-wise bias after
                convolution.
            weights (`Weights` or iterator of `Weights`): Weights in
                convolution layer. There are at most two: the kernel
                and the bias. If weights are not provided, the kernel
                will be initialized with He normal initialization and
                the bias with zeros.
            name (str): Default name is in the form 'convmodule<index>'.
            transpose (bool): If true, apply transposed convolution
                (also known as deconvolution).
            parallel_strategy (dict): Data partitioning scheme.

        """
        super().__init__()
        ConvolutionModule.global_count += 1
        self.instance = 0
        self.num_dims = num_dims
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.bias = bias
        self.name = (name if name else 'convmodule{0}'.format(
            ConvolutionModule.global_count))
        self.transpose = transpose
        self.parallel_strategy = parallel_strategy

        # Initialize weights
        # Note: If weights are not provided, kernel weights are
        # initialized with He normal scheme and bias weights are
        # initialized with zeros.
        self.weights = list(make_iterable(weights))
        if len(self.weights) > 2:
            raise ValueError('`ConvolutionModule` has '
                             'at most two weights, '
                             'but got {0}'.format(len(self.weights)))
        if len(self.weights) == 0:
            self.weights.append(
                lbann.Weights(initializer=lbann.HeNormalInitializer(),
                              name=self.name + '_kernel'))
        if len(self.weights) == 1:
            self.weights.append(
                lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0),
                              name=self.name + '_bias'))

        # Initialize activation layer
        self.activation = None
        if activation:
            if isinstance(activation, type):
                self.activation = activation
            else:
                self.activation = type(activation)
            if not issubclass(self.activation, lbann.Layer):
                raise ValueError('activation must be a layer')
Example #20
 def add_weights(self, w):
     """Add w to this layer's weights."""
     self.weights.extend(make_iterable(w))
Example #21
 def add_child(self, child):
     """"This layer will send an output tensor to `child`."""
     for c in make_iterable(child):
         self.children.append(c)
         c.parents.append(self)
Example #22
 def add_parent(self, parent):
     """This layer will receive an input tensor from `parent`."""
     for p in make_iterable(parent):
         self.parents.append(p)
         p.children.append(self)
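
Examples #21 and #22 keep the graph doubly linked, so wiring one side updates both endpoints. A sketch with two hypothetical layers:

x = lbann.Identity(name='x')   # hypothetical layer construction
y = lbann.Identity(name='y')
x.add_child(y)
assert y in x.children and x in y.parents  # both directions recorded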
Example #23
def run(
    trainer,
    model,
    data_reader,
    optimizer,
    work_dir=None,
    proto_file_name='experiment.prototext',
    nodes=1,
    procs_per_node=1,
    time_limit=None,
    scheduler=None,
    job_name='lbann',
    partition=None,
    account=None,
    reservation=None,
    launcher_args=[],
    lbann_exe=lbann.lbann_exe(),
    lbann_args=[],
    procs_per_trainer=None,
    environment={},
    overwrite_script=False,
    setup_only=False,
    batch_job=False,
    nvprof=False,
    nvprof_output_name=None,
    experiment_dir=None,
):
    """Run LBANN.

    This is intended to interface with job schedulers on HPC
    clusters. It will either submit a batch job (if on a login node)
    or run with an existing node allocation (if on a compute
    node). Behavior may vary across schedulers.

    If an experiment directory is not provided, a timestamped
    directory is created (by default in the current working
    directory). The location of autogenerated experiment directories
    can be set with the environment variable `LBANN_EXPERIMENT_DIR`.

    Args:
        trainer (lbann.Trainer): LBANN trainer.
        model (lbann.Model): Neural network model.
        data_reader (lbann.reader_pb2.DataReader): Data reader.
        optimizer (lbann.model.Optimizer): Default optimizer for
            model.
        work_dir (str, optional): Working directory.
        nodes (int, optional): Number of compute nodes.
        procs_per_node (int, optional): Number of processes per compute
            node.
        time_limit (int, optional): Job time limit, in minutes.
        scheduler (str, optional): Job scheduler.
        job_name (str, optional): Batch job name.
        partition (str, optional): Scheduler partition.
        account (str, optional): Scheduler account.
        reservation (str, optional): Scheduler reservation name.
        launcher_args (str, optional): Command-line arguments to
            launcher.
        lbann_exe (str, optional): LBANN executable.
        lbann_args (str, optional): Command-line arguments to LBANN
            executable.
        procs_per_trainer (int, optional): Number of processes per
            LBANN trainer. Default is all processes in one trainer.
        environment (dict of {str: str}, optional): Environment
            variables.
        overwrite_script (bool, optional): Whether to overwrite script
            file if it already exists.
        setup_only (bool, optional): If true, the experiment is not
            run after the experiment directory is initialized.
        batch_job (bool, optional): If true, the experiment is
            submitted to the scheduler as a batch job.
        nvprof (bool, optional): If true, an nvprof command is added
            to the beginning of LBANN executable.
        nvprof_output_name (str, optional): nvprof output filename.
            Filename should be unique to each process by using %q{ENV}
            (see https://docs.nvidia.com/cuda/profiler-users-guide/).
        experiment_dir (str, optional, deprecated): See `work_dir`.

    Returns:
        int: Exit status.

    """

    # Create batch script generator
    if not work_dir:
        work_dir = experiment_dir
    script = make_batch_script(work_dir=work_dir,
                               nodes=nodes,
                               procs_per_node=procs_per_node,
                               time_limit=time_limit,
                               scheduler=scheduler,
                               job_name=job_name,
                               partition=partition,
                               account=account,
                               reservation=reservation,
                               launcher_args=launcher_args,
                               environment=environment)

    # Batch script prints start time
    script.add_command('echo "Started at $(date)"')

    # Batch script invokes LBANN
    lbann_command = [lbann_exe]
    if nvprof:
        lbann_command = nvprof_command(
            work_dir=work_dir, output_name=nvprof_output_name) + lbann_command
    lbann_command.extend(make_iterable(lbann_args))
    prototext_file = os.path.join(script.work_dir, proto_file_name)
    lbann.proto.save_prototext(prototext_file,
                               trainer=trainer,
                               model=model,
                               data_reader=data_reader,
                               optimizer=optimizer)
    lbann_command.append('--prototext={}'.format(prototext_file))
    if procs_per_trainer is not None:
        lbann_command.append(f'--procs_per_trainer={procs_per_trainer}')

    script.add_parallel_command(lbann_command)
    script.add_command('status=$?')

    # Batch script prints finish time and returns status
    script.add_command('echo "Finished at $(date)"')
    script.add_command('exit ${status}')

    # Write, submit, or run batch script
    status = 0
    if setup_only:
        script.write(overwrite=overwrite_script)
    elif batch_job:
        status = script.submit(overwrite=overwrite_script)
    else:
        status = script.run(overwrite=overwrite_script)
    return status
Example #24
def str_list(l):
    """Convert an iterable object to a space-separated string."""
    return ' '.join(str(i) for i in make_iterable(l))
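
Thanks to `make_iterable`, scalars work as well as sequences:

str_list([16, 16, 3])  # '16 16 3'
str_list(7)            # '7'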
Example #25
def make_batch_script(
    system=system(),
    procs_per_node=procs_per_node(),
    scheduler=scheduler(),
    launcher_args=[],
    environment={},
    *args,
    **kwargs,
):
    """Construct batch script manager with NERSC-specific optimizations.

    This is a wrapper around `lbann.launcher.make_batch_script`, with
    defaults and optimizations for NERSC systems. See that function for a
    full list of options.

    """

    # Create shallow copies of input arguments
    launcher_args = list(make_iterable(launcher_args))
    environment = environment.copy()

    # Helper function to configure environment variables
    # Note: User-provided values take precedence, followed by values
    # in the environment, followed by default values.
    def set_environment(key, default):
        if key not in environment:
            environment[key] = os.getenv(key, default)

    # Optimizations for Cori GPU nodes
    if system == 'cgpu':
        cores_per_proc = cores_per_node(system) // procs_per_node
        set_environment(
            'AL_PROGRESS_RANKS_PER_NUMA_NODE',
            math.ceil(procs_per_node / numa_nodes_per_node(system)))
        set_environment('OMP_NUM_THREADS', cores_per_proc - 1)
        if scheduler == 'slurm':
            masks = [2**cores_per_proc - 1]
            while len(masks) < procs_per_node:
                masks.append(masks[-1] << cores_per_proc)
            mask_str = ','.join([hex(mask) for mask in masks])
            launcher_args.append('--cpu_bind=mask_cpu:{}'.format(mask_str))

        launcher_args.extend([
            '--qos=regular', f'--cpus-per-task={cores_per_proc}',
            '--gpus-per-task=1', '--constraint=gpu'
        ])

        # Hack to enable process forking
        # Note: InfiniBand is known to experience hangs if an MPI
        # process is forked (see
        # https://www.open-mpi.org/faq/?category=openfabrics#ofa-fork).
        # Setting IBV_FORK_SAFE seems to fix this issue, but it may
        # hurt performance (see
        # https://linux.die.net/man/3/ibv_fork_init).
        set_environment('IBV_FORK_SAFE', 1)

        set_environment('MV2_ENABLE_AFFINITY', 0)

        set_environment('MV2_USE_CUDA', 1)

        set_environment('MKL_THREADING_LAYER', 'GNU')

    return lbann.launcher.make_batch_script(
        procs_per_node=procs_per_node,
        scheduler=scheduler,
        launcher_args=launcher_args,
        environment=environment,
        *args,
        **kwargs,
    )
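
To make the `--cpu_bind` mask construction above concrete: each rank gets a contiguous block of `cores_per_proc` bits, shifted past the previous rank's block. A worked example with illustrative numbers:

cores_per_proc, procs_per_node = 4, 2
masks = [2**cores_per_proc - 1]                # 0b1111 for rank 0
while len(masks) < procs_per_node:
    masks.append(masks[-1] << cores_per_proc)  # next rank's block
print(','.join(hex(m) for m in masks))         # 0xf,0xf0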
Example #26
def run(command,
        experiment_dir=os.getcwd(),
        nodes=1,
        procs_per_node=1,
        time_limit=-1,
        job_name=None,
        partition=None,
        account=None,
        reservation=None,
        jsrun_args='',
        environment={},
        setup_only=False):
    """Run executable with LSF.

    Creates an LSF batch script in the experiment directory. If an LSF
    job allocation is detected, the script is run directly. Otherwise,
    the script is submitted to bsub.

    Args:
        command (str): Program to run under LSF, i.e. an executable and
            its command-line arguments.
        experiment_dir (str, optional): Experiment directory.
        nodes (int, optional): Number of compute nodes.
        procs_per_node (int, optional): Number of processes per compute
            node.
        time_limit (int, optional): Job time limit, in minutes. A
            negative value implies the system-default time limit.
        job_name (str, optional): Batch job name.
        partition (str, optional): Scheduler partition.
        account (str, optional): Scheduler account.
        reservation (str, optional): Scheduler reservation name.
        jsrun_args (str, optional): Command-line arguments to jsrun.
        environment (dict of {str: str}, optional): Environment
            variables.
        setup_only (bool, optional): If true, the experiment is not
            run after the batch script is created.

    """
    # Check for an existing job allocation.
    # Note: Settings for existing allocations take precedence.
    has_allocation = 'LSB_JOBID' in os.environ
    if has_allocation:
        job_name = os.environ['LSB_JOBNAME']
        partition = os.environ['LSB_QUEUE']
        # LSF does not provide a way to get the account via env vars.
        time_limit = -1

    # Experiment directory
    experiment_dir = os.path.abspath(experiment_dir)
    os.makedirs(experiment_dir, exist_ok=True)
    batch_file = os.path.join(experiment_dir, 'batch.sh')
    out_file = os.path.join(experiment_dir, 'out.log')
    err_file = os.path.join(experiment_dir, 'err.log')
    nodes_file = os.path.join(experiment_dir, 'nodes.txt')

    # Create batch script.
    s = '#!/bin/sh\n'
    if job_name:
        s += '#BSUB -J {}\n'.format(job_name)
    s += '#BSUB -nnodes {}\n'.format(nodes)
    if partition:
        s += '#BSUB -q {}\n'.format(partition)
    if account:
        s += '#BSUB -G {}\n'.format(account)
    else:
        raise ValueError('LSF requires an account')
    if reservation:
        s += '#BSUB -U {}\n'.format(reservation)
    s += '#BSUB -cwd {}\n'.format(experiment_dir)
    s += '#BSUB -o {}\n'.format(out_file)
    s += '#BSUB -e {}\n'.format(err_file)
    if time_limit >= 0:
        s += '#BSUB -W {}\n'.format(time_limit)

    # Set environment variables.
    if environment:
        s += '\n# ==== Environment ====\n'
        for variable, value in environment.items():
            s += 'export {}={}\n'.format(variable, value)

    # Time and node list.
    s += '\n# ==== Useful info ====\n'
    s += 'date\n'
    s += 'jsrun -n {} -a 1 hostname > {}\n'.format(nodes, nodes_file)
    s += 'sort --unique --output={0} {0}\n'.format(nodes_file)

    # Run experiment.
    s += '\n# ==== Experiment ====\n'
    for cmd in make_iterable(command):
        s += 'jsrun -n {} -a {} {} {}\n'.format(nodes, procs_per_node,
                                                jsrun_args, cmd)

    with open(batch_file, 'w') as f:
        f.write(s)

    # Make batch script executable.
    os.chmod(batch_file, 0o755)

    # Launch if needed.
    if not setup_only:
        if has_allocation:
            run_proc = subprocess.Popen(['sh', batch_file],
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE,
                                        cwd=experiment_dir)
        else:
            # bsub requires the batch script be read from its stdin.
            run_proc = subprocess.Popen('bsub < {}'.format(batch_file),
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE,
                                        cwd=experiment_dir,
                                        shell=True)
        out_proc = subprocess.Popen(['tee', out_file],
                                    stdin=run_proc.stdout,
                                    cwd=experiment_dir)
        err_proc = subprocess.Popen(['tee', err_file],
                                    stdin=run_proc.stderr,
                                    cwd=experiment_dir)
        run_proc.stdout.close()
        run_proc.stderr.close()
        run_proc.wait()
        out_proc.wait()
        err_proc.wait()
Example #27
    def add_parallel_command(self,
                             command,
                             work_dir=None,
                             nodes=None,
                             procs_per_node=None,
                             time_limit=None,
                             job_name=None,
                             partition=None,
                             account=None,
                             launcher=None,
                             launcher_args=None):
        """Add command to be executed in parallel.

        The command is launched with srun. Parallel processes are
        distributed evenly amongst the compute nodes.

        Args:
            command (`str` or `Iterable` of `str`s): Command to be
                executed in parallel.
            work_dir (str, optional): Working directory.
            nodes (int, optional): Number of compute nodes.
            procs_per_node (int, optional): Number of parallel
                processes per compute node.
            time_limit (int, optional): Job time limit, in minutes.
            job_name (str, optional): Job name.
            partition (str, optional): Scheduler partition.
            account (str, optional): Scheduler account.
            launcher (str, optional): srun executable.
            launcher_args (`Iterable` of `str`s, optional):
                Command-line arguments to srun.

        """

        # Use default values if needed
        if work_dir is None:
            work_dir = self.work_dir
        if nodes is None:
            nodes = self.nodes
        if procs_per_node is None:
            procs_per_node = self.procs_per_node
        if time_limit is None:
            time_limit = self.time_limit
        if job_name is None:
            job_name = self.job_name
        if partition is None:
            partition = self.partition
        if account is None:
            account = self.account
        if launcher is None:
            launcher = self.launcher
        if launcher_args is None:
            launcher_args = self.launcher_args

        # Construct srun invocation
        args = [launcher]
        args.extend(make_iterable(launcher_args))
        args.append(f'--chdir={work_dir}')
        args.append(f'--nodes={nodes}')
        args.append(f'--ntasks={nodes * procs_per_node}')
        args.append(f'--ntasks-per-node={procs_per_node}')
        if time_limit is not None:
            args.append(f'--time={_time_string(time_limit)}')
        if job_name:
            args.append(f'--job-name={job_name}')
        if partition:
            args.append(f'--partition={partition}')
        if account:
            args.append(f'--account={account}')
        args.extend(make_iterable(command))
        self.add_command(args)
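
For example, with nodes=2 and procs_per_node=4 the assembled flags read `--nodes=2 --ntasks=8 --ntasks-per-node=4`: the total task count is always `nodes * procs_per_node`, spread evenly across the allocation.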
Example #28
def run(command,
        experiment_dir=os.getcwd(),
        nodes=1,
        procs_per_node=1,
        time_limit=-1,
        job_name=None,
        partition=None,
        account=None,
        reservation=None,
        srun_args='',
        environment={},
        setup_only=False):
    """Run executable with Slurm.

    Creates a Slurm batch script in the experiment directory. If a
    Slurm job allocation is detected, the script is run
    directly. Otherwise, the script is submitted to sbatch.

    Args:
        command (str): Program to run under Slurm, i.e. an executable
            and its command-line arguments.
        experiment_dir (str, optional): Experiment directory.
        nodes (int, optional): Number of compute nodes.
        procs_per_node (int, optional): Number of processes per compute
            node.
        time_limit (int, optional): Job time limit, in minutes. A
            negative value implies the system-default time limit.
        job_name (str, optional): Batch job name.
        partition (str, optional): Scheduler partition.
        account (str, optional): Scheduler account.
        reservation (str, optional): Scheduler reservation name.
        srun_args (str, optional): Command-line arguments to srun.
        environment (dict of {str: str}, optional): Environment
            variables.
        setup_only (bool, optional): If true, the experiment is not
            run after the batch script is created.

    """

    # Check for an existing job allocation from Slurm
    # Note: Settings for current job allocation take precedence
    has_allocation = 'SLURM_JOB_ID' in os.environ
    if has_allocation:
        job_name = os.environ['SLURM_JOB_NAME']
        partition = os.environ['SLURM_JOB_PARTITION']
        account = os.environ['SLURM_JOB_ACCOUNT']
        time_limit = -1

    # Experiment directory
    experiment_dir = os.path.abspath(experiment_dir)
    os.makedirs(experiment_dir, exist_ok=True)
    batch_file = os.path.join(experiment_dir, 'batch.sh')
    out_file = os.path.join(experiment_dir, 'out.log')
    err_file = os.path.join(experiment_dir, 'err.log')
    nodes_file = os.path.join(experiment_dir, 'nodes.txt')

    # Write batch script
    with open(batch_file, 'w') as f:
        f.write('#!/bin/sh\n')

        # Slurm job settings
        if job_name:
            f.write('#SBATCH --job-name={}\n'.format(job_name))
        f.write('#SBATCH --nodes={}\n'.format(nodes))
        if partition:
            f.write('#SBATCH --partition={}\n'.format(partition))
        if account:
            f.write('#SBATCH --account={}\n'.format(account))
        if reservation:
            raise ValueError('Slurm reservations not supported')
        f.write('#SBATCH --workdir={}\n'.format(experiment_dir))
        f.write('#SBATCH --output={}\n'.format(out_file))
        f.write('#SBATCH --error={}\n'.format(err_file))
        if time_limit >= 0:
            seconds = int((time_limit % 1) * 60)
            hours, minutes = divmod(int(time_limit), 60)
            days, hours = divmod(hours, 24)
            f.write('#SBATCH --time={}-{:02d}:{:02d}:{:02d}\n'.format(
                days, hours, minutes, seconds))

        # Set environment
        if environment:
            f.write('\n')
            f.write('# ==== Environment ====\n')
            for variable, value in environment.items():
                f.write('export {}={}\n'.format(variable, value))

        # Display time and node list
        f.write('\n')
        f.write('# ==== Useful info ====\n')
        f.write('date\n')
        f.write('srun --nodes={0} --ntasks={0} hostname > {1}\n'.format(
            nodes, nodes_file))
        f.write('sort --unique --output={0} {0}\n'.format(nodes_file))

        # Run experiment
        f.write('\n')
        f.write('# ==== Experiment ====\n')
        for cmd in make_iterable(command):
            f.write('srun {} --nodes={} --ntasks={} {}\n'.format(
                srun_args, nodes, nodes * procs_per_node, cmd))

    # Make batch script executable
    os.chmod(batch_file, 0o755)

    # Launch job if needed
    # Note: Pipes output to log files
    if not setup_only:
        run_exe = 'sh' if has_allocation else 'sbatch'
        run_proc = subprocess.Popen([run_exe, batch_file],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    cwd=experiment_dir)
        out_proc = subprocess.Popen(['tee', out_file],
                                    stdin=run_proc.stdout,
                                    cwd=experiment_dir)
        err_proc = subprocess.Popen(['tee', err_file],
                                    stdin=run_proc.stderr,
                                    cwd=experiment_dir)
        run_proc.stdout.close()
        run_proc.stderr.close()
        run_proc.wait()
        out_proc.wait()
        err_proc.wait()
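
The time-limit block above converts minutes into Slurm's `D-HH:MM:SS` form; a worked example with a hypothetical 25-hour limit:

time_limit = 1500                             # minutes
seconds = int((time_limit % 1) * 60)          # 0 (non-zero only for fractional minutes)
hours, minutes = divmod(int(time_limit), 60)  # 25, 0
days, hours = divmod(hours, 24)               # 1, 1
print('{}-{:02d}:{:02d}:{:02d}'.format(days, hours, minutes, seconds))  # 1-01:00:00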
Example #29
def make_batch_script(
    system=system(),
    procs_per_node=procs_per_node(),
    scheduler=scheduler(),
    launcher_args=[],
    environment={},
    *args,
    **kwargs,
):
    """Construct batch script manager with OLCF-specific optimizations.

    This is a wrapper around `lbann.launcher.make_batch_script`, with
    defaults and optimizations for OLCF systems. See that function for a
    full list of options.

    """

    # Create shallow copies of input arguments
    launcher_args = list(make_iterable(launcher_args))
    environment = environment.copy()

    # Helper function to configure environment variables
    # Note: User-provided values take precedence, followed by values
    # in the environment, followed by default values.
    def set_environment(key, default):
        if key not in environment:
            environment[key] = os.getenv(key, default)

    # Setup GPU bindings
    # Note: Each Hydrogen process is assigned to the GPU index that
    # matches its node communicator rank. This is not compatible with
    # mpibind, which assigns a GPU with index 0 to each process. We
    # can't use an exclusive GPU compute mode since processes may
    # touch the wrong GPU while figuring out ownership.
    if scheduler == 'slurm' and has_gpu(system):
        launcher_args.extend(
            ['--mpibind=off', '--nvidia_compute_mode=default'])

    # Optimizations for Summit-like systems
    if system in ('summit',):

        # Set thread affinity
        # Note: Aluminum's default thread affinity is incorrect since
        # hwloc treats GPUs as NUMA domains.
        # Note: There are actually 22 cores/socket, but it seems that
        # powers of 2 are better for performance.
        cores_per_socket = 16
        procs_per_socket = (procs_per_node + 1) // 2
        cores_per_proc = cores_per_socket // procs_per_socket
        set_environment('AL_PROGRESS_RANKS_PER_NUMA_NODE', procs_per_socket)
        set_environment('OMP_NUM_THREADS', cores_per_proc)
        if scheduler == 'lsf':
            launcher_args.append('--bind packed:{}'.format(cores_per_proc))

        # Hack to enable process forking
        # Note: InfiniBand is known to experience hangs if an MPI
        # process is forked (see
        # https://www.open-mpi.org/faq/?category=openfabrics#ofa-fork).
        # Setting IBV_FORK_SAFE seems to fix this issue, but it may
        # hurt performance (see
        # https://linux.die.net/man/3/ibv_fork_init).
        set_environment('IBV_FORK_SAFE', 1)

        # Hacked bugfix for hcoll (1/23/19)
        # Note: Fixes hangs in MPI_Bcast.
        set_environment('HCOLL_ENABLE_SHARP', 0)
        set_environment('OMPI_MCA_coll_hcoll_enable', 0)

        # Hacked bugfix for Spectrum MPI PAMI (9/17/19)
        set_environment('PAMI_MAX_NUM_CACHED_PAGES', 0)

        # Configure NVSHMEM to load Spectrum MPI
        set_environment('NVSHMEM_MPI_LIB_NAME', 'libmpi_ibm.so')

    return lbann.launcher.make_batch_script(
        procs_per_node=procs_per_node,
        scheduler=scheduler,
        launcher_args=launcher_args,
        environment=environment,
        *args,
        **kwargs,
    )
Example #30
    def __init__(self, size, bias=True,
                 weights=[], name=None, data_layout='data_parallel'):
        """Initialize GRU cell.

        Args:
            size (int): Size of output tensor.
            bias (bool): Whether to apply biases after linearity.
            weights (`Weights` or iterator of `Weights`): Weights in
                fully-connected layer. There are at most four: two
                matrices (with dimensions (3*size) x input_size and
                (3*size) x size) and two biases (each with 3*size
                entries). If weights are not provided, the matrices
                and biases will be initialized in a similar manner as
                PyTorch (uniform random values from
                [-1/sqrt(size), 1/sqrt(size)]).
            name (str): Default name is in the form 'gru<index>'.
            data_layout (str): Data layout.

        """
        super().__init__()
        GRU.global_count += 1
        self.step = 0
        self.size = size
        self.name = (name
                     if name
                     else 'gru{0}'.format(GRU.global_count))
        self.data_layout = data_layout

        # Weights
        self.weights = list(make_iterable(weights))
        if len(self.weights) > 4:
            raise ValueError('`GRU` has at most 4 weights, '
                             'but got {0}'.format(len(self.weights)))
        # TODO: use a loop
        scale = 1 / math.sqrt(self.size)
        if len(self.weights) == 0:
            self.weights.append(
                lbann.Weights(initializer=lbann.UniformInitializer(min=-scale,
                                                                   max=scale),
                              name=self.name+'_ih_matrix')
            )
        if len(self.weights) == 1:
            self.weights.append(
                lbann.Weights(initializer=lbann.UniformInitializer(min=-scale,
                                                                   max=scale),
                              name=self.name+'_ih_bias')
            )
        if len(self.weights) == 2:
            self.weights.append(
                lbann.Weights(initializer=lbann.UniformInitializer(min=-scale,
                                                                   max=scale),
                              name=self.name+'_hh_matrix')
            )
        if len(self.weights) == 3:
            self.weights.append(
                lbann.Weights(initializer=lbann.UniformInitializer(min=-scale,
                                                                   max=scale),
                              name=self.name+'_hh_bias')
            )

        # Linearity
        # Learnable input-hidden weights
        self.ih_fc = FullyConnectedModule(
            3*size, bias=bias,
            weights=self.weights[:2],
            name=self.name + '_ih_fc',
            data_layout=self.data_layout
        )
        # Learnable hidden-hidden weights
        self.hh_fc = FullyConnectedModule(
            3*size, bias=bias,
            weights=self.weights[2:],
            name=self.name + '_hh_fc',
            data_layout=self.data_layout
        )

        self.ones = lbann.Constant(
            value=1.0,
            num_neurons=str(size),
            data_layout=self.data_layout,
            name=self.name+'_ones',
        )
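
The `# TODO: use a loop` above can be resolved the same way Example #13 already does it; a sketch that fills in any missing weights by position:

scale = 1 / math.sqrt(self.size)
weight_names = ['_ih_matrix', '_ih_bias', '_hh_matrix', '_hh_bias']
for i in range(len(self.weights), 4):
    self.weights.append(
        lbann.Weights(initializer=lbann.UniformInitializer(min=-scale,
                                                           max=scale),
                      name=self.name + weight_names[i]))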