def mergesum_test_config(be, modfunc, use_stride=1):
    """Check a merge-sum module's fprop/bprop against two explicit reference paths.

    Builds ``Conv -> modfunc`` as a single neon Sequential, then rebuilds the
    two branches of the module as independent Sequentials (with copied weights)
    and asserts that the fused output and the trunk deltas match the manually
    merged references.

    Arguments:
        be: backend object used for tensor allocation.
        modfunc (callable): factory returning the merge-sum module layers.
        use_stride (int): stride passed through to the module factory.
    """
    l1 = Conv(**conv_params(3, 16))
    neon_layer = modfunc(16, use_stride)
    inshape = (16, 32, 32)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_seq = Sequential([l1] + neon_layer)
    neon_seq.configure(inshape)
    inp = be.array(inpa)
    neon_seq.allocate()
    # neon_layer.layers[0].prev_layer = True
    neon_seq.allocate_deltas()
    neon_out = neon_seq.fprop(inp).get()

    # Now make the reference pathways: copy the module into two standalone
    # branches and clone the trunk conv weights into each.
    p1, p2 = module_factory_copy(neon_layer, modfunc, 16, use_stride)
    l11 = Conv(**conv_params(3, 16))
    l12 = Conv(**conv_params(3, 16))
    for ll in (l11, l12):
        for lcopy, lref in zip(ll, l1):
            if lcopy.has_params:
                lcopy.set_params(lref.get_params_serialize())

    path1 = Sequential([l11] + p1)
    path2 = Sequential([l12] + p2)
    for ll in (path1, path2):
        ll.configure(inshape)
        ll.allocate()
        ll.allocate_deltas()

    o1 = path1.fprop(inp)
    o2 = path2.fprop(inp)
    # Reference output: sum the two branches, then ReLU.
    neon_out_ref = be.empty_like(o1)
    neon_out_ref[:] = be.maximum(o1 + o2, 0)

    # need to have bsum false for this test to be valid
    assert allclose_with_out(neon_out_ref.get(), neon_out, rtol=0)
    print("Fprop matching")

    print("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)

    # bprop only through the activation + merge layers to get the trunk delta.
    ebr = neon_seq.layers[-1].bprop(err)
    ebr = neon_seq.layers[-2].bprop(ebr)
    trunk_neon = ebr.get()

    # Reference: mask the error by the ReLU derivative, then bprop each branch.
    err = be.array(erra)
    err[:] = be.greater(neon_out_ref, 0) * err

    pstart = len(l1)
    eb1 = err
    for l in reversed(path1.layers[pstart:]):
        eb1 = l.bprop(eb1)
    eb2 = err
    for l in reversed(path2.layers[pstart:]):
        eb2 = l.bprop(eb2)
    # NOTE(review): eb1/eb2 both start aliased to err; assumes each branch's
    # bprop returns a fresh buffer rather than mutating err in place — confirm.
    err_ref = be.empty_like(eb1)
    err_ref[:] = eb1 + eb2
    assert allclose_with_out(err_ref.get(), trunk_neon, rtol=0)
def mergesum_test_config(be, modfunc, use_stride=1):
    """Check a merge-sum module's fprop/bprop against two explicit reference paths.

    Same structure as the other merge-sum checks, with explicit MKL->CPU buffer
    conversion (``be.convert_data``) before the reference arithmetic runs on the
    host side.

    Arguments:
        be: backend object used for tensor allocation.
        modfunc (callable): factory returning the merge-sum module layers.
        use_stride (int): stride passed through to the module factory.
    """
    l1 = Conv(**conv_params(3, 16))
    neon_layer = modfunc(16, use_stride)
    inshape = (16, 32, 32)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_seq = Sequential([l1] + neon_layer)
    neon_seq.configure(inshape)
    inp = be.array(inpa)
    neon_seq.allocate()
    # neon_layer.layers[0].prev_layer = True
    neon_seq.allocate_deltas()
    neon_out = neon_seq.fprop(inp).get()

    # Now make the reference pathways: copy the module into two standalone
    # branches and clone the trunk conv weights into each.
    p1, p2 = module_factory_copy(neon_layer, modfunc, 16, use_stride)
    l11 = Conv(**conv_params(3, 16))
    l12 = Conv(**conv_params(3, 16))
    for ll in (l11, l12):
        for lcopy, lref in zip(ll, l1):
            if lcopy.has_params:
                lcopy.set_params(lref.get_params_serialize())

    path1 = Sequential([l11] + p1)
    path2 = Sequential([l12] + p2)
    for ll in (path1, path2):
        ll.configure(inshape)
        ll.allocate()
        ll.allocate_deltas()

    o1 = path1.fprop(inp)
    o2 = path2.fprop(inp)
    # convert mkl buffer to cpu for following cpu execution
    be.convert_data(o1, False)
    be.convert_data(o2, False)

    # Reference output: sum the two branches, then ReLU.
    neon_out_ref = be.empty_like(o1)
    neon_out_ref[:] = be.maximum(o1 + o2, 0)

    # need to have bsum false for this test to be valid
    assert allclose_with_out(neon_out_ref.get(), neon_out, rtol=0)

    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    # bprop only through the activation + merge layers to get the trunk delta.
    ebr = neon_seq.layers[-1].bprop(err)
    ebr = neon_seq.layers[-2].bprop(ebr)
    trunk_neon = ebr.get()

    # Reference: mask the error by the ReLU derivative, then bprop each branch.
    err = be.array(erra)
    err[:] = be.greater(neon_out_ref, 0) * err

    pstart = len(l1)
    eb1 = err
    for l in reversed(path1.layers[pstart:]):
        eb1 = l.bprop(eb1)
    eb2 = err
    for l in reversed(path2.layers[pstart:]):
        eb2 = l.bprop(eb2)

    be.convert_data(eb1, False)
    be.convert_data(eb2, False)
    err_ref = be.empty_like(eb1)
    err_ref[:] = eb1 + eb2

    assert allclose_with_out(err_ref.get(), trunk_neon, rtol=0)
class Model(NervanaObject):
    """
    Basic model class which stores a list of layers describing the model. Can train the layer
    weights on a dataset, evaluate on a test set and serialize the mode.
    Additional functionality can be added to fit through callback functions.

    Arguments:
        layers: layer container, or a list of layers (that will be containerized),
                or a serialized model description
        dataset (iterator): Data set (ignored, will be removed)
        weights_only (bool): set to True if you do not want to recreate layers
                             and states during deserialization from a serialized model
                             description. Defaults to False.
        name (str): Model name. Defaults to "model"
        optimizer (Optimizer): Optimizer object which defines the learning rule
                               for updating model parameters
                               (ie DescentMomentum, AdaDelta)
    """

    def __init__(self, layers, dataset=None, weights_only=False, name="model", optimizer=None):
        """Build the model from a layer container/list, a serialized description, or a file path."""
        super(Model, self).__init__(name)
        self.optimizer = optimizer
        self.params = None  # should be able to remove
        self.states = None  # should be able to remove
        self.epoch_index = 0
        self.finished = False
        self.initialized = False
        self.cost = None
        self.nbatches = 0
        self.ndata = 0

        if dataset is not None:
            logger.warning('dataset is a deprecated argument and will be ignored')

        if type(layers) in (ModelDescription, dict):
            # load up the model from a serialized file (dataset could be None here)
            self.deserialize(layers, load_states=(not weights_only))
        elif type(layers) is str:
            self.load_params(layers, load_states=(not weights_only))
        else:
            # Wrap the list of layers in a Sequential container if a raw list of layers
            if type(layers) in (Sequential, Tree, SingleOutputTree):
                self.layers = layers
            else:
                self.layers = Sequential(layers)
        self.layers.propagate_parallelism("Data")

    @property
    def layers_to_optimize(self):
        """Layers with learnable parameters, as exposed by the layer container."""
        return self.layers.layers_to_optimize

    def set_shortcut(self):
        # infer whether bprop shortcut can be used on final activation
        # self.cost should be set to run this otherwise do nothing
        lastlayer = self.layers[-1]
        try:
            if self.cost.costfunc.__class__ is CrossEntropyBinary:
                if (lastlayer.__class__ is Activation and
                        lastlayer.transform.__class__ is Logistic):
                    lastlayer.transform.set_shortcut(True)
        except:
            # if any attributes are not set or any other exception
            # is thrown leave transform.shortcut as is (do nothing)
            pass

    def initialize(self, dataset, cost=None):
        """Configure layer shapes from the dataset and allocate buffers (idempotent).

        Arguments:
            dataset (iterable): dataset (or shape source) used to configure the layers.
            cost (Cost, optional): cost to initialize against the final layer output.
        """
        if self.initialized:
            return

        # Propagate shapes through the layers to configure
        prev_input = dataset
        prev_input = self.layers.configure(prev_input)

        if cost is not None:
            cost.initialize(prev_input)
            self.cost = cost

        # Now allocate space
        self.layers.allocate()
        self.layers.allocate_deltas()
        self.initialized = True

    def __str__(self):
        """
        String representation of model's layers
        """
        config_string = "Network Layers:\n" + self.layers.nested_str()
        return config_string

    def fit(self, dataset, cost, optimizer, num_epochs, callbacks):
        """
        Trains the model parameters on a dataset by minimizing the cost
        function through gradient descent and updates the layer weights
        according to a learning rule defined in optimizer.

        Arguments:
            dataset (iterator): An iterable of minibatches where each
                element is a (x, y) tuple where x is the input data and y are the labels.
                x is of dimension (feature_size, batch_size)
                y is of dimension (label_size, batch_size)
                Length of the iterator is num_batches which is num_data / batch_size
            cost (Cost): Defines the function which the model is minimizing based
                on the output of the last layer and the input labels
            optimizer (Optimizer): Defines the learning rule for updating the model parameters
            num_epochs: Number of times to iterate over the dataset.
            callbacks (Callbacks): Defines callbacks to run at the end of
                each mini-batch / epoch.
        """
        self.nbatches = dataset.nbatches
        self.ndata = dataset.ndata
        # self.set_shortcut()  # infer if bprop shortcut can be used
        self.total_cost = self.be.empty((1, 1), dtype=np.float32)
        self.optimizer = optimizer
        self.initialize(dataset, cost)

        callbacks.on_train_begin(num_epochs)
        while self.epoch_index < num_epochs and not self.finished:
            self.nbatches = dataset.nbatches
            callbacks.on_epoch_begin(self.epoch_index)
            self._epoch_fit(dataset, callbacks)
            callbacks.on_epoch_end(self.epoch_index)
            self.epoch_index += 1
        callbacks.on_train_end()

    def _epoch_fit(self, dataset, callbacks):
        """
        Helper function for fit which performs training on a dataset for one epoch.

        Arguments:
            dataset (iterable): Dataset iterator to perform fit on
        """
        epoch = self.epoch_index
        self.total_cost[:] = 0
        # iterate through minibatches of the dataset
        for mb_idx, (x, t) in enumerate(dataset):
            callbacks.on_minibatch_begin(epoch, mb_idx)
            self.be.begin(Block.minibatch, mb_idx)
            x = self.fprop(x)
            self.total_cost[:] = self.total_cost + self.cost.get_cost(x, t)

            # deltas back propagate through layers
            # for every layer in reverse except the 0th one
            delta = self.cost.get_errors(x, t)
            self.bprop(delta)
            self.optimizer.optimize(self.layers_to_optimize, epoch=epoch)

            self.be.end(Block.minibatch, mb_idx)
            callbacks.on_minibatch_end(epoch, mb_idx)

        # now we divide total cost by the number of batches,
        # so it was never total cost, but sum of averages
        # across all the minibatches we trained on
        self.total_cost[:] = self.total_cost / dataset.nbatches

    def fprop(self, x, inference=False):
        """
        Forward propagates a minibatch x through the model.

        Arguments:
            x (Tensor): Input minibatch data
            inference (bool): Flag for performing training or inference
                Only affects batch norm and dropout layers.

        Returns:
            Tensor: the output of the final layer in the model
        """
        return self.layers.fprop(x, inference)

    def bprop(self, delta):
        """
        Back propagates the error of a minibatch through the model.

        Arguments:
            delta (Tensor): Derivative of cost with respect to the last layer's output
        """
        return self.layers.bprop(delta)

    def eval(self, dataset, metric):
        """
        Evaluates a model on a dataset according to an input metric.

        Arguments:
            datasets (iterable): dataset to evaluate on.
            metric (Cost): what function to evaluate dataset on.
        """
        self.initialize(dataset)
        running_error = np.zeros((len(metric.metric_names)), dtype=np.float32)
        nprocessed = 0
        dataset.reset()
        for x, t in dataset:
            x = self.fprop(x, inference=True)

            # This logic is for handling partial batch sizes at the end of the dataset
            nsteps = x.shape[1] / self.be.bsz if not isinstance(x, list) else \
                x[0].shape[1] / self.be.bsz

            bsz = min(dataset.ndata - nprocessed, self.be.bsz)
            running_error += metric(x, t, calcrange=slice(0, nsteps * bsz)) * nsteps * bsz
            nprocessed += bsz * nsteps
        running_error /= nprocessed
        return running_error

    def get_outputs(self, dataset):
        """
        Get the activation outputs of the final model layer for the dataset

        Arguments:
            dataset (iterable): Dataset iterator to perform fit on

        Returns:
            Host numpy array: the output of the final layer for the entire Dataset
        """
        self.initialize(dataset)
        dataset.reset()  # Move "pointer" back to beginning of dataset
        n = dataset.nbatches
        x = self.layers.layers[-1].outputs
        assert not isinstance(x, list), "Can not get_outputs with Branch terminal"
        Ypred = None
        for idx, (x, t) in enumerate(dataset):
            x = self.fprop(x, inference=True)
            if Ypred is None:
                (dim0, dim1) = x.shape
                Ypred = np.empty((n * dim1, dim0), dtype=x.dtype)
                nsteps = dim1 / self.be.bsz
            cur_batch = slice(idx * dim1, (idx + 1) * dim1)
            Ypred[cur_batch] = x.get().T

        # Handle the recurrent case.
        if nsteps != 1:
            b, s = (self.be.bsz, nsteps)
            Ypred = Ypred.reshape((n, s, b, -1)).transpose(0, 2, 1, 3).copy().reshape(n*b, s, -1)

        return Ypred[:dataset.ndata]

    def get_description(self, get_weights=False, keep_states=False):
        """
        Gets a description of the model required to reconstruct the model with
        no weights like from a yaml file.

        Returns:
            dict: Description of each component of the model.
        """
        pdict = dict()
        pdict['neon_version'] = __neon_version__
        compat_mode = self.be.compat_mode if self.be.compat_mode is not None else 'neon'
        pdict['backend'] = {'type': self.be.__class__.__name__,
                            'compat_mode': compat_mode,
                            'rng_seed': self.be.rng_seed,
                            'rng_state': self.be.rng_get_state()}
        if self.cost:
            pdict['cost'] = self.cost.get_description()
        if self.optimizer:
            pdict['optimizer'] = self.optimizer.get_description()

        pdict['model'] = self.layers.get_description(get_weights=get_weights,
                                                     keep_states=keep_states)
        return pdict

    def save_params(self, param_path, keep_states=True):
        """
        Serializes and saves model parameters to the path specified.

        Arguments:
            param_path (str): File to write serialized parameter dict to.
            keep_states (bool): Whether to save optimizer states too.
                                Defaults to True.
        """
        self.serialize(keep_states=keep_states, fn=param_path)

    def load_params(self, param_path, load_states=True):
        """
        Loads the model parameters (per layer weights, epochs run, optimizer
        states) saved in param_path from serialize().

        Arguments:
            param_path (str): File containing serialized python dict with layer
                              weights and states.
            load_states (bool): if False, then only the weights will be loaded
                                into a model in which the layers have already been
                                created, otherwise will (re)create the layers from
                                the serialized parameters and set the learning
                                states as well
        """
        self.deserialize(load_obj(param_path), load_states=load_states)
        logger.info('Model weights loaded from %s', param_path)

    def load_weights(self, weight_path):
        """
        .. deprecated:: 1.1.4
           Use :func:`load_params` instead
        """
        logger.warning('Calling deprecated load_weights function. Use '
                       'load_params instead')
        self.load_params(weight_path)

    def deserialize(self, model_dict, data=None, load_states=True):
        """
        Loads per layer (weights, states) and other model parameters from the
        dictionary passed.

        Arguments:
            model_dict (dict): dictionary describing the model including layers,
                               cost, optimizers, backend settings, etc.
                               generated by the serialize function
            data (iterator): Data set (ignored, will be removed)
            load_states (bool): if False, then only the weights will be loaded
                                into a model in which the layers have already been
                                created, otherwise will (re)create the layers from
                                the serialized parameters and set the learning
                                states as well
        """
        if data is not None:
            logger.warning('data is a deprecated argument and will be ignored')

        if 'epoch_index' in model_dict:
            self.epoch_index = model_dict['epoch_index']

        if 'model' not in model_dict:
            # legacy format: flat list of per-layer param/state dicts
            logger.error('Using old model serialization format. '
                         'Serialized the model into new format')

            param_layers = [l for l in self.layers_to_optimize]
            param_dict_list = model_dict['layer_params_states']
            for l, ps in zip(param_layers, param_dict_list):
                l.set_params(ps)
                if 'states' in ps and load_states:
                    l.set_states(ps)
            return

        if 'backend' in model_dict:
            if 'compat_mode' in model_dict['backend']:
                self.be.compat_mode = model_dict['backend']['compat_mode']
        else:
            model_dict['backend'] = {}

        typ = model_dict['model']['type']
        main_container = load_class(typ)

        if not hasattr(self, 'layers'):
            self.layers = main_container.gen_class(model_dict['model']['config'])

        self.layers.load_weights(model_dict['model'], load_states)

        if load_states and 'rng_state' in model_dict['backend']:
            try:
                self.be.rng_set_state(model_dict['backend']['rng_state'])
            except ValueError as e:
                # could come about when switching backend types (ex GPU to CPU)
                logger.warning("Problems restoring existing RNG state: %s", str(e))

    # serialize tells how to write out the parameters we've learned so
    # far and associate them with layers. it can ignore layers with no
    # learned parameters. the model stores states to pass to the
    # optimizers. if we're saving the model out for inference, we
    # don't need to remember states.
    def serialize(self, fn=None, keep_states=True):
        """
        Creates a dictionary storing the layer parameters and epochs complete.

        Arguments:
            fn (str): file to save pkl formatted model dictionary
            keep_states (bool): Whether to save optimizer states.

        Returns:
            dict: Model data including layer parameters and epochs complete.
        """
        # get the model dict with the weights
        pdict = self.get_description(get_weights=True, keep_states=keep_states)
        pdict['epoch_index'] = self.epoch_index + 1
        if self.initialized:
            pdict['train_input_shape'] = self.layers.in_shape
        if fn is not None:
            save_obj(pdict, fn)
            return
        return pdict

    def set_batch_size(self, N):
        """
        Set the actual minibatch size, so eventhough the buffers are allocated considering
        excessive padding, the processing for some layers may be shortened.
        Currently most of the neon layers don't use that to control the processing.
        The interface is here only for when someone wants to set that information and experiment.
        """
        return self.layers.set_batch_size(N)

    def set_seq_len(self, S):
        """
        Set the actual minibatch sequence length, so eventhough the buffers are allocated
        considering excessive padding, the processing for some layers may be shortened.
        Currently most of the neon layers don't use that to control the processing.
        The interface is here only for when someone wants to set that information and experiment.
        """
        return self.layers.set_seq_len(S)

    def benchmark(self, dataset, inference=False, cost=None, optimizer=None,
                  niterations=20, nskip=2):
        """
        Measure runtime for computing fprop and bprop seperately, as well as
        full minibatch run times. For inference case, only the fprop

        Arguments:
            dataset (iterable): Dataset iterator to perform fit on
            cost (Cost): Defines the function which the model is minimizing based
                on the output of the last layer and the input labels
            niterations (optional, int): Number of minibatches to average over
            nskip (optional, int): number of iterations at the beginning to skip
                when calculating the runtime statistics

        Returns:
            dictionary with fprop, bprop run times
        """
        # initialize model
        if inference is False:
            assert cost is not None and optimizer is not None, "Need cost and optimizer to \
benchmark bprop and update"
        self.cost = cost
        self.initialize(dataset, cost)
        self.optimizer = optimizer
        self.total_cost = self.be.empty((1, 1))
        self.total_cost[:] = 0

        # iterate through minibatches of the dataset
        times = OrderedDict()
        time_keys = ['fprop'] if inference else ['fprop', 'bprop', 'iteration']
        for ky in time_keys:
            # -1.0 marks slots never filled
            times[ky] = np.full(niterations + nskip, -1.0)
        count = 0

        fprop_start = self.be.init_mark()
        fprop_end = self.be.init_mark()
        bprop_end = self.be.init_mark()

        while count < niterations + nskip:
            dataset.reset()
            for mb_idx, (x, t) in enumerate(dataset):

                self.be.record_mark(fprop_start)  # mark start of fprop

                x = self.fprop(x)

                if inference is False:
                    self.total_cost[:] = self.total_cost + self.cost.get_cost(x, t)

                self.be.record_mark(fprop_end)  # mark end of fprop and start of bprop

                if inference is False:
                    delta = self.cost.get_errors(x, t)
                    self.bprop(delta)
                    self.optimizer.optimize(self.layers_to_optimize, epoch=0)

                    self.be.record_mark(bprop_end)  # mark end of bprop
                    self.be.synchronize_mark(bprop_end)
                else:
                    self.be.synchronize_mark(fprop_end)

                times['fprop'][count] = self.be.get_time(fprop_start, fprop_end)
                if inference is False:
                    times['bprop'][count] = self.be.get_time(fprop_end, bprop_end)
                    times['iteration'][count] = times['fprop'][count] + times['bprop'][count]

                count += 1
                if count >= niterations + nskip:
                    break

        # print results
        header = ('Func', 'Mean', 'Median', 'Min', 'Max', 'Units')
        stats = tuple(stat.lower() for stat in header[1:-1])

        fmt_titles = '| {:^11} '*len(header) + '|'
        fmt_nums = '| {func:<11} ' + '| {%s:<10.5g} '*len(stats) % (stats) + '| {units:^11} |'

        head_str = fmt_titles.format(*header)
        sep = '-'*len(head_str)
        head_str = sep + '\n' + head_str + '\n' + sep
        print(head_str)
        out_stats = {}
        for step in times:
            timesu = np.array(times[step][nskip:])  # in ms
            out_stats[step] = {}
            for stat in stats:
                # stat names double as numpy function names (mean, median, min, max)
                out_stats[step][stat] = getattr(np, stat)(timesu)
            print(fmt_nums.format(units='msec', func=step, **out_stats[step]))
        print(sep)
        return out_stats
def test_branch_model():
    """Compare a branched inception model against manually-executed branch paths.

    Fix: converted Python-2-only ``print`` statements to ``print()`` calls so
    the module is importable under Python 3 (other blocks in this file already
    use the function form).
    """
    NervanaObject.be = gen_backend("gpu", batch_size=64)
    be = NervanaObject.be
    main1 = main_branch()
    i1 = inception([(32, ), (32, 32), ('max', 16)])
    top = top_branch()
    neon_layer = Sequential(main1 + i1 + top)

    inshape = (3, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    print(neon_layer.nested_str())
    neon_layer.layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_layer.layers[0].set_deltas([be.iobuf(inshape)])
    neon_out = neon_layer.fprop(inp).get()

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].set_deltas([be.iobuf(inshape)])
    (b1, b2, b3) = inception_bare(i1, [(32, ), (32, 32), ('max', 16)])

    for bb in (b1, b2, b3):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # Copy trunk weights from the fused model into the reference trunk.
    main1_trunk = neon_layer.layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        ll.set_deltas([be.iobuf(ll.in_shape)])
    for bb in (b1, b2, b3):
        for ll in bb:
            ll.allocate()
            ll.set_deltas([be.iobuf(ll.in_shape)])

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[8].outputs)
    x = inp
    for ll in main2:
        x = ll.fprop(x)
    start = 0
    for bb in (b1, b2, b3):
        xb = x
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end
    x = merge_output
    top_trunk = Sequential(top).layers
    for ll in top_trunk:
        x = ll.fprop(x)
    neon_out_ref = x.get()
    difference = neon_out_ref - neon_out
    assert np.max(np.abs(difference)) < 1e-7
    print(np.max(np.abs(difference)))

    print("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    for ll in reversed(neon_layer.layers[8:]):
        err = ll.bprop(err)
    neon_deltas = err.get()
    for bb, errb in zip((b1, b2, b3), neon_layer.layers[8].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b1[0].deltas + b2[0].deltas + b3[0].deltas
    neon_ref_deltas = ref_deltas.get()
    difference = neon_deltas - neon_ref_deltas

    print(np.max(np.abs(difference)))
    assert np.max(np.abs(difference)) < 1e-8
def test_branch_model(backend_gpu):
    """Compare a branched inception model against manually-executed branch paths.

    Uses the ``backend_gpu`` fixture for the backend; fixed RNG seed makes the
    comparison deterministic. The fused model's output and root-of-branch
    deltas must match the reference trunk + branches run by hand.
    """
    np.random.seed(0)
    be = NervanaObject.be
    be.bsz = 64

    main1 = main_branch()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top = top_branch()
    neon_layer = Sequential(main1 + i1 + top)

    inshape = (4, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    neon_logger.display(neon_layer.nested_str())
    neon_layer.layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_out = neon_layer.fprop(inp).get()

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].deltas = be.iobuf(inshape)

    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    for bb in (b1, b2, b3):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # Copy trunk weights (and bias terms) from the fused model into the
    # reference trunk, then give each layer its own delta buffers.
    main1_trunk = neon_layer.layers[:6]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get(),
                                      'weight_bias': lo.weight_bias.get()}})
        ll.allocate()

        temp_buff = DeltasTree()
        ll.allocate_deltas(temp_buff)
        temp_buff.allocate_buffers()
        ll.set_deltas(temp_buff)

    for bb in (b1, b2, b3):
        for ll in bb:
            ll.allocate()
            temp_buff = DeltasTree()
            ll.allocate_deltas(temp_buff)
            temp_buff.allocate_buffers()
            ll.set_deltas(temp_buff)

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[6].outputs)
    x = inp
    for ll in main2:
        x = ll.fprop(x)
    start = 0
    for bb in (b1, b2, b3):
        xb = x
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end
    x = merge_output
    top_trunk = Sequential(top).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out, neon_out_ref, rtol=0)

    neon_logger.display("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    for ll in reversed(neon_layer.layers[6:]):
        err = ll.bprop(err)
    neon_deltas = err.get()

    for bb, errb in zip((b1, b2, b3), neon_layer.layers[6].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b3[0].deltas + b2[0].deltas + b1[0].deltas
    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(neon_deltas, neon_ref_deltas, rtol=0)
def mergesum_test_config(modfunc, use_stride=1):
    """Diagnostic check of a merge-sum module's fprop/bprop against two reference paths.

    Fix: converted Python-2-only ``print`` statements to ``print()`` calls so
    the module is importable under Python 3 (other blocks in this file already
    use the function form).

    Arguments:
        modfunc (callable): factory returning the merge-sum module layers.
        use_stride (int): stride passed through to the module factory.
    """
    NervanaObject.be = gen_backend("gpu", batch_size=64)
    be = NervanaObject.be
    l1 = Conv(**conv_params(3, 16))
    neon_layer = modfunc(16, use_stride)
    inshape = (16, 32, 32)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_seq = Sequential([l1] + neon_layer)
    neon_seq.configure(inshape)
    inp = be.array(inpa)
    neon_seq.allocate()
    # neon_layer.layers[0].prev_layer = True
    neon_seq.allocate_deltas()
    neon_out = neon_seq.fprop(inp).get()

    # Now make the reference pathways: copy the module into two standalone
    # branches and clone the trunk conv weights into each.
    p1, p2 = module_factory_copy(neon_layer, modfunc, 16, use_stride)
    l11 = Conv(**conv_params(3, 16))
    l12 = Conv(**conv_params(3, 16))
    for ll in (l11, l12):
        for lcopy, lref in zip(ll, l1):
            if lcopy.has_params:
                lcopy.set_params(lref.get_params_serialize())

    path1 = Sequential([l11] + p1)
    path2 = Sequential([l12] + p2)
    for ll in (path1, path2):
        ll.configure(inshape)
        ll.allocate()
        ll.allocate_deltas()

    o1 = path1.fprop(inp).get()
    o2 = path2.fprop(inp).get()

    # Now relu it
    neon_out_ref = np.maximum(o1+o2, 0)
    difference = neon_out_ref - neon_out
    print(np.max(np.abs(difference)))
    # need to have bsum false for this test to be valid
    # assert np.max(np.abs(difference)) < 1e-7
    print("Fprop matching")

    print("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    ebr = neon_seq.layers[4].bprop(err)
    print("Orig Error", ebr.get()[0, :20])
    ebr = neon_seq.layers[3].bprop(ebr)
    trunk_neon = ebr.get()

    # Reference: mask the error by the ReLU derivative, then bprop each branch.
    err = be.array(erra)
    err[:] = be.greater(be.array(neon_out_ref), 0) * err
    eb1 = err
    for l in reversed(path1.layers[3:]):
        eb1 = l.bprop(eb1)
    t1 = eb1.get()
    err = be.array(erra)
    err[:] = be.greater(be.array(neon_out_ref), 0) * err
    eb2 = err
    for l in reversed(path2.layers[3:]):
        eb2 = l.bprop(eb2)
    t2 = eb2.get()
    print(np.max(np.abs(trunk_neon - (t1 + t2))))
def test_branch_model_cpu(backend_cpu64):
    """Compare a branched inception model against manually-executed branch paths (CPU backend).

    Uses the ``backend_cpu64`` fixture; fixed RNG seed makes the comparison
    deterministic. The fused model's output and root-of-branch deltas must
    match the reference trunk + branches run by hand.
    """
    np.random.seed(0)
    be = NervanaObject.be
    be.bsz = 32

    main1 = main_branch()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top = top_branch()
    neon_layer = Sequential(main1 + i1 + top)

    inshape = (4, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    neon_logger.display(neon_layer.nested_str())
    neon_layer.layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_out = neon_layer.fprop(inp).get()

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].deltas = be.iobuf(inshape)

    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    for bb in (b1, b2, b3):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # Copy trunk weights from the fused model into the reference trunk, then
    # give each layer its own delta buffers.
    main1_trunk = neon_layer.layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()

        temp_buff = DeltasTree()
        ll.allocate_deltas(temp_buff)
        temp_buff.allocate_buffers()
        ll.set_deltas(temp_buff)

    for bb in (b1, b2, b3):
        for ll in bb:
            ll.allocate()
            temp_buff = DeltasTree()
            ll.allocate_deltas(temp_buff)
            temp_buff.allocate_buffers()
            ll.set_deltas(temp_buff)

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[8].outputs)
    x = inp
    for ll in main2:
        x = ll.fprop(x)
    start = 0
    for bb in (b1, b2, b3):
        xb = x
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end
    x = merge_output
    top_trunk = Sequential(top).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out, neon_out_ref, rtol=0)

    neon_logger.display("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    for ll in reversed(neon_layer.layers[8:]):
        err = ll.bprop(err)
    neon_deltas = err.get()

    for bb, errb in zip((b1, b2, b3), neon_layer.layers[8].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b3[0].deltas + b2[0].deltas + b1[0].deltas
    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(neon_deltas, neon_ref_deltas, rtol=0)
def test_branch_model():
    """Compare a branched inception model against manually-executed branch paths.

    Fixes:
    - Converted Python-2-only ``print`` statements to ``print()`` calls so the
      module is importable under Python 3.
    - ``ll.set_params(lo.W.get())`` passed a raw weight array where the sibling
      versions of this test pass the dict form ``{'params': {'W': ...}}``;
      made the call consistent with those versions.
    """
    NervanaObject.be = gen_backend("gpu", batch_size=64)
    be = NervanaObject.be
    main1 = main_branch()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top = top_branch()
    neon_layer = Sequential(main1 + i1 + top)

    inshape = (3, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    print(neon_layer.nested_str())
    neon_layer.layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_layer.layers[0].set_deltas([be.iobuf(inshape)])
    neon_out = neon_layer.fprop(inp).get()

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].set_deltas([be.iobuf(inshape)])
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    for bb in (b1, b2, b3):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # Copy trunk weights from the fused model into the reference trunk.
    main1_trunk = neon_layer.layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        ll.set_deltas([be.iobuf(ll.in_shape)])
    for bb in (b1, b2, b3):
        for ll in bb:
            ll.allocate()
            ll.set_deltas([be.iobuf(ll.in_shape)])

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[8].outputs)
    x = inp
    for ll in main2:
        x = ll.fprop(x)
    start = 0
    for bb in (b1, b2, b3):
        xb = x
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end
    x = merge_output
    top_trunk = Sequential(top).layers
    for ll in top_trunk:
        x = ll.fprop(x)
    neon_out_ref = x.get()
    difference = neon_out_ref - neon_out
    assert np.max(np.abs(difference)) < 1e-7
    print(np.max(np.abs(difference)))

    print("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    for ll in reversed(neon_layer.layers[8:]):
        err = ll.bprop(err)
    neon_deltas = err.get()
    for bb, errb in zip((b1, b2, b3), neon_layer.layers[8].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b1[0].deltas + b2[0].deltas + b3[0].deltas
    neon_ref_deltas = ref_deltas.get()
    difference = neon_deltas - neon_ref_deltas

    print(np.max(np.abs(difference)))
    assert np.max(np.abs(difference)) < 1e-8