def test_model_serialize(backend_default, data): (X_train, y_train), (X_test, y_test), nclass = load_mnist(path=data) train_set = DataIterator( [X_train, X_train], y_train, nclass=nclass, lshape=(1, 28, 28)) init_norm = Gaussian(loc=0.0, scale=0.01) # initialize model path1 = Sequential([Conv((5, 5, 16), init=init_norm, bias=Constant(0), activation=Rectlin()), Pooling(2), Affine(nout=20, init=init_norm, bias=init_norm, activation=Rectlin())]) path2 = Sequential([Affine(nout=100, init=init_norm, bias=Constant(0), activation=Rectlin()), Dropout(keep=0.5), Affine(nout=20, init=init_norm, bias=init_norm, activation=Rectlin())]) layers = [MergeMultistream(layers=[path1, path2], merge="stack"), Affine(nout=20, init=init_norm, batch_norm=True, activation=Rectlin()), Affine(nout=10, init=init_norm, activation=Logistic(shortcut=True))] tmp_save = 'test_model_serialize_tmp_save.pickle' mlp = Model(layers=layers) mlp.optimizer = GradientDescentMomentum(learning_rate=0.1, momentum_coef=0.9) mlp.cost = GeneralizedCost(costfunc=CrossEntropyBinary()) mlp.initialize(train_set, cost=mlp.cost) n_test = 3 num_epochs = 3 # Train model for num_epochs and n_test batches for epoch in range(num_epochs): for i, (x, t) in enumerate(train_set): x = mlp.fprop(x) delta = mlp.cost.get_errors(x, t) mlp.bprop(delta) mlp.optimizer.optimize(mlp.layers_to_optimize, epoch=epoch) if i > n_test: break # Get expected outputs of n_test batches and states of all layers outputs_exp = [] pdicts_exp = [l.get_params_serialize() for l in mlp.layers_to_optimize] for i, (x, t) in enumerate(train_set): outputs_exp.append(mlp.fprop(x, inference=True)) if i > n_test: break # Serialize model save_obj(mlp.serialize(keep_states=True), tmp_save) # Load model mlp = Model(layers=layers) mlp.load_weights(tmp_save) outputs = [] pdicts = [l.get_params_serialize() for l in mlp.layers_to_optimize] for i, (x, t) in enumerate(train_set): outputs.append(mlp.fprop(x, inference=True)) if i > n_test: break # Check outputs, states, and params are the same for output, output_exp in zip(outputs, outputs_exp): assert np.allclose(output.get(), output_exp.get()) for pd, pd_exp in zip(pdicts, pdicts_exp): for s, s_e in zip(pd['states'], pd_exp['states']): if isinstance(s, list): # this is the batch norm case for _s, _s_e in zip(s, s_e): assert np.allclose(_s, _s_e) else: assert np.allclose(s, s_e) for p, p_e in zip(pd['params'], pd_exp['params']): assert type(p) == type(p_e) if isinstance(p, list): # this is the batch norm case for _p, _p_e in zip(p, p_e): assert np.allclose(_p, _p_e) elif isinstance(p, np.ndarray): assert np.allclose(p, p_e) else: assert p == p_e os.remove(tmp_save)
class DeepQNetwork: def __init__(self, num_actions, args): # remember parameters self.num_actions = num_actions self.batch_size = args.batch_size self.discount_rate = args.discount_rate self.history_length = args.history_length self.screen_dim = (args.screen_height, args.screen_width) self.clip_error = args.clip_error # create Neon backend self.be = gen_backend(backend=args.backend, batch_size=args.batch_size, rng_seed=args.random_seed, device_id=args.device_id, default_dtype=np.dtype(args.datatype).type, stochastic_round=args.stochastic_round) # prepare tensors once and reuse them self.input_shape = (self.history_length, ) + self.screen_dim + ( self.batch_size, ) self.tensor = self.be.empty(self.input_shape) self.tensor.lshape = self.input_shape # needed for convolutional networks self.targets = self.be.empty((self.num_actions, self.batch_size)) # create model layers = self.createLayers(num_actions) self.model = Model(layers=layers) self.cost = GeneralizedCost(costfunc=SumSquared()) self.model.initialize(self.tensor.shape[:-1], self.cost) self.optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=args.rmsprop_decay_rate, stochastic_round=args.stochastic_round) # create target model self.target_steps = args.target_steps self.train_iterations = 0 if self.target_steps: self.target_model = Model(layers=self.createLayers(num_actions)) self.target_model.initialize(self.tensor.shape[:-1]) self.save_weights_path = args.save_weights_path else: self.target_model = self.model self.callback = None def createLayers(self, num_actions): # create network init_norm = Gaussian(loc=0.0, scale=0.01) layers = [] # The first hidden layer convolves 32 filters of 8x8 with stride 4 with the input image and applies a rectifier nonlinearity. layers.append( Conv((8, 8, 32), strides=4, init=init_norm, activation=Rectlin())) # The second hidden layer convolves 64 filters of 4x4 with stride 2, again followed by a rectifier nonlinearity. layers.append( Conv((4, 4, 64), strides=2, init=init_norm, activation=Rectlin())) # This is followed by a third convolutional layer that convolves 64 filters of 3x3 with stride 1 followed by a rectifier. layers.append( Conv((3, 3, 64), strides=1, init=init_norm, activation=Rectlin())) # The final hidden layer is fully-connected and consists of 512 rectifier units. layers.append(Affine(nout=512, init=init_norm, activation=Rectlin())) # The output layer is a fully-connected linear layer with a single output for each valid action. layers.append(Affine(nout=num_actions, init=init_norm)) return layers def setTensor(self, states): # change order of axes to match what Neon expects states = np.transpose(states, axes=(1, 2, 3, 0)) # copy() shouldn't be necessary here, but Neon doesn't work otherwise self.tensor.set(states.copy()) # normalize network input between 0 and 1 self.be.divide(self.tensor, 255, self.tensor) def train(self, minibatch, epoch): # expand components of minibatch prestates, actions, rewards, poststates, terminals = minibatch assert len(prestates.shape) == 4 assert len(poststates.shape) == 4 assert len(actions.shape) == 1 assert len(rewards.shape) == 1 assert len(terminals.shape) == 1 assert prestates.shape == poststates.shape assert prestates.shape[0] == actions.shape[0] == rewards.shape[ 0] == poststates.shape[0] == terminals.shape[0] if self.target_steps and self.train_iterations % self.target_steps == 0: # HACK: push something through network, so that weights exist self.model.fprop(self.tensor) # HACK: serialize network to disk and read it back to clone filename = os.path.join(self.save_weights_path, "target_network.pkl") save_obj(self.model.serialize(keep_states=False), filename) self.target_model.load_weights(filename) # feed-forward pass for poststates to get Q-values self.setTensor(poststates) postq = self.target_model.fprop(self.tensor, inference=True) assert postq.shape == (self.num_actions, self.batch_size) # calculate max Q-value for each poststate maxpostq = self.be.max(postq, axis=0).asnumpyarray() assert maxpostq.shape == (1, self.batch_size) # feed-forward pass for prestates self.setTensor(prestates) preq = self.model.fprop(self.tensor, inference=False) assert preq.shape == (self.num_actions, self.batch_size) # make copy of prestate Q-values as targets targets = preq.asnumpyarray() # update Q-value targets for actions taken for i, action in enumerate(actions): if terminals[i]: targets[action, i] = float(rewards[i]) else: targets[action, i] = float( rewards[i]) + self.discount_rate * maxpostq[0, i] # copy targets to GPU memory self.targets.set(targets) # calculate errors deltas = self.cost.get_errors(preq, self.targets) assert deltas.shape == (self.num_actions, self.batch_size) #assert np.count_nonzero(deltas.asnumpyarray()) == 32 # calculate cost, just in case cost = self.cost.get_cost(preq, self.targets) assert cost.shape == (1, 1) # clip errors if self.clip_error: self.be.clip(deltas, -self.clip_error, self.clip_error, out=deltas) # perform back-propagation of gradients self.model.bprop(deltas) # perform optimization self.optimizer.optimize(self.model.layers_to_optimize, epoch) # increase number of weight updates (needed for target clone interval) self.train_iterations += 1 # calculate statistics if self.callback: self.callback.on_train(cost.asnumpyarray()[0, 0]) def predict(self, states): # minibatch is full size, because Neon doesn't let change the minibatch size assert states.shape == (( self.batch_size, self.history_length, ) + self.screen_dim) # calculate Q-values for the states self.setTensor(states) qvalues = self.model.fprop(self.tensor, inference=True) assert qvalues.shape == (self.num_actions, self.batch_size) if logger.isEnabledFor(logging.DEBUG): logger.debug("Q-values: " + str(qvalues.asnumpyarray()[:, 0])) # find the action with highest q-value actions = self.be.argmax(qvalues, axis=0) assert actions.shape == (1, self.batch_size) # take only the first result return actions.asnumpyarray()[0, 0] def getMeanQ(self, states): assert states.shape == (( self.batch_size, self.history_length, ) + self.screen_dim) # calculate Q-values for the states self.setTensor(states) qvalues = self.model.fprop(self.tensor, inference=True) assert qvalues.shape == (self.num_actions, self.batch_size) # take maximum Q-value for each state actions = self.be.max(qvalues, axis=0) assert actions.astensor().shape == (1, self.batch_size) # calculate mean Q-value of all states meanq = self.be.mean(actions, axis=1) assert meanq.astensor().shape == (1, 1) # return the mean return meanq.asnumpyarray()[0, 0] def load_weights(self, load_path): self.model.load_weights(load_path) def save_weights(self, save_path): save_obj(self.model.serialize(keep_states=True), save_path)
def test_model_serialize(backend): (X_train, y_train), (X_test, y_test), nclass = load_mnist() train_set = DataIterator([X_train, X_train], y_train, nclass=nclass, lshape=(1, 28, 28)) init_norm = Gaussian(loc=0.0, scale=0.01) # initialize model path1 = [ Conv((5, 5, 16), init=init_norm, bias=Constant(0), activation=Rectlin()), Pooling(2), Affine(nout=20, init=init_norm, bias=init_norm, activation=Rectlin()) ] path2 = [ Dropout(keep=0.5), Affine(nout=20, init=init_norm, bias=init_norm, activation=Rectlin()) ] layers = [ MergeConcat([path1, path2]), Affine(nout=20, init=init_norm, bias=init_norm, activation=Rectlin()), BatchNorm(), Affine(nout=10, init=init_norm, activation=Logistic(shortcut=True)) ] tmp_save = 'test_model_serialize_tmp_save.pickle' mlp = Model(layers=layers) mlp.optimizer = GradientDescentMomentum(learning_rate=0.1, momentum_coef=0.9) mlp.cost = GeneralizedCost(costfunc=CrossEntropyBinary()) n_test = 3 num_epochs = 3 # Train model for num_epochs and n_test batches for epoch in range(num_epochs): for i, (x, t) in enumerate(train_set): x = mlp.fprop(x) delta = mlp.cost.get_errors(x, t) mlp.bprop(delta) mlp.optimizer.optimize(mlp.layers_to_optimize, epoch=epoch) if i > n_test: break # Get expected outputs of n_test batches and states of all layers outputs_exp = [] pdicts_exp = [l.get_params_serialize() for l in mlp.layers_to_optimize] for i, (x, t) in enumerate(train_set): outputs_exp.append(mlp.fprop(x, inference=True)) if i > n_test: break # Serialize model save_obj(mlp.serialize(keep_states=True), tmp_save) # Load model mlp = Model(layers=layers) mlp.load_weights(tmp_save) outputs = [] pdicts = [l.get_params_serialize() for l in mlp.layers_to_optimize] for i, (x, t) in enumerate(train_set): outputs.append(mlp.fprop(x, inference=True)) if i > n_test: break # Check outputs, states, and params are the same for output, output_exp in zip(outputs, outputs_exp): assert np.allclose(output.get(), output_exp.get()) for pd, pd_exp in zip(pdicts, pdicts_exp): for s, s_e in zip(pd['states'], pd_exp['states']): if isinstance(s, list): # this is the batch norm case for _s, _s_e in zip(s, s_e): assert np.allclose(_s, _s_e) else: assert np.allclose(s, s_e) for p, p_e in zip(pd['params'], pd_exp['params']): if isinstance(p, list): # this is the batch norm case for _p, _p_e in zip(p, p_e): assert np.allclose(_p, _p_e) else: assert np.allclose(p, p_e) os.remove(tmp_save)
class DeepQNetwork: def __init__(self, num_actions, args): # create Neon backend self.be = gen_backend(backend = args.backend, batch_size = args.batch_size, rng_seed = args.random_seed, device_id = args.device_id, default_dtype = np.dtype(args.datatype).type, stochastic_round = args.stochastic_round) # create model layers = self.createLayers(num_actions) self.model = Model(layers = layers) self.cost = GeneralizedCost(costfunc = SumSquared()) self.optimizer = RMSProp(learning_rate = args.learning_rate, decay_rate = args.rmsprop_decay_rate, stochastic_round = args.stochastic_round) # create target model self.target_steps = args.target_steps self.train_iterations = 0 if self.target_steps: self.target_model = Model(layers = self.createLayers(num_actions)) self.save_weights_path = args.save_weights_path else: self.target_model = self.model # remember parameters self.num_actions = num_actions self.batch_size = args.batch_size self.discount_rate = args.discount_rate self.history_length = args.history_length self.screen_dim = (args.screen_height, args.screen_width) self.clip_error = args.clip_error # prepare tensors once and reuse them self.input_shape = (self.history_length,) + self.screen_dim + (self.batch_size,) self.tensor = self.be.empty(self.input_shape) self.tensor.lshape = self.input_shape # needed for convolutional networks self.targets = self.be.empty((self.num_actions, self.batch_size)) self.callback = None def createLayers(self, num_actions): # create network init_norm = Gaussian(loc=0.0, scale=0.01) layers = [] # The first hidden layer convolves 32 filters of 8x8 with stride 4 with the input image and applies a rectifier nonlinearity. layers.append(Conv((8, 8, 32), strides=4, init=init_norm, activation=Rectlin())) # The second hidden layer convolves 64 filters of 4x4 with stride 2, again followed by a rectifier nonlinearity. layers.append(Conv((4, 4, 64), strides=2, init=init_norm, activation=Rectlin())) # This is followed by a third convolutional layer that convolves 64 filters of 3x3 with stride 1 followed by a rectifier. layers.append(Conv((3, 3, 64), strides=1, init=init_norm, activation=Rectlin())) # The final hidden layer is fully-connected and consists of 512 rectifier units. layers.append(Affine(nout=512, init=init_norm, activation=Rectlin())) # The output layer is a fully-connected linear layer with a single output for each valid action. layers.append(Affine(nout = num_actions, init = init_norm)) return layers def setTensor(self, states): # change order of axes to match what Neon expects states = np.transpose(states, axes = (1, 2, 3, 0)) # copy() shouldn't be necessary here, but Neon doesn't work otherwise self.tensor.set(states.copy()) # normalize network input between 0 and 1 self.be.divide(self.tensor, 255, self.tensor) def train(self, minibatch, epoch): # expand components of minibatch prestates, actions, rewards, poststates, terminals = minibatch assert len(prestates.shape) == 4 assert len(poststates.shape) == 4 assert len(actions.shape) == 1 assert len(rewards.shape) == 1 assert len(terminals.shape) == 1 assert prestates.shape == poststates.shape assert prestates.shape[0] == actions.shape[0] == rewards.shape[0] == poststates.shape[0] == terminals.shape[0] if self.target_steps and self.train_iterations % self.target_steps == 0: # HACK: push something through network, so that weights exist self.model.fprop(self.tensor) # HACK: serialize network to disk and read it back to clone filename = os.path.join(self.save_weights_path, "target_network.pkl") save_obj(self.model.serialize(keep_states = False), filename) self.target_model.load_weights(filename) # feed-forward pass for poststates to get Q-values self.setTensor(poststates) postq = self.target_model.fprop(self.tensor, inference = True) assert postq.shape == (self.num_actions, self.batch_size) # calculate max Q-value for each poststate maxpostq = self.be.max(postq, axis=0).asnumpyarray() assert maxpostq.shape == (1, self.batch_size) # feed-forward pass for prestates self.setTensor(prestates) preq = self.model.fprop(self.tensor, inference = False) assert preq.shape == (self.num_actions, self.batch_size) # make copy of prestate Q-values as targets targets = preq.asnumpyarray() # update Q-value targets for actions taken for i, action in enumerate(actions): if terminals[i]: targets[action, i] = float(rewards[i]) else: targets[action, i] = float(rewards[i]) + self.discount_rate * maxpostq[0,i] # copy targets to GPU memory self.targets.set(targets) # calculate errors deltas = self.cost.get_errors(preq, self.targets) assert deltas.shape == (self.num_actions, self.batch_size) #assert np.count_nonzero(deltas.asnumpyarray()) == 32 # calculate cost, just in case cost = self.cost.get_cost(preq, self.targets) assert cost.shape == (1,1) # clip errors if self.clip_error: self.be.clip(deltas, -self.clip_error, self.clip_error, out = deltas) # perform back-propagation of gradients self.model.bprop(deltas) # perform optimization self.optimizer.optimize(self.model.layers_to_optimize, epoch) # increase number of weight updates (needed for target clone interval) self.train_iterations += 1 # calculate statistics if self.callback: self.callback.on_train(cost.asnumpyarray()[0,0]) def predict(self, states): # minibatch is full size, because Neon doesn't let change the minibatch size assert states.shape == ((self.batch_size, self.history_length,) + self.screen_dim) # calculate Q-values for the states self.setTensor(states) qvalues = self.model.fprop(self.tensor, inference = True) assert qvalues.shape == (self.num_actions, self.batch_size) if logger.isEnabledFor(logging.DEBUG): logger.debug("Q-values: " + str(qvalues.asnumpyarray()[:,0])) # find the action with highest q-value actions = self.be.argmax(qvalues, axis = 0) assert actions.shape == (1, self.batch_size) # take only the first result return actions.asnumpyarray()[0,0] def getMeanQ(self, states): assert states.shape == ((self.batch_size, self.history_length,) + self.screen_dim) # calculate Q-values for the states self.setTensor(states) qvalues = self.model.fprop(self.tensor, inference = True) assert qvalues.shape == (self.num_actions, self.batch_size) # take maximum Q-value for each state actions = self.be.max(qvalues, axis = 0) assert actions.astensor().shape == (1, self.batch_size) # calculate mean Q-value of all states meanq = self.be.mean(actions, axis = 1) assert meanq.astensor().shape == (1, 1) # return the mean return meanq.asnumpyarray()[0,0] def load_weights(self, load_path): self.model.load_weights(load_path) def save_weights(self, save_path): save_obj(self.model.serialize(keep_states = True), save_path)
class DeepQNetwork: def __init__(self, num_actions, args): # remember parameters self.num_actions = num_actions self.batch_size = args.batch_size self.discount_rate = args.discount_rate self.history_length = args.history_length self.screen_dim = (args.screen_height, args.screen_width) self.clip_error = args.clip_error # create Neon backend self.be = gen_backend(backend = args.backend, batch_size = args.batch_size, rng_seed = args.random_seed, device_id = args.device_id, datatype = np.dtype(args.datatype).type, stochastic_round = args.stochastic_round) # prepare tensors once and reuse them self.input_shape = (self.history_length,) + self.screen_dim + (self.batch_size,) self.input = self.be.empty(self.input_shape) self.input.lshape = self.input_shape # HACK: needed for convolutional networks self.targets = self.be.empty((self.num_actions, self.batch_size)) # create model layers = self._createLayers(num_actions) self.model = Model(layers = layers) self.cost = GeneralizedCost(costfunc = SumSquared()) # Bug fix for l in self.model.layers.layers: l.parallelism = 'Disabled' self.model.initialize(self.input_shape[:-1], self.cost) if args.optimizer == 'rmsprop': self.optimizer = RMSProp(learning_rate = args.learning_rate, decay_rate = args.decay_rate, stochastic_round = args.stochastic_round) elif args.optimizer == 'adam': self.optimizer = Adam(learning_rate = args.learning_rate, stochastic_round = args.stochastic_round) elif args.optimizer == 'adadelta': self.optimizer = Adadelta(decay = args.decay_rate, stochastic_round = args.stochastic_round) else: assert false, "Unknown optimizer" # create target model self.target_steps = args.target_steps self.train_iterations = 0 if self.target_steps: self.target_model = Model(layers = self._createLayers(num_actions)) # Bug fix for l in self.target_model.layers.layers: l.parallelism = 'Disabled' self.target_model.initialize(self.input_shape[:-1]) self.save_weights_prefix = args.save_weights_prefix else: self.target_model = self.model self.callback = None def _createLayers(self, num_actions): # create network init_norm = Gaussian(loc=0.0, scale=0.01) layers = [] if self.screen_dim == (84, 84): # The first hidden layer convolves 32 filters of 8x8 with stride 4 with the input image and applies a rectifier nonlinearity. logger.info("Using 8x8 filter for first Conv") layers.append(Conv((8, 8, 32), strides=4, init=init_norm, activation=Rectlin())) elif self.screen_dim == (52, 40): # The first hidden layer convolves 32 filters of 5x4 with stride 2 with the input image and applies a rectifier nonlinearity. logger.info("Using 5x4 filter for first Conv") layers.append(Conv((5, 4, 32), strides=2, init=init_norm, activation=Rectlin())) else: raise NotImplementedError("Unsupported screen dim.") # The second hidden layer convolves 64 filters of 4x4 with stride 2, again followed by a rectifier nonlinearity. layers.append(Conv((4, 4, 64), strides=2, init=init_norm, activation=Rectlin())) # This is followed by a third convolutional layer that convolves 64 filters of 3x3 with stride 1 followed by a rectifier. layers.append(Conv((3, 3, 64), strides=1, init=init_norm, activation=Rectlin())) # The final hidden layer is fully-connected and consists of 512 rectifier units. layers.append(Affine(nout=512, init=init_norm, activation=Rectlin())) # The output layer is a fully-connected linear layer with a single output for each valid action. layers.append(Affine(nout=num_actions, init = init_norm)) return layers def _setInput(self, states): # change order of axes to match what Neon expects states = np.transpose(states, axes = (1, 2, 3, 0)) # copy() shouldn't be necessary here, but Neon doesn't work otherwise self.input.set(states.copy()) # normalize network input between 0 and 1 self.be.divide(self.input, 255, self.input) def train(self, minibatch, epoch): # expand components of minibatch prestates, actions, rewards, poststates, terminals = minibatch assert len(prestates.shape) == 4 assert len(poststates.shape) == 4 assert len(actions.shape) == 1 assert len(rewards.shape) == 1 assert len(terminals.shape) == 1 assert prestates.shape == poststates.shape assert prestates.shape[0] == actions.shape[0] == rewards.shape[0] == poststates.shape[0] == terminals.shape[0] if self.target_steps and self.train_iterations % self.target_steps == 0: pdict = self.model.get_description(get_weights=True) self.target_model.deserialize(pdict, load_states=False) # feed-forward pass for poststates to get Q-values self._setInput(poststates) postq = self.target_model.fprop(self.input, inference = True) assert postq.shape == (self.num_actions, self.batch_size) # calculate max Q-value for each poststate maxpostq = self.be.max(postq, axis=0).asnumpyarray() assert maxpostq.shape == (1, self.batch_size) # feed-forward pass for prestates self._setInput(prestates) preq = self.model.fprop(self.input, inference = False) assert preq.shape == (self.num_actions, self.batch_size) # make copy of prestate Q-values as targets targets = preq.asnumpyarray() # update Q-value targets for actions taken for i, action in enumerate(actions): if terminals[i]: targets[action, i] = float(rewards[i]) else: targets[action, i] = float(rewards[i]) + self.discount_rate * maxpostq[0,i] # copy targets to GPU memory self.targets.set(targets) # calculate errors deltas = self.cost.get_errors(preq, self.targets) assert deltas.shape == (self.num_actions, self.batch_size) #assert np.count_nonzero(deltas.asnumpyarray()) == 32 # calculate cost, just in case cost = self.cost.get_cost(preq, self.targets) assert cost.shape == (1,1) # clip errors if self.clip_error: self.be.clip(deltas, -self.clip_error, self.clip_error, out = deltas) # perform back-propagation of gradients self.model.bprop(deltas) # perform optimization self.optimizer.optimize(self.model.layers_to_optimize, epoch) # increase number of weight updates (needed for target clone interval) self.train_iterations += 1 # calculate statistics if self.callback: self.callback.on_train(cost.asnumpyarray()[0,0]) def predict(self, states): # minibatch is full size, because Neon doesn't let change the minibatch size assert states.shape == ((self.batch_size, self.history_length,) + self.screen_dim) # calculate Q-values for the states self._setInput(states) qvalues = self.model.fprop(self.input, inference = True) assert qvalues.shape == (self.num_actions, self.batch_size) if logger.isEnabledFor(logging.DEBUG): logger.debug("Q-values: " + str(qvalues.asnumpyarray()[:,0])) # transpose the result, so that batch size is first dimension return qvalues.T.asnumpyarray() def load_weights(self, load_path): self.model.load_weights(load_path) def save_weights(self, save_path): save_obj(self.model.serialize(keep_states = True), save_path)
class DeepQNetwork: def __init__(self, state_size, num_steers, num_speeds, args): # remember parameters self.state_size = state_size self.num_steers = num_steers self.num_speeds = num_speeds self.num_actions = num_steers + num_speeds self.num_layers = args.hidden_layers self.hidden_nodes = args.hidden_nodes self.batch_size = args.batch_size self.discount_rate = args.discount_rate self.clip_error = args.clip_error # create Neon backend self.be = gen_backend(backend = args.backend, batch_size = args.batch_size, rng_seed = args.random_seed, device_id = args.device_id, datatype = np.dtype(args.datatype).type, stochastic_round = args.stochastic_round) # prepare tensors once and reuse them self.input_shape = (self.state_size, self.batch_size) self.input = self.be.empty(self.input_shape) self.targets = self.be.empty((self.num_actions, self.batch_size)) # create model self.model = Model(layers = self._createLayers()) self.cost = GeneralizedCost(costfunc = SumSquared()) self.model.initialize(self.input_shape[:-1], self.cost) if args.optimizer == 'rmsprop': self.optimizer = RMSProp(learning_rate = args.learning_rate, decay_rate = args.decay_rate, stochastic_round = args.stochastic_round) elif args.optimizer == 'adam': self.optimizer = Adam(learning_rate = args.learning_rate, stochastic_round = args.stochastic_round) elif args.optimizer == 'adadelta': self.optimizer = Adadelta(decay = args.decay_rate, stochastic_round = args.stochastic_round) else: assert false, "Unknown optimizer" # create target model self.target_steps = args.target_steps self.train_iterations = 0 if self.target_steps: self.target_model = Model(layers = self._createLayers()) self.target_model.initialize(self.input_shape[:-1]) self.save_weights_prefix = args.save_weights_prefix else: self.target_model = self.model def _createLayers(self): # create network init_norm = Gaussian(loc=0.0, scale=0.01) layers = [] for i in xrange(self.num_layers): layers.append(Affine(nout=self.hidden_nodes, init=init_norm, activation=Rectlin())) layers.append(Affine(nout=self.num_actions, init = init_norm)) return layers def _setInput(self, states): # change order of axes to match what Neon expects states = np.transpose(states) # copy() shouldn't be necessary here, but Neon doesn't work otherwise self.input.set(states.copy()) # normalize network input between 0 and 1 #self.be.divide(self.input, 200, self.input) def train(self, minibatch, epoch = 0): # expand components of minibatch prestates, steers, speeds, rewards, poststates, terminals = minibatch assert len(prestates.shape) == 2 assert len(poststates.shape) == 2 assert len(steers.shape) == 1 assert len(speeds.shape) == 1 assert len(rewards.shape) == 1 assert len(terminals.shape) == 1 assert prestates.shape == poststates.shape assert prestates.shape[0] == steers.shape[0] == speeds.shape[0] == rewards.shape[0] == poststates.shape[0] == terminals.shape[0] if self.target_steps and self.train_iterations % self.target_steps == 0: # HACK: serialize network to disk and read it back to clone filename = self.save_weights_prefix + "_target.pkl" save_obj(self.model.serialize(keep_states = False), filename) self.target_model.load_weights(filename) # feed-forward pass for poststates to get Q-values self._setInput(poststates) postq = self.target_model.fprop(self.input, inference = True) assert postq.shape == (self.num_actions, self.batch_size) # calculate max Q-value for each poststate postq = postq.asnumpyarray() maxsteerq = np.max(postq[:self.num_steers,:], axis=0) assert maxsteerq.shape == (self.batch_size,), "size: %s" % str(maxsteerq.shape) maxspeedq = np.max(postq[-self.num_speeds:,:], axis=0) assert maxspeedq.shape == (self.batch_size,) # feed-forward pass for prestates self._setInput(prestates) preq = self.model.fprop(self.input, inference = False) assert preq.shape == (self.num_actions, self.batch_size) # make copy of prestate Q-values as targets # HACK: copy() was needed to make it work on CPU targets = preq.asnumpyarray().copy() # update Q-value targets for actions taken for i, (steer, speed) in enumerate(zip(steers, speeds)): if terminals[i]: targets[steer, i] = float(rewards[i]) targets[self.num_steers + speed, i] = float(rewards[i]) else: targets[steer, i] = float(rewards[i]) + self.discount_rate * maxsteerq[i] targets[self.num_steers + speed, i] = float(rewards[i]) + self.discount_rate * maxspeedq[i] # copy targets to GPU memory self.targets.set(targets) # calculate errors deltas = self.cost.get_errors(preq, self.targets) assert deltas.shape == (self.num_actions, self.batch_size) #assert np.count_nonzero(deltas.asnumpyarray()) == 2 * self.batch_size, str(np.count_nonzero(deltas.asnumpyarray())) # calculate cost, just in case cost = self.cost.get_cost(preq, self.targets) assert cost.shape == (1,1) #print "cost:", cost.asnumpyarray() # clip errors if self.clip_error: self.be.clip(deltas, -self.clip_error, self.clip_error, out = deltas) # perform back-propagation of gradients self.model.bprop(deltas) # perform optimization self.optimizer.optimize(self.model.layers_to_optimize, epoch) ''' if np.any(rewards < 0): preqq = preq.asnumpyarray().copy() self._setInput(prestates) qvalues = self.model.fprop(self.input, inference = True).asnumpyarray().copy() indexes = rewards < 0 print "indexes:", indexes print "preq:", preqq[:, indexes].T print "preq':", qvalues[:, indexes].T print "diff:", (qvalues[:, indexes]-preqq[:, indexes]).T print "steers:", steers[indexes] print "speeds:", speeds[indexes] print "rewards:", rewards[indexes] print "terminals:", terminals[indexes] print "preq[0]:", preqq[:, 0] print "preq[0]':", qvalues[:, 0] print "diff:", qvalues[:, 0] - preqq[:, 0] print "deltas:", deltas.asnumpyarray()[:, indexes].T raw_input("Press Enter to continue...") ''' # increase number of weight updates (needed for target clone interval) self.train_iterations += 1 def predict(self, states): # minibatch is full size, because Neon doesn't let change the minibatch size assert states.shape == (self.batch_size, self.state_size) # calculate Q-values for the states self._setInput(states) qvalues = self.model.fprop(self.input, inference = True) assert qvalues.shape == (self.num_actions, self.batch_size) if logger.isEnabledFor(logging.DEBUG): logger.debug("Q-values: " + str(qvalues.asnumpyarray()[:,0])) # transpose the result, so that batch size is first dimension return qvalues.T.asnumpyarray() def load_weights(self, load_path): self.model.load_weights(load_path) def save_weights(self, save_path): save_obj(self.model.serialize(keep_states = True), save_path)
class DeepQNetwork: def __init__(self, state_size, num_actions, args): # remember parameters self.state_size = state_size self.num_actions = num_actions self.batch_size = args.batch_size self.discount_rate = args.discount_rate self.clip_error = args.clip_error self.action_count = np.zeros(21) # create Neon backend self.be = gen_backend(backend = args.backend, batch_size = args.batch_size, rng_seed = args.random_seed, device_id = args.device_id, datatype = np.dtype(args.datatype).type, stochastic_round = args.stochastic_round) # prepare tensors once and reuse them self.input_shape = (self.state_size, self.batch_size) self.input = self.be.empty(self.input_shape) self.targets = self.be.empty((self.num_actions, self.batch_size)) # create model layers = self._createLayers(num_actions) self.model = Model(layers = layers) self.cost = GeneralizedCost(costfunc = SumSquared()) self.model.initialize(self.input_shape[:-1], self.cost) if args.optimizer == 'rmsprop': self.optimizer = RMSProp(learning_rate = args.learning_rate, decay_rate = args.decay_rate, stochastic_round = args.stochastic_round) elif args.optimizer == 'adam': self.optimizer = Adam(learning_rate = args.learning_rate, stochastic_round = args.stochastic_round) elif args.optimizer == 'adadelta': self.optimizer = Adadelta(decay = args.decay_rate, stochastic_round = args.stochastic_round) else: assert False, "Unknown optimizer" # create target model self.target_steps = args.target_steps self.train_iterations = 0 if self.target_steps: self.target_model = Model(layers = self._createLayers(num_actions)) self.target_model.initialize(self.input_shape[:-1]) self.save_weights_prefix = args.save_weights_prefix else: self.target_model = self.model def _createLayers(self, num_actions): # create network init_norm = Gaussian(loc=0.0, scale=0.01) layers = [] # The final hidden layer is fully-connected and consists of 512 rectifier units. layers.append(Affine(nout=64, init=init_norm, bias=init_norm, activation=Rectlin())) # The output layer is a fully-connected linear layer with a single output for each valid action. layers.append(Affine(nout=num_actions, init=init_norm, bias=init_norm)) return layers def _setInput(self, states): # change order of axes to match what Neon expects states = np.transpose(states) # copy() shouldn't be necessary here, but Neon doesn't work otherwise self.input.set(states.copy()) # normalize network input between 0 and 1 # self.be.divide(self.input, 255, self.input) def train(self, minibatch, epoch): # expand components of minibatch prestates, actions, speed_actions, rewards, poststates, terminals = minibatch assert len(prestates.shape) == 2 assert len(poststates.shape) == 2 assert len(actions.shape) == 1 assert len(rewards.shape) == 1 assert len(terminals.shape) == 1 assert prestates.shape == poststates.shape assert prestates.shape[0] == actions.shape[0] == rewards.shape[0] == poststates.shape[0] == terminals.shape[0] #print "WE ARE ACTUALLY TRAINING IN HERE" if self.target_steps and self.train_iterations % self.target_steps == 0: # HACK: serialize network to disk and read it back to clone filename = self.save_weights_prefix + "_target.pkl" save_obj(self.model.serialize(keep_states = False), filename) self.target_model.load_weights(filename) # feed-forward pass for poststates to get Q-values self._setInput(poststates) postq = self.target_model.fprop(self.input, inference = True) assert postq.shape == (self.num_actions, self.batch_size) # calculate max Q-value for each poststate postq = postq.asnumpyarray() maxpostq = np.max(postq, axis=0) #print maxpostq.shape assert maxpostq.shape == (self.batch_size,) # feed-forward pass for prestates self._setInput(prestates) preq = self.model.fprop(self.input, inference = False) assert preq.shape == (self.num_actions, self.batch_size) # make copy of prestate Q-values as targets targets = preq.asnumpyarray().copy() # update Q-value targets for actions taken for i, action in enumerate(actions): self.action_count[action] += 1 if terminals[i]: targets[action, i] = float(rewards[i]) if rewards[i] == -1000: print "######################### action ", action, "should never be sampled again" print "sampled_terminal" else: targets[action, i] = float(rewards[i]) + self.discount_rate * maxpostq[i] #targets[i,action] = float(rewards[i]) + self.discount_rate * maxpostq[i] #print "action count", self.action_count # copy targets to GPU memory self.targets.set(targets) # calculate errors deltas = self.cost.get_errors(preq, self.targets) assert deltas.shape == (self.num_actions, self.batch_size) #assert np.count_nonzero(deltas.asnumpyarray()) == 32 print "nonzero deltas", np.count_nonzero(deltas.asnumpyarray()) # calculate cost, just in case cost = self.cost.get_cost(preq, self.targets) assert cost.shape == (1,1) print "cost:", cost.asnumpyarray() # clip errors #if self.clip_error: # self.be.clip(deltas, -self.clip_error, self.clip_error, out = deltas) # perform back-propagation of gradients self.model.bprop(deltas) # perform optimization self.optimizer.optimize(self.model.layers_to_optimize, epoch) # increase number of weight updates (needed for target clone interval) self.train_iterations += 1 def predict(self, states): # minibatch is full size, because Neon doesn't let change the minibatch size assert states.shape == (self.batch_size, self.state_size) # calculate Q-values for the states self._setInput(states) qvalues = self.model.fprop(self.input, inference = True) assert qvalues.shape == (self.num_actions, self.batch_size) if logger.isEnabledFor(logging.DEBUG): logger.debug("Q-values: " + str(qvalues.asnumpyarray()[:,0])) # transpose the result, so that batch size is first dimension return qvalues.T.asnumpyarray() def load_weights(self, load_path): self.model.load_weights(load_path) def save_weights(self, save_path): save_obj(self.model.serialize(keep_states = True), save_path)