def memory_stat_read_write():
    counter = mx.symbol.Variable('counter')
    visiting_timestamp = mx.symbol.Variable('visiting_timestamp')
    control_flag = mx.symbol.Variable('control_flag')
    memory_write_control_op = MemoryStatUpdateOp(mode='write')
    memory_read_control_op = MemoryStatUpdateOp(mode='read')
    controlling_stats_afterwrite = memory_write_control_op(counter=counter, visiting_timestamp=visiting_timestamp,
                                                           control_flag=control_flag)
    controlling_stats_afterread = memory_read_control_op(counter=counter, visiting_timestamp=visiting_timestamp,
                                                         control_flag=control_flag)
    data_shapes = {'counter': (1, 4), 'visiting_timestamp': (1, 4), 'control_flag': (1,)}
    write_net = Base(sym=controlling_stats_afterwrite, data_shapes=data_shapes, name='write_net')
    read_net = Base(sym=controlling_stats_afterread, data_shapes=data_shapes, name='read_net')
    current_counter = numpy.array([[10, 20, 3, 40]])
    current_visiting_timestamp = numpy.array([[1, 3, 2, 4]])
    for i in range(100):
        write_outputs = write_net.forward(data_shapes=data_shapes, counter=current_counter,
                                          visiting_timestamp=current_visiting_timestamp,
                                          control_flag=numpy.array([i % 3, ]))
        write_net.backward(data_shapes=data_shapes)
        current_counter = write_outputs[0].asnumpy()
        current_visiting_timestamp = write_outputs[1].asnumpy()
        print 'Control Flag:', i % 3, 'Counter:', current_counter, \
            " Visiting Timestamp:", current_visiting_timestamp, "Flags:", write_outputs[2].asnumpy()
        read_outputs = read_net.forward(data_shapes=data_shapes, counter=current_counter,
                                        visiting_timestamp=current_visiting_timestamp,
                                        control_flag=numpy.array([i % 4, ]))
        read_net.backward(data_shapes=data_shapes)
        current_counter = read_outputs[0].asnumpy()
        current_visiting_timestamp = read_outputs[1].asnumpy()
        print 'Control Flag:', i % 4, 'Counter:', current_counter, \
            " Visiting Timestamp:", current_visiting_timestamp
        ch = raw_input()
def memory_update_test():
    memory = mx.symbol.Variable('memory')
    update = mx.symbol.Variable('update')
    flag = mx.symbol.Variable('flag')
    update_factor = mx.symbol.Variable('update_factor')
    output = mx.symbol.MemoryUpdate(data=memory, update=update, flag=flag, factor=update_factor)
    output2 = mx.symbol.MemoryUpdate(data=output, update=update, flag=flag, factor=update_factor)
    output2 = mx.symbol.BlockGrad(data=output2)
    data_shapes = {'memory': (5, 3, 2, 2), 'update': (1, 3, 2, 2), 'flag': (5,), 'update_factor': (1,)}
    net = Base(sym=output2, data_shapes=data_shapes)
    memory_npy = numpy.zeros((5, 3, 2, 2), dtype=numpy.float32)
    update_npy = numpy.zeros((1, 3, 2, 2), dtype=numpy.float32)
    flag_npy = numpy.zeros((5,), dtype=numpy.float32)
    update_factor_npy = numpy.array([0.8, ])
    for i in range(5):
        memory_npy[i, :, :, :] = 2 * i + 1
    flag_npy[1] = 1
    output_npy = net.forward(data_shapes=data_shapes, memory=memory_npy, update=update_npy,
                             flag=flag_npy, update_factor=update_factor_npy)[0].asnumpy()
    net.backward(data_shapes=data_shapes, memory=memory_npy, update=update_npy,
                 flag=flag_npy, update_factor=update_factor_npy)
    print memory_npy
    print output_npy
def test_logsoftmax():
    var = mx.symbol.Variable('var')
    data = mx.symbol.Variable('data')
    net = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=10)
    net = mx.symbol.Activation(data=net, name='relu1', act_type='relu')
    net = mx.symbol.FullyConnected(data=net, name='fc2', num_hidden=4)
    net = mx.symbol.Custom(data=net, name='policy', op_type='LogSoftmaxPolicy')
    ctx = mx.gpu()
    minibatch_size = 100
    data_shapes = {'data': (minibatch_size, 4), 'policy_score': (minibatch_size,)}
    qnet = Base(data_shapes=data_shapes, sym_gen=net, name='PolicyNet',
                initializer=mx.initializer.Xavier(factor_type="in", magnitude=1.0), ctx=ctx)
    print qnet.internal_sym_names
    lr = 0.00001
    optimizer = mx.optimizer.create(name='sgd', learning_rate=0.00001, clip_gradient=None, rescale_grad=1.0, wd=0.)
    updater = mx.optimizer.get_updater(optimizer)
    total_iter = 1000000
    stats = numpy.zeros((total_iter, 3), dtype=numpy.float32)
    plt.ion()
    fig, ax = plt.subplots()
    lines, = ax.plot([], [])
    ax.set_autoscaley_on(True)
    baseline = 0
    for i in range(total_iter):
        data = numpy.random.randn(minibatch_size, 4)
        outputs = qnet.forward(is_train=True, data=data)
        action = outputs[0].asnumpy()
        prob = outputs[1].asnumpy()
        # print 'data=', data, 'action=', action, 'prob=', prob
        # ch = raw_input()
        score = simple_game_discrete(data, action)
        baseline = baseline - 0.001 * (baseline - score.mean())
        print 'score=', score.mean(), 'acc=', numpy.sum(action == numpy.argmax(data * data, axis=1)).mean(), \
            'baseline=', baseline
        stats[i] = [score.mean(), numpy.sum(action == numpy.argmax(data * data, axis=1)).mean(), baseline]
        qnet.backward(policy_score=score - baseline)
        qnet.update(updater)
        update_line(lines, fig, ax, i, score.mean())  # numpy.square(means - data*data).mean())
def sum_channel_test():
    data = mx.symbol.Variable('data')
    summed_data = mx.symbol.SumChannel(data=data, dim=3)
    summed_data = mx.symbol.BlockGrad(data=summed_data)
    data_shapes = {'data': (10, 9, 8, 7)}
    net = Base(sym=summed_data, data_shapes=data_shapes)
    data_npy = numpy.ones((10, 9, 8, 7))
    output_npy = net.forward(data_shapes=data_shapes, data=data_npy)[0].asnumpy()
    net.backward(data_shapes=data_shapes, data=data_npy)
    print output_npy
    print output_npy.shape
def complex_conjugate():
    data = mx.symbol.Variable('data')
    conjugate = mx.symbol.Conjugate(data=data)
    conjugate = mx.symbol.BlockGrad(data=conjugate)
    data_shapes = {'data': (1, 1, 4, 4)}
    net = Base(sym=conjugate, data_shapes=data_shapes)
    data_npy = numpy.ones((1, 1, 4, 4))
    output_npy = net.forward(data_shapes=data_shapes, data=data_npy)[0].asnumpy()
    net.backward(data_shapes=data_shapes, data=data_npy)
    print output_npy
    print output_npy.shape
def broadcast_channel():
    data = mx.symbol.Variable('data')
    broadcast = mx.symbol.BroadcastChannel(data=data, dim=0, size=10)
    broadcast = mx.symbol.BlockGrad(data=broadcast)
    data_shapes = {'data': (1, 1, 4, 4)}
    net = Base(sym=broadcast, data_shapes=data_shapes)
    data_npy = numpy.random.rand(1, 1, 4, 4)
    output_npy = net.forward(data_shapes=data_shapes, data=data_npy)[0].asnumpy()
    net.backward(data_shapes=data_shapes, data=data_npy)
    print output_npy
    print output_npy.shape
def complex_hadamard_test():
    ldata = mx.symbol.Variable('ldata')
    rdata = mx.symbol.Variable('rdata')
    product = mx.symbol.ComplexHadamard(ldata=ldata, rdata=rdata)
    product = mx.symbol.BlockGrad(data=product)
    data_shapes = {'ldata': (1, 1, 4, 4), 'rdata': (1, 1, 4, 4)}
    net = Base(sym=product, data_shapes=data_shapes)
    ldata_npy = numpy.ones((1, 1, 4, 4))
    rdata_npy = numpy.ones((1, 1, 4, 4))
    ldata_npy[0, 0, 0, 0] = 2
    rdata_npy[0, 0, 1, 0] = -1
    output_npy = net.forward(data_shapes=data_shapes, ldata=ldata_npy, rdata=rdata_npy)[0].asnumpy()
    net.backward(data_shapes=data_shapes, ldata=ldata_npy, rdata=rdata_npy)
    print output_npy
    print output_npy.shape
def test_mxnet_conj():
    a = mx.symbol.Variable('a')
    b = mx.symbol.conj(a)
    base_shape = (2, 10)
    data_shapes = {'a': base_shape}
    a_npy = numpy.random.rand(*base_shape)
    out_grad_npy = numpy.random.rand(*base_shape)
    net = Base(sym=b, data_shapes=data_shapes)
    outputs = net.forward(is_train=True, a=a_npy)
    print 'conj:'
    print numpy.square(outputs[0].asnumpy()[:, ::2] - a_npy[:, ::2]).sum()
    print numpy.square(outputs[0].asnumpy()[:, 1::2] + a_npy[:, 1::2]).sum()
    net.backward(out_grads=[nd.array(out_grad_npy, ctx=mx.gpu())])
    print numpy.square(net.exe.grad_dict['a'].asnumpy()[:, ::2] - out_grad_npy[:, ::2]).sum()
    print numpy.square(net.exe.grad_dict['a'].asnumpy()[:, 1::2] + out_grad_npy[:, 1::2]).sum()
def memory_choose_test():
    memory = mx.symbol.Variable('memory')
    index = mx.symbol.Variable('index')
    chosen_unit = mx.symbol.MemoryChoose(data=memory, index=index)
    chosen_unit = mx.symbol.BlockGrad(data=chosen_unit)
    data_shapes = {'memory': (5, 4, 3, 3), 'index': (1,)}
    net = Base(sym=chosen_unit, data_shapes=data_shapes)
    memory_npy = numpy.zeros((5, 4, 3, 3), dtype=numpy.float32)
    for i in range(5):
        memory_npy[i, :, :, :] = i
    index_npy = numpy.array([3], dtype=numpy.float32)
    print net.internal_sym_names
    output = net.forward(data_shapes=data_shapes, memory=memory_npy, index=index_npy)[0].asnumpy()
    net.backward(data_shapes=data_shapes, memory=memory_npy, index=index_npy)
    print output
    print output.shape
def main():
    parser = argparse.ArgumentParser(description='Script to test the trained network on a game.')
    parser.add_argument('-r', '--rom', required=False, type=str, default=os.path.join('arena', 'games', 'roms', 'breakout.bin'), help='Path of the ROM File.')
    parser.add_argument('-v', '--visualization', required=False, type=int, default=0, help='Visualize the runs.')
    parser.add_argument('--lr', required=False, type=float, default=0.01, help='Learning rate of the AdaGrad optimizer')
    parser.add_argument('--eps', required=False, type=float, default=0.01, help='Eps of the AdaGrad optimizer')
    parser.add_argument('--clip-gradient', required=False, type=float, default=None, help='Clip threshold of the AdaGrad optimizer')
    parser.add_argument('--double-q', required=False, type=bool, default=False, help='Use Double DQN')
    parser.add_argument('--wd', required=False, type=float, default=0.0, help='Weight of the L2 Regularizer')
    parser.add_argument('-c', '--ctx', required=False, type=str, default='gpu', help='Running Context. E.g `-c gpu` or `-c gpu1` or `-c cpu`')
    parser.add_argument('-d', '--dir-path', required=False, type=str, default='', help='Saving directory of model files.')
    parser.add_argument('--start-eps', required=False, type=float, default=1.0, help='Eps of the epsilon-greedy policy at the beginning')
    parser.add_argument('--replay-start-size', required=False, type=int, default=50000, help='The step that the training starts')
    parser.add_argument('--kvstore-update-period', required=False, type=int, default=1, help='The period that the worker updates the parameters from the server')
    parser.add_argument('--kv-type', required=False, type=str, default=None, help='type of kvstore, default will not use kvstore, could also be dist_async')
    parser.add_argument('--optimizer', required=False, type=str, default="adagrad", help='type of optimizer')
    args = parser.parse_args()
    if args.dir_path == '':
        rom_name = os.path.splitext(os.path.basename(args.rom))[0]
        args.dir_path = 'dqn-%s-lr%g' % (rom_name, args.lr)
    replay_start_size = args.replay_start_size
    max_start_nullops = 30
    replay_memory_size = 1000000
    history_length = 4
    rows = 84
    cols = 84
    ctx = parse_ctx(args.ctx)
    q_ctx = mx.Context(*ctx[0])
    game = AtariGame(rom_path=args.rom, resize_mode='scale', replay_start_size=replay_start_size,
                     resized_rows=rows, resized_cols=cols, max_null_op=max_start_nullops,
                     replay_memory_size=replay_memory_size, display_screen=args.visualization,
                     history_length=history_length)

    ##RUN NATURE
    freeze_interval = 10000
    epoch_num = 200
    steps_per_epoch = 250000
    update_interval = 4
    discount = 0.99
    eps_start = args.start_eps
    eps_min = 0.1
    eps_decay = (eps_start - eps_min) / 1000000
    eps_curr = eps_start
    freeze_interval /= update_interval
    minibatch_size = 32
    action_num = len(game.action_set)
    data_shapes = {'data': (minibatch_size, history_length) + (rows, cols),
                   'dqn_action': (minibatch_size,), 'dqn_reward': (minibatch_size,)}
    dqn_sym = dqn_sym_nature(action_num)
    qnet = Base(data_shapes=data_shapes, sym_gen=dqn_sym, name='QNet',
                initializer=DQNInitializer(factor_type="in"), ctx=q_ctx)
    target_qnet = qnet.copy(name="TargetQNet", ctx=q_ctx)
    use_easgd = False
    if args.optimizer != "easgd":
        optimizer = mx.optimizer.create(name=args.optimizer, learning_rate=args.lr, eps=args.eps,
                                        clip_gradient=args.clip_gradient, rescale_grad=1.0, wd=args.wd)
    else:
        use_easgd = True
        easgd_beta = 0.9
        easgd_p = 4
        easgd_alpha = easgd_beta / (args.kvstore_update_period * easgd_p)
        server_optimizer = mx.optimizer.create(name="ServerEASGD", learning_rate=easgd_alpha)
        easgd_eta = 0.00025
        local_optimizer = mx.optimizer.create(name='adagrad', learning_rate=args.lr, eps=args.eps,
                                              clip_gradient=args.clip_gradient, rescale_grad=1.0, wd=args.wd)
        central_weight = OrderedDict([(n, nd.zeros(v.shape, ctx=q_ctx)) for n, v in qnet.params.items()])
    # Create KVStore
    if args.kv_type != None:
        kv = kvstore.create(args.kv_type)
        # Initialize KVStore
        for idx, v in enumerate(qnet.params.values()):
            kv.init(idx, v)
        # Set Server optimizer on KVStore
        if not use_easgd:
            kv.set_optimizer(optimizer)
        else:
            kv.set_optimizer(server_optimizer)
            local_updater = mx.optimizer.get_updater(local_optimizer)
        kvstore_update_period = args.kvstore_update_period
        args.dir_path = args.dir_path + "-" + str(kv.rank)
    else:
        updater = mx.optimizer.get_updater(optimizer)
    qnet.print_stat()
    target_qnet.print_stat()

    # Begin Playing Game
    training_steps = 0
    total_steps = 0
    for epoch in xrange(epoch_num):
        # Run Epoch
        steps_left = steps_per_epoch
        episode = 0
        epoch_reward = 0
        start = time.time()
        game.start()
        while steps_left > 0:
            # Running New Episode
            episode += 1
            episode_loss = 0.0
            episode_q_value = 0.0
            episode_update_step = 0
            episode_action_step = 0
            time_episode_start = time.time()
            game.begin_episode(steps_left)
            while not game.episode_terminate:
                # 1. We need to choose a new action based on the current game status
                if game.state_enabled and game.replay_memory.sample_enabled:
                    do_exploration = (npy_rng.rand() < eps_curr)
                    eps_curr = max(eps_curr - eps_decay, eps_min)
                    if do_exploration:
                        action = npy_rng.randint(action_num)
                    else:
                        # TODO Here we can in fact play multiple gaming instances simultaneously and make actions for each
                        #      We can simply stack the current_state() of gaming instances and give prediction for all of them
                        #      We need to wait after calling calc_score(.), which makes the program slow
                        # TODO Profiling the speed of this part!
                        current_state = game.current_state()
                        state = nd.array(current_state.reshape((1,) + current_state.shape), ctx=q_ctx) / float(255.0)
                        qval_npy = qnet.forward(is_train=False, data=state)[0].asnumpy()
                        action = numpy.argmax(qval_npy)
                        episode_q_value += qval_npy[0, action]
                        episode_action_step += 1
                else:
                    action = npy_rng.randint(action_num)
                # 2. Play the game for a single mega-step (Inside the game, the action may be repeated for several times)
                game.play(action)
                total_steps += 1
                # 3. Update our Q network if we can start sampling from the replay memory
                #    Also, we update every `update_interval`
                if total_steps % update_interval == 0 and game.replay_memory.sample_enabled:
                    # 3.1 Draw sample from the replay_memory
                    training_steps += 1
                    episode_update_step += 1
                    states, actions, rewards, next_states, terminate_flags \
                        = game.replay_memory.sample(batch_size=minibatch_size)
                    states = nd.array(states, ctx=q_ctx) / float(255.0)
                    next_states = nd.array(next_states, ctx=q_ctx) / float(255.0)
                    actions = nd.array(actions, ctx=q_ctx)
                    rewards = nd.array(rewards, ctx=q_ctx)
                    terminate_flags = nd.array(terminate_flags, ctx=q_ctx)
                    # 3.2 Use the target network to compute the scores and
                    #     get the corresponding target rewards
                    if not args.double_q:
                        target_qval = target_qnet.forward(is_train=False, data=next_states)[0]
                        target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(target_qval)) \
                                                   * (1.0 - terminate_flags) * discount
                    else:
                        target_qval = target_qnet.forward(is_train=False, data=next_states)[0]
                        qval = qnet.forward(is_train=False, data=next_states)[0]
                        target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(qval)) \
                                                   * (1.0 - terminate_flags) * discount
                    outputs = qnet.forward(is_train=True, data=states, dqn_action=actions, dqn_reward=target_rewards)
                    qnet.backward()
                    if args.kv_type != None:
                        if use_easgd:
                            if total_steps % kvstore_update_period == 0:
                                for ind, k in enumerate(qnet.params.keys()):
                                    kv.pull(ind, central_weight[k], priority=-ind)
                                    qnet.params[k][:] -= easgd_alpha * (qnet.params[k] - central_weight[k])
                                    kv.push(ind, qnet.params[k], priority=-ind)
                            qnet.update(updater=local_updater)
                        else:
                            update_on_kvstore(kv, qnet.params, qnet.params_grad)
                    else:
                        qnet.update(updater=updater)
                    # 3.3 Calculate Loss
                    diff = nd.abs(nd.choose_element_0index(outputs[0], actions) - target_rewards)
                    quadratic_part = nd.clip(diff, -1, 1)
                    loss = 0.5 * nd.sum(nd.square(quadratic_part)).asnumpy()[0] + \
                        nd.sum(diff - quadratic_part).asnumpy()[0]
                    episode_loss += loss
                    # 3.3 Update the target network every freeze_interval
                    #     (We can do annealing instead of hard copy)
                    if training_steps % freeze_interval == 0:
                        qnet.copy_params_to(target_qnet)
            steps_left -= game.episode_step
            time_episode_end = time.time()
            # Update the statistics
            epoch_reward += game.episode_reward
            if args.kv_type != None:
                info_str = "Node[%d]: " % kv.rank
            else:
                info_str = ""
            info_str += "Epoch:%d, Episode:%d, Steps Left:%d/%d, Reward:%f, fps:%f, Exploration:%f" \
                        % (epoch, episode, steps_left, steps_per_epoch, game.episode_reward,
                           game.episode_step / (time_episode_end - time_episode_start), eps_curr)
            if episode_update_step > 0:
                info_str += ", Avg Loss:%f/%d" % (episode_loss / episode_update_step, episode_update_step)
            if episode_action_step > 0:
                info_str += ", Avg Q Value:%f/%d" % (episode_q_value / episode_action_step, episode_action_step)
            logging.info(info_str)
        end = time.time()
        fps = steps_per_epoch / (end - start)
        qnet.save_params(dir_path=args.dir_path, epoch=epoch)
        if args.kv_type is not None:
            logging.info("Node[%d]: Epoch:%d, FPS:%f, Avg Reward: %f/%d"
                         % (kv.rank, epoch, fps, epoch_reward / float(episode), episode))
        else:
            logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d"
                         % (epoch, fps, epoch_reward / float(episode), episode))
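# The loss logged in step 3.3 above is the clipped (Huber-style) TD loss from the Nature DQN setup:
# quadratic for |TD error| <= 1 and linear beyond that. A minimal NumPy sketch of the same
# computation, kept separate from the training code purely for illustration:
import numpy as np

def clipped_td_loss(q_selected, target_rewards):
    # diff is non-negative, so clipping to [-1, 1] simply caps it at 1
    diff = np.abs(np.asarray(q_selected) - np.asarray(target_rewards))
    quadratic_part = np.clip(diff, -1, 1)
    return 0.5 * np.sum(np.square(quadratic_part)) + np.sum(diff - quadratic_part)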
def test_lognormal():
    var = mx.symbol.Variable('var')
    data = mx.symbol.Variable('data')
    net_mean = mx.symbol.FullyConnected(data=data, name='fc_mean_1', num_hidden=20)
    net_mean = mx.symbol.Activation(data=net_mean, name='fc_mean_relu_1', act_type='relu')
    net_mean = mx.symbol.FullyConnected(data=data, name='fc_mean_2', num_hidden=20)
    net_mean = mx.symbol.Activation(data=net_mean, name='fc_mean_relu_2', act_type='relu')
    net_mean = mx.symbol.FullyConnected(data=net_mean, name='fc_mean_3', num_hidden=10)
    net_var = mx.symbol.FullyConnected(data=data, name='fc_var_1', num_hidden=10)
    net_var = mx.symbol.Activation(data=net_var, name='fc_var_softplus_1', act_type='softrelu')
    net = mx.symbol.Custom(mean=net_mean, var=net_var, name='policy', deterministic=False,
                           entropy_regularization=0.01, op_type='LogNormalPolicy')
    ctx = mx.gpu()
    minibatch_size = 100
    data_shapes = {'data': (minibatch_size, 10), 'policy_score': (minibatch_size,)}  # , 'var': (minibatch_size,)}
    qnet = Base(data_shapes=data_shapes, sym_gen=net, name='PolicyNet',
                initializer=mx.initializer.Xavier(factor_type="in", magnitude=1.0), ctx=ctx)
    print qnet.internal_sym_names
    lr = 0.01
    lr_scheduler = FactorScheduler(1000, 1.0 / 1.5)
    optimizer = mx.optimizer.create(name='sgd', learning_rate=lr,
                                    # momentum=0.9,
                                    clip_gradient=None, lr_scheduler=lr_scheduler, rescale_grad=1.0, wd=0.)
    updater = mx.optimizer.get_updater(optimizer)
    total_iter = 1000000
    stats = numpy.zeros((total_iter, 3), dtype=numpy.float32)
    plt.ion()
    fig, ax = plt.subplots()
    lines, = ax.plot([], [])
    ax.set_autoscaley_on(True)
    baseline = 0
    for i in range(total_iter):
        # for k, v in qnet.params.items():
        #     print k, v.asnumpy()
        data = numpy.random.randn(minibatch_size, 10)
        means = qnet.compute_internal(sym_name="fc_mean_3_output", data=data).asnumpy()
        vars = qnet.compute_internal(sym_name="fc_var_softplus_1_output", data=data).asnumpy()
        outputs = qnet.forward(is_train=True, data=data)  # , var=0.5*numpy.ones((minibatch_size, ))
        action = outputs[0].asnumpy()
        score = simple_game_multimodal(data, action, 1)
        baseline = baseline - 0.01 * (baseline - score.mean())
        print 'score=', score.mean(), 'err=', numpy.square(means - data * data).mean(), \
            'var=', vars.mean(), 'baseline=', baseline
        stats[i] = [score.mean(), numpy.square(means - data * data).mean(), vars.mean()]
        qnet.backward(policy_score=score - baseline)
        norm_clipping(qnet.params_grad, 10)
        qnet.update(updater)
        if i % 10 == 0:
            update_line(lines, fig, ax, i, score.mean())  # numpy.square(means - data*data).mean())
updater = mx.optimizer.get_updater(optimizer)
for i in range(max_iter):
    seqlen, data_in, data_out = gen_data(batch_size=batch_size, data_dim=data_dim,
                                         min_length=min_length, max_length=max_length)
    print(data_in.shape)
    print(seqlen)
    print(data_out.shape)
    outputs = net.forward(is_train=True, bucket_kwargs={'seqlen': seqlen},
                          **{'data': data_in, 'target': data_out})
    net.backward()
    norm_clipping(net.params_grad, 10)
    net.update(updater=updater)
    # for k, v in net.params.items():
    #     print k, nd.norm(v).asnumpy()
    for k, v in net.params_grad.items():
        print(k, nd.norm(v).asnumpy())
    pred = outputs[0].reshape((seqlen, batch_size, data_dim)).asnumpy()
    state_over_time = outputs[1].asnumpy()
    read_weight_over_time = outputs[2].asnumpy()
    write_weight_over_time = outputs[3].asnumpy()
    read_content_over_time = outputs[4].asnumpy()
    erase_signal_over_time = outputs[5].asnumpy()
    add_signal_over_time = outputs[6].asnumpy()
    CV2Vis.display(data=pred[:, 0, :].T, win_name="prediction")
    CV2Vis.display(data=data_out[:, 0, :].T, win_name="target")
actions = np.concatenate([p["actions"] for p in paths])
q_estimations = np.concatenate([p["q_estimations"] for p in paths])
advantages = np.concatenate([p['advantages'] for p in paths])
cur_batch_size = observations.shape[0]
outputs = net.forward(is_train=True, data=observations, var=1. * np.ones((cur_batch_size, 1)))
policy_actions = outputs[0].asnumpy()
critics = outputs[3].asnumpy()
variance = outputs[2].asnumpy()
action_mean = outputs[1].asnumpy()
net.backward(policy_score=advantages, policy_backward_action=actions,
             critic_label=q_estimations.reshape(q_estimations.size,))
for grad in net.params_grad.values():
    grad[:] = grad[:] / cur_batch_size
if args.clip_gradient:
    norm_clipping(net.params_grad, 10)
net.update(updater)
print 'Epoch:%d, Average Return:%f, Max Return:%f, Min Return:%f, Num Traj:%d\n, Mean:%f, Var:%f, Average Baseline:%f' \
    % (itr, np.mean([sum(p["rewards"]) for p in paths]),
       np.max([sum(p["rewards"]) for p in paths]),
       np.min([sum(p["rewards"]) for p in paths]),
       N, action_mean.mean(), variance.mean(), critics.mean())
if args.save_model:
def mog_backward_test(batch_size=5, num_centers=11, sample_dim=33):
    prob = mx.symbol.Variable('prob')
    mean = mx.symbol.Variable('mean')
    var = mx.symbol.Variable('var')
    score = mx.symbol.Variable('score')
    out = mx.symbol.Custom(prob=prob, mean=mean, var=var, score=score, name='policy',
                           op_type='LogMoGPolicy', implicit_backward=False)
    data_shapes = {'prob': (batch_size, num_centers),
                   'mean': (batch_size, num_centers, sample_dim),
                   'var': (batch_size, num_centers, sample_dim),
                   'score': (batch_size,),
                   'policy_backward_action': (batch_size, sample_dim)}
    net = Base(sym=out, data_shapes=data_shapes, ctx=mx.cpu())
    prob_npy = get_numpy_rng().rand(batch_size, num_centers)
    mean_npy = get_numpy_rng().rand(batch_size, num_centers, sample_dim) * 1 + 5
    var_npy = get_numpy_rng().rand(batch_size, num_centers, sample_dim) * 2 + 0.001
    prob_npy = prob_npy / prob_npy.sum(axis=1).reshape(prob_npy.shape[0], 1)
    score_npy = get_numpy_rng().rand(batch_size, )
    sample_npy = get_numpy_rng().rand(batch_size, sample_dim) * 1 + 5
    net.forward(is_train=True, prob=prob_npy, mean=mean_npy, var=var_npy)
    net.backward(score=score_npy, policy_backward_action=sample_npy)

    def fd_grad():
        eps = 1E-8
        base_loglikelihood = logmog(prob=prob_npy, mean=mean_npy, var=var_npy,
                                    score=score_npy, sample=sample_npy)
        fd_prob_grad = numpy.empty(prob_npy.size, dtype=numpy.float32)
        fd_mean_grad = numpy.empty(mean_npy.size, dtype=numpy.float32)
        fd_var_grad = numpy.empty(var_npy.size, dtype=numpy.float32)
        prob_delta = numpy.zeros(prob_npy.size, dtype=numpy.float32)
        mean_delta = numpy.zeros(mean_npy.size, dtype=numpy.float32)
        var_delta = numpy.zeros(var_npy.size, dtype=numpy.float32)
        for i in range(prob_npy.size):
            prob_delta[i] = eps
            fd_prob_grad[i] = (logmog(prob=prob_npy + prob_delta.reshape(prob_npy.shape), mean=mean_npy,
                                      var=var_npy, score=score_npy, sample=sample_npy) - base_loglikelihood) / eps
            prob_delta[i] = 0
        for i in range(mean_npy.size):
            mean_delta[i] = eps
            fd_mean_grad[i] = (logmog(prob=prob_npy, mean=mean_npy + mean_delta.reshape(mean_npy.shape),
                                      var=var_npy, score=score_npy, sample=sample_npy) - base_loglikelihood) / eps
            mean_delta[i] = 0
        for i in range(var_npy.size):
            var_delta[i] = eps
            fd_var_grad[i] = (logmog(prob=prob_npy, mean=mean_npy, var=var_npy + var_delta.reshape(var_npy.shape),
                                     score=score_npy, sample=sample_npy) - base_loglikelihood) / eps
            var_delta[i] = 0
        fd_prob_grad = fd_prob_grad.reshape(prob_npy.shape)
        fd_mean_grad = fd_mean_grad.reshape(mean_npy.shape)
        fd_var_grad = fd_var_grad.reshape(var_npy.shape)
        return fd_prob_grad, fd_mean_grad, fd_var_grad

    fd_prob_grad, fd_mean_grad, fd_var_grad = fd_grad()
    # print 'fd_prob_grad:', fd_prob_grad
    # print 'fd_mean_grad:', fd_mean_grad
    # print 'fd_var_grad:', fd_var_grad
    op_prob_grad = net.executor_pool.inputs_grad_dict.values()[0]['prob'].asnumpy()
    op_mean_grad = net.executor_pool.inputs_grad_dict.values()[0]['mean'].asnumpy()
    op_var_grad = net.executor_pool.inputs_grad_dict.values()[0]['var'].asnumpy()
    # print 'op_prob_grad:', op_prob_grad
    # print 'op_mean_grad:', op_mean_grad
    # print 'op_var_grad:', op_var_grad
    print 'prob_grad_diff:', numpy.square(op_prob_grad - fd_prob_grad).sum()
    print 'mean_grad_diff:', numpy.square(op_mean_grad - fd_mean_grad).sum()
    print 'var_grad_diff:', numpy.square(op_var_grad - fd_var_grad).sum()
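# The check above uses a one-sided (forward) difference with eps=1e-8, which can be noisy for
# float32 accumulations. As a hedged alternative (not part of the original test), a central
# difference is usually more accurate; this generic sketch assumes a scalar function f of a
# flat NumPy parameter array x:
import numpy

def central_fd_grad(f, x, eps=1e-5):
    # (f(x + eps*e_i) - f(x - eps*e_i)) / (2*eps) for every coordinate i
    x = numpy.array(x, dtype=numpy.float64)
    grad = numpy.zeros_like(x)
    flat_x = x.reshape(-1)
    flat_g = grad.reshape(-1)
    for i in range(flat_x.size):
        old = flat_x[i]
        flat_x[i] = old + eps
        f_plus = f(x)
        flat_x[i] = old - eps
        f_minus = f(x)
        flat_x[i] = old
        flat_g[i] = (f_plus - f_minus) / (2 * eps)
    return grad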
def main():
    parser = argparse.ArgumentParser(description='Script to test the trained network on a game.')
    parser.add_argument('-r', '--rom', required=False, type=str, default=os.path.join('arena', 'games', 'roms', 'breakout.bin'), help='Path of the ROM File.')
    parser.add_argument('-v', '--visualization', required=False, type=int, default=0, help='Visualize the runs.')
    parser.add_argument('--lr', required=False, type=float, default=0.01, help='Learning rate of the AdaGrad optimizer')
    parser.add_argument('--eps', required=False, type=float, default=0.01, help='Eps of the AdaGrad optimizer')
    parser.add_argument('--clip-gradient', required=False, type=float, default=None, help='Clip threshold of the AdaGrad optimizer')
    parser.add_argument('--double-q', required=False, type=bool, default=False, help='Use Double DQN')
    parser.add_argument('--wd', required=False, type=float, default=0.0, help='Weight of the L2 Regularizer')
    parser.add_argument('-c', '--ctx', required=False, type=str, default='gpu', help='Running Context. E.g `-c gpu` or `-c gpu1` or `-c cpu`')
    parser.add_argument('-d', '--dir-path', required=False, type=str, default='', help='Saving directory of model files.')
    parser.add_argument('--start-eps', required=False, type=float, default=1.0, help='Eps of the epsilon-greedy policy at the beginning')
    parser.add_argument('--replay-start-size', required=False, type=int, default=50000, help='The step that the training starts')
    parser.add_argument('--kvstore-update-period', required=False, type=int, default=1, help='The period that the worker updates the parameters from the server')
    parser.add_argument('--kv-type', required=False, type=str, default=None, help='type of kvstore, default will not use kvstore, could also be dist_async')
    args, unknown = parser.parse_known_args()
    if args.dir_path == '':
        rom_name = os.path.splitext(os.path.basename(args.rom))[0]
        args.dir_path = 'dqn-%s' % rom_name
    ctx = re.findall('([a-z]+)(\d*)', args.ctx)
    ctx = [(device, int(num)) if len(num) > 0 else (device, 0) for device, num in ctx]
    replay_start_size = args.replay_start_size
    max_start_nullops = 30
    replay_memory_size = 1000000
    history_length = 4
    rows = 84
    cols = 84
    q_ctx = mx.Context(*ctx[0])
    game = AtariGame(rom_path=args.rom, resize_mode='scale', replay_start_size=replay_start_size,
                     resized_rows=rows, resized_cols=cols, max_null_op=max_start_nullops,
                     replay_memory_size=replay_memory_size, display_screen=args.visualization,
                     history_length=history_length)

    ##RUN NATURE
    freeze_interval = 10000
    epoch_num = 200
    steps_per_epoch = 250000
    update_interval = 4
    discount = 0.99
    eps_start = args.start_eps
    eps_min = 0.1
    eps_decay = (eps_start - 0.1) / 1000000
    eps_curr = eps_start
    freeze_interval /= update_interval
    minibatch_size = 32
    action_num = len(game.action_set)
    data_shapes = {'data': (minibatch_size, history_length) + (rows, cols),
                   'dqn_action': (minibatch_size,), 'dqn_reward': (minibatch_size,)}
    # optimizer = mx.optimizer.create(name='sgd', learning_rate=args.lr, wd=args.wd)
    optimizer = mx.optimizer.Nop()
    dqn_output_op = DQNOutputNpyOp()
    dqn_sym = dqn_sym_nature(action_num, dqn_output_op)
    qnet = Base(data_shapes=data_shapes, sym=dqn_sym, name='QNet',
                initializer=DQNInitializer(factor_type="in"), ctx=q_ctx)
    target_qnet = qnet.copy(name="TargetQNet", ctx=q_ctx)
    testShape = (1, 1686180 * 100)
    testParam = nd.ones(testShape, ctx=q_ctx)
    testGrad = nd.zeros(testShape, ctx=q_ctx)
    # Create kvstore
    if args.kv_type != None:
        kvType = args.kv_type
        kvStore = kvstore.create(kvType)
        # Initialize kvstore
        for idx, v in enumerate(qnet.params.values()):
            kvStore.init(idx, v)
        # Set optimizer on kvstore
        kvStore.set_optimizer(optimizer)
        kvstore_update_period = args.kvstore_update_period
    else:
        updater = mx.optimizer.get_updater(optimizer)
    # if args.kv_type != None:
    #     kvType = args.kv_type
    #     kvStore = kvstore.create(kvType)
    #     kvStore.init(0, testParam)
    #     testOptimizer = mx.optimizer.Nop()
    #     kvStore.set_optimizer(testOptimizer)
    #     kvstore_update_period = args.kvstore_update_period
    qnet.print_stat()
    target_qnet.print_stat()

    # Begin Playing Game
    training_steps = 0
    total_steps = 0
    while (1):
        time_before_wait = time.time()
        # kvStore.push(0, testGrad, priority=0)
        # kvStore.pull(0, testParam, priority=0)
        # testParam.wait_to_read()
        for paramIndex in range(len(qnet.params)):  # range(6):
            k = qnet.params.keys()[paramIndex]
            kvStore.push(paramIndex, qnet.params_grad[k], priority=-paramIndex)
            kvStore.pull(paramIndex, qnet.params[k], priority=-paramIndex)
        for v in qnet.params.values():
            v.wait_to_read()
        logging.info("wait time %f" % (time.time() - time_before_wait))
    for epoch in xrange(epoch_num):
        # Run Epoch
        steps_left = steps_per_epoch
        episode = 0
        epoch_reward = 0
        start = time.time()
        game.start()
        while steps_left > 0:
            # Running New Episode
            episode += 1
            episode_loss = 0.0
            episode_q_value = 0.0
            episode_update_step = 0
            episode_action_step = 0
            time_episode_start = time.time()
            game.begin_episode(steps_left)
            while not game.episode_terminate:
                # 1. We need to choose a new action based on the current game status
                if game.state_enabled and game.replay_memory.sample_enabled:
                    do_exploration = (npy_rng.rand() < eps_curr)
                    eps_curr = max(eps_curr - eps_decay, eps_min)
                    if do_exploration:
                        action = npy_rng.randint(action_num)
                    else:
                        # TODO Here we can in fact play multiple gaming instances simultaneously and make actions for each
                        #      We can simply stack the current_state() of gaming instances and give prediction for all of them
                        #      We need to wait after calling calc_score(.), which makes the program slow
                        # TODO Profiling the speed of this part!
                        current_state = game.current_state()
                        state = nd.array(current_state.reshape((1,) + current_state.shape), ctx=q_ctx) / float(255.0)
                        qval_npy = qnet.forward(batch_size=1, data=state)[0].asnumpy()
                        action = numpy.argmax(qval_npy)
                        episode_q_value += qval_npy[0, action]
                        episode_action_step += 1
                else:
                    action = npy_rng.randint(action_num)
                # 2. Play the game for a single mega-step (Inside the game, the action may be repeated for several times)
                game.play(action)
                total_steps += 1
                # 3. Update our Q network if we can start sampling from the replay memory
                #    Also, we update every `update_interval`
                if total_steps % update_interval == 0 and game.replay_memory.sample_enabled:
                    # 3.1 Draw sample from the replay_memory
                    training_steps += 1
                    episode_update_step += 1
                    states, actions, rewards, next_states, terminate_flags \
                        = game.replay_memory.sample(batch_size=minibatch_size)
                    states = nd.array(states, ctx=q_ctx) / float(255.0)
                    next_states = nd.array(next_states, ctx=q_ctx) / float(255.0)
                    actions = nd.array(actions, ctx=q_ctx)
                    rewards = nd.array(rewards, ctx=q_ctx)
                    terminate_flags = nd.array(terminate_flags, ctx=q_ctx)
                    # 3.2 Use the target network to compute the scores and
                    #     get the corresponding target rewards
                    if not args.double_q:
                        target_qval = target_qnet.forward(batch_size=minibatch_size, data=next_states)[0]
                        target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(target_qval)) \
                                                   * (1.0 - terminate_flags) * discount
                    else:
                        target_qval = target_qnet.forward(batch_size=minibatch_size, data=next_states)[0]
                        qval = qnet.forward(batch_size=minibatch_size, data=next_states)[0]
                        target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(qval)) \
                                                   * (1.0 - terminate_flags) * discount
                    outputs = qnet.forward(batch_size=minibatch_size, is_train=True, data=states,
                                           dqn_action=actions, dqn_reward=target_rewards)
                    qnet.backward(batch_size=minibatch_size)
                    nd.waitall()
                    time_before_update = time.time()
                    if args.kv_type != None:
                        if total_steps % kvstore_update_period == 0:
                            update_to_kvstore(kvStore, qnet.params, qnet.params_grad)
                    else:
                        qnet.update(updater=updater)
                    logging.info("update time %f" % (time.time() - time_before_update))
                    time_before_wait = time.time()
                    nd.waitall()
                    logging.info("wait time %f" % (time.time() - time_before_wait))
                    '''nd.waitall()
                    time_before_wait = time.time()
                    kvStore.push(0, testGrad, priority=0)
                    kvStore.pull(0, testParam, priority=0)
                    nd.waitall()
                    logging.info("wait time %f" % (time.time() - time_before_wait))'''
                    # 3.3 Calculate Loss
                    diff = nd.abs(nd.choose_element_0index(outputs[0], actions) - target_rewards)
                    quadratic_part = nd.clip(diff, -1, 1)
                    loss = (0.5 * nd.sum(nd.square(quadratic_part)) + nd.sum(diff - quadratic_part)).asscalar()
                    episode_loss += loss
                    # 3.3 Update the target network every freeze_interval
                    #     (We can do annealing instead of hard copy)
                    if training_steps % freeze_interval == 0:
                        qnet.copy_params_to(target_qnet)
            steps_left -= game.episode_step
            time_episode_end = time.time()
            # Update the statistics
            epoch_reward += game.episode_reward
            info_str = "Epoch:%d, Episode:%d, Steps Left:%d/%d, Reward:%f, fps:%f, Exploration:%f" \
                       % (epoch, episode, steps_left, steps_per_epoch, game.episode_reward,
                          game.episode_step / (time_episode_end - time_episode_start), eps_curr)
            if episode_update_step > 0:
                info_str += ", Avg Loss:%f/%d" % (episode_loss / episode_update_step, episode_update_step)
            if episode_action_step > 0:
                info_str += ", Avg Q Value:%f/%d" % (episode_q_value / episode_action_step, episode_action_step)
            logging.info(info_str)
        end = time.time()
        fps = steps_per_epoch / (end - start)
        qnet.save_params(dir_path=args.dir_path, epoch=epoch)
        logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d" % (epoch, fps, epoch_reward / float(episode), episode))
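# Both training loops above build the regression target the same way; with --double-q the greedy
# action is chosen by the online network but its value is read from the target network. A small
# NumPy sketch of the two target rules, assuming q_target and q_online are (batch, num_actions)
# arrays and terminate_flags is a 0/1 array:
import numpy as np

def dqn_targets(rewards, q_target, terminate_flags, discount, q_online=None):
    if q_online is None:
        # Standard DQN: take the max over the target network's Q-values
        best = np.argmax(q_target, axis=1)
    else:
        # Double DQN: argmax under the online network, value read from the target network
        best = np.argmax(q_online, axis=1)
    next_q = q_target[np.arange(q_target.shape[0]), best]
    return rewards + next_q * (1.0 - terminate_flags) * discount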
def main():
    parser = argparse.ArgumentParser(description='Script to test the trained network on a game.')
    parser.add_argument('-r', '--rom', required=False, type=str, default=os.path.join('arena', 'games', 'roms', 'breakout.bin'), help='Path of the ROM File.')
    parser.add_argument('-v', '--visualization', required=False, type=int, default=0, help='Visualize the runs.')
    parser.add_argument('--lr', required=False, type=float, default=0.01, help='Learning rate of the AdaGrad optimizer')
    parser.add_argument('--eps', required=False, type=float, default=0.01, help='Eps of the AdaGrad optimizer')
    parser.add_argument('--rms-decay', required=False, type=float, default=0.95, help='Decay rate of the RMSProp')
    parser.add_argument('--clip-gradient', required=False, type=float, default=None, help='Clip threshold of the AdaGrad optimizer')
    parser.add_argument('--double-q', required=False, type=bool, default=False, help='Use Double DQN')
    parser.add_argument('--wd', required=False, type=float, default=0.0, help='Weight of the L2 Regularizer')
    parser.add_argument('-c', '--ctx', required=False, type=str, default=None, help='Running Context. E.g `-c gpu` or `-c gpu1` or `-c cpu`')
    parser.add_argument('-d', '--dir-path', required=False, type=str, default='', help='Saving directory of model files.')
    parser.add_argument('--start-eps', required=False, type=float, default=1.0, help='Eps of the epsilon-greedy policy at the beginning')
    parser.add_argument('--replay-start-size', required=False, type=int, default=50000, help='The step that the training starts')
    parser.add_argument('--kvstore-update-period', required=False, type=int, default=16, help='The period that the worker updates the parameters from the server')
    parser.add_argument('--kv-type', required=False, type=str, default=None, help='type of kvstore, default will not use kvstore, could also be dist_async')
    parser.add_argument('--optimizer', required=False, type=str, default="adagrad", help='type of optimizer')
    parser.add_argument('--nactor', required=False, type=int, default=16, help='number of actors')
    parser.add_argument('--exploration-period', required=False, type=int, default=4000000, help='length of annealing of epsilon greedy policy')
    parser.add_argument('--replay-memory-size', required=False, type=int, default=100, help='size of replay memory')
    parser.add_argument('--single-batch-size', required=False, type=int, default=5, help='batch size for every actor')
    parser.add_argument('--symbol', required=False, type=str, default="nature", help='type of network, nature or nips')
    parser.add_argument('--sample-policy', required=False, type=str, default="recent", help='minibatch sampling policy, recent or random')
    parser.add_argument('--epoch-num', required=False, type=int, default=50, help='number of epochs')
    parser.add_argument('--param-update-period', required=False, type=int, default=5, help='Parameter update period')
    parser.add_argument('--resize-mode', required=False, type=str, default="scale", help='Resize mode, scale or crop')
    parser.add_argument('--eps-update-period', required=False, type=int, default=8000, help='eps greedy policy update period')
    parser.add_argument('--server-optimizer', required=False, type=str, default="easgd", help='type of server optimizer')
    parser.add_argument('--nworker', required=False, type=int, default=1, help='number of kv workers')
    parser.add_argument('--easgd-alpha', required=False, type=float, default=0.01, help='easgd alpha')
    args, unknown = parser.parse_known_args()
    logging.info(str(args))
    if args.dir_path == '':
        rom_name = os.path.splitext(os.path.basename(args.rom))[0]
        time_str = time.strftime("%m%d_%H%M_%S", time.localtime())
        args.dir_path = ('dqn-%s-%d_' % (rom_name, int(args.lr * 10 ** 5))) + time_str \
                        + "_" + os.environ.get('DMLC_TASK_ID')
    logging.info("saving to dir: " + args.dir_path)
    if args.ctx == None:
        args.ctx = os.environ.get('CTX')
    logging.info("Context: %s" % args.ctx)
    ctx = re.findall('([a-z]+)(\d*)', args.ctx)
    ctx = [(device, int(num)) if len(num) > 0 else (device, 0) for device, num in ctx]

    # Async version
    nactor = args.nactor
    param_update_period = args.param_update_period
    replay_start_size = args.replay_start_size
    max_start_nullops = 30
    replay_memory_size = args.replay_memory_size
    history_length = 4
    rows = 84
    cols = 84
    q_ctx = mx.Context(*ctx[0])
    games = []
    for g in range(nactor):
        games.append(AtariGame(rom_path=args.rom, resize_mode=args.resize_mode,
                               replay_start_size=replay_start_size, resized_rows=rows, resized_cols=cols,
                               max_null_op=max_start_nullops, replay_memory_size=replay_memory_size,
                               display_screen=args.visualization, history_length=history_length))

    ##RUN NATURE
    freeze_interval = 40000 / nactor
    freeze_interval /= param_update_period
    epoch_num = args.epoch_num
    steps_per_epoch = 4000000 / nactor
    discount = 0.99
    save_screens = False
    eps_start = numpy.ones((3,)) * args.start_eps
    eps_min = numpy.array([0.1, 0.01, 0.5])
    eps_decay = (eps_start - eps_min) / (args.exploration_period / nactor)
    eps_curr = eps_start
    eps_id = numpy.zeros((nactor,))
    eps_update_period = args.eps_update_period
    eps_update_count = numpy.zeros((nactor,))
    single_batch_size = args.single_batch_size
    minibatch_size = nactor * single_batch_size
    action_num = len(games[0].action_set)
    data_shapes = {'data': (minibatch_size, history_length) + (rows, cols),
                   'dqn_action': (minibatch_size,), 'dqn_reward': (minibatch_size,)}
    if args.symbol == "nature":
        dqn_sym = dqn_sym_nature(action_num)
    elif args.symbol == "nips":
        dqn_sym = dqn_sym_nips(action_num)
    else:
        raise NotImplementedError
    qnet = Base(data_shapes=data_shapes, sym=dqn_sym, name='QNet',
                initializer=DQNInitializer(factor_type="in"), ctx=q_ctx)
    target_qnet = qnet.copy(name="TargetQNet", ctx=q_ctx)
    if args.optimizer == "adagrad":
        optimizer = mx.optimizer.create(name=args.optimizer, learning_rate=args.lr, eps=args.eps,
                                        clip_gradient=args.clip_gradient, rescale_grad=1.0, wd=args.wd)
    elif args.optimizer == "rmsprop" or args.optimizer == "rmspropnoncentered":
        optimizer = mx.optimizer.create(name=args.optimizer, learning_rate=args.lr, eps=args.eps,
                                        clip_gradient=args.clip_gradient, gamma1=args.rms_decay, gamma2=0,
                                        rescale_grad=1.0, wd=args.wd)
    lr_decay = (args.lr - 0) / (steps_per_epoch * epoch_num / param_update_period)
    # Create kvstore
    use_easgd = False
    if args.kv_type != None:
        kvType = args.kv_type
        kv = kvstore.create(kvType)
        # Initialize kvstore
        for idx, v in enumerate(qnet.params.values()):
            kv.init(idx, v)
        if args.server_optimizer == "easgd":
            use_easgd = True
            easgd_beta = 0.9
            easgd_alpha = args.easgd_alpha
            server_optimizer = mx.optimizer.create(name="ServerEasgd", learning_rate=easgd_alpha)
            easgd_eta = 0.00025
            central_weight = OrderedDict([(n, v.copyto(q_ctx)) for n, v in qnet.params.items()])
            kv.set_optimizer(server_optimizer)
            updater = mx.optimizer.get_updater(optimizer)
        else:
            kv.set_optimizer(optimizer)
        kvstore_update_period = args.kvstore_update_period
        npy_rng = numpy.random.RandomState(123456 + kv.rank)
    else:
        updater = mx.optimizer.get_updater(optimizer)
    qnet.print_stat()
    target_qnet.print_stat()

    states_buffer_for_act = numpy.zeros((nactor, history_length) + (rows, cols), dtype='uint8')
    states_buffer_for_train = numpy.zeros((minibatch_size, history_length + 1) + (rows, cols), dtype='uint8')
    next_states_buffer_for_train = numpy.zeros((minibatch_size, history_length) + (rows, cols), dtype='uint8')
    actions_buffer_for_train = numpy.zeros((minibatch_size,), dtype='uint8')
    rewards_buffer_for_train = numpy.zeros((minibatch_size,), dtype='float32')
    terminate_flags_buffer_for_train = numpy.zeros((minibatch_size,), dtype='bool')

    # Begin Playing Game
    training_steps = 0
    total_steps = 0
    ave_fps = 0
    ave_loss = 0
    time_for_info = time.time()
    parallel_executor = concurrent.futures.ThreadPoolExecutor(nactor)
    for epoch in xrange(epoch_num):
        # Run Epoch
        steps_left = steps_per_epoch
        episode = 0
        epoch_reward = 0
        start = time.time()
        for g, game in enumerate(games):
            game.start()
            game.begin_episode()
            eps_rand = npy_rng.rand()
            if eps_rand < 0.4:
                eps_id[g] = 0
            elif eps_rand < 0.7:
                eps_id[g] = 1
            else:
                eps_id[g] = 2
        episode_stats = [EpisodeStat() for i in range(len(games))]
        while steps_left > 0:
            for g, game in enumerate(games):
                if game.episode_terminate:
                    episode += 1
                    epoch_reward += game.episode_reward
                    if args.kv_type != None:
                        info_str = "Node[%d]: " % kv.rank
                    else:
                        info_str = ""
                    info_str += "Epoch:%d, Episode:%d, Steps Left:%d/%d, Reward:%f, fps:%f, Exploration:%f" \
                                % (epoch, episode, steps_left, steps_per_epoch, game.episode_reward,
                                   ave_fps, (eps_curr[eps_id[g]]))
                    info_str += ", Avg Loss:%f" % ave_loss
                    if episode_stats[g].episode_action_step > 0:
                        info_str += ", Avg Q Value:%f/%d" % (episode_stats[g].episode_q_value / episode_stats[g].episode_action_step,
                                                             episode_stats[g].episode_action_step)
                    if g == 0:
                        logging.info(info_str)
                    if eps_update_count[g] * eps_update_period < total_steps:
                        eps_rand = npy_rng.rand()
                        if eps_rand < 0.4:
                            eps_id[g] = 0
                        elif eps_rand < 0.7:
                            eps_id[g] = 1
                        else:
                            eps_id[g] = 2
                        eps_update_count[g] += 1
                    game.begin_episode(steps_left)
                    episode_stats[g] = EpisodeStat()
            if total_steps > history_length:
                for g, game in enumerate(games):
                    current_state = game.current_state()
                    states_buffer_for_act[g] = current_state
            states = nd.array(states_buffer_for_act, ctx=q_ctx) / float(255.0)
            qval_npy = qnet.forward(batch_size=nactor, data=states)[0].asnumpy()
            actions_that_max_q = numpy.argmax(qval_npy, axis=1)
            actions = [0] * nactor
            for g, game in enumerate(games):
                # 1. We need to choose a new action based on the current game status
                if games[g].state_enabled and games[g].replay_memory.sample_enabled:
                    do_exploration = (npy_rng.rand() < eps_curr[eps_id[g]])
                    if do_exploration:
                        action = npy_rng.randint(action_num)
                    else:
                        # TODO Here we can in fact play multiple gaming instances simultaneously and make actions for each
                        #      We can simply stack the current_state() of gaming instances and give prediction for all of them
                        #      We need to wait after calling calc_score(.), which makes the program slow
                        # TODO Profiling the speed of this part!
                        action = actions_that_max_q[g]
                        episode_stats[g].episode_q_value += qval_npy[g, action]
                        episode_stats[g].episode_action_step += 1
                else:
                    action = npy_rng.randint(action_num)
                actions[g] = action
            # t0 = time.time()
            for ret in parallel_executor.map(play_game, zip(games, actions)):
                pass
            # t1 = time.time()
            # logging.info("play time: %f" % (t1 - t0))
            eps_curr = numpy.maximum(eps_curr - eps_decay, eps_min)
            total_steps += 1
            steps_left -= 1
            if total_steps % 100 == 0:
                this_time = time.time()
                ave_fps = (100 / (this_time - time_for_info))
                time_for_info = this_time
            # 3. Update our Q network if we can start sampling from the replay memory
            #    Also, we update every `update_interval`
            if total_steps > minibatch_size and \
                    total_steps % (param_update_period) == 0 and \
                    games[-1].replay_memory.sample_enabled:
                if use_easgd and training_steps % kvstore_update_period == 0:
                    for paramIndex in range(len(qnet.params)):
                        k = qnet.params.keys()[paramIndex]
                        kv.pull(paramIndex, central_weight[k], priority=-paramIndex)
                        qnet.params[k][:] -= easgd_alpha * (qnet.params[k] - central_weight[k])
                        kv.push(paramIndex, qnet.params[k], priority=-paramIndex)
                # 3.1 Draw sample from the replay_memory
                for g, game in enumerate(games):
                    episode_stats[g].episode_update_step += 1
                    nsample = single_batch_size
                    i0 = (g * nsample)
                    i1 = (g + 1) * nsample
                    if args.sample_policy == "recent":
                        action, reward, terminate_flag = game.replay_memory.sample_last(
                            batch_size=nsample, states=states_buffer_for_train, offset=i0)
                    elif args.sample_policy == "random":
                        action, reward, terminate_flag = game.replay_memory.sample_inplace(
                            batch_size=nsample, states=states_buffer_for_train, offset=i0)
                    actions_buffer_for_train[i0:i1] = action
                    rewards_buffer_for_train[i0:i1] = reward
                    terminate_flags_buffer_for_train[i0:i1] = terminate_flag
                states = nd.array(states_buffer_for_train[:, :-1], ctx=q_ctx) / float(255.0)
                next_states = nd.array(states_buffer_for_train[:, 1:], ctx=q_ctx) / float(255.0)
                actions = nd.array(actions_buffer_for_train, ctx=q_ctx)
                rewards = nd.array(rewards_buffer_for_train, ctx=q_ctx)
                terminate_flags = nd.array(terminate_flags_buffer_for_train, ctx=q_ctx)
                # 3.2 Use the target network to compute the scores and
                #     get the corresponding target rewards
                if not args.double_q:
                    target_qval = target_qnet.forward(batch_size=minibatch_size, data=next_states)[0]
                    target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(target_qval)) \
                                               * (1.0 - terminate_flags) * discount
                else:
                    target_qval = target_qnet.forward(batch_size=minibatch_size, data=next_states)[0]
                    qval = qnet.forward(batch_size=minibatch_size, data=next_states)[0]
                    target_rewards = rewards + nd.choose_element_0index(target_qval, nd.argmax_channel(qval)) \
                                               * (1.0 - terminate_flags) * discount
                outputs = qnet.forward(batch_size=minibatch_size, is_train=True, data=states,
                                       dqn_action=actions, dqn_reward=target_rewards)
                qnet.backward(batch_size=minibatch_size)
                if args.kv_type is None or use_easgd:
                    qnet.update(updater=updater)
                else:
                    update_on_kvstore(kv, qnet.params, qnet.params_grad)
                # 3.3 Calculate Loss
                diff = nd.abs(nd.choose_element_0index(outputs[0], actions) - target_rewards)
                quadratic_part = nd.clip(diff, -1, 1)
                loss = (0.5 * nd.sum(nd.square(quadratic_part)) + nd.sum(diff - quadratic_part)).asscalar()
                if ave_loss == 0:
                    ave_loss = loss
                else:
                    ave_loss = 0.95 * ave_loss + 0.05 * loss
                # 3.3 Update the target network every freeze_interval
                #     (We can do annealing instead of hard copy)
                if training_steps % freeze_interval == 0:
                    qnet.copy_params_to(target_qnet)
                if args.optimizer == "rmsprop" or args.optimizer == "rmspropnoncentered":
                    optimizer.lr -= lr_decay
                if save_screens and training_steps % (60 * 60 * 2 / param_update_period) == 0:
                    logging.info("saving screenshots")
                    for g in range(nactor):
                        screen = states_buffer_for_train[(g * single_batch_size), -2, :, :].reshape(states_buffer_for_train.shape[2:])
                        cv2.imwrite("screen_" + str(g) + ".png", screen)
                training_steps += 1
        end = time.time()
        fps = steps_per_epoch / (end - start)
        qnet.save_params(dir_path=args.dir_path, epoch=epoch)
        if args.kv_type != None:
            logging.info("Node[%d]: Epoch:%d, FPS:%f, Avg Reward: %f/%d"
                         % (kv.rank, epoch, fps, epoch_reward / float(episode), episode))
        else:
            logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d"
                         % (epoch, fps, epoch_reward / float(episode), episode))
def test_mxnet_binary(test_operation, typ):
    if 'div' == test_operation:
        numpy_outf = complex_div
        numpy_gradf = complex_div_grad
        if typ == 'rc':
            test_sym = mx.symbol.complex_div_rc
        elif typ == 'cc':
            test_sym = mx.symbol.complex_div_cc
        elif typ == 'cr':
            test_sym = mx.symbol.complex_div_cr
    else:
        numpy_outf = complex_mul
        numpy_gradf = complex_mul_grad
        if typ == 'rc':
            test_sym = mx.symbol.complex_mul_rc
        elif typ == 'cc':
            test_sym = mx.symbol.complex_mul_cc
        elif typ == 'cr':
            test_sym = mx.symbol.complex_mul_cr
    a = mx.symbol.Variable('a')
    b = mx.symbol.Variable('b')
    c = test_sym(a, b)
    base_complex_shape = (10, 10, 6)
    base_real_shape = (10, 10, 3)
    if 'cc' == typ:
        data_shapes = {'a': base_complex_shape, 'b': base_complex_shape}
        a_complex_npy = numpy.random.rand(*base_real_shape) + numpy.random.rand(*base_real_shape) * 1j
        b_complex_npy = numpy.random.rand(*base_real_shape) + numpy.random.rand(*base_real_shape) * 1j
        a_npy = numpy.empty(data_shapes['a'])
        b_npy = numpy.empty(data_shapes['b'])
        a_npy[:, :, ::2] = a_complex_npy.real
        a_npy[:, :, 1::2] = a_complex_npy.imag
        b_npy[:, :, ::2] = b_complex_npy.real
        b_npy[:, :, 1::2] = b_complex_npy.imag
        net = Base(data_shapes=data_shapes, sym=c)
        outputs = net.forward(a=a_npy, b=b_npy)
        out_grad = numpy.random.rand(*data_shapes['a'])
        print numpy.square(outputs[0].asnumpy()[:, :, ::2] - numpy_outf(a_complex_npy, b_complex_npy).real).sum()
        print numpy.square(outputs[0].asnumpy()[:, :, 1::2] - numpy_outf(a_complex_npy, b_complex_npy).imag).sum()
        net.backward(out_grads=[nd.array(out_grad, ctx=mx.gpu())])
        a_grad_npy, b_grad_npy = numpy_gradf(out_grad[:, :, ::2] + out_grad[:, :, 1::2] * 1j,
                                             a_complex_npy, b_complex_npy)
        print numpy.square(net.exe.grad_dict['a'].asnumpy()[:, :, ::2] - a_grad_npy.real).sum()
        print numpy.square(net.exe.grad_dict['a'].asnumpy()[:, :, 1::2] - a_grad_npy.imag).sum()
        print numpy.square(net.exe.grad_dict['b'].asnumpy()[:, :, ::2] - b_grad_npy.real).sum()
        print numpy.square(net.exe.grad_dict['b'].asnumpy()[:, :, 1::2] - b_grad_npy.imag).sum()
    elif 'rc' == typ:
        data_shapes = {'a': base_real_shape, 'b': base_complex_shape}
        a_complex_npy = numpy.random.rand(*base_real_shape)
        b_complex_npy = numpy.random.rand(*base_real_shape) + numpy.random.rand(*base_real_shape) * 1j
        a_npy = numpy.empty(data_shapes['a'])
        b_npy = numpy.empty(data_shapes['b'])
        a_npy = a_complex_npy
        b_npy[:, :, ::2] = b_complex_npy.real
        b_npy[:, :, 1::2] = b_complex_npy.imag
        net = Base(data_shapes=data_shapes, sym=c)
        outputs = net.forward(a=a_npy, b=b_npy)
        out_grad = numpy.random.rand(*data_shapes['b'])
        print numpy.square(outputs[0].asnumpy()[:, :, ::2] - numpy_outf(a_complex_npy, b_complex_npy).real).sum()
        print numpy.square(outputs[0].asnumpy()[:, :, 1::2] - numpy_outf(a_complex_npy, b_complex_npy).imag).sum()
        net.backward(out_grads=[nd.array(out_grad, ctx=mx.gpu())])
        a_grad_npy, b_grad_npy = numpy_gradf(out_grad[:, :, ::2] + out_grad[:, :, 1::2] * 1j,
                                             a_complex_npy, b_complex_npy)
        print numpy.square(net.exe.grad_dict['a'].asnumpy() - a_grad_npy.real).sum()
        print numpy.square(net.exe.grad_dict['b'].asnumpy()[:, :, ::2] - b_grad_npy.real).sum()
        print numpy.square(net.exe.grad_dict['b'].asnumpy()[:, :, 1::2] - b_grad_npy.imag).sum()
    else:
        data_shapes = {'a': base_complex_shape, 'b': base_real_shape}
        a_complex_npy = numpy.random.rand(*base_real_shape) + numpy.random.rand(*base_real_shape) * 1j
        b_complex_npy = numpy.random.rand(*data_shapes['b'])
        a_npy = numpy.empty(data_shapes['a'])
        b_npy = numpy.empty(data_shapes['b'])
        a_npy[:, :, ::2] = a_complex_npy.real
        a_npy[:, :, 1::2] = a_complex_npy.imag
        b_npy = b_complex_npy.real
        net = Base(data_shapes=data_shapes, sym=c)
        outputs = net.forward(a=a_npy, b=b_npy)
        out_grad = numpy.random.rand(*data_shapes['a'])
        print numpy.square(outputs[0].asnumpy()[:, :, ::2] - numpy_outf(a_complex_npy, b_complex_npy).real).sum()
        print numpy.square(outputs[0].asnumpy()[:, :, 1::2] - numpy_outf(a_complex_npy, b_complex_npy).imag).sum()
        net.backward(out_grads=[nd.array(out_grad, ctx=mx.gpu())])
        a_grad_npy, b_grad_npy = numpy_gradf(out_grad[:, :, ::2] + out_grad[:, :, 1::2] * 1j,
                                             a_complex_npy, b_complex_npy)
        print numpy.square(net.exe.grad_dict['a'].asnumpy()[:, :, ::2] - a_grad_npy.real).sum()
        print numpy.square(net.exe.grad_dict['a'].asnumpy()[:, :, 1::2] - a_grad_npy.imag).sum()
        print numpy.square(net.exe.grad_dict['b'].asnumpy() - b_grad_npy.real).sum()
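# The complex-op tests above store a complex tensor as a real array whose last axis interleaves
# real parts (even indices) and imaginary parts (odd indices). Two small NumPy helpers for that
# packing, sketched here purely for convenience (not part of the original tests):
import numpy as np

def pack_complex(x):
    # (..., d) complex array -> (..., 2*d) real array with interleaved real/imag parts
    out = np.empty(x.shape[:-1] + (2 * x.shape[-1],), dtype=np.float64)
    out[..., ::2] = x.real
    out[..., 1::2] = x.imag
    return out

def unpack_complex(x):
    # (..., 2*d) interleaved real array -> (..., d) complex array
    return x[..., ::2] + 1j * x[..., 1::2]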
                                clip_gradient=None, rescale_grad=1.0 / minibatch_size, wd=0.00001)
updater = mx.optimizer.get_updater(optimizer)
qnet.print_stat()
baseline = numpy.zeros((time_step,))
decay_factor = 0.5
for epoch in range(10000):
    data = [("data", numpy.random.rand(minibatch_size, 4))]
    data_ndarray = {k: nd.array(v, ctx=mx.gpu()) for k, v in data}
    outputs = qnet.forward(batch_size=minibatch_size, **data_ndarray)
    actions = get_npy_list(outputs[:time_step])
    means = get_npy_list(outputs[time_step:(time_step * 2)])
    vars = get_npy_list(outputs[(time_step * 2):(time_step * 3)])
    scores = simple_sequence_generation_game(dict(data), actions)
    scores = [score * pow(decay_factor, time_step - 1 - i) for i, score in enumerate(scores)]
    q_estimation = numpy.cumsum(scores[::-1], axis=0)[::-1]
    baseline = baseline - 0.01 * (baseline - q_estimation.mean(axis=1))
    qnet.backward(batch_size=minibatch_size,
                  **dict(data_ndarray.items() +
                         [("policy_t%d_score" % (i), score)
                          for i, score in enumerate(q_estimation - baseline.reshape(time_step, 1))]))
    qnet.update(updater)
    print 'scores=', numpy.mean(scores), \
        'mean_score=', numpy.mean(simple_sequence_generation_game(dict(data), means)), \
        'baseline=', baseline
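# The loop above decays the per-step scores, accumulates them from the end of the sequence to get
# per-step Q estimates, and subtracts a slowly updated baseline before the policy-gradient backward
# call. A minimal NumPy sketch of that bookkeeping, assuming `scores` has shape (time_step, batch):
import numpy as np

def discounted_q_and_baseline(scores, baseline, decay_factor=0.5, baseline_lr=0.01):
    time_step = scores.shape[0]
    decayed = np.array([scores[i] * decay_factor ** (time_step - 1 - i) for i in range(time_step)])
    # Reverse cumulative sum along time gives the return-to-go at each step
    q_estimation = np.cumsum(decayed[::-1], axis=0)[::-1]
    # Exponential moving average baseline, one entry per time step
    baseline = baseline - baseline_lr * (baseline - q_estimation.mean(axis=1))
    advantages = q_estimation - baseline.reshape(time_step, 1)
    return q_estimation, baseline, advantages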