Example #1
			dt = 1
		else:
			dt = 0
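		# dt is presumably the terminal indicator stored with the transition below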
		totalR += rt
		
		# store transition
		if NEW_EXPERIENCE:
			R.StoreTransition(st, np.array([a_index]), np.array([rt]), st_next, dt)
		st = st_next
		
		E_local = [0]
		if episode_i > OBSERVATION_PHASE:
			E_local = []
			for mini_batch in range(BATCHES):
				# sample mini batch
				s_batch, a_batch, r_batch, stag_batch, terminal_batch, num = R.SampleMiniBatch(MINI_BATCH)
				Y = Q.evaluate(sess, s_batch)
			
				# Double DQN update	
				#Q_next_arg = Q.evaluate(sess, stag_batch)
				#Q_next_argmax = np.argmax(Q_next_arg,1)
				#Q_next_target = Q_target.evaluate(sess, stag_batch)

				#a_batch = a_batch.astype(int)
				#for i in range(num):
				#	Y[i,a_batch[i,0]] = r_batch[i,0] + GAMMA*Q_next_target[i,Q_next_argmax[i]] * (1-terminal_batch[i])

				#if ONLY_OUTPUT:
				#	error = Q.train_output(sess, s_batch, Y)
				#else:
				#	error = Q.train(sess, s_batch, Y)
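# --- A minimal, self-contained sketch of the Double DQN target that the
# commented-out block above computes. The names and shapes are assumptions, not
# the original code: actions and rewards are [batch, 1] arrays, terminals is a
# length-batch array, and q_online_next / q_target_next are [batch, n_actions]
# Q-value arrays from the online and the frozen target network for the next states.
import numpy as np

def double_dqn_targets(y, actions, rewards, q_online_next, q_target_next, terminals, gamma):
	# the online network selects the greedy next action, the target network evaluates it
	a_star = np.argmax(q_online_next, axis=1)
	for i in range(len(y)):
		a_i = int(actions[i, 0])
		y[i, a_i] = rewards[i, 0] + gamma * q_target_next[i, a_star[i]] * (1.0 - terminals[i])
	return y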
Example #2
    L.AddRecord('network_middle',
                simulator.SimulateNeuralEpisode(Q, sess, env_middle, False))
    L.AddRecord('network_right',
                simulator.SimulateNeuralEpisode(Q, sess, env_right, False))
    L.AddRecord(
        'policy_left',
        simulator.SimulatePolicyEpisode(policy, discretizer, env_left, False))
    L.AddRecord(
        'policy_middle',
        simulator.SimulatePolicyEpisode(policy, discretizer, env_middle,
                                        False))
    L.AddRecord(
        'policy_right',
        simulator.SimulatePolicyEpisode(policy, discretizer, env_right, False))
    L.AddRecord('total_reward', totalR)
    L.AddRecord('error', totalE)
    s_est, _, _, _, _, num = R_val.SampleMiniBatch(V_EST)
    Q_est = Q.evaluate(sess, s_est)
    # estimated state value: average of max_a Q(s, a) over the validation batch
    Q_est_max = np.max(Q_est, 1) * 1.0
    V_est = Q_est_max.sum() / num
    L.AddRecord('estimated_value', V_est)
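    # The 'estimated_value' record is a held-out progress metric: the average of
    # max_a Q(s, a) over a batch sampled from the validation buffer R_val, in the
    # spirit of the average-Q curves commonly reported for DQN-style agents.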

    # update target network
    if steps >= C_STEPS:
        Ws, bs = Q.get_weights()
        Q_target.assign(sess, Ws, bs)
        print('updating target network')
        steps = 0
    steps += 1

    # update reward log
    if not onPolicy: