def test_step_with_kinematic():
    env_config = configparser.RawConfigParser()
    env_config.read('configs/test_env.config')
    env_config.set('agent', 'kinematic', 'true')
    test_env = ENV(env_config, phase='test')
    test_env.reset()

    # test state computation
    states, rewards, done_signals = test_env.step((Action(1, 0), Action(1, 0)))
    assert np.allclose(
        states[0], JointState(-1, 0, 1, 0, 0.3, 2, 0, 1.0, 0, 1, 0, -1, 0,
                              0.3))
    assert np.allclose(
        states[1],
        JointState(1, 0, -1, 0, 0.3, -2, 0, 1.0, np.pi, -1, 0, 1, 0, 0.3))
    assert rewards == [0, 0]
    assert done_signals == [False, False]

    # test one-step lookahead
    reward, end_time = test_env.compute_reward(0, [Action(1.5, 0), None])
    assert reward == -0.25
    assert end_time == 1

    reward, end_time = test_env.compute_reward(
        0, [Action(1.5, 0), Action(1.5, 0)])
    assert reward == -0.25
    assert end_time == 0.5

    # test collision detection
    states, rewards, done_signals = test_env.step((Action(1, 0), Action(1, 0)))
    assert np.allclose(
        states[0], JointState(0, 0, 1, 0, 0.3, 2, 0, 1.0, 0, 0, 0, -1, 0, 0.3))
    assert np.allclose(
        states[1],
        JointState(0, 0, -1, 0, 0.3, -2, 0, 1.0, np.pi, 0, 0, 1, 0, 0.3))
    assert rewards == [-0.25, -0.25]
    assert done_signals == [2, 2]

    # test reaching goal
    test_env = ENV(env_config, phase='test')
    test_env.reset()
    test_env.step((Action(1, np.pi / 2), Action(2, np.pi / 2)))
    test_env.step((Action(4, -np.pi / 2), Action(4, -np.pi / 2)))
    states, rewards, done_signals = test_env.step(
        (Action(1, -np.pi / 2), Action(2, -np.pi / 2)))
    assert rewards == [1, 1]
    assert done_signals == [1, 1]
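
For reference, a minimal sketch of the import header this test snippet assumes; the module paths for ENV, Action and JointState are guesses and would need to match the actual project layout.

# Hypothetical imports for the test above; module paths are assumptions, not the project's real layout.
import configparser

import numpy as np

from env import ENV                     # simulation environment (assumed path)
from utils import Action, JointState    # action / joint-state containers (assumed path)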
Example #2
def main():
    args = parse_args()
    for key in vars(args).keys():
        print('[*] {} = {}'.format(key, vars(args)[key]))

    save_dir = make_saving_dir(args)
    print(save_dir)
    result = np.zeros((args.n_sample, args.n_trial, args.max_ep, 2))

    for sample in range(args.n_sample):
        env = ENV(mapFile=args.map_name, random=args.random)
        model = {
            'MBIE': mbie.MBIE(env, args.beta),
            'MBIE_NS': mbie.MBIE_NS(env, args.beta),
            'DH': hindsight.DH(env, bool(args.ent_known), args.beta, args.lambd),
            'DO': outcome.DO(env, bool(args.ent_known), args.beta, args.lambd),
        }
        print('sample {} out of {}'.format(sample, args.n_sample))
        env._render()

        np.save(save_dir + "map_sample_{}.npy".format(sample), env.map)
        for trial in range(args.n_trial):
            print('trial = {}'.format(trial))
            mrl = model[args.method]
            mrl.reset()
            for episode in range(args.max_ep):
                terminal = False
                step = 0
                R = []
                s = env.reset()
                while not terminal and step < args.max_step:
                    # greedy action with random tie-breaking over the current Q-values
                    action = np.random.choice(np.flatnonzero(mrl.Q[s, :] == mrl.Q[s, :].max()))
                    ns, r, terminal = env.step(action)
                    R.append(r)
                    mrl.observe(s, action, ns, r, terminal)
                    step += 1
                    s = ns

                result[sample, trial, episode, 0] = step
                result[sample, trial, episode, 1] = disc_return(R, mrl.gamma)
                mrl.Qupdate()
                print(episode, step, disc_return(R, mrl.gamma), np.max(mrl.Q))
                #print(np.max(mrl.Q, axis=1).reshape(13,13))
            try:
                np.save(save_dir + "entropy_trial_{}_sample_{}.npy".format(trial, sample), mrl.entropy)
            except AttributeError:
                # not every model variant tracks an entropy estimate
                print("No entropy to save")
            np.save(save_dir + "count_trial_{}_sample_{}.npy".format(trial, sample), mrl.count)
    np.save(save_dir + 'results.npy', result)
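
The snippet above logs disc_return(R, mrl.gamma) without defining it; below is a minimal sketch of the discounted-return helper that call shape suggests (name and signature follow the call site, the body is an assumption):

def disc_return(rewards, gamma):
    # Assumed behaviour: discounted return sum_t gamma^t * r_t of one episode.
    total = 0.0
    for t, r in enumerate(rewards):
        total += (gamma ** t) * r
    return total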
Example #3
def main():
	original_size = (782, 600)
	# downscale the original frame by a factor of 6 for the observation space
	env = ENV(actions, (original_size[0] / 6, original_size[1] / 6))
	gamma = 0.9
	epsilon = .95
	model_ph = 'models'
	if not os.path.exists(model_ph):
		os.mkdir(model_ph)
	# the reward/Q-value pickles written below land in this directory
	os.makedirs('reward_and_Q', exist_ok=True)
	trials = 500
	trial_len = 1000

	dqn_agent = DQN(env=env)
	success_num = 0
	rewards = []
	q_values = []
	Q = []
	for trial in range(1, trials):
		t_reward = []
		t_qvalue = []
		cur_state = env.reset()
		for step in range(trial_len):
			action = dqn_agent.act(cur_state)
			new_state, reward, done, success = env.step(action)
			t_reward.append(reward)
			
			# reward = reward if not done else -20
			dqn_agent.remember(cur_state, action, reward, new_state, done)
	
			q_value = dqn_agent.replay()  # internally iterates default (prediction) model
			if q_value:
				t_qvalue.append(q_value)
				Q.append(q_value)
			else:
				t_qvalue.append(0.0)
				Q.append(0.0)
			dqn_agent.target_train()  # iterates target model
			cur_state = new_state

			dqn_agent.log_result()

			save_q(Q)

			if success:
				success_num += 1
				dqn_agent.step = 100
				print("Completed in {} trials".format(trial))
				dqn_agent.save_model(os.path.join(model_ph, "success-model.h5"))
				break
			if done:
				print("Failed to complete in trial {}, step {}".format(trial, step))
				dqn_agent.save_model(os.path.join(model_ph, "trial-{}-model.h5").format(trial))
				break
		rewards.append(np.sum(t_reward) if t_reward else 0.0)
		q_values.append(np.mean(t_qvalue) if t_qvalue else 0.0)
		
		with open('reward_and_Q/reward.txt', 'wb') as f:
			pickle.dump(rewards, f)
		with open('reward_and_Q/qvalue.txt', 'wb') as f:
			pickle.dump(q_values, f)
		print('trial: {}, success acc: {}'.format(trial, success_num / float(trial)))
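
The training loop above also calls save_q(Q) without defining it; a minimal sketch, assuming it simply persists the running list of replay Q-values next to the other pickled logs (the file name and format are assumptions):

import pickle

def save_q(q_values, path='reward_and_Q/qvalue_trace.txt'):
	# Assumed behaviour: dump the running Q-value trace to disk; the path is a guess.
	with open(path, 'wb') as f:
		pickle.dump(q_values, f)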