Example #1
def run_mpc(num_envs):
    value_predictor = greedy_reward_predictor(mbhp=(reward_type == "mbhp"))
    env = SuperMario_Vec_Env(num_envs, world, stage, wrap_atari=True)
    obs, info = env.reset()
    count = 0
    last_count = 0
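    # step the environment a few times before the first prediction (world 2 only)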
    if world == 2:
        for i in range(5):
            count += 1
            obs, rewards, dones, info = env.step([1])
    best_action = value_predictor.predict(info)

    stat = []
    with open(log_path, "a") as logs:
        logs.write("=" * 5 + "\n")
    while count < 50000:
        count += 1
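        # apply the action the reward predictor currently scores best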
        obs, rewards, dones, info = env.step([best_action])
        if dones[0] or info[0]['x_pos'] > 3150:
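            # episode ended or the level goal (x_pos > 3150) was reached: log progress and reset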
            stat.append([count - last_count, info[0]['x_pos']])
            last_count = count
            with open(log_path, "a") as logs:
                logs.write(str(info[0]['x_pos']) + '\n')
            print('restart')
            print(experiment_name)
            obs, info = env.reset()
            if world == 2:
                for i in range(5):
                    count += 1
                    obs, rewards, dones, info = env.step([1])
        best_action = value_predictor.predict(info)
        print('[{}] {}'.format(count, info[0]['x_pos']))

    np.save(stat_path, np.array(stat))
    value_predictor.close()
Example #2
    br = reward_map[18:, 18:, 7, :]
    return tl + tm + tr + ml + mr + bl + bm + br


# color palette: [black, dark blue, light blue, grey, white]
color_map = np.array(
    [[0x00, 0x00, 0x00], [0xad, 0x95, 0x19], [0xe2, 0xd6, 0xa1],
     [0xbe, 0xba, 0xbc], [0xf2, 0xf1, 0xf1]],
    dtype=np.uint8)[None, None]
# fractional arrow components require a float dtype (np.uint8 would truncate them)
arrow_map = np.array([[0, 0], [3.5, 0], [2.1, 2.8], [4.0, 0], [2.4, 3.2]],
                     dtype=np.float32)[None, None]
if __name__ == "__main__":
    import cv2
    from environments.mario_vec_env import SuperMario_Vec_Env
    visualizer = Visualizer()
    env = SuperMario_Vec_Env(1, 1, 2, wrap_atari=True)
    obs, info = env.reset()
    obs = obs[0]
    action = 0
    cv2.namedWindow('vis')

    # create trackbars for selecting the grid position and the action
    cv2.createTrackbar('pos', 'vis', 0, 7, lambda x: None)
    cv2.createTrackbar('action', 'vis', 0, 4, lambda x: None)

    sticky_left = 0
    while True:
        # time.sleep(0.05)
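        # step the environment with the currently selected action and grab the raw RGB frame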
        obs, rewards, dones, info = env.step(np.array([action]))
        rgb = info[0]['rgb']
        obs = obs[0]
Example #3
File: vis_mpc.py Project: buoyancy99/sap
def run_mpc(num_envs):
    value_predictor = reward_predictor(baseline=baseline)
    dynamics = dynamics_model(model_path='dynamics/mario/ckpts',
                              mode='eval',
                              num_envs=num_envs)
    env = SuperMario_Vec_Env(num_envs, world, stage, wrap_atari=True)
    last_obs, info = env.reset()
    last_frame = info[0]['rgb']
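    # snapshot the environment state so it can be restored after each planning phase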
    env.backup()
    value_predictor.update(info)
    dynamics.reset(last_obs, info)
    count = 0
    last_actions = np.random.randint(env.action_space.n, size=num_envs)
    while count < 50000:
        count += 1
        dynamics.start_planning()
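        # roll out plan_step random (sticky) actions inside the learned dynamics model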
        for _ in range(plan_step):
            sticky_mask = np.random.random(num_envs) < sticky_prob
            new_actions = np.random.randint(
                env.action_space.n,
                size=num_envs) * (1 - sticky_mask) + sticky_mask * last_actions
            last_actions = new_actions

            _, _, dones, info_hat = dynamics.step(new_actions)
            obs, _, _, info = env.step(new_actions)
            value_predictor.update(info_hat, dones)
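            # black out a small square at the predicted and the real agent positions for the side-by-side video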
            y, x = int(info_hat[0]['y'] / scale_y) + 6, int(
                info_hat[0]['x'] / scale_x) + 7
            last_frame[max(0, min(239, y - 2)):max(0, min(239, y + 2)),
                       max(0, min(255, x - 2)):max(0, min(255, x + 2))] = 0
            obs_to_show = info[0]['rgb']
            y, x = int(274 - info[0]['y_pos']) + 6, int(
                info[0]['screen_x_pos']) + 7
            obs_to_show[max(0, min(239, y - 2)):max(0, min(239, y + 2)),
                        max(0, min(255, x - 2)):max(0, min(255, x + 2))] = 0

            frame_last = cv2.cvtColor(
                cv2.resize(np.uint8(last_frame * 255.0), (1024, 1024),
                           interpolation=cv2.INTER_NEAREST), cv2.COLOR_RGB2BGR)
            frame = cv2.cvtColor(
                cv2.resize(np.uint8(obs_to_show * 255.0), (1024, 1024),
                           interpolation=cv2.INTER_NEAREST), cv2.COLOR_RGB2BGR)
            videowriter.write(np.concatenate([frame_last, frame], 1))

            # cv2.imshow('last_obs', frame_last)
            # cv2.imshow('obs', frame)
            # k = cv2.waitKey(5)

        dynamics.end_planning()
        best_action = value_predictor.predict()
        env.restore()
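        # replay the best action sequence found during planning in the real environment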
        for action in best_action:
            obs, rewards, dones, info = env.step(np.array([action] * num_envs))
            dynamics.update(obs, info)
            if dones[0] or info[0]['x_pos'] > 3150:
                print('restart')
                obs, info = env.reset()
                dynamics.reset(obs, info)
                break
        last_frame = info[0]['rgb']
        # assert np.equal(last_obs[0], last_obs[1]).all()
        # cv2.imshow('last1', cv2.resize(np.uint8(last_obs[0]), (512, 512), interpolation=cv2.INTER_NEAREST))
        # cv2.imshow('last2', cv2.resize(np.uint8(last_obs[1]), (512, 512), interpolation=cv2.INTER_NEAREST))
        print('[{}] {}'.format(info[0]['x_pos'], count))
        env.backup()
        value_predictor.update(info)
        # env.render()
    value_predictor.close()
Example #4
def run_mpc(num_envs):
    value_predictor = reward_predictor(mbhp=(reward_type == "mbhp"))
    dynamics = dynamics_model(model_path='dynamics/mario/ckpts',
                              mode='eval',
                              num_envs=num_envs)
    env = SuperMario_Vec_Env(num_envs, world, stage, wrap_atari=True)
    obs, info = env.reset()
    env.backup()
    value_predictor.update(info)
    dynamics.reset(obs, info)
    last_count = 0
    stat = []
    last_actions = np.random.randint(env.action_space.n, size=num_envs)
    # with open(log_path, "a") as logs:
    #     logs.write("="*5 + "\n")
    for count in trange(1, 50001):
        dynamics.start_planning()
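        # plan by sampling sticky random action sequences in the learned dynamics model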
        for _ in range(plan_step):
            sticky_mask = np.random.random(num_envs) < sticky_prob
            new_actions = np.random.randint(
                env.action_space.n,
                size=num_envs) * (1 - sticky_mask) + sticky_mask * last_actions
            last_actions = new_actions

            _, _, dones, info = dynamics.step(new_actions)
            # _, _, _, _ = env.step(new_actions)
            if nodeath:
                dones = np.zeros_like(dones)
            value_predictor.update(info, dones)
            # predictor.update(info, np.zeros(num_envs))
            # videowriter.write(cv2.resize(np.uint8(obs[0]), (512, 512), interpolation=cv2.INTER_NEAREST))
            # for i in range(2):
            #     cv2.imshow(str(i), cv2.resize(np.uint8(obs[i]), (512, 512), interpolation=cv2.INTER_NEAREST))
            # k = cv2.waitKey(20)
            # env.render()
        dynamics.end_planning()
        best_action = value_predictor.predict()
        env.restore()
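        # execute the best planned action sequence in the real environment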
        for action in best_action:
            obs, rewards, dones, info = env.step(np.array([action] * num_envs))
            dynamics.update(obs, info)
            if save_video:
                videowriter.write(
                    cv2.cvtColor(
                        cv2.resize(np.uint8(info[0]['rgb'] * 255.0), (512, 512),
                                   interpolation=cv2.INTER_NEAREST),
                        cv2.COLOR_RGB2BGR))
            if dones[0] or info[0]['x_pos'] > 3150:
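                # record the final x position for this episode before resetting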
                stat.append(info[0]['x_pos'])
                last_count = count
                # with open(log_path, "a") as logs:
                #     logs.write(str(info[0]['x_pos']) + '\n')
                # print('restart')
                # print(experiment_name)
                obs, info = env.reset()
                dynamics.reset(obs, info)
                break
        # last_obs = obs
        # assert np.equal(last_obs[0], last_obs[1]).all()
        # cv2.imshow('last1', cv2.resize(np.uint8(last_obs[0]), (512, 512), interpolation=cv2.INTER_NEAREST))
        # cv2.imshow('last2', cv2.resize(np.uint8(last_obs[1]), (512, 512), interpolation=cv2.INTER_NEAREST))
        # print('[{}] {}'.format(count, info[0]['x_pos']))
        env.backup()
        value_predictor.update(info)
        # env.render()
    stat = np.array(stat)
    np.save(stat_path, stat)
    print('mean: {:.3f}, stderr: {:.3f}'.format(stat.mean(), stat.std() / np.sqrt(len(stat))))
    value_predictor.close()
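The run_mpc snippets above rely on module-level settings (world, stage, plan_step, sticky_prob, file paths, and so on) defined elsewhere in the source files. A minimal sketch of that setup, using hypothetical placeholder values and mirroring only the global names the snippets reference, might look like:

# Hypothetical module-level configuration assumed by the run_mpc examples above;
# the names mirror the globals referenced in those snippets, the values are placeholders.
import numpy as np
from tqdm import trange

world, stage = 1, 1            # Super Mario world/stage to load
reward_type = "mbhp"           # selects the reward-predictor variant
plan_step = 10                 # length of each random-shooting rollout
sticky_prob = 0.25             # probability of repeating the previous action
nodeath = False                # if True, ignore 'done' signals during planning
save_video = False             # whether to write frames through videowriter
log_path = "logs/mpc.log"
stat_path = "logs/mpc_stat.npy"
experiment_name = "mpc_mario"

if __name__ == "__main__":
    run_mpc(num_envs=64)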