Example No. 1
class BatchSampler(object):
    def __init__(self, env_name, batch_size, num_workers=mp.cpu_count() - 1):
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.queue = mp.Queue()
        self.envs = SubprocVecEnv([make_env(env_name) for _ in range(num_workers)],
                                  queue=self.queue)
        self._env = gym.make(env_name)
def train_td3():
    env = SubprocVecEnv([lambda: EnvHandler(make_env()) for _ in range(1)])
    learn(env,
          total_timesteps=1e6,
          nb_epochs=None,
          nb_rollout_steps=100,
          max_ep_len=250,
          reward_scale=1.0,
          render=False,
          render_eval=False,
          noise_type='adaptive-param_0.2',
          normalize_returns=False,
          normalize_observations=True,
          actor_lr=1e-4,
          critic_lr=1e-3,
          popart=False,
          gamma=0.99,
          clip_norm=None,
          start_steps=10000,
          nb_train_steps=50,
          nb_eval_steps=100,
          nb_log_steps=100,
          nb_save_steps=None,
          batch_size=64,
          polyak=0.01,
          action_range=(-250.0, 250.0),
          observation_range=(-5.0, 5.0),
          target_noise=0.2,
          noise_clip=0.5,
          policy_delay=2,
          load_path=None,
          save_dir=None)
def train_ddpg():
    env = SubprocVecEnv([lambda: EnvHandler(make_env()) for _ in range(1)])
    #env = SubprocVecEnv([lambda: EnvHandler(make_env(env_no=0)), lambda: EnvHandler(make_env(env_no=1))])
    learn(env=env,
          seed=None,
          total_timesteps=1e5,
          nb_epochs=None,
          nb_epoch_cycles=10,
          nb_rollout_steps=100,
          reward_scale=1.0,
          render=False,
          render_eval=False,
          noise_type='ou-param_0.2',
          normalize_returns=False,
          normalize_observations=False,
          critic_l2_reg=1e-2,
          actor_lr=1e-4,
          critic_lr=1e-3,
          popart=False,
          gamma=0.99,
          clip_norm=None,
          nb_train_steps=50,
          nb_eval_steps=100,
          batch_size=64,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          nb_save_epochs=1,
          save_dir=".",
          load_path=None)
Example No. 4
    def reset_task(self, tasks, batch_id, reset_type='learning'):
        # regenerate new envs to avoid the engine stuck bug!
        dic_traffic_env_conf_list = []
        dic_path_list = []
        for task in tasks:
            dic_agent_conf = copy.deepcopy(self.dic_agent_conf)
            dic_agent_conf['TRAFFIC_FILE'] = task

            dic_traffic_env_conf = copy.deepcopy(
                self.task_traffic_env_map[task])
            dic_traffic_env_conf['TRAFFIC_FILE'] = task
            dic_traffic_env_conf_list.append(dic_traffic_env_conf)

            dic_path = copy.deepcopy(self.task_path_map[task])
            if reset_type == 'test':
                dic_path["PATH_TO_LOG"] = os.path.join(
                    dic_path['PATH_TO_WORK_DIRECTORY'], reset_type + '_round',
                    task, 'tasks_round_' + str(batch_id))
            else:
                dic_path["PATH_TO_LOG"] = os.path.join(
                    dic_path['PATH_TO_WORK_DIRECTORY'], reset_type + '_round',
                    'tasks_round_' + str(batch_id), task)
            dic_path_list.append(dic_path)

            if not os.path.exists(dic_path['PATH_TO_LOG']):
                os.makedirs(dic_path['PATH_TO_LOG'])

        self.envs = SubprocVecEnv(dic_path_list,
                                  dic_traffic_env_conf_list,
                                  len(tasks),
                                  queue=self.queue)
Example No. 5
def get_env(env_name, results_save_dir, seed, num_envs):
    """
    Initialize the OpenAI Gym environment.

    :param env_name: The name of the gym environment to use, (e.g. 'Pong-v0')
    :param results_save_dir: Output directory for results.
    :param seed: The random seed.
    :param num_envs: The number of parallel sub-environments to create.

    :return: The initialized, vectorized gym environments.
    """

    # Create the sub-environment constructors to run in parallel
    def make_sub_env_creator(env_num):
        """ Returns a function that creates an event. """
        def sub_env_creator():
            sub_env = make_atari(env_name)
            sub_env.seed(seed + env_num)

            if env_num == 0 and num_envs > 1:
                # Wrap first env in default monitor for video output
                # Results will be transformed into baselines monitor style at the end of the run
                sub_env = gym.wrappers.Monitor(sub_env, results_save_dir)
            else:
                # Wrap every other env in the baselines monitor for equivalent plotting.
                sub_env = Monitor(sub_env, join(results_save_dir, str(env_num)))

            sub_env = wrap_deepmind(sub_env, frame_stack=True, scale=True)

            return sub_env

        return sub_env_creator

    envs = [make_sub_env_creator(i) for i in range(num_envs)]

    return SubprocVecEnv(envs)
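All of these helpers follow the same pattern: build a list of argument-free constructors ("thunks") and hand it to SubprocVecEnv, which then exposes a batched reset/step/close interface across worker processes. The sketch below drives such a vectorized environment with random actions; the import path and environment id are illustrative assumptions (Baselines-style SubprocVecEnv, a registered Gym id), not taken from the example above.

import numpy as np
import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv


def demo_rollout(env_id='CartPole-v1', num_envs=4, steps=100):
    # One argument-free thunk per worker process.
    venv = SubprocVecEnv([lambda: gym.make(env_id) for _ in range(num_envs)])
    obs = venv.reset()  # batched: (num_envs, *observation_shape)
    for _ in range(steps):
        # Sample one action per worker; step() returns batched arrays.
        actions = np.array([venv.action_space.sample() for _ in range(num_envs)])
        obs, rewards, dones, infos = venv.step(actions)
    venv.close()  # shut down the worker processes


if __name__ == '__main__':
    demo_rollout()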
Example No. 6
def make_vec_envs(env_name, seed, num_processes, gamma, log_dir, add_timestep,
                  device, allow_early_resets, num_frame_stack=None, 
                  args=None):
    envs = [make_env(env_name, seed, i, log_dir, add_timestep, 
        allow_early_resets, map_width=args.map_width, render_gui=args.render, 
        print_map=args.print_map, noreward=args.no_reward, max_step=args.max_step, simple_reward=args.simple_reward, args=args)
            for i in range(num_processes)]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        # DummyVecEnv takes the list of env thunks directly
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if num_frame_stack is not None:
        print('stacking {} frames'.format(num_frame_stack))
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif len(envs.observation_space.shape) == 3:
        envs = VecPyTorchFrameStack(envs, 1, device)

    return envs
Example No. 7
def make_eval_envs(config,
                   how_train,
                   seed,
                   agents,
                   training_agent_ids,
                   acting_agent_ids,
                   num_stack,
                   num_processes,
                   state_directory=None,
                   state_directory_distribution=None):
    envs = [
        _make_eval_env(
            config=config,
            how_train=how_train,
            seed=seed,
            rank=rank,
            agents=agents,
            training_agent_ids=training_agent_ids,
            acting_agent_ids=acting_agent_ids,
            num_stack=num_stack,
            state_directory=state_directory,
            state_directory_distribution=state_directory_distribution)
        for rank in range(num_processes)
    ]
    return SubprocVecEnv(envs)
def train(env_id, num_timesteps, num_cpu):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(SEED + rank)
            gym.logger.setLevel(logging.WARN)
            env = wrap_deepmind(env)

            # wrap the env one more time for getting total reward
            env = Monitor(env, rank)
            return env
        return _thunk

    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    learn(CNN, env, SEED, total_timesteps=int(num_timesteps * 1.1))
    env.close()
Example No. 10
def main(env_id, num_timesteps, seed, policy, nstack, nsteps, lrschedule,
         optimizer, num_cpu, model_file, use_static_wrapper,
         use_encoded_imagination, use_decoded_imagination):
    num_timesteps //= 4
    assert not (use_encoded_imagination and use_decoded_imagination)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if use_static_wrapper:
                env = StaticWrapper(env)
            if policy == 'cnn' or use_encoded_imagination:
                env = RenderWrapper(env, 400, 600)
                env = DownsampleWrapper(env, 4)
            if use_encoded_imagination or use_decoded_imagination:
                env = FrameStack(env, 3)
            if use_encoded_imagination:
                env = EncodedImaginationWrapper(env, model_file, num_cpu)
            if use_decoded_imagination:
                env = DecodedImaginationWrapper(env, model_file, num_cpu)
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])

    if policy == 'fc':
        policy_fn = FcPolicy
    elif policy == 'cnn':
        policy_fn = CnnPolicy
    learn(policy_fn,
          env,
          seed,
          nsteps=nsteps,
          nstack=nstack,
          total_timesteps=num_timesteps,
          lrschedule=lrschedule,
          optimizer=optimizer,
          max_episode_length=195)
    env.close()
def dynamics_data_gen(env_name='Reacher-v2',
                      start_seed=0,
                      timesteps=10,
                      n_parallel_envs=1,
                      width=300,
                      height=240):
    import gym  # import locally so that caller can patch gym

    def make_env(seed):
        def _():
            env = gym.make(env_name)
            env.seed(seed)
            return env

        return _

    # Uncomment this to show the bug
    # from requests_futures.sessions import FuturesSession
    # session = FuturesSession()
    # session.get('http://www.google.com', )

    from subproc_vec_env import SubprocVecEnv
    # from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

    env = SubprocVecEnv(
        [make_env(s) for s in range(start_seed, start_seed + n_parallel_envs)])

    policy = RandPolicy(env.observation_space, env.action_space, env.num_envs)

    rollouts = []
    obs = env.reset()
    for i in range(timesteps):
        # fs = env.render("rgb", width=width, height=height)
        fs = env.render("rgb_array", width=width, height=height)
        acs = policy.act(obs)
        rollouts.append(dict(obs=obs, acs=acs, views=fs))
        obs, rewards, dones, infos = env.step(acs)

    import pandas as pd
    return {k: np.stack(v) for k, v in pd.DataFrame(rollouts).items()}
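For completeness, a hypothetical call site for dynamics_data_gen; the shapes in the comments assume the stacking done in the return statement above and the standard Reacher-v2 observation/action dimensions.

# Hypothetical usage; requires mujoco-py plus the local subproc_vec_env module imported above.
data = dynamics_data_gen(env_name='Reacher-v2',
                         start_seed=0,
                         timesteps=10,
                         n_parallel_envs=2)
print(data['obs'].shape)    # roughly (timesteps, n_parallel_envs, obs_dim)
print(data['acs'].shape)    # roughly (timesteps, n_parallel_envs, act_dim)
print(data['views'].shape)  # rendered frames, stacked the same way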
Example No. 12
class BatchSampler(object):
    def __init__(self, env_name, batch_size, num_workers=mp.cpu_count() - 1):
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers
        
        self.queue = mp.Queue()
        self.envs = SubprocVecEnv([make_env(env_name) for _ in range(num_workers)],
                                  queue=self.queue)
        self._env = gym.make(env_name)

    def sample(self, policy, params=None, gamma=0.95):
        episodes = BatchEpisodes(batch_size=self.batch_size, gamma=gamma)
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)
        observations, batch_ids = self.envs.reset()
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            observations_tensor = observations
            actions_tensor = policy(observations_tensor, params=params).sample()
            with tf.device('/CPU:0'):
                actions = actions_tensor.numpy()
            new_observations, rewards, dones, new_batch_ids, _ = self.envs.step(actions)
            episodes.append(observations, actions, rewards, batch_ids)
            observations, batch_ids = new_observations, new_batch_ids

        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks):
        tasks = self._env.unwrapped.sample_tasks(num_tasks)
        return tasks
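A hedged sketch of how this sampler might be driven in a meta-RL loop. BatchSampler and its methods come from the class above; the environment id, batch size, and the policy object are illustrative placeholders.

# Illustrative driver; `policy` is assumed to be a policy object compatible with sample().
sampler = BatchSampler('HalfCheetahDir-v1', batch_size=20, num_workers=4)
tasks = sampler.sample_tasks(num_tasks=10)
for task in tasks:
    sampler.reset_task(task)           # point every worker at the same task
    episodes = sampler.sample(policy)  # BatchEpisodes holding batch_size rollouts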
Example No. 13
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}
    def make_env(rank): # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return wrap_deepmind(env, **wrapper_kwargs)
        return _thunk
    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
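A minimal usage sketch for this helper: each of the num_env workers is seeded with seed + rank and, if a baselines logger directory is configured, writes its own Monitor file there. The directory and environment id below are illustrative.

from baselines import logger

logger.configure(dir='/tmp/atari_logs')  # per-rank Monitor files land here
venv = make_atari_env('BreakoutNoFrameskip-v4', num_env=8, seed=42)
obs = venv.reset()
venv.close()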
Example No. 14
def train(env_id,
          num_timesteps,
          seed,
          num_env,
          gamma=0.99,
          ent_coef=0.01,
          nepochs=4,
          lr=2.5e-4,
          next_n=10,
          seq_len=10,
          nslupdates=10,
          K=1):

    ncpu = multiprocessing.cpu_count()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()

    def make_env(rank):
        def _thunk():
            import maze
            env = maze.MazeEnv(config=open('config/' + env_id + '.xml'))
            return env

        return _thunk

    env = SubprocVecEnv([make_env(i) for i in range(num_env)])

    from ppo_diverse import learn
    policy = LocPolicy
    learn(policy=policy,
          env=env,
          nsteps=128,
          nminibatches=4,
          lam=0.95,
          gamma=gamma,
          noptepochs=nepochs,
          ent_coef=ent_coef,
          lr=lr,
          cliprange=0.1,
          total_timesteps=int(num_timesteps * 1.1),
          next_n=next_n,
          seq_len=seq_len,
          nslupdates=nslupdates,
          K=K,
          seed=seed)
def train_ppo():
    env = SubprocVecEnv([lambda: EnvHandler(make_env())])
    learn(env=env,
          eval_env=None,
          total_timesteps=3e7,
          nsteps=128,
          nminibatches=1,
          cliprange=0.2,
          ent_coef=0.01,
          vf_coef=0.5,
          lam=0.95,
          gamma=0.99,
          noptepochs=4,
          lr=2.5e-4,
          save_interval=100,
          save_dir=".",
          load_path=None,
          normalize_observations=False,
          normalize_returns=False)
Example No. 16
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = retro.make(
                env_id, use_restricted_actions=retro.ACTIONS_MULTI_DISCRETE)
            env.seed(seed + rank)
            return Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
Example No. 17
def make_train_envs(
    config,
    how_train,
    seed,
    game_state_file,
    training_agents,
    num_stack,
    num_processes,
    do_filter_team=True,
    state_directory=None,
    state_directory_distribution=None,
    step_loss=None,
    bomb_reward=None,
    item_reward=None,
    use_second_place=False,
    use_both_places=False,
    frozen_agent=None,
    mix_frozen_complex=False,
    florensa_starts_dir=None,
):
    envs = [
        _make_train_env(
            config=config,
            how_train=how_train,
            seed=seed,
            rank=rank,
            game_state_file=game_state_file,
            training_agents=training_agents,
            num_stack=num_stack,
            do_filter_team=do_filter_team,
            state_directory=state_directory,
            state_directory_distribution=state_directory_distribution,
            step_loss=step_loss,
            bomb_reward=bomb_reward,
            item_reward=item_reward,
            use_second_place=use_second_place,
            use_both_places=use_both_places,
            frozen_agent=frozen_agent,
            mix_frozen_complex=mix_frozen_complex)
        for rank in range(num_processes)
    ]
    return SubprocVecEnv(envs)
def make_vec_envs(env_name, seed, num_processes, gamma, log_dir, add_timestep,
                  device, allow_early_resets):
    envs = [
        make_env(env_name, seed, i, log_dir, add_timestep, allow_early_resets)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    envs = VecPyTorch(envs, device)
    # if len(envs.observation_space.shape) == 3:
    #     print('Creating frame stacking wrapper')
    #     envs = VecPyTorchFrameStack(envs, 4, device)
    #     # print(envs.observation_space)

    return envs
Example No. 19
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    nenv = args.num_env or ncpu
    print('num of env: ' + str(nenv))

    seed = args.seed
    env_id = args.env

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    get_session(config=config)

    env = SubprocVecEnv([
        # bind i as a default argument so each worker keeps its own rank/seed
        lambda i=i: make_env_from_id(env_id, seed + i
                                     if seed is not None else None, "")
        for i in range(nenv)
    ])
    return env
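build_env expects an argparse-style namespace with num_env, seed, and env attributes; below is a minimal sketch of invoking it with purely illustrative values.

from types import SimpleNamespace

# Illustrative arguments; `env` must name a registered Gym environment.
args = SimpleNamespace(num_env=4, seed=0, env='PongNoFrameskip-v4')
env = build_env(args)
obs = env.reset()
env.close()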
Example No. 20
    def reset_task(self, tasks, batch_id, reset_type='learning'):
        # regenerate new envs to avoid the engine stuck bug!

        #for i in range(self.num_workers):
        dic_agent_conf_list = []
        dic_traffic_env_conf_list = []
        dic_path_list = []
        for task in tasks:
            task_id = self.dic_traffic_env_conf['TRAFFIC_IN_TASKS'].index(task)
            dic_agent_conf = copy.deepcopy(self.dic_agent_conf)
            dic_agent_conf['TRAFFIC_FILE'] = task
            dic_agent_conf_list.append(dic_agent_conf)

            dic_traffic_env_conf = copy.deepcopy(self.dic_traffic_env_conf)
            dic_traffic_env_conf['TRAFFIC_FILE'] = task
            dic_traffic_env_conf_list.append(dic_traffic_env_conf)

            dic_path = copy.deepcopy(self.dic_path)
            if reset_type == 'test':
                dic_path["PATH_TO_LOG"] = os.path.join(
                    dic_path['PATH_TO_WORK_DIRECTORY'], reset_type + '_round',
                    'task_%d_%s' % (task_id, task),
                    'tasks_round_' + str(batch_id))
            else:
                dic_path["PATH_TO_LOG"] = os.path.join(
                    dic_path['PATH_TO_WORK_DIRECTORY'], reset_type + '_round',
                    'tasks_round_' + str(batch_id),
                    'task_%d_%s' % (task_id, task))
            dic_path['PATH_TO_SUMO_CONF'] = os.path.join(
                dic_path['PATH_TO_WORK_DIRECTORY'], "sumo_conf", task)
            dic_path_list.append(dic_path)

            if not os.path.exists(dic_path['PATH_TO_LOG']):
                os.makedirs(dic_path['PATH_TO_LOG'])

        self.envs = SubprocVecEnv(dic_path_list,
                                  dic_traffic_env_conf_list,
                                  len(tasks),
                                  queue=self.queue)
Example No. 21
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print ('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0



    # # Create agent
    # if algo == 'a2c':
    #     agent = a2c(envs, model_dict)
    #     print ('init a2c agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')

    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if model_dict['load_params']:
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     # agent.actor_critic = torch.load(args.load_path).cuda()
        
    #     # print ('loaded ', args.load_path)

    #     if model_dict['load_number'] == 3:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict)

    #     elif model_dict['load_number'] == 6:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict)
    #     elif model_dict['load_number'] == 9:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict)

    #     # else:
    #     #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
    #     else:
    #         PROBLEM


    print ('Init expert agent')
    expert_agent = a2c(envs, model_dict)
    param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt'    
    param_dict = torch.load(param_file)
    expert_agent.actor_critic.load_state_dict(param_dict)
    print ('loaded params', param_file)
    expert_agent.actor_critic.cuda()



    print ('Init imitator agent')
    imitator_agent = a2c(envs, model_dict)
    # param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params.ckpt'  
    # param_dict = torch.load(param_file)
    # imitator_agent.actor_critic.load_state_dict(param_dict)
    # print ('loaded params', param_file)
    imitator_agent.actor_critic.cuda()

    agent = expert_agent
    expert_policy = expert_agent.actor_critic

    imitator_policy = imitator_agent.actor_critic
    optimizer = optim.Adam(imitator_policy.parameters(), lr=.0005, weight_decay=.00001)

    total_steps = 0

    display_step = 50

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval /num_processes/num_steps)




    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            state__ = Variable(agent.rollouts.states[step]) / 255.
            value, action, action_log_probs, dist_entropy = agent.act(state__) #, requires_grad=False)#, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())

            batch = state__

            optimizer.zero_grad()

            log_dist_expert = expert_policy.action_logdist(batch)
            log_dist_imitator = imitator_policy.action_logdist(batch)

            action_dist_kl = torch.sum((log_dist_expert - log_dist_imitator)*torch.exp(log_dist_expert), dim=1) #[B]

            # elbo, logpx, logpz, logqz, action_dist_kl = self.forward(batch, policy, k=k)
            loss = torch.mean(action_dist_kl)

            loss.backward()
            # nn.utils.clip_grad_norm(self.parameters(), .5)
            optimizer.step()

            # if total_steps%display_step==0: # and batch_idx == 0:
            #     # print ('Train Epoch: {}/{}'.format(epoch+1, epochs),
            #         # 'total_epochs {}'.format(total_epochs),
            #         print('LL:{:.4f}'.format(loss.data[0])
            #         # 'logpx:{:.4f}'.format(logpx.data[0]),
            #         # 'logpz:{:.5f}'.format(logpz.data[0]),
            #         # 'logqz:{:.5f}'.format(logqz.data[0]),
            #         # 'action_kl:{:.4f}'.format(action_dist_kl.data[0])
            #         )

            # total_steps+=1

            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done)

        #Optimize agent
        agent.no_update()  #agent.update(j,num_updates)
        # agent.update()  #agent.update(j,num_updates)


        agent.insert_first_state(agent.rollouts.states[-1])


        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps
        
        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            save_to = home+'/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params_env.ckpt'
            torch.save(imitator_policy.state_dict(), save_to)
            print ('saved imitator_policy', save_to)

            # #Save model
            # if save_params:
            #     do_params(save_dir, agent, total_num_steps, model_dict)
            #     # save_params_v2(save_dir, agent, total_num_steps, model_dict)

                
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)


        #Print updates
        if j % log_interval == 0:# and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}, {:.4f}".format(j, total_num_steps,
                                       final_rewards.min(),
                                       final_rewards.median(),
                                       final_rewards.mean(),
                                       final_rewards.max(),
                                       int(total_num_steps / (end - start)),
                                       end - start,
                                       end - start2, 
                                       loss.data[0])


            # to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps,
            #                            final_rewards.min(),
            #                            final_rewards.median(),
            #                            final_rewards.mean(),
            #                            final_rewards.max(),
            #                            int(total_num_steps / (end - start)),
            #                            end - start,
            #                            end - start2)


            print(to_print_info_string) 
            start2 = time.time()



            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval*30) == 0:
            
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    raise #pass
                    print(to_print_legend_string)



    try:
        make_plots(model_dict)
    except:
        print ()
Example No. 22
def train(model_dict):
    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  #last frame is now the new one

        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  #[P,1]
        episode_rewards += reward  #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  #[P,1]
        final_rewards *= masks  #erase the ones that are done
        final_rewards += (
            1 -
            masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks  #erase the done ones
        masks = masks.type(dtype)  #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks  #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype'] = dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    # # Create agent
    # if algo == 'a2c':
    #     agent = a2c(envs, model_dict)
    #     print ('init a2c agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')

    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if model_dict['load_params']:
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     # agent.actor_critic = torch.load(args.load_path).cuda()

    #     # print ('loaded ', args.load_path)

    #     if model_dict['load_number'] == 3:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict)

    #     elif model_dict['load_number'] == 6:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict)
    #     elif model_dict['load_number'] == 9:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict)

    #     # else:
    #     #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
    #     else:
    #         PROBLEM

    print('Init expert agent')
    expert_agent = a2c(envs, model_dict)
    param_file = home + '/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)
    expert_agent.actor_critic.load_state_dict(param_dict)
    print('loaded params', param_file)
    expert_agent.actor_critic.cuda()

    print('Init imitator agent')
    imitator_agent = a2c(envs, model_dict)
    # param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params.ckpt'
    # param_dict = torch.load(param_file)
    # imitator_agent.actor_critic.load_state_dict(param_dict)
    # print ('loaded params', param_file)
    imitator_agent.actor_critic.cuda()

    agent = expert_agent
    expert_policy = expert_agent.actor_critic

    imitator_policy = imitator_agent.actor_critic
    optimizer = optim.Adam(imitator_policy.parameters(),
                           lr=.0005,
                           weight_decay=.00001)

    total_steps = 0

    display_step = 50

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state,
        shape_dim0).type(dtype)  #add the new frame, remove oldest
    agent.insert_first_state(
        current_state
    )  #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            state__ = Variable(agent.rollouts.states[step]) / 255.
            value, action, action_log_probs, dist_entropy = agent.act(
                state__)  #, requires_grad=False)#, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())

            batch = state__

            optimizer.zero_grad()

            log_dist_expert = expert_policy.action_logdist(batch)
            log_dist_imitator = imitator_policy.action_logdist(batch)

            action_dist_kl = torch.sum((log_dist_expert - log_dist_imitator) *
                                       torch.exp(log_dist_expert),
                                       dim=1)  #[B]

            # elbo, logpx, logpz, logqz, action_dist_kl = self.forward(batch, policy, k=k)
            loss = torch.mean(action_dist_kl)

            loss.backward()
            # nn.utils.clip_grad_norm(self.parameters(), .5)
            optimizer.step()

            # if total_steps%display_step==0: # and batch_idx == 0:
            #     # print ('Train Epoch: {}/{}'.format(epoch+1, epochs),
            #         # 'total_epochs {}'.format(total_epochs),
            #         print('LL:{:.4f}'.format(loss.data[0])
            #         # 'logpx:{:.4f}'.format(logpx.data[0]),
            #         # 'logpz:{:.5f}'.format(logpz.data[0]),
            #         # 'logqz:{:.5f}'.format(logqz.data[0]),
            #         # 'action_kl:{:.4f}'.format(action_dist_kl.data[0])
            #         )

            # total_steps+=1

            cpu_actions = action.data.squeeze(1).cpu().numpy()  #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy)  #, done)

        #Optimize agent

        agent.no_update()  #agent.update(j,num_updates)
        # agent.update()  #agent.update(j,num_updates)

        agent.insert_first_state(agent.rollouts.states[-1])

        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps

        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            save_to = home + '/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params_env.ckpt'
            torch.save(imitator_policy.state_dict(), save_to)
            print('saved imitator_policy', save_to)

            # #Save model
            # if save_params:
            #     do_params(save_dir, agent, total_num_steps, model_dict)
            #     # save_params_v2(save_dir, agent, total_num_steps, model_dict)

            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)

        #Print updates
        if j % log_interval == 0:  # and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}, {:.4f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2, loss.data[0])

            # to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps,
            #                            final_rewards.min(),
            #                            final_rewards.median(),
            #                            final_rewards.mean(),
            #                            final_rewards.max(),
            #                            int(total_num_steps / (end - start)),
            #                            end - start,
            #                            end - start2)

            print(to_print_info_string)
            start2 = time.time()

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval * 30) == 0:

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    raise  #pass
                    print(to_print_legend_string)

    try:
        make_plots(model_dict)
    except:
        print()
Example No. 23
def main():

    cumulative_avg_rewards = []
    for seed_ in [10, 50, 100, 200, 500]:
        seed(seed_)
        set_random_seed(seed_)
        print("Seed: ", seed_)
        episode = 0

        # initialize environment
        env_id = get_args().env
        #env = make_atari(env_id)
        #env = wrap_deepmind(env, frame_stack=True, clip_rewards=False, episode_life=False)
        #env = Monitor(env)

        env = SubprocVecEnv([make_env(seed_, i) for i in range(6)])  #24
        print("CHECK_ENV", env.reset().__array__().shape)
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n
        agent = get_agent(env)
        save_path = os.path.join('models_entropy_coeff1',
                                 "Space_inv_A2C_LSTM_nstep8_MAX_rew_546")
        agent.load(save_path)
        lstm_state = np.zeros((6, 256), dtype=np.float32)  #24

        # run for 100 episodes
        #for i in range(100):
        counter = 0

        episodic_reward_lis = []
        for i in range(wandb.config.episodes):
            # Set reward received in this episode = 0 at the start of the episode
            episodic_reward = np.zeros((6))  #24
            episodic_reward_m = np.zeros((6))  #24

            reset = False

            #env = gym.wrappers.Monitor(env, 'test/'+str(i), force=True)

            obs = env.reset()
            renders = []
            count = 0
            action_count = 0
            done = False
            done1 = np.zeros(6)  #24
            done2 = np.zeros(6)  #24
            while not done:
                a, v, lstm_state = agent.step(obs, S_=lstm_state, M_=done1)
                obs, reward, done1, info = env.step(a, done1, cond="eval")
                done = done2.all()
                if (done):
                    episodic_reward_m1 = episodic_reward_m.max()
                    break
                if (done1.any()):
                    episodic_reward_m[np.logical_and(
                        done2 <= 0, done1)] = episodic_reward[np.logical_and(
                            done2 <= 0, done1)]
                    for j in np.nonzero(done1)[0]:
                        episodic_reward[j] = 0
                episodic_reward += reward
                done2 = np.logical_or(done1, done2)

            if (i == 0):
                reset = True

            cumulative_avg_reward = evaluate(episodic_reward_m1, reset)

        tf.reset_default_graph()
        env.close()

        # your models will be evaluated on 100-episode average reward
        # therefore, we stop logging after 100 episodes
        print("*************************************************************")
        print("CUMULATIVE_AVG_REWARD", cumulative_avg_reward)
        print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
        cumulative_avg_rewards.append(cumulative_avg_reward)

    print("Final score: ", np.mean(cumulative_avg_rewards))
Example No. 24
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda

        # print (current_state)
        # fdsf

        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print ('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0

    action_size = envs.action_space.n
    model_dict['action_size']=action_size



    # Create agent
    if algo == 'a2c':
        agent = a2c(model_dict)
        print ('init a2c agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    #Load model
    if model_dict['load_params']:
        # agent.actor_critic = torch.load(os.path.join(args.load_path))
        # agent.actor_critic = torch.load(args.load_path).cuda()
        
        # print ('loaded ', args.load_path)

        if model_dict['load_number'] == 3:
            load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict)

        elif model_dict['load_number'] == 6:
            load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict)
        elif model_dict['load_number'] == 9:
            load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict)

        # else:
        #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
        else:
            raise ValueError('unsupported load_number')  # original crashed here via an undefined name

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval /num_processes/num_steps)

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]/255.))#, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())

            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 


            # if np.sum(reward) > 0.:
            #     print (reward)
            #     afdas

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done)





        #Optimize agent
        agent.update()  #agent.update(j,num_updates)
        agent.insert_first_state(agent.rollouts.states[-1])


        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps
        
        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                # do_params(save_dir, agent, total_num_steps, model_dict)
                # save_params_v2(save_dir, agent, total_num_steps, model_dict)
                save_params_v3(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)


        #Print updates
        if j % log_interval == 0:# and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps,
                                       final_rewards.min(),
                                       final_rewards.median(),
                                       final_rewards.mean(),
                                       final_rewards.max(),
                                       int(total_num_steps / (end - start)),
                                       end - start,
                                       end - start2)
            print(to_print_info_string) 
            start2 = time.time()



            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval*30) == 0:
            
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    print(to_print_legend_string)
                    raise



    try:
        make_plots(model_dict)
    except:
        print ()
Exemplo n.º 25
0
    parser.add_argument('--plot-points', type=int, default=20, help='number of plot points (groups with mean, std)')
    parser.add_argument('--plot-path', type=str, default='ep_reward.png', help='path to save reward plot to')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    args = parser.parse_args()

    set_seed(args.seed)

    cuda = torch.cuda.is_available() and not args.no_cuda

    env_fns = []
    for rank in range(args.num_workers):
        env_fns.append(lambda rank=rank: make_env(args.env_id, rank, args.seed + rank))  # bind rank at definition time; a plain lambda would capture the final loop value in every worker
    if args.render:
        venv = RenderSubprocVecEnv(env_fns, args.render_interval)
    else:
        venv = SubprocVecEnv(env_fns)
    venv = VecFrameStack(venv, 4)

    test_env = make_env(args.env_id, 0, args.seed)
    test_env = FrameStack(test_env, 4)
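    # Both wrappers stack the 4 most recent observations: VecFrameStack for the
    # vectorized training envs, FrameStack for the single evaluation env, so the
    # policy sees enough history to infer motion.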

    policy = {'cnn': AtariCNN}[args.arch](venv.action_space.n)
    policy = cuda_if(policy, cuda)

    optimizer = optim.Adam(policy.parameters())

    if args.lr_func == 'linear':
        lr_func = lambda a: args.lr * (1. - a)
    elif args.lr_func == 'constant':
        lr_func = lambda a: args.lr
def viz(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one



        return current_state
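    # Example of the sliding window above: with 1-channel 84x84 frames and num_stack=4,
    # current_state is [P, 4, 84, 84]; the slice copies frames t-3..t-1 down into slots
    # 0..2 and the newest frame t is written into slot 3.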


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state
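    # Bookkeeping example: if process p finishes an episode this step, its mask is 0, so
    # final_rewards[p] is overwritten with the completed episode return and
    # episode_rewards[p] resets to 0; live processes (mask 1) keep accumulating.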




    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    
    # NOTE: max_frames and epoch_level are not defined in this snippet; here they are
    # assumed to be supplied via model_dict (illustrative keys, not from the original code).
    max_frames = model_dict['max_frames']
    epoch_level = model_dict['epoch_level']

    num_processes = 1
    model_dict['num_processes'] = 1
    model_dict['num_steps'] = max_frames
    num_steps = max_frames
    
    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor


    # Create environments
    print (num_processes, 'processes')

    monitor_rewards_dir = ''
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])
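    # SubprocVecEnv runs each make_env(...) instance in its own worker process; step()
    # takes a length-P batch of actions and returns stacked states, rewards and dones,
    # which is what the rollout loop below relies on.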


    vid_ = 0
    see_frames = 1

    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape


    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print ('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print ('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print ('init a2c_minibatch agent')
    # agent = model_dict['agent'](envs, model_dict)




    #Load model
    model_params_file = save_dir+ '/model_params/model_params'+str(int(epoch_level))+'.pt'
    agent.actor_critic = torch.load(model_params_file).cuda()
    print ('loaded ', model_params_file)

    frame_path = save_dir+'/frames/'
    if not os.path.exists(frame_path):
        os.makedirs(frame_path)
        print ('Made dir', frame_path)




    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    #Begin training
    count =0
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # if see_frames:
            #Grayscale
            # save_frame(state, count)




            # #RGB
            # state = envs.render()
            # print(state.shape)
            # fdsafa


        #         def get_action_meanings(self):
        # return [ACTION_MEANING[i] for i in self._action_set]

            # print (envs.get_action_meanings())

            # print (agent.rollouts.states[step].size())


            

            # print ('values', values)
            # print ('actions', actions)





            # rows = 1
            # cols = 3

            # fig = plt.figure(figsize=(8,4), facecolor='white')

            # # plot frame
            # ax = plt.subplot2grid((rows,cols), (0,0), frameon=False)

            # state1 = np.squeeze(state[0])
            # ax.imshow(state1, cmap='gray')
            # ax.set_xticks([])
            # ax.set_yticks([])
            # # ax.savefig(frame_path+'frame' +str(count)+'.png')
            # # print ('saved',frame_path+'frame' +str(count)+'.png')
            # # plt.close(fig)
            # ax.set_title('State',family='serif')





            # #plot values histogram
            # ax = plt.subplot2grid((rows,cols), (0,2), frameon=False)

            # values = []
            # actions = []
            # for ii in range(100):
            #     # Act, [P,1], [P,1]
            #     action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            #     val = value.data.cpu().numpy()[0][0]
            #     act_ = action.data.cpu().numpy()[0][0]
            #     # print ('value', val)
            #     # print ('action', act_)
            #     values.append(val)
            #     actions.append(act_)


            # weights = np.ones_like(values)/float(len(values))
            # ax.hist(values, 50, range=[0.0, 4.], weights=weights)
            # # ax.set_ylim(top=1.)
            # ax.set_ylim([0.,1.])

            # ax.set_title('Value',family='serif')







            # #plot actions
            # ax = plt.subplot2grid((rows,cols), (0,1), frameon=False)

            # action_prob = agent.actor_critic.action_dist(Variable(agent.rollouts.states[step], volatile=True))
            # action_prob = np.squeeze(action_prob.data.cpu().numpy())
            # action_size = envs.action_space.n

            # # print (action_prob.shape)

            # ax.bar(range(action_size), action_prob)

            # ax.set_title('Action',family='serif')
            # # ax.set_xticklabels(['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE'])
            # plt.xticks(range(action_size),['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'R_FIRE', 'L_FIRE'], fontsize=6)
            # ax.set_ylim([0.,1.])



            # # print (action_prob)
            # # ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']
            # # fdsfas

            # plt.tight_layout(pad=3., w_pad=2.5, h_pad=1.0)

            # plt_path = frame_path+'plt' 
            # plt.savefig(plt_path+str(count)+'.png')
            # print ('saved',plt_path+str(count)+'.png')
            # plt.close(fig)
            # # fsadf




            count+=1
            if count % 10 ==0:
                print (count)

            if count > 2:
                if reward.cpu().numpy() > 0:
                    # print (, reward.cpu().numpy(), count)
                    print (done[0],masks.cpu().numpy(), reward.cpu().numpy(),'reward!!', step)
                    print (np.squeeze(agent.rollouts.rewards.cpu().numpy()))
                else:
                    print (done[0],masks.cpu().numpy(), reward.cpu().numpy())


                # if done[0] or count > max_frames:
                if count > max_frames:

                    next_value = agent.actor_critic(Variable(agent.rollouts.states[-1], volatile=True))[0].data
                    agent.rollouts.compute_returns(next_value, agent.use_gae, agent.gamma, agent.tau)

                    rollouts_ =  np.squeeze(agent.rollouts.returns.cpu().numpy())
                    rewards_ =  np.squeeze(agent.rollouts.rewards.cpu().numpy())
                    # rollouts_ =  np.squeeze(agent.rollouts.returns.cpu().numpy())
                    # rollouts_ =  np.squeeze(agent.rollouts.returns.cpu().numpy())


                    for jj in range(len(rollouts_)):

                        print (jj, rollouts_[jj], rewards_[jj])
                    return  # stop the visualization once the rollout returns have been printed






                # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
                # print ('value', value)
                # print ('action', action)

                # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
                # print ('value', value)
                # print ('action', action)


            action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))

            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 



            # Record rewards
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            
            # Update state
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value.data, reward, masks)


            # print (reward)






        total_num_steps = (j + 1) * num_processes * num_steps
Exemplo n.º 27
0
def train(model_dict):
    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  #last frame is now the new one

        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  #[P,1]
        episode_rewards += reward  #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  #[P,1]
        final_rewards *= masks  #erase the ones that are done
        final_rewards += (
            1 -
            masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks  #erase the done ones
        masks = masks.type(dtype)  #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks  #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    grad_var_ = model_dict['grad_var_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype'] = dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    if vae_:
        print('env for vae')
        envs_vae = make_env_basic(env_name)

    if grad_var_:
        print('env for grad_var_')
        envs_grad_var = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0
    model_dict['action_size'] = envs.action_space.n
    print(envs.action_space.n, 'actions')

    # next_state_pred_ = 0
    # model_dict['next_state_pred_'] = next_state_pred_

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')

    discriminator = CNN_Discriminator(model_dict).cuda()
    print('init discriminator')

    # elif algo == 'a2c_over':
    #     agent = a2c_over(envs, model_dict)
    #     print ('init a2c_over agent')
    # elif algo == 'a2c_under':
    #     agent = a2c_under(envs, model_dict)
    #     print ('init a2c_under agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if args.load_path != '':
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     agent.actor_critic = torch.load(args.load_path).cuda()
    #     print ('loaded ', args.load_path)

    # see_reward_episode = 0
    # if 'Montez' in env_name and see_reward_episode:
    #     states_list = [[] for i in range(num_processes)]

    # view_reward_episode(model_dict=model_dict, frames=[])
    # dfasddsf

    # if vae_:
    #     vae = VAE()
    #     vae.cuda()

    buffer_ = 1

    if buffer_:
        buffer_states = deque(maxlen=200)
        buffer_actions = deque(maxlen=200)
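        # The two deques act as a small FIFO buffer of recent rollouts (at most 200
        # entries of states/actions each); once 100 rollouts are stored, random entries
        # are sampled below to pretrain the action discriminator.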

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state, shape_dim0).type(
            dtype)  #add the new frame, remove the oldest, since it's a stack
    agent.insert_first_state(
        current_state
    )  #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        # discrim_errors = []
        # discrim_errors_reverse = []
        # discrim_errors_2step = []
        # frames = []
        for step in range(num_steps):

            # Act, [P,1], [P,1], [P,1], [P]
            state_pytorch = Variable(agent.rollouts.states[step])
            value, action, action_log_probs, dist_entropy = agent.act(
                state_pytorch)  #, volatile=True))

            # print (action)

            # fsdaf

            # Apply to Environment, S:[P,C,H,W], R:[P], D:[P]
            cpu_actions = action.data.squeeze(1).cpu().numpy()  #[P]
            frame, reward, done, info = envs.step(cpu_actions)

            # frames.append(torch.FloatTensor(frame)) #[P,1,84,84]

            # # current_frame = torch.from_numpy(frame)  #[P,1,84,84]
            # current_frame = torch.FloatTensor(frame)  #[P,1,84,84]
            # if step ==0:
            #     prev_frame = torch.FloatTensor(state)  #[P,1,84,84]

            # #Pred action and get error
            # discrim_error = discriminator.forward(prev_frame, current_frame, action)
            # discrim_errors.append(discrim_error)

            # discrim_error_reverse = discriminator.forward(current_frame, prev_frame, action)
            # discrim_errors_reverse.append(discrim_error_reverse)

            # # THIS IS TO SEE PREDICTIONS

            # if step==0:
            #     f =  np.reshape(prev_frame[0].numpy(), [84,84])
            # f =np.concatenate([f,np.reshape(current_frame[0].numpy(),[84,84])], axis=0)

            # # f1 = prev_frame[0].numpy()
            # # f2 = current_frame[0].numpy()
            # # f = np.reshape(np.concatenate([f1,f2], axis=1), [168,84])
            # # print (f.shape)
            # print (cpu_actions[0])
            # # ['NOOP', 'FIRE', 'RIGHT', 'LEFT'] for breakout
            # #for montezuma
            # #['NOOP', 'FIRE', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'UPRIGHT', 'UPLEFT', 'DOWNRIGHT', 'DOWNLEFT',
            #     #'UPFIRE', 'RIGHTFIRE', 'LEFTFIRE', 'DOWNFIRE', 'UPRIGHTFIRE', 'UPLEFTFIRE', 'DOWNRIGHTFIRE', 'DOWNLEFTFIRE']
            # I think FIRE = JUMP

            # if step ==2:
            #     print (torch.mean(current_frame-prev_frame))
            #     fdafds

            # prev_frame_2step = prev_frame

            # prev_frame = current_frame

            # # print (torch.sum(prev_frame_2step), torch.sum(prev_frame))

            # fadsa

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, frame,
                                                 shape_dim0)
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy,
                              0)  #, done)

        # print (f.shape)
        # rows = 1
        # cols = 1
        # fig = plt.figure(figsize=(1+cols,5+rows), facecolor='white')
        # ax = plt.subplot2grid((rows,cols), (0,0), frameon=False) #, rowspan=7)
        # ax.imshow(f, cmap=plt.get_cmap('gray'))
        # ax.set_yticks([])
        # ax.set_xticks([])
        # plt.tight_layout()
        # plt.savefig(model_dict['exp_path']+'plot.png')
        # print ('plotted')
        # fadsfad

        # if buffer_:
        #     if len(buffer_actions) <100:
        #         buffer_steps = 10
        #     else:
        #         buffer_steps = 1

        buffer_steps = 500

        if buffer_:
            #Insert into buffer
            buffer_states.append(agent.rollouts.states)
            buffer_actions.append(agent.rollouts.actions)

            # print (agent.rollouts.states)
            # print (agent.rollouts.actions)
            # fda
            # print (len(buffer_actions))

            #If buffer full enough,sample , predict, optimize
            # if len(buffer_actions) > 10:

            if len(buffer_actions) == 100:

                # if 1:
                #Num of optimization steps
                for i in range(buffer_steps):
                    # #Sample batch
                    # states_batch = []
                    # actions_batch = []
                    # for bb in range(num_processes):
                    #     ind = np.random.randint(len(buffer_actions))
                    #     print (buffer_states[ind].size())
                    #     fadas
                    #     states_batch.append(buffer_states[ind])
                    #     actions_batch.append(buffer_actions[ind])
                    # states_batch = torch.stack(states_batch, dim=1)
                    # actions_batch = torch.stack(actions_batch, dim=1)

                    ind = np.random.randint(len(buffer_actions))
                    states_batch = buffer_states[ind]
                    actions_batch = buffer_actions[ind]

                    #Optimize action-predictor
                    discrim_errors = discrim_predictions(
                        model_dict, states_batch, actions_batch, discriminator)
                    discriminator.optimize(discrim_errors)

                    if i % 20 == 0:
                        print(i)

                # print (len(buffer_actions), torch.mean(discrim_errors).data.cpu().numpy()[0])

            #Optimize agent
            discrim_errors = discrim_predictions(model_dict,
                                                 agent.rollouts.states,
                                                 agent.rollouts.actions,
                                                 discriminator)
            discrim_errors_reverse = discrim_predictions(
                model_dict,
                agent.rollouts.states,
                agent.rollouts.actions,
                discriminator,
                reverse=True)
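            # As in the commented-out per-step version above, discrim_errors scores how well
            # the discriminator predicts the taken action from consecutive frames, and the
            # reverse version swaps the frame order; both are passed to agent.update2 below.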

            if len(buffer_actions) > 100:
                discriminator.optimize(discrim_errors)

            agent.update2(discrim_errors,
                          discrim_errors_reverse)  #agent.update(j,num_updates)
            # agent.update2(discrim_errors)  #agent.update(j,num_updates)

        else:

            discrim_errors = discrim_predictions(model_dict,
                                                 agent.rollouts.states,
                                                 agent.rollouts.actions,
                                                 discriminator)
            discrim_errors_reverse = discrim_predictions(
                model_dict,
                agent.rollouts.states,
                agent.rollouts.actions,
                discriminator,
                reverse=True)

            #Optimize discriminator
            discriminator.optimize(discrim_errors)

            #Optimize agent
            agent.update2(discrim_errors,
                          discrim_errors_reverse)  #agent.update(j,num_updates)
            # agent.update2(discrim_errors)  #agent.update(j,num_updates)

        agent.insert_first_state(agent.rollouts.states[-1])

        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps

        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)
            #make vae prob gif
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae,
                              update_current_state, total_num_steps)
            # #make vae prob gif
            # if grad_var_:
            #     do_grad_var(envs_grad_var, agent, model_dict, update_current_state, total_num_steps)

        #Print updates
        if j % log_interval == 0:  # and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.3f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2,
                torch.mean(discrim_errors).data.cpu().numpy()[0])

            print(to_print_info_string)

            # if vae_:
            #     elbo =  "{:.2f}".format(elbo.data.cpu().numpy()[0])

            # if next_state_pred_:
            #     state_pred_error_print =  "{:.2f}".format(agent.state_pred_error.data.cpu().numpy()[0])
            #     print(to_print_info_string+' '+state_pred_error_print+' '+elbo)
            #     to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, pred_error, elbo"

            # else:
            # if vae_:
            #     print(to_print_info_string+' '+elbo)
            # else:
            # print(to_print_info_string)

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, discrim_E"  #, elbo"
            start2 = time.time()

            if j % (log_interval * 30) == 0:

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)

                # if grad_var_  and j % (log_interval*300) == 0:
                if grad_var_ and j % (log_interval * 30) == 0:
                    #writes to file
                    do_grad_var(envs_grad_var, agent, model_dict,
                                total_num_steps, update_current_state,
                                update_rewards)

                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)

                    # if grad_var_ and j % (log_interval*300) == 0:
                    if grad_var_ and j % (log_interval * 30) == 0:
                        update_grad_plot(model_dict)
                        to_print_legend_string += ' grad_var_plot updated '

                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    print(to_print_legend_string + " problem with plot")
                    raise

    try:
        make_plots(model_dict)
    except:
        print()
Exemplo n.º 28
0
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print ('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0



    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print ('init a2c agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if model_dict['load_params']:
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     # agent.actor_critic = torch.load(args.load_path).cuda()
        
    #     # print ('loaded ', args.load_path)

    #     if model_dict['load_number'] == 3:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict)

    #     elif model_dict['load_number'] == 6:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict)
    #     elif model_dict['load_number'] == 9:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict)

    #     # else:
    #     #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
    #     else:
    #         PROBLEM






    #load model
    # if model_dict['load_params']:

    # load_params(thigns)
    # param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
    param_file = home+'/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/seed1/model_params3/model_params9999360.pt'


    # pretrained_dict = torch.load(param_file)  # object
    # print (pretrained_dict)
    # agent_dict = agent.actor_critic.state_dict()  #dict
    # print (agent_dict.keys())
    # agent_dict.update(pretrained_dict)
    # # agent_dict.update(agent.actor_critic)
    # agent.actor_critic.load_state_dict(agent_dict)


    param_dict = torch.load(param_file)
    agent.actor_critic.load_state_dict(param_dict)


    # agent.actor_critic = torch.load(param_file)
    agent.actor_critic.cuda()
    print ('loaded', param_file)



    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval /num_processes/num_steps)

    # list of lists, where each inner list is a trajectory; trajectories hold (action, state) pairs
    dataset = []
    tmp_trajs = [[] for x in range(num_processes)]


    dataset_count = 0


    done = [0]*num_processes
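    # Trajectory collection: each process accumulates (action, state) pairs in tmp_trajs;
    # when that process reports done, its trajectory is flushed into dataset, and the run
    # stops (after a pickle dump) once roughly 10000 steps have been gathered.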

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]))#, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())

            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())






            # y = torch.LongTensor(batch_size,1).random_() % nb_digits
            # # One hot encoding buffer that you create out of the loop and just keep reusing
            # y_onehot = torch.FloatTensor(batch_size, nb_digits)
            # # In your for loop
            # y_onehot.zero_()
            # y_onehot.scatter_(1, y, 1)



            states_ = agent.rollouts.states[step].cpu().numpy()  #[P,S,84,84]
            # print (state_t.shape)
            actions_ = action.data.cpu().numpy() #[P,1]
            # print (action)
            # fdsaf


            #store step
            for proc in range(num_processes):

                #add states
                state_t = states_[proc]
                action_t = actions_[proc]
                tmp_trajs[proc].append([action_t, state_t])

                if done[proc]:

                    dataset.append(tmp_trajs[proc])
                    dataset_count += len(tmp_trajs[proc])
                    tmp_trajs[proc] = []

                    for ii in range(len(dataset)):
                        print (len(dataset[ii]))


            if dataset_count > 10000:

                # pickle.dump( dataset, open(home+'/Documents/tmp/breakout_2frames/breakout_trajectories_10000.pkl', "wb" ) )
                pickle.dump( dataset, open(home+'/Documents/tmp/RoadRunner/trajectories_10000.pkl', "wb" ) )

                print('saved')
                # pickle.save(dataset)
                return  # stop once ~10000 transitions have been collected and saved





            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 







            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done)






        # print (len(dataset))
        # print ()





        #Optimize agent
        # agent.update()  #agent.update(j,num_updates)
        agent.insert_first_state(agent.rollouts.states[-1])


        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps
        
        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
                # save_params_v2(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)


        #Print updates
        if j % log_interval == 0:# and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps,
                                       final_rewards.min(),
                                       final_rewards.median(),
                                       final_rewards.mean(),
                                       final_rewards.max(),
                                       int(total_num_steps / (end - start)),
                                       end - start,
                                       end - start2)
            print(to_print_info_string) 
            start2 = time.time()



            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval*30) == 0:
            
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    print(to_print_legend_string)
                    raise



    try:
        make_plots(model_dict)
    except:
        print ()
Exemplo n.º 29
0
def getEnvs():
    envs = [make_env for _ in range(len(getListOfGames("train")))]  # one env-constructor callable per training game
    print(envs, "******************* ENVS BEFORE SubprocVecEnv **************************")
    envs = SubprocVecEnv(envs)
    print(envs, "******************* ENVS AFTER SubprocVecEnv **************************")
    return envs
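
A minimal usage sketch for the vectorized envs returned above (the random-action rollout loop here is an illustrative assumption, not part of the original snippet):

import numpy as np

def random_rollout(envs, num_steps=100):
    # envs is a SubprocVecEnv: reset()/step() act on all worker envs at once
    obs = envs.reset()  # stacked observations, one row per worker
    for _ in range(num_steps):
        # sample one random action per worker env
        actions = np.array([envs.action_space.sample() for _ in range(envs.num_envs)])
        obs, rewards, dones, infos = envs.step(actions)  # each output has length num_envs
    envs.close()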
Exemplo n.º 30
0
def train(model_dict):
    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  #last frame is now the new one

        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  #[P,1]
        episode_rewards += reward  #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  #[P,1]
        final_rewards *= masks  #erase the ones that are done
        final_rewards += (
            1 -
            masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks  #erase the done ones
        masks = masks.type(dtype)  #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks  #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype'] = dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    action_size = envs.action_space.n

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')
    elif algo == 'a2c_list_rollout':
        agent = a2c_list_rollout(envs, model_dict)
        print('init a2c_list_rollout agent')
    elif algo == 'a2c_with_var':
        agent = a2c_with_var(envs, model_dict)
        print('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    #Load model
    if model_dict['load_params']:
        # agent.actor_critic = torch.load(os.path.join(args.load_path))
        # agent.actor_critic = torch.load(args.load_path).cuda()

        # print ('loaded ', args.load_path)

        if model_dict['load_number'] == 3:
            load_params_v2(
                home +
                '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/',
                agent, 3000160, model_dict)

        elif model_dict['load_number'] == 6:
            load_params_v2(
                home +
                '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/',
                agent, 6000160, model_dict)
        elif model_dict['load_number'] == 9:
            load_params_v2(
                home +
                '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/',
                agent, 9000160, model_dict)

        # else:
        #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
        else:
            raise ValueError('unexpected load_number: {}'.format(model_dict['load_number']))

    ls_path = save_dir + '/V_and_Q_errors/'
    ls_file = ls_path + 'error_monitor.csv'

    if not os.path.exists(ls_path):
        os.makedirs(ls_path)
        # if print_:
        print('Made dir', ls_path)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state,
        shape_dim0).type(dtype)  #add the new frame, remove oldest
    agent.insert_first_state(
        current_state
    )  #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        Vs = []
        Qs = []

        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action, action_log_probs, dist_entropy = agent.act(
                Variable(agent.rollouts.states[step]))  #, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())

            one_hot_action = torch.FloatTensor(num_processes, action_size)
            one_hot_action.zero_()
            one_hot_action.scatter_(1, action.data.cpu(), 1)
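            # One-hot example: with action_size=6 and action.data = [[2],[0]], scatter_
            # writes a 1 at (row 0, col 2) and (row 1, col 0), producing the
            # [P, action_size] one-hot matrix consumed by get_V_and_Q below.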

            # print (action)
            # print (one_hot_action)
            # fdsfa

            V, Q = agent.actor_critic.get_V_and_Q(
                Variable(agent.rollouts.states[step]), one_hot_action)
            Vs.append(V)
            Qs.append(Q)

            cpu_actions = action.data.squeeze(1).cpu().numpy()  #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy)  #, done)

        #Optimize agent
        # agent.update()  #agent.update(j,num_updates)

        V_loss, Q_loss = agent.update2(Vs, Qs)  #agent.update(j,num_updates)

        V_loss = V_loss.data.cpu().numpy()[0]
        Q_loss = Q_loss.data.cpu().numpy()[0]
        # print (V_loss)
        # fasd

        agent.insert_first_state(agent.rollouts.states[-1])

        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps

        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
                save_params_v2(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)

        #Print updates
        if j % log_interval == 0:  # and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2)
            print(to_print_info_string)
            start2 = time.time()

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval * 30) == 0:

                if total_num_steps > 5000:
                    with open(ls_file, 'a') as f:
                        writer = csv.writer(f)
                        writer.writerow([total_num_steps, V_loss, Q_loss])

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)

                    if total_num_steps > 5000:
                        update_error_plot(model_dict)

                    print(to_print_legend_string + " Plot updated")
                except Exception:
                    print(to_print_legend_string + " (plot update failed)")
                    raise

    try:
        make_plots(model_dict)
    except:
        print()
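# --- Hedged sketch (not part of the original example) ---------------------------------
# The loop above appends rows of [total_num_steps, V_loss, Q_loss] to `ls_file` as CSV so
# the loss curves can be re-plotted later. The reader below is an assumed, illustrative
# helper (not the project's update_error_plot); only the column layout is taken from the
# code above.
import csv

def read_loss_log(path):
    """Return (steps, v_losses, q_losses) parsed from the CSV written by the loop above."""
    steps, v_losses, q_losses = [], [], []
    with open(path) as f:
        for row in csv.reader(f):
            if not row:
                continue
            steps.append(int(float(row[0])))
            v_losses.append(float(row[1]))
            q_losses.append(float(row[2]))
    return steps, v_losses, q_losses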
Exemplo n.º 31
0
def train(model_dict):
    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  #last frame is now the new one

        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  #[P,1]
        episode_rewards += reward  #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  #[P,1]
        final_rewards *= masks  #erase the ones that are done
        final_rewards += (
            1 -
            masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks  #erase the done ones
        masks = masks.type(dtype)  #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks  #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype'] = dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    if vae_:
        print('env for vae')
        envs_vae = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    next_state_pred_ = 0
    model_dict['next_state_pred_'] = next_state_pred_

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')
    elif algo == 'a2c_list_rollout':
        agent = a2c_list_rollout(envs, model_dict)
        print('init a2c_list_rollout agent')
    elif algo == 'a2c_with_var':
        agent = a2c_with_var(envs, model_dict)
        print('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if args.load_path != '':
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     agent.actor_critic = torch.load(args.load_path).cuda()
    #     print ('loaded ', args.load_path)

    # see_reward_episode = 0
    # if 'Montez' in env_name and see_reward_episode:
    #     states_list = [[] for i in range(num_processes)]

    # view_reward_episode(model_dict=model_dict, frames=[])
    # dfasddsf

    if vae_:
        vae = VAE()
        vae.cuda()

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state,
        shape_dim0).type(dtype)  #add the new frame, remove oldest
    agent.insert_first_state(
        current_state
    )  #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    # prev_action = Variable(torch.zeros([num_processes, 1]).type(torch.LongTensor)).cuda()

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            state_pytorch = Variable(agent.rollouts.states[step])

            value, action, action_log_probs, dist_entropy = agent.act(
                state_pytorch)  #, volatile=True))

            # if next_state_pred_:
            #     next_state_prediction = agent.actor_critic.predict_next_state2(state_pytorch, prev_action)
            # next_state_prediction = 0

            # print (action_log_probs.size())
            # print (dist_entropy.size())

            # prev_action = action

            # print (next_state_prediction.size()) # [P,1,84,84]
            # fasd

            cpu_actions = action.data.squeeze(1).cpu().numpy()  #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            reward_numpy = reward

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)

            if next_state_pred_:

                agent.insert_data(step, current_state, action.data, value,
                                  reward, masks, action_log_probs,
                                  dist_entropy,
                                  next_state_prediction)  #, done)
                agent.rollouts.insert_state_pred(next_state_prediction)

            else:
                agent.insert_data(step, current_state, action.data, value,
                                  reward, masks, action_log_probs,
                                  dist_entropy, 0)  #, done)

            # if 'Montez' in env_name and see_reward_episode:

            #     for state_i in range(len(state)):
            #         if done[state_i]:
            #             states_list[state_i] = []
            #         else:
            #             states_list[state_i].append(np.squeeze(state[state_i]))

            #             # print (state[state_i].shape)
            #             # fasdf

            #         # print (reward)

            #         if reward_numpy[state_i] >0:
            #             #plot the states of state_i
            #             print (len(states_list[state_i]))
            #             # view_reward_episode(model_dict=model_dict, frames=states_list[state_i][len(states_list[state_i])-100:])
            #             # view_reward_episode(model_dict=model_dict, frames=states_list[state_i][len(states_list[state_i])-100:])
            #             view_reward_episode(model_dict=model_dict, frames=states_list[state_i])

            #             fadsa

            #      # and np.sum(agent.rollouts.rewards.cpu().numpy()) > 0

            #     # print (np.sum(agent.rollouts.rewards.cpu().numpy()))
            #     # print (j)

        #Optimize agent
        agent.update()  #agent.update(j,num_updates)

        if vae_:
            # Train the VAE on this update's rollout frames.
            batch = agent.rollouts.states  # [Steps+1, Processes, Stack, 84, 84]
            # Drop the first state (it repeats the last state of the previous update),
            # take frame 0 of each stack, and flatten to [Steps*Processes, 84, 84].
            batch = batch[1:]  # [Steps, Processes, Stack, 84, 84]
            batch = batch[:, :, 0]  # [Steps, Processes, 84, 84]
            batch = batch.contiguous().view(-1, 84, 84)  # [Steps*Processes, 84, 84]
            elbo = vae.update(batch)

        agent.insert_first_state(agent.rollouts.states[-1])

        # print (agent.state_pred_error.data.cpu().numpy())

        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps

        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)
            #make vae prob gif
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae,
                              update_current_state, total_num_steps)

        #Print updates
        if j % log_interval == 0:  # and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2)

            elbo = "{:.2f}".format(elbo.data.cpu().numpy()[0])

            if next_state_pred_:
                state_pred_error_print = "{:.2f}".format(
                    agent.state_pred_error.data.cpu().numpy()[0])
                print(to_print_info_string + ' ' + state_pred_error_print +
                      ' ' + elbo)
                to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, pred_error, elbo"

            else:
                print(to_print_info_string + ' ' + elbo)
                to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, elbo"

            start2 = time.time()

            if j % (log_interval * 30) == 0:

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except Exception:
                    print(to_print_legend_string + " (plot update failed)")
                    raise

    try:
        make_plots(model_dict)
    except:
        print()
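# --- Hedged sketch (illustration only, not part of the original example) --------------
# The VAE above is trained on frames pulled out of the rollout storage, reshaped from
# [Steps+1, Processes, Stack, 84, 84] to [Steps*Processes, 84, 84]. The toy tensor below
# just replays that shape bookkeeping with made-up sizes.
import torch

steps, procs, stack, h, w = 5, 4, 4, 84, 84
states = torch.zeros(steps + 1, procs, stack, h, w)   # stand-in for agent.rollouts.states
batch = states[1:]                # drop the repeated first state -> [Steps, P, Stack, H, W]
batch = batch[:, :, 0]            # keep frame 0 of each stack    -> [Steps, P, H, W]
batch = batch.contiguous().view(-1, h, w)              # flatten  -> [Steps*P, H, W]
assert batch.shape == (steps * procs, h, w)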
Exemplo n.º 32
0
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0



    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print ('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print ('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print ('init a2c_minibatch agent')
    elif algo == 'a2c_list_rollout':
        agent = a2c_list_rollout(envs, model_dict)
        print ('init a2c_list_rollout agent')
    elif algo == 'a2c_with_var':
        agent = a2c_with_var(envs, model_dict)
        print ('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if args.load_path != '':
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     agent.actor_critic = torch.load(args.load_path).cuda()
    #     print ('loaded ', args.load_path)












    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    #Begin training
    # count =0
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]))#, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())

            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy, done)





        #Optimize agent
        agent.update()  #agent.update(j,num_updates)
        agent.insert_first_state(agent.rollouts.states[-1])





        total_num_steps = (j + 1) * num_processes * num_steps
        
        if total_num_steps % save_interval == 0 and save_dir != "":

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)


        #Print updates
        if j % log_interval == 0:
            end = time.time()

            if j % (log_interval*30) == 0:

                #update plots
                try:
                    make_plots(model_dict)
                    print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated")
                except:
                    # raise
                    print("Upts, n_timesteps, min/med/mean/max, FPS, Time")

            print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".
                    format(j, total_num_steps,
                           final_rewards.min(),
                           final_rewards.median(),
                           final_rewards.mean(),
                           final_rewards.max(),
                           int(total_num_steps / (end - start)),
                           end - start))#, agent.current_lr)
    
    try:
        make_plots(model_dict)
    except:
        print ()
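# --- Hedged sketch (illustration only, not part of the original example) --------------
# How the done-mask bookkeeping in update_rewards() behaves on toy data: a process whose
# episode ends (done=True) gets mask 0, which copies its running total into final_rewards
# and resets episode_rewards for the next episode.
import torch

episode_rewards = torch.tensor([[3.0], [5.0]])   # running totals for 2 processes
final_rewards = torch.zeros(2, 1)
reward = torch.tensor([[1.0], [2.0]])
done = [False, True]

episode_rewards += reward                                          # [[4.], [7.]]
masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])   # [[1.], [0.]]
final_rewards = final_rewards * masks + (1 - masks) * episode_rewards
episode_rewards = episode_rewards * masks
print(final_rewards.tolist())    # [[0.0], [7.0]] -> finished episode's total is kept
print(episode_rewards.tolist())  # [[4.0], [0.0]] -> the done process starts fresh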
Exemplo n.º 33
0
def train(model_dict):
    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  #last frame is now the new one

        # if see_frames:
        #     #Grayscale
        #     save_frame(state, count)
        #     count+=1
        #     if done[0]:
        #         ffsdfa
        #     #RGB
        #     state = envs.render()
        #     print(state.shape)
        #     fdsafa

        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  #[P,1]
        episode_rewards += reward  #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  #[P,1]
        final_rewards *= masks  #erase the ones that are done
        final_rewards += (
            1 -
            masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks  #erase the done ones
        masks = masks.type(dtype)  #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks  #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    def do_vid():
        n_vids = 3
        for i in range(n_vids):
            done = False
            state = envs_video.reset()
            # state = torch.from_numpy(state).float().type(dtype)
            current_state = torch.zeros(1, *obs_shape)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0).type(dtype)
            # print ('Recording')
            # count=0
            while not done:
                # print (count)
                # count +=1
                # Act
                state_var = Variable(current_state, volatile=True)
                # print (state_var.size())
                action, value = agent.act(state_var)
                cpu_actions = action.data.squeeze(1).cpu().numpy()

                # Observe reward and next state
                state, reward, done, info = envs_video.step(
                    cpu_actions)  # state:[nProcesss, ndims, height, width]
                # state = torch.from_numpy(state).float().type(dtype)
                # current_state = torch.zeros(1, *obs_shape)
                current_state = update_current_state(current_state, state,
                                                     shape_dim0).type(dtype)
        state = envs_video.reset()

        vid_path = save_dir + '/videos/'
        count = 0
        for aaa in os.listdir(vid_path):

            if 'openaigym' in aaa and '.mp4' in aaa:
                #os.rename(vid_path+aaa, vid_path+'vid_t'+str(total_num_steps)+'.mp4')
                subprocess.call("(cd " + vid_path + " && mv " + vid_path +
                                aaa + " " + vid_path + env_name + '_' + algo +
                                '_vid_t' + str(total_num_steps) + '_' +
                                str(count) + ".mp4)",
                                shell=True)
                count += 1
            if '.json' in aaa:
                os.remove(vid_path + aaa)

    def save_frame(state, count):

        frame_path = save_dir + '/frames/'
        if not os.path.exists(frame_path):
            os.makedirs(frame_path)
            print('Made dir', frame_path)

        state1 = np.squeeze(state[0])
        # print (state1.shape)
        fig = plt.figure(figsize=(4, 4), facecolor='white')
        plt.imshow(state1, cmap='gray')
        plt.savefig(frame_path + 'frame' + str(count) + '.png')
        print('saved', frame_path + 'frame' + str(count) + '.png')
        plt.close(fig)

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    vid_ = 1
    see_frames = 0

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape'] = obs_shape

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if args.load_path != '':
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     agent.actor_critic = torch.load(args.load_path).cuda()
    #     print ('loaded ', args.load_path)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state,
        shape_dim0).type(dtype)  #add the new frame, remove oldest
    agent.insert_first_state(
        current_state
    )  #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    #Begin training
    # count =0
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P,1]
            action, value = agent.act(
                Variable(agent.rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()  #[P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)

            # Update state
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value.data,
                              reward, masks)

        #Optimize agent
        agent.update()  #agent.update(j,num_updates)
        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        #Save model
        if total_num_steps % save_interval == 0 and save_dir != "":
            save_path = os.path.join(save_dir, 'model_params')
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            # A really ugly way to save a model to CPU
            save_model = agent.actor_critic
            if cuda:
                save_model = copy.deepcopy(agent.actor_critic).cpu()
            # torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
            # steps_sci_nota = '{e}'.format(total_num_steps)
            save_to = os.path.join(
                save_path, "model_params" + str(total_num_steps) + ".pt")
            # save_to=os.path.join(save_path, "model_params" + steps_sci_nota+".pt")
            torch.save(save_model, save_to)
            print('saved', save_to)

            #make video
            if vid_:
                do_vid()

        #Print updates
        if j % log_interval == 0:
            end = time.time()

            if j % (log_interval * 30) == 0:

                #update plots
                try:
                    make_plots(model_dict)
                    print(
                        "Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated"
                    )
                except Exception:
                    print("Upts, n_timesteps, min/med/mean/max, FPS, Time")
                    raise

            print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".format(
                j, total_num_steps, final_rewards.min(),
                final_rewards.median(), final_rewards.mean(),
                final_rewards.max(), int(total_num_steps / (end - start)),
                end - start))  #, agent.current_lr)

    try:
        make_plots(model_dict)
    except:
        print()
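# --- Hedged sketch (illustration only, not part of the original example) --------------
# The save block above copies the model to CPU before torch.save so the checkpoint can be
# loaded on machines without a GPU. The tiny Linear module below is a stand-in; only the
# deepcopy-to-CPU-then-save pattern mirrors the code above.
import copy
import os
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
if torch.cuda.is_available():
    model = model.cuda()

save_model = model
if next(model.parameters()).is_cuda:
    save_model = copy.deepcopy(model).cpu()   # keep the live model on GPU, save a CPU copy

os.makedirs('model_params', exist_ok=True)
torch.save(save_model, os.path.join('model_params', 'model_params_demo.pt'))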
Exemplo n.º 34
0
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']


    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    grad_var_ = model_dict['grad_var_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print ('env for ls')
        envs_ls = make_env_basic(env_name)

    if vae_:
        print ('env for vae')
        envs_vae = make_env_basic(env_name)

    if grad_var_:
        print ('env for grad_var_')
        envs_grad_var = make_env_basic(env_name)



    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0
    model_dict['action_size'] = envs.action_space.n
    print (envs.action_space.n, 'actions')



    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print ('init a2c agent')

    elif algo == 'dqn':
        agent = DQN(envs, model_dict)
        print ('init DQN agent')  
        print (agent.q_net)   



    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest, since its a stack
    # agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval /num_processes/num_steps)


    # dqn_epsilon = .1 #lower means less likely to do random .9 # .1

    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 50000
    epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        dqn_epsilon = epsilon_by_frame(j)

        #Num steps till agent update
        # for step in range(num_steps):

        # Act, [P,1], [P,1], [P,1], [P]
        # state_pytorch = Variable(agent.rollouts.states[step])
        state_pytorch = Variable(current_state)
        # value, action, action_log_probs, dist_entropy = agent.act(state_pytorch, epsilon=dqn_epsilon)#, volatile=True))
        action = agent.act(state_pytorch, epsilon=dqn_epsilon)#, volatile=True))
        
        # Apply to Environment, S:[P,C,H,W], R:[P], D:[P]
        # cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
        frame, reward, done, info = envs.step(action) 

        # Record rewards and update state
        reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
        # NOTE: update_current_state writes into the tensor it is given, so clone first;
        # otherwise current_state and new_current_state would alias the same tensor and the
        # replay buffer would store identical state / next-state pairs.
        new_current_state = update_current_state(current_state.clone(), frame, shape_dim0)


        agent.replay_buffer.push(current_state, action, reward, new_current_state, done.astype(int))

        current_state = new_current_state


        if len(agent.replay_buffer) > 100:
            agent.update()
            # agent.update()
            # agent.update()
            # agent.update()







        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        # NOTE: the inner rollout loop above is disabled, so each update takes a single env
        # step per process; this count therefore assumes num_steps env steps per update.
        total_num_steps = (j + 1) * num_processes * num_steps
        
        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)
            #make vae prob gif (requires a trained `vae` object, which this example does not construct)
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae, update_current_state, total_num_steps)
            # #make vae prob gif
            # if grad_var_:
            #     do_grad_var(envs_grad_var, agent, model_dict, update_current_state, total_num_steps)

        #Print updates
        if j % log_interval == 0:# and j!=0:
            end = time.time()


            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.2f}, {:.5f}".format(j, total_num_steps,
                                       final_rewards.min(),
                                       final_rewards.median(),
                                       final_rewards.mean(),
                                       final_rewards.max(),
                                       int(total_num_steps / (end - start)),
                                       end - start,
                                       end - start2,
                                       dqn_epsilon,
                                       agent.loss.data.cpu().numpy()[0])
                                       # torch.mean(discrim_errors).data.cpu().numpy()[0])

            print(to_print_info_string)


            # if vae_:
            #     elbo =  "{:.2f}".format(elbo.data.cpu().numpy()[0])


            # if next_state_pred_:
            #     state_pred_error_print =  "{:.2f}".format(agent.state_pred_error.data.cpu().numpy()[0])
            #     print(to_print_info_string+' '+state_pred_error_print+' '+elbo)
            #     to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, pred_error, elbo"

            # else:
            # if vae_:
            #     print(to_print_info_string+' '+elbo)
            # else:
            # print(to_print_info_string)


            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, discrim_E"#, elbo"
            start2 = time.time()

            if j % (log_interval*30) == 0:
            
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)

                # if grad_var_  and j % (log_interval*300) == 0:
                if grad_var_  and j % (log_interval*30) == 0:
                    #writes to file
                    do_grad_var(envs_grad_var, agent, model_dict, total_num_steps, update_current_state, update_rewards)






                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)

                    # if grad_var_ and j % (log_interval*300) == 0:
                    if grad_var_ and j % (log_interval*30) == 0:
                        update_grad_plot(model_dict)
                        to_print_legend_string += ' grad_var_plot updated '

                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")

                    # print (len(agent.replay_buffer))
                except Exception:
                    print(to_print_legend_string + " problem with plot")
                    raise



    try:
        make_plots(model_dict)
    except:
        print ()
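# --- Hedged sketch (illustration only, not part of the original example) --------------
# The exponential epsilon schedule used above, evaluated at a few update indices to show
# how exploration anneals from epsilon_start down to epsilon_final.
import math

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.01, 50000
epsilon_by_frame = lambda idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * idx / epsilon_decay)

for idx in [0, 10000, 50000, 200000]:
    print(idx, round(epsilon_by_frame(idx), 3))
# 0 -> 1.0, 10000 -> ~0.82, 50000 -> ~0.374, 200000 -> ~0.028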
Exemplo n.º 35
0
def viz(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one



        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    def do_vid():
        n_vids=3
        for i in range(n_vids):
            done=False
            state = envs_video.reset()
            # state = torch.from_numpy(state).float().type(dtype)
            current_state = torch.zeros(1, *obs_shape)
            current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
            # print ('Recording')
            # count=0
            while not done:
                # print (count)
                # count +=1
                # Act
                state_var = Variable(current_state, volatile=True) 
                # print (state_var.size())
                action, value = agent.act(state_var)
                cpu_actions = action.data.squeeze(1).cpu().numpy()

                # Observe reward and next state
                state, reward, done, info = envs_video.step(cpu_actions) # state:[nProcesss, ndims, height, width]
                # state = torch.from_numpy(state).float().type(dtype)
                # current_state = torch.zeros(1, *obs_shape)
                current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
        state = envs_video.reset()
        
        vid_path = save_dir+'/videos/'
        count =0
        for aaa in os.listdir(vid_path):

            if 'openaigym' in aaa and '.mp4' in aaa:
                #os.rename(vid_path+aaa, vid_path+'vid_t'+str(total_num_steps)+'.mp4')
                subprocess.call("(cd "+vid_path+" && mv "+ vid_path+aaa +" "+ vid_path+env_name+'_'+algo+'_vid_t'+str(total_num_steps)+'_'+str(count) +".mp4)", shell=True) 
                count+=1
            if '.json' in aaa:
                os.remove(vid_path+aaa)




    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    
    num_processes = 1
    model_dict['num_processes'] = 1
    
    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor


    # Create environments
    print (num_processes, 'processes')
    # monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    # if not os.path.exists(monitor_rewards_dir):
    #     os.makedirs(monitor_rewards_dir)
    #     print ('Made dir', monitor_rewards_dir) 

    monitor_rewards_dir = ''
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    vid_ = 0
    see_frames = 1

    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape


    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print ('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print ('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print ('init a2c_minibatch agent')
    # agent = model_dict['agent'](envs, model_dict)




    #Load model
    # if args.load_path != '':
        # agent.actor_critic = torch.load(os.path.join(args.load_path))

    epoch_level = 1e6  # which checkpoint (in env steps) to load; adjust to an existing save
    model_params_file = save_dir+ '/model_params/model_params'+str(int(epoch_level))+'.pt'
    agent.actor_critic = torch.load(model_params_file).cuda()
    print ('loaded ', model_params_file)
    # fafdas


    frame_path = save_dir+'/frames/'
    if not os.path.exists(frame_path):
        os.makedirs(frame_path)
        print ('Made dir', frame_path) 




    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    #Begin training
    count = 0
    max_frames = 1000  # assumed cap on recorded frames (not defined in the original snippet)
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # if see_frames:
            #Grayscale
            # save_frame(state, count)




            # #RGB
            # state = envs.render()
            # print(state.shape)
            # fdsafa


            values = []
            actions = []
            for ii in range(100):
                # Act, [P,1], [P,1]
                action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
                val = value.data.cpu().numpy()[0][0]
                act_ = action.data.cpu().numpy()[0][0]
                # print ('value', val)
                # print ('action', act_)
                values.append(val)
                actions.append(act_)

            # print ('values', values)
            # print ('actions', actions)

            rows = 1
            cols = 2

            fig = plt.figure(figsize=(8,4), facecolor='white')

            # plot frame
            ax = plt.subplot2grid((rows,cols), (0,0), frameon=False)

            state1 = np.squeeze(state[0])
            ax.imshow(state1, cmap='gray')
            ax.set_xticks([])
            ax.set_yticks([])
            # ax.savefig(frame_path+'frame' +str(count)+'.png')
            # print ('saved',frame_path+'frame' +str(count)+'.png')
            # plt.close(fig)


            #plot values histogram
            ax = plt.subplot2grid((rows,cols), (0,1), frameon=False)

            weights = np.ones_like(values)/float(len(values))
            ax.hist(values, 50, range=[0.0, 4.], weights=weights)
            # ax.set_ylim(top=1.)
            ax.set_ylim([0.,1.])

            plt_path = frame_path+'plt' 
            plt.savefig(plt_path+str(count)+'.png')
            print ('saved',plt_path+str(count)+'.png')
            plt.close(fig)
            # fsadf



            count+=1
            if count > 2:
                if done[0] or count > max_frames:
                    return  # stop visualizing once the episode ends or the frame cap is hit





                # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
                # print ('value', value)
                # print ('action', action)

                # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
                # print ('value', value)
                # print ('action', action)


            
            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 



            # Record rewards
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            
            # Update state
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value.data, reward, masks)



        # #Optimize agent
        # agent.update()  #agent.update(j,num_updates)
        # agent.insert_first_state(agent.rollouts.states[-1])




        total_num_steps = (j + 1) * num_processes * num_steps