def dynamics_data_gen(env_name='Reacher-v2',
                      start_seed=0,
                      timesteps=10,
                      n_parallel_envs=1,
                      width=300,
                      height=240):
    """Roll out a random policy in parallel environments and collect the data.

    One environment is created per seed in
    ``range(start_seed, start_seed + n_parallel_envs)``. At every timestep the
    rendered frames, the current observations and the sampled actions are
    recorded *before* the environments are stepped.

    Relies on ``np`` and ``RandPolicy`` being available at module scope.

    Args:
        env_name: Id passed to ``gym.make``.
        start_seed: Seed of the first environment; following environments use
            consecutive seeds.
        timesteps: Number of steps to record.
        n_parallel_envs: Number of sub-process environments.
        width: Render width forwarded to ``env.render``.
        height: Render height forwarded to ``env.render``.

    Returns:
        A dict with keys ``obs``, ``acs`` and ``views``; each value is the
        per-timestep entries combined with ``np.stack``. With
        ``timesteps == 0`` no rows are recorded and the dict is presumably
        empty (depends on pandas' handling of an empty frame).
    """
    import gym  # import locally so that caller can patch gym

    def make_env(seed):
        def _():
            env = gym.make(env_name)
            env.seed(seed)
            return env

        return _

    # Uncomment this to show the bug
    # from requests_futures.sessions import FuturesSession
    # session = FuturesSession()
    # session.get('http://www.google.com', )

    from subproc_vec_env import SubprocVecEnv
    # from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

    env = SubprocVecEnv(
        [make_env(s) for s in range(start_seed, start_seed + n_parallel_envs)])

    rollouts = []
    try:
        policy = RandPolicy(env.observation_space, env.action_space,
                            env.num_envs)

        obs = env.reset()
        for _ in range(timesteps):
            # fs = env.render("rgb", width=width, height=height)
            fs = env.render("rgb_array", width=width, height=height)
            acs = policy.act(obs)
            rollouts.append(dict(obs=obs, acs=acs, views=fs))
            obs, _rewards, _dones, _infos = env.step(acs)
    finally:
        # Always shut the worker processes down, even if the rollout fails;
        # otherwise they are left running after this function returns.
        env.close()

    import pandas as pd
    return {k: np.stack(v) for k, v in pd.DataFrame(rollouts).items()}
Exemplo n.º 2
0
class BatchSampler(object):
    """Collect batches of episodes from several worker environments.

    The sampler owns a ``SubprocVecEnv`` with ``num_workers`` environments
    plus one local environment (``self._env``) that is only used to sample
    tasks. Episode ids are handed to the vectorised environment through a
    shared ``multiprocessing`` queue.

    Relies on ``mp``, ``gym``, ``tf``, ``make_env``, ``SubprocVecEnv`` and
    ``BatchEpisodes`` being available at module scope.
    """

    def __init__(self, env_name, batch_size,
                 num_workers=max(1, mp.cpu_count() - 1)):
        """Create the worker environments.

        Args:
            env_name: Environment id passed to ``make_env`` / ``gym.make``.
            batch_size: Number of episodes collected by one ``sample`` call.
            num_workers: Number of worker environments. Defaults to one less
                than the CPU count, but never less than one (the previous
                default evaluated to 0 on single-core hosts).

        Raises:
            ValueError: If ``num_workers`` is smaller than 1.
        """
        # Fail early and clearly: a sampler without workers cannot collect
        # any episode.
        if num_workers < 1:
            raise ValueError(
                'num_workers must be >= 1, got {!r}'.format(num_workers))

        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.queue = mp.Queue()
        self.envs = SubprocVecEnv(
            [make_env(env_name) for _ in range(num_workers)],
            queue=self.queue)
        self._env = gym.make(env_name)

    def sample(self, policy, params=None, gamma=0.95):
        """Run ``policy`` until ``batch_size`` episodes have been collected.

        Args:
            policy: Callable invoked as ``policy(observations, params=params)``
                whose result exposes ``.sample()`` returning a tensor with a
                ``.numpy()`` method.
            params: Optional parameters forwarded to ``policy``.
            gamma: Discount factor forwarded to ``BatchEpisodes``.

        Returns:
            The populated ``BatchEpisodes`` instance.
        """
        episodes = BatchEpisodes(batch_size=self.batch_size, gamma=gamma)

        # Enqueue one id per requested episode, followed by one ``None`` per
        # worker (presumably a "no more work" sentinel -- confirm against
        # SubprocVecEnv).
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)

        observations, batch_ids = self.envs.reset()
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            actions_tensor = policy(observations, params=params).sample()
            with tf.device('/CPU:0'):
                actions = actions_tensor.numpy()
            new_observations, rewards, dones, new_batch_ids, _ = \
                self.envs.step(actions)
            # Store the transition under the ids that produced it, then
            # advance to the next observations / ids.
            episodes.append(observations, actions, rewards, batch_ids)
            observations, batch_ids = new_observations, new_batch_ids

        return episodes

    def reset_task(self, task):
        """Assign ``task`` to every worker; return True if all of them succeed."""
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks):
        """Return ``num_tasks`` tasks sampled from the local environment."""
        tasks = self._env.unwrapped.sample_tasks(num_tasks)
        return tasks
Exemplo n.º 3
0
def train(model_dict):
    """Train an actor-critic agent on vectorised environments.

    Configuration is read from ``model_dict`` (keys ``num_frames``, ``cuda``,
    ``which_gpu``, ``num_steps``, ``num_processes``, ``seed``, ``env``,
    ``save_to``, ``num_stack``, ``algo``, ``save_interval`` and
    ``log_interval``); ``obs_shape`` is written back into it. The function
    periodically saves the model, records videos, refreshes plots and prints
    reward statistics.

    Side effects: sets the ``OMP_NUM_THREADS`` and ``CUDA_VISIBLE_DEVICES``
    environment variables and creates directories below ``save_to``.

    Raises:
        ValueError: If ``model_dict['algo']`` is not one of ``'a2c'``,
            ``'ppo'`` or ``'a2c_minibatch'``.
    """

    def update_current_state(current_state, state, channels):
        """Shift the frame stack by one frame and append ``state`` in place.

        Per the original comments, ``current_state`` is
        [processes, channels*stack, height, width] and ``state`` is a numpy
        array of [processes, channels, height, width].
        """
        state = torch.from_numpy(state).float()
        # The first stack*channels - channels planes take the values of the
        # last ones (slide forward); the newest frame goes to the end.
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        """Accumulate rewards and reset bookkeeping for finished episodes.

        Per the original comments, ``reward`` and ``done`` are [P],
        ``final_rewards`` / ``episode_rewards`` are [P,1] and
        ``current_state`` is [P,C*S,H,W]. The three tensors are modified in
        place and also returned.
        """
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  # [P,1]
        episode_rewards += reward  # running return of the current episode
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  # [P,1]
        final_rewards *= masks  # erase entries of finished episodes ...
        final_rewards += (1 - masks) * episode_rewards  # ... set to return
        episode_rewards *= masks  # restart the running return
        masks = masks.type(dtype)  # move to the compute device type
        if current_state.dim() == 4:  # state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # zero the state of finished episodes
        return reward, masks, final_rewards, episode_rewards, current_state

    def do_vid():
        """Play a few episodes in the monitored env and rename the videos."""
        n_vids = 3
        for _ in range(n_vids):
            done = False
            state = envs_video.reset()
            current_state = torch.zeros(1, *obs_shape)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0).type(dtype)
            while not done:
                # Act
                state_var = Variable(current_state, volatile=True)
                action, value = agent.act(state_var)
                cpu_actions = action.data.squeeze(1).cpu().numpy()

                # Observe reward and next state
                state, reward, done, info = envs_video.step(cpu_actions)
                current_state = update_current_state(current_state, state,
                                                     shape_dim0).type(dtype)
        # Extra reset after the last episode (kept from the original;
        # presumably lets the monitor finalise the last recording).
        envs_video.reset()

        vid_path = save_dir + '/videos/'
        count = 0
        for fname in os.listdir(vid_path):
            if 'openaigym' in fname and '.mp4' in fname:
                new_name = (env_name + '_' + algo + '_vid_t' +
                            str(total_num_steps) + '_' + str(count) + '.mp4')
                # Rename directly instead of shelling out to
                # ``cd ... && mv ...``: that broke for relative ``save_dir``
                # values and for file names with shell metacharacters.
                os.replace(vid_path + fname, vid_path + new_name)
                count += 1
            elif '.json' in fname:
                os.remove(vid_path + fname)

    def save_frame(state, count):
        """Debug helper: save the first process' frame as a grayscale PNG.

        Not called by the training loop.
        """
        frame_path = save_dir + '/frames/'
        if not os.path.exists(frame_path):
            os.makedirs(frame_path)
            print('Made dir', frame_path)

        state1 = np.squeeze(state[0])
        fig = plt.figure(figsize=(4, 4), facecolor='white')
        plt.imshow(state1, cmap='gray')
        plt.savefig(frame_path + 'frame' + str(count) + '.png')
        print('saved', frame_path + 'frame' + str(count) + '.png')
        plt.close(fig)

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    # Reject unknown algorithms before any worker process is spawned; the
    # original only failed later with a NameError on ``agent``.
    if algo not in ('a2c', 'ppo', 'a2c_minibatch'):
        raise ValueError('unsupported algo: {!r}'.format(algo))

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    vid_ = 1  # video recording is always enabled in this variant

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack,
                 *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    model_dict['obs_shape'] = obs_shape

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state,
        shape_dim0).type(dtype)  # add the new frame, remove oldest
    # Storage has states (num_steps + 1, num_processes, *obs_shape) per the
    # original comment; this sets the first step.
    agent.insert_first_state(current_state)

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    # Begin training
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P,1]
            action, value = agent.act(
                Variable(agent.rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards
            (reward, masks, final_rewards, episode_rewards,
             current_state) = update_rewards(reward, done, final_rewards,
                                             episode_rewards, current_state)

            # Update state
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value.data,
                              reward, masks)

        # Optimize agent
        agent.update()
        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        # Save model
        if total_num_steps % save_interval == 0 and save_dir != "":
            save_path = os.path.join(save_dir, 'model_params')
            os.makedirs(save_path, exist_ok=True)
            # Save a CPU copy so the checkpoint loads without a GPU.
            save_model = agent.actor_critic
            if cuda:
                save_model = copy.deepcopy(agent.actor_critic).cpu()
            save_to = os.path.join(
                save_path, "model_params" + str(total_num_steps) + ".pt")
            torch.save(save_model, save_to)
            print('saved', save_to)

            # make video
            if vid_:
                do_vid()

        # Print updates
        if j % log_interval == 0:
            end = time.time()

            if j % (log_interval * 30) == 0:
                # Update plots; failures propagate to the caller, exactly as
                # the original ``except: raise`` did.
                make_plots(model_dict)
                print(
                    "Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated"
                )

            print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".format(
                j, total_num_steps, final_rewards.min(),
                final_rewards.median(), final_rewards.mean(),
                final_rewards.max(), int(total_num_steps / (end - start)),
                end - start))

    # Final plot refresh is best effort, but report why it failed instead of
    # silently printing an empty line.
    try:
        make_plots(model_dict)
    except Exception as exc:
        print('final make_plots failed:', exc)
Exemplo n.º 4
0
def runTrain(gymId='BreakoutNoFrameskip-v4',
             numEnvs=16,
             seed=0,
             filePathBrain='training/breakout-v1.pth',
             numSteps=5,
             numBatches=20000,
             outputBatchInterval=1000,
             joinEnvs=1,
             epsilon=0.00001):
    """Train an A2C agent on ``numEnvs`` parallel Atari environments.

    Each batch runs ``numSteps`` steps in every environment, stacks the last
    four frames as the agent's state and calls ``agent.learn`` on the joined
    data. Progress is printed and the agent saved every
    ``outputBatchInterval`` batches and once more at the end.

    Args:
        gymId: Environment id passed to ``make_atari``.
        numEnvs: Number of sub-process environments.
        seed: Base seed; environment ``rank`` is seeded with ``seed + rank``.
        filePathBrain: File the agent is loaded from and saved to; a falsy
            value disables both.
        numSteps: Steps per environment per batch.
        numBatches: Number of batches to train for.
        outputBatchInterval: Logging / saving period, in batches.
        joinEnvs: Unused by this function.
        epsilon: Passed to ``agent.selectActions`` as ``randomRatio``.

    Returns:
        None.
    """

    def make_env(rank):
        def _thunk():
            env = make_atari(gymId)
            env.seed(seed + rank)
            gym.logger.setLevel(logging.WARN)
            env = wrap_deepmind(env)

            # wrap the env one more time for getting total reward
            env = Monitor(env, rank)
            return env

        return _thunk

    def timingStats(totalSteps):
        """Return (run time in minutes, time per step, unit of the latter)."""
        runTime = date_time.diff(date_time.now(), timingStart, 'minutes')
        # Guard against zero steps (e.g. numBatches == 0).
        perStep = runTime / totalSteps if totalSteps else 0
        unit = 'minutes'
        if perStep < 0.02:
            perStep *= 60
            unit = 'seconds'
        return runTime, perStep, unit

    print('training starting', numBatches, outputBatchInterval, 'epsilon',
          epsilon)
    env = SubprocVecEnv([make_env(i) for i in range(numEnvs)])

    try:
        numActions = env.action_space.n

        torchDevice = 'cpu'
        if torch.cuda.is_available():
            torchDevice = 'cuda'
        agent = ai_a2c.A2C(numActions, device=torchDevice)
        if filePathBrain:
            agent.load(filePath=filePathBrain)

        timingStart = date_time.now()
        batchCount = 0

        # Set first state.
        # Environment returns 1 frame, but we want multiple, so we stack the
        # new state on top of the past ones.
        nh, nw, nc = env.observation_space.shape
        nstack = 4
        batchStateShape = (numEnvs * numSteps, nh, nw, nc * nstack)
        emptyState = np.zeros((numEnvs, nh, nw, nc * nstack), dtype=np.uint8)
        obs = env.reset()
        lastStates = updateState(obs, emptyState, nc)
        lastDones = [False for _ in range(numEnvs)]

        totalRewards = []
        realTotalRewards = []
        # All actions are always valid; derive them from the action space
        # instead of hard-coding Breakout's four actions.
        validActions = list(range(numActions))

        while batchCount < numBatches:
            states, actions, rewards, dones, values = [], [], [], [], []
            stepCount = 0
            while stepCount < numSteps:
                actionsStep, valuesStep = agent.selectActions(
                    lastStates, validActions=validActions,
                    randomRatio=epsilon)
                states.append(np.copy(lastStates))
                actions.append(actionsStep)
                values.append(valuesStep)
                if stepCount > 0:
                    dones.append(lastDones)

                # Input the action (run a step) for all environments.
                statesStep, rewardsStep, donesStep, infosStep = env.step(
                    actionsStep)

                # Clear the frame stack of any environment that finished.
                for n, done in enumerate(donesStep):
                    if done:
                        lastStates[n] = lastStates[n] * 0
                # Stack the observations just returned by the step. The
                # original passed the stale ``obs`` from the initial reset,
                # so the agent never saw a new frame.
                lastStates = updateState(statesStep, lastStates, nc)

                # Update rewards for logging / tracking.
                for done, info in zip(donesStep, infosStep):
                    if done:
                        totalRewards.append(info['reward'])
                        if info['total_reward'] != -1:
                            realTotalRewards.append(info['total_reward'])

                lastDones = donesStep
                rewards.append(rewardsStep)

                stepCount += 1

            # Dones is one off, so add the last one.
            dones.append(lastDones)

            # Join all (combine batches and steps).
            states = np.asarray(states, dtype='float32').swapaxes(
                1, 0).reshape(batchStateShape)
            actions = np.asarray(actions).swapaxes(1, 0).flatten()
            rewards = np.asarray(rewards).swapaxes(1, 0).flatten()
            dones = np.asarray(dones).swapaxes(1, 0).flatten()
            values = np.asarray(values).swapaxes(1, 0).flatten()
            agent.learn(states, actions, rewards, dones, values)

            batchCount += 1

            if batchCount % outputBatchInterval == 0:
                totalSteps = batchCount * numSteps
                runTime, runTimePerStep, runTimePerStepUnit = timingStats(
                    totalSteps)
                print(batchCount, numBatches, '(batch done)',
                      number.toFixed(runTime), 'run time minutes,',
                      totalSteps, 'steps,', number.toFixed(runTimePerStep),
                      runTimePerStepUnit, 'per step')

                r = totalRewards[-100:]  # get last 100
                tr = realTotalRewards[-100:]
                if len(r) == 100:
                    print("avg reward (last 100):", np.mean(r))
                if len(tr) == 100:
                    print("avg total reward (last 100):", np.mean(tr))
                    print("max (last 100):", np.max(tr))

                # Only save periodically as well.
                if filePathBrain:
                    agent.save(filePathBrain)
    finally:
        # Shut the worker processes down even when training fails.
        env.close()

    if filePathBrain:
        agent.save(filePathBrain)

    totalSteps = numBatches * numSteps
    runTime, runTimePerStep, runTimePerStepUnit = timingStats(totalSteps)
    print('training done:', number.toFixed(runTime),
          'run time minutes,', totalSteps, 'steps,',
          number.toFixed(runTimePerStep), runTimePerStepUnit, 'per step')

    return None
Exemplo n.º 5
0
def train(model_dict):
    """Roll out a pre-trained A2C policy and dump the visited trajectories.

    Despite its name this variant performs no optimisation step: it loads
    actor-critic parameters from a hard-coded checkpoint below ``home``, runs
    the policy in ``num_processes`` environments and records
    ``[action, state]`` pairs per process. Once more than 10000 steps of
    completed trajectories have been gathered the dataset is pickled to a
    hard-coded path below ``home`` and the function returns.

    Configuration is read from ``model_dict`` (keys ``num_frames``, ``cuda``,
    ``which_gpu``, ``num_steps``, ``num_processes``, ``seed``, ``env``,
    ``save_to``, ``num_stack``, ``algo``, ``save_interval``,
    ``log_interval``, ``save_params``, ``vid_``, ``gif_`` and ``ls_``);
    ``dtype``, ``obs_shape`` and ``shape_dim0`` are written back into it.

    Side effects: sets the ``OMP_NUM_THREADS`` and ``CUDA_VISIBLE_DEVICES``
    environment variables and creates directories below ``save_to``.

    Raises:
        ValueError: If ``model_dict['algo']`` is not ``'a2c'`` (the only
            agent this variant can build).
    """

    def update_current_state(current_state, state, channels):
        """Shift the frame stack by one frame and append ``state`` in place.

        Per the original comments, ``current_state`` is
        [processes, channels*stack, height, width] and ``state`` is a numpy
        array of [processes, channels, height, width].
        """
        state = torch.from_numpy(state).float()
        # Slide the older frames forward, then write the newest one last.
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        """Accumulate rewards and reset bookkeeping for finished episodes.

        Per the original comments, ``reward`` and ``done`` are [P],
        ``final_rewards`` / ``episode_rewards`` are [P,1] and
        ``current_state`` is [P,C*S,H,W]. The three tensors are modified in
        place and also returned.
        """
        reward = torch.from_numpy(
            np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # running return of the current episode
        masks = torch.FloatTensor(
            [[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase entries of finished episodes ...
        final_rewards += (1 - masks) * episode_rewards  # ... set to return
        episode_rewards *= masks  # restart the running return
        masks = masks.type(dtype)  # move to the compute device type
        if current_state.dim() == 4:  # state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # zero the state of finished episodes
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    # Only the a2c agent can be built here; fail before spawning workers
    # instead of hitting a NameError on ``agent`` later.
    if algo != 'a2c':
        raise ValueError('unsupported algo: {!r}'.format(algo))

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack,
                 *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    # Create agent
    agent = a2c(envs, model_dict)
    print('init a2c agent')

    # Load the pre-trained parameters.
    # NOTE(review): torch.load unpickles the file; only use trusted
    # checkpoints.
    param_file = home+'/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/seed1/model_params3/model_params9999360.pt'
    # Map storages to CPU first so the checkpoint also loads on hosts without
    # a GPU; load_state_dict copies the values into the model's parameters.
    param_dict = torch.load(param_file,
                            map_location=lambda storage, loc: storage)
    agent.actor_critic.load_state_dict(param_dict)
    if cuda:
        # The original moved the model to the GPU unconditionally, which
        # cannot work when ``cuda`` is False.
        agent.actor_critic.cuda()
    print('loaded', param_file)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state,
        shape_dim0).type(dtype)  # add the new frame, remove oldest
    # Storage has states (num_steps + 1, num_processes, *obs_shape) per the
    # original comment; this sets the first step.
    agent.insert_first_state(current_state)

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    # At least 1 so that ``j % save_interval_num_updates`` cannot divide by
    # zero when save_interval < num_processes * num_steps.
    save_interval_num_updates = max(
        1, int(save_interval / num_processes / num_steps))

    # List of trajectories; each trajectory is a list of [action, state].
    dataset = []
    tmp_trajs = [[] for _ in range(num_processes)]
    dataset_count = 0

    done = [0] * num_processes

    # Begin rollout
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            value, action, action_log_probs, dist_entropy = agent.act(
                Variable(agent.rollouts.states[step]))

            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            states_ = agent.rollouts.states[step].cpu().numpy()  # [P,S,84,84]
            actions_ = action.data.cpu().numpy()  # [P,1]

            # Store step
            for proc in range(num_processes):
                tmp_trajs[proc].append([actions_[proc], states_[proc]])

                # ``done`` still holds the flags of the previous env step.
                if done[proc]:
                    dataset.append(tmp_trajs[proc])
                    dataset_count += len(tmp_trajs[proc])
                    tmp_trajs[proc] = []

                    for traj in dataset:
                        print(len(traj))

            if dataset_count > 10000:
                with open(home+'/Documents/tmp/RoadRunner/trajectories_10000.pkl', "wb") as dump_file:
                    pickle.dump(dataset, dump_file)
                print('saved')
                # Collection finished. The original halted here by
                # referencing an undefined name (``STOP``).
                return

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            (reward, masks, final_rewards, episode_rewards,
             current_state) = update_rewards(reward, done, final_rewards,
                                             episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value,
                              reward, masks, action_log_probs, dist_entropy)

        # No agent.update() here: the policy is only rolled out.
        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            # Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            # make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            # make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(
                j, total_num_steps,
                final_rewards.min(),
                final_rewards.median(),
                final_rewards.mean(),
                final_rewards.max(),
                int(total_num_steps / (end - start)),
                end - start,
                end - start2)
            print(to_print_info_string)
            start2 = time.time()

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval * 30) == 0:

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)

                # Update plots; failures propagate to the caller, exactly as
                # the original ``except: raise`` did.
                if ls_:
                    update_ls_plot(model_dict)
                make_plots(model_dict)
                print(to_print_legend_string + " Plot updated")

    # Final plot refresh is best effort, but report why it failed instead of
    # silently printing an empty line.
    try:
        make_plots(model_dict)
    except Exception as exc:
        print('final make_plots failed:', exc)
Exemplo n.º 6
0
def train(model_dict):
    """Run the A2C + discriminator training loop configured by ``model_dict``.

    Every update collects ``num_steps`` transitions from ``num_processes``
    parallel environments, scores the rollout with ``discrim_predictions``
    (once normally and once with ``reverse=True``), optionally optimizes the
    discriminator, and then calls ``agent.update2`` with both error tensors.
    At fixed intervals it invokes the save / video / gif helpers and prints a
    progress line; ``make_plots`` is attempted once more after the last update.

    Args:
        model_dict: configuration dict.  Keys read here: ``num_frames``,
            ``cuda``, ``which_gpu``, ``num_steps``, ``num_processes``,
            ``seed``, ``env``, ``save_to``, ``num_stack``, ``algo``,
            ``save_interval``, ``log_interval``, ``save_params``, ``vid_``,
            ``gif_``, ``ls_``, ``vae_``, ``grad_var_``.  Keys written here:
            ``dtype``, ``obs_shape``, ``shape_dim0``, ``action_size``.

    Raises:
        ValueError: if ``model_dict['algo']`` is not ``'a2c'`` (the only agent
            this variant can build).
    """

    def update_current_state(current_state, state, channels):
        """Shift the frame stack by one frame and append ``state`` (in place)."""
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # Slide the older frames towards the front, dropping the oldest one ...
        current_state[:, :-channels] = current_state[:, channels:]
        # ... and write the newest frame into the last `channels` slots.
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        """Accumulate rewards and zero the state of finished episodes.

        Returns ``(reward, masks, final_rewards, episode_rewards,
        current_state)``; the last three are also modified in place.
        """
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  # [P,1]
        episode_rewards += reward  # running return of the current episode
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  # [P,1], 0 where done
        final_rewards *= masks  # clear the slots of episodes that just ended
        final_rewards += (1 - masks) * episode_rewards  # store their return
        episode_rewards *= masks  # restart the running return for those
        masks = masks.type(dtype)  # `dtype` is bound below, before first call
        if current_state.dim() == 4:  # state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # zero the state of finished episodes
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    grad_var_ = model_dict['grad_var_']

    # Fail before any subprocess environment is spawned.  Previously an
    # unsupported algo left `agent` unbound and surfaced later as a NameError.
    if algo != 'a2c':
        raise ValueError(
            "unsupported algo {!r}: this trainer only builds 'a2c'".format(
                algo))

    if vae_:
        # No VAE is ever constructed in this trainer, so the periodic
        # do_prob_state(...) call used to die with a NameError on `vae` at the
        # first save interval.  Skip the feature instead of crashing mid-run.
        print('vae_ requested, but this trainer builds no VAE: '
              'skipping VAE prob gifs')
        vae_ = False

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    # Separate environments for the optional side tasks.
    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    if grad_var_:
        print('env for grad_var_')
        envs_grad_var = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0
    model_dict['action_size'] = envs.action_space.n
    print(envs.action_space.n, 'actions')

    # Create agent (algo was validated above)
    agent = a2c(envs, model_dict)
    print('init a2c agent')

    # NOTE(review): the discriminator is moved to the GPU even when
    # model_dict['cuda'] is false -- confirm that CPU-only runs are not
    # expected to work with this trainer.
    discriminator = CNN_Discriminator(model_dict).cuda()
    print('init discriminator')

    # Rollout buffer used to pre-train the discriminator.
    buffer_ = 1
    buffer_maxlen = 200  # rollouts retained
    buffer_warmup = 100  # rollouts collected before the discriminator trains
    buffer_steps = 500  # optimization steps in the one-off warm-up burst

    if buffer_:
        buffer_states = deque(maxlen=buffer_maxlen)
        buffer_actions = deque(maxlen=buffer_maxlen)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state, shape_dim0).type(
            dtype)  # add the new frame, remove oldest, since it is a stack
    # storage has states: (num_steps + 1, num_processes, *obs_shape)
    agent.insert_first_state(current_state)

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  # running return of the current episode
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    # Clamp to 1: when save_interval < num_processes * num_steps the integer
    # conversion yields 0 and `j % 0` below raised ZeroDivisionError.
    save_interval_num_updates = max(
        1, int(save_interval / num_processes / num_steps))

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P,1], [P,1], [P]
            state_pytorch = Variable(agent.rollouts.states[step])
            value, action, action_log_probs, dist_entropy = agent.act(
                state_pytorch)

            # Apply to Environment, S:[P,C,H,W], R:[P], D:[P]
            # Author's note: Breakout actions are
            # ['NOOP', 'FIRE', 'RIGHT', 'LEFT']; for Montezuma FIRE is
            # thought to be JUMP.
            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]
            frame, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, frame,
                                                 shape_dim0)
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy, 0)

        if buffer_:
            # Insert into buffer.
            # NOTE(review): these append references, not copies.  If
            # agent.rollouts reuses its storage tensors across updates, every
            # buffer entry aliases the current rollout -- confirm.
            buffer_states.append(agent.rollouts.states)
            buffer_actions.append(agent.rollouts.actions)

            # One-off burst of discriminator training, fired on the update
            # where the buffer first holds exactly `buffer_warmup` rollouts.
            if len(buffer_actions) == buffer_warmup:
                for i in range(buffer_steps):
                    ind = np.random.randint(len(buffer_actions))
                    states_batch = buffer_states[ind]
                    actions_batch = buffer_actions[ind]

                    # Optimize action-predictor
                    discrim_errors = discrim_predictions(
                        model_dict, states_batch, actions_batch, discriminator)
                    discriminator.optimize(discrim_errors)

                    if i % 20 == 0:
                        print(i)

        # Score the fresh rollout in both directions.
        discrim_errors = discrim_predictions(model_dict,
                                             agent.rollouts.states,
                                             agent.rollouts.actions,
                                             discriminator)
        discrim_errors_reverse = discrim_predictions(
            model_dict,
            agent.rollouts.states,
            agent.rollouts.actions,
            discriminator,
            reverse=True)

        # With the buffer enabled the discriminator only trains on the live
        # rollout once past warm-up; without it, on every update.
        if not buffer_ or len(buffer_actions) > buffer_warmup:
            discriminator.optimize(discrim_errors)

        # Optimize agent
        agent.update2(discrim_errors, discrim_errors_reverse)

        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            # Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            # make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            # make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.3f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2,
                torch.mean(discrim_errors).data.cpu().numpy()[0])

            print(to_print_info_string)

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, discrim_E"
            start2 = time.time()

            if j % (log_interval * 30) == 0:

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)

                if grad_var_:
                    # writes to file
                    do_grad_var(envs_grad_var, agent, model_dict,
                                total_num_steps, update_current_state,
                                update_rewards)

                # Update plots.  Errors propagate: the former try/except only
                # re-raised, and the print after that `raise` was unreachable.
                if ls_:
                    update_ls_plot(model_dict)

                if grad_var_:
                    update_grad_plot(model_dict)
                    to_print_legend_string += ' grad_var_plot updated '

                make_plots(model_dict)
                print(to_print_legend_string + " Plot updated")

    # Final plot is best-effort, but report why it failed and let
    # KeyboardInterrupt / SystemExit through.
    try:
        make_plots(model_dict)
    except Exception as err:
        print('final make_plots failed:', err)
Exemplo n.º 7
0
def train(model_dict):
    """Run the agent training loop, optionally fitting a VAE on the frames.

    Every update collects ``num_steps`` transitions from ``num_processes``
    parallel environments and calls ``agent.update()``.  When
    ``model_dict['vae_']`` is set, a ``VAE`` is also updated on the first
    stacked frame of every collected state and its result is appended to the
    progress line.  At fixed intervals the save / video / gif helpers are
    invoked; ``make_plots`` is attempted once more after the last update.

    Args:
        model_dict: configuration dict.  Keys read here: ``num_frames``,
            ``cuda``, ``which_gpu``, ``num_steps``, ``num_processes``,
            ``seed``, ``env``, ``save_to``, ``num_stack``, ``algo``,
            ``save_interval``, ``log_interval``, ``save_params``, ``vid_``,
            ``gif_``, ``ls_``, ``vae_``.  Keys written here: ``dtype``,
            ``obs_shape``, ``shape_dim0``, ``next_state_pred_``.

    Raises:
        ValueError: if ``model_dict['algo']`` names no agent this trainer can
            build.
    """

    def update_current_state(current_state, state, channels):
        """Shift the frame stack by one frame and append ``state`` (in place)."""
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # Slide the older frames towards the front, dropping the oldest one ...
        current_state[:, :-channels] = current_state[:, channels:]
        # ... and write the newest frame into the last `channels` slots.
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        """Accumulate rewards and zero the state of finished episodes.

        Returns ``(reward, masks, final_rewards, episode_rewards,
        current_state)``; the last three are also modified in place.
        """
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  # [P,1]
        episode_rewards += reward  # running return of the current episode
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  # [P,1], 0 where done
        final_rewards *= masks  # clear the slots of episodes that just ended
        final_rewards += (1 - masks) * episode_rewards  # store their return
        episode_rewards *= masks  # restart the running return for those
        masks = masks.type(dtype)  # `dtype` is bound below, before first call
        if current_state.dim() == 4:  # state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # zero the state of finished episodes
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']

    # Fail before any subprocess environment is spawned.  Previously an
    # unknown algo left `agent` unbound and surfaced later as a NameError.
    known_algos = ('a2c', 'ppo', 'a2c_minibatch', 'a2c_list_rollout',
                   'a2c_with_var')
    if algo not in known_algos:
        raise ValueError("unsupported algo {!r}; expected one of {}".format(
            algo, known_algos))

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    # Separate environments for the optional side tasks.
    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    if vae_:
        print('env for vae')
        envs_vae = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    # Next-state prediction is switched off in this trainer.  The flag is
    # still published because the agent is handed model_dict; the branches
    # that consumed a (never computed) `next_state_prediction` were
    # unreachable and have been dropped.
    model_dict['next_state_pred_'] = 0

    # Create agent (algo was validated above)
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')
    elif algo == 'a2c_list_rollout':
        agent = a2c_list_rollout(envs, model_dict)
        print('init a2c_list_rollout agent')
    elif algo == 'a2c_with_var':
        agent = a2c_with_var(envs, model_dict)
        print('init a2c_with_var agent')

    if vae_:
        # NOTE(review): moved to the GPU regardless of model_dict['cuda'] --
        # confirm that CPU-only runs with vae_ are not expected to work.
        vae = VAE()
        vae.cuda()

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state,
        shape_dim0).type(dtype)  # add the new frame, remove oldest
    # storage has states: (num_steps + 1, num_processes, *obs_shape)
    agent.insert_first_state(current_state)

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  # running return of the current episode
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    # Clamp to 1: when save_interval < num_processes * num_steps the integer
    # conversion yields 0 and `j % 0` below raised ZeroDivisionError.
    save_interval_num_updates = max(
        1, int(save_interval / num_processes / num_steps))

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            state_pytorch = Variable(agent.rollouts.states[step])
            value, action, action_log_probs, dist_entropy = agent.act(
                state_pytorch)

            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value,
                              reward, masks, action_log_probs,
                              dist_entropy, 0)

        # Optimize agent
        agent.update()

        if vae_:
            # Only touch the VAE when one was built: the update (and the elbo
            # print below) used to run unconditionally and raised NameError
            # on `vae` whenever vae_ was false.
            batch = agent.rollouts.states  # [Steps+1,Processes,Stack,H,W]
            # Drop the first state: it repeats the last state of the previous
            # rollout.
            batch = batch[1:]  # [Steps,Processes,Stack,H,W]
            # Keep only the first frame of each stack.
            batch = batch[:, :, 0]  # [Steps,Processes,H,W]
            # Flatten steps and processes; use the actual frame size instead
            # of a hard-coded 84x84.
            batch = batch.contiguous().view(
                -1, *batch.size()[-2:])  # [Steps*Processes,H,W]
            elbo = vae.update(batch)

        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            # Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            # make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            # make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)
            # make vae prob gif
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae,
                              update_current_state, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2)

            if vae_:
                elbo_str = "{:.2f}".format(elbo.data.cpu().numpy()[0])
                print(to_print_info_string + ' ' + elbo_str)
                to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, elbo"
            else:
                print(to_print_info_string)
                to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T"

            start2 = time.time()

            if j % (log_interval * 30) == 0:

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)

                # Update plots.  Errors propagate: the former try/except only
                # re-raised, and the print after that `raise` was unreachable.
                if ls_:
                    update_ls_plot(model_dict)
                make_plots(model_dict)
                print(to_print_legend_string + " Plot updated")

    # Final plot is best-effort, but report why it failed and let
    # KeyboardInterrupt / SystemExit through.
    try:
        make_plots(model_dict)
    except Exception as err:
        print('final make_plots failed:', err)
def viz(model_dict):
    """Roll out a saved policy in one environment and print reward diagnostics.

    Reads the run settings from ``model_dict``, forces a single process,
    builds the agent selected by ``model_dict['algo']``, replaces its
    ``actor_critic`` with a model loaded from ``save_dir`` and then steps the
    environment, printing done/mask/reward values on every step.  Once more
    than ``max_frames`` steps have been taken it computes the returns, prints
    them next to the rewards and evaluates the bare name ``ffsdfa``.

    NOTE(review): ``max_frames``, ``epoch_level`` and ``frame_path`` are never
    assigned inside this function; presumably they are module-level globals --
    confirm, otherwise their first use raises NameError.

    Args:
        model_dict: dict of run settings.  Keys read here: 'num_frames',
            'cuda', 'which_gpu', 'num_steps', 'num_processes', 'seed', 'env',
            'save_to', 'num_stack', 'algo', 'save_interval', 'log_interval'.
            The keys 'num_processes', 'num_steps' and 'obs_shape' are
            overwritten.
    """

    def update_current_state(current_state, state, channels):
        # Push the newest observation onto the stacked state (in place) and return it.
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # Slide the stack forward: all planes except the oldest `channels` move to the front.
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one



        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Fold one step's rewards/dones into the running statistics (all updated in place).
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) # dtype comes from the enclosing scope (cuda or cpu float tensor type)
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state




    # Unpack the run configuration.
    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    
    # Override the configured values: one process, and a rollout of max_frames steps.
    # NOTE(review): max_frames is not defined in this function -- presumably a module global.
    num_processes = 1
    model_dict['num_processes'] = 1
    model_dict['num_steps'] = max_frames
    num_steps = max_frames
    
    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor


    # Create environments
    print (num_processes, 'processes')

    # Empty string is passed as the monitor directory here.
    monitor_rewards_dir = ''
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    # Hard-coded switches; with vid_ = 0 the video environment below is never built.
    vid_ = 0
    see_frames = 1

    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape


    # Create agent
    # NOTE(review): any other algo value leaves `agent` unassigned, so the
    # load step below fails with UnboundLocalError.
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print ('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print ('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print ('init a2c_minibatch agent')
    # agent = model_dict['agent'](envs, model_dict)




    #Load model
    # NOTE(review): epoch_level is not defined in this function -- presumably a module global.
    model_params_file = save_dir+ '/model_params/model_params'+str(int(epoch_level))+'.pt'
    # The loaded model is moved to the GPU unconditionally, regardless of the `cuda` setting.
    agent.actor_critic = torch.load(model_params_file).cuda()
    print ('loaded ', model_params_file)


    # NOTE(review): the assignment below is commented out, so frame_path must
    # come from module scope -- confirm.
    # frame_path = save_dir+'/frames/'
    if not os.path.exists(frame_path):
        os.makedirs(frame_path)
        print ('Made dir', frame_path) 




    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    # Roll out the loaded policy (no agent update is performed in this function).
    count =0
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # A commented-out matplotlib figure used to be produced here on
            # every step: the current frame, a histogram of 100 sampled value
            # estimates and a bar chart of the action probabilities, saved as
            # frame_path + 'plt' + str(count) + '.png'.

            count+=1
            if count % 10 ==0:
                print (count)

            # reward/done/masks are assigned further down in this loop body, so
            # they hold the previous step's values here; the guard skips the
            # first two iterations (on the very first one they do not exist yet).
            if count > 2:
                if reward.cpu().numpy() > 0:
                    # print (, reward.cpu().numpy(), count)
                    print (done[0],masks.cpu().numpy(), reward.cpu().numpy(),'reward!!', step)
                    print (np.squeeze(agent.rollouts.rewards.cpu().numpy()))
                else:
                    print (done[0],masks.cpu().numpy(), reward.cpu().numpy())


                # if done[0] or count > max_frames:
                if count > max_frames:

                    # Bootstrap from the value of the last stored state, then compute returns.
                    next_value = agent.actor_critic(Variable(agent.rollouts.states[-1], volatile=True))[0].data
                    agent.rollouts.compute_returns(next_value, agent.use_gae, agent.gamma, agent.tau)

                    rollouts_ =  np.squeeze(agent.rollouts.returns.cpu().numpy())
                    rewards_ =  np.squeeze(agent.rollouts.rewards.cpu().numpy())


                    # Print each step's return next to its reward.
                    for jj in range(len(rollouts_)):

                        print (jj, rollouts_[jj], rewards_[jj])
                    # NOTE(review): bare undefined-looking name; presumably a deliberate
                    # NameError used to stop the run after the dump -- confirm.
                    ffsdfa


            # Query the agent for an action and a value estimate for the stored state.
            action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))

            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 



            # Record rewards
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            
            # Update state
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value.data, reward, masks)


        # Computed but not used anywhere else in this function.
        total_num_steps = (j + 1) * num_processes * num_steps
Exemplo n.º 9
0
def train(model_dict):
    """Train an actor-critic agent on a set of parallel environments.

    Reads the run settings from ``model_dict``, creates ``num_processes``
    environments, builds the agent selected by ``model_dict['algo']`` and then
    alternates between collecting ``num_steps`` transitions per process and
    calling ``agent.update()``.  Parameters, videos and gifs are written
    whenever the number of collected steps is a multiple of
    ``save_interval``; progress is printed every ``log_interval`` updates.

    Args:
        model_dict: dict of run settings.  Keys read here: 'num_frames',
            'cuda', 'which_gpu', 'num_steps', 'num_processes', 'seed', 'env',
            'save_to', 'num_stack', 'algo', 'save_interval', 'log_interval',
            'save_params', 'vid_', 'gif_'.  The keys 'dtype', 'obs_shape' and
            'shape_dim0' are written back into it.

    Raises:
        ValueError: if ``model_dict['algo']`` is not a supported algorithm
            name.  (Previously this surfaced later as an UnboundLocalError on
            ``agent``, after the worker processes had been started.)
    """

    def update_current_state(current_state, state, channels):
        """Push the newest observation onto the stacked state, in place."""
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # Slide the stack forward: all planes except the oldest `channels`
        # move to the front, then the new frame takes the last slot.
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        """Fold one step's rewards and done flags into the running statistics.

        All tensor arguments are updated in place and also returned.
        """
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # cumulative reward of the running episode
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # clear the entries whose episode just finished
        final_rewards += (1 - masks) * episode_rewards  # ...and store the finished total
        episode_rewards *= masks  # restart the counters of finished episodes
        masks = masks.type(dtype)  # same tensor type as the state (cuda or cpu)
        if current_state.dim() == 4:  # state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # zero the state of finished episodes
        return reward, masks, final_rewards, episode_rewards, current_state

    supported_algos = ('a2c', 'ppo', 'a2c_minibatch', 'a2c_list_rollout',
                       'a2c_with_var')

    # Unpack the run configuration.
    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']

    # Fail fast, before any worker process is spawned.
    if algo not in supported_algos:
        raise ValueError('Unsupported algo {!r}; expected one of {}'.format(
            algo, ', '.join(supported_algos)))

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir)
                          for i in range(num_processes)])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    # Create agent.  An if/elif chain is kept (rather than a lookup table) so
    # that only the selected constructor name has to be resolvable.
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
    elif algo == 'a2c_list_rollout':
        agent = a2c_list_rollout(envs, model_dict)
    else:  # 'a2c_with_var' -- membership was validated above
        agent = a2c_with_var(envs, model_dict)
    print('init ' + algo + ' agent')

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    # Add the first frame to the (empty) stack and move it to the working tensor type.
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
    # Storage has states: (num_steps + 1, num_processes, *obs_shape); set the first step.
    agent.insert_first_state(current_state)

    # These are used to compute reward statistics over all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # running per-episode totals
    final_rewards = torch.zeros([num_processes, 1])  # totals of the last finished episodes

    num_updates = int(num_frames) // num_steps // num_processes

    # Begin training
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            value, action, action_log_probs, dist_entropy = agent.act(
                Variable(agent.rollouts.states[step]))

            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy, done)

        # Optimize agent, then seed the next rollout with the last stored state.
        agent.update()
        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if total_num_steps % save_interval == 0 and save_dir != "":

            # Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            # Make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            # Make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()

            if j % (log_interval * 30) == 0:

                # Plotting is best-effort: a failure must not stop training,
                # but Ctrl-C / SystemExit must still get through, hence
                # `except Exception` instead of a bare `except`.
                try:
                    make_plots(model_dict)
                    print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated")
                except Exception:
                    print("Upts, n_timesteps, min/med/mean/max, FPS, Time")

            print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".
                    format(j, total_num_steps,
                           final_rewards.min(),
                           final_rewards.median(),
                           final_rewards.mean(),
                           final_rewards.max(),
                           int(total_num_steps / (end - start)),
                           end - start))

    # Final plot refresh; still best-effort, but the failure is now reported
    # instead of being replaced by an empty line.
    try:
        make_plots(model_dict)
    except Exception as err:
        print('make_plots failed:', err)
Exemplo n.º 10
0
def train(model_dict):
    """Train a context-conditioned a2c agent together with a CNN discriminator.

    For every update a context index is sampled per process
    (``np.random.choice(n_contexts, num_processes)``) and passed to the agent
    as a one-hot tensor.  The frames returned by the environments during the
    rollout are collected and given to ``discriminator.update`` together with
    the context on every second update (plus 20 extra calls on the very first
    update); the resulting error is passed to ``agent.update2``.

    Args:
        model_dict: dict of run settings.  Keys read here: 'num_frames',
            'cuda', 'which_gpu', 'num_steps', 'num_processes', 'seed', 'env',
            'save_to', 'num_stack', 'save_interval', 'log_interval',
            'save_params', 'vid_', 'gif_', 'ls_', 'vae_', 'grad_var_'.
            The keys 'dtype', 'action_space', 'obs_shape', 'shape_dim0',
            'next_state_pred_' and 'n_contexts' are written back into it.

    NOTE(review): the discriminator is moved to the GPU unconditionally, and
    the ``vae_`` branch references a name ``vae`` that is not defined in this
    function -- see the comments at those lines.
    """

    def update_current_state(current_state, state, channels):
        """Push the newest observation onto the stacked state, in place."""
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # Slide the stack forward: all planes except the oldest `channels`
        # move to the front, then the new frame takes the last slot.
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        """Fold one step's rewards and done flags into the running statistics.

        All tensor arguments are updated in place and also returned.
        """
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # cumulative reward of the running episode
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  # [P,1]
        final_rewards *= masks  # clear the entries whose episode just finished
        final_rewards += (1 - masks) * episode_rewards  # ...and store the finished total
        episode_rewards *= masks  # restart the counters of finished episodes
        masks = masks.type(dtype)  # same tensor type as the state (cuda or cpu)
        if current_state.dim() == 4:  # state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # zero the state of finished episodes
        return reward, masks, final_rewards, episode_rewards, current_state

    # Unpack the run configuration.
    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    grad_var_ = model_dict['grad_var_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    # One extra environment per enabled diagnostic.
    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    if vae_:
        print('env for vae')
        envs_vae = make_env_basic(env_name)

    if grad_var_:
        print('env for grad_var_')
        envs_grad_var = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels
    action_space = envs.action_space

    model_dict['action_space'] = action_space
    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    next_state_pred_ = 0
    model_dict['next_state_pred_'] = next_state_pred_

    n_contexts = 2
    model_dict['n_contexts'] = n_contexts

    # Create agent (a2c is always used here; model_dict['algo'] is ignored).
    agent = a2c(model_dict)
    print('init a2c agent')

    # NOTE(review): moved to the GPU regardless of the `cuda` setting.
    discriminator = CNN_Discriminator(num_steps, n_contexts, model_dict).cuda()
    print('init discriminator')

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    # Add the first frame to the (empty) stack and move it to the working tensor type.
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
    # Storage has states: (num_steps + 1, num_processes, *obs_shape); set the first step.
    agent.insert_first_state(current_state)

    # These are used to compute reward statistics over all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # running per-episode totals
    final_rewards = torch.zeros([num_processes, 1])  # totals of the last finished episodes

    num_updates = int(num_frames) // num_steps // num_processes
    # Clamp to at least 1: when save_interval is smaller than one update's
    # worth of steps the quotient truncates to 0 and the `j % ...` test below
    # would raise ZeroDivisionError.
    save_interval_num_updates = max(
        1, int(save_interval / num_processes / num_steps))

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        # Sample one context per process for this rollout and one-hot encode it.
        context_np = np.random.choice(n_contexts, num_processes)
        context = torch.from_numpy(context_np).view(num_processes, 1)
        context_onehot = torch.FloatTensor(num_processes, n_contexts).zero_()
        context_onehot.scatter_(1, context, 1)  # [P,C]

        list_frames = []
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            state_pytorch = Variable(agent.rollouts.states[step])
            value, action, action_log_probs, dist_entropy = agent.act(
                state_pytorch, context_onehot)

            # Apply to Environment, S:[P,C,H,W], R:[P], D:[P]
            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]
            state, reward, done, info = envs.step(cpu_actions)

            # Keep the raw frames of this rollout for the discriminator.
            list_frames.append(torch.FloatTensor(state))

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)
            # A constant 0 is stored in place of `done`.
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy, 0)

        # Optimize discriminator on every second update; on odd updates the
        # error computed on the previous update is reused below.
        if j % 2 == 0:
            discriminator_error = discriminator.update(list_frames,
                                                       context)  # [P]
            if j == 0:
                # Extra discriminator updates on the very first rollout.
                print('multiple updates')
                for jj in range(20):
                    discriminator_error = discriminator.update(
                        list_frames, context)

        # Uses the state of the last rollout step.
        grad_sum = agent.actor_critic.graddd(state_pytorch, context_onehot)

        # Optimize agent, then seed the next rollout with the last stored state.
        agent.update2(context_onehot, discriminator_error, grad_sum)
        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            # Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            # Make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            # Make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)
            # Make vae prob gif
            if vae_:
                # NOTE(review): `vae` is not defined in this function; unless
                # it exists at module level this raises NameError -- confirm
                # where it is meant to come from.
                do_prob_state(envs_vae, agent, model_dict, vae,
                              update_current_state, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.3f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2,
                torch.mean(discriminator_error).data.cpu().numpy()[0])

            print(to_print_info_string)

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, D_error"
            start2 = time.time()

            if j % (log_interval * 30) == 0:

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)

                if grad_var_:
                    # Writes to file
                    do_grad_var(envs_grad_var, agent, model_dict,
                                total_num_steps, update_current_state,
                                update_rewards)

                # Update plots.  Errors propagate to the caller: the original
                # wrapped this in `try/except: raise`, which is equivalent,
                # and the print after that `raise` could never run.
                if ls_:
                    update_ls_plot(model_dict)

                if grad_var_:
                    update_grad_plot(model_dict)
                    to_print_legend_string += ' grad_var_plot updated '

                make_plots(model_dict)
                print(to_print_legend_string + " Plot updated")

    # Final plot refresh is best-effort; report the failure instead of
    # printing an empty line, and let Ctrl-C / SystemExit through.
    try:
        make_plots(model_dict)
    except Exception as err:
        print('make_plots failed:', err)
Exemplo n.º 11
0
def train(model_dict):
    """Run the A2C training loop configured by ``model_dict``.

    Configuration keys read: ``num_frames``, ``cuda``, ``which_gpu``,
    ``num_steps``, ``num_processes``, ``seed``, ``env``, ``save_to``,
    ``num_stack``, ``algo``, ``save_interval``, ``log_interval``,
    ``save_params``, ``vid_``, ``gif_``, ``ls_``, ``load_params`` and, when
    ``load_params`` is truthy, ``load_number``.

    Keys written back: ``dtype``, ``obs_shape``, ``shape_dim0`` and
    ``action_size``.

    Side effects: sets the ``OMP_NUM_THREADS`` and ``CUDA_VISIBLE_DEVICES``
    environment variables, creates ``<save_to>/monitor_rewards`` if missing,
    spawns ``num_processes`` environment workers, and periodically calls the
    project helpers for saving parameters, videos, gifs and plots.

    Raises:
        ValueError: if ``algo`` is not ``'a2c'``, or if ``load_params`` is set
            and ``load_number`` is not one of 3, 6 or 9.
    """

    def update_current_state(current_state, state, channels):
        """Shift the frame stack by one frame (in place) and append ``state``."""
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # Slide every frame but the oldest towards the front of the stack ...
        current_state[:, :-channels] = current_state[:, channels:]
        # ... and store the newest observation in the last slot.
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        """Fold one step of rewards/dones into the running reward statistics.

        Also zeroes ``current_state`` for every process whose episode ended.
        """
        # reward, done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # cumulative reward of the episode in progress
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # clear the slots of episodes that just finished
        final_rewards += (1 - masks) * episode_rewards  # record their total return
        episode_rewards *= masks  # restart accumulation for finished episodes
        masks = masks.type(dtype)  # move to the configured tensor type (cuda or cpu)

        if current_state.dim() == 4:  # image observations
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # zero the state of finished episodes
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir)
                          for i in range(num_processes)])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    action_size = envs.action_space.n
    model_dict['action_size'] = action_size

    # Create agent
    if algo == 'a2c':
        agent = a2c(model_dict)
        print('init a2c agent')
    else:
        # Previously `agent` was simply left unbound and a NameError surfaced
        # further down; fail here with a message that names the cause.
        raise ValueError("unsupported algo {!r}; only 'a2c' is available".format(algo))

    # Load model
    if model_dict['load_params']:
        # load_number -> step count of the checkpoint to restore
        checkpoint_steps = {3: 3000160, 6: 6000160, 9: 9000160}
        load_number = model_dict['load_number']
        if load_number not in checkpoint_steps:
            raise ValueError(
                "unsupported load_number {!r}; expected one of {}".format(
                    load_number, sorted(checkpoint_steps)))
        load_params_v2(
            home + '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/',
            agent, checkpoint_steps[load_number], model_dict)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
    # storage has states: (num_steps + 1, num_processes, *obs_shape); set the first step
    agent.insert_first_state(current_state)

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # reward of the episode in progress
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    # Clamp to 1: a save_interval smaller than one update's worth of frames
    # would otherwise round to 0 and make the modulo below divide by zero.
    save_interval_num_updates = max(1, int(save_interval / num_processes / num_steps))

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            value, action, action_log_probs, dist_entropy = agent.act(
                Variable(agent.rollouts.states[step] / 255.))

            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value, reward, masks,
                              action_log_probs, dist_entropy)

        # Optimize agent, then seed the next rollout with the last state seen.
        agent.update()
        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:
            # Save model
            if save_params:
                save_params_v3(save_dir, agent, total_num_steps, model_dict)
            # make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent,
                       model_dict, total_num_steps)
            # make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(
                j, total_num_steps,
                final_rewards.min(),
                final_rewards.median(),
                final_rewards.mean(),
                final_rewards.max(),
                int(total_num_steps / (end - start)),
                end - start,
                end - start2)
            print(to_print_info_string)
            start2 = time.time()

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval * 30) == 0:
                # Plotting errors propagate to the caller. The previous
                # try/except only re-raised, so dropping it changes nothing.
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)
                    update_ls_plot(model_dict)
                make_plots(model_dict)
                print(to_print_legend_string + " Plot updated")

    # The final plot refresh is best-effort: training is already done, so a
    # plotting failure is reported instead of raised. Only Exception is caught
    # so that KeyboardInterrupt / SystemExit still get through.
    try:
        make_plots(model_dict)
    except Exception as err:
        print('Final plot update failed:', repr(err))
Exemplo n.º 12
0
def main():
    """Evaluate a saved agent over a fixed list of seeds and print the mean score.

    For every seed this seeds the RNGs, starts six environment workers, loads
    the agent checkpoint and plays ``wandb.config.episodes`` evaluation rounds.
    A round ends once every worker has finished at least one episode; the
    largest of the workers' first-episode returns is passed to ``evaluate``.
    The last value returned by ``evaluate`` for each seed is collected, and the
    mean over seeds is printed as the final score.

    ``wandb.config.episodes`` is expected to be at least 1: with zero rounds
    ``cumulative_avg_reward`` is never assigned for that seed.

    The environment workers are closed even when evaluation raises.
    """
    num_envs = 6  # parallel evaluation workers
    lstm_units = 256  # width of the recurrent state handed to agent.step

    cumulative_avg_rewards = []
    for seed_ in [10, 50, 100, 200, 500]:
        seed(seed_)
        set_random_seed(seed_)
        print("Seed: ", seed_)

        # NOTE(review): env_id is not used below; the call is kept in case
        # get_args() has side effects the rest of the run relies on - confirm.
        env_id = get_args().env

        env = SubprocVecEnv([make_env(seed_, i) for i in range(num_envs)])
        try:
            print("CHECK_ENV", env.reset().__array__().shape)
            agent = get_agent(env)
            save_path = os.path.join('models_entropy_coeff1',
                                     "Space_inv_A2C_LSTM_nstep8_MAX_rew_546")
            agent.load(save_path)
            # The recurrent state is created once per seed and carried across rounds.
            lstm_state = np.zeros((num_envs, lstm_units), dtype=np.float32)

            for i in range(wandb.config.episodes):
                episodic_reward = np.zeros(num_envs)  # return of the episode in progress
                episodic_reward_m = np.zeros(num_envs)  # return of each worker's first episode

                obs = env.reset()
                done1 = np.zeros(num_envs)  # workers that finished on the latest step
                done2 = np.zeros(num_envs)  # workers that have finished at least once
                while True:
                    a, v, lstm_state = agent.step(obs, S_=lstm_state, M_=done1)
                    obs, reward, done1, info = env.step(a, done1, cond="eval")
                    if done2.all():
                        episodic_reward_m1 = episodic_reward_m.max()
                        break
                    if done1.any():
                        # Record the return only the first time a worker finishes.
                        first_finish = np.logical_and(done2 <= 0, done1)
                        episodic_reward_m[first_finish] = episodic_reward[first_finish]
                        for j in np.nonzero(done1)[0]:
                            episodic_reward[j] = 0
                    episodic_reward += reward
                    done2 = np.logical_or(done1, done2)

                # Only the first round of a seed asks evaluate() to reset.
                cumulative_avg_reward = evaluate(episodic_reward_m1, i == 0)

            tf.reset_default_graph()
        finally:
            # Always reap the worker subprocesses, even if evaluation failed.
            env.close()

        # your models will be evaluated on 100-episode average reward
        # therefore, we stop logging after 100 episodes
        print("*************************************************************")
        print("CUMULATIVE_AVG_REWARD", cumulative_avg_reward)
        print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
        cumulative_avg_rewards.append(cumulative_avg_reward)

    print("Final score: ", np.mean(cumulative_avg_rewards))
Exemplo n.º 13
0
def train(model_dict):
    """Roll out a pretrained A2C policy and pickle the collected trajectories.

    Despite the name, no optimisation step is performed here: the agent's
    weights are loaded from a fixed checkpoint file and the policy is only
    executed. Every step's ``[action, state]`` pair is appended to a
    per-process trajectory; finished trajectories are moved into ``dataset``.
    Once more than 10000 steps of finished trajectories have been gathered the
    dataset is pickled to a fixed path and the function returns.

    Configuration keys read: ``num_frames``, ``cuda``, ``which_gpu``,
    ``num_steps``, ``num_processes``, ``seed``, ``env``, ``save_to``,
    ``num_stack``, ``algo``, ``save_interval``, ``log_interval``,
    ``save_params``, ``vid_``, ``gif_``, ``ls_``.
    Keys written back: ``dtype``, ``obs_shape``, ``shape_dim0``.

    Side effects: sets the ``OMP_NUM_THREADS`` and ``CUDA_VISIBLE_DEVICES``
    environment variables, creates ``<save_to>/monitor_rewards`` if missing,
    spawns ``num_processes`` environment workers and writes the pickle file.

    Raises:
        ValueError: if ``algo`` is not ``'a2c'``.
    """

    def update_current_state(current_state, state, channels):
        """Shift the frame stack by one frame (in place) and append ``state``."""
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # Slide every frame but the oldest towards the front of the stack ...
        current_state[:, :-channels] = current_state[:, channels:]
        # ... and store the newest observation in the last slot.
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        """Fold one step of rewards/dones into the running reward statistics.

        Also zeroes ``current_state`` for every process whose episode ended.
        """
        # reward, done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # cumulative reward of the episode in progress
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  # [P,1]
        final_rewards *= masks  # clear the slots of episodes that just finished
        final_rewards += (1 - masks) * episode_rewards  # record their total return
        episode_rewards *= masks  # restart accumulation for finished episodes
        masks = masks.type(dtype)  # move to the configured tensor type (cuda or cpu)
        if current_state.dim() == 4:  # image observations
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # zero the state of finished episodes
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    else:
        # Previously `agent` was simply left unbound and a NameError surfaced
        # further down; fail here with a message that names the cause.
        raise ValueError("unsupported algo {!r}; only 'a2c' is available".format(algo))

    # Load the pretrained policy weights.
    param_file = home + '/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/seed1/model_params3/model_params9999360.pt'

    param_dict = torch.load(param_file)
    agent.actor_critic.load_state_dict(param_dict)

    # Only move the network to the GPU when CUDA was requested; the rollout
    # tensors use `dtype`, which is a CPU type otherwise.
    if cuda:
        agent.actor_critic.cuda()
    print('loaded', param_file)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
    # storage has states: (num_steps + 1, num_processes, *obs_shape); set the first step
    agent.insert_first_state(current_state)

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # reward of the episode in progress
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    # Clamp to 1: a save_interval smaller than one update's worth of frames
    # would otherwise round to 0 and make the modulo below divide by zero.
    save_interval_num_updates = max(1, int(save_interval / num_processes / num_steps))

    # dataset: list of finished trajectories; each trajectory is a list of
    # [action, state] pairs. tmp_trajs holds the trajectory in progress for
    # every process.
    dataset = []
    tmp_trajs = [[] for _ in range(num_processes)]

    dataset_count = 0  # total number of steps stored in `dataset`

    done = [0] * num_processes

    # Begin rollouts
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            value, action, action_log_probs, dist_entropy = agent.act(
                Variable(agent.rollouts.states[step]))

            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            states_ = agent.rollouts.states[step].cpu().numpy()  # [P,S,84,84]
            actions_ = action.data.cpu().numpy()  # [P,1]

            # store step
            for proc in range(num_processes):
                tmp_trajs[proc].append([actions_[proc], states_[proc]])

                # NOTE(review): `done` comes from the previous env step, so the
                # pair appended just above appears to be the first step after
                # the reset - confirm it is meant to close this trajectory.
                if done[proc]:
                    dataset.append(tmp_trajs[proc])
                    dataset_count += len(tmp_trajs[proc])
                    tmp_trajs[proc] = []

                    for finished_traj in dataset:
                        print(len(finished_traj))

            if dataset_count > 10000:
                # Enough data gathered: write it out and finish. The file is
                # opened in a `with` block so it is flushed and closed
                # deterministically.
                with open(home + '/Documents/tmp/RoadRunner/trajectories_10000.pkl',
                          "wb") as dataset_file:
                    pickle.dump(dataset, dataset_file)

                print('saved')
                # Stop cleanly; the original halted here by evaluating an
                # undefined name, which raised NameError.
                return

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy)

        # No agent.update() here: the loaded policy is only rolled out. Seed
        # the next rollout with the last state seen.
        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:
            # Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            # make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            # make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2)
            print(to_print_info_string)
            start2 = time.time()

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval * 30) == 0:
                # Plotting errors propagate to the caller. The previous
                # try/except only re-raised, so dropping it changes nothing.
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)
                    update_ls_plot(model_dict)
                make_plots(model_dict)
                print(to_print_legend_string + " Plot updated")

    # The final plot refresh is best-effort: a plotting failure is reported
    # instead of raised. Only Exception is caught so that KeyboardInterrupt /
    # SystemExit still get through.
    try:
        make_plots(model_dict)
    except Exception as err:
        print('Final plot update failed:', repr(err))
Exemplo n.º 14
0
def train(model_dict):
    """Distil a pre-trained expert A2C policy into a fresh imitator policy.

    The expert agent acts in ``num_processes`` parallel environments. After
    the expert picks its action at each environment step, the imitator takes
    one Adam step minimising the batch mean of
    ``sum(exp(log_p_expert) * (log_p_expert - log_p_imitator), dim=1)`` on the
    current stacked-frame batch. That is KL(expert || imitator) provided
    ``action_logdist`` returns log-probabilities (TODO confirm). The expert
    itself is not optimised: ``agent.no_update()`` is called where
    ``agent.update()`` used to be.

    Args:
        model_dict: Configuration dict. Keys read here: 'num_frames', 'cuda',
            'which_gpu', 'num_steps', 'num_processes', 'seed', 'env',
            'save_to', 'num_stack', 'algo', 'save_interval', 'log_interval',
            'save_params', 'vid_', 'gif_', 'ls_'. Keys written here: 'dtype',
            'obs_shape', 'shape_dim0'.

    Side effects visible in this function: sets the ``OMP_NUM_THREADS`` and
    ``CUDA_VISIBLE_DEVICES`` environment variables, creates
    ``<save_to>/monitor_rewards`` if missing, loads the expert checkpoint from
    a hard-coded path under ``home``, periodically overwrites a hard-coded
    imitator checkpoint file, and prints progress lines.
    """

    def update_current_state(current_state, state, channels):
        """Push the newest frame(s) into the frame stack, in place.

        The oldest ``channels`` planes along dim 1 are dropped, the remaining
        ones slide towards index 0, and ``state`` (a numpy array) is written
        into the last ``channels`` planes. Returns the same tensor object.
        """
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        # first stack*channel-channel frames = last stack*channel-channel, so slide them forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state  # last frame is now the new one

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        """Accumulate rewards and reset bookkeeping for finished episodes.

        All tensor arguments are modified in place. ``masks`` is 0.0 for
        processes whose ``done`` flag is truthy and 1.0 otherwise; it is
        returned after being cast to the enclosing ``dtype``.
        """
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # set it to the cumulative episode reward
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # `dtype` is resolved from train()'s scope (CPU or CUDA float tensor type)
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    # ---- Unpack configuration -------------------------------------------
    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']  # only referenced by the commented-out agent selection below
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']  # only referenced by commented-out code below
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    # Seed the RNG and pick the tensor type used for states and masks.
    # Note that the CUDA branch seeds only the CUDA generator.
    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    # Optional auxiliary environments; each name exists only when its flag is truthy.
    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print ('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    # Publish the derived shapes through model_dict (it is passed to a2c() below).
    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0



    # Earlier agent selection / checkpoint loading, kept for reference:
    # # Create agent
    # if algo == 'a2c':
    #     agent = a2c(envs, model_dict)
    #     print ('init a2c agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')

    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if model_dict['load_params']:
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     # agent.actor_critic = torch.load(args.load_path).cuda()
        
    #     # print ('loaded ', args.load_path)

    #     if model_dict['load_number'] == 3:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict)

    #     elif model_dict['load_number'] == 6:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict)
    #     elif model_dict['load_number'] == 9:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict)

    #     # else:
    #     #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
    #     else:
    #         PROBLEM


    # ---- Expert: A2C agent with weights restored from a fixed checkpoint ----
    # `home` is not defined in this function; presumably a module-level path prefix.
    print ('Init expert agent')
    expert_agent = a2c(envs, model_dict)
    param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt'    
    param_dict = torch.load(param_file)
    expert_agent.actor_critic.load_state_dict(param_dict)
    print ('loaded params', param_file)
    # NOTE(review): .cuda() is called regardless of the `cuda` flag read above.
    expert_agent.actor_critic.cuda()



    # ---- Imitator: same agent class, no checkpoint loaded ----
    print ('Init imitator agent')
    imitator_agent = a2c(envs, model_dict)
    # param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params.ckpt'  
    # param_dict = torch.load(param_file)
    # imitator_agent.actor_critic.load_state_dict(param_dict)
    # print ('loaded params', param_file)
    # NOTE(review): unconditional .cuda(), same as for the expert.
    imitator_agent.actor_critic.cuda()


    # The expert drives the rollouts; only the imitator's parameters are
    # handed to the optimizer.
    agent = expert_agent
    expert_policy = expert_agent.actor_critic

    imitator_policy = imitator_agent.actor_critic
    optimizer = optim.Adam(imitator_policy.parameters(), lr=.0005, weight_decay=.00001)

    total_steps = 0  # only referenced by commented-out logging below

    display_step = 50  # only referenced by commented-out logging below


    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove oldest
    agent.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    # NOTE(review): this is 0 whenever save_interval < num_processes*num_steps,
    # in which case `j % save_interval_num_updates` below raises ZeroDivisionError.
    save_interval_num_updates = int(save_interval /num_processes/num_steps)


    # Begin rollouts + imitation training
    # count =0
    start = time.time()   # start of the whole run (used for FPS / total time)
    start2 = time.time()  # time of the previous log line
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            # Stored states are divided by 255 before being fed to the policies
            # (presumably raw pixel intensities -- TODO confirm).
            state__ = Variable(agent.rollouts.states[step]) / 255.
            value, action, action_log_probs, dist_entropy = agent.act(state__) #, requires_grad=False)#, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())


            # ---- One imitation step on the batch the expert just acted on ----
            batch = state__

            optimizer.zero_grad()

            log_dist_expert = expert_policy.action_logdist(batch)
            log_dist_imitator = imitator_policy.action_logdist(batch)

            # sum_a p_expert(a) * (log p_expert(a) - log p_imitator(a)) per batch row
            action_dist_kl = torch.sum((log_dist_expert - log_dist_imitator)*torch.exp(log_dist_expert), dim=1) #[B]

            # elbo, logpx, logpz, logqz, action_dist_kl = self.forward(batch, policy, k=k)
            loss = torch.mean(action_dist_kl)

            loss.backward()
            # nn.utils.clip_grad_norm(self.parameters(), .5)
            optimizer.step()  # updates imitator parameters only (see optimizer construction)

            # if total_steps%display_step==0: # and batch_idx == 0:
            #     # print ('Train Epoch: {}/{}'.format(epoch+1, epochs),
            #         # 'total_epochs {}'.format(total_epochs),
            #         print('LL:{:.4f}'.format(loss.data[0])
            #         # 'logpx:{:.4f}'.format(logpx.data[0]),
            #         # 'logpz:{:.5f}'.format(logpz.data[0]),
            #         # 'logqz:{:.5f}'.format(logqz.data[0]),
            #         # 'action_kl:{:.4f}'.format(action_dist_kl.data[0])
            #         )

            # total_steps+=1


            # The environments are stepped with the expert's action.
            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done)


        # The expert is not optimised: no_update() is called where update() used to be.
        agent.no_update()  #agent.update(j,num_updates)
        # agent.update()  #agent.update(j,num_updates)

        # Carry the last stored state over as the first state of the next rollout.
        agent.insert_first_state(agent.rollouts.states[-1])


        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps
        
        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            # The imitator checkpoint goes to a fixed path (not under save_dir)
            # and is overwritten on every save.
            save_to = home+'/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params_env.ckpt'
            torch.save(imitator_policy.state_dict(), save_to)
            print ('saved imitator_policy', save_to)

            # #Save model
            # if save_params:
            #     do_params(save_dir, agent, total_num_steps, model_dict)
            #     # save_params_v2(save_dir, agent, total_num_steps, model_dict)

                
            # make video (note: `agent` is the expert here)
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            # make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)


        # Print updates
        if j % log_interval == 0:# and j!=0:
            end = time.time()

            # Fields: update idx, total env steps, min/median/mean/max of
            # final_rewards, steps per second, seconds since start, seconds
            # since previous log line, imitation loss of the last step of
            # this update.
            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}, {:.4f}".format(j, total_num_steps,
                                       final_rewards.min(),
                                       final_rewards.median(),
                                       final_rewards.mean(),
                                       final_rewards.max(),
                                       int(total_num_steps / (end - start)),
                                       end - start,
                                       end - start2, 
                                       loss.data[0])


            # to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps,
            #                            final_rewards.min(),
            #                            final_rewards.median(),
            #                            final_rewards.mean(),
            #                            final_rewards.max(),
            #                            int(total_num_steps / (end - start)),
            #                            end - start,
            #                            end - start2)


            print(to_print_info_string) 
            start2 = time.time()


            # NOTE(review): the legend names fewer columns than the info line prints.
            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval*30) == 0:
            
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                # update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    # Any plotting error is re-raised, so the print below is unreachable.
                    raise #pass
                    print(to_print_legend_string)


    # Final plot refresh is best-effort: any exception is swallowed by the bare
    # except and only an empty line is printed.
    try:
        make_plots(model_dict)
    except:
        print ()
Exemplo n.º 15
0
def main():
    """Train a ``CNNPolicy`` actor-critic with A2C on parallel environments.

    Every hyper-parameter and path used here (``env_name``, ``seed``,
    ``log_dir``, ``num_processes``, ``num_stack``, ``cuda``, ``lr``, ``eps``,
    ``alpha``, ``num_steps``, ``num_updates``, ``gamma``, ``tau``,
    ``value_loss_coef``, ``entropy_coef``, ``max_grad_norm``,
    ``log_interval``, ``save_url``) is read from names defined outside this
    function.

    Each update collects ``num_steps`` transitions from every process,
    computes returns, takes one RMSprop step on
    ``value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef``
    and, every ``log_interval`` updates, prints reward and loss statistics.
    After the last update the whole ``actor_critic`` object is saved to
    ``save_url`` with ``torch.save``.
    """
    envs = [make_env(env_name, seed, rank, log_dir) for rank in range(num_processes)]
    envs = SubprocVecEnv(envs)
    obs_shape = envs.observation_space.shape
    obs_shape = [obs_shape[0] * num_stack, *obs_shape[1:]]  # [channels*stack, ...]
    actor_critic = CNNPolicy(obs_shape[0], envs.action_space, False)
    if cuda:
        actor_critic.cuda()
    optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    rollouts = RolloutStorage(num_steps, num_processes, obs_shape, envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(num_processes, *obs_shape)

    def update_current_obs(obs):
        """Write the newest observation into the frame stack ``current_obs``."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if num_stack > 1:
            # Slide the older frames towards index 0 to make room at the end.
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        # Always store the newest frame, also when there is no stacking.
        current_obs[:, -shape_dim0:] = obs

    # The reset belongs to main(), not to the helper: `obs` must exist before
    # the first update_current_obs call.
    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # Running per-process episode return and the return of the last finished episode.
    episode_rewards = torch.zeros([num_processes, 1])
    final_rewards = torch.zeros([num_processes, 1])
    if cuda:
        rollouts.cuda()
        current_obs = current_obs.cuda()
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            # squeeze(1) drops only the trailing singleton action dim, so the
            # process dim survives even when num_processes == 1.
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            # stack: make sure that reward is a numpy array (convert list to ndarray)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            # Update obs and rollouts
            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)

        # Bootstrap value for the state following the last collected step.
        next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                                  Variable(rollouts.states[-1], volatile=True),
                                  Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, False, gamma, tau)

        # In a2c the values are calculated twice. The rollout tensors are
        # [num_steps, num_processes, x] and are flattened to [num, x] for
        # actor_critic.
        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
            Variable(rollouts.masks[:-1].view(-1, 1)),
            Variable(rollouts.actions.view(-1, action_shape)))

        # Compute the loss
        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        # Advantages are re-wrapped from .data so the policy term does not
        # back-propagate into the value estimate.
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        # Update model
        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        rollouts.after_update()
        if j % log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * num_processes * num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
            format(j, total_num_steps,
                    int(total_num_steps / (end - start)),
                    final_rewards.mean(),
                    final_rewards.median(),
                    final_rewards.min(),
                    final_rewards.max(), dist_entropy.data[0],
                    value_loss.data[0], action_loss.data[0]))
    # todo: test save_url
    torch.save(actor_critic, save_url)
Exemplo n.º 16
0
def viz(model_dict):
    """Load a saved policy and dump per-step diagnostic plots to disk.

    A single environment is stepped with the loaded agent. Before every
    environment step the agent is queried 100 times on the same stored state;
    the current frame and a histogram of the 100 sampled values are drawn
    side by side and saved as ``<frame_path>plt<count>.png``. The environment
    is then advanced with the last of the 100 sampled actions. No optimisation
    is performed (the ``agent.update()`` call is commented out).

    Args:
        model_dict: Configuration dict. Keys read here: 'num_frames', 'cuda',
            'which_gpu', 'num_steps', 'num_processes', 'seed', 'env',
            'save_to', 'num_stack', 'algo', 'save_interval', 'log_interval'.
            Keys written here: 'num_processes' (forced to 1), 'obs_shape'.

    NOTE(review): ``epoch_level``, ``frame_path`` and ``max_frames`` are used
    but not defined in this function; presumably they are module-level names.
    The loop is terminated by evaluating the undefined name ``ffsdfa`` (see
    below), which raises ``NameError``.
    """
    def update_current_state(current_state, state, channels):
        """Push the newest frame(s) into the frame stack, in place, and return it."""
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(
            state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        # first stack*channel-channel frames = last stack*channel-channel, so slide them forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  # last frame is now the new one

        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        """Accumulate rewards and reset bookkeeping for finished episodes.

        All tensor arguments are modified in place. ``masks`` is 0.0 where
        ``done`` is truthy and 1.0 otherwise, cast to the enclosing ``dtype``.
        """
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                 1)).float()  # [P,1]
        episode_rewards += reward  # keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (
            1 -
            masks) * episode_rewards  # set it to the cumulative episode reward
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # `dtype` is resolved from viz()'s scope
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    def do_vid():
        """Record ``n_vids`` episodes in ``envs_video`` and rename the mp4 files.

        NOTE(review): this helper is not called anywhere in viz(). It reads
        ``envs_video``, ``agent``, ``obs_shape``, ``shape_dim0``, ``dtype``,
        ``save_dir``, ``env_name``, ``algo`` and ``total_num_steps`` from the
        enclosing scope; ``envs_video`` is only created when ``vid_`` is
        truthy, and ``vid_`` is hard-coded to 0 below.
        """
        n_vids = 3
        for i in range(n_vids):
            done = False
            state = envs_video.reset()
            # state = torch.from_numpy(state).float().type(dtype)
            current_state = torch.zeros(1, *obs_shape)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0).type(dtype)
            # print ('Recording')
            # count=0
            while not done:
                # print (count)
                # count +=1
                # Act
                state_var = Variable(current_state, volatile=True)
                # print (state_var.size())
                action, value = agent.act(state_var)
                cpu_actions = action.data.squeeze(1).cpu().numpy()

                # Observe reward and next state
                state, reward, done, info = envs_video.step(
                    cpu_actions)  # state:[nProcesss, ndims, height, width]
                # state = torch.from_numpy(state).float().type(dtype)
                # current_state = torch.zeros(1, *obs_shape)
                current_state = update_current_state(current_state, state,
                                                     shape_dim0).type(dtype)
        state = envs_video.reset()

        # Rename every '*openaigym*.mp4' in <save_dir>/videos/ to
        # '<env>_<algo>_vid_t<steps>_<n>.mp4' and delete the '.json' files.
        vid_path = save_dir + '/videos/'
        count = 0
        for aaa in os.listdir(vid_path):

            if 'openaigym' in aaa and '.mp4' in aaa:
                #os.rename(vid_path+aaa, vid_path+'vid_t'+str(total_num_steps)+'.mp4')
                # NOTE(review): the shell command is built by plain string
                # concatenation and run with shell=True.
                subprocess.call("(cd " + vid_path + " && mv " + vid_path +
                                aaa + " " + vid_path + env_name + '_' + algo +
                                '_vid_t' + str(total_num_steps) + '_' +
                                str(count) + ".mp4)",
                                shell=True)
                count += 1
            if '.json' in aaa:
                os.remove(vid_path + aaa)

    # ---- Unpack configuration -------------------------------------------
    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    # Visualisation always runs a single environment, overriding the config.
    num_processes = 1
    model_dict['num_processes'] = 1

    # Seed the RNG and pick the tensor type; the CUDA branch seeds only the
    # CUDA generator.
    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor

    # Create environments
    print(num_processes, 'processes')
    # monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    # if not os.path.exists(monitor_rewards_dir):
    #     os.makedirs(monitor_rewards_dir)
    #     print ('Made dir', monitor_rewards_dir)

    # An empty string is passed to make_env as the monitor directory.
    monitor_rewards_dir = ''
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    # Hard-coded switches: video recording off; `see_frames` is only
    # referenced in commented-out code below.
    vid_ = 0
    see_frames = 1

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]
                 )  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    model_dict['obs_shape'] = obs_shape

    # Create agent
    # NOTE(review): there is no else branch, so for any other `algo` value
    # `agent` stays undefined and the checkpoint load below raises NameError.
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')
    # agent = model_dict['agent'](envs, model_dict)

    #Load model
    # if args.load_path != '':
    # agent.actor_critic = torch.load(os.path.join(args.load_path))

    # Replace the agent's network with the whole object stored in the
    # checkpoint file (torch.load of a full object, not a state_dict).
    # NOTE(review): .cuda() is applied regardless of the `cuda` flag.
    # epoch_level = 1e6
    model_params_file = save_dir + '/model_params/model_params' + str(
        int(epoch_level)) + '.pt'
    agent.actor_critic = torch.load(model_params_file).cuda()
    print('loaded ', model_params_file)
    # fafdas

    # frame_path = save_dir+'/frames/'
    if not os.path.exists(frame_path):
        os.makedirs(frame_path)
        print('Made dir', frame_path)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(
        num_processes,
        *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state,
        shape_dim0).type(dtype)  # add the new frame, remove oldest
    agent.insert_first_state(
        current_state
    )  # storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_processes, 1])  # keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    # Begin rollout + plotting (no training happens in this function)
    count = 0  # number of plots saved so far
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # if see_frames:
            #Grayscale
            # save_frame(state, count)

            # #RGB
            # state = envs.render()
            # print(state.shape)
            # fdsafa

            # Query the agent 100 times on the same stored state and collect
            # the first value / action entry of each call. `actions` is
            # filled but only used by the commented-out print below.
            values = []
            actions = []
            for ii in range(100):
                # Act, [P,1], [P,1]
                action, value = agent.act(
                    Variable(agent.rollouts.states[step], volatile=True))
                val = value.data.cpu().numpy()[0][0]
                act_ = action.data.cpu().numpy()[0][0]
                # print ('value', val)
                # print ('action', act_)
                values.append(val)
                actions.append(act_)

            # print ('values', values)
            # print ('actions', actions)

            # One figure per step: frame on the left, value histogram on the right.
            rows = 1
            cols = 2

            fig = plt.figure(figsize=(8, 4), facecolor='white')

            # plot frame
            ax = plt.subplot2grid((rows, cols), (0, 0), frameon=False)

            state1 = np.squeeze(state[0])  # latest observation of process 0
            ax.imshow(state1, cmap='gray')
            ax.set_xticks([])
            ax.set_yticks([])
            # ax.savefig(frame_path+'frame' +str(count)+'.png')
            # print ('saved',frame_path+'frame' +str(count)+'.png')
            # plt.close(fig)

            # plot values histogram
            ax = plt.subplot2grid((rows, cols), (0, 1), frameon=False)

            # Equal weights of 1/len(values), so the bars of samples that fall
            # inside the plotted range [0, 4] add up to at most 1.
            weights = np.ones_like(values) / float(len(values))
            ax.hist(values, 50, range=[0.0, 4.], weights=weights)
            # ax.set_ylim(top=1.)
            ax.set_ylim([0., 1.])

            plt_path = frame_path + 'plt'
            plt.savefig(plt_path + str(count) + '.png')
            print('saved', plt_path + str(count) + '.png')
            plt.close(fig)
            # fsadf

            count += 1
            # `done` is first assigned by envs.step() below, so it is only
            # read from the third step onwards.
            if count > 2:
                if done[0] or count > max_frames:
                    # NOTE(review): `ffsdfa` is an undefined name; evaluating
                    # it raises NameError, which is what ends the run here.
                    ffsdfa

                # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
                # print ('value', value)
                # print ('action', action)

                # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
                # print ('value', value)
                # print ('action', action)

            # `action` / `value` here are the last of the 100 samples drawn above.
            cpu_actions = action.data.squeeze(1).cpu().numpy()  #[P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)

            # Update state
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value.data,
                              reward, masks)

        # #Optimize agent
        # agent.update()  #agent.update(j,num_updates)
        # agent.insert_first_state(agent.rollouts.states[-1])

        # Only read by do_vid() through the closure; not used in the loop.
        total_num_steps = (j + 1) * num_processes * num_steps
Exemplo n.º 17
0
def viz(model_dict):
    """Roll out a saved policy in one environment and save a plot per step.

    For every environment step the agent is queried 100 times on the current
    rollout state. The resulting value estimates are drawn as a normalised
    histogram next to the current frame, and the figure is written to
    ``frame_path + 'plt' + str(count) + '.png'``. The function returns once
    more than two plots have been written and either the first environment
    reports ``done`` or ``count`` exceeds ``max_frames``.

    Args:
        model_dict: configuration dict. Keys read here: ``num_frames``,
            ``cuda``, ``which_gpu``, ``num_steps``, ``seed``, ``env``,
            ``save_to``, ``num_stack`` and ``algo``. ``num_processes`` is
            forced to 1 and ``obs_shape`` is added before the agent is built.

    Raises:
        ValueError: if ``model_dict['algo']`` is not ``'a2c'``, ``'ppo'`` or
            ``'a2c_minibatch'``.

    NOTE(review): ``epoch_level``, ``frame_path`` and ``max_frames`` are not
    defined inside this function -- presumably module-level globals; confirm.
    """

    def update_current_state(current_state, state, channels):
        """Push the newest observation onto the frame stack (in place).

        current_state: [processes, channels*stack, height, width]
        state: numpy array, (processes, channels, height, width)
        """
        state = torch.from_numpy(state).float()
        # Slide the older frames towards the front of the channel axis ...
        current_state[:, :-channels] = current_state[:, channels:]
        # ... and store the new observation in the last `channels` slots.
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        """Accumulate episode returns and zero the state of finished envs.

        reward, done: [P], [P]
        final_rewards, episode_rewards: [P,1], [P,1]
        current_state: [P,C*S,H,W]
        """
        reward = torch.from_numpy(
            np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # running return of the current episode
        masks = torch.FloatTensor(
            [[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # clear the entries of finished episodes
        final_rewards += (1 - masks) * episode_rewards  # store their return
        episode_rewards *= masks  # restart the running return
        masks = masks.type(dtype)  # same tensor type as the states
        if current_state.dim() == 4:  # state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # reset finished envs to an all-zero state
        return reward, masks, final_rewards, episode_rewards, current_state

    def do_vid():
        """Run three episodes in ``envs_video`` and rename the recorded files.

        NOTE(review): nothing in this function calls ``do_vid`` and
        ``envs_video`` only exists when ``vid_`` is truthy (it is hard-coded
        to 0 below), so this helper is currently dead code.
        """
        n_vids = 3
        for _ in range(n_vids):
            done = False
            state = envs_video.reset()
            current_state = torch.zeros(1, *obs_shape)
            current_state = update_current_state(
                current_state, state, shape_dim0).type(dtype)
            while not done:
                # Act
                state_var = Variable(current_state, volatile=True)
                action, value = agent.act(state_var)
                cpu_actions = action.data.squeeze(1).cpu().numpy()

                # Observe reward and next state
                state, reward, done, info = envs_video.step(cpu_actions)
                current_state = update_current_state(
                    current_state, state, shape_dim0).type(dtype)
        state = envs_video.reset()

        vid_path = save_dir + '/videos/'
        count = 0
        for aaa in os.listdir(vid_path):
            if 'openaigym' in aaa and '.mp4' in aaa:
                # Rename in-process instead of shelling out to `cd ... && mv`:
                # no shell quoting/injection issues, and a relative save_dir
                # is resolved against the current working directory.
                new_name = (vid_path + env_name + '_' + algo + '_vid_t' +
                            str(total_num_steps) + '_' + str(count) + '.mp4')
                os.replace(vid_path + aaa, new_name)
                count += 1
            if '.json' in aaa:
                os.remove(vid_path + aaa)

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    # Visualisation always runs a single environment, whatever was configured.
    num_processes = 1
    model_dict['num_processes'] = 1

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = ''
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    vid_ = 0  # video recording is switched off here
    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    model_dict['obs_shape'] = obs_shape

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')
    else:
        # Fail here with a clear message rather than with an unbound `agent`.
        raise ValueError('unsupported algo: {!r}'.format(algo))

    # Load model
    model_params_file = (save_dir + '/model_params/model_params' +
                         str(int(epoch_level)) + '.pt')
    actor_critic = torch.load(model_params_file)
    # Keep the network on the same device as `dtype` so that model and state
    # tensors match when `cuda` is False.
    actor_critic = actor_critic.cuda() if cuda else actor_critic.cpu()
    agent.actor_critic = actor_critic
    print('loaded ', model_params_file)

    if not os.path.exists(frame_path):
        os.makedirs(frame_path)
        print('Made dir', frame_path)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state, shape_dim0).type(dtype)  # add the new frame, remove oldest
    agent.insert_first_state(current_state)  # set the first step of the rollout storage

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    rows = 1
    cols = 2
    count = 0
    for j in range(num_updates):
        for step in range(num_steps):

            # Query the agent repeatedly on the same state; the action and
            # value of the last query are the ones used for the env step.
            values = []
            for _ in range(100):
                # Act, [P,1], [P,1]
                action, value = agent.act(
                    Variable(agent.rollouts.states[step], volatile=True))
                values.append(value.data.cpu().numpy()[0][0])

            fig = plt.figure(figsize=(8, 4), facecolor='white')

            # plot frame
            ax = plt.subplot2grid((rows, cols), (0, 0), frameon=False)
            state1 = np.squeeze(state[0])
            ax.imshow(state1, cmap='gray')
            ax.set_xticks([])
            ax.set_yticks([])

            # plot values histogram; the weights make the bar heights sum to 1
            ax = plt.subplot2grid((rows, cols), (0, 1), frameon=False)
            weights = np.ones_like(values) / float(len(values))
            ax.hist(values, 50, range=[0.0, 4.], weights=weights)
            ax.set_ylim([0., 1.])

            plt_path = frame_path + 'plt'
            plt.savefig(plt_path + str(count) + '.png')
            print('saved', plt_path + str(count) + '.png')
            plt.close(fig)

            count += 1
            # `done` is first assigned by envs.step() below, so it is only
            # inspected from the third plot onwards.
            if count > 2 and (done[0] or count > max_frames):
                return

            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)

            # Update state
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value.data,
                              reward, masks)

        # No optimisation step: the loaded policy is only being visualised.
        total_num_steps = (j + 1) * num_processes * num_steps
Exemplo n.º 18
0
def train(model_dict):
    """Fit an imitator policy to an expert while the expert plays.

    An expert ``a2c`` agent is restored from a hard-coded checkpoint and used
    to act in the environments. At every step the imitator's
    ``action_logdist`` output is pulled towards the expert's by minimising
    ``mean(sum((log_e - log_i) * exp(log_e), dim=1))`` with Adam, which is
    KL(expert || imitator) assuming ``action_logdist`` returns
    log-probabilities (TODO confirm). The expert itself is not optimised
    (``agent.no_update()``).

    Args:
        model_dict: configuration dict. Keys read here: ``num_frames``,
            ``cuda``, ``which_gpu``, ``num_steps``, ``num_processes``,
            ``seed``, ``env``, ``save_to``, ``num_stack``, ``save_interval``,
            ``log_interval``, ``vid_``, ``gif_`` and ``ls_``. The keys
            ``dtype``, ``obs_shape`` and ``shape_dim0`` are written back.

    NOTE(review): ``home`` and the helpers ``make_env``, ``do_vid``,
    ``do_gifs``, ``do_ls``, ``update_ls_plot`` and ``make_plots`` are not
    defined in this function -- presumably provided at module level.
    """

    def update_current_state(current_state, state, channels):
        """Push the newest observation onto the frame stack (in place).

        current_state: [processes, channels*stack, height, width]
        state: numpy array, (processes, channels, height, width)
        """
        state = torch.from_numpy(state).float()
        # Slide the older frames towards the front of the channel axis ...
        current_state[:, :-channels] = current_state[:, channels:]
        # ... and store the new observation in the last `channels` slots.
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        """Accumulate episode returns and zero the state of finished envs.

        reward, done: [P], [P]
        final_rewards, episode_rewards: [P,1], [P,1]
        current_state: [P,C*S,H,W]
        """
        reward = torch.from_numpy(
            np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # running return of the current episode
        masks = torch.FloatTensor(
            [[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # clear the entries of finished episodes
        final_rewards += (1 - masks) * episode_rewards  # store their return
        episode_rewards *= masks  # restart the running return
        masks = masks.type(dtype)  # same tensor type as the states
        if current_state.dim() == 4:  # state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # reset finished envs to an all-zero state
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    print('Init expert agent')
    expert_agent = a2c(envs, model_dict)
    param_file = home + '/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)
    expert_agent.actor_critic.load_state_dict(param_dict)
    print('loaded params', param_file)
    if cuda:
        # Only move to the GPU when asked to, so that the network lives on
        # the same device as the `dtype` state tensors.
        expert_agent.actor_critic.cuda()

    print('Init imitator agent')
    imitator_agent = a2c(envs, model_dict)
    if cuda:
        imitator_agent.actor_critic.cuda()

    # The expert drives the environments; only the imitator is optimised.
    agent = expert_agent
    expert_policy = expert_agent.actor_critic

    imitator_policy = imitator_agent.actor_critic
    optimizer = optim.Adam(imitator_policy.parameters(),
                           lr=.0005,
                           weight_decay=.00001)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state, shape_dim0).type(dtype)  # add the new frame, remove oldest
    agent.insert_first_state(current_state)  # set the first step of the rollout storage

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    # Clamp to 1: when save_interval < num_processes * num_steps the quotient
    # truncates to 0 and `j % 0` below would raise ZeroDivisionError.
    save_interval_num_updates = max(
        1, int(save_interval / num_processes / num_steps))

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            state__ = Variable(agent.rollouts.states[step]) / 255.
            value, action, action_log_probs, dist_entropy = agent.act(state__)

            batch = state__

            optimizer.zero_grad()

            log_dist_expert = expert_policy.action_logdist(batch)
            log_dist_imitator = imitator_policy.action_logdist(batch)

            # NOTE(review): the expert term is not detached, so backward()
            # also computes gradients for the expert; they are never applied
            # because the optimizer only holds the imitator's parameters.
            action_dist_kl = torch.sum((log_dist_expert - log_dist_imitator) *
                                       torch.exp(log_dist_expert),
                                       dim=1)  # [B]

            loss = torch.mean(action_dist_kl)

            loss.backward()
            optimizer.step()

            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy)

        # The expert is not trained here.
        agent.no_update()

        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            save_to = home + '/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params_env.ckpt'
            torch.save(imitator_policy.state_dict(), save_to)
            print('saved imitator_policy', save_to)

            # make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            # make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}, {:.4f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2, loss.data[0])

            print(to_print_info_string)
            start2 = time.time()

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval * 30) == 0:
                # Plotting errors propagate to the caller (the former bare
                # `except: raise` had the same effect).
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)
                    update_ls_plot(model_dict)
                make_plots(model_dict)
                print(to_print_legend_string + " Plot updated")

    # Final plot is best effort, but report why it failed and do not swallow
    # KeyboardInterrupt/SystemExit.
    try:
        make_plots(model_dict)
    except Exception as err:
        print('make_plots failed:', err)
Exemplo n.º 19
0
def train(model_dict):
    """Train an agent and log the V/Q losses returned by ``agent.update2``.

    At every rollout step the chosen action is one-hot encoded and passed to
    ``agent.actor_critic.get_V_and_Q`` together with the current state; the
    collected ``V`` and ``Q`` outputs are handed to ``agent.update2`` after
    each rollout. Every ``log_interval * 30`` updates, once more than 5000
    environment steps have been taken, a row
    ``[total_num_steps, V_loss, Q_loss]`` is appended to
    ``<save_to>/V_and_Q_errors/error_monitor.csv``.

    Args:
        model_dict: configuration dict. Keys read here: ``num_frames``,
            ``cuda``, ``which_gpu``, ``num_steps``, ``num_processes``,
            ``seed``, ``env``, ``save_to``, ``num_stack``, ``algo``,
            ``save_interval``, ``log_interval``, ``save_params``, ``vid_``,
            ``gif_``, ``ls_``, ``load_params`` and, when ``load_params`` is
            truthy, ``load_number``. The keys ``dtype``, ``obs_shape`` and
            ``shape_dim0`` are written back.

    Raises:
        ValueError: if ``algo`` is not a supported agent name, or if
            ``load_params`` is set and ``load_number`` is not 3, 6 or 9.

    NOTE(review): ``home`` and the helpers ``make_env``, ``load_params_v2``,
    ``do_params``, ``save_params_v2``, ``do_vid``, ``do_gifs``, ``do_ls``,
    ``update_ls_plot``, ``update_error_plot`` and ``make_plots`` are not
    defined in this function -- presumably provided at module level.
    """

    def update_current_state(current_state, state, channels):
        """Push the newest observation onto the frame stack (in place).

        current_state: [processes, channels*stack, height, width]
        state: numpy array, (processes, channels, height, width)
        """
        state = torch.from_numpy(state).float()
        # Slide the older frames towards the front of the channel axis ...
        current_state[:, :-channels] = current_state[:, channels:]
        # ... and store the new observation in the last `channels` slots.
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        """Accumulate episode returns and zero the state of finished envs.

        reward, done: [P], [P]
        final_rewards, episode_rewards: [P,1], [P,1]
        current_state: [P,C*S,H,W]
        """
        reward = torch.from_numpy(
            np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # running return of the current episode
        masks = torch.FloatTensor(
            [[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # clear the entries of finished episodes
        final_rewards += (1 - masks) * episode_rewards  # store their return
        episode_rewards *= masks  # restart the running return
        masks = masks.type(dtype)  # same tensor type as the states
        if current_state.dim() == 4:  # state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # reset finished envs to an all-zero state
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    action_size = envs.action_space.n

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')
    elif algo == 'a2c_list_rollout':
        agent = a2c_list_rollout(envs, model_dict)
        print('init a2c_list_rollout agent')
    elif algo == 'a2c_with_var':
        agent = a2c_with_var(envs, model_dict)
        print('init a2c_with_var agent')
    else:
        # Fail here with a clear message rather than with an unbound `agent`.
        raise ValueError('unsupported algo: {!r}'.format(algo))

    # Load model
    if model_dict['load_params']:
        # load_number -> step count passed to load_params_v2
        checkpoint_steps = {3: 3000160, 6: 6000160, 9: 9000160}
        load_number = model_dict['load_number']
        if load_number not in checkpoint_steps:
            raise ValueError(
                'unsupported load_number: {!r} (expected one of {})'.format(
                    load_number, sorted(checkpoint_steps)))
        load_params_v2(
            home +
            '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/',
            agent, checkpoint_steps[load_number], model_dict)

    ls_path = save_dir + '/V_and_Q_errors/'
    ls_file = ls_path + 'error_monitor.csv'

    if not os.path.exists(ls_path):
        os.makedirs(ls_path)
        print('Made dir', ls_path)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(
        current_state, state, shape_dim0).type(dtype)  # add the new frame, remove oldest
    agent.insert_first_state(current_state)  # set the first step of the rollout storage

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    # Clamp to 1: when save_interval < num_processes * num_steps the quotient
    # truncates to 0 and `j % 0` below would raise ZeroDivisionError.
    save_interval_num_updates = max(
        1, int(save_interval / num_processes / num_steps))

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        Vs = []
        Qs = []

        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            value, action, action_log_probs, dist_entropy = agent.act(
                Variable(agent.rollouts.states[step]))

            # One-hot encode the sampled actions: [P, action_size] on the CPU.
            one_hot_action = torch.zeros(num_processes, action_size)
            one_hot_action.scatter_(1, action.data.cpu(), 1)

            V, Q = agent.actor_critic.get_V_and_Q(
                Variable(agent.rollouts.states[step]), one_hot_action)
            Vs.append(V)
            Qs.append(Q)

            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy)

        # Optimize agent
        V_loss, Q_loss = agent.update2(Vs, Qs)

        V_loss = V_loss.data.cpu().numpy()[0]
        Q_loss = Q_loss.data.cpu().numpy()[0]

        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            # Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
                save_params_v2(save_dir, agent, total_num_steps, model_dict)
            # make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            # make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2)
            print(to_print_info_string)
            start2 = time.time()

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval * 30) == 0:
                record_errors = total_num_steps > 5000

                if record_errors:
                    # newline='' is what the csv module expects for files it
                    # writes; it avoids blank rows on some platforms.
                    with open(ls_file, 'a', newline='') as f:
                        writer = csv.writer(f)
                        writer.writerow([total_num_steps, V_loss, Q_loss])

                # Plotting errors propagate to the caller (the former bare
                # `except: raise` had the same effect).
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)
                    update_ls_plot(model_dict)
                make_plots(model_dict)

                if record_errors:
                    update_error_plot(model_dict)

                print(to_print_legend_string + " Plot updated")

    # Final plot is best effort, but report why it failed and do not swallow
    # KeyboardInterrupt/SystemExit.
    try:
        make_plots(model_dict)
    except Exception as err:
        print('make_plots failed:', err)
Exemplo n.º 20
0
def train(model_dict):
    """Train a replay-buffer (DQN-style) agent on a vectorised environment.

    Every loop iteration takes one epsilon-greedy action in each worker
    process, pushes the resulting transition into ``agent.replay_buffer`` and
    calls ``agent.update()`` once the buffer holds more than 100 entries.
    Checkpoints, videos/gifs, console logs and plots are produced at the
    intervals configured in ``model_dict``.

    Side effects: sets the ``OMP_NUM_THREADS`` and ``CUDA_VISIBLE_DEVICES``
    environment variables, seeds torch, and creates
    ``<save_to>/monitor_rewards`` if it does not exist.

    Args:
        model_dict: configuration mapping. Keys read here: 'num_frames',
            'cuda', 'which_gpu', 'num_steps', 'num_processes', 'seed', 'env',
            'save_to', 'num_stack', 'algo', 'save_interval', 'log_interval',
            'save_params', 'vid_', 'gif_', 'ls_', 'vae_', 'grad_var_'.
            Keys written here: 'dtype', 'obs_shape', 'shape_dim0',
            'action_size'.

    Raises:
        ValueError: if ``model_dict['algo']`` is neither 'a2c' nor 'dqn'.
        NotImplementedError: at the first checkpoint when 'vae_' is set,
            because this loop never builds a VAE.
    """

    def update_current_state(current_state, state, channels):
        """Slide the frame stack forward IN PLACE and append the newest frame.

        Per the original annotations, ``current_state`` is
        [processes, channels*stack, height, width] and ``state`` is the numpy
        observation [processes, channels, height, width]. The same (mutated)
        ``current_state`` tensor is returned.
        """
        state = torch.from_numpy(state).float()
        # Shift the stack: drop the oldest `channels` planes ...
        current_state[:, :-channels] = current_state[:, channels:]
        # ... and write the new frame into the last slot.
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        """Accumulate rewards and zero the state of finished episodes IN PLACE.

        ``reward`` / ``done`` are per-process sequences ([P]);
        ``final_rewards`` / ``episode_rewards`` are [P,1] tensors that are
        updated in place. Returns
        ``(reward [P,1], masks [P,1], final_rewards, episode_rewards,
        current_state)`` where ``masks`` is 0.0 for finished episodes and 1.0
        otherwise, converted to ``dtype``.
        """
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # running return of the current episode
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # clear the slots of episodes that just ended
        final_rewards += (1 - masks) * episode_rewards  # store their final return
        episode_rewards *= masks  # restart the accumulator for finished episodes
        masks = masks.type(dtype)  # move to the training tensor type (e.g. cuda)
        if current_state.dim() == 4:  # image observations: broadcast over [C,H,W]
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # finished episodes restart from a zero state
        return reward, masks, final_rewards, episode_rewards, current_state

    # ----- Configuration -----
    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    grad_var_ = model_dict['grad_var_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # ----- Environments -----
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir)
                          for i in range(num_processes)])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    # do_ls / do_grad_var below consume these environments; they used to be
    # commented out, so enabling either flag raised NameError at j == 0.
    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    if grad_var_:
        print('env for grad_var_')
        envs_grad_var = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels per frame

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0
    model_dict['action_size'] = envs.action_space.n
    print(envs.action_space.n, 'actions')

    # ----- Agent -----
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'dqn':
        agent = DQN(envs, model_dict)
        print('init DQN agent')
        print(agent.q_net)
    else:
        # Previously `agent` stayed unbound and the loop died with NameError.
        raise ValueError("Unsupported algo {!r}; expected 'a2c' or 'dqn'".format(algo))

    # ----- Initial state -----
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, H, W)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)

    # Per-process reward bookkeeping used for the console statistics.
    episode_rewards = torch.zeros([num_processes, 1])  # return of the running episode
    final_rewards = torch.zeros([num_processes, 1])  # return of the last finished episode

    num_updates = int(num_frames) // num_steps // num_processes
    # Clamp to 1 so that `j % save_interval_num_updates` cannot divide by zero
    # when save_interval is smaller than one update's worth of frames.
    save_interval_num_updates = max(1, int(save_interval / num_processes / num_steps))

    # Exponentially decaying exploration rate.
    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 50000

    def epsilon_by_frame(frame_idx):
        return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

    # ----- Training loop -----
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        # NOTE(review): the schedule is indexed by the update counter j, not
        # by the number of environment frames -- confirm this is intended.
        dqn_epsilon = epsilon_by_frame(j)

        state_pytorch = Variable(current_state)
        action = agent.act(state_pytorch, epsilon=dqn_epsilon)

        # Both helpers below mutate `current_state` in place and return the
        # very same tensor, so snapshot the pre-step state first. Without the
        # copy the buffer received one object as both state and next_state.
        prev_state = current_state.clone()

        # Apply to environment, S:[P,C,H,W], R:[P], D:[P]
        frame, reward, done, info = envs.step(action)

        # Record rewards and build the next stacked state.
        reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
            reward, done, final_rewards, episode_rewards, current_state)
        current_state = update_current_state(current_state, frame, shape_dim0)

        # Store a copy of the next state as well: `current_state` keeps being
        # overwritten in place on later iterations.
        agent.replay_buffer.push(prev_state, action, reward,
                                 current_state.clone(), done.astype(int))

        if len(agent.replay_buffer) > 100:
            agent.update()

        total_num_steps = (j + 1) * num_processes * num_steps

        # ----- Checkpoint -----
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:
            # Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            # Make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            # Make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)
            # VAE probability gif: this loop never builds a VAE or its env, so
            # fail with an explicit message instead of a NameError.
            if vae_:
                raise NotImplementedError(
                    "vae_ is enabled but this training loop does not create a VAE or envs_vae")

        # ----- Console log -----
        if j % log_interval == 0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.2f}, {:.5f}".format(
                j, total_num_steps,
                final_rewards.min(),
                final_rewards.median(),
                final_rewards.mean(),
                final_rewards.max(),
                int(total_num_steps / (end - start)),
                end - start,
                end - start2,
                dqn_epsilon,
                agent.loss.data.cpu().numpy()[0])
            print(to_print_info_string)

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, discrim_E"
            start2 = time.time()

            # ----- Diagnostics and plots (every 30 log intervals) -----
            if j % (log_interval * 30) == 0:

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)

                if grad_var_:
                    # writes to file
                    do_grad_var(envs_grad_var, agent, model_dict, total_num_steps,
                                update_current_state, update_rewards)

                # Plotting errors propagate: the old `except: raise` wrapper
                # had exactly this effect, with an unreachable print after it.
                if ls_:
                    update_ls_plot(model_dict)

                if grad_var_:
                    update_grad_plot(model_dict)
                    to_print_legend_string += ' grad_var_plot updated '

                make_plots(model_dict)
                print(to_print_legend_string + " Plot updated")

    # Final plot refresh is best-effort, but report why it failed instead of
    # swallowing every exception (including KeyboardInterrupt) silently.
    try:
        make_plots(model_dict)
    except Exception as err:
        print('Final make_plots failed:', err)
Exemplo n.º 21
0
def train(model_dict):
    """Train an exploit A2C agent, optionally with an explore agent and a VAE.

    ``agent_exploit`` is trained on the environment reward. When
    ``model_dict['explore_']`` is truthy a second A2C agent is trained on the
    negated, normalised VAE ELBO of the observed frames, and the VAE itself
    is trained on the frames stored in the exploit agent's rollouts. The
    first ``exploit_processes`` workers act with the exploit policy and the
    remaining ones with the explore policy.

    Side effects: sets the ``OMP_NUM_THREADS`` and ``CUDA_VISIBLE_DEVICES``
    environment variables, seeds torch, and creates
    ``<save_to>/monitor_rewards`` if it does not exist.

    Args:
        model_dict: configuration mapping. Keys read here: 'num_frames',
            'cuda', 'which_gpu', 'num_steps', 'num_processes', 'seed', 'env',
            'save_to', 'num_stack', 'algo', 'save_interval', 'log_interval',
            'save_params', 'vid_', 'gif_', 'ls_', 'vae_', 'explore_',
            'init_exploit_processes' and (only when that is not -1)
            'inc_exploiters_over'. Keys written here: 'dtype', 'obs_shape',
            'shape_dim0', 'next_state_pred_'.

    Raises:
        RuntimeError: if the mean normalised ELBO drops below -9000.
    """

    def update_current_state(current_state, state, channels):
        """Slide the frame stack forward IN PLACE and append the newest frame.

        Per the original annotations, ``current_state`` is
        [processes, channels*stack, height, width] and ``state`` is the numpy
        observation [processes, channels, height, width]. The same (mutated)
        ``current_state`` tensor is returned.
        """
        state = torch.from_numpy(state).float()
        # Shift the stack: drop the oldest `channels` planes ...
        current_state[:, :-channels] = current_state[:, channels:]
        # ... and write the new frame into the last slot.
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards,
                       current_state):
        """Accumulate rewards and zero the state of finished episodes IN PLACE.

        ``reward`` / ``done`` are per-process sequences ([P]);
        ``final_rewards`` / ``episode_rewards`` are [P,1] tensors that are
        updated in place. Returns
        ``(reward [P,1], masks [P,1], final_rewards, episode_rewards,
        current_state)`` where ``masks`` is 0.0 for finished episodes and 1.0
        otherwise, converted to ``dtype``.
        """
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # running return of the current episode
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])  # [P,1]
        final_rewards *= masks  # clear the slots of episodes that just ended
        final_rewards += (1 - masks) * episode_rewards  # store their final return
        episode_rewards *= masks  # restart the accumulator for finished episodes
        masks = masks.type(dtype)  # move to the training tensor type (e.g. cuda)
        if current_state.dim() == 4:  # image observations: broadcast over [C,H,W]
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # finished episodes restart from a zero state
        return reward, masks, final_rewards, episode_rewards, current_state

    # ----- Configuration -----
    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    explore_ = model_dict['explore_']

    # The ls_ and vae_ diagnostics consume `agent_explore` and `vae`, which
    # only exist when exploration is enabled; without it they are skipped
    # rather than failing on an undefined name.
    run_ls = ls_ and explore_
    run_vae_gif = vae_ and explore_

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # ----- Environments -----
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([
        make_env(env_name, seed, i, monitor_rewards_dir)
        for i in range(num_processes)
    ])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    if run_ls:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    if run_vae_gif:
        print('env for vae')
        envs_vae = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels per frame

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0
    model_dict['next_state_pred_'] = 0

    # ----- Agents -----
    print('init exploit a2c agent')
    agent_exploit = a2c(envs, model_dict)

    if explore_:
        print('init explore a2c agent')
        agent_explore = a2c(envs, model_dict)
        print('init vae')
        vae = VAE()
        if cuda:  # honour the 'cuda' flag instead of always moving to GPU
            vae.cuda()

    # ----- Initial state -----
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, H, W)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)

    # Rollout storage holds (num_steps + 1, num_processes, *obs_shape); seed slot 0.
    agent_exploit.insert_first_state(current_state)
    if explore_:
        agent_explore.insert_first_state(current_state)

    # Per-process reward bookkeeping used for the console statistics.
    episode_rewards = torch.zeros([num_processes, 1])  # return of the running episode
    final_rewards = torch.zeros([num_processes, 1])  # return of the last finished episode

    num_updates = int(num_frames) // num_steps // num_processes
    # Clamp to 1 so that `j % save_interval_num_updates` cannot divide by zero
    # when save_interval is smaller than one update's worth of frames.
    save_interval_num_updates = max(1, int(save_interval / num_processes / num_steps))

    # Running statistics used to normalise the ELBO (exponential moving
    # averages with decay B). Kept on the same tensor type as the ELBO.
    B = .99
    m = torch.FloatTensor([-100.]).type(dtype)
    v = torch.FloatTensor([10000.]).type(dtype)

    # Number of workers driven by the exploit policy; -1 means "all of them".
    if model_dict['init_exploit_processes'] == -1:
        init_exploit_processes = num_processes
    else:
        init_exploit_processes = model_dict['init_exploit_processes']
    exploit_processes = init_exploit_processes

    # ----- Training loop -----
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        for step in range(num_steps):

            state_pytorch = Variable(agent_exploit.rollouts.states[step])  # [P,S,84,84]

            u_value, u_action, u_action_log_probs, u_dist_entropy = agent_exploit.act(
                state_pytorch)
            u_cpu_actions = u_action.data.squeeze(1).cpu().numpy()  # [P]

            if explore_:
                r_value, r_action, r_action_log_probs, r_dist_entropy = agent_explore.act(
                    state_pytorch)
                r_cpu_actions = r_action.data.squeeze(1).cpu().numpy()  # [P]
                # First `exploit_processes` workers exploit, the rest explore.
                cpu_actions = np.concatenate(
                    (u_cpu_actions[:exploit_processes],
                     r_cpu_actions[exploit_processes:]), 0)  # [P]
            else:
                # No explore agent exists: every worker follows the exploit
                # policy (previously this path hit an undefined r_cpu_actions).
                cpu_actions = u_cpu_actions

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state,
                                                 shape_dim0)

            # Insert data for exploit agent
            agent_exploit.insert_data(step, current_state, u_action.data,
                                      u_value, reward, masks,
                                      u_action_log_probs, u_dist_entropy, 0)

            if explore_:
                # Explore reward: negated, normalised ELBO of the frame the
                # agents just acted on (last plane of the stack).
                batch = state_pytorch[:, -1].contiguous()  # [P,84,84]
                elbo = vae.forward2(batch, k=10)  # [P]
                elbo = elbo.view(-1, 1).data  # [P,1]
                elbo = (elbo - m) / torch.sqrt(v)
                elbo = torch.clamp(elbo, max=.01)
                agent_explore.insert_data(step, current_state, r_action.data,
                                          r_value, -elbo, masks,
                                          r_action_log_probs, r_dist_entropy,
                                          0)

                # Update m and v.
                # NOTE(review): the running stats are fed the already
                # normalised/clamped ELBO -- confirm this is intended.
                m = B * m + (1. - B) * elbo.mean()
                v = B * v + (1. - B) * elbo.pow(2).mean()

                if elbo.mean() < -9000.:
                    print(elbo)
                    print(reward)
                    print(elbo.mean())
                    print(elbo.pow(2).mean())
                    # Was a bare undefined name used as a debugging abort.
                    raise RuntimeError(
                        'Normalised ELBO diverged (mean below -9000)')

        # ----- Optimise -----
        agent_exploit.update()
        if explore_:
            agent_explore.update()

            # Optimise the VAE on the frames collected during this rollout.
            # NOTE(review): this takes stack index 0 whereas the per-step ELBO
            # above uses index -1 -- confirm which frame is intended.
            batch = agent_exploit.rollouts.states
            batch = batch[1:]  # [Steps,Processes,Stack,84,84]
            batch = batch[:, :, 0]  # [Steps,Processes,84,84]
            batch = batch.contiguous().view(-1, 84, 84)  # [Steps*Processes,84,84]
            vae_elbo = vae.update(batch)

        # Carry the last state over as the first state of the next rollout.
        agent_exploit.insert_first_state(agent_exploit.rollouts.states[-1])
        if explore_:
            agent_explore.insert_first_state(agent_explore.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        # Linearly move workers from the explore to the exploit policy.
        if model_dict['init_exploit_processes'] != -1 and model_dict[
                'inc_exploiters_over'] != -1:
            frac_step = np.minimum((total_num_steps + 1.) /
                                   float(model_dict['inc_exploiters_over']),
                                   1.)  # fraction of the schedule completed
            extra_exploiters = int((num_processes - init_exploit_processes) * frac_step)
            exploit_processes = np.minimum(
                init_exploit_processes + extra_exploiters + 1, num_processes)

        # ----- Checkpoint -----
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            # These helpers were handed an undefined `agent`; the exploit
            # agent is the policy being evaluated (do_gifs3 already uses it).
            # Save model
            if save_params:
                do_params(save_dir, agent_exploit, total_num_steps, model_dict)
            # Make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent_exploit, model_dict, total_num_steps)
            # Make gif
            if gif_:
                do_gifs(envs_gif, agent_exploit, model_dict,
                        update_current_state, update_rewards, total_num_steps)
            # Make vae prob gif
            if run_vae_gif:
                do_gifs3(envs_vae, agent_exploit, vae, model_dict,
                         update_current_state, update_rewards, total_num_steps)

        # ----- Console log -----
        if j % log_interval == 0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2)

            # The ELBO only exists when the VAE is being trained.
            if explore_:
                elbo_str = "{:.2f}".format(vae_elbo.data.cpu().numpy()[0])
            else:
                elbo_str = "n/a"

            print(to_print_info_string + ' ' + elbo_str + ' ' +
                  str(exploit_processes))

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, elbo, Exploit_Procs"

            start2 = time.time()

            # ----- Diagnostics and plots (every 30 log intervals) -----
            if j % (log_interval * 30) == 0:

                if run_ls:
                    do_ls_2(envs_ls, agent_explore, model_dict,
                            total_num_steps, update_current_state,
                            update_rewards, vae)
                    update_ls_plot_2(model_dict)
                    print('updated ls')

                # Plotting errors propagate: the old `except: raise` wrapper
                # had exactly this effect, with an unreachable print after it.
                make_plots(model_dict)
                print(to_print_legend_string + " Plot updated ")

    # Final plot refresh is best-effort, but report why it failed instead of
    # swallowing every exception (including KeyboardInterrupt) silently.
    try:
        make_plots(model_dict)
    except Exception as err:
        print('Final make_plots failed:', err)