Example #1
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0])
        reward_sum += reward
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
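This example (and several below) targets the pre-0.4 PyTorch API, where Variable(..., volatile=True) disabled autograd during evaluation. A minimal sketch of the same greedy evaluation step on a current PyTorch release, assuming the same model, state, hx and cx objects, would be:

import torch
import torch.nn.functional as F

with torch.no_grad():                       # replaces volatile=True Variables
    value, action_value, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
    prob = F.softmax(action_value, dim=-1)  # dim must now be given explicitly
    action = prob.max(1)[1]                 # greedy action, shape (1,)
state, reward, done, _ = env.step(action.item())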
Example #2
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.lstm_size)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    #actions=deque(maxlen=100)
    episode_length = 0

    currentPath = os.getcwd()
    File = open(currentPath + '/record.txt', 'a+')
    print("\n\n\n\n------------------------------\n\n\n\n\n")
    File.write("\n\n\n\n------------------------------\n\n\n\n\n")
    File.close()

    cnt = 0
    episode_number = 0

    while True:
        env.render()
        cnt = cnt + 1
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            hx = Variable(torch.zeros(1, args.lstm_size), volatile=True)
            cx = Variable(torch.zeros(1, args.lstm_size), volatile=True)
        else:
            hx = Variable(hx.data, volatile=True)
            cx = Variable(cx.data, volatile=True)

        #print(state)
        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0),
                                                 volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        #action=prob.max(1)[1].data.numpy()
        action = prob.multinomial().data

        #if(args.env_name=='Breakout-v3'):
        #    state,reward,done,_=env.step(1)
        #     reward_sum+=reward
        #state,reward,done,_ =env.step(action[0,0])
        state, reward, done, _ = env.step(action.numpy())
        done = done  #or episode_length >= args.max_episode_length
        if episode_length >= args.max_episode_length:
            done = True
            reward_sum -= 30
        reward_sum += reward

        #actions.append(action[0,0])
        #if actions.count(actions[0])==actions.maxlen:
        #    done=True
        #if reward != 0:
        #    print("ep %d : game finished, reward: %d" % (episode_number, reward) + ('' if reward == -1 else ' !!!!!!!!'))

        if done:
            hour = int(
                time.strftime("%H", time.gmtime(time.time() - start_time)))
            _min = int(
                time.strftime("%M", time.gmtime(time.time() - start_time)))

            print("Time {},episode reward {}, episode length {} ".format(
                hour * 60 + _min + args.starttime, reward_sum, episode_length))

            File = open(currentPath + '/record.txt', 'a+')
            File.write(
                "Time {},episode reward {}, episode length {} \n".format(
                    hour * 60 + _min + args.starttime, reward_sum,
                    episode_length))
            File.close()

            reward_sum = 0
            episode_length = 0
            #actions.clear()
            state = env.reset()

            torch.save(model.state_dict(), currentPath + '/A3C.t7')
            episode_number += 1
            time.sleep(60)

        state = torch.from_numpy(state)
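The no-argument prob.multinomial() call above also predates PyTorch 0.4; in current releases Tensor.multinomial requires num_samples, and torch.distributions offers an equivalent. A sketch, assuming prob is the (1, num_actions) softmax output from the loop above:

import torch

dist = torch.distributions.Categorical(prob)
action = dist.sample()                           # stochastic draw, shape (1,)
# or, equivalently: action = prob.multinomial(num_samples=1).squeeze(1)
state, reward, done, _ = env.step(action.item())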
Example #3
File: test.py Project: 404akhan/research
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    if not os.path.exists('models-a3c'):
        os.makedirs('models-a3c')
    path = 'models-a3c/model-{}.pth'.format(args.model_name)
    print('saving directory is', path)

    model = ActorCritic(env.action_space.n, args.num_atoms, args.gamma)
    model.eval()

    state = env.reset()
    state = np.concatenate([state] * 4, axis=0)
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    action_stat = [0] * model.num_outputs

    start_time = time.time()
    episode_length = 0

    for ep_counter in itertools.count(1):
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())

            torch.save(shared_model.state_dict(), path)
            print('saved model')

        atoms_logit, logit = model(Variable(state.unsqueeze(0), volatile=True))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()

        action_np = action[0, 0]
        action_stat[action_np] += 1

        state_new, reward, done, info = env.step(action_np)
        dead = is_dead(info)

        if args.testing:
            atoms_prob = F.softmax(atoms_logit)
            value = model.get_v(atoms_prob, batch=False)
            atoms_prob = atoms_prob.squeeze().data.numpy()

            print('episode', episode_length, 'normal action', action_np,
                  'lives', info['ale.lives'], 'value', value)
            env.render()

            if ep_counter % 100 == 0:
                plt.plot(model.z, atoms_prob)
                plt.title('average v is {}'.format(value))
                plt.show()
        state = np.append(state.numpy()[1:, :, :], state_new, axis=0)
        done = done or episode_length >= args.max_episode_length

        reward_sum += reward
        episode_length += 1

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            print("actions stats real {}".format(
                action_stat[:model.num_outputs]))

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            env.seed(args.seed + rank + (args.num_processes + 1) * ep_counter)
            state = np.concatenate([state] * 4, axis=0)
            action_stat = [0] * model.num_outputs
            if not args.testing: time.sleep(60)

        state = torch.from_numpy(state)
Example #4
                    help='how many training processes to use (default: 4)')
parser.add_argument('--num-steps', type=int, default=20, metavar='NS',
                    help='number of forward steps in A3C (default: 20)')
parser.add_argument('--max-episode-length', type=int, default=10000, metavar='M',
                    help='maximum length of an episode (default: 10000)')
parser.add_argument('--env-name', default='PongDeterministic-v3', metavar='ENV',
                    help='environment to train on (default: PongDeterministic-v3)')


if __name__ == '__main__':
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    env = create_atari_env(args.env_name)
    shared_model = ActorCritic(
        env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()

    processes = []

    p = mp.Process(target=test, args=(args.num_processes, args, shared_model))
    p.start()
    processes.append(p)

    for rank in range(0, args.num_processes):
        p = mp.Process(target=train, args=(rank, args, shared_model))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
Example #5
class Agent(mp.Process):

    def __init__(self, global_actor_critic, optimizer, input_dims, nb_actions, gamma, lr, name, global_ep_index,
                 env_id):
        super(Agent, self).__init__()
        self.local_actor_critic = ActorCritic(input_dims, nb_actions, gamma)
        self.global_actor_critic = global_actor_critic
        self.name = "w%02i" % name
        self.episode_index = global_ep_index
        self.env = gym.make(env_id)
        self.optimizer = optimizer

    def run(self):
        t_step = 1
        while self.episode_index.value < EPISODES:
            done = False
            observation = self.env.reset()
            score = 0
            self.local_actor_critic.clear_memory()
            while not done:
                action = self.local_actor_critic.choose_action(observation)
                observation_, reward, done, info = self.env.step(action)
                score += reward
                self.local_actor_critic.remember(observation, action, reward)
                if (t_step % T_MAX) == 0 or done:
                    loss = self.local_actor_critic.calc_loss(done)
                    self.optimizer.zero_grad()
                    loss.backward()
                    for local_param, global_param in zip(
                            self.local_actor_critic.parameters(),
                            self.global_actor_critic.parameters()):
                        global_param._grad = local_param.grad
                    self.optimizer.step()
                    self.local_actor_critic.load_state_dict(self.global_actor_critic.state_dict())
                    self.local_actor_critic.clear_memory()
                t_step += 1
                observation = observation_
            with self.episode_index.get_lock():
                self.episode_index.value += 1
            print(self.name, 'episode ', self.episode_index.value, 'reward %.1f' % score)
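Example #5 shows only the worker class; the sketch below is one way the workers might be launched. The environment id, input/action sizes and the plain Adam optimizer are assumptions (a shared-state optimizer such as a SharedAdam is more typical), and EPISODES / T_MAX are the module-level constants referenced in run():

import gym
import torch.multiprocessing as mp
import torch.optim as optim

if __name__ == '__main__':
    env_id = 'CartPole-v1'            # assumption: any discrete-action Gym env
    input_dims, n_actions = [4], 2    # assumption: CartPole observation/action sizes
    gamma, lr = 0.99, 1e-4

    global_actor_critic = ActorCritic(input_dims, n_actions, gamma)
    global_actor_critic.share_memory()   # parameters shared across worker processes
    optimizer = optim.Adam(global_actor_critic.parameters(), lr=lr)

    global_ep = mp.Value('i', 0)          # shared episode counter
    workers = [Agent(global_actor_critic, optimizer, input_dims, n_actions,
                     gamma, lr, name=i, global_ep_index=global_ep, env_id=env_id)
               for i in range(mp.cpu_count())]
    for w in workers:
        w.start()
    for w in workers:
        w.join()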
Example #6
def train(rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()
    avg_rew_win_size = 25
    avg_rew = 0
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    avg_rew_cnt = 0
    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward
            reward = max(min(reward, 1), -1)
            # a quick hack to prevent the agent from getting stuck
            actions.append(action[0, 0])
            if actions.count(actions[0]) == actions.maxlen:
                done = True
                with lock:
                    counter.value += 1

            if done:
                avg_rew = avg_rew + reward_sum
                if avg_rew_cnt % avg_rew_win_size == 0:
                    print(" avg. episode reward {}".format(avg_rew /
                                                           avg_rew_win_size))
                    avg_rew = 0
                print("Time {},  episode reward {}, episode length {}".format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length))
                episode_length = 0
                reward_sum = 0
                actions.clear()
                state = env.reset()
                avg_rew_cnt = avg_rew_cnt + 1

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
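Example #6 relies on an ensure_shared_grads helper that is not shown in the listing. The version below is the commonly used implementation from ikostrikov/pytorch-a3c, reproduced here as a reference sketch; it hands the worker's gradient tensors to the shared model once per update:

def ensure_shared_grads(model, shared_model):
    # Point the shared model's gradients at this worker's gradient tensors;
    # if the shared gradients are already populated, leave them untouched.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad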
Example #7
class Params():
    def __init__(self):
        self.lr = 0.0001
        self.gamma = 0.99
        self.tau = 1.
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'


# Main run
os.environ['OMP_NUM_THREADS'] = '1'
params = Params()
torch.manual_seed(params.seed)
env = create_atari_env(params.env_name)
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
shared_model.share_memory()
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr)
optimizer.share_memory()
processes = []
p = mp.Process(target=test, args=(params.num_processes, params, shared_model))
p.start()
processes.append(p)
for rank in range(0, params.num_processes):
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
    processes.append(p)
for p in processes:
    p.join()
Example #8
File: play.py Project: cg31/cule

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='CuLE')
    parser.add_argument('game', type=str, help='Atari ROM filename')
    parser.add_argument('--num-stack',
                        type=int,
                        default=4,
                        help='number of images in a stack (default: 4)')
    args = parser.parse_args()
    num_stack = args.num_stack

    env = AtariEnv(args.game, num_envs=1)
    env.eval()

    model = ActorCritic(num_stack, env.action_space)
    shape = (args.num_stack, 84, 84)
    states = torch.ByteTensor(*shape).zero_()

    observation = env.reset()[0]
    states[-1] = downsample(observation).squeeze(-1)
    actions = env.minimal_actions()
    N = actions.size(0)

    options = {'noop': 0, 'right': 1, 'left': 2, 'down': 4, 'up': 8, ' ': 16}
    action_keys = [
        0, 1, 2, 4, 8, 16, 9, 10, 5, 6, 24, 17, 18, 20, 25, 26, 21, 22
    ]
    action_names = ['NOOP', 'RIGHT', 'LEFT', 'DOWN', 'UP', 'FIRE', 'UPRIGHT', \
                    'UPLEFT', 'DOWNRIGHT', 'DOWNLEFT', 'UPFIRE', 'RIGHTFIRE', \
                    'LEFTFIRE', 'DOWNFIRE', 'UPRIGHTFIRE', 'UPLEFTFIRE',      \
Example #9
                        default=1)
    parser.add_argument(
        "--rnd",
        type=bool,
        help="Play against random agent (else against negamax)",
        default=False)
    opts = parser.parse_args()

    # Autodetect CUDA
    use_cuda = T.cuda.is_available()
    device = T.device("cuda" if use_cuda else "cpu")
    print('Device:', device)

    HIDDEN_SIZE = 256
    env = ConnectX(switch_prob=0.5, random_agent=opts.rnd, test_mode=True)
    model = ActorCritic(env.observation_space.n, env.action_space.n,
                        HIDDEN_SIZE)
    model.load_state_dict(T.load(opts.weights))

    total_reward = 0
    for _ in range(opts.num):
        state = env.reset()
        done = False

        while not done:
            state = T.FloatTensor(state.board).unsqueeze(0).to(device)
            dist, _ = model(state)
            dist_space = dist.sample()

            action = T.argmax(dist_space, dim=1, keepdim=True).cpu().numpy()[0]

            next_state, reward, done, _ = env.step(action)
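            # (The listing is truncated here; a purely illustrative continuation,
            #  not part of the original snippet, would accumulate the reward and
            #  advance the board state.)
            total_reward += reward
            state = next_state

    print('Average reward over {} games: {:.2f}'.format(opts.num, total_reward / opts.num))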
Example #10
                    metavar='O',
                    help='use an optimizer without shared momentum.')
parser.add_argument('--model-name', default='def', help='for saving the model')
parser.add_argument('--load-dir', help='load model from path')
parser.add_argument('--testing', default=False, help='to run model')

if __name__ == '__main__':
    os.environ['OMP_NUM_THREADS'] = '1'

    args = parser.parse_args()
    print(args)

    torch.manual_seed(args.seed)

    env = create_atari_env(args.env_name)
    shared_model = ActorCritic(env.observation_space.shape[0],
                               env.action_space, args.num_skips)
    shared_model.share_memory()

    if args.no_shared:
        optimizer = None
    else:
        optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()

    if args.load_dir:
        filename = args.load_dir
        print('==> loading checkpoint {}'.format(filename))
        checkpoint = torch.load(filename)
        shared_model.load_state_dict(checkpoint)
        print('==> loaded checkpoint {}'.format(filename))
Example #11
def test(name, backend, env_name, rank, args, shared_model, counter, docker, train_mode=True):
    torch.manual_seed(args.seed + rank)

    if backend == 'unity3d':
        if docker:
            os.chdir('/mnt/code/')
        env = create_unity3d_env(train_mode=train_mode,\
         file_name=env_name, \
         worker_id=rank, seed=args.seed, \
         docker_training=docker)
    elif backend == 'gym':
        env = create_atari_env(env_name)
        env.seed(args.seed + rank)
    else:
        print(f' [!]: {backend} is not a valid backend')
        raise ValueError

    print(env.action_space)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state).float()
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    history = {'num-steps': [], 'times': [], 'rewards': [], 'episode-length': []}
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1, keepdim=True)[1].data.numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            end = time.time() - start_time
            history['num-steps'].append(counter.value)
            history['times'].append(end)
            history['rewards'].append(reward_sum)
            history['episode-length'].append(episode_length)
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(end)), counter.value, counter.value / (end),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()

            if train_mode:
                history['weights'] = shared_model.state_dict()
                torch.save(history, f'{name}-history.t7')
                time.sleep(60)

        state = torch.from_numpy(state).float()

    env.close()
Example #12
                    help='Environment')
parser.add_argument('--lstm-size', type=int, default=128, metavar='LSTM',
                    help='LSTM size')
parser.add_argument('--loadmodel', type=int, default=0,
                    help='whether to load a saved model')
parser.add_argument('--starttime', type=int, default=0,
                    help='start time')

if __name__ == '__main__':
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.set_num_threads(1)

    env = create_atari_env(args.env_name)
    shared_model = ActorCritic(
        env.observation_space.shape[0], env.action_space.n, args.lstm_size)
    if args.loadmodel > 0:
        shared_model.load_state_dict(torch.load(os.getcwd() + '/A3C.t7'))
    shared_model.share_memory()

    processes = []
    p = mp.Process(target=test, args=(args.num_processes, args, shared_model))
    p.start()
    processes.append(p)

    for rank in range(args.num_processes):
        print(rank)
        p = mp.Process(target=train, args=(rank, args, shared_model))
        p.start()
        processes.append(p)
Example #13
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()

    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    # actions = deque(maxlen=100)
    episode_length = 0
    while True:
        env.render()
        print('here')
        # env.render()
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        print('there')
        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0), volatile=True), (hx, cx)))
        print('hi')
        prob = F.softmax(logit)
        # print(prob)
        action = prob.max(1, keepdim=True)[1].data.numpy()
        print(action)

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True

        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            # actions.clear()  # the actions deque is commented out above
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
Example #14
class Agent(object):
    """Interacts and learns from the environment"""
    def __init__(self, num_agents, state_size, action_size):
        """ Initialize an Agent object

        Params
        ======
            num_agents (int): number of agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.num_agents = num_agents

        self.state_size = state_size
        self.action_size = action_size

        #        self.model = ActorCriticPolicy(state_size, action_size, 256)
        self.model = ActorCritic(state_size, action_size, 256)
        self.optimizer = optim.Adam(self.model.parameters(), LR, eps=EPSILON)

    def compute_gaes(self,
                     next_value,
                     rewards,
                     masks,
                     values,
                     gamma=0.99,
                     tau=0.95):
        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[
                step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns

    def compute_advantage(self,
                          next_value,
                          rewards,
                          masks,
                          values,
                          gamma=0.99,
                          tau=0.95):
        """Same GAE recursion as compute_gaes above, but returns the advantages
        A(t) themselves rather than the returns A(t) + V(s_t)."""
        values = values + [next_value]
        gae = 0
        advantages = []
        for step in reversed(range(len(rewards))):
            # TD error: delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
            delta = rewards[step] + gamma * values[
                step + 1] * masks[step] - values[step]
            # GAE: A_t = delta_t + gamma * tau * mask_t * A_{t+1}
            gae = delta + gamma * tau * masks[step] * gae
            advantages.insert(0, gae)
        return advantages

    def step(self, states, actions, values, rewards, log_probs, masks,
             next_value):

        #      def compute_gaes(next_value, rewards, masks, values, gamma=0.99, tau=0.95):

        returns = self.compute_gaes(next_value, rewards, masks, values)
        returns = torch.cat(returns).detach()
        log_probs = torch.cat(log_probs).detach()
        values = torch.cat(values).detach()
        states = torch.cat(states)
        actions = torch.cat(actions)
        advantages = returns - values
        advantages = (advantages - advantages.mean()) / advantages.std()
        self.learn(ppo_epochs=10,
                   mini_batch_size=32,
                   states=states,
                   actions=actions,
                   log_probs=log_probs,
                   returns=returns,
                   advantages=advantages,
                   clip_param=0.2)

    def step_(self, rollout):
        """ Compute advantage estimates at each time steps given a trajectory"""

        storage = [None] * (len(rollout) - 1)

        shape = (self.num_agents, 1)
        advantage = torch.Tensor(np.zeros(shape))

        for i in reversed(range(len(rollout) - 1)):
            # rollout --> tuple ( s, a, p(a|s), r, dones, V(s) ) FOR ALL AGENT
            # rollout --> last row (s, none, none, none, pending_value) FOR ALL AGENT
            state, action, log_prob, reward, done, value = rollout[i]

            # last step - next_return = pending_value
            if i == len(rollout) - 2:
                next_return = rollout[i + 1][-1]

            state = torch.Tensor(state)
            action = torch.Tensor(action)
            reward = torch.Tensor(reward).unsqueeze(1)
            done = torch.Tensor(done).unsqueeze(1)
            next_value = rollout[i + 1][-1]

            # G(t) = r + G(t+1)
            g_return = reward + GAMMA * next_return * done
            next_return = g_return
            # g_return = reward + GAMMA * g_return*done

            # Compute TD error
            td_error = reward + GAMMA * next_value - value
            # Compute advantages
            advantage = advantage * TAU * GAMMA * done + td_error

            # Add (s, a, p(a|s), g, advantage)
            storage[i] = [state, action, log_prob, g_return, advantage]

        state, action, log_prob, g_return, advantage = map(
            lambda x: torch.cat(x, dim=0), zip(*storage))
        advantage = (advantage - advantage.mean()) / advantage.std()

        # Check dimensions
        # print ("States :", states.size(0), " * ", states.size(1) )
        # print ("Actions :", actions.size(0), " * ", actions.size(1) )
        # print ("Log Prob :", log_prob.size(0), " * ", log_prob.size(1) )
        # print ("Return :", g_return.size(0), " * ", g_return.size(1) )
        # print ("Advantage :", advantage.size(0), " * ", advantage.size(1) )

        self.learn(state, action, log_prob, g_return, advantage,
                   self.num_agents)

    def act(self, states):
        """Given state as per current policy model, returns action, log probabilities and estimated state values"""
        dist, values = self.model(states)
        actions = dist.sample()
        log_probs = dist.log_prob(actions)
        log_probs = torch.sum(log_probs, dim=1, keepdim=True)

        return actions, log_probs, values, dist

    def sample(self, states, actions, log_probs, returns, advantages):
        """Randomly sample learning batches from trajectory"""
        rand_idx = np.random.randint(0, states.size(0), BATCH_SIZE)
        return states[rand_idx, :], actions[rand_idx, :], log_probs[
            rand_idx, :], returns[rand_idx, :], advantages[rand_idx, :]

    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns,
                 advantage):
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield states[rand_ids, :], actions[rand_ids, :], log_probs[
                rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]

#    def learn(self, states, actions, log_probs_old, returns, advantages, num_agents):

    def learn(self,
              ppo_epochs,
              mini_batch_size,
              states,
              actions,
              log_probs,
              returns,
              advantages,
              clip_param=0.2):
        for _ in range(ppo_epochs):
            #            for state, action, old_log_probs, return_, advantage in self.ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):
            batch_size = states.size(0)
            for _ in range(batch_size // mini_batch_size):
                state, action, old_log_probs, return_, advantage = self.sample(
                    states, actions, log_probs, returns, advantages)
                _, new_log_probs, values, dist = self.act(state)
                entropy = dist.entropy().mean()

                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param,
                                    1.0 + clip_param) * advantage

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - values).pow(2).mean()

                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy

                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.model.parameters(),
                                         GRADIENT_CLIP)
                self.optimizer.step()

    def learn_(self, states, actions, log_probs_old, returns, advantages,
               num_agents):
        """ Optimize surrogate loss with policy and value parameters using given learning batches."""

        for _ in range(NUM_EPOCHS):
            for _ in range(states.size(0) // BATCH_SIZE):
                state_samples, action_samples, log_prob_samples, return_samples, advantage_samples = self.sample(
                    states, actions, log_probs_old, returns, advantages)

                dist, values = self.model(state_samples)

                log_probs = dist.log_prob(action_samples)
                log_probs = torch.sum(log_probs, dim=1, keepdim=True)
                entropy = dist.entropy().mean()

                ratio = (log_probs - log_prob_samples).exp()

                # Surrogate Objective
                obj = ratio * advantage_samples

                # Clipped Surrogate Objective
                obj_clipped = ratio.clamp(1.0 - CLIP,
                                          1.0 + CLIP) * advantage_samples

                # Compute policy loss: L = -min[ r(θ)*A, clip( r(θ), 1-Ɛ, 1+Ɛ )*A ] - β * entropy
                policy_loss = -torch.min(obj,
                                         obj_clipped).mean(0) - BETA * entropy

                # Compute value loss: L = ( V(s) - V_t )^2
                value_loss = (return_samples - values).pow(2).mean()

                # Optimize
                self.optimizer.zero_grad()
                (policy_loss + 0.5 * value_loss).backward()
                nn.utils.clip_grad_norm_(self.model.parameters(),
                                         GRADIENT_CLIP)
                self.optimizer.step()
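As a sanity check on the clipped surrogate used in learn() and learn_() above, here is a standalone evaluation of the same expression; all numbers are made up purely for illustration:

import torch

clip_param = 0.2
old_log_prob = torch.tensor([-1.0])   # log pi_old(a|s)
new_log_prob = torch.tensor([-0.7])   # log pi_new(a|s)
advantage = torch.tensor([2.0])

ratio = (new_log_prob - old_log_prob).exp()                        # ~1.35
surr1 = ratio * advantage
surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage
actor_loss = -torch.min(surr1, surr2).mean()                       # clipping caps the step at ratio 1.2
print(ratio.item(), actor_loss.item())                             # ~1.3499, -2.4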
Example #15
def test(rank, args, shared_model, counter, logger):
    console_f = logger.init_console_log_file()

    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    max_score = 0

    start_time = time.time()

    while True:
        if args.max_counter_num != 0 and counter.value > args.max_counter_num:
            if args.save_policy_models:
                logger.save_policy_model(shared_model, counter.value + 1)
            exit(0)
        # monitor counter value
        if counter.value % args.testing_every_counter > 1:
            continue
        counter_value = counter.value
        model.load_state_dict(shared_model.state_dict())

        if args.save_policy_models:
            if counter_value % args.save_policy_models_every <= 5:
                logger.save_policy_model(shared_model, counter_value)

        state = env.reset()
        state = torch.from_numpy(state)
        reward_sum = 0
        done = True

        # a quick hack to prevent the agent from getting stuck
        # actions = deque(maxlen=100)
        # actions = deque(maxlen=500)
        actions = deque(maxlen=1000)
        episode_length = 0
        episode_count = 0
        episode_rewards_sum = 0
        episode_length_sum = 0
        while True:
            episode_length += 1
            # Sync with the shared model
            with torch.no_grad():
                if done:
                    cx = Variable(torch.zeros(1, 256))
                    hx = Variable(torch.zeros(1, 256))
                else:
                    cx = Variable(cx.data)
                    hx = Variable(hx.data)

                value, logit, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
                prob = F.softmax(logit, dim=1)
                action = prob.max(1, keepdim=True)[1].data.numpy()

            state, reward, done, _ = env.step(action[0, 0])
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward

            # a quick hack to prevent the agent from getting stuck
            actions.append(action[0, 0])
            if actions.count(actions[0]) == actions.maxlen:
                done = True

            if done:
                episode_count += 1
                episode_rewards_sum += reward_sum
                episode_length_sum += episode_length
                if episode_count == args.testing_episodes_num:
                    print("Time {}, num steps {}, FPS {:.0f}, avg episode reward {}, avg episode length {}".format(
                        time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                        counter_value, counter_value / (time.time() - start_time),
                        episode_rewards_sum/args.testing_episodes_num, episode_length_sum/args.testing_episodes_num))
                    logger.write_results_log(console_f,
                                             time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                                             counter_value,
                                             counter_value / (time.time() - start_time),
                                             episode_rewards_sum / args.testing_episodes_num,
                                             episode_length_sum / args.testing_episodes_num)
                    if args.save_max and (episode_rewards_sum / args.testing_episodes_num) >= max_score:
                        max_score = episode_rewards_sum / args.testing_episodes_num
                        logger.save_policy_model(shared_model, count="max_reward")
                    break

                reward_sum = 0
                episode_length = 0
                actions.clear()
                state = env.reset()

            state = torch.from_numpy(state)
Example #16
        if done:
            print("Time {}, episode reward {}, episode length {}".
                  format(get_elapsed_time_str(), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)


if __name__ == '__main__':
    env = create_atari_env(args.rom)
    # torch.manual_seed(SEED)
    shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()
    # print (shared_model.conv1._parameters['weight'].data.is_cuda)
    optimizer = SharedAdam(shared_model.parameters(), lr=0.0001)
    optimizer.share_memory()

    if args.play:
        if os.path.isfile(args.play):
            print("=> loading checkpoint '{}'".format(args.play))
            checkpoint = torch.load(args.play)
            #            args.start_epoch = checkpoint['epoch']
            #            best_prec1 = checkpoint['best_prec1']
            shared_model.load_state_dict(checkpoint['state_dict'])
            #optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.play, checkpoint['epoch']))
Example #17
def train(rank, shared_model, optimizer):
    """
    :param rank: worker-ID
    :param shared_model: model to sync between workers
    :param optimizer:
    :return:
    """
    # torch.manual_seed(SEED + rank)
    ac_steps = 20
    max_episode_length = 10000
    gamma = 0.99
    tau = 1.0
    max_grad_norm = 50.0
    checkpoint_n = 20

    env = create_atari_env(romname)
    env.seed(SEED + rank)
    state = env.reset()
    state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor), requires_grad=False)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    t = 0
    done = True
    episodes = 0
    reward_sum = 0
    reward_sum1 = 0
    start_time = time.time()
    best_reward = -999
    isbest = 0
    cx = hx = None
    while True:
        model.load_state_dict(shared_model.state_dict())
        if done:  # need to reset LSTM cell's input
            cx = Variable(torch.zeros(1, 256)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 256)).type(FloatTensor)
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)  # basically this is to detach from previous comp graph

        states = []
        values = []
        log_probs = []
        rewards = []
        entropies = []

        for i in range(ac_steps):
            t += 1
            v, logit, (hx, cx) = model((state, (hx, cx)))
            states.append(state)
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial().detach()  # detach -- so backprop will NOT go through multinomial()
            log_prob = log_prob.gather(1, action)
            action = action.data[0, 0]
            state, reward, done, _ = env.step(action)
            reward_sum += reward
            reward_sum1 += reward
            done = done or t >= max_episode_length
            if done:
                t_ = t
                t = 0
                state = env.reset()
                episodes += 1
                if episodes % 10 == 0:
                    time_str = time.strftime(
                        "%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
                    print("Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {}".
                          format(time_str, rank, episodes, reward_sum / 10.0, t_))
                    reward_sum = 0.0

                if episodes % checkpoint_n == 0:
                    ave_reward = reward_sum1 / checkpoint_n
                    if best_reward < ave_reward:
                        isbest = 1
                        best_reward = ave_reward

                    print("Saving checkpoint Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {} best_reward {}".
                          format(get_elapsed_time_str(), rank, episodes, ave_reward, t_, best_reward))
                    checkpoint_fname = os.path.join(
                        args.savedir,
                        args.rom + '_worker' + str(rank) + '_' + str(episodes))
                    save_checkpoint({'epoch': episodes,
                                     'average_reward': ave_reward,
                                     'time': time.time(),
                                     'state_dict': model.state_dict(),
                                     'optimizer': optimizer.state_dict(),
                                     }, isbest, checkpoint_fname)
                    reward_sum1 = 0.0

            state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor), requires_grad=False)
            reward = max(min(reward, 1), -1)
            values.append(v)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # We reach here because either
        # i) an episode ends, such as game over, or
        # ii) we have explored a fixed number of steps into the future and it is
        #     time to look back and summarise the rollout
        if done:
            R = torch.zeros(1, 1).type(FloatTensor)
        else:
            value, _, _ = model((state, (hx, cx)))
            R = value.data

        values.append(Variable(R))
        critic_loss = 0
        actor_loss = 0
        R = Variable(R)
        gae = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]
            advantage = R - values[i]  # type: Variable
            critic_loss += 0.5 * advantage.pow(2)
            td_error = rewards[i] + gamma * values[i + 1].data - values[i].data
            gae = gae * gamma * tau + td_error
            actor_loss -= (Variable(gae) * log_probs[i] + 0.01 * entropies[i])

        optimizer.zero_grad()
        total_loss = actor_loss + critic_loss * 0.5  # type: Variable
        total_loss.backward()  # error occur
        torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
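Example #17 calls a save_checkpoint helper that the listing does not show. A plausible implementation, modeled on the common PyTorch checkpointing pattern and matching the call save_checkpoint(dict, isbest, checkpoint_fname) above (the file suffixes are assumptions), might be:

import shutil
import torch

def save_checkpoint(state, is_best, filename):
    # Hypothetical helper: persist the checkpoint dict and keep an extra
    # '_best' copy whenever it is the best result seen so far.
    torch.save(state, filename + '.pth.tar')
    if is_best:
        shutil.copyfile(filename + '.pth.tar', filename + '_best.pth.tar')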
Example #18
def train(rank, params, shared_model, optimizer):
    torch.manual_seed(params.seed + rank) # shifting the seed with rank to asynchronize each training agent
    env = create_atari_env(params.env_name) # creating an optimized environment thanks to the create_atari_env function
    env.seed(params.seed + rank) # aligning the seed of the environment on the seed of the agent
    model = ActorCritic(env.observation_space.shape[0], env.action_space) # creating the model from the ActorCritic class
    state = env.reset() # state is a numpy array of size 1*42*42, in black & white
    state = torch.from_numpy(state) # converting the numpy array into a torch tensor
    done = True # when the game is done
    episode_length = 0 # initializing the length of an episode to 0
    while True: # repeat
        episode_length += 1 # incrementing the episode length by one
        model.load_state_dict(shared_model.state_dict()) # synchronizing with the shared model - the agent gets the shared model to do an exploration on num_steps
        if done: # if it is the first iteration of the while loop or if the game was just done, then:
            cx = Variable(torch.zeros(1, 256)) # the cell states of the LSTM are reinitialized to zero
            hx = Variable(torch.zeros(1, 256)) # the hidden states of the LSTM are reinitialized to zero
        else: # else:
            cx = Variable(cx.data) # we keep the old cell states, making sure they are in a torch variable
            hx = Variable(hx.data) # we keep the old hidden states, making sure they are in a torch variable
        values = [] # initializing the list of values (V(S))
        log_probs = [] # initializing the list of log probabilities
        rewards = [] # initializing the list of rewards
        entropies = [] # initializing the list of entropies
        for step in range(params.num_steps): # going through the num_steps exploration steps
            value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx))) # getting from the model the output V(S) of the critic, the output Q(S,A) of the actor, and the new hidden & cell states
            prob = F.softmax(action_values) # generating a distribution of probabilities over the Q-values according to the softmax: prob(a) = exp(q(a))/sum_b(exp(q(b)))
            log_prob = F.log_softmax(action_values) # generating a distribution of log probabilities of the Q-values according to the log softmax: log_prob(a) = log(prob(a))
            entropy = -(log_prob * prob).sum(1) # H(p) = - sum_x p(x).log(p(x))
            entropies.append(entropy) # storing the computed entropy
            action = prob.multinomial().data # selecting an action by taking a random draw from the prob distribution
            log_prob = log_prob.gather(1, Variable(action)) # getting the log prob associated to this selected action
            values.append(value) # storing the value V(S) of the state
            log_probs.append(log_prob) # storing the log prob of the action
            state, reward, done, _ = env.step(action.numpy()) # playing the selected action, reaching the new state, and getting the new reward
            done = (done or episode_length >= params.max_episode_length) # if the episode lasts too long (the agent is stuck), then it is done
            reward = max(min(reward, 1), -1) # clamping the reward between -1 and +1
            if done: # if the episode is done:
                episode_length = 0 # we restart the environment
                state = env.reset() # we restart the environment
            state = torch.from_numpy(state) # tensorizing the new state
            rewards.append(reward) # storing the new observed reward
            if done: # if we are done
                break # we stop the exploration and we directly move on to the next step: the update of the shared model
        R = torch.zeros(1, 1) # initializing the cumulative reward
        if not done: # if we are not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx))) # we initialize the cumulative reward with the value of the last shared state
            R = value.data # we initialize the cumulative reward with the value of the last shared state
        values.append(Variable(R)) # storing the value V(S) of the last reached state S
        policy_loss = 0 # initializing the policy loss
        value_loss = 0 # initializing the value loss
        R = Variable(R) # making sure the cumulative reward R is a torch Variable
        gae = torch.zeros(1, 1) # initializing the Generalized Advantage Estimation to 0
        for i in reversed(range(len(rewards))): # starting from the last exploration step and going back in time
            R = params.gamma * R + rewards[i] # R = gamma*R + r_t = r_0 + gamma r_1 + gamma^2 * r_2 ... + gamma^(n-1)*r_(n-1) + gamma^nb_step * V(last_state)
            advantage = R - values[i] # R is an estimator of Q at time t = i so advantage_i = Q_i - V(state_i) = R - value[i]
            value_loss = value_loss + 0.5 * advantage.pow(2) # computing the value loss
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data # computing the temporal difference
            gae = gae * params.gamma * params.tau + TD # gae = sum_i (gamma*tau)^i * TD(i) with gae_i = gae_(i+1)*gamma*tau + (r_i + gamma*V(state_i+1) - V(state_i))
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i] # computing the policy loss
        optimizer.zero_grad() # initializing the optimizer
        (policy_loss + 0.5 * value_loss).backward() # we give 2x more importance to the policy loss than the value loss because the policy loss is smaller
        torch.nn.utils.clip_grad_norm(model.parameters(), 40) # clipping the gradient norm to at most 40 to prevent the gradient from taking huge values and degenerating the algorithm
        ensure_shared_grads(model, shared_model) # making sure the model of the agent and the shared model share the same gradient
        optimizer.step() # running the optimization step
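The backward pass in Example #18 can be replayed on plain Python numbers; gamma, tau and the reward/value lists below are invented purely to show the recursion:

gamma, tau = 0.99, 1.0
rewards = [0.0, 0.0, 1.0]
values = [0.1, 0.2, 0.5, 0.0]   # V(s_0)..V(s_2) plus the bootstrap value R appended at the end

R, gae = values[-1], 0.0
value_loss, advantages = 0.0, []
for i in reversed(range(len(rewards))):
    R = gamma * R + rewards[i]                            # n-step return estimate of Q_i
    value_loss += 0.5 * (R - values[i]) ** 2              # critic target mismatch
    td = rewards[i] + gamma * values[i + 1] - values[i]   # temporal difference
    gae = gae * gamma * tau + td                          # generalized advantage estimate
    advantages.append(gae)                                # multiplied by -log_prob[i] in the policy loss
print(value_loss, list(reversed(advantages)))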
Example #19
    # Autodetect CUDA
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print('Device:', device)

    # Prepare environments
    envs = [make_env() for i in range(args.envs)]
    envs = MultiEnv(envs)
    if args.mp:
        envs = SubprocVecEnv(envs)
    env = OhlcvEnv(WINDOW_SIZE, './data/test/')
    obs_ = env.reset()
    num_inputs = env.observation_space.shape
    num_outputs = env.action_space.n

    model = ActorCritic(num_inputs, num_outputs, HIDDEN_SIZE,
                        std=0.0).to(device)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    frame_idx = 0
    train_epoch = 0
    best_reward = None

    state = envs.reset()
    early_stop = False

    while not early_stop:

        log_probs = []
        values = []
        states = []
Example #20
import gym
import numpy as np
from time import sleep

from model import ActorCritic
from helpers import load_model, worker

lr = 0.001
gamma = 0.99
gae = 0.9
clc = 0.1
step_update = 50
ppo_epsilon = 0.2

input_dim = 4
shared_hidden0 = 25
shared_hidden1 = 50
critic_hidden = 25
output_dim_actor = 2
output_dim_critic = 1

model = ActorCritic(input_dim, shared_hidden0, shared_hidden1, critic_hidden,
                    output_dim_actor, output_dim_critic)

filename = '*****@*****.**'
# filename = 'actor_critic.pt'
model = load_model(model, filename)

params = {'epochs': 1, 'n_workers': 0, 'lr': lr}

worker(model, params, None, 0, render=True, train=False, max_eps=1000)
Example #21
File: train.py Project: ddayzzz/ACER
def train(rank, args, T, shared_model, shared_average_model, optimiser):
    torch.manual_seed(args.seed + rank)
    # CUDA
    if args.use_cuda:
        torch.cuda.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)

    gpu_id = 0 if args.use_cuda else -1  # TODO: 0 refers to the first GPU
    if gpu_id >= 0:
        model = model.cuda()
    model.train()

    if not args.on_policy:
        # Normalise memory capacity by number of training processes
        memory = EpisodicReplayMemory(
            args.memory_capacity // args.num_processes,
            args.max_episode_length)

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # On-policy episode loop
        while True:
            # Sync with shared model at least every t_max steps
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    model.load_state_dict(shared_model.state_dict())
            else:
                model.load_state_dict(shared_model.state_dict())
            # Get starting timestep
            t_start = t

            # Reset or pass on hidden state
            if done:
                avg_hx = torch.zeros(1, args.hidden_size)
                avg_cx = torch.zeros(1, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(1, args.hidden_size).cuda()
                        cx = torch.zeros(1, args.hidden_size).cuda()
                else:
                    hx = torch.zeros(1, args.hidden_size)
                    cx = torch.zeros(1, args.hidden_size)

                # Reset environment and done flag
                state = state_to_tensor(env.reset())
                if gpu_id >= 0:
                    state = state.cuda()
                done, episode_length = False, 0
            else:
                # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
                hx = hx.detach()
                cx = cx.detach()

            # Lists of outputs for training
            policies, Qs, Vs, actions, rewards, average_policies = [], [], [], [], [], []

            while not done and t - t_start < args.t_max:
                # Calculate policy and values
                policy, Q, V, (hx, cx) = model(state, (hx, cx))

                # the shared average model lives on the CPU, so the state needs to be moved there
                if gpu_id >= 0:
                    to_avg_state = state.cpu()
                else:
                    to_avg_state = state
                average_policy, _, _, (avg_hx, avg_cx) = shared_average_model(
                    to_avg_state, (avg_hx, avg_cx))
                # if gpu_id >= 0:
                #     average_policies = average_policies.cuda()
                # Sample action
                action = torch.multinomial(policy, 1)[0, 0]

                # Step
                next_state, reward, done, _ = env.step(action.item())
                next_state = state_to_tensor(next_state)
                if gpu_id >= 0:
                    next_state = next_state.cuda()

                reward = args.reward_clip and min(max(
                    reward, -1), 1) or reward  # Optionally clamp rewards
                done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                episode_length += 1  # Increase episode counter

                if not args.on_policy:
                    # Save (beginning part of) transition for offline training
                    memory.append(state, action, reward,
                                  policy.detach())  # Save just tensors
                # Save outputs for online training
                [
                    arr.append(el) for arr, el in zip((
                        policies, Qs, Vs, actions, rewards,
                        average_policies), (policy, Q, V,
                                            torch.LongTensor([[action]]),
                                            torch.Tensor([[reward]]),
                                            average_policy))
                ]

                # Increment counters
                t += 1
                T.increment()

                # Update state
                state = next_state

            # Break graph for last values calculated (used for targets, not directly as model outputs)
            if done:
                # Qret = 0 for terminal s
                Qret = torch.zeros(1, 1)

                if not args.on_policy:
                    # Save terminal state for offline training
                    memory.append(state, None, None, None)
            else:
                # Qret = V(s_i; θ) for non-terminal s
                _, _, Qret, _ = model(state, (hx, cx))
                Qret = Qret.detach().cpu()

            # Train the network on-policy
            if gpu_id >= 0:
                Qs = list(map(lambda x: x.cpu(), Qs))
                Vs = list(map(lambda x: x.cpu(), Vs))
                policies = list(map(lambda x: x.cpu(), policies))
            _train(args, T, model, shared_model, shared_average_model,
                   optimiser, policies, Qs, Vs, actions, rewards, Qret,
                   average_policies)

            # Finish on-policy episode
            if done:
                break

        # Train the network off-policy when enough experience has been collected
        if not args.on_policy and len(memory) >= args.replay_start:
            # Sample a number of off-policy episodes based on the replay ratio
            for _ in range(_poisson(args.replay_ratio)):
                # Act and train off-policy for a batch of (truncated) episodes
                trajectories = memory.sample_batch(args.batch_size,
                                                   maxlen=args.t_max)

                # Reset hidden state
                avg_hx = torch.zeros(args.batch_size, args.hidden_size)
                avg_cx = torch.zeros(args.batch_size, args.hidden_size)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        hx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                        cx = torch.zeros(args.batch_size,
                                         args.hidden_size).cuda()
                else:
                    hx = torch.zeros(args.batch_size, args.hidden_size)
                    cx = torch.zeros(args.batch_size, args.hidden_size)

                # Lists of outputs for training
                policies, Qs, Vs, actions, rewards, old_policies, average_policies = [], [], [], [], [], [], []

                # Loop over trajectories (bar last timestep)
                for i in range(len(trajectories) - 1):
                    # Unpack first half of transition
                    state = torch.cat(
                        tuple(trajectory.state
                              for trajectory in trajectories[i]), 0)
                    action = torch.LongTensor([
                        trajectory.action for trajectory in trajectories[i]
                    ]).unsqueeze(1)
                    reward = torch.Tensor([
                        trajectory.reward for trajectory in trajectories[i]
                    ]).unsqueeze(1)
                    old_policy = torch.cat(
                        tuple(trajectory.policy
                              for trajectory in trajectories[i]), 0)

                    # Calculate policy and values
                    policy, Q, V, (hx, cx) = model(state, (hx, cx))
                    average_policy, _, _, (avg_hx,
                                           avg_cx) = shared_average_model(
                                               state, (avg_hx, avg_cx))

                    # Save outputs for offline training
                    policies.append(policy)
                    Qs.append(Q)
                    Vs.append(V)
                    actions.append(action)
                    rewards.append(reward)
                    average_policies.append(average_policy)
                    old_policies.append(old_policy)

                    # Unpack second half of transition
                    next_state = torch.cat(
                        tuple(trajectory.state
                              for trajectory in trajectories[i + 1]), 0)
                    done = torch.Tensor([
                        trajectory.action is None
                        for trajectory in trajectories[i + 1]
                    ]).unsqueeze(1)

                # Do forward pass for all transitions
                _, _, Qret, _ = model(next_state, (hx, cx))
                # Qret = 0 for terminal s, V(s_i; θ) otherwise
                Qret = ((1 - done) * Qret).detach().cpu()

                # Train the network off-policy
                if gpu_id >= 0:
                    Qs = list(map(lambda x: x.cpu(), Qs))
                    Vs = list(map(lambda x: x.cpu(), Vs))
                    policies = list(map(lambda x: x.cpu(), policies))
                _train(args,
                       T,
                       model,
                       shared_model,
                       shared_average_model,
                       optimiser,
                       policies,
                       Qs,
                       Vs,
                       actions,
                       rewards,
                       Qret,
                       average_policies,
                       old_policies=old_policies)
        done = True

    env.close()
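
The off-policy block above draws the number of replay updates per iteration from `_poisson(args.replay_ratio)`, a helper that is not included in this excerpt. The following is a minimal sketch of how such a sampler is commonly written (Knuth's product-of-uniforms method); the name and exact behaviour are assumptions, not the project's own code.

import random
from math import exp


def _poisson(lmbd):
    # Knuth's method: multiply uniform draws until the running product drops
    # below e^(-lambda); the number of draws minus one is Poisson(lambda)-distributed.
    L, k, p = exp(-lmbd), 0, 1.0
    while p > L:
        k += 1
        p *= random.uniform(0, 1)
    return max(k - 1, 0)
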
Example #22
0
File: worker.py Project: zafarizb/Plato
def train(rank, args, share_model, counter, lock):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    optimizer = optim.Adam(share_model.parameters(), lr=args.lr)
    model.train()

    state = env.reset()
    state = torch.FloatTensor(state)
    done = True
    # reward_sum = 0
    episode_length = 0
    while True:
        model.load_state_dict(share_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            # print('reward', reward)
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)
            # reward_sum += reward
            # print(reward)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.FloatTensor(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                # print('rank: ', rank)
                # print('reward: ', reward_sum)
                # reward_sum = 0
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            delta_t = rewards[i] + args.gamma * values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - log_probs[i] * Variable(gae) - args.entropy_coef * entropies[i]

        optimizer.zero_grad()
        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, share_model)
        optimizer.step()
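
This worker (like Examples #27 and #29 below) calls `ensure_shared_grads(model, share_model)` before stepping the optimizer, but the helper itself is not shown. A minimal sketch of the version that usually accompanies these A3C scripts follows, as an assumption about its behaviour: it aliases the worker's gradient tensors into the shared model so the optimizer built over the shared parameters can apply them.

def ensure_shared_grads(model, shared_model):
    # Point the shared model's gradients at the worker's gradient buffers.
    # Once a shared parameter already has a grad it stays aliased to the first
    # worker's tensors, so subsequent calls return immediately.
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad
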
Example #23
0
                    loss.backward()
                    for local_param, global_param in zip(
                            self.local_actor_critic.parameters(),
                            self.global_actor_critic.parameters()):
                        global_param._grad = local_param.grad
                    self.optimizer.step()
                    self.local_actor_critic.load_state_dict(self.global_actor_critic.state_dict())
                    self.local_actor_critic.clear_memory()
                t_step += 1
                observation = observation_
            with self.episode_index.get_lock():
                self.episode_index.value += 1
            print(self.name, 'episode ', self.episode_index.value, 'reward %.1f' % score)


if __name__ == '__main__':
    lr = 1e-4
    env_id = 'CartPole-v0'
    nb_actions = 2
    input_dims = [4]
    global_actor_critic = ActorCritic(input_dims, nb_actions)
    global_actor_critic.share_memory()
    optim = SharedAdam(global_actor_critic.parameters(), lr=lr, betas=(0.92, 0.999))
    global_ep = mp.Value('i', 0)

    workers = [
        Agent(global_actor_critic, optim, input_dims, nb_actions, gamma=0.99, lr=lr, name=i,
              global_ep_index=global_ep,
              env_id=env_id) for i in range(mp.cpu_count())]
    [w.start() for w in workers]
    [w.join() for w in workers]
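
Example #23 builds its optimizer with `SharedAdam`, which is also not defined in the excerpt. The sketch below shows one common way to implement it, assuming the usual pattern of keeping Adam's step count and moment estimates in shared memory so every worker process updates the same statistics; the project's actual class may differ in detail.

import math

import torch
import torch.optim as optim


class SharedAdam(optim.Adam):
    """Adam whose step count and moment estimates live in shared memory."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)
                # Move the optimizer statistics into shared memory
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                # Update biased first and second moment estimates
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step'].item()
                bias_correction2 = 1 - beta2 ** state['step'].item()
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss
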
Example #24
0
def test(rank, args, shared_model, counter, loggers, kill):
    counter, steps, max_episodes = counter

    torch.manual_seed(args.seed + rank)

    env = create_vizdoom_env(args.config_path, args.test_scenario_path)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.spaces[0].shape[0],
                        env.action_space, args.topology)

    model.eval()

    state = env.reset()
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    hidden = ((torch.zeros(1, 64), torch.zeros(1, 64)), (torch.zeros(1, 256),
                                                         torch.zeros(1, 256)))
    actions = deque(maxlen=100)
    episode_length = 0
    episode_counter = 0

    obs_index = 0
    obs_history = []
    pose_history = []
    goal_loc = env.goal()

    model.load_state_dict(shared_model.state_dict())

    while not kill.is_set():
        if steps.value > args.max_episode_steps:
            break

        if episode_counter > max_episodes:
            break

        try:
            episode_start_time = time.time()
            episode_length += 1

            value, logit, _, _, hidden = model((state_to_torch(state), hidden))
            prob = F.softmax(logit)
            action = prob.max(1, keepdim=True)[1].data.numpy()

            for i in range(4):
                state, reward, done, _ = env.step(action[0, 0], steps=1)
                reward_sum += reward

                if done:
                    break
                else:
                    obs_frame = (np.moveaxis(state[0], 0, -1) * 255).astype(
                        np.uint8)

                    if isinstance(obs_history, list):
                        obs_history.append(obs_frame)
                    else:
                        obs_history[obs_index, :, :, :] = obs_frame
                        obs_index += 1

                    pose_history.append(env.pose())

            # a quick hack to prevent the agent from getting stuck
            # actions.append(action[0, 0])
            # if actions.count(actions[0]) == actions.maxlen:
            #     done = True

            if done:
                if isinstance(obs_history, list):
                    obs_history = np.array(obs_history)

                if loggers:
                    loggers['test_reward'](env.game.get_total_reward(),
                                           episode_counter)
                    loggers['video'](video(env.wad, env.current_map, goal_loc,
                                           obs_history, pose_history),
                                     episode_counter)
                    loggers['test_time'](time.time() - episode_start_time,
                                         episode_counter)

                print(
                    "Time {}, num episodes {}, FPS {:.0f}, episode reward {}, episode length {}".
                    format(
                        time.strftime("%Hh %Mm %Ss",
                                      time.gmtime(time.time() - start_time)),
                        counter.value,
                        counter.value / (time.time() - start_time), reward_sum,
                        episode_length))
                reward_sum = 0
                episode_length = 0
                actions.clear()
                state = env.reset()

                obs_index = 0
                pose_history = []
                goal_loc = env.goal()

                hidden = ((torch.zeros(1, 64), torch.zeros(1, 64)),
                          (torch.zeros(1, 256), torch.zeros(1, 256)))

                time.sleep(args.eval_interval)

                model.load_state_dict(shared_model.state_dict())

                episode_counter += 1
        except Exception as err:
            kill.set()
            raise err
Example #25
0
def test(args, shared_model):
    action_map = _set_action_map()

    env = FixedEnvWrap()

    # time.sleep(10)
    model = ActorCritic()
    model.load_state_dict(shared_model.state_dict())
    model.eval()

    state = env.reset()

    training_time = 0
    vis = visdom.Visdom(env='final')
    line_plot = vis.line(Y=np.array([0]),
                         opts=dict(xlabel='testing count',
                                   ylabel='average reward',
                                   title='ali-v1'))

    start = time.time()
    vis_count = 0
    while True:
        video_count = 1
        reward_all_sum = 0
        reward_all = 0
        reward_all_ave = 0
        reward_gop = 0
        action = 3
        last_action = 3
        # update model before testing all trace files
        # time.sleep(5)
        print('load updated model')
        model.load_state_dict(shared_model.state_dict())
        while True:
            # get the reward for one gop
            while True:
                _, done, decision_flag = env.step_gop(action)
                if decision_flag or done:
                    reward_gop = env.get_reward_gop()
                    state = env.get_state_gop()
                    break
                else:
                    continue
            # print('testing')
            # get action from model
            last_action = action
            with torch.no_grad():
                state = torch.FloatTensor(state)
                logit, _ = model(
                    state.view(-1, args.s_gop_info, args.s_gop_len))
                prob = F.softmax(logit, dim=1)
                _, action = torch.max(prob, 1)
                action = action.data.numpy()[0]

            bitrate, target_buffer = action_map[last_action]
            # print('bitrate: %d, target_buffer: %d, reward is %s' % (bitrate, target_buffer, reward_gop))
            if done:
                print("video count %d, reward is %.5f" %
                      (video_count, reward_all))
                # reward_all_sum += reward_all / 100
                reward_all_sum += reward_all
                video_count += 1
                if reward_all < 0:
                    print('bad model ! just break this loop')
                    reward_all_ave = 0
                    break
                if video_count > env.traces_len * 2:
                    reward_all_ave = reward_all_sum / video_count
                    break
                action = 3
                last_action = 3
                reward_all = 0

            reward_all += reward_gop

        # update the figure of average reward of all testing files
        vis_count += 1
        reward_all_ave = max(reward_all_ave, 0)
        vis.line(Y=np.array([reward_all_ave]),
                 X=np.array([vis_count]),
                 win=line_plot,
                 update='append')
        path = 'ali-v1/actor.pt-' + str(vis_count)
        torch.save(model.state_dict(), path)

        end = time.time()
        hours, rem = divmod(end - start, 3600)
        minutes, seconds = divmod(rem, 60)

        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes),
                                              seconds))
        print("average reward of traces are: ", reward_all_ave)
        print('saved one model in epoch:', vis_count)
Example #26
0
File: main.py Project: AjayTalati/a3c
                    metavar='ENV',
                    help='environment to train on (default: Breakout-v0)')
parser.add_argument('--render',
                    default=False,
                    action='store_true',
                    help='render the environment')

if __name__ == '__main__':
    args = parser.parse_args()

    #torch.manual_seed(args.seed)
    torch.set_num_threads(1)

    env = gym.make(args.env_name)

    global_model = ActorCritic(env.action_space.n)
    global_model.share_memory()
    local_model = ActorCritic(env.action_space.n)

    optimizer = AsyncAdam(global_model.parameters(),
                          local_model.parameters(),
                          lr=args.lr)

    processes = []
    for rank in range(args.num_processes):
        p = mp.Process(target=train,
                       args=(rank, args, global_model, local_model, optimizer))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
Example #27
0
def train(rank, params, shared_model, optimizer):
	torch.manual_seed(params.seed + rank)
	env = create_atari_env(params.env_name) #getting the environment
	env.seed(params.seed + rank)
	model = ActorCritic(env.observation_space.shape[0], env.action_space)
	state = env.reset()
	state = torch.from_numpy(state)
	done = True 
	episode_length = 0
	while True:
		episode_length+=1
		model.load_state_dict(shared_model.state_dict())
		if done:
			cx = Variable(torch.zeros(1,256))
			hx = Variable(torch.zeros(1,256))
		else:
			cx = Variable(cx.data)
			hx = Variable(hx.data)
		values = []
		log_probs = []
		rewards = []
		entropies = []
		for step in range(params.num_steps):
			value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
			prob = F.softmax(action_values)
			log_prob = F.log_softmax(action_values)
			entropy = -(log_prob * prob).sum(1)
			entropies.append(entropy)
			action = prob.multinomial().data
			log_prob = log_prob.gather(1, Variable(action))
			values.append(value)
			log_probs.append(log_prob)
			state, reward, done, _ = env.step(action.numpy())
			done = (done or episode_length >= params.max_episode_length)
			reward = max(min(reward,1), -1)
			if done:
				episode_length = 0
				state = env.reset()
			state = torch.from_numpy(state)
			rewards.append(reward)
			if done:
				break 
		R = torch.zeros(1,1)
		if not done:
			value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
			R = value.data
		values.append(Variable(R))
		policy_loss = 0
		value_loss = 0
		R = Variable(R)
		gae = torch.zeros(1,1)
		for i in reversed(range(len(rewards))):
			R = params.gamma*R + rewards[i]
			advantage = R - values[i]
			value_loss = value_loss + 0.5 * advantage.pow(2)
			TD = rewards[i] + params.gamma * values[i+1].data - values[i].data
			gae = gae * params.gamma * params.tau + TD 
			policy_loss = policy_loss - log_probs[i]*Variable(gae) - 0.01*entropies[i]
		optimizer.zero_grad()
		(policy_loss + 0.5 * value_loss).backward()
		torch.nn.utils.clip_grad_norm(model.parameters(), 40)
		ensure_shared_grads(model, shared_model)
		optimizer.step()
Example #28
0
File: test.py Project: 404akhan/a3c-dlab
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = WrapEnv(args.env_name)
    model = ActorCritic(4, env.num_actions, args.num_skips)

    model.eval()

    state = env.reset()
    state = np.concatenate([state] * 4, axis=0)
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    action_stat = [0] * (model.n_real_acts + model.n_aux_acts)

    start_time = time.time()
    episode_length = 0

    for ep_counter in itertools.count(1):
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())

            if not os.path.exists('model-a3c-aux'):
                os.makedirs('model-a3c-aux')
            torch.save(shared_model.state_dict(),
                       'model-a3c-aux/model-{}.pth'.format(args.model_name))
            print('saved model')

        value, logit = model(Variable(state.unsqueeze(0), volatile=True))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()

        action_np = action[0, 0]
        action_stat[action_np] += 1

        if action_np < model.n_real_acts:
            state_new, reward, done, info = env.step(action_np)

            if args.testing:
                print('episode', episode_length, 'normal action', action_np,
                      'lives', info['ale.lives'])
                env.render()
            state = np.append(state.numpy()[1:, :, :], state_new, axis=0)
            done = done or episode_length >= args.max_episode_length

            reward_sum += reward
            episode_length += 1
        else:
            state = state.numpy()

            for _ in range(action_np - model.n_real_acts + 2):
                state_new, rew, done, info = env.step(
                    0)  # instead of random perform NOOP=0

                if args.testing:
                    print('episode', episode_length, 'no_op action', action_np,
                          'lives', info['ale.lives'])
                    # env.render()
                state = np.append(state[1:, :, :], state_new, axis=0)
                done = done or episode_length >= args.max_episode_length

                reward_sum += rew
                episode_length += 1
                if done:
                    break

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            print("actions stats real {}, aux {}".format(
                action_stat[:model.n_real_acts],
                action_stat[model.n_real_acts:]))

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            state = np.concatenate([state] * 4, axis=0)
            action_stat = [0] * (model.n_real_acts + model.n_aux_acts)
            if not args.testing: time.sleep(60)

        state = torch.from_numpy(state)
Example #29
0
def train(rank, args, shared_model, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
Example #30
0
File: test.py Project: Luo1996/ACER
def test(rank, args, T, shared_model):
  torch.manual_seed(args.seed + rank)

  env = gym.make(args.env)
  env.seed(args.seed + rank)
  model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
  model.eval()

  can_test = True  # Test flag
  t_start = 1  # Test step counter to check against global counter
  rewards, steps = [], []  # Rewards and steps for plotting
  l = str(len(str(args.T_max)))  # Max num. of digits for logging steps
  done = True  # Start new episode

  while T.value() <= args.T_max:
    if can_test:
      t_start = T.value()  # Reset counter

      # Evaluate over several episodes and average results
      avg_rewards, avg_episode_lengths = [], []
      for _ in range(args.evaluation_episodes):
        while True:
          # Reset or pass on hidden state
          if done:
            # Sync with shared model every episode
            model.load_state_dict(shared_model.state_dict())
            hx = Variable(torch.zeros(1, args.hidden_size), volatile=True)
            cx = Variable(torch.zeros(1, args.hidden_size), volatile=True)
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            done, episode_length = False, 0
            reward_sum = 0

          # Optionally render validation states
          if args.render:
            env.render()

          # Calculate policy
          policy, _, _, (hx, cx) = model(Variable(state, volatile=True), (hx.detach(), cx.detach()))  # Break graph for memory efficiency

          # Choose action greedily
          action = policy.max(1)[1].data[0, 0]

          # Step
          state, reward, done, _ = env.step(action)
          state = state_to_tensor(state)
          reward_sum += reward
          done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
          episode_length += 1  # Increase episode counter

          # Log and reset statistics at the end of every episode
          if done:
            avg_rewards.append(reward_sum)
            avg_episode_lengths.append(episode_length)
            break

      print(('[{}] Step: {:<' + l + '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
            datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3],
            t_start,
            sum(avg_rewards) / args.evaluation_episodes,
            sum(avg_episode_lengths) / args.evaluation_episodes))

      if args.evaluate:
        return

      rewards.append(avg_rewards)  # Keep all evaluations
      steps.append(t_start)
      plot_line(steps, rewards)  # Plot rewards
      torch.save(model.state_dict(), 'model.pth')  # Save model params
      can_test = False  # Finish testing
    else:
      if T.value() - t_start >= args.evaluation_interval:
        can_test = True

    time.sleep(0.001)  # Check if available to test every millisecond

  env.close()
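
The ACER snippets above (`T.increment()` in the training loop of the first excerpt, `T.value()` in this test loop) rely on a shared step counter object that is not reproduced here. A minimal sketch of such a counter, assuming a lock-protected `multiprocessing.Value`, is shown below.

from multiprocessing import Lock, Value


class Counter():
    """Global step counter shared safely between actor, learner and test processes."""

    def __init__(self):
        self.val = Value('i', 0)
        self.lock = Lock()

    def increment(self):
        with self.lock:
            self.val.value += 1

    def value(self):
        with self.lock:
            return self.val.value
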