Example #1
            else:
                state = next_state
        # Update monitoring variables & params for the next episode
        scores.append(score)
        print('Episode/Test {} throws an avg of {}'.format(e, score))
    return scores


if __name__ == "__main__":
    # set environment and get state & action size
    env, brain_name, state_size, action_size, num_agents = defineEnvironment(
        path, verbose=True)

    # define agent
    agent = Agent(state_size, action_size, num_agents,
                  SEED, GAMMA, TAU, LR_ACTOR, LR_CRITIC,
                  BUFFER_SIZE, BUFFER_TYPE, POLICY_UPDATE)
    if mode == 'train':
        # train
        scores, checkpoint = train_agent(agent,
                                         env,
                                         brain_name,
                                         n_episodes=EPISODES,
                                         batch_size=BATCH_SIZE)
        # export data
        with open(results_filename, 'wb') as f:
            pickle.dump([scores, checkpoint], f)
    elif mode == 'evaluation':
        weights_filename = 'weights/twenty_agents/actor_batch64_model_weights.pth'
        agent.actor.load_state_dict(torch.load(weights_filename))
        agent.actor.eval()
# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
# print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
# print('The state for the first agent looks like:', states[0])

seed = 0
agent = Agent(state_size, action_size, seed)


def ddpg(n_episodes=2000, max_t=1000):
    scores_deque = deque(maxlen=100)
    scores = []
    max_score = -np.Inf
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations

        agent.reset()
        episode_scores = np.zeros(num_agents)

        for t in range(max_t):
            actions = agent.act(states)
"""
###################################
STEP 5: Initialize DDPG Agents from the Agent Class in dqn_agent.py
A DDPG agent is initialized with the following parameters.
    ======
    state_size (int): dimension of each state (required)
    action_size (int): dimension of each action (required)
    num_agents (int): number of agents in the unity environment
    seed (int): random seed for initializing training point (default = 0)

Here we initialize two agents.
We set the state size to 48 (24*2), so we can feed each agent both agents' state observations.
"""
# Initialize the two agents
agent_1 = Agent(state_size=48, action_size=action_size, num_agents=1, random_seed=0)
agent_2 = Agent(state_size=48, action_size=action_size, num_agents=1, random_seed=0)

# Load trained model weights for agent 1
agent_1.actor_local.load_state_dict(torch.load('ddpgActor1_Model.pth'))
agent_1.critic_local.load_state_dict(torch.load('ddpgCritic1_Model.pth'))

# Load trained model weights for agent 2
agent_2.actor_local.load_state_dict(torch.load('ddpgActor2_Model.pth'))
agent_2.critic_local.load_state_dict(torch.load('ddpgCritic2_Model.pth'))
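# Illustrative sketch (not from the original source): per the STEP 5 note above,
# the 48-dimensional input is both agents' 24-dim observations laid side by side.
# A small helper that builds that shared state from the Unity observations
# (the shapes are assumptions based on the Tennis environment):
import numpy as np

def build_full_state(observations):
    """Flatten the (2, 24) per-agent observations into one (1, 48) row vector."""
    return np.reshape(observations, (1, -1))

# Hypothetical use inside a play loop (env_info is assumed to come from
# env.reset(...)[brain_name] as in the other examples):
#   full_state = build_full_state(env_info.vector_observations)
#   action_1 = agent_1.act(full_state)
#   action_2 = agent_2.act(full_state)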

"""
###################################
STEP 6: Play Banana for the specified number of episodes
"""
# loop over num_episodes
Example #4
 def __init__(self):
     super(MultiAgentDDPG, self).__init__()
     self.config = Config()
     self.agents = [Agent() for _ in range(self.config.num_agents)]
     self.buffer = ReplayBuffer()
Example #5
        self.lr_critic = 1e-4
        self.discount_rate = 0.99  # discount factor
        self.tau = 1e-3
        self.weight_decay = 0
        self.theta = 0.15
        self.sigma = 0.2


config = Config()
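# The theta and sigma fields in the Config above typically parameterize
# Ornstein-Uhlenbeck exploration noise for DDPG. A minimal, self-contained
# sketch of such a process (this class is illustrative, not part of the source):
import numpy as np

class OUNoiseSketch:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)  # long-run mean the noise decays toward
        self.theta = theta            # strength of the pull back toward mu
        self.sigma = sigma            # scale of the random perturbation
        self.reset()

    def reset(self):
        # Restart the process at the mean, e.g. at the start of each episode.
        self.state = self.mu.copy()

    def sample(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1): temporally
        # correlated noise that is added to the actor's action during training.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state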

from collections import deque
import matplotlib.pyplot as plt

from ddpg_agent import Agent
agent1 = Agent(state_size=config.state_size,
               action_size=config.action_size,
               random_seed=1,
               config=config)
agent2 = Agent(state_size=config.state_size,
               action_size=config.action_size,
               random_seed=0,
               config=config)

wandb.watch((agent1.actor_local, agent1.critic_local))

from tqdm import tqdm


def combine_state_action(states, actions=np.zeros((1, config.action_size))):
    states = np.expand_dims(states, 0)
    augmented_state = np.hstack([states, actions])
    return augmented_state
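# Hypothetical usage of combine_state_action above (shapes are assumptions based
# on the config fields, not taken from the original source). Passing no action
# pads the state with zeros, which is useful when querying a critic before an
# action has been chosen:
#
#   obs = np.zeros(config.state_size)                  # one agent's observation
#   x = combine_state_action(obs)                      # -> (1, state_size + action_size)
#   x = combine_state_action(obs, np.ones((1, config.action_size)))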
class MADDPG():
    def __init__(self, state_size, action_size, random_seed):
        """Initialize 2 Agent objects.
        
        Params
        ======
            state_size (int): dimension of one agent's observation
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size
        # Initialize the agents
        self.ddpg_agent0 = Agent(state_size, action_size, random_seed=0)
        self.ddpg_agent1 = Agent(state_size, action_size, random_seed=1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def act(self, states, rand=False):
        """Agents act with actor_local"""
        if rand == False:
            action0 = self.ddpg_agent0.act(states[0])
            action1 = self.ddpg_agent1.act(states[1])
            actions = [action0, action1]
            return actions
        if rand == True:
            actions = np.random.randn(2, 2)
            actions = np.clip(actions, -1, 1)
            return actions

    def step(self, states, actions, rewards, next_states, dones, learn=True):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        state0 = states[0]
        state1 = states[1]

        action0 = actions[0]
        action1 = actions[1]

        reward0 = rewards[0]
        reward1 = rewards[1]

        next_state0 = next_states[0]
        next_state1 = next_states[1]

        done0 = dones[0]
        done1 = dones[1]

        self.memory.add(state0, state1, action0, action1, reward0, reward1,
                        next_state0, next_state1, done0, done1)

        if learn == True and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def learn(self, experiences, GAMMA):
        s0, s1, a0, a1, r0, r1, next_s0, next_s1, d0, d1 = experiences

        # next actions (for CRITIC network)
        a_next0 = self.ddpg_agent0.actor_target(next_s0)
        a_next1 = self.ddpg_agent1.actor_target(next_s1)

        # action predictions (for ACTOR network)
        a_pred0 = self.ddpg_agent0.actor_local(s0)
        a_pred1 = self.ddpg_agent1.actor_local(s1)

        # Each DDPG agent learns separately from its own perspective; that is why the states, actions, etc. are swapped between the two calls below
        self.ddpg_agent0.learn(s0, s1, a0, a1, r0, r1, next_s0, next_s1, d0,
                               d1, a_next0, a_next1, a_pred0, a_pred1)
        self.ddpg_agent1.learn(s1, s0, a1, a0, r1, r0, next_s1, next_s0, d1,
                               d0, a_next1, a_next0, a_pred1, a_pred0)
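# Hypothetical usage of the MADDPG wrapper above (the environment handles and
# the episode loop are assumptions, not part of the original source):
#
#   maddpg = MADDPG(state_size=24, action_size=2, random_seed=0)
#   states = env.reset(train_mode=True)[brain_name].vector_observations
#   while True:
#       actions = maddpg.act(states)                    # one action per agent
#       env_info = env.step(actions)[brain_name]
#       maddpg.step(states, actions, env_info.rewards,
#                   env_info.vector_observations, env_info.local_done)
#       states = env_info.vector_observations
#       if np.any(env_info.local_done):
#           break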
Example #7
 def create_agent(memory):
     return Agent(state_size=states.shape[1],
                  action_size=brain.vector_action_space_size,
                  random_seed=random_seed,
                  memory=memory,
                  batch_size=128)
Example #8
        self.batch_size = 128
        self.lr_actor = 1e-4
        self.lr_critic = 1e-4
        self.discount_rate = 0.99
        self.tau = 1e-3
        self.weight_decay = 0


config = Config()

from collections import deque
import matplotlib.pyplot as plt

from ddpg_agent import Agent
agent = Agent(state_size=state_size,
              action_size=action_size,
              random_seed=1,
              config=config)

from tqdm import tqdm


# Defining the main training loop.
def ddpg(n_episodes=300, max_t=1000):
    scores_deque = deque(maxlen=100)
    scores = []
    max_score = -np.Inf

    for i_episode in tqdm(range(1, n_episodes + 1)):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        score = np.zeros(config.no_agents)
Example #9
env_info = env.reset(train_mode=True)[brain_name]

num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

agent = Agent(state_size, action_size, random_seed=0)

agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))
scores_total = 0
NUM_GAMES = 50
max_time = 1000

for _ in range(NUM_GAMES):
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations
    score = np.zeros(num_agents)
    for t in range(max_time):
        actions = []
        for j in range(num_agents):
            actions.append(agent.act(state[j], add_noise=False))
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

random_seed = 1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
scores = np.zeros(num_agents)

if torch.cuda.is_available():
    trained_model = torch.load('checkpoint_actor.pth')
else:
    trained_model = torch.load('checkpoint_actor.pth',map_location={'cuda:0': 'cpu'})

agent = Agent(state_size=state_size, action_size=action_size, random_seed=random_seed)
agent.actor_local = Actor(state_size, action_size, random_seed).to(device)
agent.actor_local.load_state_dict(trained_model)

env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
states = env_info.vector_observations                  # get the current state (for each agent)

while True:
    action = agent.act(states, add_noise=False)
    env_info = env.step(action)[brain_name]
    states = env_info.vector_observations              # get next state (for each agent)
Example #11
    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # number of actions
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space 
    states = env_info.vector_observations
    # print('States look like:', state)
    state_size = states.shape[1]
    print('States have length:', state_size)

    agent_0 = Agent(state_size=state_size, 
                    action_size=action_size, 
                    num_agents=1, 
                    random_seed=0)

    agent_1 = Agent(state_size=state_size, 
                  action_size=action_size, 
                  num_agents=1, 
                  random_seed=0)                    

    agent_0.actor_local.load_state_dict(torch.load(result.actor_0_model, map_location='cpu'))
    agent_0.critic_local.load_state_dict(torch.load(result.critic_0_model, map_location='cpu'))
    agent_1.actor_local.load_state_dict(torch.load(result.actor_1_model, map_location='cpu'))
    agent_1.critic_local.load_state_dict(torch.load(result.critic_1_model, map_location='cpu'))

    # Set environment to evaluation mode
    env_info = env.reset(train_mode=False)[brain_name]        
    states = env_info.vector_observations                  
from ddpg_agent import Agent

from collections import deque

if __name__ == "__main__":
    # Get the default financial and AC Model parameters
    financial_params, ac_params = utils.get_env_param()
    print(financial_params)
    print(ac_params)

    # Create simulation environment
    env = sca.MarketEnvironment()

    # Initialize Feed-forward DNNs for Actor and Critic models.
    agent1 = Agent(state_size=env.observation_space_dimension(),
                   action_size=env.action_space_dimension(),
                   random_seed=1225)
    agent2 = Agent(state_size=env.observation_space_dimension(),
                   action_size=env.action_space_dimension(),
                   random_seed=108)
    # Set the liquidation time
    lqt = 60

    # Set the number of trades
    n_trades = 60

    # Set trader's risk aversion
    tr1 = 1e-6
    tr2 = 1e-6

    # Set the number of episodes to run the simulation
Example #13
# number of agents
num_agents = len(env_info.agents)
print("Number of agents:", num_agents)

# size of each action
action_size = brain.vector_action_space_size
print("Size of each action:", action_size)

# examine the state space
states = env_info.vector_observations
cstate_size = states.shape[1]
print("There are {} agents. Each observes a state with length: {}".format(
    states.shape[0], cstate_size))
print("The state for the first agent looks like:", states[0])

agent = Agent(state_size=cstate_size, action_size=action_size, random_seed=42)


def ddpg(n_episodes=5000, max_t=2000, print_every=10):
    scores_deque = deque(maxlen=print_every)
    t_scores = []
    agent.reset()
    print(str(datetime.datetime.now()) + " Training started")
    for i_episode in range(0, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get current state (for each agent)
        scores = np.zeros(num_agents)  # initialize the score (for each agent)
        dones = env_info.local_done  # see if episode finished
        for _ in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]  # send actions to the env
Example #14
def train(env_pth, n_episodes=500, output='output'):

    BATCH_SIZE = 64      # minibatch size
    SEED = 2
    os.makedirs(output, exist_ok=True)

    # load environment
    env = UnityEnvironment(file_name=env_pth)

    # get the default brain name
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    logger.info(f'Number of agents: {num_agents}')

    # size of each action
    action_size = brain.vector_action_space_size
    logger.info(f'Size of each action: {action_size}')

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    logger.info(f'There are {states.shape[0]} agent(s). Each observes a state with length: {state_size}')
    logger.info(f'The state for the first agent looks like: {states[0]}')

    # create agent
    agent = Agent(state_size, action_size, SEED, BATCH_SIZE)

    def ddpg(n_episodes, average_window=100, output='output'):
        scores_deque = deque(maxlen=average_window)
        scores_all = []

        for i_episode in range(1, n_episodes+1):
            env_info = env.reset(train_mode=True)[brain_name]        # reset the environment
            states = env_info.vector_observations                    # get the current state (for each agent)
            scores = np.zeros(num_agents)                            # initialize the score (for each agent)

            while True:
                actions = agent.act(states)                          # select an action (for each agent)
                env_info = env.step(actions)[brain_name]             # send all actions to the environment
                next_states = env_info.vector_observations           # get next state (for each agent)
                rewards = env_info.rewards                           # get reward (for each agent)
                dones = env_info.local_done                          # see if episode finished

                for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                    agent.step(state, action, reward, next_state, done)

                scores += rewards                                    # update the score (for each agent)
                states = next_states                                 # roll over states to next time step
                if np.any(dones):                                    # exit loop if episode finished
                    break

            average_score_episode = np.mean(scores)
            scores_deque.append(average_score_episode)
            scores_all.append(average_score_episode)
            average_score_queue = np.mean(scores_deque)

            logger.info(f'\rEpisode {i_episode}\tScores: {average_score_episode:.2f}\tAverage Score: {average_score_queue:.2f}')
            torch.save(agent.actor_local.state_dict(), f'{output}/checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), f'{output}/checkpoint_critic.pth')
            if i_episode > average_window and average_score_queue > 30:
                break
        
        return scores_all

    scores = ddpg(n_episodes=n_episodes, output=output)
    plot_rewards(scores, output)

    env.close()
    env = UnityEnvironment(file_name='./Reacher_Linux/Reacher',
                           no_graphics=not args.visualize
                           and not args.watch_one_episode)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    agent = Agent(state_size=len(env_info.vector_observations[0]),
                  action_size=brain.vector_action_space_size,
                  actor_hidden_layers=args.actor_hidden_layers,
                  critic_hidden_layers=args.critic_hidden_layers,
                  ou_theta=args.ou_theta,
                  ou_sigma=args.ou_sigma,
                  ou_theta_decay=args.ou_theta_decay,
                  ou_sigma_decay=args.ou_sigma_decay,
                  energy_penalty=args.energy_penalty,
                  random_seed=0)

    if args.load:
        load_agent()
    if args.watch_one_episode:
        watch_one_episode(args.slow_by)
    if args.train:
        scores = ddpg(n_episodes=args.n_episodes,
                      slow_every=args.slow_every,
                      slow_by=args.slow_by)
        print(scores)
        outfile = open('scores.txt', 'w')
Example #16
import gym
import torch
import numpy as np
from ddpg_agent import Agent
import matplotlib.pyplot as plt
from noise import OUNoise

env = gym.make('BipedalWalker-v3')

state_dim = int(env.observation_space.shape[0])
action_dim = int(env.action_space.shape[0])
agent = Agent(state_size=state_dim, action_size=action_dim)


def ddpg(episodes, step, pretrained=False, noise=True):

    if pretrained:
        agent.actor_local.load_state_dict(
            torch.load('./models/weights/checkpoint_actor.pth',
                       map_location="cpu"))
        agent.critic_local.load_state_dict(
            torch.load('./models/weights/checkpoint_critic.pth',
                       map_location="cpu"))
        agent.actor_target.load_state_dict(
            torch.load('./models/weights/checkpoint_actor.pth',
                       map_location="cpu"))
        agent.critic_target.load_state_dict(
            torch.load('./models/weights/checkpoint_critic.pth',
                       map_location="cpu"))

    reward_list = []
Example #17
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=False)[brain_name]

# number of actions and state size
action_size = brain.vector_action_space_size
state = env_info.vector_observations[0]
state_size = len(state)

# Instantiate the agent
agent = Agent(state_size=state_size,
              action_size=action_size * 2,
              random_seed=64)

# Load the weights from file
agent.actor_local.load_state_dict(torch.load(actor_file))
agent.critic_local.load_state_dict(torch.load(critic_file))

env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
state = env_info.vector_observations[0]  # get the current state
score = 0  # initialize the score
while True:
    action = agent.act(state, False)  # Don't add noise during test!
    env_info = env.step(action)[
        brain_name]  # send the action to the environment
    next_state = env_info.vector_observations[0]  # get the next state
    reward = env_info.rewards[0]  # get the reward
def ddpg(n_episodes=500, max_t=200, train_mode=True):
    env = UnityEnvironment(file_name='./1_agent/Reacher.app')
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=train_mode)[brain_name]

    states = env_info.vector_observations

    agent = Agent(state_size=states.shape[1],
                  action_size=action_size,
                  random_seed=2)

    brain_name = env.brain_names[0]
    scores = []
    scores_deque = deque(maxlen=100)
    max_score = -np.Inf

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=train_mode)[brain_name]
        num_agents = len(env_info.agents)
        #         agent.reset()
        score = 0
        states = env_info.vector_observations
        #         while True:
        for t in range(max_t):
            agent.reset()
            actions = agent.act(states)
            #             actions = np.clip(actions, -1,1)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            #             rewards = [1.0  if x > 0.0 else 0.0 for x in rewards]
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += np.mean(env_info.rewards)
            if np.any(dones):
                break
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
            i_episode, np.mean(scores_deque), score),
              end="")

        if i_episode % 100 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')
            print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
                i_episode, np.mean(scores_deque), score),
                  end="")
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))

        if np.mean(scores_deque) >= 30.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode, np.mean(scores_deque)))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic.pth')
            break

    env.close()
    return scores
Example #19
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
# print('The state for the first agent looks like:', states[0])

agent = Agent(state_size=state_size,
              action_size=action_size,
              num_agents=num_agents,
              random_seed=2)


def save_model(filename):
    torch.save(agent.actor_local.state_dict(), '{}_actor.pth'.format(filename))
    torch.save(agent.critic_local.state_dict(),
               '{}_critic.pth'.format(filename))


def ddpg(n_episodes=2000, print_every=100, save_every=100):
    avg_solved = 0
    scores_deque = deque(maxlen=100)
    scores_global = []
    avg_global = []
Example #20
financial_params
ac_params

import numpy as np

import syntheticChrissAlmgren as sca
from ddpg_agent import Agent

from collections import deque

# Create simulation environment
env = sca.MarketEnvironment()

# Initialize Feed-forward DNNs for Actor and Critic models.
agent = Agent(state_size=env.observation_space_dimension(),
              action_size=env.action_space_dimension(),
              random_seed=0)

# Set the liquidation time
lqt = 60

# Set the number of trades
n_trades = 60

# Set trader's risk aversion
tr = 1e-6

# Set the number of episodes to run the simulation
episodes = 10000

shortfall_hist = np.array([])
Example #21
env1_path = Path("./Reacher_Windows_x86_64_v1/Reacher.app")
env2_path = Path("./Reacher_Windows_x86_64_v2/Reacher.app")
env_path = str(env1_path.resolve())
env = UnityEnvironment(file_name=env_path)

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
state_size = env_info.vector_observations.shape[1]
action_size = env_info.previous_vector_actions.shape[1]
NUM_AGENTS = 20
SEED = 72
agent = Agent(state_size=state_size, action_size=action_size, random_seed=SEED)
writer = tbx.SummaryWriter()


def ddpg(n_episodes=2000, max_t=1000):
    scores_deque = deque(maxlen=100)
    scores = []
    timesteps = 0
    tbx_counter = 0
    max_score = -np.Inf

    for i_episode in range(1, n_episodes + 1):
        state = env.reset(train_mode=True)[brain_name].vector_observations
        agent.reset()
        score = 0
        for t in range(max_t):
Example #22
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

parser = argparse.ArgumentParser()
parser.add_argument('--train', dest='train', action='store_true', help='train agent locally')
parser.add_argument('--test', dest='test', action='store_true', help='test agent locally')
args = parser.parse_args()

agent = Agent(state_size=state_size, action_size=action_size, random_seed=0)

def ddpg(n_episodes=300, max_t=1000, solved_score=30.0, print_every=100):
    scores_window = deque(maxlen=print_every)
    scores = []
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]      # reset the environment
        states = env_info.vector_observations                  # get the current state (for each agent)
        score = np.zeros(num_agents)                           # initialize the score (for each agent)
        agent.reset()
        for t in range(max_t):
            actions = agent.act(states, add_noise=True)        # select an action (for each agent)
            actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
            env_info = env.step(actions)[brain_name]           # send all actions to the environment
            next_states = env_info.vector_observations         # get next state (for each agent)
            rewards = env_info.rewards                         # get reward (for each agent)
def main():

    # load version 2 (with 20 agents) of the environment
    env_name = "Reacher_Windows_x86_64_version1\Reacher.exe"  # add a Unity-Environment name.
    env = UnityEnvironment(file_name=env_name)

    # Environments contain brains, which are responsible for deciding the actions of their associated agents. Here we check for the first brain available and set it as the default brain we will be controlling from Python.

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print("Number of agents : ", num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print("Size of each action : ", action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print("There are {} agents. Each observes a state with length: {}".format(
        states.shape[0], state_size))

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    random_seed = 12345  #10
    agent = Agent(state_size, action_size, random_seed, device=device)

    actor_state_dict = torch.load("checkpoint_actor.pth")
    agent.actor_local.load_state_dict(actor_state_dict)
    critic_state_dict = torch.load("checkpoint_critic.pth")
    agent.critic_local.load_state_dict(critic_state_dict)

    # Take Random Actions in the Environment
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations  # get the current state (for each agent)

    scores = np.zeros(num_agents)  # initialize the score (for each agent)
    while True:
        actions = agent.act(
            states, add_noise=False)  # select an action (for each agent)
        env_info = env.step(actions)[
            brain_name]  # send all actions to the environment
        next_states = env_info.vector_observations  # get next state (for each agent)
        rewards = env_info.rewards  # get reward (for each agent)
        dones = env_info.local_done  # see if episode finished

        scores += env_info.rewards  # update the score (for each agent)
        states = next_states  # roll over states to next time step
        if np.any(dones):  # exit loop if episode finished
            break
    print('Total score (averaged over agents) this episode: {}'.format(
        np.mean(scores)))

    # When finished, you can close the environment
    env.close()
Example #24
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

state_size = state_size * 2
agent_1 = Agent(state_size=state_size, action_size=action_size, random_seed=1)
agent_2 = Agent(state_size=state_size, action_size=action_size, random_seed=1)

actor_1_weights = "actor_1_model.pth"
actor_2_weights = "actor_2_model.pth"

critic_1_weights = "critic_1_model.pth"
critic_2_weights = "critic_2_model.pth"


def ddpg(n_episodes=100000, max_t=20000):

    scores_deque = deque(maxlen=100)
    total_scores = []
    average_scores = []
Example #25
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of actions and state size
action_size = brain.vector_action_space_size
state = env_info.vector_observations[0]
state_size = len(state)

# Instantiate the agent
agent = Agent(state_size=state_size,
              action_size=action_size * 2,
              random_seed=64)


def ddpg(n_episodes=10000, max_t=10000):
    """DDPG
    
    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
    """
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
Example #26
def train(
    n_episodes,
    max_t,
    env_fp,
    no_graphics,
    seed,
    save_every_nth,
    buffer_size,
    batch_size,
    gamma,
    tau,
    lr_actor,
    lr_critic,
    weight_decay,
    log,
):
    log.info("#### Initializing environment...")
    # init environment
    env = UnityEnvironment(file_name=env_fp, no_graphics=no_graphics)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    log.info(f"Number of agents: {num_agents}")

    # size of each action
    action_size = brain.vector_action_space_size
    log.info(f"Size of each action: {action_size}")

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    log.info(
        f"There are {states.shape[0]} agents. Each observes a state with length: {state_size}"
    )
    log.info(f"The state for the first agent looks like: {states[0]}")

    agent = Agent(
        num_agents=len(env_info.agents),
        state_size=state_size,
        action_size=action_size,
        buffer_size=buffer_size,
        batch_size=batch_size,
        gamma=gamma,
        tau=tau,
        lr_actor=lr_actor,
        lr_critic=lr_critic,
        weight_decay=weight_decay,
        random_seed=seed,
    )

    log.info("#### Training...")

    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes + 1):
        brain_name = env.brain_names[0]
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        score = np.zeros((len(env_info.agents), 1))
        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            rewards = np.array(rewards).reshape((next_states.shape[0], 1))
            dones = env_info.local_done
            dones = np.array(dones).reshape((next_states.shape[0], 1))
            agent.step(states, actions, rewards, next_states, dones)
            score += rewards
            states = next_states
            if np.any(dones):
                break
        scores_deque.append(np.max(score))
        scores.append(np.max(score))
        print(
            "Episode {}\tAverage Score: {:.2f}\tScore: {:.2f}".format(
                i_episode, np.mean(scores_deque), scores[-1]),
            end="\r",
        )

        if i_episode % 100 == 0:
            print("\rEpisode {}\tAverage Score: {:.2f}".format(
                i_episode, np.mean(scores_deque)))
        if i_episode % save_every_nth == 0:
            save_checkpoint(
                state={
                    "episode": i_episode,
                    "actor_state_dict": agent.actor_local.state_dict(),
                    "critic_state_dict": agent.critic_local.state_dict(),
                    "scores_deque": scores_deque,
                    "scores": scores,
                },
                filename="checkpoint.pth",
            )
        if np.mean(scores_deque) >= 0.5:
            save_checkpoint(
                state={
                    "episode": i_episode,
                    "actor_state_dict": agent.actor_local.state_dict(),
                    "critic_state_dict": agent.critic_local.state_dict(),
                    "scores_deque": scores_deque,
                    "scores": scores,
                },
                filename="checkpoint_solved.pth",
            )
            print(
                "\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}"
                .format(i_episode - 100, np.mean(scores_deque)))
            break
GRAPHICS_OFF = False

n_episodes = 3

env = UnityEnvironment(file_name=ENV_PATH, no_graphics=GRAPHICS_OFF)
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=GRAPHICS_OFF)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]

agents = Agent(state_size=state_size,
               action_size=action_size,
               num_agents=num_agents,
               random_seed=0)
agents.actor_local.load_state_dict(
    torch.load("ckpt/{}".format(ACTOR_CHECKPOINT_NAME)))
agents.critic_local.load_state_dict(
    torch.load("ckpt/{}".format(CRITIC_CHECKPOINT_NAME)))

for i_episode in range(1, n_episodes + 1):
    print('Starting episode {}'.format(i_episode))
    env_info = env.reset(train_mode=GRAPHICS_OFF)[brain_name]
    state = env_info.vector_observations
    agents.reset()
    score = np.zeros(num_agents)
    while True:
        action = agents.act(state)
Example #28
# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])
agent = Agent(state_size=state_size, action_size=action_size, random_seed=2)
pretrained_dict_actor = torch.load(
    opt.model_pth_actor, map_location=lambda storage, location: storage)
pretrained_dict_critic = torch.load(
    opt.model_pth_critic, map_location=lambda storage, location: storage)
model_dict_actor = agent.actor_local.state_dict()
model_dict_critic = agent.critic_local.state_dict()

# 1. filter out unnecessary keys
pretrained_dict_actor = {
    k: v
    for k, v in pretrained_dict_actor.items() if k in model_dict_actor
}
pretrained_dict_critic = {
    k: v
    for k, v in pretrained_dict_critic.items() if k in model_dict_critic
}
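# The usual continuation of this pattern (an assumption here, since the excerpt
# is cut off): overwrite the matching entries and load the merged state dicts.
model_dict_actor.update(pretrained_dict_actor)
model_dict_critic.update(pretrained_dict_critic)
agent.actor_local.load_state_dict(model_dict_actor)
agent.critic_local.load_state_dict(model_dict_critic)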
Example #29
print(state_size)
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])
#import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
#%matplotlib inline

from ddpg_agent import Agent

print('state size, action size', state_size, action_size)
agent = Agent(state_size, action_size, random_seed=5)


def ddpg(n_episodes=200, max_t=1000, print_every=1, plot_every=25):
    scores_deque = deque(maxlen=print_every)
    scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        #print('state size', state.shape)
        agent.reset()
        score = 0

        for t in range(max_t):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
Example #30
File: training.py, Project: hooniedh/tennis
            brain = env.brains[brain_name]
            action_size = brain.vector_action_space_size
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]
            state_size = len(state)
            num_agents = len(env_info.vector_observations)

            print("num agents {} state size {} action size {}".format(
                num_agents, state_size, action_size))

            agent = Agent(state_size,
                          action_size,
                          random_seed=0,
                          buffer_size=int(1e5),
                          batch_size=512,
                          tau=1e-3,
                          lr_actor=3e-4,
                          lr_critic=3e-4,
                          critic_weight_decay=0.0,
                          update_every=4,
                          update_num_repeats=2,
                          noise_decay=0.999)

            if isForTrain is True:
                print("start training....")
                Train(env, agent, num_agents, num_episodes=3000)
            else:
                Test(env, agent, num_agents, num_episodes=200)

        except KeyboardInterrupt:
            print("Keyboard interrupted")
            env.close()