Example #1
    def __init__(self,
                 env,
                 batch_size,
                 replay_capacity,
                 episodes_before_train,
                 device='cpu'):

        self.env = env
        self.n_agents = env.n
        self.memory = memory.ReplayMemory(replay_capacity)

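        # One actor network and one critic network per agent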
        self.actors = [
            ActorNet(env.observation_space[i].shape[0], env.action_space[i].n)
            for i in range(self.n_agents)
        ]
        self.critics = [
            CriticNet(env.observation_space[i].shape[0], env.n)
            for i in range(self.n_agents)
        ]

        self.critic_optimizers = [
            Adam(x.parameters(), lr=0.01) for x in self.critics
        ]
        self.actor_optimizers = [
            Adam(x.parameters(), lr=0.01) for x in self.actors
        ]

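        # Target networks start as deep copies of the online networks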
        self.actor_targets = deepcopy(self.actors)
        self.critic_targets = deepcopy(self.critics)

        self.device = device
        self.episodes_before_train = episodes_before_train
        self.batch_size = batch_size

        self.GAMMA = 0.95
        self.epsilon = 0.3

        # Move all online and target networks to the requested device
        for x in (self.actors + self.critics +
                  self.actor_targets + self.critic_targets):
            x.to(device)
Example #2
    target_net = DQN(output=4).to(device)
    target_net.load_state_dict(policy_net.state_dict())

    optimizer = torch.optim.Adam(policy_net.parameters(), lr=lr)

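    # Create the Pong environment and apply the project's wrappers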
    env = gym.make('PongNoFrameskip-v4')
    #env = gym.make('Pong-v0')
    env = envwrapper.make_env(env)

    # prepare memory
    OPTIMIZE_THRESHOLD =  1000
    capacity = OPTIMIZE_THRESHOLD * 10

    replaymemory = memory.ReplayMemory(capacity)

    episode_rewards = train(env, EPISODE_NUM)

    plot_rewards(episode_rewards)
    
    torch.save(policy_net, 'dqn_pong_model')
    policy_net = torch.load('dqn_pong_model')
    test(env, 1, policy_net)

'''
    print(env.action_space)

    # select action to interact with env
    for i in range(10):
        selected_action = select_action(get_state(env.reset()))
Example #3
    def __init__(
            self,
            # ddqn parameters
            connection_label="lonely_worker",
            q_network_type='simple',
            target_q_network_type='simple',
            gamma=0.99,
            target_update_freq=10000,
            train_freq=3,
            num_burn_in=300,
            batch_size=32,
            optimizer='adam',
            loss_func="mse",
            max_ep_length=1000,
            experiment_id="Exp_1",
            model_checkpoint=True,
            opt_metric=None,
            # environment parameters
            net_file="cross.net.xml",
            route_file="cross.rou.xml",
            network_dir="./network",
            demand="nominal",
            state_shape=(1, 11),
            num_actions=2,
            use_gui=False,
            delta_time=10,
            reward="balanced",
            # memory parameters
            max_size=100000,
            # additional parameters
            policy="linDecEpsGreedy",
            eps=0.1,
            num_episodes=2,
            monitoring=False,
            episode_recording=False,
            hparams=None):

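        # Describe this run by the hyperparameters listed in hparams (used when creating the output folder)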
        if hparams:
            args_description = locals()
            args_description = str(
                {key: args_description[key]
                 for key in hparams})
        else:
            args_description = "single_worker"

        self.connection_label = connection_label
        self.q_network_type = q_network_type
        self.target_q_network_type = target_q_network_type
        self.gamma = gamma
        self.target_update_freq = target_update_freq
        self.train_freq = train_freq
        self.num_burn_in = num_burn_in
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.loss_func = loss_func
        self.max_ep_length = max_ep_length
        self.experiment_id = experiment_id
        self.model_checkpoint = model_checkpoint
        self.opt_metric = opt_metric

        # additional parameters
        self.policy = policy
        self.eps = eps
        self.num_episodes = num_episodes
        self.monitoring = monitoring
        self.episode_recording = episode_recording
        self.output_dir, self.summary_writer_folder = tools.get_output_folder(
            "./logs", self.experiment_id, args_description)
        self.summary_writer = tf.summary.FileWriter(
            logdir=self.summary_writer_folder)

        # environment parameters
        self.net_file = os.path.join(network_dir, net_file)
        self.route_file = os.path.join(self.output_dir, route_file)
        self.demand = demand
        self.state_shape = state_shape
        self.num_actions = num_actions
        self.use_gui = use_gui
        self.delta_time = delta_time
        self.reward = reward

        # memory parameters
        self.max_size = max_size

        # Initialize Q-networks (value and target)
        self.q_network = agent.get_model(model_name=self.q_network_type,
                                         input_shape=(self.state_shape[1], ),
                                         num_actions=self.num_actions)

        self.target_q_network = agent.get_model(
            model_name=self.target_q_network_type,
            input_shape=(self.state_shape[1], ),
            num_actions=self.num_actions)

        # Initialize environment
        self.env = environment.Env(connection_label=self.connection_label,
                                   net_file=self.net_file,
                                   route_file=self.route_file,
                                   demand=self.demand,
                                   state_shape=self.state_shape,
                                   num_actions=self.num_actions,
                                   policy=self.policy,
                                   use_gui=self.use_gui,
                                   eps=self.eps,
                                   reward=self.reward)

        # Initialize replay memory
        self.memory = memory.ReplayMemory(max_size=self.max_size,
                                          state_shape=self.state_shape,
                                          num_actions=self.num_actions)

        # Initialize Double DQN algorithm
        self.ddqn = doubledqn.DoubleDQN(
            q_network=self.q_network,
            target_q_network=self.target_q_network,
            memory=self.memory,
            gamma=self.gamma,
            target_update_freq=self.target_update_freq,
            train_freq=self.train_freq,
            num_burn_in=self.num_burn_in,
            batch_size=self.batch_size,
            optimizer=self.optimizer,
            loss_func=self.loss_func,
            max_ep_length=self.max_ep_length,
            env_name=self.env,
            output_dir=self.output_dir,
            monitoring=self.monitoring,
            episode_recording=self.episode_recording,
            experiment_id=self.experiment_id,
            summary_writer=self.summary_writer)

        # Store initialization parameters
        self.store_init(locals())
Example #4
import policy
import time
import torch
import memory
import random
import math
import torch.nn.functional as F
import nstep
import slider

agent = policy.policy()

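# Lagged copy of the policy with the same initial weights (presumably the target network)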
lagged_agent = policy.policy()
lagged_agent.copy_weights(agent)

replay_memory_size = 100000
replay_memory = memory.ReplayMemory(replay_memory_size)

# export OMP_NUM_THREADS=1
def live(iterations, batch_size, lagg, eps, improve_flag, num_steps):
    n_step = nstep.Nstep(num_steps)
    g = slider.Game()
    state = g.get_state()
    total_reward = 0
    start = time.time()
    for i in range(iterations):

        # eps-greedy
        if random.uniform(0,1) < eps:
            action = random.randint(0,3)
        else:
            action = agent.get_action(state)
Example #5
    empty_state = np.zeros_like(state, dtype=int)

    _, in_h, in_w = state.shape

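    # Try to resume from a previously saved training checkpoint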
    try:
        checkpoint = torch.load(os.path.join(save_dir, training_state_file))
    except FileNotFoundError:
        checkpoint = None

    policy_net = model.DQNetwork(state_size, action_size, in_h,
                                 in_w).to(device)
    optimizer = optim.RMSprop(policy_net.parameters())

    # Memory initialization
    mem = memory.ReplayMemory(memory_size)

    episodes_done = 0
    steps_done = 0

    if checkpoint is not None:
        policy_net.load_state_dict(checkpoint['policy_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        policy_net.eval()
        episodes_done = checkpoint['episode']
        steps_done = checkpoint['steps_done']

        print(f'Restoring from latest checkpoint, episode {episodes_done + 1}')

    target_net = model.DQNetwork(state_size, action_size, in_h,
                                 in_w).to(device)
Example #6
def main_training_loop():

    fixed_states = test.get_fixed_states()
    env = gym.make('BreakoutNoFrameskip-v0')

    n_actions = env.action_space.n

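    # The policy (online) network and the target network share the same architecture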
    policy_net = DeepQNetwork(constants.STATE_IMG_HEIGHT,
                              constants.STATE_IMG_WIDTH,
                              constants.N_IMAGES_PER_STATE, n_actions)

    target_net = DeepQNetwork(constants.STATE_IMG_HEIGHT,
                              constants.STATE_IMG_WIDTH,
                              constants.N_IMAGES_PER_STATE, n_actions)
    criterion = torch.nn.MSELoss()
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = torch.optim.RMSprop(policy_net.parameters(),
                                    lr=constants.LEARNING_RATE,
                                    momentum=0.95)
    replay_memory = memory.ReplayMemory(constants.REPLAY_MEMORY_SIZE)

    steps_done = 0
    epoch = 0
    information = [[
        "epoch", "n_steps", "avg_reward", "avg_score", "n_episodes",
        "avg_q_value"
    ]]
    try:
        for i_episode in range(constants.N_EPISODES):

            cumulative_screenshot = []

            # Prepare the cumulative screenshot
            padding_image = torch.zeros(
                (1, constants.STATE_IMG_HEIGHT, constants.STATE_IMG_WIDTH))
            for i in range(constants.N_IMAGES_PER_STATE - 1):
                cumulative_screenshot.append(padding_image)

            env.reset()
            episode_score = 0
            episode_reward = 0

            screen_grayscale_state = get_screen(env)
            cumulative_screenshot.append(screen_grayscale_state)

            state = utils.process_state(cumulative_screenshot)

            prev_state_lives = constants.INITIAL_LIVES

            for i in range(constants.N_TIMESTEP_PER_EP):
                if constants.SHOW_SCREEN:
                    env.render()

                action = select_action(state, policy_net, steps_done, env)
                _, reward, done, info = env.step(action)
                episode_score += reward

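                # Clip the reward to {-1, 0, +1}; losing a life also counts as -1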
                reward_tensor = None
                if info["ale.lives"] < prev_state_lives:
                    reward_tensor = torch.tensor([-1])
                    episode_reward += -1
                elif reward > 0:
                    reward_tensor = torch.tensor([1])
                    episode_reward += 1
                elif reward < 0:
                    reward_tensor = torch.tensor([-1])
                    episode_reward += -1
                else:
                    reward_tensor = torch.tensor([0])

                prev_state_lives = info["ale.lives"]

                screen_grayscale = get_screen(env)
                cumulative_screenshot.append(screen_grayscale)
                cumulative_screenshot.pop(0)  # Drop the oldest frame to save memory

                if done:
                    next_state = None
                else:
                    next_state = utils.process_state(cumulative_screenshot)

                replay_memory.push(state, action, next_state, reward_tensor)

                if next_state is not None:
                    state.copy_(next_state)

                optimize_model(target_net, policy_net, replay_memory,
                               optimizer, criterion)
                steps_done += 1

                if done:
                    print("Episode:", i_episode, "Steps done:", steps_done,
                          "- Episode reward:", episode_reward,
                          "- Episode score:", episode_score)
                    break

                # Periodically sync the target network with the policy network
                if steps_done % constants.TARGET_UPDATE == 0:
                    target_net.load_state_dict(policy_net.state_dict())

                # Epoch test
                if steps_done % constants.STEPS_PER_EPOCH == 0:
                    epoch += 1
                    epoch_reward_average, epoch_score_average, n_episodes, q_values_average = test.test_agent(
                        target_net, fixed_states)
                    information.append([
                        epoch, steps_done, epoch_reward_average,
                        epoch_score_average, n_episodes, q_values_average
                    ])
                    print("INFO", [
                        epoch, steps_done, epoch_reward_average,
                        epoch_score_average, n_episodes, q_values_average
                    ])

        # Save test information in dataframe
        print("Saving information...")
        information_numpy = numpy.array(information)
        dataframe_information = pandas.DataFrame(
            columns=information_numpy[0, :], data=information_numpy[1:, :])
        dataframe_information.to_csv("info/results.csv")
        print(dataframe_information)

        # Save target parameters in file
        torch.save(target_net.state_dict(), "info/nn_parameters.txt")

    except KeyboardInterrupt:
        # Save test information in dataframe
        print("Saving information...")
        information_numpy = numpy.array(information)
        dataframe_information = pandas.DataFrame(
            columns=information_numpy[0, :], data=information_numpy[1:, :])
        dataframe_information.to_csv("info/results.csv")
        print(dataframe_information)

        # Save target parameters in file
        torch.save(target_net.state_dict(), "info/nn_parameters.txt")

    env.close()