示例#1
0
  def __init__(self, config):
    self.config = config

    # Create session
    self.session = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
        allow_growth=True)))

    # Create networks
    self.prior_network = PolicyNetwork(
        scope=config.prior_network,
        temperature=config.prior_temperature,
        use_symmetry=config.use_symmetry)

    self.rollout_network = PolicyNetwork(
        scope=config.rollout_network,
        temperature=config.rollout_temperature,
        reuse=config.prior_network == config.rollout_network,
        use_symmetry=config.use_symmetry)

    self.value_network = ValueNetwork(
        scope=config.value_network, use_symmetry=config.use_symmetry)

    # Load networks from checkpoints
    run_dir = util.run_directory(config)
    util.restore_network_or_fail(self.session, run_dir, self.prior_network)
    util.restore_network_or_fail(self.session, run_dir, self.rollout_network)
    util.restore_network_or_fail(self.session, run_dir, self.value_network)

    # Create queues
    self.prior_queue = AllQueue()
    self.rollout_queue = AllQueue(maxsize=16)
    self.value_queue = AllQueue(maxsize=16)

    self.new_game()
示例#2
0
def selfplay_with_noise(sess, trained_model_dir, uniform_sample=False):
    print("-----------Load Trained Model------------")
    policy_estimator = PolicyNetwork(input_size=len(env.state), output_size=env.action_space.n)
    policy_estimator.restore(sess=sess,
                             checkpoint_file=os.path.join(trained_model_dir, 'episodes_99999_model.ckpt'))
    reward_estimator = RewardNetwork(input_size=len(env.state), output_size=1)
    reward_estimator.restore(sess=sess,
                             checkpoint_file=os.path.join(trained_model_dir, 'episodes_99999_model.ckpt'))

    print("-----------Simulation with Different Noises------------")
    noise_test_dir = os.path.join(trained_model_dir, "noise-test")
    if not os.path.exists(noise_test_dir):
        os.makedirs(noise_test_dir)
    tag_str = "uniform.winrate.txt" if uniform_sample else "weighted.winrate.txt"
    filename = os.path.join(noise_test_dir, tag_str)
    fp = open(filename, "w")
    fp.write("noise\twinrate\n")

    noises = []
    accuracys = []
    for i in range(15):
        noise = i * 0.01
        win_rate = self_play(env,
                             policy_estimator,
                             reward_estimator,
                             uniform_sample=uniform_sample,
                             num_episodes=1000,
                             noise=noise)
        # Keep statistics for noise-accuracy curve
        noises.append(noise)
        accuracys.append(win_rate)
        fp.write(str(noise) + "\t" + str(win_rate) + "\n")
        fp.flush()
    fp.close()
示例#3
0
文件: agent.py 项目: mfsuve/TORCS-RL
    def __init__(self, load_from=None, will_train=True):
        self.env = TorcsEnv(
            path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
        self.args = SAC_args()
        self.buffer = ReplayBuffer(self.args.buffer_size)

        action_dim = self.env.action_space.shape[0]
        state_dim = self.env.observation_space.shape[0]
        hidden_dim = 256

        self.action_size = action_dim
        self.state_size = state_dim

        self.value_net = ValueNetwork(state_dim,
                                      hidden_dim).to(self.args.device)
        self.target_value_net = ValueNetwork(state_dim,
                                             hidden_dim).to(self.args.device)

        self.soft_q_net1 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)
        self.soft_q_net2 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.policy_net = PolicyNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.target_value_net.load_state_dict(self.value_net.state_dict())

        self.value_criterion = nn.MSELoss()
        self.soft_q_loss1 = nn.MSELoss()
        self.soft_q_loss2 = nn.MSELoss()

        self.value_opt = optim.Adam(self.value_net.parameters(),
                                    lr=self.args.lr)
        self.soft_q_opt1 = optim.Adam(self.soft_q_net1.parameters(),
                                      lr=self.args.lr)
        self.soft_q_opt2 = optim.Adam(self.soft_q_net2.parameters(),
                                      lr=self.args.lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(),
                                     lr=self.args.lr)

        if will_train:
            current_time = time.strftime('%d-%b-%y-%H.%M.%S', time.localtime())
            self.plot_folder = f'plots/{current_time}'
            self.model_save_folder = f'model/{current_time}'
            make_sure_dir_exists(self.plot_folder)
            make_sure_dir_exists(self.model_save_folder)
            self.cp = Checkpoint(self.model_save_folder)

        if load_from is not None:
            try:
                self.load_checkpoint(load_from)
            except FileNotFoundError:
                print(f'{load_from} not found. Running default.')
        else:
            print('Starting from scratch.')
示例#4
0
def train():
    tf.reset_default_graph()
    # Create a global step variable
    global_step = tf.Variable(0, name="global_step", trainable=False)

    policy_estimator = PolicyNetwork(input_size=len(env.state), output_size=env.action_space.n,
                                     summaries_dir=experiment_dir)
    value_estimator = ValueNetwork(input_size=len(env.state), output_size=1)
    # Object-aware Reward Network
    reward_estimator = ObjectAwareRewardNetwork(input_size=len(env.state), output_size=1, action_num=env.action_space.n)
    # # Reward Network
    # reward_estimator = RewardNetwork(input_size=len(env.state), output_size=1, action_num=env.action_space.n)

    saver = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.initialize_all_variables())
        reinforce(env,
                  policy_estimator,
                  value_estimator,
                  reward_estimator,
                  max_num_episodes,
                  sess,
                  discount_factor=0.99,
                  uniform_sample=False,
                  saver=saver,
                  model_dir=model_dir,
                  figure_dir=figure_dir)
示例#5
0
  def create_new_opponent(self, name):
    # Create clone of policy_player
    clone = PolicyNetwork(name)
    self.session.run(self.policy_network.assign(clone))
    util.save_network(self.session, self.run_dir, clone)
    new_opponent = PolicyPlayer(clone, self.session)

    self.opponents.decrease_win_rates()
    self.opponents.add_opponent(new_opponent)
示例#6
0
    def __init__(self, config):
        self.config = config

        session = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True)))
        self.random_player = RandomPlayer()
        self.exploratory_network = PolicyNetwork(config.exploratory_network)
        self.exploratory_player = PolicyPlayer(self.exploratory_network,
                                               session)

        self.playout_network = PolicyNetwork(
            config.playout_network,
            reuse=config.exploratory_network == config.playout_network)
        self.playout_player = PolicyPlayer(self.playout_network, session)

        self.run_dir = util.run_directory(config)
        util.restore_network_or_fail(session, self.run_dir,
                                     self.exploratory_network)
        util.restore_network_or_fail(session, self.run_dir,
                                     self.playout_network)
示例#7
0
  def __init__(self, config):
    self.config = config
    self.run_dir = util.run_directory(config)

    self.session = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
        allow_growth=True)))

    self.policy_network = PolicyNetwork('policy')
    self.policy_player = PolicyPlayer(self.policy_network, self.session)
    util.restore_or_initialize_network(self.session, self.run_dir,
                                       self.policy_network)

    # Train ops
    self.create_train_op(self.policy_network)
    self.writer = tf.summary.FileWriter(self.run_dir)
    util.restore_or_initialize_scope(self.session, self.run_dir,
                                     self.training_scope.name)

    self.opponents = Opponents(
        [RandomPlayer(),
         RandomThreatPlayer(),
         MaxThreatPlayer()])
    self.opponents.restore_networks(self.session, self.run_dir)
示例#8
0
 def restore_networks(self, session, run_dir):
   opponents_file = os.path.join(run_dir, 'opponents')
   if os.path.exists(opponents_file):
     with open(opponents_file) as f:
       for line in f.readlines():
         opponent_name, win_rate_string = line.strip().split()
         win_rate = float(win_rate_string)
         if opponent_name[:8] == 'network-':
           print('Restoring %s' % opponent_name)
           network = PolicyNetwork(opponent_name)
           util.restore_network_or_fail(session, run_dir, network)
           opponent = PolicyPlayer(network, session)
           self.win_rates[opponent] = win_rate
         else:
           for opponent in self.win_rates.keys():
             if opponent_name == opponent.name:
               self.win_rates[opponent] = win_rate
def trainD(file_name="Distral_1col",
           list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
           batch_size=128,
           gamma=0.999,
           alpha=0.9,
           beta=5,
           eps_start=0.9,
           eps_end=0.05,
           eps_decay=5,
           is_plot=False,
           num_episodes=200,
           max_num_steps_per_episode=1000,
           learning_rate=0.001,
           memory_replay_size=10000,
           memory_policy_size=1000):
    """
    Soft Q-learning training routine. Retuns rewards and durations logs.
    Plot environment screen
    """
    num_actions = list_of_envs[0].action_space.n
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(num_actions)
    models = [DQN(num_actions)
              for _ in range(0, num_envs)]  ### Add torch.nn.ModuleList (?)
    memories = [
        ReplayMemory(memory_replay_size, memory_policy_size)
        for _ in range(0, num_envs)
    ]

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        policy.cuda()
        for model in models:
            model.cuda()

    optimizers = [
        optim.Adam(model.parameters(), lr=learning_rate) for model in models
    ]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode

        # Optimization is given by alterating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #         "exploration factor:", eps_end + (eps_start - eps_end) * \
            #         math.exp(-1. * steps_done / eps_decay))

            # last_screen = env.current_grid_map
            current_screen = get_screen(env)
            state = current_screen  # - last_screen
            # Select and perform an action
            action = select_action(state, policy, models[i_env], num_actions,
                                   eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta)
            steps_done[i_env] += 1
            current_time[i_env] += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma)
            if done:
                print(
                    "ENV:", i_env, "iter:", episodes_done[i_env], "\treward:",
                    env.episode_total_reward, "\tit:", current_time[i_env],
                    "\texp_factor:", eps_end + (eps_start - eps_end) *
                    math.exp(-1. * episodes_done[i_env] / eps_decay))
                env.reset()
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
示例#10
0
def main():


    
    # ENVIROMENT
    env_name = "CartPole-v1"
    # env_name = "LunarLander-v2"
    env = gym.make(env_name)
    n_actions = env.action_space.n
    feature_dim = env.observation_space.shape[0]

    # PARAMETERS
    learning_rate = 1e-3
    state_scale = 1.0
    reward_scale = 1.0
    clip = 0.2
    n_epoch = 4
    max_episodes = 10
    max_timesteps = 200
    batch_size = 32
    max_iterations = 200
    gamma = 0.99
    gae_lambda = 0.95
    entropy_coefficient = 0.01

    # NETWORK
    value_model = ValueNetwork(in_dim=feature_dim).to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)

    policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)
    
    # INIT
    history = History()
    observation = env.reset()

    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = -500

    # TENSORBOARD 
    timestr = time.strftime("%d%m%Y-%H%M%S-")

    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
            str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
            "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)

    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create folder models
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()

    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        #Load model
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']
    

    for ite in tqdm(range(max_iterations), ascii=True):

        if ite % 5 == 0:
            torch.save({
                'policy_model': policy_model.state_dict(),
                'policy_optimizer': policy_optimizer.state_dict(),
                'value_model': value_model.state_dict(),
                'value_optimizer': value_optimizer.state_dict(),
                'running_reward': running_reward}, model_path)


        
        episode_ite, running_reward = collect(episode_ite, running_reward, env,
                                            max_episodes, max_timesteps, state_scale,
                                            reward_scale, writer, history, policy_model,
                                            value_model, gamma, gae_lambda, device)
        
        # Here we have collected N trajectories.
        history.build_dataset()

        data_loader = DataLoader(history, batch_size=batch_size, shuffle=True, drop_last=True)


        policy_loss, value_loss, train_ite = train_network(data_loader, policy_model, value_model,
                                                        policy_optimizer, value_optimizer ,n_epoch, clip,
                                                        train_ite, writer, entropy_coefficient)


        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        # print("\n", running_reward)

        writer.add_scalar("Running Reward", running_reward, epoch_ite)


        if (running_reward > env.spec.reward_threshold):
            print("\nSolved!")
            break
示例#11
0
        self.board.push(chess.Move.from_uci(self.move))

    def render(self):
        print(self.board)
        print("Move: ", self.move)

    def print_game(self):
        game = chess.pgn.Game()
        node = game
        for move in self.board.move_stack:
            node = node.add_variation(move)
        print(game)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

policy_model = PolicyNetwork().to(device)

# env = ChessEnv()
# env.update_rival(policy_model, device)
# state = env.reset(True)
# state, reward, done = env.step(env.legal_actions()[0])
# state, reward, done = env.step(env.legal_actions()[0])
# state, reward, done = env.step(env.legal_actions()[0])
# state, reward, done = env.step(env.legal_actions()[0])
# print(reward)
# env.render()
# env.print_game()

# env = ChessEnv()
# observation = env.reset(True)
# env.update_rival(policy_model, device)
示例#12
0
    there's a good chance it could slow things down because we have to move data back and forth between CPU and GPU.
    Regardless I'm leaving this in here. For those of you with GPUs this will mean that you will need to move your 
    tensors to GPU.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    numrun = 1
    
    for run in range(numrun):
        env = make_env()

        in_size = env.observation_space.shape[0]
        num_actions = env.action_space.n

        network = network_factory(in_size, num_actions, env)
        network.to(device)
        pe = PolicyNetwork(network)
        
        # Load policy to test
        #pe.network.load_state_dict(torch.load('saved_network_50000_baseline.pkl'))
        
        ve = ValueNetwork(in_size)
        ep_returns = reinforce(env, pe, ve, episodes) #,ve , loss_policy, loss_value
            
        #fwrite = open('runs_data/'+str(run)+'.pkl','wb')
        #fwrite = open('runs_data/0.pkl','wb')
        #pickle.dump(ep_returns, fwrite)
        #fwrite.close()
            
        
        
    window = 100
示例#13
0
def main(env_name, lr, state_scale, reward_scale, clip, train_epoch,
         max_episodes, max_timesteps, batch_size, max_iterations, gamma,
         gae_lambda, entropy_coefficient, start_running_reward, update_rate):

    # ENVIROMENT
    env_name = env_name
    env = ChessEnv()

    # PARAMETERS
    learning_rate = lr
    state_scale = state_scale
    reward_scale = reward_scale
    clip = clip
    n_epoch = train_epoch
    max_episodes = max_episodes
    max_timesteps = max_timesteps
    batch_size = batch_size
    max_iterations = max_iterations
    gamma = gamma
    gae_lambda = gae_lambda
    entropy_coefficient = entropy_coefficient

    # NETWORK
    value_model = ValueNetwork().to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)

    policy_model = PolicyNetwork().to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

    # INIT
    history = History()

    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = start_running_reward

    # TENSORBOARD
    timestr = time.strftime("%d%m%Y-%H%M%S-")

    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
            str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
            "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)

    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create folder models
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()

    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        #Load model
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']

    # Create SavedEnvs queue
    SavedEnv = queue.SimpleQueue()
    for _ in range(max_episodes):
        env = ChessEnv()
        SavedEnv.put((env, env.reset(), 0))

    # START ITERATING
    for ite in tqdm(range(max_iterations), ascii=True):
        # Load model to rival each update_rate epochs
        if ite % update_rate == 0:
            print("\nUpdating")
            rival_policy = PolicyNetwork().to(device)
            rival_policy.load_state_dict(policy_model.state_dict())

        if ite % 5 == 0:
            torch.save(
                {
                    'policy_model': policy_model.state_dict(),
                    'policy_optimizer': policy_optimizer.state_dict(),
                    'value_model': value_model.state_dict(),
                    'value_optimizer': value_optimizer.state_dict(),
                    'running_reward': running_reward
                }, model_path)

        print("\nSimulating")
        start_simulation = time.perf_counter()

        q = queue.SimpleQueue()

        env_list = []
        while not SavedEnv.empty():
            env_list.append(SavedEnv.get())

        threads = []
        for saved_env in env_list:
            t = threading.Thread(target=collect,
                                 args=[
                                     q, env_name, saved_env, SavedEnv,
                                     max_timesteps, state_scale, reward_scale,
                                     policy_model, value_model, gamma,
                                     gae_lambda, device, rival_policy
                                 ])
            t.start()
            threads.append(t)

        for t in threads:
            t.join()

        # for saved_env in env_list:
        #     if ite % 20 == 0:
        #         update_policy = True
        #     else:
        #         update_policy = False
        #     collect(q, env_name, saved_env,
        #                         SavedEnv, max_timesteps, state_scale, reward_scale,
        #                         policy_model, value_model, gamma,
        #                         gae_lambda, device, update_policy)

        avg_episode_reward = []
        # Write all episodes from queue to history buffer
        while not q.empty():
            episode, done = q.get()
            history.episodes.append(episode)
            avg_episode_reward.append((episode.reward, done))

        end_simulation = time.perf_counter()
        print(f"Simulation time: {end_simulation-start_simulation:.2f} ")

        for ep_reward, done in avg_episode_reward:
            if done:
                running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

                writer.add_scalar("Average Episode Reward", ep_reward,
                                  episode_ite)
                episode_ite += 1

        # avg_ep_reward = sum(avg_episode_reward) / len(avg_episode_reward)

        # Here we have collected N trajectories and prepare dataset
        history.build_dataset()

        data_loader = DataLoader(history,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 drop_last=True)

        print("Training")
        policy_loss, value_loss, train_ite = train_network(
            data_loader, policy_model, value_model, policy_optimizer,
            value_optimizer, n_epoch, clip, train_ite, writer,
            entropy_coefficient)

        end_training = time.perf_counter()
        print(f"Training time: {end_training-end_simulation:.2f}")

        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        # print("\n", running_reward)

        writer.add_scalar("Running Reward", running_reward, epoch_ite)

        if (running_reward > 0):
            print("\nSolved!")
            break
示例#14
0
def main():

    # ENVIROMENT
    # env_name = "CartPole-v1"
    # env_name = "LunarLander-v2"
    # env_name = "Acrobot-v1"
    env_name = "MountainCar-v0"
    env = gym.make(env_name)
    n_actions = env.action_space.n
    feature_dim = env.observation_space.shape[0]

    # PARAMETERS
    learning_rate = 1e-3
    state_scale = 1.0
    reward_scale = 1.0
    clip = 0.2
    n_epoch = 4
    max_episodes = 10
    max_timesteps = 100
    batch_size = 32
    max_iterations = 1000
    gamma = 0.99
    gae_lambda = 0.95
    entropy_coefficient = 0.01
    env_threshold = env.spec.reward_threshold

    # NETWORK
    value_model = ValueNetwork(in_dim=feature_dim).to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)

    policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

    # INIT
    history = History()
    observation = env.reset()

    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = -500

    # TENSORBOARD
    timestr = time.strftime("%d%m%Y-%H%M%S-")

    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
            str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
            "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)

    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create folder models
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()

    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        #Load model
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']

    EnvQueue = queue.SimpleQueue()

    for _ in range(max_episodes):
        env = gym.make(env_name)
        observation = env.reset()
        EnvQueue.put((env, observation, 0))

    for ite in tqdm(range(max_iterations), ascii=True):

        if ite % 5 == 0:
            torch.save(
                {
                    'policy_model': policy_model.state_dict(),
                    'policy_optimizer': policy_optimizer.state_dict(),
                    'value_model': value_model.state_dict(),
                    'value_optimizer': value_optimizer.state_dict(),
                    'running_reward': running_reward
                }, model_path)

        q = queue.SimpleQueue()

        env_list = []
        while not EnvQueue.empty():
            env_list.append(EnvQueue.get())

        threads = []
        for env in env_list:
            t = threading.Thread(target=collect,
                                 args=[
                                     q, env_name, env, EnvQueue, max_timesteps,
                                     state_scale, reward_scale, policy_model,
                                     value_model, gamma, gae_lambda, device
                                 ])
            t.start()
            threads.append(t)

        for t in threads:
            t.join()

        avg_episode_reward = []
        # Write all episodes from queue to history buffer
        while not q.empty():
            episode, done = q.get()
            history.episodes.append(episode)
            avg_episode_reward.append((episode.reward, done))

        for ep_reward, done in avg_episode_reward:
            if done:
                running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
                writer.add_scalar("Running Reward", running_reward,
                                  episode_ite)
                writer.add_scalar("Episode Reward", ep_reward, episode_ite)
                episode_ite += 1

        # avg_ep_reward = sum(avg_episode_reward) / len(avg_episode_reward)

        # Here we have collected N trajectories and prepare dataset
        history.build_dataset()

        data_loader = DataLoader(history,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 drop_last=True)

        policy_loss, value_loss, train_ite = train_network(
            data_loader, policy_model, value_model, policy_optimizer,
            value_optimizer, n_epoch, clip, train_ite, writer,
            entropy_coefficient)

        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        # print("\n", running_reward)

        if (running_reward > env_threshold):
            print("\nSolved!")
            break
示例#15
0
文件: agent.py 项目: mfsuve/TORCS-RL
class SAC_Agent:
    def __init__(self, load_from=None, will_train=True):
        self.env = TorcsEnv(
            path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
        self.args = SAC_args()
        self.buffer = ReplayBuffer(self.args.buffer_size)

        action_dim = self.env.action_space.shape[0]
        state_dim = self.env.observation_space.shape[0]
        hidden_dim = 256

        self.action_size = action_dim
        self.state_size = state_dim

        self.value_net = ValueNetwork(state_dim,
                                      hidden_dim).to(self.args.device)
        self.target_value_net = ValueNetwork(state_dim,
                                             hidden_dim).to(self.args.device)

        self.soft_q_net1 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)
        self.soft_q_net2 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.policy_net = PolicyNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.target_value_net.load_state_dict(self.value_net.state_dict())

        self.value_criterion = nn.MSELoss()
        self.soft_q_loss1 = nn.MSELoss()
        self.soft_q_loss2 = nn.MSELoss()

        self.value_opt = optim.Adam(self.value_net.parameters(),
                                    lr=self.args.lr)
        self.soft_q_opt1 = optim.Adam(self.soft_q_net1.parameters(),
                                      lr=self.args.lr)
        self.soft_q_opt2 = optim.Adam(self.soft_q_net2.parameters(),
                                      lr=self.args.lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(),
                                     lr=self.args.lr)

        if will_train:
            current_time = time.strftime('%d-%b-%y-%H.%M.%S', time.localtime())
            self.plot_folder = f'plots/{current_time}'
            self.model_save_folder = f'model/{current_time}'
            make_sure_dir_exists(self.plot_folder)
            make_sure_dir_exists(self.model_save_folder)
            self.cp = Checkpoint(self.model_save_folder)

        if load_from is not None:
            try:
                self.load_checkpoint(load_from)
            except FileNotFoundError:
                print(f'{load_from} not found. Running default.')
        else:
            print('Starting from scratch.')

    def train(self):
        remove_log_file()
        clear_action_logs()
        eps_n = 0
        rewards = []
        test_rewards = []
        best_reward = -np.inf
        info = None
        for eps_n in range(1, self.args.max_eps + 1):  # Train loop
            self.set_mode('train')
            relaunch = (eps_n - 1) % (20 / self.args.test_rate) == 0
            state = self.env.reset(relaunch=relaunch,
                                   render=False,
                                   sampletrack=False)
            eps_r = 0
            sigma = (self.args.start_sigma - self.args.end_sigma) * (max(
                0, 1 - (eps_n - 1) / self.args.max_eps)) + self.args.end_sigma
            randomprocess = OrnsteinUhlenbeckProcess(self.args.theta, sigma,
                                                     self.action_size)

            for step in range(self.args.max_eps_time):  # Episode
                action = self.policy_net.get_train_action(state, randomprocess)
                next_state, reward, done, info = self.env.step(action)

                self.buffer.push(state, action, reward, next_state, done)

                state = next_state
                eps_r += reward

                if len(self.buffer) > self.args.batch_size:
                    self.update()

                if done:
                    break

            rewards.append(eps_r)

            test_reward = self.test(eps_n)
            test_rewards.append(test_reward)

            if test_reward > best_reward:
                best_reward = test_reward
                self.save_checkpoint(eps_n, best_reward)

            info_str = ', '.join(
                [key for key in info.keys() if key != 'place'])
            info_str += f", {info['place']}. place"
            log(f'Episode {eps_n:<4} Reward: {eps_r:>7.2f} Test Reward: {test_reward:>7.2f} Info: {info_str}'
                )

            if eps_n % self.args.plot_per == 0:
                self.plot(rewards, test_rewards, eps_n)

    def update(self):
        state, action, reward, next_state, done = self.buffer.sample(
            self.args.batch_size)

        state = FloatTensor(state).to(self.args.device)
        next_state = FloatTensor(next_state).to(self.args.device)
        action = FloatTensor(action).to(self.args.device)
        reward = FloatTensor(reward).unsqueeze(1).to(self.args.device)
        done = FloatTensor(np.float32(done)).unsqueeze(1).to(self.args.device)

        predicted_q_value1 = self.soft_q_net1(state, action)
        predicted_q_value2 = self.soft_q_net2(state, action)
        predicted_value = self.value_net(state)
        new_action, log_prob, epsilon, mean, log_std = self.policy_net.evaluate(
            state)

        # Training Q function
        target_value = self.target_value_net(next_state)
        target_q_value = reward + (1 - done) * self.args.gamma * target_value
        q_value_loss1 = self.soft_q_loss1(predicted_q_value1,
                                          target_q_value.detach())
        q_value_loss2 = self.soft_q_loss2(predicted_q_value2,
                                          target_q_value.detach())

        self.soft_q_opt1.zero_grad()
        q_value_loss1.backward()
        if self.args.clipgrad:
            self.clip_grad(self.soft_q_net1.parameters())
        self.soft_q_opt1.step()
        self.soft_q_opt2.zero_grad()
        q_value_loss2.backward()
        if self.args.clipgrad:
            self.clip_grad(self.soft_q_net2.parameters())
        self.soft_q_opt2.step()

        # Training Value function
        predicted_new_q_value = torch.min(self.soft_q_net1(state, new_action),
                                          self.soft_q_net2(state, new_action))
        target_value_func = predicted_new_q_value - self.args.alpha * log_prob.sum(
        )
        value_loss = self.value_criterion(predicted_value,
                                          target_value_func.detach())

        self.value_opt.zero_grad()
        value_loss.backward()
        if self.args.clipgrad:
            self.clip_grad(self.value_net.parameters())
        self.value_opt.step()

        # Training Policy function
        policy_loss = (log_prob - predicted_new_q_value).mean()

        self.policy_opt.zero_grad()
        policy_loss.backward()
        if self.args.clipgrad:
            self.clip_grad(self.policy_net.parameters())
        self.policy_opt.step()

        # Updating target value network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data *
                                    (1.0 - self.args.soft_tau) +
                                    param.data * self.args.soft_tau)

    def test(self, eps_n):
        self.set_mode('eval')
        rewards = []
        for step in range(self.args.test_rate):
            render = (eps_n % 30 == 0) and (step == 0)
            relaunch = render or ((eps_n % 30 == 0) and (step == 1))
            state = self.env.reset(relaunch=relaunch,
                                   render=render,
                                   sampletrack=False)
            running_reward = 0
            for t in range(self.args.max_eps_time):
                action = self.policy_net.get_test_action(state)
                state, reward, done, info = self.env.step(action)
                store(action, eps_n, reward, info, t == 0)
                running_reward += reward
                if done:
                    break
            rewards.append(running_reward)
        avg_reward = sum(rewards) / self.args.test_rate
        return avg_reward

    def plot(self, rewards, test_rewards, eps_n):
        torch.save({
            'train_rewards': rewards,
            'test_rewards': test_rewards
        }, f'{self.plot_folder}/{eps_n}.pth')
        figure = plt.figure()
        plt.plot(rewards, label='Train Rewards')
        plt.plot(test_rewards, label='Test Rewards')
        plt.xlabel('Episode')
        plt.legend()
        plt.savefig(f'{self.plot_folder}/{eps_n}.png')
        try:
            send_mail(f'Improved Torcs SAC | Episode {eps_n}',
                      f'{self.plot_folder}/{eps_n}.png')
            log('Mail has been sent.')
        except (KeyboardInterrupt, SystemExit):
            print('KeyboardInterrupt or SystemExit')
            raise
        except Exception as e:
            print('Mail Exception occured:', e)
            emsg = e.args[-1]
            emsg = emsg[:1].lower() + emsg[1:]
            log('Couldn\'t send mail because', emsg)

    def clip_grad(self, parameters):
        for param in parameters:
            param.grad.data.clamp_(-1, 1)

    def set_mode(self, mode):
        if mode == 'train':
            self.value_net.train()
            self.target_value_net.train()
            self.soft_q_net1.train()
            self.soft_q_net2.train()
            self.policy_net.train()
        elif mode == 'eval':
            self.value_net.eval()
            self.target_value_net.eval()
            self.soft_q_net1.eval()
            self.soft_q_net2.eval()
            self.policy_net.eval()
        else:
            raise ValueError('mode should be either train or eval')

    def save_checkpoint(self, eps_n, test_reward):
        self.cp.update(self.value_net, self.soft_q_net1, self.soft_q_net2,
                       self.policy_net)
        self.cp.save(f'e{eps_n}-r{test_reward:.4f}.pth')
        log(f'Saved checkpoint at episode {eps_n}.')

    def load_checkpoint(self, load_from):
        state_dicts = torch.load(load_from)
        self.value_net.load_state_dict(state_dicts['best_value'])
        self.soft_q_net1.load_state_dict(state_dicts['best_q1'])
        self.soft_q_net2.load_state_dict(state_dicts['best_q2'])
        self.policy_net.load_state_dict(state_dicts['best_policy'])
        print(f'Loaded from {load_from}.')

    def race(self, sampletrack=True):
        with torch.no_grad():
            state = self.env.reset(relaunch=True,
                                   render=True,
                                   sampletrack=sampletrack)
            running_reward = 0
            done = False
            while not done:
                action = self.policy_net.get_test_action(state)
                state, reward, done, info = self.env.step(action)
                running_reward += reward

            print('Reward:', running_reward)
示例#16
0
class PolicyTraining(object):
  def __init__(self, config):
    self.config = config
    self.run_dir = util.run_directory(config)

    self.session = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
        allow_growth=True)))

    self.policy_network = PolicyNetwork('policy')
    self.policy_player = PolicyPlayer(self.policy_network, self.session)
    util.restore_or_initialize_network(self.session, self.run_dir,
                                       self.policy_network)

    # Train ops
    self.create_train_op(self.policy_network)
    self.writer = tf.summary.FileWriter(self.run_dir)
    util.restore_or_initialize_scope(self.session, self.run_dir,
                                     self.training_scope.name)

    self.opponents = Opponents(
        [RandomPlayer(),
         RandomThreatPlayer(),
         MaxThreatPlayer()])
    self.opponents.restore_networks(self.session, self.run_dir)

  def create_train_op(self, policy_network):
    with tf.variable_scope('policy_training') as self.training_scope:
      self.move = tf.placeholder(tf.int32, shape=[None], name='move')
      self.result = tf.placeholder(tf.float32, shape=[None], name='result')

      policy = tf.reshape(policy_network.policy, [-1, HEIGHT, WIDTH])
      move = tf.expand_dims(tf.one_hot(self.move, WIDTH), axis=1)
      turn = util.turn_win(policy_network.turn)
      move_probability = tf.reduce_sum(policy * move, axis=[1, 2])

      result_loss = -tf.reduce_mean(
          tf.log(move_probability) * turn * self.result)
      entropy_regularisation = (
          -config.entropy * tf.reduce_mean(policy_network.entropy))
      loss = result_loss + entropy_regularisation

      optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
      self.global_step = tf.contrib.framework.get_or_create_global_step()
      self.train_op = optimizer.minimize(loss, self.global_step)

      # Summary
      tf.summary.scalar('loss', loss)
      for var in policy_network.variables + policy_network.policy_layers:
        tf.summary.histogram(var.name, var)
      self.summary = tf.summary.merge_all()

  def train(self):
    for _ in range(self.config.batches):
      opponent = self.opponents.choose_opponent()
      games = self.play_games(opponent)
      step, summary = self.train_games(opponent, games)
      self.process_results(opponent, games, step, summary)

      if self.opponents.all_beaten():
        name = self.opponents.next_network_name()
        print('All opponents beaten. Creating %s' % name)
        self.create_new_opponent(name)

      if step % 100 == 0:
        self.save()

    self.save()

  def save(self):
    util.save_network(self.session, self.run_dir, self.policy_network)
    util.save_scope(self.session, self.run_dir, self.training_scope.name)
    self.opponents.save_opponent_stats(self.run_dir)

  def play_games(self, opponent):
    # Create games
    games = incomplete_games = [Game() for _ in range(self.config.batch_size)]

    # Let opponent play first in half of the games
    self.play_move(games[0:len(games) // 2], opponent)
    player = self.policy_player

    while incomplete_games:
      self.play_move(incomplete_games, player)
      player = self.policy_player if player != self.policy_player else opponent
      incomplete_games = [
          game for game in incomplete_games if not game.position.gameover()
      ]

    return games

  def play_move(self, games, player):
    positions = [game.position for game in games]
    moves = player.play(positions)

    for game, move in zip(games, moves):
      game.move(move, player == self.policy_player)

  def train_games(self, opponent, games):
    turn, disks, empty, legal_moves, threats, moves, results = ([], [], [], [],
                                                                [], [], [])
    for game in games:
      for position, move in game.policy_player_moves:
        turn.append(position.turn)
        disks.append(position.disks)
        empty.append(position.empty)
        legal_moves.append(position.legal_moves)
        threats.append(position.threats)
        moves.append(move)
        results.append(game.result)

    _, step, summary = self.session.run(
        [self.train_op, self.global_step, self.summary], {
            self.policy_network.turn: turn,
            self.policy_network.disks: disks,
            self.policy_network.empty: empty,
            self.policy_network.legal_moves: legal_moves,
            self.policy_network.threats: threats,
            self.move: moves,
            self.result: results
        })

    return step, summary

  def process_results(self, opponent, games, step, summary):
    win_rate = np.mean([game.policy_player_score for game in games])
    average_moves = sum([len(game.moves)
                         for game in games]) / self.config.batch_size

    opponent_summary = tf.Summary()
    opponent_summary.value.add(
        tag=self.training_scope.name + '/' + opponent.name + '/win_rate',
        simple_value=win_rate)
    opponent_summary.value.add(
        tag=self.training_scope.name + '/' + opponent.name + '/moves',
        simple_value=average_moves)

    self.writer.add_summary(summary, step)
    self.writer.add_summary(opponent_summary, step)

    self.opponents.update_win_rate(opponent, win_rate)

    print('Step %d. Opponent %s, win rate %.2f <%.2f>, %.2f moves' %
          (step, opponent.name, win_rate, self.opponents.win_rates[opponent],
           average_moves))

  def create_new_opponent(self, name):
    # Create clone of policy_player
    clone = PolicyNetwork(name)
    self.session.run(self.policy_network.assign(clone))
    util.save_network(self.session, self.run_dir, clone)
    new_opponent = PolicyPlayer(clone, self.session)

    self.opponents.decrease_win_rates()
    self.opponents.add_opponent(new_opponent)
示例#17
0
def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(4),
            GridworldEnv(5)], batch_size=128, gamma=0.999, alpha=0.9,
            beta=5, eps_start=0.9, eps_end=0.05, eps_decay=5,
            is_plot=False, num_episodes=200,
            max_num_steps_per_episode=1000, learning_rate=0.001,
            memory_replay_size=10000, memory_policy_size=1000):
    """
    Soft Q-learning training routine. Retuns rewards and durations logs.
    Plot environment screen
    """
    # action dimension
    num_actions = list_of_envs[0].action_space.n
    # total envs
    num_envs = len(list_of_envs)
    # pi_0
    policy = PolicyNetwork(num_actions)
    # Q value, every environment has one, used to calculate A_i,
    models = [DQN(num_actions) for _ in range(0, num_envs)]   ### Add torch.nn.ModuleList (?)
    # replay buffer for env ???
    memories = [ReplayMemory(memory_replay_size, memory_policy_size) for _ in range(0, num_envs)]

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # device = "cpu"
    print(device)
    # model
    policy = policy.to(device)
    for i in range(len(models)):
        models[i] = models[i].to(device)

    # optimizer for every Q model
    optimizers = [optim.Adam(model.parameters(), lr=learning_rate)
                    for model in models]
    # optimizer for policy
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    # info list for each environment
    episode_durations = [[] for _ in range(num_envs)]   # list of local steps
    episode_rewards = [[] for _ in range(num_envs)]     # list of list of episode reward

    episodes_done = np.zeros(num_envs)      # episode num
    steps_done = np.zeros(num_envs)         # global timesteps for each env
    current_time = np.zeros(num_envs)       # local timesteps for each env

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        policy.train()
        for model in models:
            model.train()

        # TODO: add max_num_steps_per_episode

        # Optimization is given by alterating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        #   1. do the step for each env
        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #         "exploration factor:", eps_end + (eps_start - eps_end) * \
            #         math.exp(-1. * steps_done / eps_decay))
        
            # last_screen = env.current_grid_map
            # ===========update step info begin========================
            current_screen = get_screen(env)
            # state
            state = current_screen # - last_screen
            # action chosen by pi_1~pi_i
            action = select_action(state, policy, models[i_env], num_actions,
                                    eps_start, eps_end, eps_decay,
                                    episodes_done[i_env], alpha, beta, device)
            # global_steps
            steps_done[i_env] += 1
            # local steps
            current_time[i_env] += 1
            # reward
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # next state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen # - last_screen
            else:
                next_state = None

            # add to buffer
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            #   2. do one optimization step for each env using "soft-q-learning".
            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                            memories[i_env], batch_size, alpha, beta, gamma, device)
            # ===========update step info end ========================


            # ===========update episode info begin ====================
            if done:
                print("ENV:", i_env, "iter:", episodes_done[i_env],
                    "\treward:", env.episode_total_reward,
                    "\tit:", current_time[i_env], "\texp_factor:", eps_end +
                    (eps_start - eps_end) * math.exp(-1. * episodes_done[i_env] / eps_decay))
                # reset env
                env.reset()
                # episode steps
                episodes_done[i_env] += 1
                # append each episode local timesteps list for every env
                episode_durations[i_env].append(current_time[i_env])
                # reset local timesteps
                current_time[i_env] = 0
                # append total episode_reward to list
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)
            # ===========update episode info end ====================

        #   3. do one optimization step for the policy
        # after all envs has performed one step, optimize policy
        optimize_policy(policy, policy_optimizer, memories, batch_size,
                    num_envs, gamma, device)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
def trainD(file_name="Distral_2col_SQL",
           list_of_envs=[GridworldEnv(5),
                         GridworldEnv(4),
                         GridworldEnv(6)],
           batch_size=128,
           gamma=0.999,
           alpha=0.8,
           beta=5,
           eps_start=0.9,
           eps_end=0.05,
           eps_decay=5,
           is_plot=False,
           num_episodes=200,
           max_num_steps_per_episode=1000,
           learning_rate=0.001,
           memory_replay_size=10000,
           memory_policy_size=1000):
    """
    Soft Q-learning training routine. Returns rewards and durations logs.
    """
    num_actions = list_of_envs[0].action_space.n
    input_size = list_of_envs[0].observation_space.shape[0]
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(input_size, num_actions)
    models = [DQN(input_size, num_actions) for _ in range(0, num_envs)]
    memories = [
        ReplayMemory(memory_replay_size, memory_policy_size)
        for _ in range(0, num_envs)
    ]

    optimizers = [
        optim.Adam(model.parameters(), lr=learning_rate) for model in models
    ]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    # Initialize environments
    states = []
    for env in list_of_envs:
        states.append(
            torch.from_numpy(env.reset()).type(torch.FloatTensor).view(
                -1, input_size))

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode

        # Optimization is given by alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        for i_env, env in enumerate(list_of_envs):

            # select an action
            action = select_action(states[i_env], policy, models[i_env],
                                   num_actions, eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta)

            steps_done[i_env] += 1
            current_time[i_env] += 1
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(
                torch.FloatTensor).view(-1, input_size)

            if done:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(states[i_env], action, next_state, reward,
                                 time)

            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma)

            # Update state
            states[i_env] = next_state

            # Check if agent reached target
            if done or current_time[i_env] >= max_num_steps_per_episode:
                if episodes_done[i_env] <= num_episodes:
                    print(
                        "ENV:", i_env, "iter:", episodes_done[i_env],
                        "\treward:{0:.2f}".format(env.episode_total_reward),
                        "\tit:", current_time[i_env], "\texp_factor:",
                        eps_end + (eps_start - eps_end) *
                        math.exp(-1. * episodes_done[i_env] / eps_decay))

                episode_rewards[i_env].append(env.episode_total_reward)
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0

                states[i_env] = torch.from_numpy(env.reset()).type(
                    torch.FloatTensor).view(-1, input_size)

                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        # Perform one step of the optimization on the Distilled policy
        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, alpha, beta)

    print('Complete')
    env.render(close=True)
    env.close()

    ## Store Results
    np.save(file_name + '-rewards', episode_rewards)
    np.save(file_name + '-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
示例#19
0
            acc,
            len(test_loader.dataset),
            test_acc,
        ))
    return test_loss, test_acc


tr_losses = []
te_losses = []
te_accs = []

batch_size = 128
num_epochs = 25
log_interval = 100

model = PolicyNetwork()
train_dataset = TrainDataset()
test_dataset = TestDataset()

train_loader = DataLoader(train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          drop_last=True)
test_loader = DataLoader(test_dataset,
                         batch_size=batch_size * 2,
                         shuffle=True,
                         drop_last=True)

model.to('cuda')
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = F.cross_entropy