Example #1
    def __init__(self,
                 random_action_method,
                 future_discount=0.75,
                 learning_rate=0.001,
                 saveAndLoad=True):
        learning_rate = learning_rate * (1 - future_discount) / (1 - 0.8)

        self.model_a = RLModel()
        self.model_a.build((None, AGENT_INPUT_SIZE))

        self.model_b = RLModel()
        self.model_b.build((None, AGENT_INPUT_SIZE))

        self.saveAndLoad = saveAndLoad

        if os.path.isfile(SAVE_PATH_A) and os.path.isfile(
                SAVE_PATH_B) and saveAndLoad:
            print("Loading")
            self.model_a.load_weights(SAVE_PATH_A)
            self.model_b.load_weights(SAVE_PATH_B)

        self.exp_rep_a = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)
        self.exp_rep_b = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)

        self.random_action_method = random_action_method

        self.learning_rate = learning_rate
        self.future_discount = future_discount

        self.loss_measure = tf.losses.MeanSquaredError()
        self.opt = tf.optimizers.Adam(learning_rate=self.learning_rate)

        self.n_since_last_train = 0

        self.latestLoss = tf.add(0, 0)
Example #2
    def __init__(self,
                 env,
                 batchsize=64,
                 pic_size=(96, 96),
                 num_frame_stack=4,
                 gamma=0.95,
                 frame_skip=1,
                 train_freq=4,
                 initial_epsilon=1.0,
                 min_epsilon=0.1,
                 render=True,
                 epsilon_decay_steps=int(1e6),
                 min_experience_size=int(1e3),
                 experience_capacity=int(1e5),
                 network_update_freq=5000,
                 regularization=1e-6,
                 optimizer_params=None,
                 action_map=None):
        self.exp_history = ExperienceReplay(num_frame_stack,
                                            capacity=experience_capacity,
                                            pic_size=pic_size)
        self.playing_cache = ExperienceReplay(num_frame_stack,
                                              capacity=num_frame_stack * 5 +
                                              10,
                                              pic_size=pic_size)
        self.network_update_freq = network_update_freq
        self.action_map = action_map
        self.env = env
        self.batchsize = batchsize
        self.num_frame_stack = num_frame_stack
        self.gamma = gamma
        self.frame_skip = frame_skip
        self.train_freq = train_freq
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps
        self.render = render
        self.min_experience_size = min_experience_size
        self.pic_size = pic_size
        self.regularization = regularization
        self.optimizer_params = optimizer_params or dict(learning_rate=0.0004,
                                                         epsilon=1e-7)
        self.do_training = True
        self.playing_epsilon = 0.0
        self.session = None
        self.state_size = (self.num_frame_stack, ) + self.pic_size
        self.global_counter = 0
        self.episode_counter = 0

        if action_map is not None:
            self.dim_actions = len(action_map)
        else:
            self.dim_actions = env.action_space.n

        self.q_values = []
        self.loss_his = []
Example #3
    def __init__(self):
        self.batch_size = 64  # How many experiences to use for each training step
        self.train_frequency = 5  # How often you update the network
        self.num_epochs = 20  # How many epochs to train when updating the network
        self.y = 0.99  # Discount factor
        self.prob_random_start = 0.6  # Starting chance of random action
        self.prob_random_end = 0.1  # Ending chance of random action
        self.annealing_steps = 1000.  # Steps over which to anneal from prob_random_start to prob_random_end
        self.max_num_episodes = 10000  # Max number of episodes allowed during training
        self.min_pre_train_episodes = 100  # Number of episodes played with random actions before training starts
        self.max_num_step = 50  # Maximum allowed episode length
        self.goal = 15  # Number of rewards we want to achieve while playing a game.

        # Set env
        self.env = gameEnv(partial=False, size=5)

        # Reset everything from keras session
        K.clear_session()

        # Setup our Q-networks
        self.main_qn = Qnetwork()
        self.target_qn = Qnetwork()

        # Setup our experience replay
        self.experience_replay = ExperienceReplay()
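The prob_random_start / prob_random_end / annealing_steps settings above describe a linear exploration schedule. A minimal sketch of that schedule, written as a standalone helper (the function name and standalone form are assumptions, not part of the original class):

def annealed_prob_random(step, start=0.6, end=0.1, annealing_steps=1000.0):
    # Linearly anneal the random-action probability from start to end,
    # then hold it at end once annealing_steps training steps have passed.
    frac = min(step / annealing_steps, 1.0)
    return start + frac * (end - start)

# annealed_prob_random(0) == 0.6, annealed_prob_random(500) == 0.35,
# and any step >= 1000 returns 0.1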
Example #4
    def __init__(self,
                 random_action_method,
                 future_discount=0.75,
                 learning_rate=0.001,
                 load_path=None):
        learning_rate = learning_rate * (1 - future_discount) / (1 - 0.8)

        self.model = RLModel()
        self.model.build((None, AGENT_INPUT_SIZE))
        self.load_path = load_path
        if load_path is not None and os.path.isfile(load_path):
            print("Loading")
            self.model.load_weights(load_path)

        self.exp_rep = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)

        self.random_action_method = random_action_method

        self.learning_rate = learning_rate
        self.future_discount = future_discount

        self.loss_measure = tf.losses.MeanSquaredError()
        self.opt = tf.optimizers.Adam(learning_rate=self.learning_rate)

        self.n_since_last_train = 0

        self.latestLoss = tf.add(0, 0)
Example #5
    def __init__(self):
        self.eps = 0.1
        self.env = GridEnv(3)
        self.batch_size = 20

        if prioritized_replay and replay_type == "proportional":
            self.replay = ProportionalReplay(max_buffer_size,
                                             prioritized_replay_alpha)
        elif prioritized_replay and replay_type == "ranked":
            N_list = [self.batch_size] + [
                int(x) for x in np.linspace(100, max_buffer_size, 5)
            ]
            save_quantiles(N_list=N_list,
                           k=self.batch_size,
                           alpha=prioritized_replay_alpha)
            self.replay = RankBasedReplay(max_buffer_size,
                                          prioritized_replay_alpha)
        else:
            self.replay = ExperienceReplay(
                max_buffer_size)  # passing size of buffer

        # define graph
        self.inputs = tf.placeholder(tf.float32,
                                     shape=(None, self.env.state_size))
        self.target_values = tf.placeholder(tf.float32, shape=(None, ))
        self.actions = tf.placeholder(tf.int32, shape=(None, ))
        self.is_weights = tf.placeholder(tf.float32, shape=(
            None, ))  # importance sampling weights for prioritized replay
        self.Q_out_op, self.Q_update_op, self.td_error_op = self.build_graph(
        )  # build main network
        self.target_Q_out_op, _, _ = self.build_graph(
            'target')  # build identical target network

        self.init_op = tf.global_variables_initializer()
        self.sess = tf.Session()
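The is_weights placeholder above is the hook for the standard prioritized-replay bias correction (Schaul et al., 2016). Below is a minimal sketch of how those weights are typically computed; the function name and NumPy-based form are assumptions, since the replay classes themselves are not shown:

import numpy as np

def importance_weights(sample_probs, buffer_size, beta):
    # w_i = (N * P(i))^(-beta), normalized by max(w) so the largest weight is 1
    w = (buffer_size * np.asarray(sample_probs)) ** (-beta)
    return w / w.max()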
Example #6
    def __init__(self):
        self.experience_replay = ExperienceReplay('BreakoutDeterministic-v0',
                                                  FLAGS.replay_buffer_size, 84,
                                                  84, 4, self.policy,
                                                  FLAGS.decay_to_epoch)

        config = DQNConfig()
        config.learning_rate = FLAGS.learning_rate
        config.gamma = FLAGS.gamma
        config.decay = FLAGS.decay
        config.momentum = FLAGS.momentum
        config.eps = FLAGS.eps
        config.input_width = FLAGS.image_width
        config.input_height = FLAGS.image_height
        config.skip = FLAGS.skip
        self.dqn = DQN(config, FLAGS.use_huber)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        logger.info('initializing variables...')
        self.sess.run(tf.global_variables_initializer())
        self.update_target()

        self.epoch = 0
        self.decay_epsilon()
Example #7
    def test_observation_zeroing(self):
        """ Tests zeroing out of frames not from current episode """
        obs_shape = (84, 84, 1)
        er = ExperienceReplay(5, obs_shape)

        for terminal_idx in range(5):
            obs_ = []
            obs_next_ = []
            for i in range(1, 6):
                partial_obs = np.ones(obs_shape) * i
                terminal = 1 if i == terminal_idx else 0
                er.append(partial_obs, 0, 0, terminal)

                if i <= terminal_idx:
                    partial_obs *= 0
                if i < 5:
                    obs_.append(partial_obs)
                if i > 1:
                    obs_next_.append(partial_obs)
            obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
            obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))

            batch = er.sample(1)
            obs, rewards, actions, obs_next, terminals = batch
            assert np.array_equal(obs_, obs)
            assert np.array_equal(obs_next_, obs_next)
Example #8
def init():
    train_env = SquigglesEnvironment(num_notes=2)
    evaluation_env = SquigglesEnvironment(num_notes=2)

    train_env = tf_py_environment.TFPyEnvironment(train_env)
    evaluation_env = tf_py_environment.TFPyEnvironment(evaluation_env)

    agent, _ = generic_dqn_agent(train_env)

    experience_replay = ExperienceReplay(agent, train_env, BATCH_SIZE)

    return agent, train_env, evaluation_env, experience_replay
Example #9
def run_episode(plan_step_fn,
                learner,
                dataset,
                cache_subtree,
                add_returns,
                preproc_obs_fn=None,
                render=False):
    episode_done = False
    actor.reset()
    episode_rewards = []
    aux_replay = ExperienceReplay(
    )  # New auxiliary buffer to save current episode transitions
    while not episode_done:
        # Planning step
        tree_policy = plan_step_fn(len(episode_rewards))

        # Execute action (choose one node as the new root from depth 1)
        a = sample_pmf(tree_policy)
        prev_root_data, current_root_data = actor.step(a,
                                                       cache_subtree,
                                                       render,
                                                       render_size=(512, 512))
        aux_replay.append({
            "observations": prev_root_data["obs"],
            "target_policy": tree_policy
        })
        episode_rewards.append(current_root_data["r"])
        episode_done = current_root_data["done"]

        # Learning step
        if learner is not None:
            batch = dataset.sample(batch_size)
            if preproc_obs_fn is not None:
                batch["observations"] = preproc_obs_fn(batch["observations"])
            obs = tf.constant(batch["observations"], dtype=tf.float32)
            target_policy = tf.constant(batch["target_policy"],
                                        dtype=tf.float32)
            if add_returns:
                returns = tf.constant(batch["returns"], dtype=tf.float32)
                loss, _ = learner.train_step(obs, target_policy, returns)
            else:
                loss, _ = learner.train_step(obs, target_policy)

    # Add episode to the dataset
    if add_returns:
        returns = compute_returns(episode_rewards,
                                  discount_factor)  # Backpropagate rewards
        aux_replay.add_column("returns", returns)  # Add them to the dataset
    dataset.extend(
        aux_replay
    )  # Add transitions to the buffer that will be used for learning

    return episode_rewards
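compute_returns is not defined in this snippet; based on the "Backpropagate rewards" comment, a minimal sketch of the discounted-return backup it presumably performs:

def compute_returns(rewards, discount_factor):
    # G_t = r_t + discount_factor * G_{t+1}, computed by scanning the episode backwards
    returns = []
    g = 0.0
    for r in reversed(rewards):
        g = r + discount_factor * g
        returns.append(g)
    return list(reversed(returns))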
Example #10
    def test_sampling(self):
        """ Tests observation construction from partial observations """
        obs_shape = (84, 84, 1)
        er = ExperienceReplay(5, obs_shape)

        for i in range(1, 6):
            partial_obs = np.ones(obs_shape) * i
            er.append(partial_obs, 1, 1, 0)

        batch = er.sample(1)
        _, rewards, actions, _, terminals = batch
        assert np.array_equal(rewards, np.array([1]))
        assert np.array_equal(actions, np.array([1]))
        assert np.array_equal(terminals, np.array([0]))
Example #11
    def __init__(self):

        self.prob_random = 1.0  # Probability to play random action
        self.y = .99  # Discount factor
        self.batch_size = 64  # How many experiences to use for each training step
        self.prob_random_end = .01  # Ending chance of random action
        self.prob_random_decay = .996  # Decrease decay of the prob random
        self.max_episode = 300  # Max number of episodes allowed during training
        self.expected_goal = 200  # Expected goal

        self.dnn = DNN()
        self.env = gym.make('CartPole-v0')

        self.memory = ExperienceReplay(buffer_size=10000)

        self.metadata = []  # we will store the score info here at the end of each episode
Example #12
def main():
    hist_length = 50
    processor = Processor(history_length=hist_length)
    price_history = processor.fetchData()
    train_price_history = price_history['train']
    test_price_history = price_history['test']
    env = Environment(horizon=20,
                      train_price_history=train_price_history,
                      test_price_history=test_price_history,
                      history_length=hist_length)
    exp_replay = ExperienceReplay()
    agent = Agent(feature_size=6,
                  window=hist_length,
                  action_size=3,
                  experience_replay=exp_replay,
                  environment=env)
    agent.train()
    print("Agent done training, now testing: ")
    agent.test(test_price_history)
Example #13
    def __init__(self):
        # gamma is the discount factor of the Q-learning algorithm
        self.gamma = 0.9

        # We use an epsilon-greedy learning strategy
        self.epsilon = 1
        self.epsilon_decay = 0.99
        self.epsilon_min = 0.01
        
        # Number of epochs (fully played games) used to train the agent
        self.epochs = 500

        # Game to play
        self.game = Game()

        # Number of hidden layer nodes
        self.hidden_layer_nodes = 20

        # Create keras model
        # _________________________________________________________________
        # Layer (type)                 Output Shape              Param #   
        # =================================================================
        # dense_1 (Dense)              (None, 20)                120       
        # _________________________________________________________________
        # dense_2 (Dense)              (None, 20)                420       
        # _________________________________________________________________
        # dense_3 (Dense)              (None, 5)                 105       
        # =================================================================
        # Total params: 645
        # Trainable params: 645
        # Non-trainable params: 0
        # _________________________________________________________________
        self.model = Sequential()
        self.model.add(Dense(self.hidden_layer_nodes, input_dim=self.game.state_size, activation='relu'))
        self.model.add(Dense(self.hidden_layer_nodes, activation='relu'))
        self.model.add(Dense(len(POSSIBLE_ACTIONS), activation='linear'))
        self.model.compile('Adam', loss='mse')

        # Initialize experience replay
        self.experience_replay = ExperienceReplay(size=2000)
        self.batch_size = 20
        self.max_turns = 100
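A hedged sketch of the replay-based training step this setup implies, regressing the taken action's Q-value toward r + gamma * max_a' Q(s', a'). The ExperienceReplay class is not shown above, so the sample() call and the (state, action, reward, next_state, done) batch layout are assumptions:

import numpy as np

def train_on_replay(model, experience_replay, batch_size=20, gamma=0.9):
    batch = experience_replay.sample(batch_size)  # assumed: list of 5-tuples
    states = np.array([t[0] for t in batch])
    actions = np.array([t[1] for t in batch])
    rewards = np.array([t[2] for t in batch])
    next_states = np.array([t[3] for t in batch])
    dones = np.array([t[4] for t in batch], dtype=float)

    targets = model.predict(states, verbose=0)
    next_q = model.predict(next_states, verbose=0).max(axis=1)
    # Only the taken action's target changes; other outputs keep their current values
    targets[np.arange(len(batch)), actions] = rewards + gamma * next_q * (1.0 - dones)
    model.fit(states, targets, epochs=1, verbose=0)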
Example #14
    def __init__(self,
                 env,
                 net_update_rate: int = 25,
                 exploration_rate: float = 1.0,
                 exploration_decay: float = 0.00005):
        # set hyper parameters
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.net_updating_rate = net_update_rate

        # set environment
        self.env = env
        self.state_shape = env.get_state_shape()
        self.action_shape = env.get_action_shape()

        # the number of experience per batch for batch learning
        # Experience Replay for batch learning
        self.exp_rep = ExperienceReplay()

        # Deep Q Network
        self.net = None
Example #15
    def test_observation_construction(self):
        """ Tests observation construction from partial observations """
        obs_shape = (84, 84, 1)
        er = ExperienceReplay(5, obs_shape)

        obs_ = []
        obs_next_ = []
        for i in range(1, 6):
            partial_obs = np.ones(obs_shape) * i
            if i < 5:
                obs_.append(partial_obs)
            if i > 1:
                obs_next_.append(partial_obs)
            er.append(partial_obs, 0, 0, 0)
        obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
        obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))

        batch = er.sample(1)
        obs, rewards, actions, obs_next, terminals = batch
        assert np.array_equal(obs_, obs)
        assert np.array_equal(obs_next_, obs_next)
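The tests in Examples #7, #10 and #15 exercise a buffer created as ExperienceReplay(capacity, obs_shape), filled with append(obs, action, reward, terminal), and queried with sample(batch_size) returning (obs, rewards, actions, obs_next, terminals). That buffer reconstructs stacked, episode-masked observations; the sketch below is a deliberately simplified ring buffer that stores full transitions instead, so its class name and extra next_obs argument are assumptions rather than the tested interface:

import numpy as np

class SimpleExperienceReplay:
    # Minimal ring-buffer replay over full transitions (no frame stacking or zeroing)

    def __init__(self, capacity, obs_shape):
        self.capacity, self.size, self.idx = capacity, 0, 0
        self.obs = np.zeros((capacity,) + obs_shape, dtype=np.float32)
        self.next_obs = np.zeros_like(self.obs)
        self.actions = np.zeros(capacity, dtype=np.int64)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.terminals = np.zeros(capacity, dtype=np.int64)

    def append(self, obs, action, reward, next_obs, terminal):
        i = self.idx
        self.obs[i], self.next_obs[i] = obs, next_obs
        self.actions[i], self.rewards[i], self.terminals[i] = action, reward, terminal
        self.idx = (i + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        return (self.obs[idx], self.rewards[idx], self.actions[idx],
                self.next_obs[idx], self.terminals[idx])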
Example #16
 def __init__(self, FLAGS):
     """
     This class builds the model that implements the deep deterministic
     policy gradient (DDPG) algorithm.
     
     :param FLAGS: TensorFlow flags which contain the values for hyperparameters
     
     """
     
     self.FLAGS=FLAGS
     
     self.env = gym.make('Pendulum-v0')
     self.state_size = len(self.env.observation_space.sample())
     self.num_episodes=1000
     self.batch_size=64
     
     self.exp_replay=ExperienceReplay(50000,1500, FLAGS)
     
     self.action_noise=OrnsteinUhlenbeckActionNoise(self.env,mu= 0.0, sigma=0.2, theta=.15, dt=1e-2, x0=None)
     
     self.actor_target=Actor(scope='target',target_network=None,env=self.env, flags=FLAGS)
     self.actor=Actor(scope='actor',target_network=self.actor_target,env=self.env, flags=FLAGS)
     
     self.critic_target=Critic(scope='target',target_network=None,env=self.env, flags=FLAGS)
     self.critic=Critic(scope='critic',target_network=self.critic_target,env=self.env, flags=FLAGS)
     
     init = tf.global_variables_initializer()
     self.session = tf.InteractiveSession()
     self.session.run(init)
     
     self.critic.set_session(self.session)
     self.actor.set_session(self.session)
     self.actor_target.set_session(self.session)
     self.critic_target.set_session(self.session)
     
     self.critic.init_target_network()
     self.actor.init_target_network()
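The Actor and Critic classes and their init_target_network methods are not shown here. DDPG target networks are usually tracked with a Polyak (soft) update; below is a minimal TF1-style sketch of that rule, assuming matched lists of online and target variables (the function name and tau value are assumptions):

def make_soft_update_ops(online_vars, target_vars, tau=0.001):
    # target <- tau * online + (1 - tau) * target, one assign op per variable pair
    return [t.assign(tau * o + (1.0 - tau) * t)
            for o, t in zip(online_vars, target_vars)]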
Example #17
    def __init__(self, s_size, a_size, seed):
        """

        Parameters:
            s_size (int): dimension of each state
            a_size (int): dimension of each action
            seed (int): random seed
        """
        self.s_size = s_size
        self.a_size = a_size
        self.seed = random.seed(seed)

        # Initialize both the Q-networks
        self.local_dqn = Model(s_size, a_size, seed).to(device)
        self.target_dqn = Model(s_size, a_size, seed).to(device)
        self.optimizer = optim.Adam(self.local_dqn.parameters(),
                                    lr=c.LEARNING_RATE)

        # Initialize experience deque
        self.buffer = ExperienceReplay(a_size, c.REPLAY_BUFFER_SIZE,
                                       c.BATCH_SIZE, seed)

        # Time step counter used for updating as per UPDATE_FREQUENCY
        self.t_step = 0
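A hedged sketch of the step() method that typically accompanies this agent: store every transition, then learn from a random batch every c.UPDATE_FREQUENCY steps. The buffer's add/sample methods, c.GAMMA, and the learn() helper are assumptions; only the attributes created in __init__ above come from the original:

    def step(self, state, action, reward, next_state, done):
        self.buffer.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % c.UPDATE_FREQUENCY
        if self.t_step == 0 and len(self.buffer) > c.BATCH_SIZE:
            experiences = self.buffer.sample()
            self.learn(experiences, c.GAMMA)  # hypothetical DQN update with local/target nets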
Example #18
pygame.key.set_repeat(1, 1)

env = GameEnvironment(DISPLAY_SHAPE, 1.0 / float(FPS))


def action_vector(a):
    res = np.zeros(9)
    res[int(a)] = 1.0
    return res


# Define Experience Replay
if SAVE_EXPERIENCE:
    er = ExperienceReplay.load(EXP_REPLAY_FILE)
    if er is None:
        er = ExperienceReplay(BUFFER_SIZE)


def gameover(hero_score):

    gameDisplay.fill(WHITE)

    font = pygame.font.SysFont(None, 42)
    text = font.render("GAME OVER", True, BLACK)
    gameDisplay.blit(text, (DISPLAY_SHAPE[0] / 3, DISPLAY_SHAPE[1] / 3))

    pygame.display.update()

    pygame.time.delay(3000)

Example #19
File: dqn.py  Project: alessandrobessi/2048
from experience_replay import ExperienceReplay
from logger import Logger

ACTIONS = {0: "UP", 1: "DOWN", 2: "RIGHT", 3: "LEFT"}
NUM_ACTIONS = len(ACTIONS)

NUM_GAMES = 30000
OBSERVE = 1000
MAX_TILE = 2048

epsilon = 0.1
min_epsilon = 1e-2
gamma_epsilon = 0.999
gamma_reward = 0.99

replay = ExperienceReplay(capacity=1e6)
logger = Logger()

online = PolicyNetwork(batch_size=32)
target = PolicyNetwork(batch_size=32)


def preprocess(a: np.array) -> np.array:
    a = np.where(a <= 0, 1, a)
    a = np.log2(a) / np.log2(MAX_TILE)
    return a


if __name__ == "__main__":

    best_score = 0
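A quick, illustrative check of the preprocess scaling above (the toy board is hypothetical): zero cells are clamped to 1 so they map to 0, and every tile is scaled by log2(MAX_TILE) into [0, 1]:

board = np.array([[0, 2], [4, 2048]])
print(preprocess(board))
# [[0.         0.09090909]
#  [0.18181818 1.        ]]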
Example #20
File: main.py  Project: apzarabi/dqn-ale
def main(_):
    # Reproducibility
    tf.reset_default_graph()
    np.random.seed(cfg.random_seed)
    tf.set_random_seed(cfg.random_seed)

    # Logging
    summary_writer = tf.summary.FileWriter(cfg.log_dir)

    if not cfg.evaluate and not tf.gfile.Exists(cfg.save_dir):
        tf.gfile.MakeDirs(cfg.save_dir)
    else:
        assert tf.gfile.Exists(cfg.save_dir)

    # TODO handle this
    episode_results_path = os.path.join(cfg.log_dir, "episodeResults.csv")
    episode_results = tf.gfile.GFile(episode_results_path, "w")
    episode_results.write("model_freq={},save_dir={}".format(
        cfg.model_freq, cfg.save_dir))
    episode_results.write("episode,reward,steps\n")
    episode_results.flush()

    # Setup ALE and DQN graph
    obs_shape = (84, 84, 1)
    input_height, input_width, _ = obs_shape

    dqn = DQN(input_height, input_width, cfg.num_actions)

    # Global step
    global_step = tf.train.get_or_create_global_step()
    increment_step = tf.assign_add(global_step, 1)

    # Save all variables
    vars_to_save = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope="agent/q")
    vars_to_save.append(global_step)
    saver = tf.train.Saver(var_list=vars_to_save)

    # Handle loading specific variables
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)

    restore_or_initialize_weights(sess, dqn, saver)
    sess.run(dqn.copy_to_target)

    if cfg.evaluate:
        # if in evaluation mode, saver is no longer needed
        saver = None

    # ##### Restoring AEs ########
    if not cfg.evaluate:
        vaes = create_generative_models(sess)
        image_summaries = []
        image_summaries_ph = tf.placeholder(tf.float32,
                                            shape=(4, 84, 84, 4),
                                            name="image_summaries_placeholder")
        for i in range(4):
            for j in range(4):
                image_summaries.append(
                    tf.summary.image(
                        "VAE_OUT_{}_{}".format(i, j),
                        tf.reshape(image_summaries_ph[i, :, :, j],
                                   (1, 84, 84, 1))))
    # ############################

    if not cfg.evaluate:
        summary_writer.add_graph(tf.get_default_graph())
        summary_writer.add_graph(vaes[0].graph)
        summary_writer.add_graph(vaes[1].graph)
        summary_writer.add_graph(vaes[2].graph)

        summary_writer.flush()

    # Initialize ALE
    postprocess_frame = lambda frame: sess.run(dqn.process_frame,
                                               feed_dict={dqn.image: frame})
    env = AtariEnvironment(obs_shape, postprocess_frame)

    # Replay buffer
    if not cfg.evaluate:
        replay_buffer = ExperienceReplay(cfg.replay_buffer_size, obs_shape)

    # Perform random policy to get some training data
    with tqdm(total=cfg.seed_frames,
              disable=cfg.disable_progress or cfg.evaluate) as pbar:
        seed_steps = 0
        while seed_steps * cfg.frame_skip < cfg.seed_frames and not cfg.evaluate:
            action = np.random.randint(cfg.num_actions)
            reward, next_state, terminal = env.act(action)
            seed_steps += 1

            replay_buffer.append(next_state[:, :, -1, np.newaxis], action,
                                 reward, terminal)

            if terminal:
                pbar.update(env.episode_frames)
                env.reset(inc_episode_count=False)

    if cfg.evaluate:
        assert cfg.max_episode_count > 0
    else:
        assert len(replay_buffer) >= cfg.seed_frames // cfg.frame_skip

    # Main training loop
    steps = tf.train.global_step(sess, global_step)
    env.reset(inc_episode_count=False)
    terminal = False

    total = cfg.max_episode_count if cfg.evaluate else cfg.num_frames
    with tqdm(total=total, disable=cfg.disable_progress) as pbar:
        # Loop while we haven't observed our max frame number
        # If we are at our max frame number we will finish the current episode
        while (not (
                # We must be evaluating or observed the last frame
                # As well as be terminal
                # As well as seen the maximum episode number
            (steps * cfg.frame_skip > cfg.num_frames or cfg.evaluate)
                and terminal and env.episode_count >= cfg.max_episode_count)):
            # Epsilon greedy policy with epsilon annealing
            if not cfg.evaluate and steps * cfg.frame_skip < cfg.eps_anneal_over:
                # Only compute epsilon step while we're still annealing epsilon
                epsilon = cfg.eps_initial - steps * (
                    (cfg.eps_initial - cfg.eps_final) / cfg.eps_anneal_over)
            else:
                epsilon = cfg.eps_final

            # Epsilon greedy policy
            if np.random.uniform() < epsilon:
                action = np.random.randint(0, cfg.num_actions)
            else:
                action = sess.run(dqn.action, feed_dict={dqn.S: [env.state]})

            # Perform environment step
            steps = sess.run(increment_step)
            reward, next_state, terminal = env.act(action)

            if not cfg.evaluate:
                replay_buffer.append(next_state[:, :, -1, np.newaxis], action,
                                     reward, terminal)

                # Sample and do gradient updates
                if steps % cfg.learning_freq == 0:
                    placeholders = [
                        dqn.S,
                        dqn.actions,
                        dqn.rewards,
                        dqn.S_p,
                        dqn.terminals,
                    ]
                    batch = replay_buffer.sample(cfg.batch_size)
                    train_op = [dqn.train]
                    if steps % (cfg.learning_freq * cfg.model_freq) == 0:
                        experience_batch = batch
                        batch = imagined_batch(vaes, batch[1])
                        if steps / (cfg.learning_freq * cfg.model_freq) < 10:
                            placeholders.append(image_summaries_ph)
                            batch = list(batch)
                            batch.append(batch[0][
                                np.random.randint(0, 32, size=4), :, :, :])
                            train_op.extend(image_summaries)
                    if steps % cfg.log_summary_every == 0:
                        train_op.append(dqn.summary)
                    result = sess.run(
                        train_op,
                        feed_dict=dict(zip(placeholders, batch)),
                    )
                    if len(result) > 1:
                        for i in range(1, len(result)):
                            summary_writer.add_summary(result[i],
                                                       global_step=steps)
                if steps % cfg.target_update_every == 0:
                    sess.run([dqn.copy_to_target])
                if steps % cfg.model_chkpt_every == 0:
                    saver.save(sess,
                               "%s/model_epoch_%04d" % (cfg.save_dir, steps))

            if terminal:
                episode_results.write("%d,%d,%d\n" %
                                      (env.episode_count, env.episode_reward,
                                       env.episode_frames))
                episode_results.flush()
                # Log episode summaries to Tensorboard
                add_simple_summary(summary_writer, "episode/reward",
                                   env.episode_reward, env.episode_count)
                add_simple_summary(summary_writer, "episode/frames",
                                   env.episode_frames, env.episode_count)

                pbar.update(env.episode_frames if not cfg.evaluate else 1)
                env.reset()

    episode_results.close()
    tf.logging.info("Finished %d %s" % (
        cfg.max_episode_count if cfg.evaluate else cfg.num_frames,
        "episodes" if cfg.evaluate else "frames",
    ))
Example #21
                               downsampling_pix_values=None,
                               atari_frameskip=args.atari_frameskip)
        eval_fn = get_evaluate_fn(env_eval=env_eval,
                                  preproc_obs_fn=preproc_obs_fn,
                                  policy_NN=call_model,
                                  args=args)

    process = psutil.Process()
    memory_usage_fn = lambda: process.memory_info().rss

    stats = Stats(use_tensorboard=args.use_tensorboard, log_path=log_path)
    experience_keys = ["observations", "target_policy"]
    if args.compute_value:
        experience_keys.append("returns")

    experience_replay = ExperienceReplay(keys=experience_keys,
                                         capacity=args.replay_capacity)

    run_episode_fn = get_episode_fn(
        actor=high_level_actor if args.hierarchical else low_level_actor,
        planner=high_level_planner if args.hierarchical else low_level_planner,
        train_fn=train_fn,
        dataset=experience_replay,
        add_returns=args.compute_value,
        stats=stats,
        memory_usage_fn=memory_usage_fn,
        preproc_obs_fn=preproc_obs_fn,
        eval_fn=eval_fn,
        n_actions=env.action_space.n,
        value_scalars_to_distrs=value_scalars_to_distrs,
        value_logits_to_scalars=value_logits_to_scalars,
        args=args)
Example #22
from training_testing import test

# parameters
epsilon = 0.1  # exploration
max_memory = 500  # Maximum number of experiences we are storing
hidden_size = 100  # Size of the hidden layers
batch_size = 1  # Number of experiences we use for training per batch
epoch = 50


def baseline_model(grid_size, num_actions, hidden_size):
    # setting up the model with Keras
    model = Sequential()
    model.add(
        Dense(hidden_size, input_shape=(grid_size**2, ), activation='relu'))
    model.add(Dense(hidden_size, activation='relu'))
    model.add(Dense(num_actions))
    model.compile(SGD(lr=.1), "mse")
    return model


# Define environment/game
env = Catch()

# Initialize experience replay object
exp_replay = ExperienceReplay(max_memory=max_memory)

model = baseline_model(grid_size, num_actions, hidden_size)
train(env, model, exp_replay, epoch, epsilon, num_actions, batch_size)
test(model)
Example #23
from qnet import QNetAgent
from torch.utils.tensorboard import SummaryWriter

# if gpu is to be used
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
Tensor = torch.Tensor
LongTensor = torch.LongTensor
random_seed = 42
torch.manual_seed(random_seed)
random.seed(random_seed)

writer = SummaryWriter()

actionSpace = ActionSpace()
memory = ExperienceReplay(config.replay_mem_size)
qnet_agent = QNetAgent()

steps_total = []

frames_total = 0
solved_after = 0
solved = False

start_time = time.time()

# Main loop
step = 0
total_reward = 0
done = False
gamestate = console.step()
Example #24
    def __init__(self,
            env,
            obs_size = (115,),
            num_frame_stack = 1,
            batch_size = 32,
            mdp_gamma = 0.95,
            initial_epsilon = 1.0,
            min_epsilon = 0.1,
            epsilon_decay_steps = int(1e6),
            replay_capacity = int(1e5),
            min_replay_size = int(1e3),
            train_freq = 4,
            network_update_freq = 5000,
            regularization = 1e-6,
            optimizer_params = None,
            render = False):

            """
            Initialization function
            
            param env:                object. a gym-like environment which our RL agent interacts with
            param obs_size:           list. the shape of the observation, e.g. (115,) for a vector observation or (32,32) for an image observation
            param num_frame_stack:    int. number of stacked frames for network input
            param batch_size:         int. batch size
            param mdp_gamma:          float. MDP discount factor
            param initial_epsilon:    float. epsilon parameter of epsilon-greedy policy
            param min_epsilon:        float. minimum epsilon parameter of epsilon-greedy policy
            param epsilon_decay_steps: int. how many steps to decay epsilon 
            param replay_capacity:    int. replay buffer size
            param min_replay_size:    int. minimum replay buffer size
            param train_freq:         int. training frequency
            param network_update_freq: int. network update frequency
            param regularization:     float. regularization coefficient
            param optimizer_params:   dict. optimizer-specific parameters, e.g. learning rate, momentum
            param render:             bool. is render mode on?
            """
            
            # experience replay buffer for training
            self.exp_buffer = ExperienceReplay(
                num_frame_stack,
                capacity=replay_capacity,
                obs_size = obs_size
            )

            # experience replay buffer for playing/testing
            self.play_buffer = ExperienceReplay(
                num_frame_stack,
                capacity=num_frame_stack * 10,
                obs_size = obs_size
            )

            self.env = env
            self.obs_size = obs_size
            self.num_frame_stack = num_frame_stack
            self.batch_size = batch_size
            self.mdp_gamma = mdp_gamma
            self.initial_epsilon = initial_epsilon
            self.min_epsilon = min_epsilon
            self.epsilon_decay_steps = epsilon_decay_steps
            self.replay_capacity = replay_capacity
            self.min_replay_size = min_replay_size
            self.train_freq = train_freq
            self.network_update_freq = network_update_freq
            self.regularization = regularization
            self.render = render

            self.dim_actions = env.action_space.n
            self.dim_state = (num_frame_stack,) + self.obs_size

            if optimizer_params:
                self.optimizer_params = optimizer_params
            else:
                self.optimizer_params = dict(learning_rate = 0.0001, epsilon = 1e-7)

            self.is_training = True
            # epsilon used for playing
            # if 0, means that we just use the Q-network's optimal action without any exploration
            self.playing_epsilon = 0.0
            
            self.session = None
            
            self.global_counter = 0
            self.episode_counter = 0
            self.loss_history = []
Example #25
from __future__ import division, print_function

import gym
import gym_gazebo
import numpy as np
import sys
import os
from ddq_model import Qnet
from experience_replay import ExperienceReplay
from utils import Config

argv = sys.argv[1:]
config = Config(argv)
env = gym.make('GazeboTurtlebotMazeColor-v0')
replay = ExperienceReplay(config.args.output_dir,
                          config.args.replay_buffer_size)
qnet = Qnet(env.num_state, env.num_action)

if config.args.continue_from is not None:
    qnet.load(config.args.continue_from)
    replay.load(config.args.continue_from)

elif config.args.from_pretrain is not None:
    qnet.load(config.args.from_pretrain)

epsilon = config.args.start_epsilon
epsilon_decay = (config.args.start_epsilon -
                 config.args.end_epsilon) / config.args.annealing_steps

while True:
    state = env.reset()
Example #26
        plan_step_fn = get_pi_iw_planning_step_fn(
            actor=actor,
            planner=planner,
            policy_fn=network_policy,
            tree_budget=tree_budget,
            discount_factor=discount_factor,
            temp=policy_temp)
        learner = SupervisedPolicy(model,
                                   optimizer,
                                   regularization_factor=regularization_factor,
                                   use_graph=True)

    # Initialize experience replay: run complete episodes until we exceed both batch_size and dataset_min_transitions
    print("Initializing experience replay", flush=True)
    train_stats = TrainStats()
    experience_replay = ExperienceReplay(capacity=replay_capacity)
    while len(experience_replay) < batch_size or len(
            experience_replay) < replay_min_transitions:
        episode_rewards = run_episode(
            plan_step_fn=plan_step_fn,
            learner=None,
            dataset=experience_replay,
            cache_subtree=cache_subtree,
            add_returns=(args.algorithm == "AlphaZero"),
            preproc_obs_fn=preproc_obs_fn,
            render=args.render)
        train_stats.report(episode_rewards, actor.nodes_generated)

    # Interleave planning and learning steps
    print("\nInterleaving planning and learning steps.", flush=True)
    while actor.nodes_generated < max_simulator_steps:
Example #27
File: end_to_end.py  Project: shir994/L-GSO
def end_to_end_training(
        epochs: int,
        model_cls: BaseConditionalGenerationOracle,
        optimizer_cls: BaseOptimizer,
        optimized_function_cls: BaseConditionalGenerationOracle,
        logger: BaseLogger,
        model_config: dict,
        optimizer_config: dict,
        n_samples_per_dim: int,
        step_data_gen: float,
        n_samples: int,
        current_psi: Union[List[float], torch.tensor],
        reuse_optimizer: bool = False,
        reuse_model: bool = False,
        shift_model: bool = False,
        finetune_model: bool = False,
        use_experience_replay: bool = True,
        add_box_constraints: bool = False,
        experiment=None,
        scale_psi=False):
    """

    :param epochs: int
        number of local training steps to perform
    :param model_cls: BaseConditionalGenerationOracle
        model that is able to generate samples and calculate loss function
    :param optimizer_cls: BaseOptimizer
    :param logger: BaseLogger
    :param model_config: dict
    :param optimizer_config: dict
    :param n_samples_per_dim: int
    :param step_data_gen: float
    :param n_samples: int
    :param current_psi:
    :param reuse_model:
    :param reuse_optimizer:
    :param finetune_model:
    :param shift_model:

    :return:
    """
    gan_logger = GANLogger(experiment)
    # gan_logger = RegressionLogger(experiment)
    # gan_logger = None

    y_sampler = optimized_function_cls(device=device, psi_init=current_psi)
    model = model_cls(y_model=y_sampler, **model_config,
                      logger=gan_logger).to(device)

    optimizer = optimizer_cls(oracle=model, x=current_psi, **optimizer_config)
    print(model_config)
    exp_replay = ExperienceReplay(psi_dim=model_config['psi_dim'],
                                  y_dim=model_config['y_dim'],
                                  x_dim=model_config['x_dim'],
                                  device=device)
    weights = None

    logger.log_performance(y_sampler=y_sampler,
                           current_psi=current_psi,
                           n_samples=n_samples)
    for epoch in range(epochs):
        # generate new data sample
        x, condition = y_sampler.generate_local_data_lhs(
            n_samples_per_dim=n_samples_per_dim,
            step=step_data_gen,
            current_psi=current_psi,
            n_samples=n_samples)
        if x is None and condition is None:
            print("Empty training set, continue")
            continue
        x_exp_replay, condition_exp_replay = exp_replay.extract(
            psi=current_psi, step=step_data_gen)
        exp_replay.add(y=x, condition=condition)
        x = torch.cat([x, x_exp_replay], dim=0)
        condition = torch.cat([condition, condition_exp_replay], dim=0)
        used_samples = n_samples

        # breaking things
        if model_config.get("predict_risk", False):
            condition = condition[::n_samples_per_dim, :current_psi.shape[0]]
            x = y_sampler.func(condition,
                               num_repetitions=n_samples_per_dim).reshape(
                                   -1, x.shape[1])
        print(x.shape, condition.shape)
        ## Scale train set
        if scale_psi:
            scale_factor = 10
            feature_max = condition[:, :model_config['psi_dim']].max(axis=0)[0]
            y_sampler.scale_factor = scale_factor
            y_sampler.feature_max = feature_max
            y_sampler.scale_psi = True
            print("MAX FEATURES", feature_max)
            condition[:, :
                      model_config['psi_dim']] /= feature_max * scale_factor
            current_psi = current_psi / feature_max * scale_factor
            print(feature_max.shape, current_psi.shape)
            print("MAX PSI", current_psi)

        model.train()
        if reuse_model:
            if shift_model:
                if isinstance(model, ShiftedOracle):
                    model.set_shift(current_psi.clone().detach())
                else:
                    model = ShiftedOracle(oracle=model,
                                          shift=current_psi.clone().detach())
                model.fit(x, condition=condition, weights=weights)
            else:
                model.fit(x, condition=condition, weights=weights)
        else:
            # if not reusing model
            # then at each epoch re-initialize and re-fit
            model = model_cls(y_model=y_sampler,
                              **model_config,
                              logger=gan_logger).to(device)
            print("y_shape: {}, cond: {}".format(x.shape, condition.shape))
            model.fit(x, condition=condition, weights=weights)

        model.eval()

        if reuse_optimizer:
            optimizer.update(oracle=model, x=current_psi)
        else:
            # find new psi
            optimizer = optimizer_cls(oracle=model,
                                      x=current_psi,
                                      **optimizer_config)

        if add_box_constraints:
            box_barriers = make_box_barriers(current_psi, step_data_gen)
            add_barriers_to_oracle(oracle=model, barriers=box_barriers)

        previous_psi = current_psi.clone()
        current_psi, status, history = optimizer.optimize()
        if scale_psi:
            current_psi, status, history = optimizer.optimize()
            current_psi = current_psi / scale_factor * feature_max
            y_sampler.scale_psi = False
            print("NEW_PSI: ", current_psi)

        try:
            # logging optimization, i.e. statistics of psi
            logger.log_grads(model,
                             y_sampler,
                             current_psi,
                             n_samples_per_dim,
                             log_grad_diff=False)
            logger.log_optimizer(optimizer)
            logger.log_performance(y_sampler=y_sampler,
                                   current_psi=current_psi,
                                   n_samples=n_samples)
            experiment.log_metric("used_samples_per_step", used_samples)
            experiment.log_metric("sample_size", len(x))

        except Exception as e:
            print(e)
            print(traceback.format_exc())
            # raise
        torch.cuda.empty_cache()
    logger.func_saver.join()
    return