def __init__(self, random_action_method, future_discount=0.75, learning_rate=0.001, saveAndLoad=True):
    # Rescale the learning rate relative to a reference discount factor of 0.8
    learning_rate = learning_rate * (1 - future_discount) / (1 - 0.8)

    self.model_a = RLModel()
    self.model_a.build((None, AGENT_INPUT_SIZE))
    self.model_b = RLModel()
    self.model_b.build((None, AGENT_INPUT_SIZE))

    self.saveAndLoad = saveAndLoad
    if os.path.isfile(SAVE_PATH_A) and os.path.isfile(SAVE_PATH_B) and saveAndLoad:
        print("Loading")
        self.model_a.load_weights(SAVE_PATH_A)
        self.model_b.load_weights(SAVE_PATH_B)

    self.exp_rep_a = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)
    self.exp_rep_b = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)

    self.random_action_method = random_action_method
    self.learning_rate = learning_rate
    self.future_discount = future_discount

    self.loss_measure = tf.losses.MeanSquaredError()
    self.opt = tf.optimizers.Adam(learning_rate=self.learning_rate)

    self.n_since_last_train = 0
    self.latestLoss = tf.add(0, 0)
def __init__(self, env, batchsize=64, pic_size=(96, 96), num_frame_stack=4, gamma=0.95,
             frame_skip=1, train_freq=4, initial_epsilon=1.0, min_epsilon=0.1, render=True,
             epsilon_decay_steps=int(1e6), min_experience_size=int(1e3),
             experience_capacity=int(1e5), network_update_freq=5000, regularization=1e-6,
             optimizer_params=None, action_map=None):
    self.exp_history = ExperienceReplay(num_frame_stack,
                                        capacity=experience_capacity,
                                        pic_size=pic_size)
    self.playing_cache = ExperienceReplay(num_frame_stack,
                                          capacity=num_frame_stack * 5 + 10,
                                          pic_size=pic_size)
    self.network_update_freq = network_update_freq
    self.action_map = action_map
    self.env = env
    self.batchsize = batchsize
    self.num_frame_stack = num_frame_stack
    self.gamma = gamma
    self.frame_skip = frame_skip
    self.train_freq = train_freq
    self.initial_epsilon = initial_epsilon
    self.min_epsilon = min_epsilon
    self.epsilon_decay_steps = epsilon_decay_steps
    self.render = render
    self.min_experience_size = min_experience_size
    self.pic_size = pic_size
    self.regularization = regularization
    self.optimizer_params = optimizer_params or dict(learning_rate=0.0004, epsilon=1e-7)
    self.do_training = True
    self.playing_epsilon = 0.0
    self.session = None
    self.state_size = (self.num_frame_stack,) + self.pic_size
    self.global_counter = 0
    self.episode_counter = 0
    if action_map is not None:
        self.dim_actions = len(action_map)
    else:
        self.dim_actions = env.action_space.n
    self.q_values = []
    self.loss_his = []
def __init__(self):
    self.batch_size = 64           # How many experiences to use for each training step
    self.train_frequency = 5       # How often to update the network
    self.num_epochs = 20           # How many epochs to train for when updating the network
    self.y = 0.99                  # Discount factor
    self.prob_random_start = 0.6   # Starting chance of a random action
    self.prob_random_end = 0.1     # Ending chance of a random action
    self.annealing_steps = 1000.   # Steps of training over which to anneal from start_e to end_e
    self.max_num_episodes = 10000  # Maximum number of episodes allowed for training
    self.min_pre_train_episodes = 100  # Number of episodes played with random actions before training starts
    self.max_num_step = 50         # Maximum allowed episode length
    self.goal = 15                 # Number of rewards we want to achieve while playing a game

    # Set up the environment
    self.env = gameEnv(partial=False, size=5)

    # Reset the Keras session
    K.clear_session()

    # Set up our Q-networks
    self.main_qn = Qnetwork()
    self.target_qn = Qnetwork()

    # Set up our experience replay
    self.experience_replay = ExperienceReplay()
def __init__(self, random_action_method, future_discount=0.75, learning_rate=0.001, load_path=None):
    # Rescale the learning rate relative to a reference discount factor of 0.8
    learning_rate = learning_rate * (1 - future_discount) / (1 - 0.8)

    self.model = RLModel()
    self.model.build((None, AGENT_INPUT_SIZE))

    self.load_path = load_path
    if load_path is not None and os.path.isfile(load_path):
        print("Loading")
        self.model.load_weights(load_path)

    self.exp_rep = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)

    self.random_action_method = random_action_method
    self.learning_rate = learning_rate
    self.future_discount = future_discount

    self.loss_measure = tf.losses.MeanSquaredError()
    self.opt = tf.optimizers.Adam(learning_rate=self.learning_rate)

    self.n_since_last_train = 0
    self.latestLoss = tf.add(0, 0)
def __init__(self):
    self.eps = 0.1
    self.env = GridEnv(3)
    self.batch_size = 20

    if prioritized_replay and replay_type == "proportional":
        self.replay = ProportionalReplay(max_buffer_size, prioritized_replay_alpha)
    elif prioritized_replay and replay_type == "ranked":
        N_list = [self.batch_size] + [int(x) for x in np.linspace(100, max_buffer_size, 5)]
        save_quantiles(N_list=N_list, k=self.batch_size, alpha=prioritized_replay_alpha)
        self.replay = RankBasedReplay(max_buffer_size, prioritized_replay_alpha)
    else:
        self.replay = ExperienceReplay(max_buffer_size)  # passing size of buffer

    # define graph
    self.inputs = tf.placeholder(tf.float32, shape=(None, self.env.state_size))
    self.target_values = tf.placeholder(tf.float32, shape=(None,))
    self.actions = tf.placeholder(tf.int32, shape=(None,))
    self.is_weights = tf.placeholder(tf.float32, shape=(None,))  # importance sampling weights for prioritized replay

    self.Q_out_op, self.Q_update_op, self.td_error_op = self.build_graph()  # build main network
    self.target_Q_out_op, _, _ = self.build_graph('target')  # build identical target network

    self.init_op = tf.global_variables_initializer()
    self.sess = tf.Session()
def __init__(self):
    self.experience_replay = ExperienceReplay('BreakoutDeterministic-v0',
                                              FLAGS.replay_buffer_size, 84, 84, 4,
                                              self.policy, FLAGS.decay_to_epoch)
    config = DQNConfig()
    config.learning_rate = FLAGS.learning_rate
    config.gamma = FLAGS.gamma
    config.decay = FLAGS.decay
    config.momentum = FLAGS.momentum
    config.eps = FLAGS.eps
    config.input_width = FLAGS.image_width
    config.input_height = FLAGS.image_height
    config.skip = FLAGS.skip
    self.dqn = DQN(config, FLAGS.use_huber)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)

    logger.info('initializing variables...')
    self.sess.run(tf.global_variables_initializer())
    self.update_target()

    self.epoch = 0
    self.decay_epsilon()
def test_observation_zeroing(self):
    """ Tests zeroing out of frames not from the current episode """
    obs_shape = (84, 84, 1)
    er = ExperienceReplay(5, obs_shape)
    for terminal_idx in range(5):
        obs_ = []
        obs_next_ = []
        for i in range(1, 6):
            partial_obs = np.ones(obs_shape) * i
            terminal = 1 if i == terminal_idx else 0
            er.append(partial_obs, 0, 0, terminal)
            # Frames recorded at or before the terminal frame are expected to be
            # zeroed out in the stacked observations.
            if i <= terminal_idx:
                partial_obs *= 0
            if i < 5:
                obs_.append(partial_obs)
            if i > 1:
                obs_next_.append(partial_obs)
        obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
        obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))
        batch = er.sample(1)
        obs, rewards, actions, obs_next, terminals = batch
        assert np.array_equal(obs_, obs)
        assert np.array_equal(obs_next_, obs_next)
def init():
    train_env = SquigglesEnvironment(num_notes=2)
    evaluation_env = SquigglesEnvironment(num_notes=2)

    train_env = tf_py_environment.TFPyEnvironment(train_env)
    evaluation_env = tf_py_environment.TFPyEnvironment(evaluation_env)

    agent, _ = generic_dqn_agent(train_env)
    experience_replay = ExperienceReplay(agent, train_env, BATCH_SIZE)

    return agent, train_env, evaluation_env, experience_replay
def run_episode(plan_step_fn, learner, dataset, cache_subtree, add_returns,
                preproc_obs_fn=None, render=False):
    episode_done = False
    actor.reset()
    episode_rewards = []
    aux_replay = ExperienceReplay()  # New auxiliary buffer to save current episode transitions
    while not episode_done:
        # Planning step
        tree_policy = plan_step_fn(len(episode_rewards))

        # Execute action (choose one node as the new root from depth 1)
        a = sample_pmf(tree_policy)
        prev_root_data, current_root_data = actor.step(a, cache_subtree, render,
                                                       render_size=(512, 512))
        aux_replay.append({"observations": prev_root_data["obs"],
                           "target_policy": tree_policy})
        episode_rewards.append(current_root_data["r"])
        episode_done = current_root_data["done"]

        # Learning step
        if learner is not None:
            batch = dataset.sample(batch_size)
            if preproc_obs_fn is not None:
                batch["observations"] = preproc_obs_fn(batch["observations"])
            obs = tf.constant(batch["observations"], dtype=tf.float32)
            target_policy = tf.constant(batch["target_policy"], dtype=tf.float32)
            if add_returns:
                returns = tf.constant(batch["returns"], dtype=tf.float32)
                loss, _ = learner.train_step(obs, target_policy, returns)
            else:
                loss, _ = learner.train_step(obs, target_policy)

    # Add the episode to the dataset
    if add_returns:
        returns = compute_returns(episode_rewards, discount_factor)  # Backpropagate rewards
        aux_replay.add_column("returns", returns)  # Add them to the dataset
    dataset.extend(aux_replay)  # Add transitions to the buffer that will be used for learning

    return episode_rewards
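run_episode above treats the replay buffer as a keyed, table-like store: append a dict per transition, add_column for returns, extend one buffer into another, and sample a dict batch. The sketch below shows one way such a buffer could look; the class name KeyedExperienceReplay and every implementation detail are assumptions for illustration, not the project's actual ExperienceReplay.

import random
import numpy as np


class KeyedExperienceReplay:
    """Illustrative keyed replay buffer (assumption, not the original implementation)."""

    def __init__(self, keys=("observations", "target_policy"), capacity=10000):
        self.keys = list(keys)
        self.capacity = capacity
        self.data = {k: [] for k in self.keys}

    def __len__(self):
        return len(self.data[self.keys[0]])

    def append(self, row):
        # row is a dict mapping each key to a single transition's value
        for k in self.keys:
            self.data[k].append(row[k])
        self._evict()

    def add_column(self, key, values):
        # Attach a new per-transition column (e.g. returns) to the buffer
        assert len(values) == len(self)
        if key not in self.keys:
            self.keys.append(key)
        self.data[key] = list(values)

    def extend(self, other):
        # Merge another buffer with the same keys into this one
        for k in self.keys:
            self.data[k].extend(other.data[k])
        self._evict()

    def sample(self, batch_size):
        # Uniformly sample transitions and return one array per key
        idx = [random.randrange(len(self)) for _ in range(batch_size)]
        return {k: np.asarray([self.data[k][i] for i in idx]) for k in self.keys}

    def _evict(self):
        # Drop the oldest entries once capacity is exceeded
        overflow = len(self) - self.capacity
        if overflow > 0:
            for k in self.keys:
                self.data[k] = self.data[k][overflow:]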
def test_sampling(self):
    """ Tests sampling of rewards, actions and terminals from stored transitions """
    obs_shape = (84, 84, 1)
    er = ExperienceReplay(5, obs_shape)
    for i in range(1, 6):
        partial_obs = np.ones(obs_shape) * i
        er.append(partial_obs, 1, 1, 0)
    batch = er.sample(1)
    _, rewards, actions, _, terminals = batch
    assert np.array_equal(rewards, np.array([1]))
    assert np.array_equal(actions, np.array([1]))
    assert np.array_equal(terminals, np.array([0]))
def __init__(self):
    self.prob_random = 1.0         # Probability of playing a random action
    self.y = .99                   # Discount factor
    self.batch_size = 64           # How many experiences to use for each training step
    self.prob_random_end = .01     # Ending chance of a random action
    self.prob_random_decay = .996  # Decay rate of the random-action probability
    self.max_episode = 300         # Maximum number of episodes allowed for training
    self.expected_goal = 200       # Expected goal

    self.dnn = DNN()
    self.env = gym.make('CartPole-v0')
    self.memory = ExperienceReplay(buffer_size=10000)
    self.metadata = []  # score info is stored here at the end of each episode
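The exploration parameters above imply a per-episode multiplicative decay; the line below sketches how they would typically be applied at the end of each episode (an assumption for illustration, not the class's actual training loop).

# Illustrative per-episode exploration decay (assumption, not the original loop):
# shrink prob_random multiplicatively after each episode, but never below prob_random_end.
self.prob_random = max(self.prob_random_end, self.prob_random * self.prob_random_decay)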
def main():
    hist_length = 50

    processor = Processor(history_length=hist_length)
    price_history = processor.fetchData()
    train_price_history = price_history['train']
    test_price_history = price_history['test']

    env = Environment(horizon=20,
                      train_price_history=train_price_history,
                      test_price_history=test_price_history,
                      history_length=hist_length)
    exp_replay = ExperienceReplay()
    agent = Agent(feature_size=6,
                  window=hist_length,
                  action_size=3,
                  experience_replay=exp_replay,
                  environment=env)

    agent.train()
    print("Agent done training, now testing: ")
    agent.test(test_price_history)
def __init__(self):
    # gamma is a parameter of the Q-learning algorithm
    self.gamma = 0.9
    # We use an epsilon-greedy learning strategy
    self.epsilon = 1
    self.epsilon_decay = 0.99
    self.epsilon_min = 0.01
    # Number of epochs (fully played games) used to train the agent
    self.epochs = 500
    # Game to play
    self.game = Game()
    # Number of hidden layer nodes
    self.hidden_layer_nodes = 20

    # Create the Keras model
    # _________________________________________________________________
    # Layer (type)                 Output Shape              Param #
    # =================================================================
    # dense_1 (Dense)              (None, 20)                120
    # _________________________________________________________________
    # dense_2 (Dense)              (None, 20)                420
    # _________________________________________________________________
    # dense_3 (Dense)              (None, 5)                 105
    # =================================================================
    # Total params: 645
    # Trainable params: 645
    # Non-trainable params: 0
    # _________________________________________________________________
    self.model = Sequential()
    self.model.add(Dense(self.hidden_layer_nodes, input_dim=self.game.state_size, activation='relu'))
    self.model.add(Dense(self.hidden_layer_nodes, activation='relu'))
    self.model.add(Dense(len(POSSIBLE_ACTIONS), activation='linear'))
    self.model.compile('Adam', loss='mse')

    # Initialize experience replay
    self.experience_replay = ExperienceReplay(size=2000)
    self.batch_size = 20
    self.max_turns = 100
def __init__(self, env, net_update_rate: int = 25,
             exploration_rate: float = 1.0, exploration_decay: float = 0.00005):
    # set hyperparameters
    self.exploration_rate = exploration_rate
    self.exploration_decay = exploration_decay
    self.net_updating_rate = net_update_rate

    # set environment
    self.env = env
    self.state_shape = env.get_state_shape()
    self.action_shape = env.get_action_shape()

    # Experience replay for batch learning
    self.exp_rep = ExperienceReplay()

    # Deep Q-Network
    self.net = None
def test_observation_construction(self):
    """ Tests observation construction from partial observations """
    obs_shape = (84, 84, 1)
    er = ExperienceReplay(5, obs_shape)
    obs_ = []
    obs_next_ = []
    for i in range(1, 6):
        partial_obs = np.ones(obs_shape) * i
        if i < 5:
            obs_.append(partial_obs)
        if i > 1:
            obs_next_.append(partial_obs)
        er.append(partial_obs, 0, 0, 0)
    obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
    obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))
    batch = er.sample(1)
    obs, rewards, actions, obs_next, terminals = batch
    assert np.array_equal(obs_, obs)
    assert np.array_equal(obs_next_, obs_next)
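The tests above pin down an assumed ExperienceReplay contract: construction with (capacity, obs_shape), append(frame, action, reward, terminal) for single frames, and sample(batch_size) returning stacked 4-frame observations with frames from a previous episode zeroed out. The sketch below is one way to satisfy that contract; the class name MinimalFrameStackReplay and all internals are assumptions, not the implementation under test.

import numpy as np


class MinimalFrameStackReplay:
    """Illustrative frame-stacking replay buffer (assumption, not the tested class)."""

    def __init__(self, capacity, obs_shape, stack=4):
        self.capacity = capacity
        self.obs_shape = obs_shape
        self.stack = stack
        self.frames, self.actions, self.rewards, self.terminals = [], [], [], []

    def append(self, obs, action, reward, terminal):
        # Store a single frame plus its transition metadata, trimming the oldest entry
        # once capacity is exceeded.
        self.frames.append(np.array(obs, copy=True))
        self.actions.append(action)
        self.rewards.append(reward)
        self.terminals.append(terminal)
        if len(self.frames) > self.capacity:
            for buf in (self.frames, self.actions, self.rewards, self.terminals):
                del buf[0]

    def _stacked(self, end):
        # Stack `stack` consecutive frames ending at index `end` along the channel axis,
        # zeroing every frame recorded at or before a terminal flag inside the window.
        window = [self.frames[j].copy() for j in range(end - self.stack + 1, end + 1)]
        for k in range(self.stack):
            j = end - self.stack + 1 + k
            if self.terminals[j]:
                for m in range(k + 1):
                    window[m] = np.zeros(self.obs_shape)
        return np.concatenate(window, axis=-1)

    def sample(self, batch_size):
        # Pick indices so that both the stacked obs (ending at i - 1) and the stacked
        # next obs (ending at i) lie fully inside the buffer.
        idx = np.random.randint(self.stack, len(self.frames), size=batch_size)
        obs = np.stack([self._stacked(i - 1) for i in idx])
        obs_next = np.stack([self._stacked(i) for i in idx])
        rewards = np.array([self.rewards[i] for i in idx])
        actions = np.array([self.actions[i] for i in idx])
        terminals = np.array([self.terminals[i] for i in idx])
        return obs, rewards, actions, obs_next, terminals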
def __init__(self, FLAGS):
    """
    This class builds the model that implements the deterministic policy gradient algorithm.

    :param FLAGS: TensorFlow flags which contain the values for hyperparameters
    """
    self.FLAGS = FLAGS
    self.env = gym.make('Pendulum-v0')
    self.state_size = len(self.env.observation_space.sample())
    self.num_episodes = 1000
    self.batch_size = 64

    self.exp_replay = ExperienceReplay(50000, 1500, FLAGS)
    self.action_noise = OrnsteinUhlenbeckActionNoise(self.env, mu=0.0, sigma=0.2,
                                                     theta=.15, dt=1e-2, x0=None)

    self.actor_target = Actor(scope='target', target_network=None, env=self.env, flags=FLAGS)
    self.actor = Actor(scope='actor', target_network=self.actor_target, env=self.env, flags=FLAGS)
    self.critic_target = Critic(scope='target', target_network=None, env=self.env, flags=FLAGS)
    self.critic = Critic(scope='critic', target_network=self.critic_target, env=self.env, flags=FLAGS)

    init = tf.global_variables_initializer()
    self.session = tf.InteractiveSession()
    self.session.run(init)

    self.critic.set_session(self.session)
    self.actor.set_session(self.session)
    self.actor_target.set_session(self.session)
    self.critic_target.set_session(self.session)

    self.critic.init_target_network()
    self.actor.init_target_network()
def __init__(self, s_size, a_size, seed):
    """
    Parameters:
        s_size (int): dimension of each state
        a_size (int): dimension of each action
        seed (int): random seed
    """
    self.s_size = s_size
    self.a_size = a_size
    self.seed = random.seed(seed)

    # Initialize both the Q-networks
    self.local_dqn = Model(s_size, a_size, seed).to(device)
    self.target_dqn = Model(s_size, a_size, seed).to(device)
    self.optimizer = optim.Adam(self.local_dqn.parameters(), lr=c.LEARNING_RATE)

    # Initialize experience deque
    self.buffer = ExperienceReplay(a_size, c.REPLAY_BUFFER_SIZE, c.BATCH_SIZE, seed)

    # Time step counter used for updating as per UPDATE_FREQUENCY
    self.t_step = 0
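The t_step counter above is only meaningful together with a step routine that triggers learning every UPDATE_FREQUENCY calls; a minimal sketch of such a routine follows. The method name step, the buffer calls add/sample, the learn method, and the constants c.UPDATE_FREQUENCY and c.GAMMA are assumptions for illustration, not necessarily the original class's code.

def step(self, state, action, reward, next_state, done):
    # Store the transition in the replay buffer (assumed buffer API).
    self.buffer.add(state, action, reward, next_state, done)
    # Advance the counter and learn every UPDATE_FREQUENCY steps,
    # once enough samples have been collected (assumed constants).
    self.t_step = (self.t_step + 1) % c.UPDATE_FREQUENCY
    if self.t_step == 0 and len(self.buffer) > c.BATCH_SIZE:
        experiences = self.buffer.sample()
        self.learn(experiences, c.GAMMA)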
pygame.key.set_repeat(1, 1)

env = GameEnvironment(DISPLAY_SHAPE, 1.0 / float(FPS))


def action_vector(a):
    res = np.zeros(9)
    res[int(a)] = 1.0
    return res


# Define the experience replay buffer
if SAVE_EXPERIENCE:
    er = ExperienceReplay.load(EXP_REPLAY_FILE)
    if er is None:
        er = ExperienceReplay(BUFFER_SIZE)


def gameover(hero_score):
    gameDisplay.fill(WHITE)
    font = pygame.font.SysFont(None, 42)
    text = font.render("GAME OVER", True, BLACK)
    gameDisplay.blit(text, (DISPLAY_SHAPE[0] / 3, DISPLAY_SHAPE[1] / 3))
    pygame.display.update()
    pygame.time.delay(3000)
from experience_replay import ExperienceReplay
from logger import Logger

ACTIONS = {0: "UP", 1: "DOWN", 2: "RIGHT", 3: "LEFT"}
NUM_ACTIONS = len(ACTIONS)
NUM_GAMES = 30000
OBSERVE = 1000
MAX_TILE = 2048

epsilon = 0.1
min_epsilon = 1e-2
gamma_epsilon = 0.999
gamma_reward = 0.99

replay = ExperienceReplay(capacity=1e6)
logger = Logger()

online = PolicyNetwork(batch_size=32)
target = PolicyNetwork(batch_size=32)


def preprocess(a: np.array) -> np.array:
    # Map the board to a log2 scale, normalized by the maximum tile
    a = np.where(a <= 0, 1, a)
    a = np.log2(a) / np.log2(MAX_TILE)
    return a


if __name__ == "__main__":
    best_score = 0
def main(_):
    # Reproducibility
    tf.reset_default_graph()
    np.random.seed(cfg.random_seed)
    tf.set_random_seed(cfg.random_seed)

    # Logging
    summary_writer = tf.summary.FileWriter(cfg.log_dir)
    if not cfg.evaluate and not tf.gfile.Exists(cfg.save_dir):
        tf.gfile.MakeDirs(cfg.save_dir)
    else:
        assert tf.gfile.Exists(cfg.save_dir)  # TODO handle this

    episode_results_path = os.path.join(cfg.log_dir, "episodeResults.csv")
    episode_results = tf.gfile.GFile(episode_results_path, "w")
    episode_results.write("model_freq={},save_dir={}".format(cfg.model_freq, cfg.save_dir))
    episode_results.write("episode,reward,steps\n")
    episode_results.flush()

    # Setup ALE and DQN graph
    obs_shape = (84, 84, 1)
    input_height, input_width, _ = obs_shape
    dqn = DQN(input_height, input_width, cfg.num_actions)

    # Global step
    global_step = tf.train.get_or_create_global_step()
    increment_step = tf.assign_add(global_step, 1)

    # Save all variables
    vars_to_save = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="agent/q")
    vars_to_save.append(global_step)
    saver = tf.train.Saver(var_list=vars_to_save)

    # Handle loading specific variables
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)
    restore_or_initialize_weights(sess, dqn, saver)
    sess.run(dqn.copy_to_target)

    if cfg.evaluate:
        # if in evaluation mode, saver is no longer needed
        saver = None

    # ##### Restoring AEs ########
    if not cfg.evaluate:
        vaes = create_generative_models(sess)
        image_summaries = []
        image_summaries_ph = tf.placeholder(tf.float32, shape=(4, 84, 84, 4),
                                            name="image_summaries_placeholder")
        for i in range(4):
            for j in range(4):
                image_summaries.append(
                    tf.summary.image(
                        "VAE_OUT_{}_{}".format(i, j),
                        tf.reshape(image_summaries_ph[i, :, :, j], (1, 84, 84, 1))))
    # ############################

    if not cfg.evaluate:
        summary_writer.add_graph(tf.get_default_graph())
        summary_writer.add_graph(vaes[0].graph)
        summary_writer.add_graph(vaes[1].graph)
        summary_writer.add_graph(vaes[2].graph)
        summary_writer.flush()

    # Initialize ALE
    postprocess_frame = lambda frame: sess.run(dqn.process_frame,
                                               feed_dict={dqn.image: frame})
    env = AtariEnvironment(obs_shape, postprocess_frame)

    # Replay buffer
    if not cfg.evaluate:
        replay_buffer = ExperienceReplay(cfg.replay_buffer_size, obs_shape)

    # Perform random policy to get some training data
    with tqdm(total=cfg.seed_frames, disable=cfg.disable_progress or cfg.evaluate) as pbar:
        seed_steps = 0
        while seed_steps * cfg.frame_skip < cfg.seed_frames and not cfg.evaluate:
            action = np.random.randint(cfg.num_actions)
            reward, next_state, terminal = env.act(action)
            seed_steps += 1

            replay_buffer.append(next_state[:, :, -1, np.newaxis], action, reward, terminal)

            if terminal:
                pbar.update(env.episode_frames)
                env.reset(inc_episode_count=False)

    if cfg.evaluate:
        assert cfg.max_episode_count > 0
    else:
        assert len(replay_buffer) >= cfg.seed_frames // cfg.frame_skip

    # Main training loop
    steps = tf.train.global_step(sess, global_step)
    env.reset(inc_episode_count=False)
    terminal = False

    total = cfg.max_episode_count if cfg.evaluate else cfg.num_frames
    with tqdm(total=total, disable=cfg.disable_progress) as pbar:
        # Loop while we haven't observed our max frame number
        # If we are at our max frame number we will finish the current episode
        while (not (
                # We must be evaluating or observed the last frame
                # As well as be terminal
                # As well as seen the maximum episode number
                (steps * cfg.frame_skip > cfg.num_frames or cfg.evaluate)
                and terminal
                and env.episode_count >= cfg.max_episode_count)):
            # Epsilon greedy policy with epsilon annealing
            if not cfg.evaluate and steps * cfg.frame_skip < cfg.eps_anneal_over:
                # Only compute epsilon step while we're still annealing epsilon
                epsilon = cfg.eps_initial - steps * (
                    (cfg.eps_initial - cfg.eps_final) / cfg.eps_anneal_over)
            else:
                epsilon = cfg.eps_final

            # Epsilon greedy policy
            if np.random.uniform() < epsilon:
                action = np.random.randint(0, cfg.num_actions)
            else:
                action = sess.run(dqn.action, feed_dict={dqn.S: [env.state]})

            # Perform environment step
            steps = sess.run(increment_step)
            reward, next_state, terminal = env.act(action)

            if not cfg.evaluate:
                replay_buffer.append(next_state[:, :, -1, np.newaxis], action, reward, terminal)

                # Sample and do gradient updates
                if steps % cfg.learning_freq == 0:
                    placeholders = [
                        dqn.S,
                        dqn.actions,
                        dqn.rewards,
                        dqn.S_p,
                        dqn.terminals,
                    ]
                    batch = replay_buffer.sample(cfg.batch_size)
                    train_op = [dqn.train]
                    if steps % (cfg.learning_freq * cfg.model_freq) == 0:
                        experience_batch = batch
                        batch = imagined_batch(vaes, batch[1])
                        if steps / (cfg.learning_freq * cfg.model_freq) < 10:
                            placeholders.append(image_summaries_ph)
                            batch = list(batch)
                            batch.append(batch[0][np.random.randint(0, 32, size=4), :, :, :])
                            train_op.extend(image_summaries)
                    if steps % cfg.log_summary_every:
                        train_op.append(dqn.summary)
                    result = sess.run(
                        train_op,
                        feed_dict=dict(zip(placeholders, batch)),
                    )
                    if len(result) > 1:
                        for i in range(1, len(result)):
                            summary_writer.add_summary(result[i], global_step=steps)

                if steps % cfg.target_update_every == 0:
                    sess.run([dqn.copy_to_target])

                if steps % cfg.model_chkpt_every == 0:
                    saver.save(sess, "%s/model_epoch_%04d" % (cfg.save_dir, steps))

            if terminal:
                episode_results.write("%d,%d,%d\n" % (env.episode_count,
                                                      env.episode_reward,
                                                      env.episode_frames))
                episode_results.flush()
                # Log episode summaries to Tensorboard
                add_simple_summary(summary_writer, "episode/reward",
                                   env.episode_reward, env.episode_count)
                add_simple_summary(summary_writer, "episode/frames",
                                   env.episode_frames, env.episode_count)

                pbar.update(env.episode_frames if not cfg.evaluate else 1)
                env.reset()

    episode_results.close()
    tf.logging.info("Finished %d %s" % (
        cfg.max_episode_count if cfg.evaluate else cfg.num_frames,
        "episodes" if cfg.evaluate else "frames",
    ))
    downsampling_pix_values=None,
    atari_frameskip=args.atari_frameskip)

eval_fn = get_evaluate_fn(env_eval=env_eval,
                          preproc_obs_fn=preproc_obs_fn,
                          policy_NN=call_model,
                          args=args)

process = psutil.Process()
memory_usage_fn = lambda: process.memory_info().rss
stats = Stats(use_tensorboard=args.use_tensorboard, log_path=log_path)

experience_keys = ["observations", "target_policy"]
if args.compute_value:
    experience_keys.append("returns")
experience_replay = ExperienceReplay(keys=experience_keys,
                                     capacity=args.replay_capacity)

run_episode_fn = get_episode_fn(
    actor=high_level_actor if args.hierarchical else low_level_actor,
    planner=high_level_planner if args.hierarchical else low_level_planner,
    train_fn=train_fn,
    dataset=experience_replay,
    add_returns=args.compute_value,
    stats=stats,
    memory_usage_fn=memory_usage_fn,
    preproc_obs_fn=preproc_obs_fn,
    eval_fn=eval_fn,
    n_actions=env.action_space.n,
    value_scalars_to_distrs=value_scalars_to_distrs,
    value_logits_to_scalars=value_logits_to_scalars,
    args=args)
from training_testing import test

# Parameters
epsilon = 0.1      # exploration
max_memory = 500   # Maximum number of experiences we are storing
hidden_size = 100  # Size of the hidden layers
batch_size = 1     # Number of experiences we use for training per batch
epoch = 50


def baseline_model(grid_size, num_actions, hidden_size):
    # Setting up the model with Keras
    model = Sequential()
    model.add(Dense(hidden_size, input_shape=(grid_size**2,), activation='relu'))
    model.add(Dense(hidden_size, activation='relu'))
    model.add(Dense(num_actions))
    model.compile(SGD(lr=.1), "mse")
    return model


# Define environment/game
env = Catch()

# Initialize the experience replay object
exp_replay = ExperienceReplay(max_memory=max_memory)

model = baseline_model(grid_size, num_actions, hidden_size)
train(env, model, exp_replay, epoch, epsilon, num_actions, batch_size)
test(model)
from qnet import QNetAgent
from torch.utils.tensorboard import SummaryWriter

# if gpu is to be used
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
Tensor = torch.Tensor
LongTensor = torch.LongTensor

random_seed = 42
torch.manual_seed(random_seed)
random.seed(random_seed)

writer = SummaryWriter()

actionSpace = ActionSpace()
memory = ExperienceReplay(config.replay_mem_size)
qnet_agent = QNetAgent()

steps_total = []
frames_total = 0
solved_after = 0
solved = False

start_time = time.time()

# Main loop
step = 0
total_reward = 0
done = False
gamestate = console.step()
def __init__(self,
             env,
             obs_size=(115,),
             num_frame_stack=1,
             batch_size=32,
             mdp_gamma=0.95,
             initial_epsilon=1.0,
             min_epsilon=0.1,
             epsilon_decay_steps=int(1e6),
             replay_capacity=int(1e5),
             min_replay_size=int(1e3),
             train_freq=4,
             network_update_freq=5000,
             regularization=1e-6,
             optimizer_params=None,
             render=False):
    """
    Initialization function

    param env: object. a gym-like environment which our RL agent interacts with
    param obs_size: list. the shape of the observation, i.e. (115,) for a vector observation or (32, 32) for an image observation
    param num_frame_stack: int. number of stacked frames for the network input
    param batch_size: int. batch size
    param mdp_gamma: float. MDP discount factor
    param initial_epsilon: float. epsilon parameter of the epsilon-greedy policy
    param min_epsilon: float. minimum epsilon parameter of the epsilon-greedy policy
    param epsilon_decay_steps: int. how many steps to decay epsilon over
    param replay_capacity: int. replay buffer size
    param min_replay_size: int. minimum replay buffer size
    param train_freq: int. training frequency
    param network_update_freq: int. network update frequency
    param regularization: float. regularization coefficient
    param optimizer_params: dict. optimizer-specific parameters, i.e. learning rate, momentum
    param render: bool. is render mode on?
    """
    # experience replay buffer for training
    self.exp_buffer = ExperienceReplay(num_frame_stack,
                                       capacity=replay_capacity,
                                       obs_size=obs_size)
    # experience replay buffer for playing/testing
    self.play_buffer = ExperienceReplay(num_frame_stack,
                                        capacity=num_frame_stack * 10,
                                        obs_size=obs_size)

    self.env = env
    self.obs_size = obs_size
    self.num_frame_stack = num_frame_stack
    self.batch_size = batch_size
    self.mdp_gamma = mdp_gamma
    self.initial_epsilon = initial_epsilon
    self.min_epsilon = min_epsilon
    self.epsilon_decay_steps = epsilon_decay_steps
    self.replay_capacity = replay_capacity
    self.min_replay_size = min_replay_size
    self.train_freq = train_freq
    self.network_update_freq = network_update_freq
    self.regularization = regularization
    self.render = render

    self.dim_actions = env.action_space.n
    self.dim_state = (num_frame_stack,) + self.obs_size

    if optimizer_params:
        self.optimizer_params = optimizer_params
    else:
        self.optimizer_params = dict(learning_rate=0.0001, epsilon=1e-7)

    self.is_training = True
    # epsilon used for playing;
    # if 0, we just use the Q-network's optimal action without any exploration
    self.playing_epsilon = 0.0
    self.session = None

    self.global_counter = 0
    self.episode_counter = 0
    self.loss_history = []
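The epsilon parameters above suggest a schedule that anneals from initial_epsilon toward min_epsilon over epsilon_decay_steps while training, and uses playing_epsilon otherwise. The helper below is a sketch of such a schedule; the method name and the linear form are assumptions, not necessarily the class's actual implementation.

def get_epsilon(self):
    # While testing/playing, use the fixed playing epsilon (0.0 = fully greedy).
    if not self.is_training:
        return self.playing_epsilon
    # Linear annealing from initial_epsilon to min_epsilon over epsilon_decay_steps.
    frac = min(1.0, self.global_counter / float(self.epsilon_decay_steps))
    return self.initial_epsilon + frac * (self.min_epsilon - self.initial_epsilon)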
from __future__ import division, print_function

import gym
import gym_gazebo
import numpy as np
import sys
import os

from ddq_model import Qnet
from experience_replay import ExperienceReplay
from utils import Config

argv = sys.argv[1:]
config = Config(argv)

env = gym.make('GazeboTurtlebotMazeColor-v0')
replay = ExperienceReplay(config.args.output_dir, config.args.replay_buffer_size)
qnet = Qnet(env.num_state, env.num_action)

if config.args.continue_from is not None:
    qnet.load(config.args.continue_from)
    replay.load(config.args.continue_from)
elif config.args.from_pretrain is not None:
    qnet.load(config.args.from_pretrain)

epsilon = config.args.start_epsilon
epsilon_decay = (config.args.start_epsilon - config.args.end_epsilon) / config.args.annealing_steps

while True:
    state = env.reset()
plan_step_fn = get_pi_iw_planning_step_fn(actor=actor,
                                          planner=planner,
                                          policy_fn=network_policy,
                                          tree_budget=tree_budget,
                                          discount_factor=discount_factor,
                                          temp=policy_temp)
learner = SupervisedPolicy(model,
                           optimizer,
                           regularization_factor=regularization_factor,
                           use_graph=True)

# Initialize experience replay: run complete episodes until we exceed both
# batch_size and replay_min_transitions
print("Initializing experience replay", flush=True)
train_stats = TrainStats()
experience_replay = ExperienceReplay(capacity=replay_capacity)
while len(experience_replay) < batch_size or len(experience_replay) < replay_min_transitions:
    episode_rewards = run_episode(plan_step_fn=plan_step_fn,
                                  learner=None,  # no learning yet, just fill the buffer
                                  dataset=experience_replay,
                                  cache_subtree=cache_subtree,
                                  add_returns=(args.algorithm == "AlphaZero"),
                                  preproc_obs_fn=preproc_obs_fn,
                                  render=args.render)
    train_stats.report(episode_rewards, actor.nodes_generated)

# Interleave planning and learning steps
print("\nInterleaving planning and learning steps.", flush=True)
while actor.nodes_generated < max_simulator_steps:
def end_to_end_training(epochs: int,
                        model_cls: BaseConditionalGenerationOracle,
                        optimizer_cls: BaseOptimizer,
                        optimized_function_cls: BaseConditionalGenerationOracle,
                        logger: BaseLogger,
                        model_config: dict,
                        optimizer_config: dict,
                        n_samples_per_dim: int,
                        step_data_gen: float,
                        n_samples: int,
                        current_psi: Union[List[float], torch.tensor],
                        reuse_optimizer: bool = False,
                        reuse_model: bool = False,
                        shift_model: bool = False,
                        finetune_model: bool = False,
                        use_experience_replay: bool = True,
                        add_box_constraints: bool = False,
                        experiment=None,
                        scale_psi=False):
    """
    :param epochs: int
        number of local training steps to perform
    :param model_cls: BaseConditionalGenerationOracle
        model that is able to generate samples and calculate the loss function
    :param optimizer_cls: BaseOptimizer
    :param logger: BaseLogger
    :param model_config: dict
    :param optimizer_config: dict
    :param n_samples_per_dim: int
    :param step_data_gen: float
    :param n_samples: int
    :param current_psi:
    :param reuse_model:
    :param reuse_optimizer:
    :param finetune_model:
    :param shift_model:
    :return:
    """
    gan_logger = GANLogger(experiment)
    # gan_logger = RegressionLogger(experiment)
    # gan_logger = None

    y_sampler = optimized_function_cls(device=device, psi_init=current_psi)
    model = model_cls(y_model=y_sampler, **model_config, logger=gan_logger).to(device)
    optimizer = optimizer_cls(oracle=model, x=current_psi, **optimizer_config)
    print(model_config)

    exp_replay = ExperienceReplay(psi_dim=model_config['psi_dim'],
                                  y_dim=model_config['y_dim'],
                                  x_dim=model_config['x_dim'],
                                  device=device)
    weights = None
    logger.log_performance(y_sampler=y_sampler, current_psi=current_psi, n_samples=n_samples)

    for epoch in range(epochs):
        # generate a new data sample
        x, condition = y_sampler.generate_local_data_lhs(
            n_samples_per_dim=n_samples_per_dim,
            step=step_data_gen,
            current_psi=current_psi,
            n_samples=n_samples)
        if x is None and condition is None:
            print("Empty training set, continue")
            continue

        x_exp_replay, condition_exp_replay = exp_replay.extract(psi=current_psi, step=step_data_gen)
        exp_replay.add(y=x, condition=condition)
        x = torch.cat([x, x_exp_replay], dim=0)
        condition = torch.cat([condition, condition_exp_replay], dim=0)
        used_samples = n_samples

        # breaking things
        if model_config.get("predict_risk", False):
            condition = condition[::n_samples_per_dim, :current_psi.shape[0]]
            x = y_sampler.func(condition, num_repetitions=n_samples_per_dim).reshape(-1, x.shape[1])
            print(x.shape, condition.shape)

        # Scale the train set
        if scale_psi:
            scale_factor = 10
            feature_max = condition[:, :model_config['psi_dim']].max(axis=0)[0]
            y_sampler.scale_factor = scale_factor
            y_sampler.feature_max = feature_max
            y_sampler.scale_psi = True
            print("MAX FEATURES", feature_max)
            condition[:, :model_config['psi_dim']] /= feature_max * scale_factor
            current_psi = current_psi / feature_max * scale_factor
            print(feature_max.shape, current_psi.shape)
            print("MAX PSI", current_psi)

        model.train()
        if reuse_model:
            if shift_model:
                if isinstance(model, ShiftedOracle):
                    model.set_shift(current_psi.clone().detach())
                else:
                    model = ShiftedOracle(oracle=model, shift=current_psi.clone().detach())
                model.fit(x, condition=condition, weights=weights)
            else:
                model.fit(x, condition=condition, weights=weights)
        else:
            # if not reusing the model, re-initialize and re-fit it at each epoch
            model = model_cls(y_model=y_sampler, **model_config, logger=gan_logger).to(device)
            print("y_shape: {}, cond: {}".format(x.shape, condition.shape))
            model.fit(x, condition=condition, weights=weights)
        model.eval()

        if reuse_optimizer:
            optimizer.update(oracle=model, x=current_psi)
        else:
            # find new psi
            optimizer = optimizer_cls(oracle=model, x=current_psi, **optimizer_config)

        if add_box_constraints:
            box_barriers = make_box_barriers(current_psi, step_data_gen)
            add_barriers_to_oracle(oracle=model, barriers=box_barriers)

        previous_psi = current_psi.clone()
        current_psi, status, history = optimizer.optimize()
        if scale_psi:
            current_psi, status, history = optimizer.optimize()
            current_psi = current_psi / scale_factor * feature_max
            y_sampler.scale_psi = False
            print("NEW_PSI: ", current_psi)

        try:
            # logging optimization, i.e. statistics of psi
            logger.log_grads(model, y_sampler, current_psi, n_samples_per_dim, log_grad_diff=False)
            logger.log_optimizer(optimizer)
            logger.log_performance(y_sampler=y_sampler,
                                   current_psi=current_psi,
                                   n_samples=n_samples)
            experiment.log_metric("used_samples_per_step", used_samples)
            experiment.log_metric("sample_size", len(x))
        except Exception as e:
            print(e)
            print(traceback.format_exc())
            # raise

        torch.cuda.empty_cache()

    logger.func_saver.join()
    return