def __init__(self, state_dim, action_dim, buffer_size=int(1e5), batch_size=128,
             target_update_frequency=1000, gamma=0.99, learning_rate=1e-3,
             weight_decay=1e-2, hidden_layers=(48, 16)):
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.replay_buffer = ReplayBuffer(buffer_size)
    self.policy_dqn = DQN(state_dim, action_dim, hidden_layers=hidden_layers)
    self.target_dqn = DQN(state_dim, action_dim, hidden_layers=hidden_layers)
    self.optimizer = torch.optim.Adam(self.policy_dqn.parameters(),
                                      lr=learning_rate, weight_decay=weight_decay)
    self.gamma = gamma
    self.batch_size = batch_size
    self.target_update_frequency = target_update_frequency
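# Hypothetical companion method, not part of the snippet above: the constructor stores
# `target_update_frequency` but does not show the sync itself. A minimal hard-update
# sketch in PyTorch could look like this; the method name `sync_target` is illustrative only.
def sync_target(self):
    # Copy the online network's weights into the target network wholesale.
    self.target_dqn.load_state_dict(self.policy_dqn.state_dict())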
def __init__(self, params, state_size, action_size, seed, hidden_layers):
    """
    Params
    ======
        params (dict): configuration of hyperparameters
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        hidden_layers: sizes of the hidden layers of the Q-networks
    """
    super(Agent, self).__init__()
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.batch_size = int(params["batch_size"])
    self.beta_frames = int(params["beta_frames"])
    self.buffer_size = int(params["buffer_size"])
    self.gamma = params["gamma"]
    self.lr = params["lr"]
    self.p_alpha = params["p_alpha"]
    self.p_beta = params["p_beta"]
    self.tau = params["tau"]
    self.update_every = params["update_every"]

    # Q-Network
    self.qnetwork_local = DQN(state_size, action_size, seed, hidden_layers=hidden_layers).to(device)
    self.qnetwork_target = DQN(state_size, action_size, seed, hidden_layers=hidden_layers).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

    # Replay memory
    self.memory = PriorityReplayBuffer(action_size, self.buffer_size, self.batch_size,
                                       seed, self.p_alpha, self.p_beta, self.beta_frames)

    # Initialize time step (for updating every self.update_every steps)
    self.t_step = 0
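# Hypothetical sketch, not part of the snippet above: the `tau` hyperparameter usually
# drives a Polyak/soft update of the target network. The method name and argument order
# are illustrative assumptions.
def soft_update(self, local_model, target_model, tau):
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)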
def __init__(self, env, gamma, lr, n_actions, input_dim, no_rnn_hidden, no_rnn_layer,
             ann_layer, mem_size, batch_size, epsilon, eps_min=0.01, eps_dec=5e-6,
             replace=500, path='tmp'):
    self.env = env
    self.gamma = gamma
    self.epsilon = epsilon
    self.lr = lr
    self.n_actions = n_actions
    self.input_dim = input_dim
    self.batch_size = batch_size
    self.eps_min = eps_min
    self.eps_dec = eps_dec
    self.replace_target_cnt = replace
    self.path = path
    self.action_space = [i for i in range(self.n_actions)]
    self.learn_step_counter = 0
    self.memory = ReplayMemory(mem_size)
    self.q_eval = DQN(input_dim, no_rnn_hidden, no_rnn_layer, ann_layer, n_actions, self.batch_size)
    self.q_next = DQN(input_dim, no_rnn_hidden, no_rnn_layer, ann_layer, n_actions, self.batch_size)
    self.optimizer = torch.optim.Adam(self.q_eval.parameters(), lr=self.lr)
    self.loss = nn.SmoothL1Loss()
    self.last_loss = 0
def __init__(self, *args, **kwargs):
    super(DQNAgent, self).__init__(*args, **kwargs)
    self.q_eval = DQN(self.lr, self.n_actions, input_dims=self.input_dims,
                      name=self.env_name + "_" + self.algorithm + "_q_eval",
                      checkpoint_dir=self.checkpoint_dir)
    self.q_policy = DQN(self.lr, self.n_actions, input_dims=self.input_dims,
                        name=self.env_name + "_" + self.algorithm + "_q_policy",
                        checkpoint_dir=self.checkpoint_dir)
def main():
    replay_buffer = deque(maxlen=REPLAY_MEMORY)
    last_100_game_reward = deque(maxlen=100)

    with tf.Session() as sess:
        mainDQN = DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name="main")
        targetDQN = DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name="target")
        sess.run(tf.global_variables_initializer())

        # TensorBoard logging for the per-episode step count
        spend_time = tf.placeholder(tf.float32)
        rr = tf.summary.scalar('reward', spend_time)
        merged = tf.summary.merge_all()
        writer = tf.summary.FileWriter('./board/dqn_not_per', sess.graph)

        # initial copy q_net -> target_net
        copy_ops = get_copy_var_ops(dest_scope_name="target", src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(MAX_EPISODES):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand() < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done:
                    # Penalty
                    reward = -1

                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))

                if done:
                    if len(replay_buffer) > BATCH_SIZE:
                        minibatch = random.sample(replay_buffer, BATCH_SIZE)
                        loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                        sess.run(copy_ops)
                        # if step_count % TARGET_UPDATE_FREQUENCY == 0:
                        #     sess.run(copy_ops)
                    summary = sess.run(merged, feed_dict={spend_time: step_count})
                    writer.add_summary(summary, episode)

                state = next_state
                step_count += 1

            print("Episode: {} steps: {}".format(episode, step_count))
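# Hypothetical sketch, not part of the snippet above: `get_copy_var_ops` is assumed to
# build TF1 assign ops that copy the trainable variables of the "main" scope into the
# "target" scope. The real helper in the source project may differ.
def get_copy_var_ops(dest_scope_name, src_scope_name):
    op_holder = []
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
    for src_var, dest_var in zip(src_vars, dest_vars):
        # Assign the online network's current weights to the matching target variable.
        op_holder.append(dest_var.assign(src_var.value()))
    return op_holder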
def __init__(self, state_size, action_size):
    self.target = DQN(state_size, action_size).to(device)
    self.current = DQN(state_size, action_size).to(device)
    self.loss_fn = torch.nn.MSELoss(reduction='sum')
    self.memory = deque(maxlen=4000)
    self.batch_size = 128
    learning_rate = 0.0025
    self.optimizer = torch.optim.Adam(self.current.parameters(), lr=learning_rate)
def __init__(self):
    self.game_network = DQN(9, 6)
    self.wenz_network = DQN(36, 8)
    self.solo_network = DQN(37, 8)
    self.solo_playing_network = DQN(37, 8)
    self.wenz_playing_network = DQN(36, 8)
    self.match = None
    self.game_memory = {}
    self.card_memory = {}
    self.explore = True
    self.N = 100
def __init__(self):
    self.dqn_local = DQN()
    self.batch_size = self.dqn_local.BATCH_SIZE
    print(self.dqn_local.dqn.summary())
    self.dqn_target = DQN()
    self.replay_memory = ReplayMemory(self.dqn_local)
    self.temp = 0
def __init__(self):
    # self.config = config
    self.gamma = 0.4
    # self.logger = logging.getLogger("DQNAgent")
    self.screen_width = 600

    # define models (policy and target)
    self.policy_model = DQN()
    self.target_model = DQN()

    # define memory
    self.memory = ReplayMemory()

    # define loss
    self.loss = HuberLoss()

    # define optimizer
    self.optim = torch.optim.Adam(self.policy_model.parameters(), lr=0.01)

    # define environment
    self.env = PyCar()  # TODO
    # self.cartpole = PyCar(self.screen_width)

    # initialize counter
    self.current_episode = 0
    self.current_iteration = 0
    self.episode_durations = []
    self.batch_size = 1700

    # set cuda flag
    self.is_cuda = torch.cuda.is_available()
    self.cuda = self.is_cuda
    if self.cuda:
        # print_cuda_statistics()
        self.device = torch.device("cuda")
    else:
        self.device = torch.device("cpu")

    self.policy_model = self.policy_model.to(self.device)
    self.target_model = self.target_model.to(self.device)
    self.loss = self.loss.to(self.device)

    # Initialize target model with policy model state dict
    self.target_model.load_state_dict(self.policy_model.state_dict())
    self.target_model.eval()

    self.savepath = "/home/sk002/Desktop/model/"
def main():
    max_episode = 5000
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size, name='main')
        targetDQN = DQN(sess, input_size, output_size, name='target')
        tf.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name="target", src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(max_episode):
            e = 1 / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done and step_count != 200:
                    reward -= 100

                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    break

            print('Episode[{}] - steps : {}'.format(episode, step_count))

            if step_count > 10000:
                pass  # no extra handling for very long episodes

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print('loss : ', loss)
                # copy q_net -> target_net
                sess.run(copy_ops)

        bot_play(mainDQN)
def __init__(self, manager, agent_params):
    self.manager = manager
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.verbose = agent_params["verbose"]
    self.scaling_eps = agent_params["scaling_eps"]

    # Rainbow enhancements
    self.noisy_nets = agent_params["noisy_nets"]
    self.dueling = agent_params["dueling"]
    self.double_dqn = agent_params["double_dqn"]
    self.prioritized = agent_params["prioritized"]

    self.discount = agent_params["discount"]
    self.ep_start = agent_params["ep_start"]
    self.ep = self.ep_start
    self.ep_end = agent_params["ep_end"]
    self.ep_endt = agent_params["ep_endt"]
    self.eval_ep = agent_params["eval_ep"]

    # For running gradient correction
    self.correct_error_magnitude = agent_params["correct_error_magnitude"]
    self.max_error_magnitude = agent_params["max_error_magnitude"]
    self.min_error_divisor = agent_params["min_error_divisor"]
    self.error_mag_beta = agent_params["error_mag_beta"]
    self.error_mag_updates = 0.0
    self.error_mag_biased = 0.0
    self.error_mag = 0.0

    self.adam_lr = agent_params["adam_lr"]
    self.adam_eps = agent_params["adam_eps"]
    self.adam_beta1 = agent_params["adam_beta1"]
    self.adam_beta2 = agent_params["adam_beta2"]

    self.network = DQN(self.manager.gpu, self.manager.in_channels,
                       self.manager.n_actions, 1, self.noisy_nets, self.dueling, False)
    self.target_network = DQN(self.manager.gpu, self.manager.in_channels,
                              self.manager.n_actions, 1, self.noisy_nets, self.dueling, False)
    self.target_network.load_state_dict(self.network.state_dict())

    self.optimizer = optim.Adam(self.network.parameters(), lr=self.adam_lr,
                                betas=(self.adam_beta1, self.adam_beta2), eps=self.adam_eps)
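# Hypothetical sketch, not part of the snippet above: one plausible reading of the
# error-magnitude fields (`error_mag_beta`, `error_mag_biased`, `error_mag_updates`)
# is a bias-corrected exponential moving average of the TD-error magnitude, in the
# style of Adam's moment correction. Method and argument names are illustrative.
def update_error_magnitude(self, batch_error_magnitude):
    self.error_mag_updates += 1.0
    self.error_mag_biased = (self.error_mag_beta * self.error_mag_biased
                             + (1.0 - self.error_mag_beta) * batch_error_magnitude)
    # Divide out the startup bias so early estimates are not pulled toward zero.
    self.error_mag = self.error_mag_biased / (1.0 - self.error_mag_beta ** self.error_mag_updates)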
def _before_sim_loop(self):
    n_state = self._env.observation_space.shape[0]
    n_action = self._env.action_space.n
    self._algo = DQN(n_state, n_action, self._algo_params)
    self._algo.update_net()
    self._score = 0.0
    self._score_sum = 0.0
def train(lr, e_greedy, times=100):
    env = gym.make('CartPole-v0')
    env = env.unwrapped
    rlmodel = DQN(
        n_actions=env.action_space.n,
        n_features=env.observation_space.shape[0],
        learning_rate=lr,
        e_greedy=e_greedy,
        replace_loop=100,
        memory_size=2000,
        e_greedy_increment=0.001,
        show_info=False,
    )
    total_steps = 0
    history = []

    for i_episode in range(times):
        observation = env.reset()
        ep_r = 0
        cur_steps = 0
        while True:
            action = rlmodel.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            reward = get_reward(observation, env)
            rlmodel.store_transition(observation, action, reward, observation_)
            ep_r += reward
            if total_steps > 1000:
                rlmodel.learn()
            if done or cur_steps >= 10000:
                history.append(ep_r)
                break
            observation = observation_
            cur_steps += 1
            total_steps += 1

    env.close()
    return rlmodel, history
def main():
    env = get_player(args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP)
    file_path = "memory.npz"
    rpm = ReplayMemory(
        MEMORY_SIZE,
        IMAGE_SIZE,
        CONTEXT_LEN,
        load_file=True,  # load replay memory data from file
        file_path=file_path)
    act_dim = env.action_space.n

    model = AtariModel(act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE * gpu_num)
    agent = AtariAgent(algorithm, act_dim=act_dim, total_step=args.train_total_steps)

    if os.path.isfile('./model.ckpt'):
        logger.info("load model from file")
        agent.restore('./model.ckpt')

    if args.train:
        logger.info("train with memory data")
        run_train_step(agent, rpm)
        logger.info("finish training. Save the model.")
        agent.save('./model.ckpt')
    else:
        logger.info("collect experience")
        collect_exp(env, rpm, agent)
        rpm.save_memory()
        logger.info("finish collecting, save successfully")
def __init__(self, config, actor_idx, starting_port, tensorboard_logger):
    super().__init__(config, actor_idx, starting_port, tensorboard_logger)
    self.dqn = DQN(
        input_shape=(config.width, config.height, config.stacked_frames),
        num_actions=3,
        learning_rate=config.learning_rate,
    )
def __init__(self, poi_info, user_KG, params):
    self.poi_info = poi_info
    self.user_KG = user_KG
    self.visit_counter = 0
    self.ll = params.ll
    self.lc = params.lc
    self.lp = params.lp
    self.poi_cat_dict = poi_info.poi_cat_dict
    self.poi_loc_dict = poi_info.poi_loc_dict
    self.poi_dist_mat = poi_info.poi_dist_mat
    self.cat_sim_mat = poi_info.cat_sim_mat
    self.memory_capacity = params.memory_capacity
    self.environment = Environment(user_KG.s_u.shape[1],
                                   self.poi_info.env_nt_1,
                                   self.poi_info.env_nt_2)
    self.dqn = DQN(self.environment,
                   user_KG.s_u.shape[1] + user_KG.s_KG.x.shape[1],
                   user_KG.s_KG.num_POI,
                   params.memory_capacity,
                   params.lr,
                   params.epsilon,
                   params.batch_size,
                   params.gamma,
                   params.target_replace_iter,
                   mode=params.priority_mode)
    self.predict_POI_index = np.random.randint(user_KG.s_KG.num_POI)
    self.r = reward(params.ll, params.lc, params.lp,
                    self.predict_POI_index, 0,
                    poi_info.poi_cat_dict, poi_info.poi_loc_dict,
                    poi_info.poi_dist_mat, poi_info.cat_sim_mat)
def __init__(self, model_name):
    self.model_name = model_name
    self.action_names = ['A', 'D', 'M', 'L', 'R']
    self.num_actions = len(self.action_names)
    self.memory = deque()
    # self.model = Cnn(self.model_name, self.memory)
    # self.target_model = Cnn(self.model_name, [], target=True)
    self.model = DQN(model_name, self.memory)
    # self.state = np.zeros([1, VISION_F + VISION_B + 1, VISION_W * 2 + 1, 1])
    self.previous_states = np.zeros([1, VISION_F + VISION_B + 1, VISION_W * 2 + 1, 4])
    self.previous_actions = np.zeros([4])
    self.previous_actions.fill(2)
    self.q_values = np.zeros(5)
    self.action = 2
    self.count_states = self.model.get_count_states()
    self.delay_count = 0
    self.epsilon_linear = LinearControlSignal(start_value=EPSILON_GREEDY_START_PROB,
                                              end_value=EPSILON_GREEDY_END_PROB,
                                              repeat=False)
    self.advantage = 0
    self.value = 0
    self.score = 0
def main():
    # Set up the network for the first time
    features = 263
    h1 = 50
    h2 = 50
    dqn = DQN(features, h1, h2, "models/test_1")

    # Initialize the game
    initialize()
    game = setup_game()
    ai_player = game.current_player

    try:
        while True:
            if game.current_player == ai_player:
                action_choice = look_ahead(game, dqn)
                print("Action chosen was: ", action_choice)
                import pdb; pdb.set_trace()
                perform_action(action_choice, ai_player, game)
            else:
                actions = get_actions(game.current_player)
                index = random.randint(0, len(actions) - 1)
                perform_action(actions[index], game.current_player, game)
    except GetStateError:
        print("Error with get_state function")
    except GameOver:
        print("Game ended normally")
def __init__(self):
    self.experience_replay = ExperienceReplay('BreakoutDeterministic-v0',
                                              FLAGS.replay_buffer_size,
                                              84, 84, 4,
                                              self.policy,
                                              FLAGS.decay_to_epoch)
    config = DQNConfig()
    config.learning_rate = FLAGS.learning_rate
    config.gamma = FLAGS.gamma
    config.decay = FLAGS.decay
    config.momentum = FLAGS.momentum
    config.eps = FLAGS.eps
    config.input_width = FLAGS.image_width
    config.input_height = FLAGS.image_height
    config.skip = FLAGS.skip
    self.dqn = DQN(config, FLAGS.use_huber)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    logger.info('initializing variables...')
    self.sess.run(tf.global_variables_initializer())
    self.update_target()
    self.epoch = 0
    self.decay_epsilon()
def main(args):
    """
    Run a trained model for the cartpole problem

    :param args: (ArgumentParser) the input arguments
    """
    env = gym.make("CartPole-v0")
    model = DQN(
        env=env,
        policy_class=MlpPolicy,
        learning_rate=5e-4,
        buffer_size=50000,
        double_q=False,
        prioritized_replay=True,
        dueling=True,
        exploration_fraction=0.2,
        exploration_final_eps=0.02,
        model_path='cartpole_model'
    )
    model = model.load("cartpole_model")

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if not args.no_render:
                env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            episode_rew += rew
        print("Episode reward", episode_rew)

        # No render is only used for automatic testing
        if args.no_render:
            break
def replay():
    print('replay ... ')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()
            state, reward, terminal = game.step(action)
            total_reward += reward
            brain.remember(state, action, reward, terminal)
            time.sleep(0.3)

        print('game count: %d score: %d' % (episode + 1, total_reward))
def play(args):
    device = torch.device("cuda" if args.gpu else "cpu")
    env = Environment(draw=True,
                      fps=args.fps,
                      debug=args.debug,
                      dist_to_pipe=args.dist_to_pipe,
                      dist_between_pipes=args.dist_between_pipes,
                      obs_this_pipe=args.obs_this_pipe)

    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    network = DQN(observation_space, action_space)
    network.load_checkpoint(args.checkpoint)

    for _ in range(args.runs):
        state = env.reset()
        total_reward = 0.0
        while True:
            state_v = torch.tensor(np.array([state], copy=False)).to(device)
            q_vals_v = network(state_v.float())
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

            next_state, reward, done = env.step(action)
            total_reward += reward
            state = next_state
            if done:
                print("REWARD: ", total_reward)
                break
def main():
    value_function = Sequential(Linear(in_features=4, out_features=128), ReLU(),
                                Linear(in_features=128, out_features=128), ReLU(),
                                Linear(in_features=128, out_features=32), ReLU(),
                                Linear(in_features=32, out_features=2)).to(torch.device("cuda:0"))
    optimizer = RMSprop(params=value_function.parameters(), alpha=0.95, lr=0.0001)
    agent = DQN(value_function=value_function,
                optimizer=optimizer,
                lr_scheduler=LambdaLR(optimizer=optimizer, lr_lambda=lambda e: max(0.9999**e, 0.1)),
                gamma=0.95,
                epsilon_fn=lambda x: 0.9999**x,
                replay_buffer_size=10000,
                replay_batch_size=128,
                start_training_at=1024,
                unfreeze_freq=64,
                device=torch.device("cuda:0"),
                verbose=True)
    run_qlearning(agent, render=True)
def main(args):
    if args.gpu:
        ctx = get_extension_context('cudnn', device_id=str(args.device))
        nn.set_default_context(ctx)

    # atari environment
    env = AtariWrapper(gym.make(args.env), args.seed, episodic=True)
    eval_env = AtariWrapper(gym.make(args.env), 50, episodic=False)
    num_actions = env.action_space.n

    # action-value function built with neural network
    model = DQN(q_function, num_actions, args.batch_size, args.gamma, args.lr)
    if args.load is not None:
        nn.load_parameters(args.load)
    model.update_target()

    buffer = ReplayBuffer(args.buffer_size, args.batch_size)

    exploration = LinearlyDecayEpsilonGreedy(num_actions, args.epsilon, 0.1,
                                             args.schedule_duration)

    monitor = prepare_monitor(args.logdir)

    update_fn = update(model, buffer, args.target_update_interval)

    eval_fn = evaluate(eval_env, model, render=args.render)

    train(env, model, buffer, exploration, monitor, update_fn, eval_fn,
          args.final_step, args.update_start, args.update_interval,
          args.save_interval, args.evaluate_interval, ['loss'])
def replay():
    print('dqn_setting')
    sess = tf.Session()

    game = Sim(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, VIEW_WIDTH, VIEW_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    # start game
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.Reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()
            print('action_choice : ' + str(action))

            # get data
            state, reward, terminal = game.Update(action)
            total_reward += reward
            brain.remember(state, action, reward, terminal)

            # show the play
            time.sleep(10)

        print('Number of game: %d Score: %d' % (episode + 1, total_reward))
def main():
    USE_CUDA = torch.cuda.is_available()

    env = gym.make('CartPole-v0')
    dqn = DQN(env.observation_space.shape[0], env.action_space.n)
    if USE_CUDA:
        dqn = dqn.cuda()

    optimizer = optim.RMSprop(dqn.parameters(), lr=0.00025, momentum=0.95, alpha=0.95, eps=0.01)
    epsilon_schedule = get_epsilon_schedule(start=1.0, end=0.01, endt=1000, learn_start=50)
    replay_buffer = ReplayBuffer(capacity=1000)

    agent = DQNAgent(env, dqn, optimizer, epsilon_schedule, replay_buffer,
                     discount_factor=0.99,
                     target_update_rate=10,
                     batch_size=32,
                     learn_start=50)
    agent.train(5000)

    total_reward = agent.play(render=True)
    agent.env.close()
    print('Total Reward: ', total_reward)
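# Hypothetical sketch, not part of the snippet above: one way `get_epsilon_schedule`
# could behave, assuming a linear decay from `start` to `end` over `endt` steps that
# only begins after `learn_start` steps. The real helper may be implemented differently.
def get_epsilon_schedule(start, end, endt, learn_start):
    def epsilon(step):
        # Hold epsilon at its starting value until learning begins.
        if step < learn_start:
            return start
        # Linearly interpolate toward `end`, then stay there.
        fraction = min(1.0, (step - learn_start) / float(endt))
        return start + fraction * (end - start)
    return epsilon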
def __init__(self, state_size, action_size):
    self.model = DQN(state_size, action_size)
    self.loss_fn = torch.nn.MSELoss(reduction='sum')
    learning_rate = 0.0025
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
def trainModel(env, action_size):
    state = env.reset()
    observation_space = len(state)
    agent = DQN(observation_space, action_size)
    target_model_update_counter = 0
    agent.step = 0

    for _ in range(parameters.EPISODES):
        print("Episode number: " + str(_))
        state = env.reset()
        observation_size = len(state)
        state = np.reshape(state, [1, observation_size])
        done = False

        while not done and rclpy.ok():
            agent.step += 1
            target_model_update_counter += 1
            if target_model_update_counter % parameters.TARGET_MODEL_UPDATE_STEP == 0:
                agent.save_load_model_weights()
                target_model_update_counter = 0

            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            next_state = np.reshape(next_state, [1, observation_space])
            agent.save_to_memory(state, action, reward, next_state, done)
            state = next_state

            if not done:
                agent.experience_replay()

            sleep(parameters.LOOP_RATE)

    agent.model.save('random_crawl_model.h5')
def main():
    # initialize OpenAI Gym env and dqn agent
    env = gym.make(ENV_NAME)
    agent = DQN(env)

    for episode in range(EPISODE):
        # initialize task
        state = env.reset()

        # Train
        for step in range(STEP):
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            # Define reward for agent (note: computed here, but the raw env reward
            # is what gets passed to perceive below)
            reward_agent = -1 if done else 0.1
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
            if ave_reward >= 200:
                break
def train_dqn(env, args):
    agent = DQN(env, args)
    agent.train()

    total_episodes = args.episodes
    max_steps = 10

    for episode in range(total_episodes):
        print(episode, agent.epsilon, end='\r')
        state = env.reset()
        done = False

        for step in range(max_steps):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.push(state, action, reward, next_state, done)
            agent.learn(episode)
            state = next_state
            if done:
                break

        if episode % 5 == 0:
            max_steps += 10

    return agent