def train_model():
    # Initiates the env
    env = gym.make('Mario-Kart-Luigi-Raceway-v0')
    resolution = (120, 160)
    actions = [[-60, 0, 1, 0, 0],   # left
               [60, 0, 1, 0, 0],    # right
               [0, -80, 0, 1, 0],   # back
               [0, 0, 1, 0, 0]]     # go straight
    #          [0, 0, 0, 1, 0]]     # brake

    # Initiates Model
    model = DQNModel(resolution=resolution,
                     nb_frames=learn_param['nb_frames'],
                     actions=actions)
    # print("number of actions: ", len(doom.actions))    # 16

    if model_weights:
        model.load_weights(model_weights)

    agent = RLAgent(model, **learn_param)

    # Perform Reinforcement Learning on Scenario
    agent.train(env)
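train_model() (and the run_weights()/visualization snippets further down) read module-level learn_param / test_param dictionaries and a model_weights path that are not part of the excerpt. A purely hypothetical illustration of their shape; only the nb_frames key is actually read by the code shown, and any remaining entries would be passed through to RLAgent as keyword arguments:

# Hypothetical configuration; real keys and values depend on RLAgent's signature.
model_weights = None          # or a path to previously saved weights
learn_param = {
    'nb_frames': 4,           # number of stacked frames fed to DQNModel
}
test_param = {
    'nb_frames': 4,
}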
def __init__(self, in_channels, action_size, seed):
    """Initialize an Agent object."""
    self.in_channels = in_channels
    self.action_size = action_size
    # self.seed = random.seed(seed)

    # Q-Network
    self.qnetwork_local = DQNModel(in_channels, action_size)
    self.qnetwork_target = DQNModel(in_channels, action_size)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    self.loss_list = []
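The t_step counter above only pays off in a step() method that learns every UPDATE_EVERY calls. A minimal sketch of such a method, assuming the ReplayBuffer exposes add(), sample() and __len__() and that a learn() method and GAMMA constant exist elsewhere in the class/module (none of this is shown in the excerpt):

def step(self, state, action, reward, next_state, done):
    # Save the experience, then learn every UPDATE_EVERY time steps
    self.memory.add(state, action, reward, next_state, done)
    self.t_step = (self.t_step + 1) % UPDATE_EVERY
    if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
        experiences = self.memory.sample()
        self.learn(experiences, GAMMA)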
def __init__(self, epsilon=1.0):
    self.next_actionable = 0
    self.scout_locations = {}
    self.rewards = []

    weighted_actions = {
        self.no_op: 1,
        self.standby: 1,
        self.attack: 3,
        self.manage_supply: 5,
        self.adjust_refinery_assignment: 1,
        self.manage_refineries: 1,
        self.manage_barracks: 3,
        self.manage_barracks_tech_labs: 1,
        self.manage_barracks_reactors: 1,
        self.manage_factories: 1,
        self.manage_starports: 1,
        self.train_workers: 3,
        self.train_marines: 7,
        self.train_marauders: 4,
        self.train_hellions: 1,
        self.train_medivacs: 1,
        self.upgrade_cc: 1,
        self.expand: 4,
        self.scout: 1,
        self.calldown_mules: 2,
    }

    self.actions = []
    for action_fn, weight in weighted_actions.items():
        for _ in range(weight):
            self.actions.append(action_fn)

    self.curr_state = None
    self.num_actions = len(self.actions)
    self.dqn = DQNModel(self.actions, eps=epsilon)
    self.iteration = 0

    # <list> [UnitId] specifying military composition.
    self.military_distribution = [MARINE, MARAUDER, HELLION]

    self.tl_tags = []
    self.techlab_research_options = [
        RESEARCH_COMBATSHIELD,
        RESEARCH_CONCUSSIVESHELLS,
        BARRACKSTECHLABRESEARCH_STIMPACK,
    ]
def __init__(self, env, action_size, config):
    self.memory = RingBuffer(int(config.config_section_map()['memorysize']))
    self.gamma = float(config.config_section_map()['gamma'])          # discount rate
    self.epsilon = float(config.config_section_map()['epsilon'])      # exploration rate
    self.epsilon_min = float(config.config_section_map()['epsilonmin'])
    self.epsilon_decay = float(config.config_section_map()['epsilondecay'])
    self.learning_rate = float(config.config_section_map()['learningrate'])
    self.action_size = action_size
    self.env = env
    self.dqn_model = DQNModel(self.learning_rate, action_size)
def test_result():
    #############
    #   test    #
    #############
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    policy_model = DQNModel(4, 18)
    # policy_model.load_state_dict(torch.load('./data/dqn_Riverraid_qnetwork_target_state_dict.pt'))
    # policy_model.eval()

    env = atari_wrappers.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrappers.wrap_deepmind(env,
                                       clip_rewards=True,
                                       frame_stack=True,
                                       pytorch_img=True)

    policy_model.load_model(
        torch.load('./data/dqn_Riverraid_qnetwork_target_state_dict.pickle'))

    num_episodes = 5
    episode = 1
    score = 0
    ep_score = []
    done = False

    while episode < num_episodes:
        observation = env.reset()
        done = False
        while not done:
            # action = agent.act(state)
            with torch.no_grad():
                # Convert the stacked frames to a normalized [1, C, H, W] tensor
                t_observation = torch.from_numpy(np.array(observation)).float()
                t_observation /= 255
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                q_value = policy_model.forward(t_observation)
                action = q_value.argmax(1).item()

            env.render()
            time.sleep(0.0005)
            next_observation, reward, done, info = env.step(action)
            score += reward
            observation = next_observation

            if info['ale.lives'] == 0:
                episode += 1
                ep_score.append(score)
                score = 0

    print("Average Score : {}".format(int(np.mean(ep_score))))
    print(ep_score)
def __init__(self, portfolio_size, batch_size, max_experiences, min_experiences, is_eval=False):
    self.portfolio_size = portfolio_size
    self.action_size = 3  # sit, buy, sell
    self.input_shape = (self.portfolio_size, self.portfolio_size,)
    self.is_eval = is_eval

    # replay buffer hyperparameters
    self.expReplayBuffer = {'s': [], 'a': [], 'r': [], 's2': [], 'done': []}
    self.expReplayBufferSize = 0
    self.batch_size = batch_size

    # for replay buffer
    self.max_experiences = max_experiences
    self.min_experiences = min_experiences

    # training hyperparameters
    self.alpha = 0.5
    self.gamma = 0.95
    self.epsilon = 1.0
    self.epsilon_min = 0.01
    self.epsilon_decay = 0.05  # decay rate after every iteration

    # models
    self.hidden_units = [100, 50]
    self.train_model = DQNModel(self.input_shape, self.hidden_units,
                                self.action_size, self.portfolio_size).get_model()
    self.test_model = self.get_model()
def run_weights():
    env = gym.make('Mario-Kart-Luigi-Raceway-v0')
    resolution = (120, 160)
    actions = [[-60, 0, 1, 0, 0],   # left
               [60, 0, 1, 0, 0],    # right
               [0, -80, 0, 1, 0],   # back
               [0, 0, 1, 0, 0]]     # go straight
    #          [0, 0, 0, 1, 0]]     # brake

    # Load Model and Weights
    model = DQNModel(resolution=resolution,
                     nb_frames=test_param['nb_frames'],
                     actions=actions)
    model.load_weights(model_weights)

    agent = RLAgent(model, **test_param)
    agent.test(env)
# Initiates the env
env = gym.make('Mario-Kart-Luigi-Raceway-v0')
resolution = (120, 160)
actions = [[-60, 0, 1, 0, 0],   # left
           [60, 0, 1, 0, 0],    # right
           [0, -80, 0, 1, 0],   # back
           [0, 0, 1, 0, 0]]     # go straight
#          [0, 0, 0, 1, 0]]     # brake

# Initiates Model
model = DQNModel(resolution=resolution,
                 nb_frames=learn_param['nb_frames'],
                 actions=actions)
# print("number of actions: ", len(doom.actions))    # 16

if model_weights:
    model.load_weights(model_weights)
else:
    print("Please provide a model_weights file")

agent = RLAgent(model, **learn_param)

# give a step number randomly to catch a random screen shot
agent.visualize(env)
def main():
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("use_cuda: ", use_cuda)
    print("Device: ", device)

    env = atari_wrapper.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrapper.wrap_deepmind(env,
                                      clip_rewards=False,
                                      frame_stack=True,
                                      pytorch_img=True)
    action_space = [a for a in range(env.action_space.n)]
    n_action = len(action_space)

    # DQN Model and optimizer:
    policy_model = DQNModel().to(device)
    target_model = DQNModel().to(device)
    target_model.load_state_dict(policy_model.state_dict())
    optimizer = torch.optim.RMSprop(policy_model.parameters(), lr=lr, alpha=alpha)

    # Initialize the Replay Buffer with random play
    replay_buffer = ReplayBuffer(rep_buf_size)
    while len(replay_buffer) < rep_buf_ini:
        observation = env.reset()
        done = False
        while not done:
            with torch.no_grad():
                t_observation = torch.from_numpy(observation).float().to(device)
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
            action = random.sample(range(len(action_space)), 1)[0]
            next_observation, reward, done, info = env.step(action_space[action])
            replay_buffer.push(observation, action, reward, next_observation, done)
            observation = next_observation
    print('Experience Replay buffer initialized')

    # Use log to record the performance
    logger = logging.getLogger('dqn_Riverraid')
    logger.setLevel(logging.INFO)
    logger_handler = logging.FileHandler('./dqn_Riverraid.log')
    logger.addHandler(logger_handler)

    # Training part
    env.reset()
    score = 0
    episode_score = []
    mean_episode_score = []
    episode_true = 0
    num_frames = 0
    episode = 0
    last_100episode_score = deque(maxlen=100)

    while episode < max_episodes:
        observation = env.reset()
        done = False
        # import time
        # start = time.time()
        while not done:
            # Epsilon-greedy action selection
            with torch.no_grad():
                t_observation = torch.from_numpy(observation).float().to(device) / 255
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                epsilon = epsilon_by_frame(num_frames)
                if random.random() > epsilon:
                    q_value = policy_model(t_observation)
                    action = q_value.argmax(1).data.cpu().numpy().astype(int)[0]
                else:
                    action = random.sample(range(len(action_space)), 1)[0]

            next_observation, reward, done, info = env.step(action_space[action])
            num_frames += 1
            score += reward
            replay_buffer.push(observation, action, reward, next_observation, done)
            observation = next_observation

            # Update policy
            if len(replay_buffer) > batch_size and num_frames % skip_frame == 0:
                observations, actions, rewards, next_observations, dones = \
                    replay_buffer.sample(batch_size)
                observations = torch.from_numpy(
                    np.array(observations) / 255).float().to(device)
                actions = torch.from_numpy(
                    np.array(actions).astype(int)).float().to(device)
                actions = actions.view(actions.shape[0], 1)
                rewards = torch.from_numpy(np.array(rewards)).float().to(device)
                rewards = rewards.view(rewards.shape[0], 1)
                next_observations = torch.from_numpy(
                    np.array(next_observations) / 255).float().to(device)
                dones = torch.from_numpy(
                    np.array(dones).astype(int)).float().to(device)
                dones = dones.view(dones.shape[0], 1)

                q_values = policy_model(observations)
                next_q_values = target_model(next_observations)
                q_value = q_values.gather(1, actions.long())
                next_q_value = next_q_values.max(1)[0].unsqueeze(1)
                expected_q_value = rewards + gamma * next_q_value * (1 - dones)

                loss = huber_loss(q_value, expected_q_value)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Soft update of the target network
                for target_param, policy_param in zip(target_model.parameters(),
                                                      policy_model.parameters()):
                    target_param.data.copy_(TAU * policy_param.data +
                                            (1 - TAU) * target_param.data)

        episode += 1
        # episode_score.append(score)
        # end = time.time()
        # print("Running time ( %i episode): %.3f Seconds " % (episode, end - start))

        if info['ale.lives'] == 0:
            # episode_score.append(score)
            mean_score = score
            episode_true += 1
            score = 0
            # if episode % 20 == 0:
            #     mean_score = np.mean(episode_score)
            mean_episode_score.append(mean_score)
            last_100episode_score.append(mean_score)
            # episode_score = []
            logger.info('Frame: ' + str(num_frames) + ' / Episode: ' +
                        str(episode_true) + ' / Average Score : ' +
                        str(int(mean_score)) + ' / epsilon: ' + str(float(epsilon)))
            # plot_score(mean_episode_score, episode_true)
            pickle.dump(mean_episode_score,
                        open('./dqn_Riverraid_mean_scores.pickle', 'wb'))

            if episode_true % 50 == 1:
                logger.info('Frame: ' + str(num_frames) + ' / Episode: ' +
                            str(episode_true) + ' / Average Score : ' +
                            str(int(mean_score)) + ' / epsilon: ' +
                            str(float(epsilon)) + ' / last_100episode_score: ' +
                            str(float(np.mean(last_100episode_score))))

        if episode % 50 == 0:
            torch.save(target_model.state_dict(),
                       './dqn_spaceinvaders_target_model_state_dict.pt')
            torch.save(policy_model.state_dict(),
                       './dqn_spaceinvaders_model_state_dict.pt')
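The training loop above relies on an epsilon_by_frame schedule and a huber_loss function that are not part of the excerpt. A minimal sketch of both, assuming an exponential epsilon decay and PyTorch's built-in smooth L1 loss; the decay constants are illustrative, not the original values:

import math
import torch.nn.functional as F

# Illustrative hyperparameters, not taken from the original script
epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.02, 100000

def epsilon_by_frame(frame_idx):
    # Exponentially anneal epsilon from epsilon_start towards epsilon_final
    return epsilon_final + (epsilon_start - epsilon_final) * \
        math.exp(-frame_idx / epsilon_decay)

def huber_loss(q_value, expected_q_value):
    # Huber loss between predicted and target Q-values (smooth L1 in PyTorch)
    return F.smooth_l1_loss(q_value, expected_q_value)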
STATE_SHAPE = [8]
NUM_ACTIONS = 3
# A higher learning rate can be used for simple envs
LEARNING_RATE = 1e-2
fake_states = np.random.random([3] + STATE_SHAPE)
fake_target_states = np.random.random([3] + STATE_SHAPE)
fake_rewards = np.array([100, 100, 100])
fake_dones = np.array([1, 1, 1])

print('Testing action optimization process')
for i_action in range(NUM_ACTIONS):
    fake_actions = np.array(3 * [i_action])
    tf.reset_default_graph()
    model = DQNModel(STATE_SHAPE, NUM_ACTIONS)

    print('Optimizing for action', i_action)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        old_preds = model.predict(sess, fake_states)
        print('Old predictions:\n', old_preds)

        for _ in range(100):
            model.train(sess, LEARNING_RATE, fake_states, fake_target_states,
                        fake_actions, fake_rewards, fake_dones)

        new_preds = model.predict(sess, fake_states)
        print('New predictions:\n', new_preds)

print('Testing target update process')
tf.reset_default_graph()
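The excerpt stops right after announcing the target-update test. One possible continuation is sketched below; the update_target() and target_predict() method names are guesses for how this DQNModel might expose its target network, not methods confirmed by the source:

model = DQNModel(STATE_SHAPE, NUM_ACTIONS)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print('Online predictions:\n', model.predict(sess, fake_states))

    # Copy the online weights into the target network, then check the copy took effect
    model.update_target(sess)                              # hypothetical method name
    print('Target predictions after sync:\n',
          model.target_predict(sess, fake_states))         # hypothetical method name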
from learner import Learner
from model import DQNModel
import gym
import maze_env

env = gym.make('Maze-v0')
learner = Learner(env, model=DQNModel())
learner.run()
import gym
import numpy as np
from model import DQNModel
from policy import EpsGreedyPolicy
from memory import Memory
from agent import DQNAgent
from processor import AtariProcessor

if __name__ == '__main__':
    ENV_NAME = 'Riverraid-v4'
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    model = DQNModel(nb_actions=nb_actions).model
    policy = EpsGreedyPolicy(eps_min=0.1, eps_max=1, eps_test=0.05,
                             nb_steps=1000000)
    memory = Memory(max_len=1000000)
    processor = AtariProcessor()

    dqn = DQNAgent(env, model, policy, memory, processor,
                   gamma=0.99, batch_size=32,
                   target_model_update_steps=10000,
                   nb_episodes_warmup=500)
# Init environment
env = gym.make(args.env)
if "Street" not in args.env:
    env.unwrapped.set_difficulty(status["difficulty"], weighted=False)
env.shaped_reward = args.dense_reward
env.seed(args.seed)

# Get obs space and preprocess function
obs_space, preprocess_obss = utils.get_obss_preprocessor(
    args.env, env.observation_space, model_dir)

# Load model
try:
    policy_net = utils.load_model(model_dir)
    target_net = DQNModel(env.action_space, env=args.env)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    print("Model successfully loaded\n")
except OSError:
    policy_net = DQNModel(env.action_space, env=args.env)
    target_net = DQNModel(env.action_space, env=args.env)
    target_net.load_state_dict(policy_net.state_dict())
    print("Model successfully created\n")

if torch.cuda.is_available():
    policy_net.cuda()
    target_net.cuda()
target_net.eval()

print("CUDA available: {}\n".format(torch.cuda.is_available()))
second_tiger_handle: int
deer_handle, first_tiger_handle, second_tiger_handle = environment.get_handles()

environment.reset()
environment.add_walls(method="random", n=map_size * map_size * wall_density)
environment.add_agents(deer_handle, method="random", n=deers)
environment.add_agents(first_tiger_handle, method="random", n=tigers)
environment.add_agents(second_tiger_handle, method="random", n=tigers)

view_space: Tuple = environment.get_view_space(first_tiger_handle)
view_space = (view_space[-1],) + view_space[:2]
dqn_model: DQNModel = DQNModel(
    view_space,
    environment.get_feature_space(first_tiger_handle),
    environment.get_action_space(first_tiger_handle)[0])
dqn_model.load_state_dict(torch.load(model, map_location=map_location))
print(dqn_model)

reward_tiger_1: float = 0.0
reward_tiger_2: float = 0.0
survivors: int

while True:
    first_tiger_actions: ndarray = get_actions(environment, dqn_model,
                                               first_tiger_handle)
    second_tiger_actions: ndarray = get_actions(environment, dqn_model,
                                                second_tiger_handle)
    environment.set_action(first_tiger_handle, first_tiger_actions)
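The evaluation loop is cut off after the first set_action() call. A possible continuation of the loop body is sketched below, assuming environment is a standard MAgent GridWorld (step(), get_reward(), clear_dead() and get_num() belong to that API); the reward bookkeeping is illustrative:

    environment.set_action(second_tiger_handle, second_tiger_actions)

    done: bool = environment.step()

    # Accumulate the per-group rewards returned by the gridworld
    reward_tiger_1 += float(sum(environment.get_reward(first_tiger_handle)))
    reward_tiger_2 += float(sum(environment.get_reward(second_tiger_handle)))

    environment.clear_dead()

    if done:
        survivors = environment.get_num(deer_handle)
        break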
deer_handle: int
tiger_handle: int
deer_handle, tiger_handle = gridworld.get_handles()

def reset_environment():
    gridworld.reset()
    gridworld.add_walls(method="random", n=MAP_SIZE * MAP_SIZE * WALLS_DENSITY)
    gridworld.add_agents(deer_handle, method="random", n=COUNT_DEERS)
    gridworld.add_agents(tiger_handle, method="random", n=COUNT_TIGERS)

environment: MAgentEnv = MAgentEnv(gridworld, tiger_handle,
                                   reset_environment_funcion=reset_environment)

dqn_model: DQNModel = DQNModel(
    environment.single_observation_space.spaces[0].shape,
    environment.single_observation_space.spaces[1].shape,
    gridworld.get_action_space(tiger_handle)[0]).to(device)
target_net: TargetNet = ptan.agent.TargetNet(dqn_model)
print(dqn_model)

action_selector: EpsilonGreedyActionSelector = EpsilonGreedyActionSelector(
    epsilon=PARAMETERS.epsilon_start)
epsilon_tracker: EpsilonTracker = EpsilonTracker(action_selector, PARAMETERS)
pre_processor: MAgentPreprocessor = MAgentPreprocessor(device)
dqn_agent: ptan.agent.DQNAgent = ptan.agent.DQNAgent(
    dqn_model, action_selector, device, preprocessor=pre_processor)

experience_source: ptan.experience.ExperienceSourceFirstLast = \
    ptan.experience.ExperienceSourceFirstLast(
        environment, dqn_agent, PARAMETERS.gamma, vectorized=True)
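The snippet stops after creating the experience source. A possible continuation of the training setup, sketched around ptan's ExperienceReplayBuffer and TargetNet.sync(); the extra PARAMETERS fields (replay_size, replay_initial, batch_size, learning_rate, target_net_sync), the epsilon_tracker.frame() call and the calc_loss_dqn() helper are assumptions, not taken from the source:

replay_buffer = ptan.experience.ExperienceReplayBuffer(
    experience_source, buffer_size=PARAMETERS.replay_size)
optimizer = torch.optim.Adam(dqn_model.parameters(), lr=PARAMETERS.learning_rate)

step = 0
while True:
    step += 1
    replay_buffer.populate(1)        # play one step in the vectorized env
    epsilon_tracker.frame(step)      # anneal epsilon (assumed EpsilonTracker API)

    if len(replay_buffer) < PARAMETERS.replay_initial:
        continue

    optimizer.zero_grad()
    batch = replay_buffer.sample(PARAMETERS.batch_size)
    loss = calc_loss_dqn(batch, dqn_model, target_net.target_model,
                         gamma=PARAMETERS.gamma, device=device)  # hypothetical helper
    loss.backward()
    optimizer.step()

    if step % PARAMETERS.target_net_sync == 0:
        target_net.sync()            # copy online weights into the target network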
# Load training status
try:
    status = utils.load_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}

# Define actor-critic model
try:
    base_model = utils.load_model(model_dir)
    logger.info("Model successfully loaded\n")
except OSError:
    if args.algo == "dqn":
        base_model = DQNModel(obs_space, envs[0].action_space, args.mem, args.text)
    else:
        base_model = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    logger.info("Model successfully created\n")
logger.info("{}\n".format(base_model))

if torch.cuda.is_available():
    base_model.cuda()
logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

# Train model
num_frames = status["num_frames"]
total_start_time = time.time()
update = status["update"]