def train_model():
    # Initiate the environment
    env = gym.make('Mario-Kart-Luigi-Raceway-v0')

    resolution = (120, 160)
    actions = [
        [-60, 0, 1, 0, 0],   # left
        [60, 0, 1, 0, 0],    # right
        [0, -80, 0, 1, 0],   # back
        [0, 0, 1, 0, 0],     # go straight
        # [0, 0, 0, 1, 0],   # brake
    ]

    # Initiate the model
    model = DQNModel(resolution=resolution,
                     nb_frames=learn_param['nb_frames'],
                     actions=actions)
    # print("number of actions: ", len(doom.actions))  # 16

    if model_weights:
        model.load_weights(model_weights)

    agent = RLAgent(model, **learn_param)

    # Perform reinforcement learning on the scenario
    agent.train(env)
def test_result():
    #############
    #   test    #
    #############
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    policy_model = DQNModel(4, 18)
    # policy_model.load_state_dict(torch.load('./data/dqn_Riverraid_qnetwork_target_state_dict.pt'))
    # policy_model.eval()

    env = atari_wrappers.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrappers.wrap_deepmind(env, clip_rewards=True,
                                       frame_stack=True, pytorch_img=True)

    policy_model.load_model(
        torch.load('./data/dqn_Riverraid_qnetwork_target_state_dict.pickle'))

    num_episodes = 5
    episode = 1
    score = 0
    ep_score = []
    done = False

    while episode < num_episodes:
        observation = env.reset()
        done = False
        while not done:
            # action = agent.act(state)
            with torch.no_grad():
                # convert the stacked-frame observation to a batched float tensor in [0, 1]
                t_observation = torch.from_numpy(np.array(observation)).float()
                t_observation /= 255
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                q_value = policy_model.forward(t_observation)
                action = q_value.argmax(1).item()
            env.render()
            time.sleep(0.0005)
            next_observation, reward, done, info = env.step(action)
            score += reward
            observation = next_observation
        if info['ale.lives'] == 0:
            episode += 1
            ep_score.append(score)
            score = 0

    print("Average Score : {}".format(int(np.mean(ep_score))))
    print(ep_score)
def run_weights():
    env = gym.make('Mario-Kart-Luigi-Raceway-v0')

    resolution = (120, 160)
    actions = [
        [-60, 0, 1, 0, 0],   # left
        [60, 0, 1, 0, 0],    # right
        [0, -80, 0, 1, 0],   # back
        [0, 0, 1, 0, 0],     # go straight
        # [0, 0, 0, 1, 0],   # brake
    ]

    # Load model and weights
    model = DQNModel(resolution=resolution,
                     nb_frames=test_param['nb_frames'],
                     actions=actions)
    model.load_weights(model_weights)

    agent = RLAgent(model, **test_param)
    agent.test(env)
STATE_SHAPE = [8]
NUM_ACTIONS = 3
# A higher learning rate can be used for simple envs
LEARNING_RATE = 1e-2

fake_states = np.random.random([3] + STATE_SHAPE)
fake_target_states = np.random.random([3] + STATE_SHAPE)
fake_rewards = np.array([100, 100, 100])
fake_dones = np.array([1, 1, 1])

print('Testing action optimization process')
for i_action in range(NUM_ACTIONS):
    fake_actions = np.array(3 * [i_action])
    tf.reset_default_graph()
    model = DQNModel(STATE_SHAPE, NUM_ACTIONS)

    print('Optimizing for action', i_action)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        old_preds = model.predict(sess, fake_states)
        print('Old predictions:\n', old_preds)

        for _ in range(100):
            model.train(sess, LEARNING_RATE, fake_states, fake_target_states,
                        fake_actions, fake_rewards, fake_dones)

        new_preds = model.predict(sess, fake_states)
        print('New predictions:\n', new_preds)

print('Testing target update process')
tf.reset_default_graph()
from learner import Learner
from model import DQNModel
import gym
import maze_env

env = gym.make('Maze-v0')
learner = Learner(env, model=DQNModel())
learner.run()
import gym
import numpy as np

from model import DQNModel
from policy import EpsGreedyPolicy
from memory import Memory
from agent import DQNAgent
from processor import AtariProcessor

if __name__ == '__main__':
    ENV_NAME = 'Riverraid-v4'
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    model = DQNModel(nb_actions=nb_actions).model
    policy = EpsGreedyPolicy(eps_min=0.1, eps_max=1, eps_test=0.05,
                             nb_steps=1000000)
    memory = Memory(max_len=1000000)
    processor = AtariProcessor()

    dqn = DQNAgent(env, model, policy, memory, processor,
                   gamma=0.99, batch_size=32,
                   target_model_update_steps=10000,
                   nb_episodes_warmup=500)
# Init environment
env = gym.make(args.env)
if "Street" not in args.env:
    env.unwrapped.set_difficulty(status["difficulty"], weighted=False)
    env.shaped_reward = args.dense_reward
env.seed(args.seed)

# Get obs space and preprocess function
obs_space, preprocess_obss = utils.get_obss_preprocessor(
    args.env, env.observation_space, model_dir)

# Load model
try:
    policy_net = utils.load_model(model_dir)
    target_net = DQNModel(env.action_space, env=args.env)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    print("Model successfully loaded\n")
except OSError:
    policy_net = DQNModel(env.action_space, env=args.env)
    target_net = DQNModel(env.action_space, env=args.env)
    target_net.load_state_dict(policy_net.state_dict())
    print("Model successfully created\n")

if torch.cuda.is_available():
    policy_net.cuda()
    target_net.cuda()
target_net.eval()

print("CUDA available: {}\n".format(torch.cuda.is_available()))
class TerranBot(sc2.BotAI):

    def __init__(self, epsilon=1.0):
        self.next_actionable = 0
        self.scout_locations = {}
        self.rewards = []

        weighted_actions = {
            self.no_op: 1,
            self.standby: 1,
            self.attack: 3,
            self.manage_supply: 5,
            self.adjust_refinery_assignment: 1,
            self.manage_refineries: 1,
            self.manage_barracks: 3,
            self.manage_barracks_tech_labs: 1,
            self.manage_barracks_reactors: 1,
            self.manage_factories: 1,
            self.manage_starports: 1,
            self.train_workers: 3,
            self.train_marines: 7,
            self.train_marauders: 4,
            self.train_hellions: 1,
            self.train_medivacs: 1,
            self.upgrade_cc: 1,
            self.expand: 4,
            self.scout: 1,
            self.calldown_mules: 2,
        }

        self.actions = []
        for action_fn, weight in weighted_actions.items():
            for _ in range(weight):
                self.actions.append(action_fn)

        self.curr_state = None
        self.num_actions = len(self.actions)
        self.dqn = DQNModel(self.actions, eps=epsilon)
        self.iteration = 0

        # <list> [UnitId] specifying military composition.
        self.military_distribution = [
            MARINE,
            MARAUDER,
            HELLION,
        ]

        self.tl_tags = []
        self.techlab_research_options = [
            RESEARCH_COMBATSHIELD,
            RESEARCH_CONCUSSIVESHELLS,
            BARRACKSTECHLABRESEARCH_STIMPACK,
        ]

    async def on_step(self, iteration):
        self.seconds_elapsed = self.state.game_loop / TIME_SCALAR
        self.minutes_elapsed = self.seconds_elapsed / SECONDS_PER_MIN
        self.attack_waves = set()
        self.iteration += 1
        self.num_troops_per_wave = min(14 + self.minutes_elapsed, 30)

        if self.curr_state is not None:
            self.prev_state = self.curr_state
            self.remember()

        if self.iteration % REPLAY_BATCH_SIZE == 0:
            self.dqn.replay(REPLAY_BATCH_SIZE)
        if self.iteration % UPDATE_TARGET_FREQ == 0:
            self.dqn.train_target_model()

        await self.visualize()

        if not self.townhalls.exists:
            target = self.known_enemy_structures.random_or(
                self.enemy_start_locations[0]).position
            for unit in self.workers | self.military_units:
                await self.do(unit.attack(target))
            return

        ready_techlabs = self.units(BARRACKSTECHLAB).ready
        if len(ready_techlabs) != len(self.tl_tags):
            self.tl_tags = []
            for techlab in ready_techlabs:
                self.tl_tags.append(techlab.tag)

        if len(self.techlab_research_options) > 0:
            for techlab in ready_techlabs:
                try:
                    to_research = random.choice(self.techlab_research_options)
                    if self.can_afford(to_research):
                        await self.do(techlab(to_research))
                        # drop the researched option from the remaining choices
                        self.techlab_research_options = [
                            x for x in self.techlab_research_options
                            if x != to_research
                        ]
                except Exception as err:
                    pass

        for cc in self.townhalls:
            enemies = self.known_enemy_units.closer_than(25.0, cc).filter(
                lambda x: x.name.lower() not in ["scv", "drone", "probe"])
            if len(enemies) > 0:
                target = random.choice(enemies)
                for unit in self.military_units:
                    await self.do(unit.attack(target))
                break

        self.action = self.make_action_selection()
        # print(f"action chosen == {self.action}")

        self.prepare_attack()

        if len(list(self.attack_waves)) > 0 and self.units(MEDIVAC).idle.amount > 0:
            alive_units = list(self.attack_waves)[0].select_units(self.units)
            for med in self.units(MEDIVAC).idle:
                await self.do(med.attack(alive_units.first.position))

        await self.distribute_workers()
        await self.lower_depots()
        await self.take_action()

    async def no_op(self):
        pass

    async def standby(self):
        self.next_actionable = self.seconds_elapsed + random.randrange(1, 37)

    async def take_action(self):
        if self.seconds_elapsed <= self.next_actionable:
            return
        try:
            await self.actions[self.action]()
        except Exception as err:
            print(str(err))

    def make_action_selection(self):
        if self.seconds_elapsed <= self.next_actionable or self.curr_state is None:
            return 0
        return self.dqn.choose_action(self.curr_state)

    def remember(self, reward=None, done=False):
        reward_value = reward if reward else (
            self.state.score.score / (200 * self.seconds_elapsed))
        self.rewards.append(reward_value)
        self.dqn.remember(self.prev_state, self.action, reward_value,
                          self.curr_state, done)

    #### WORKERS ####
    #################

    async def train_workers(self):
        if not self.can_afford(SCV):
            return
        for cc in self.townhalls.ready.filter(lambda x: len(x.orders) < 3):
            if len(self.workers) < 18 * len(self.townhalls):
                await self.do(cc.train(SCV))

    async def manage_supply(self):
        if self.can_afford(SUPPLYDEPOT) \
                and self.supply_left < 10 and self.already_pending(SUPPLYDEPOT) < 2:
            position = self.townhalls.ready.random.position.towards(
                self.game_info.map_center, 5)
            await self.build(SUPPLYDEPOT, position)

    async def lower_depots(self):
        for sd in self.units(SUPPLYDEPOT).ready:
            await self.do(sd(MORPH_SUPPLYDEPOT_LOWER))

    async def upgrade_cc(self):
        for cc in self.units(COMMANDCENTER).idle:
            if self.barracks.ready.exists and self.can_afford(ORBITALCOMMAND):
                await self.do(cc(UPGRADETOORBITAL_ORBITALCOMMAND))

    async def calldown_mules(self):
        for oc in self.units(ORBITALCOMMAND).filter(lambda x: x.energy >= 50):
            mfs = self.state.mineral_field.closer_than(10, oc)
            if mfs:
                mf = max(mfs, key=lambda x: x.mineral_contents)
                await self.do(oc(CALLDOWNMULE_CALLDOWNMULE, mf))

    async def expand(self):
        try:
            if self.can_afford(COMMANDCENTER):
                await self.expand_now(max_distance=100)
        except Exception as err:
            print(str(err))

    async def manage_refineries(self):
        for cc in self.units(COMMANDCENTER).ready:
            vgs = self.state.vespene_geyser.closer_than(16.0, cc)
            for vg in vgs:
                if not self.can_afford(REFINERY):
                    break
                worker = self.select_build_worker(vg.position)
                if worker is None:
                    break
                if not self.units(REFINERY).closer_than(2.0, vg).exists:
                    await self.do(worker.build(REFINERY, vg))

    async def adjust_refinery_assignment(self):
        r = self.units(REFINERY).ready.random
        if r.assigned_harvesters < r.ideal_harvesters:
            w = self.workers.closer_than(16.0, r)
            if w.exists:
                await self.do(w.random.gather(r))

    #### MILITARY ####
    ##################

    async def attack(self):
        """
        Sends any attack group out to target. No micro is done on the
        army dispatch.
        """
        if len(self.known_enemy_structures) > 0:
            target = random.choice(self.known_enemy_structures).position
        elif len(self.known_enemy_units) > 0:
            target = self.known_enemy_units.closest_to(
                random.choice(self.townhalls)).position
        else:
            target = self.enemy_start_locations[0].position

        for wave in list(self.attack_waves):
            alive_units = wave.select_units(self.units)
            if alive_units.exists and alive_units.idle.exists:
                for unit in wave.select_units(self.units):
                    await self.do(unit.attack(target))
            else:
                self.attack_waves.remove(wave)

    async def manage_barracks(self):
        if not self.depots.ready.exists:
            return
        if self.can_afford(BARRACKS) and self.barracks.amount < 1 + self.minutes_elapsed:
            depot = self.depots.ready.random
            await self.build(BARRACKS, near=depot)

    async def manage_barracks_tech_labs(self):
        rax = self.barracks.ready.noqueue.random
        if rax.add_on_tag == 0:
            await self.do(rax.build(BARRACKSTECHLAB))

    async def manage_barracks_reactors(self):
        rax = self.barracks.ready.noqueue.random
        if rax.add_on_tag == 0:
            await self.do(rax.build(BARRACKSREACTOR))

    async def manage_factories(self):
        if not self.depots.ready.exists:
            return
        if not self.barracks.ready.exists:
            return
        if self.can_afford(FACTORY) and self.units(FACTORY).amount < 3:
            depot = self.depots.ready.random
            await self.build(FACTORY, near=depot)

    async def manage_starports(self):
        if not self.depots.ready.exists:
            return
        if not self.barracks.ready.exists:
            return
        if not self.units(FACTORY).ready.exists:
            return
        if self.can_afford(STARPORT) and self.units(STARPORT).amount < 2:
            depot = self.depots.ready.random
            await self.build(STARPORT, near=depot)

    async def train_marines(self):
        for rax in self.barracks.ready.filter(
                lambda x: x.add_on_tag not in self.tl_tags and len(x.orders) < 3):
            if not self.can_afford(MARINE):
                break
            await self.do(rax.train(MARINE))

    async def train_marauders(self):
        for rax in self.barracks.ready.filter(
                lambda x: x.add_on_tag in self.tl_tags and len(x.orders) < 3):
            if not self.can_afford(MARAUDER):
                break
            await self.do(rax.train(MARAUDER))

    async def train_hellions(self):
        for f in self.units(FACTORY).ready.filter(lambda x: len(x.orders) < 3):
            if not self.can_afford(HELLION):
                break
            await self.do(f.train(HELLION))

    async def train_medivacs(self):
        for sp in self.units(STARPORT).ready.filter(lambda x: len(x.orders) < 3):
            if not self.can_afford(MEDIVAC):
                break
            await self.do(sp.train(MEDIVAC))

    def prepare_attack(self):
        """
        Prepares an attack wave when ready.
        """
        total = 0
        for unit in self.military_distribution:
            units = self.units(unit)
            total += units.idle.amount

        if total >= self.num_troops_per_wave:
            attack_wave = None
            for unit in self.military_distribution:
                units = self.units(unit)
                if attack_wave is None:
                    attack_wave = ControlGroup(units.idle)
                else:
                    attack_wave.add_units(units.idle)
            self.attack_waves.add(attack_wave)

    #### VISUALIZATION ####
    #######################

    async def visualize(self):
        game_map = np.zeros((self.game_info.map_size[1],
                             self.game_info.map_size[0], 3), np.uint8)

        await self.visualize_map(game_map)
        await self.visualize_resources(game_map)

        # cv assumes (0, 0) top-left => need to flip along horizontal axis
        curr_state = cv.flip(game_map, 0)
        if VISUALIZE:
            cv.imshow('Map', cv.resize(curr_state, dsize=None, fx=2, fy=2))
            cv.waitKey(1)

        self.curr_state = curr_state.reshape([-1, 184, 152, 3])

    async def visualize_map(self, game_map):
        # game coordinates need to be represented as (y, x) in 2d arrays
        for unit in self.units().ready:
            posn = unit.position
            cv.circle(game_map, (int(posn[0]), int(posn[1])),
                      int(unit.radius * 8), (0, 0, 255),
                      math.ceil(int(unit.radius * 0.5)))
        for unit in self.known_enemy_units:
            posn = unit.position
            cv.circle(game_map, (int(posn[0]), int(posn[1])),
                      int(unit.radius * 8), (255, 0, 0),
                      math.ceil(int(unit.radius * 0.5)))

    async def visualize_resources(self, game_map):
        line_scalar = 40
        minerals = min(1.0, self.minerals / 1200)
        vespene = min(1.0, self.vespene / 1200)
        pop_space = min(1.0, self.supply_left / max(1.0, self.supply_cap))
        supply_usage = self.supply_cap / 200
        military = (self.supply_cap - self.supply_left - self.workers.amount) \
            / max(1, self.supply_cap - self.supply_left)

        cv.line(game_map, (0, 16), (int(line_scalar * minerals), 16), (255, 40, 37), 2)
        cv.line(game_map, (0, 12), (int(line_scalar * vespene), 12), (25, 240, 20), 2)
        cv.line(game_map, (0, 8), (int(line_scalar * pop_space), 8), (150, 150, 150), 2)
        cv.line(game_map, (0, 4), (int(line_scalar * supply_usage), 4), (64, 64, 64), 2)
        cv.line(game_map, (0, 0), (int(line_scalar * military), 0), (0, 0, 255), 2)

    #### SCOUTING ####
    ##################

    async def scout(self):
        expand_distances = {}
        for el in self.expansion_locations:
            distance_to_enemy_start = el.distance_to(self.enemy_start_locations[0])
            expand_distances[distance_to_enemy_start] = el

        distance_keys = sorted(k for k in expand_distances)
        unit_tags = [unit.tag for unit in self.units]

        to_be_removed = []
        for s in self.scout_locations:
            if s not in unit_tags:
                to_be_removed.append(s)
        for scout in to_be_removed:
            del self.scout_locations[scout]

        assign_scout = True
        for unit in self.workers:
            if unit.tag in self.scout_locations:
                assign_scout = False

        if assign_scout:
            workers = self.workers.idle if len(self.workers.idle) > 0 else self.workers.gathering
            for worker in workers[:1]:
                if worker.tag not in self.scout_locations:
                    for dist in distance_keys:
                        try:
                            location = next(v for k, v in expand_distances.items() if k == dist)
                            active_locations = [self.scout_locations[k]
                                                for k in self.scout_locations]
                            if location not in active_locations:
                                await self.do(worker.move(location))
                                self.scout_locations[worker.tag] = location
                                break
                        except Exception as e:
                            pass

        for worker in self.workers:
            if worker.tag in self.scout_locations:
                await self.do(worker.move(self.vary_loc(self.scout_locations[worker.tag])))

    def vary_loc(self, location):
        x = location[0] + random.randrange(-10, 10)
        y = location[1] + random.randrange(-10, 10)
        x = min(self.game_info.map_size[0], max(x, 0))
        y = min(self.game_info.map_size[1], max(y, 0))
        return position.Point2(position.Pointlike((x, y)))

    #### HELPERS ####
    #################

    @property
    def depots(self):
        return self.units.of_type([
            SUPPLYDEPOT,
            SUPPLYDEPOTLOWERED,
            SUPPLYDEPOTDROP,
        ])

    @property
    def barracks(self):
        return self.units(BARRACKS)

    @property
    def military_units(self):
        return self.marines | self.marauders | self.medivacs | self.hellions

    @property
    def marines(self):
        return self.units(MARINE)

    @property
    def marauders(self):
        return self.units(MARAUDER)

    @property
    def medivacs(self):
        return self.units(MEDIVAC)

    @property
    def hellions(self):
        return self.units(HELLION)
class Agent:

    def __init__(self, portfolio_size, batch_size, max_experiences,
                 min_experiences, is_eval=False):
        self.portfolio_size = portfolio_size
        self.action_size = 3  # sit, buy, sell
        self.input_shape = (self.portfolio_size, self.portfolio_size,)
        self.is_eval = is_eval

        # replay buffer hyperparameters
        self.expReplayBuffer = {'s': [], 'a': [], 'r': [], 's2': [], 'done': []}
        self.expReplayBufferSize = 0
        self.batch_size = batch_size  # for the replay buffer
        self.max_experiences = max_experiences
        self.min_experiences = min_experiences

        # training hyperparameters
        self.alpha = 0.5
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.05  # decay rate after every iteration

        # models
        self.hidden_units = [100, 50]
        self.train_model = DQNModel(self.input_shape, self.hidden_units,
                                    self.action_size,
                                    self.portfolio_size).get_model()
        self.test_model = self.get_model()

    def get_model(self):
        """Load the saved model."""
        json_file = open("models/model.json", 'r')
        loaded_json_file = json_file.read()
        json_file.close()
        loaded_model = model_from_json(loaded_json_file)
        loaded_model.load_weights("models/model.h5")
        return loaded_model

    def predictions_to_weights(self, pred):
        """
        Helper function - convert the model predictions to the weights
        associated with the portfolio stocks.
        """
        weights = np.zeros(len(pred))
        raw_weights = np.argmax(pred, axis=-1)
        for stock, action in enumerate(raw_weights):
            if action == 0:
                weights[stock] = 0
            elif action == 1:
                weights[stock] = np.abs(pred[stock][0][action])   # pred is an array of arrays
            else:
                weights[stock] = -np.abs(pred[stock][0][action])  # pred is an array of arrays
        return weights

    def policy(self, state):
        if self.is_eval:
            # when testing, use the model predictions directly, irrespective of epsilon
            pred = self.test_model.predict(np.expand_dims(state.values, 0))
        else:
            if random.random() <= self.epsilon:
                # during training, choose randomly with probability epsilon
                weights = np.random.normal(0, 1, size=(self.portfolio_size,))
                saved_sum = np.sum(weights)
                weights = weights / saved_sum  # the weights should sum to 1
                return weights
            else:
                pred = self.train_model.predict(np.expand_dims(state.values, 0))
        return self.predictions_to_weights(pred)

    def weights_to_predictions(self, action_weights, rewards, Q_star):
        Q = np.zeros((self.portfolio_size, self.action_size))
        for i in range(self.portfolio_size):
            if action_weights[i] == 0:
                Q[i][0] = rewards[i] + self.gamma * np.max(Q_star[i][0])
            elif action_weights[i] > 0:
                Q[i][1] = rewards[i] + self.gamma * np.max(Q_star[i][1])
            else:
                Q[i][2] = rewards[i] + self.gamma * np.max(Q_star[i][2])
        return Q

    def train(self, TargetNet):
        # print("Training in progress")
        # sample batch_size experience indices for training
        ids = np.random.randint(low=0, high=len(self.expReplayBuffer['s']),
                                size=self.batch_size)

        # store the experience data in vars for easy access
        # states = np.asarray([self.expReplayBuffer['s'][i] for i in ids])
        # actions = np.asarray([self.expReplayBuffer['a'][i] for i in ids])
        # rewards = np.asarray([self.expReplayBuffer['r'][i] for i in ids])
        # states_next = np.asarray([self.expReplayBuffer['s2'][i] for i in ids])
        # dones = np.asarray([self.expReplayBuffer['done'][i] for i in ids])

        for i in range(len(self.expReplayBuffer['s'])):
            state = self.expReplayBuffer['s'][i]
            action = self.expReplayBuffer['a'][i]
            reward = self.expReplayBuffer['r'][i]
            state_next = self.expReplayBuffer['s2'][i]
            done = self.expReplayBuffer['done'][i]

            # predict the q values for states_next using TargetNet, whose
            # variables are more stable
            # print("Shape: " + str(state_next.shape))
            values_next = np.max(TargetNet.predict(
                np.expand_dims(state_next, axis=0)), axis=1)
            # print("Action vals")
            # print(action)
            # actual_values = np.where(dones, rewards, rewards + self.gamma * values_next)
            Q_learned_values = self.weights_to_predictions(action, reward, values_next)
            Q_val = TargetNet.predict(np.expand_dims(state, axis=0))

            # Q-learning update formula
            Q_val = [
                np.add(a * (1 - self.alpha), q * self.alpha)
                for a, q in zip(Q_val, Q_learned_values)
            ]

            # train the main model
            self.train_model.fit(np.expand_dims(state, 0), Q_val,
                                 epochs=1, verbose=0)

        # decrease the exploration rate after every iteration

    def add_experience(self, experience):
        """Add an experience to the expReplayBuffer."""
        # print("Length: " + str(self.expReplayBufferSize))
        if self.expReplayBufferSize >= self.max_experiences:
            # remove an old experience to make room for a new one (FIFO)
            for key in self.expReplayBuffer.keys():
                self.expReplayBuffer[key].pop(0)
        for key, value in experience.items():
            self.expReplayBuffer[key].append(value)  # add the new experience
class DQNAgent:

    def __init__(self, env, action_size, config):
        self.memory = RingBuffer(int(config.config_section_map()['memorysize']))
        self.gamma = float(config.config_section_map()['gamma'])  # discount rate
        self.epsilon = float(config.config_section_map()['epsilon'])  # exploration rate
        self.epsilon_min = float(config.config_section_map()['epsilonmin'])
        self.epsilon_decay = float(config.config_section_map()['epsilondecay'])
        self.learning_rate = float(config.config_section_map()['learningrate'])
        self.action_size = action_size
        self.env = env
        self.dqn_model = DQNModel(self.learning_rate, action_size)

    def remember(self, state, action, reward, next_state, done):
        state = state.astype('uint8')
        next_state = next_state.astype('uint8')
        reward = np.sign(reward)
        self.memory.append((state, action, reward, next_state, done))

    def action(self, fi_t, env_sample, csv_handler):
        num_random = random.uniform(0, 1)
        if num_random <= self.epsilon:
            # with probability epsilon do a random action
            return env_sample
        else:
            fi_t = np.expand_dims(fi_t, axis=0)
            action = self.dqn_model.model.predict(
                [fi_t, np.ones([1, self.action_size])])
            csv_handler.write_q_values(action)
            return np.argmax(action[0])

    def replay(self, batch_size, csv_logger):
        states = np.zeros((batch_size, 4, 84, 84), dtype='float32')
        actions = np.zeros((batch_size, 4), dtype='uint8')
        rewards = np.zeros(batch_size, dtype='float32')
        next_states = np.zeros((batch_size, 4, 84, 84), dtype='float32')
        dones = np.ones((batch_size, 4), dtype=bool)

        mini_batch = self.get_minibatch(batch_size)  # sample random mini_batch from D

        i = 0
        for state, action, reward, next_state, done in mini_batch:
            next_state = next_state.astype('float32')
            state = state.astype('float32')
            states[i] = state
            actions[i][action] = 1
            rewards[i] = reward
            next_states[i] = next_state
            dones[i] = [done, done, done, done]
            i += 1

        next_state_q_values = self.dqn_model.target_model.predict(
            [next_states, np.ones(actions.shape)])
        next_state_q_values[dones] = 0
        q_values = rewards + self.gamma * np.max(next_state_q_values, axis=1)

        # Trains the model for a fixed number of epochs (iterations on a dataset)
        self.dqn_model.model.fit([states, actions],
                                 actions * q_values[:, None],
                                 batch_size=batch_size,
                                 verbose=0,
                                 callbacks=[csv_logger])

    def get_minibatch(self, batch_size):
        mini_batch = []
        for i in range(batch_size):
            index = randint(0, len(self.memory) - 1)
            mini_batch.append(self.memory[index])
        return mini_batch

    def load(self, name):
        self.dqn_model.model.load_weights(name)
        self.dqn_model.update_target_model()

    def save(self, name):
        self.dqn_model.model.save_weights(name)

    def decrease_epsilone(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
deer_handle: int
tiger_handle: int
deer_handle, tiger_handle = gridworld.get_handles()


def reset_environment():
    gridworld.reset()
    gridworld.add_walls(method="random", n=MAP_SIZE * MAP_SIZE * WALLS_DENSITY)
    gridworld.add_agents(deer_handle, method="random", n=COUNT_DEERS)
    gridworld.add_agents(tiger_handle, method="random", n=COUNT_TIGERS)


environment: MAgentEnv = MAgentEnv(
    gridworld, tiger_handle, reset_environment_funcion=reset_environment)

dqn_model: DQNModel = DQNModel(
    environment.single_observation_space.spaces[0].shape,
    environment.single_observation_space.spaces[1].shape,
    gridworld.get_action_space(tiger_handle)[0]).to(device)
target_net: TargetNet = ptan.agent.TargetNet(dqn_model)
print(dqn_model)

action_selector: EpsilonGreedyActionSelector = EpsilonGreedyActionSelector(
    epsilon=PARAMETERS.epsilon_start)
epsilon_tracker: EpsilonTracker = EpsilonTracker(action_selector, PARAMETERS)
pre_processor: MAgentPreprocessor = MAgentPreprocessor(device)
dqn_agent: ptan.agent.DQNAgent = ptan.agent.DQNAgent(
    dqn_model, action_selector, device, preprocessor=pre_processor)

experience_source: ptan.experience.ExperienceSourceFirstLast = \
    ptan.experience.ExperienceSourceFirstLast(
        environment, dqn_agent, PARAMETERS.gamma, vectorized=True)
def main():
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("use_cuda: ", use_cuda)
    print("Device: ", device)

    env = atari_wrapper.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrapper.wrap_deepmind(env,
                                      clip_rewards=False,
                                      frame_stack=True,
                                      pytorch_img=True)

    action_space = [a for a in range(env.action_space.n)]
    n_action = len(action_space)

    # DQN model and optimizer:
    policy_model = DQNModel().to(device)
    target_model = DQNModel().to(device)
    target_model.load_state_dict(policy_model.state_dict())
    optimizer = torch.optim.RMSprop(policy_model.parameters(), lr=lr, alpha=alpha)

    # Initialize the replay buffer with random play
    replay_buffer = ReplayBuffer(rep_buf_size)

    while len(replay_buffer) < rep_buf_ini:
        observation = env.reset()
        done = False
        while not done:
            with torch.no_grad():
                t_observation = torch.from_numpy(observation).float().to(device)
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                action = random.sample(range(len(action_space)), 1)[0]
            next_observation, reward, done, info = env.step(action_space[action])
            replay_buffer.push(observation, action, reward, next_observation, done)
            observation = next_observation

    print('Experience Replay buffer initialized')

    # Use a log to record the performance
    logger = logging.getLogger('dqn_Riverraid')
    logger.setLevel(logging.INFO)
    logger_handler = logging.FileHandler('./dqn_Riverraid.log')
    logger.addHandler(logger_handler)

    # Training part
    env.reset()
    score = 0
    episode_score = []
    mean_episode_score = []
    episode_true = 0
    num_frames = 0
    episode = 0
    last_100episode_score = deque(maxlen=100)

    while episode < max_episodes:
        observation = env.reset()
        done = False
        # import time
        # start = time.time()

        while not done:
            with torch.no_grad():
                t_observation = torch.from_numpy(observation).float().to(device) / 255
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                epsilon = epsilon_by_frame(num_frames)
                if random.random() > epsilon:
                    q_value = policy_model(t_observation)
                    action = q_value.argmax(1).data.cpu().numpy().astype(int)[0]
                else:
                    action = random.sample(range(len(action_space)), 1)[0]

            next_observation, reward, done, info = env.step(action_space[action])
            num_frames += 1
            score += reward
            replay_buffer.push(observation, action, reward, next_observation, done)
            observation = next_observation

            # Update policy
            if len(replay_buffer) > batch_size and num_frames % skip_frame == 0:
                observations, actions, rewards, next_observations, dones = \
                    replay_buffer.sample(batch_size)

                observations = torch.from_numpy(np.array(observations) / 255).float().to(device)
                actions = torch.from_numpy(np.array(actions).astype(int)).float().to(device)
                actions = actions.view(actions.shape[0], 1)
                rewards = torch.from_numpy(np.array(rewards)).float().to(device)
                rewards = rewards.view(rewards.shape[0], 1)
                next_observations = torch.from_numpy(np.array(next_observations) / 255).float().to(device)
                dones = torch.from_numpy(np.array(dones).astype(int)).float().to(device)
                dones = dones.view(dones.shape[0], 1)

                q_values = policy_model(observations)
                next_q_values = target_model(next_observations)
                q_value = q_values.gather(1, actions.long())
                next_q_value = next_q_values.max(1)[0].unsqueeze(1)
                expected_q_value = rewards + gamma * next_q_value * (1 - dones)

                loss = huber_loss(q_value, expected_q_value)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Soft-update the target network towards the policy network
                for target_param, policy_param in zip(target_model.parameters(),
                                                      policy_model.parameters()):
                    target_param.data.copy_(TAU * policy_param.data +
                                            (1 - TAU) * target_param.data)

        episode += 1
        # episode_score.append(score)
        # end = time.time()
        # print("Running time ( %i episode): %.3f Seconds " % (episode, end - start))

        if info['ale.lives'] == 0:
            # episode_score.append(score)
            mean_score = score
            episode_true += 1
            score = 0
            # if episode % 20 == 0:
            #     mean_score = np.mean(episode_score)
            mean_episode_score.append(mean_score)
            last_100episode_score.append(mean_score)
            # episode_score = []
            logger.info('Frame: ' + str(num_frames) + ' / Episode: ' +
                        str(episode_true) + ' / Average Score : ' +
                        str(int(mean_score)) + ' / epsilon: ' + str(float(epsilon)))
            # plot_score(mean_episode_score, episode_true)
            pickle.dump(mean_episode_score,
                        open('./dqn_Riverraid_mean_scores.pickle', 'wb'))

            if episode_true % 50 == 1:
                logger.info('Frame: ' + str(num_frames) + ' / Episode: ' +
                            str(episode_true) + ' / Average Score : ' +
                            str(int(mean_score)) + ' / epsilon: ' +
                            str(float(epsilon)) + ' / last_100episode_score: ' +
                            str(float(np.mean(last_100episode_score))))

        if episode % 50 == 0:
            torch.save(target_model.state_dict(),
                       './dqn_spaceinvaders_target_model_state_dict.pt')
            torch.save(policy_model.state_dict(),
                       './dqn_spaceinvaders_model_state_dict.pt')
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, in_channels, action_size, seed):
        """Initialize an Agent object."""
        self.in_channels = in_channels
        self.action_size = action_size
        # self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = DQNModel(in_channels, action_size)
        self.qnetwork_target = DQNModel(in_channels, action_size)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.loss_list = []

    def step(self, observation, action, reward, next_observation, done, num_frames):
        # Save experience in replay memory
        self.memory.add(observation, action, reward, next_observation, done)
        self.t_step = num_frames

        # Learn every UPDATE_EVERY time steps.
        if self.t_step % skip_frame == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                # experiences = self.memory.sample()
                self.learn()

    def act(self, observation, eps=0.):
        """Returns actions for the given observation as per the current policy."""
        t_observation = torch.from_numpy(observation).double() / 255  # grayscale normalization
        t_observation = t_observation.unsqueeze(0).to(device)

        # Epsilon-greedy action selection
        if random.random() > eps:
            action_values = self.qnetwork_local.forward(t_observation)
            # argmax over the action dimension of the batched output
            action = action_values.argmax(1).data.cpu().numpy().astype(int)[0]
        else:
            action = random.sample(range(self.action_size), 1)[0]
        return action

    def learn(self):
        observations, actions, rewards, next_observations, dones = self.memory.sample()

        observations = torch.from_numpy(np.array(observations) / 255).double().to(device)
        actions = torch.from_numpy(np.array(actions).astype(int)).int().to(device)
        actions = actions.view(actions.shape[0], 1)
        rewards = torch.from_numpy(np.array(rewards)).double().to(device)
        rewards = rewards.view(rewards.shape[0], 1)
        next_observations = torch.from_numpy(np.array(next_observations) / 255).double().to(device)
        dones = torch.from_numpy(np.array(dones).astype(int)).int().to(device)
        dones = dones.view(dones.shape[0], 1)

        Q_target_next = self.qnetwork_target.forward(next_observations).max(1)[0].unsqueeze(1)
        # if done, the bootstrap term is zeroed out
        Q_target = rewards + gamma * Q_target_next * (1 - dones)

        # compute Q_local
        Q_local = self.qnetwork_local.forward(observations).gather(1, actions.long())

        loss = self.huber_loss(Q_local, Q_target)
        self.qnetwork_local.backward(Q_target, Q_local, "huber", actions)
        self.loss_list.append(loss.cpu().numpy())
        self.qnetwork_local.step()

        # update target network
        # if self.t_step % UPDATE_FREQUENCY == 0:
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = tau*θ_local + (1 - tau)*θ_target
        """
        self.qnetwork_target.soft_update(local_model, TAU)

    def huber_loss(self, input, target, beta=1, size_average=True):
        """Huber loss: quadratic near zero, linear for large errors, which
        makes training more robust to outliers."""
        n = torch.abs(input - target)
        cond = n < beta
        loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta)
        if size_average:
            return loss.mean()
        return loss.sum()
second_tiger_handle: int
deer_handle, first_tiger_handle, second_tiger_handle = environment.get_handles()

environment.reset()
environment.add_walls(method="random", n=map_size * map_size * wall_density)
environment.add_agents(deer_handle, method="random", n=deers)
environment.add_agents(first_tiger_handle, method="random", n=tigers)
environment.add_agents(second_tiger_handle, method="random", n=tigers)

view_space: Tuple = environment.get_view_space(first_tiger_handle)
view_space = (view_space[-1], ) + view_space[:2]

dqn_model: DQNModel = DQNModel(
    view_space,
    environment.get_feature_space(first_tiger_handle),
    environment.get_action_space(first_tiger_handle)[0])
dqn_model.load_state_dict(torch.load(model, map_location=map_location))
print(dqn_model)

reward_tiger_1: float = 0.0
reward_tiger_2: float = 0.0
survivors: int

while True:
    first_tiger_actions: ndarray = get_actions(environment, dqn_model,
                                               first_tiger_handle)
    second_tiger_actions: ndarray = get_actions(environment, dqn_model,
                                                second_tiger_handle)
    environment.set_action(first_tiger_handle, first_tiger_actions)
# Initiate the env
env = gym.make('Mario-Kart-Luigi-Raceway-v0')

resolution = (120, 160)
actions = [
    [-60, 0, 1, 0, 0],   # left
    [60, 0, 1, 0, 0],    # right
    [0, -80, 0, 1, 0],   # back
    [0, 0, 1, 0, 0],     # go straight
    # [0, 0, 0, 1, 0],   # brake
]

# Initiate the model
model = DQNModel(resolution=resolution,
                 nb_frames=learn_param['nb_frames'],
                 actions=actions)
# print("number of actions: ", len(doom.actions))  # 16

if model_weights:
    model.load_weights(model_weights)
else:
    print("Please provide a model_weights file")

agent = RLAgent(model, **learn_param)

# Visualize the agent at a randomly chosen step (random screenshot)
agent.visualize(env)
# Load training status
try:
    status = utils.load_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}

# Define actor-critic model
try:
    base_model = utils.load_model(model_dir)
    logger.info("Model successfully loaded\n")
except OSError:
    if args.algo == "dqn":
        base_model = DQNModel(obs_space, envs[0].action_space, args.mem, args.text)
    else:
        base_model = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    logger.info("Model successfully created\n")
logger.info("{}\n".format(base_model))

if torch.cuda.is_available():
    base_model.cuda()
logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

# Train model
num_frames = status["num_frames"]
total_start_time = time.time()
update = status["update"]