Example #1
def train_model():

    # Initialize the environment
    env = gym.make('Mario-Kart-Luigi-Raceway-v0')

    resolution = (120, 160)

    actions = [
        [-60, 0, 1, 0, 0],  # left
        [60, 0, 1, 0, 0],   # right
        [0, -80, 0, 1, 0],  # back
        [0, 0, 1, 0, 0],    # go straight
        # [0, 0, 0, 1, 0],  # brake
    ]

    # Initialize the model
    model = DQNModel(resolution=resolution,
                     nb_frames=learn_param['nb_frames'],
                     actions=actions)

    # print("number of actions: ", len(doom.actions))   # 16

    if model_weights:
        model.load_weights(model_weights)

    agent = RLAgent(model, **learn_param)

    # Perform reinforcement learning on the scenario
    agent.train(env)
Example #2
    def __init__(self, epsilon=1.0):
        self.next_actionable = 0
        self.scout_locations = {}
        self.rewards = []

        weighted_actions = {
            self.no_op: 1,
            self.standby: 1,
            self.attack: 3,
            self.manage_supply: 5,
            self.adjust_refinery_assignment: 1,
            self.manage_refineries: 1,
            self.manage_barracks: 3,
            self.manage_barracks_tech_labs: 1,
            self.manage_barracks_reactors: 1,
            self.manage_factories: 1,
            self.manage_starports: 1,
            self.train_workers: 3,
            self.train_marines: 7,
            self.train_marauders: 4,
            self.train_hellions: 1,
            self.train_medivacs: 1,
            self.upgrade_cc: 1,
            self.expand: 4,
            self.scout: 1,
            self.calldown_mules: 2,
        }

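        # Flatten the weight map into an action list: each handler appears
        # `weight` times, so random exploration favors higher-weighted actions.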
        self.actions = []
        for action_fn, weight in weighted_actions.items():
            for _ in range(weight):
                self.actions.append(action_fn)

        self.curr_state = None
        self.num_actions = len(self.actions)
        self.dqn = DQNModel(self.actions, eps=epsilon)

        self.iteration = 0

        # <list> [UnitId] specifying military composition.
        self.military_distribution = [
            MARINE,
            MARAUDER,
            HELLION
        ]

        self.tl_tags = []
        self.techlab_research_options = [
            RESEARCH_COMBATSHIELD, 
            RESEARCH_CONCUSSIVESHELLS, 
            BARRACKSTECHLABRESEARCH_STIMPACK
        ]
Example #3
    def __init__(self, env, action_size, config):
        self.memory = RingBuffer(int(
            config.config_section_map()['memorysize']))
        self.gamma = float(
            config.config_section_map()['gamma'])  # discount rate
        self.epsilon = float(
            config.config_section_map()['epsilon'])  # exploration rate
        self.epsilon_min = float(config.config_section_map()['epsilonmin'])
        self.epsilon_decay = float(config.config_section_map()['epsilondecay'])
        self.learning_rate = float(config.config_section_map()['learningrate'])
        self.action_size = action_size
        self.env = env
        self.dqn_model = DQNModel(self.learning_rate, action_size)

    def __init__(self, in_channels, action_size, seed):
        """Initialize an Agent object.
        """
        self.in_channels = in_channels
        self.action_size = action_size
        #self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = DQNModel(in_channels, action_size)
        self.qnetwork_target = DQNModel(in_channels, action_size)
    
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        
        self.loss_list = []
Example #5
def test_result():
    #############
    #   test    #
    #############
    #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    policy_model = DQNModel(4, 18)
    #policy_model.load_state_dict(torch.load('./data/dqn_Riverraid_qnetwork_target_state_dict.pt' ))
    #policy_model.eval()
    env = atari_wrappers.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrappers.wrap_deepmind(env,
                                       clip_rewards=True,
                                       frame_stack=True,
                                       pytorch_img=True)
    policy_model.load_model(
        torch.load('./data/dqn_Riverraid_qnetwork_target_state_dict.pickle'))
    num_episodes = 5
    episode = 1
    score = 0
    ep_score = []
    done = False
    while episode < num_episodes:
        observation = env.reset()
        done = False
        while not done:

            #action = agent.act(state)
            with torch.no_grad():
                # convert the stacked frames to a normalized float tensor
                # with a leading batch dimension
                t_observation = torch.from_numpy(np.array(observation)).float()
                t_observation /= 255
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                q_value = policy_model.forward(t_observation)
                action = q_value.argmax(1).item()
                env.render()
                time.sleep(0.0005)
                next_observation, reward, done, info = env.step(action)
                score += reward
                observation = next_observation

        if info['ale.lives'] == 0:
            episode += 1
            ep_score.append(score)
            score = 0
    print("Average Score : {}".format(int(np.mean(ep_score))))
    print(ep_score)
Example #6
    def __init__(self,
                 portfolio_size,
                 batch_size,
                 max_experiences,
                 min_experiences,
                 is_eval=False):
        self.portfolio_size = portfolio_size
        self.action_size = 3  # sit, buy, sell
        self.input_shape = (
            self.portfolio_size,
            self.portfolio_size,
        )
        self.is_eval = is_eval

        #replay buffer hyperparameters
        self.expReplayBuffer = {
            's': [],
            'a': [],
            'r': [],
            's2': [],
            'done': []
        }
        self.expReplayBufferSize = 0
        self.batch_size = batch_size  #for replay buffer
        self.max_experiences = max_experiences
        self.min_experiences = min_experiences

        #training hyperparameters
        self.alpha = 0.5
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.05  #decay rate after every iteration

        #models
        self.hidden_units = [100, 50]
        self.train_model = DQNModel(self.input_shape, self.hidden_units,
                                    self.action_size,
                                    self.portfolio_size).get_model()
        self.test_model = self.get_model()
Example #7
def run_weights():

    env = gym.make('Mario-Kart-Luigi-Raceway-v0')

    resolution = (120, 160)

    actions = [
        [-60, 0, 1, 0, 0],  # left
        [60, 0, 1, 0, 0],   # right
        [0, -80, 0, 1, 0],  # back
        [0, 0, 1, 0, 0],    # go straight
        # [0, 0, 0, 1, 0],  # brake
    ]

    # Load Model and Weights
    model = DQNModel(resolution=resolution,
                     nb_frames=test_param['nb_frames'],
                     actions=actions)

    model.load_weights(model_weights)

    agent = RLAgent(model, **test_param)

    agent.test(env)
Example #8
STATE_SHAPE = [8]
NUM_ACTIONS = 3
# A higher learning rate can be used for simple envs
LEARNING_RATE = 1e-2
fake_states = np.random.random([3] + STATE_SHAPE)
fake_target_states = np.random.random([3] + STATE_SHAPE)

fake_rewards = np.array([100, 100, 100])
fake_dones = np.array([1, 1, 1])

print('Testing action optimization process')
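# For each action, repeatedly train on identical fake transitions (reward 100,
# terminal), so the predicted Q-value for that action should move toward the reward.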
for i_action in range(NUM_ACTIONS):
    fake_actions = np.array(3 * [i_action])

    tf.reset_default_graph()
    model = DQNModel(STATE_SHAPE, NUM_ACTIONS)

    print('Optimizing for action', i_action)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        old_preds = model.predict(sess, fake_states)
        print('Old predictions:\n', old_preds)
        for _ in range(100):
            model.train(sess, LEARNING_RATE, fake_states, fake_target_states,
                        fake_actions, fake_rewards, fake_dones)
        new_preds = model.predict(sess, fake_states)
        print('New predictions:\n', new_preds)

print('Testing target update process')
tf.reset_default_graph()
Example #9
from learner import Learner
from model import DQNModel
import gym
import maze_env

env = gym.make('Maze-v0')
learner = Learner(env,model=DQNModel())
learner.run()
Example #10
import gym
import numpy as np
from model import DQNModel
from policy import EpsGreedyPolicy
from memory import Memory
from agent import DQNAgent
from processor import AtariProcessor

if __name__ == '__main__':

    ENV_NAME = 'Riverraid-v4'
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    model = DQNModel(nb_actions=nb_actions).model
    policy = EpsGreedyPolicy(eps_min=0.1,
                             eps_max=1,
                             eps_test=0.05,
                             nb_steps=1000000)
    memory = Memory(max_len=1000000)
    processor = AtariProcessor()
    dqn = DQNAgent(env,
                   model,
                   policy,
                   memory,
                   processor,
                   gamma=0.99,
                   batch_size=32,
                   target_model_update_steps=10000,
                   nb_episodes_warmup=500)
Example #11
# Init environment
env = gym.make(args.env)
if "Street" not in args.env:
    env.unwrapped.set_difficulty(status["difficulty"], weighted=False)
    env.shaped_reward = args.dense_reward
env.seed(args.seed)

# Get obs space and preprocess function
obs_space, preprocess_obss = utils.get_obss_preprocessor(
    args.env, env.observation_space, model_dir)

# Load model
try:
    policy_net = utils.load_model(model_dir)
    target_net = DQNModel(env.action_space, env=args.env)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    print("Model successfully loaded\n")
except OSError:
    policy_net = DQNModel(env.action_space, env=args.env)
    target_net = DQNModel(env.action_space, env=args.env)
    target_net.load_state_dict(policy_net.state_dict())
    print("Model successfully created\n")

if torch.cuda.is_available():
    policy_net.cuda()
    target_net.cuda()
    target_net.eval()
print("CUDA available: {}\n".format(torch.cuda.is_available()))
Example #12
class TerranBot(sc2.BotAI):

    def __init__(self, epsilon=1.0):
        self.next_actionable = 0
        self.scout_locations = {}
        self.rewards = []

        weighted_actions = {
            self.no_op: 1,
            self.standby: 1,
            self.attack: 3,
            self.manage_supply: 5,
            self.adjust_refinery_assignment: 1,
            self.manage_refineries: 1,
            self.manage_barracks: 3,
            self.manage_barracks_tech_labs: 1,
            self.manage_barracks_reactors: 1,
            self.manage_factories: 1,
            self.manage_starports: 1,
            self.train_workers: 3,
            self.train_marines: 7,
            self.train_marauders: 4,
            self.train_hellions: 1,
            self.train_medivacs: 1,
            self.upgrade_cc: 1,
            self.expand: 4,
            self.scout: 1,
            self.calldown_mules: 2,
        }

        self.actions = []
        for action_fn, weight in weighted_actions.items():
            for _ in range(weight):
                self.actions.append(action_fn)

        self.curr_state = None
        self.num_actions = len(self.actions)
        self.dqn = DQNModel(self.actions, eps=epsilon)

        self.iteration = 0

        # <list> [UnitId] specifying military composition.
        self.military_distribution = [
            MARINE,
            MARAUDER,
            HELLION
        ]

        self.tl_tags = []
        self.techlab_research_options = [
            RESEARCH_COMBATSHIELD, 
            RESEARCH_CONCUSSIVESHELLS, 
            BARRACKSTECHLABRESEARCH_STIMPACK
        ]

    async def on_step(self, iteration):
        self.seconds_elapsed = self.state.game_loop / TIME_SCALAR
        self.minutes_elapsed = self.seconds_elapsed / SECONDS_PER_MIN
        self.attack_waves = set()
        self.iteration += 1
        self.num_troops_per_wave = min(14 + self.minutes_elapsed, 30)

        if self.curr_state is not None:
            self.prev_state = self.curr_state
            self.remember()
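            # Periodically fit the DQN on a replay batch and sync the target network.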
            if self.iteration % REPLAY_BATCH_SIZE == 0:
                self.dqn.replay(REPLAY_BATCH_SIZE)
            if self.iteration % UPDATE_TARGET_FREQ == 0:
                self.dqn.train_target_model()

        await self.visualize()

        if not self.townhalls.exists:
            target = self.known_enemy_structures.random_or(self.enemy_start_locations[0]).position
            for unit in self.workers | self.military_units:
                await self.do(unit.attack(target))
            return

        ready_techlabs = self.units(BARRACKSTECHLAB).ready
        if len(ready_techlabs) != len(self.tl_tags):
            self.tl_tags = []
            for techlab in ready_techlabs:
                self.tl_tags.append(techlab.tag)

        if len(self.techlab_research_options) > 0:
            for techlab in ready_techlabs:
                try:
                    to_research = random.choice(self.techlab_research_options)
                    if self.can_afford(to_research):
                        await self.do(techlab(to_research))
                        self.techlab_research_options = [
                            x for x in self.techlab_research_options
                            if x != to_research
                        ]
                except Exception as err:
                    pass

        for cc in self.townhalls:
            enemies = self.known_enemy_units.closer_than(25.0, cc).filter(
                lambda x: x.name.lower() not in ["scv", "drone", "probe"])
            if len(enemies) > 0:
                target = random.choice(enemies)
                for unit in self.military_units:
                    await self.do(unit.attack(target))
                break

        self.action = self.make_action_selection()

        # print(f"action chosen == {self.action}")
        self.prepare_attack()
        if len(list(self.attack_waves)) > 0 and self.units(MEDIVAC).idle.amount > 0:
            alive_units = list(self.attack_waves)[0].select_units(self.units)
            for med in self.units(MEDIVAC).idle:
                await self.do(med.attack(alive_units.first.position))

        await self.distribute_workers()
        await self.lower_depots()
        await self.take_action()

    async def no_op(self):
        pass

    async def standby(self):
        self.next_actionable = self.seconds_elapsed + random.randrange(1, 37)

    async def take_action(self):
        if self.seconds_elapsed <= self.next_actionable:
            return

        try:
            await self.actions[self.action]()
        except Exception as err:
            print(str(err))

    def make_action_selection(self):
        if self.seconds_elapsed <= self.next_actionable or self.curr_state is None:
            return 0

        return self.dqn.choose_action(self.curr_state)

    def remember(self, reward=None, done=False):
        reward_value = reward if reward is not None else (self.state.score.score / (200 * self.seconds_elapsed))
        self.rewards.append(reward_value)
        self.dqn.remember(self.prev_state, self.action, reward_value, self.curr_state, done)

    #### WORKERS ####
    #################

    async def train_workers(self):
        if not self.can_afford(SCV):
            return

        for cc in self.townhalls.ready.filter(lambda x: len(x.orders) < 3):
            if len(self.workers) < 18 * len(self.townhalls):
                await self.do(cc.train(SCV))

    async def manage_supply(self):
        if self.can_afford(SUPPLYDEPOT) \
        and self.supply_left < 10 and self.already_pending(SUPPLYDEPOT) < 2:
            position = self.townhalls.ready.random.position.towards(
                self.game_info.map_center, 5)
            await self.build(SUPPLYDEPOT, position)

    async def lower_depots(self):
        for sd in self.units(SUPPLYDEPOT).ready:
            await self.do(sd(MORPH_SUPPLYDEPOT_LOWER))

    async def upgrade_cc(self):
        for cc in self.units(COMMANDCENTER).idle:
            if self.barracks.ready.exists and self.can_afford(ORBITALCOMMAND):
                await self.do(cc(UPGRADETOORBITAL_ORBITALCOMMAND))

    async def calldown_mules(self):
        for oc in self.units(ORBITALCOMMAND).filter(lambda x: x.energy >= 50):
            mfs = self.state.mineral_field.closer_than(10, oc)
            if mfs:
                mf = max(mfs, key=lambda x: x.mineral_contents)
                await self.do(oc(CALLDOWNMULE_CALLDOWNMULE, mf))

    async def expand(self):
        try:
            if self.can_afford(COMMANDCENTER):
                await self.expand_now(max_distance=100)
        except Exception as err:
            print(str(err))

    async def manage_refineries(self):
        for cc in self.units(COMMANDCENTER).ready:
            vgs = self.state.vespene_geyser.closer_than(16.0, cc)
            for vg in vgs:
                if not self.can_afford(REFINERY):
                    break
                worker = self.select_build_worker(vg.position)
                if worker is None:
                    break
                if not self.units(REFINERY).closer_than(2.0, vg).exists:
                    await self.do(worker.build(REFINERY, vg))

    async def adjust_refinery_assignment(self):
        r = self.units(REFINERY).ready.random
        if r.assigned_harvesters < r.ideal_harvesters:
            w = self.workers.closer_than(16.0, r)
            if w.exists:
                await self.do(w.random.gather(r))

    #### MILITARY ####
    ##################

    async def attack(self):
        """
        Sends any attack group out to target. No micro is done on the army 
        dispatch.
        """

        if len(self.known_enemy_structures) > 0:
            target = random.choice(self.known_enemy_structures).position
        elif len(self.known_enemy_units) > 0:
            target = self.known_enemy_units.closest_to(random.choice(self.townhalls)).position
        else:
            target = self.enemy_start_locations[0].position

        for wave in list(self.attack_waves):
            alive_units = wave.select_units(self.units)
            if alive_units.exists and alive_units.idle.exists:
                for unit in wave.select_units(self.units):
                    await self.do(unit.attack(target))
            else:
                self.attack_waves.remove(wave)

    async def manage_barracks(self): 
        if not self.depots.ready.exists:
            return

        if self.can_afford(BARRACKS) and self.barracks.amount < 1 + self.minutes_elapsed:
            depot = self.depots.ready.random
            await self.build(BARRACKS, near=depot)

    async def manage_barracks_tech_labs(self):
        rax = self.barracks.ready.noqueue.random
        if rax.add_on_tag == 0:
            await self.do(rax.build(BARRACKSTECHLAB))

    async def manage_barracks_reactors(self):
        rax = self.barracks.ready.noqueue.random
        if rax.add_on_tag == 0:
            await self.do(rax.build(BARRACKSREACTOR))

    async def manage_factories(self): 
        if not self.depots.ready.exists:
            return
        if not self.barracks.ready.exists:
            return

        if self.can_afford(FACTORY) and self.units(FACTORY).amount < 3:
            depot = self.depots.ready.random
            await self.build(FACTORY, near=depot)

    async def manage_starports(self): 
        if not self.depots.ready.exists:
            return
        if not self.barracks.ready.exists:
            return
        if not self.units(FACTORY).ready.exists:
            return

        if self.can_afford(STARPORT) and self.units(STARPORT).amount < 2:
            depot = self.depots.ready.random
            await self.build(STARPORT, near=depot)

    async def train_marines(self):
        for rax in self.barracks.ready.filter(lambda x: x.add_on_tag not in self.tl_tags and len(x.orders) < 3):
            if not self.can_afford(MARINE):
                break
            await self.do(rax.train(MARINE))

    async def train_marauders(self):
        for rax in self.barracks.ready.filter(lambda x: x.add_on_tag in self.tl_tags and len(x.orders) < 3):
            if not self.can_afford(MARAUDER):
                break
            await self.do(rax.train(MARAUDER))

    async def train_hellions(self):
        for f in self.units(FACTORY).ready.filter(lambda x: len(x.orders) < 3):
            if not self.can_afford(HELLION):
                break
            await self.do(f.train(HELLION))

    async def train_medivacs(self):
        for sp in self.units(STARPORT).ready.filter(lambda x: len(x.orders) < 3):
            if not self.can_afford(MEDIVAC):
                break
            await self.do(sp.train(MEDIVAC))

    def prepare_attack(self):
        """
        Prepares an attack wave when ready.
        """
        total = 0
        for unit in self.military_distribution:
            units = self.units(unit)
            total += units.idle.amount

        if total >= self.num_troops_per_wave:
            attack_wave = None

            for unit in self.military_distribution:
                units = self.units(unit)

                if attack_wave is None:
                    attack_wave = ControlGroup(units.idle)
                else:
                    attack_wave.add_units(units.idle)

            self.attack_waves.add(attack_wave)

    #### VISUALIZATION ####
    #######################

    async def visualize(self):
        game_map = np.zeros((self.game_info.map_size[1], self.game_info.map_size[0], 3), np.uint8)
        await self.visualize_map(game_map)
        await self.visualize_resources(game_map)

        # cv assumes (0, 0) top-left => need to flip along horizontal axis
        curr_state = cv.flip(game_map, 0)

        if VISUALIZE:
            cv.imshow('Map', cv.resize(curr_state, dsize=None, fx=2, fy=2))
            cv.waitKey(1)
        self.curr_state = curr_state.reshape([-1, 184, 152, 3])

    async def visualize_map(self, game_map):
        # game coordinates need to be represented as (y, x) in 2d arrays

        for unit in self.units().ready:
            posn = unit.position
            cv.circle(game_map, (int(posn[0]), int(posn[1])), int(unit.radius*8), (0, 0, 255), math.ceil(int(unit.radius*0.5)))

        for unit in self.known_enemy_units:
            posn = unit.position
            cv.circle(game_map, (int(posn[0]), int(posn[1])), int(unit.radius*8), (255, 0, 0), math.ceil(int(unit.radius*0.5)))

    async def visualize_resources(self, game_map):
        line_scalar = 40
        minerals = min(1.0, self.minerals / 1200)
        vespene = min(1.0, self.vespene / 1200)
        pop_space = min(1.0, self.supply_left / max(1.0, self.supply_cap))
        supply_usage = self.supply_cap / 200
        military = (self.supply_cap - self.supply_left - self.workers.amount) \
        / max(1, self.supply_cap - self.supply_left)


        cv.line(game_map, (0, 16), (int(line_scalar*minerals), 16), (255, 40, 37), 2)  
        cv.line(game_map, (0, 12), (int(line_scalar*vespene), 12), (25, 240, 20), 2)
        cv.line(game_map, (0, 8),  (int(line_scalar*pop_space), 8), (150, 150, 150), 2)
        cv.line(game_map, (0, 4),  (int(line_scalar*supply_usage), 4), (64, 64, 64), 2)
        cv.line(game_map, (0, 0),  (int(line_scalar*military), 0), (0, 0, 255), 2)

    #### SCOUTING ####
    ##################

    async def scout(self):
        expand_distances = {}

        for el in self.expansion_locations:
            distance_to_enemy_start = el.distance_to(self.enemy_start_locations[0])
            expand_distances[distance_to_enemy_start] = el

        distance_keys = sorted(k for k in expand_distances)
        unit_tags = [unit.tag for unit in self.units]

        to_be_removed = []
        for s in self.scout_locations:
            if s not in unit_tags:
                to_be_removed.append(s)

        for scout in to_be_removed:
            del self.scout_locations[scout]

        assign_scout = True

        for unit in self.workers:
            if unit.tag in self.scout_locations:
                assign_scout = False

        if assign_scout:
            workers = self.workers.idle if len(self.workers.idle) > 0 else self.workers.gathering
            for worker in workers[:1]:
                if worker.tag not in self.scout_locations:
                    for dist in distance_keys:
                        try:
                            location = next(v for k, v in expand_distances.items() if k == dist)
                            active_locations = [self.scout_locations[k] for k in self.scout_locations]

                            if location not in active_locations:
                                await self.do(worker.move(location))
                                self.scout_locations[worker.tag] = location
                                break
                        except Exception as e:
                            pass

        for worker in self.workers:
            if worker.tag in self.scout_locations:
                await self.do(worker.move(self.vary_loc(self.scout_locations[worker.tag])))

    def vary_loc(self, location):
        x = location[0] + random.randrange(-10, 10)
        y = location[1] + random.randrange(-10, 10)

        x = min(self.game_info.map_size[0], max(x, 0))
        y = min(self.game_info.map_size[1], max(y, 0))

        return position.Point2(position.Pointlike((x,y)))

    #### HELPERS ####
    #################

    @property
    def depots(self):
        return self.units.of_type([
            SUPPLYDEPOT, 
            SUPPLYDEPOTLOWERED, 
            SUPPLYDEPOTDROP
            ])

    @property
    def barracks(self):
        return self.units(BARRACKS)

    @property
    def military_units(self):
        return self.marines | self.marauders | self.medivacs | self.hellions
    
    @property
    def marines(self):
        return self.units(MARINE)

    @property
    def marauders(self):
        return self.units(MARAUDER)

    @property
    def medivacs(self):
        return self.units(MEDIVAC)

    @property
    def hellions(self):
        return self.units(HELLION)
Example #13
class Agent:
    def __init__(self,
                 portfolio_size,
                 batch_size,
                 max_experiences,
                 min_experiences,
                 is_eval=False):
        self.portfolio_size = portfolio_size
        self.action_size = 3  # sit, buy, sell
        self.input_shape = (
            self.portfolio_size,
            self.portfolio_size,
        )
        self.is_eval = is_eval

        #replay buffer hyperparameters
        self.expReplayBuffer = {
            's': [],
            'a': [],
            'r': [],
            's2': [],
            'done': []
        }
        self.expReplayBufferSize = 0
        self.batch_size = batch_size  #for replay buffer
        self.max_experiences = max_experiences
        self.min_experiences = min_experiences

        #training hyperparameters
        self.alpha = 0.5
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.05  #decay rate after every iteration

        #models
        self.hidden_units = [100, 50]
        self.train_model = DQNModel(self.input_shape, self.hidden_units,
                                    self.action_size,
                                    self.portfolio_size).get_model()
        self.test_model = self.get_model()

    def get_model(self):
        """
            Load the saved model
        """
        json_file = open("models/model.json", 'r')
        loaded_json_file = json_file.read()
        json_file.close()
        loaded_model = model_from_json(loaded_json_file)
        loaded_model.load_weights("models/model.h5")
        return loaded_model

    def predictions_to_weights(self, pred):
        """
            Helper function - Convert the model predictions to the form of weights associated with the portfolio stocks
        """
        weights = np.zeros(len(pred))
        raw_weights = np.argmax(pred, axis=-1)

        for stock, action in enumerate(raw_weights):
            if action == 0:
                weights[stock] = 0
            elif action == 1:
                weights[stock] = np.abs(
                    pred[stock][0][action])  # because pred is an array of arrays
            else:
                weights[stock] = -np.abs(
                    pred[stock][0][action])  # because pred is an array of arrays
        return weights

    def policy(self, state):
        if self.is_eval:
            # when evaluating, use the model's predictions directly, regardless of
            # epsilon; np.expand_dims adds the batch dimension expected by predict()
            pred = self.test_model.predict(np.expand_dims(state.values, 0))
        else:
            if random.random() <= self.epsilon:
                # during training, explore with probability epsilon:
                # sample random weights and normalize them to sum to 1
                weights = np.random.normal(0, 1, size=(self.portfolio_size, ))
                saved_sum = np.sum(weights)
                weights = weights / saved_sum
                return weights
            else:
                pred = self.train_model.predict(np.expand_dims(
                    state.values, 0))
        return self.predictions_to_weights(pred)

    def weights_to_predictions(self, action_weights, rewards, Q_star):
        Q = np.zeros((self.portfolio_size, self.action_size))
        for i in range(self.portfolio_size):
            if action_weights[i] == 0:
                Q[i][0] = rewards[i] + self.gamma * np.max(Q_star[i][0])
            elif action_weights[i] > 0:
                Q[i][1] = rewards[i] + self.gamma * np.max(Q_star[i][1])
            else:
                Q[i][2] = rewards[i] + self.gamma * np.max(Q_star[i][2])
        return Q

    def train(self, TargetNet):
        # print("Training in progress")
        ids = np.random.randint(
            low=0, high=len(self.expReplayBuffer['s']),
            size=self.batch_size)  #get batchsize exp data for training
        #store the experience data in vars for easy access
        # states = np.asarray([self.expReplayBuffer['s'][i] for i in ids])
        # actions = np.asarray([self.expReplayBuffer['a'][i] for i in ids])
        # rewards = np.asarray([self.expReplayBuffer['r'][i] for i in ids])
        # states_next = np.asarray([self.expReplayBuffer['s2'][i] for i in ids])
        # dones = np.asarray([self.expReplayBuffer['done'][i] for i in ids])

        for i in range(len(self.expReplayBuffer['s'])):
            state = self.expReplayBuffer['s'][i]
            action = self.expReplayBuffer['a'][i]
            reward = self.expReplayBuffer['r'][i]
            state_next = self.expReplayBuffer['s2'][i]
            done = self.expReplayBuffer['done'][i]
            #predict the q values for the states_next using TargetNet as the variables of that net would be more stable
            # print("Shape: " + str(state_next.shape))
            values_next = np.max(TargetNet.predict(
                np.expand_dims(state_next, axis=0)),
                                 axis=1)
            # print("Action vals")
            # print(action)
            # actual_values = np.where(dones, rewards, rewards+self.gamma*values_next)
            Q_learned_values = self.weights_to_predictions(
                action, reward, values_next)
            Q_val = TargetNet.predict(np.expand_dims(state, axis=0))
            # soft Q-learning update: Q <- (1 - alpha) * Q_old + alpha * Q_learned
            Q_val = [
                np.add(a * (1 - self.alpha), q * self.alpha)
                for a, q in zip(Q_val, Q_learned_values)
            ]

            #train the main model
            self.train_model.fit(np.expand_dims(state, 0),
                                 Q_val,
                                 epochs=1,
                                 verbose=0)
            #decrease the exploration rate after every iteration

    def add_experience(self, experience):
        """
            add experience to the expReplayBuffer
        """
        # print("Length: " + str(self.expReplayBufferSize))
        if self.expReplayBufferSize >= self.max_experiences:
            for key in self.expReplayBuffer.keys():
                # remove the oldest experience (FIFO) to make room for the new one
                self.expReplayBuffer[key].pop(0)
        for key, value in experience.items():
            self.expReplayBuffer[key].append(value)  #add the new experience
Example #14
class DQNAgent:
    def __init__(self, env, action_size, config):
        self.memory = RingBuffer(int(
            config.config_section_map()['memorysize']))
        self.gamma = float(
            config.config_section_map()['gamma'])  # discount rate
        self.epsilon = float(
            config.config_section_map()['epsilon'])  # exploration rate
        self.epsilon_min = float(config.config_section_map()['epsilonmin'])
        self.epsilon_decay = float(config.config_section_map()['epsilondecay'])
        self.learning_rate = float(config.config_section_map()['learningrate'])
        self.action_size = action_size
        self.env = env
        self.dqn_model = DQNModel(self.learning_rate, action_size)

    def remember(self, state, action, reward, next_state, done):
        state = state.astype('uint8')
        next_state = next_state.astype('uint8')

        reward = np.sign(reward)

        self.memory.append((state, action, reward, next_state, done))

    def action(self, fi_t, env_sample, csv_handler):

        num_random = random.uniform(0, 1)

        if num_random <= self.epsilon:  # with probability epsilon do a random action
            return env_sample
        else:
            fi_t = np.expand_dims(fi_t, axis=0)
            action = self.dqn_model.model.predict(
                [fi_t, np.ones([1, self.action_size])])
            csv_handler.write_q_values(action)
            return np.argmax(action[0])

    def replay(self, batch_size, csv_logger):

        states = np.zeros((batch_size, 4, 84, 84), dtype='float32')
        actions = np.zeros((batch_size, 4), dtype='uint8')
        rewards = np.zeros(batch_size, dtype='float32')
        next_states = np.zeros((batch_size, 4, 84, 84), dtype='float32')
        dones = np.ones((batch_size, 4), dtype=bool)

        mini_batch = self.get_minibatch(
            batch_size)  # sample random mini_batch from D

        i = 0

        for state, action, reward, next_state, done in mini_batch:

            next_state = next_state.astype('float32')
            state = state.astype('float32')

            states[i] = state
            actions[i][action] = 1
            rewards[i] = reward
            next_states[i] = next_state
            dones[i] = [done, done, done, done]

            i += 1

        next_state_q_values = self.dqn_model.target_model.predict(
            [next_states, np.ones(actions.shape)])

        next_state_q_values[dones] = 0

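        # Bellman target: r + gamma * max_a' Q_target(s', a'); the dones mask
        # above already zeroed the bootstrap term for terminal transitions.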
        q_values = rewards + self.gamma * np.max(next_state_q_values, axis=1)

        # The one-hot action mask (passed as input and applied to the target)
        # ensures only the Q-value of the action actually taken is updated.
        self.dqn_model.model.fit([states, actions],
                                 actions * q_values[:, None],
                                 batch_size=batch_size,
                                 verbose=0,
                                 callbacks=[csv_logger])

    def get_minibatch(self, batch_size):
        mini_batch = []
        for i in range(batch_size):
            index = randint(0, len(self.memory) - 1)
            mini_batch.append(self.memory[index])
        return mini_batch

    def load(self, name):
        self.dqn_model.model.load_weights(name)
        self.dqn_model.update_target_model()

    def save(self, name):
        self.dqn_model.model.save_weights(name)

    def decrease_epsilone(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
Example #15
    deer_handle: int
    tiger_handle: int
    deer_handle, tiger_handle = gridworld.get_handles()

    def reset_environment():
        gridworld.reset()
        gridworld.add_walls(method="random",
                            n=MAP_SIZE * MAP_SIZE * WALLS_DENSITY)
        gridworld.add_agents(deer_handle, method="random", n=COUNT_DEERS)
        gridworld.add_agents(tiger_handle, method="random", n=COUNT_TIGERS)

    environment: MAgentEnv = MAgentEnv(
        gridworld, tiger_handle, reset_environment_funcion=reset_environment)

    dqn_model: DQNModel = DQNModel(
        environment.single_observation_space.spaces[0].shape,
        environment.single_observation_space.spaces[1].shape,
        gridworld.get_action_space(tiger_handle)[0]).to(device)

    target_net: TargetNet = ptan.agent.TargetNet(dqn_model)
    print(dqn_model)

    action_selector: EpsilonGreedyActionSelector = EpsilonGreedyActionSelector(
        epsilon=PARAMETERS.epsilon_start)
    epsilon_tracker: EpsilonTracker = EpsilonTracker(action_selector,
                                                     PARAMETERS)

    pre_processor: MAgentPreprocessor = MAgentPreprocessor(device)
    dqn_agent: ptan.agent.DQNAgent = ptan.agent.DQNAgent(
        dqn_model, action_selector, device, preprocessor=pre_processor)
    experience_source: ptan.experience.ExperienceSourceFirstLast = ptan.experience.ExperienceSourceFirstLast(
        environment, dqn_agent, PARAMETERS.gamma, vectorized=True)
Example #16
def main():
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("use_cuda: ", use_cuda)
    print("Device: ", device)

    env = atari_wrapper.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrapper.wrap_deepmind(env,
                                      clip_rewards=False,
                                      frame_stack=True,
                                      pytorch_img=True)

    action_space = [a for a in range(env.action_space.n)]
    n_action = len(action_space)

    # DQN Model and optimizer:
    policy_model = DQNModel().to(device)
    target_model = DQNModel().to(device)
    target_model.load_state_dict(policy_model.state_dict())

    optimizer = torch.optim.RMSprop(policy_model.parameters(),
                                    lr=lr,
                                    alpha=alpha)

    # Initialize the Replay Buffer
    replay_buffer = ReplayBuffer(rep_buf_size)

    while len(replay_buffer) < rep_buf_ini:

        observation = env.reset()
        done = False

        while not done:
            with torch.no_grad():
                t_observation = torch.from_numpy(observation).float().to(
                    device)
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                action = random.sample(range(len(action_space)), 1)[0]

            next_observation, reward, done, info = env.step(
                action_space[action])

            replay_buffer.push(observation, action, reward, next_observation,
                               done)
            observation = next_observation

    print('Experience Replay buffer initialized')

    # Use log to record the performance
    logger = logging.getLogger('dqn_Riverraid')
    logger.setLevel(logging.INFO)
    logger_handler = logging.FileHandler('./dqn_Riverraid.log')
    logger.addHandler(logger_handler)

    # Training part
    env.reset()
    score = 0
    episode_score = []
    mean_episode_score = []
    episode_true = 0
    num_frames = 0
    episode = 0
    last_100episode_score = deque(maxlen=100)

    while episode < max_episodes:

        observation = env.reset()
        done = False
        # import time
        # start=time.time()

        while not done:

            with torch.no_grad():

                t_observation = torch.from_numpy(observation).float().to(
                    device) / 255
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
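                # epsilon-greedy: exploit the policy network with probability
                # 1 - epsilon, otherwise act randomly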
                epsilon = epsilon_by_frame(num_frames)
                if random.random() > epsilon:
                    q_value = policy_model(t_observation)
                    action = q_value.argmax(1).data.cpu().numpy().astype(
                        int)[0]
                else:
                    action = random.sample(range(len(action_space)), 1)[0]

            next_observation, reward, done, info = env.step(
                action_space[action])
            num_frames += 1
            score += reward

            replay_buffer.push(observation, action, reward, next_observation,
                               done)
            observation = next_observation

            # Update policy
            if len(replay_buffer) > batch_size and num_frames % skip_frame == 0:
                observations, actions, rewards, next_observations, dones = replay_buffer.sample(
                    batch_size)

                observations = torch.from_numpy(np.array(observations) /
                                                255).float().to(device)

                actions = torch.from_numpy(
                    np.array(actions).astype(int)).float().to(device)
                actions = actions.view(actions.shape[0], 1)

                rewards = torch.from_numpy(
                    np.array(rewards)).float().to(device)
                rewards = rewards.view(rewards.shape[0], 1)

                next_observations = torch.from_numpy(
                    np.array(next_observations) / 255).float().to(device)

                dones = torch.from_numpy(
                    np.array(dones).astype(int)).float().to(device)
                dones = dones.view(dones.shape[0], 1)

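                # DQN update: regress Q(s, a) toward
                # r + gamma * max_a' Q_target(s', a') * (1 - done)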
                q_values = policy_model(observations)
                next_q_values = target_model(next_observations)

                q_value = q_values.gather(1, actions.long())
                next_q_value = next_q_values.max(1)[0].unsqueeze(1)
                expected_q_value = rewards + gamma * next_q_value * (1 - dones)

                loss = huber_loss(q_value, expected_q_value)

                optimizer.zero_grad()
                loss.backward()

                optimizer.step()

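                # soft (Polyak) target update:
                # theta_target <- TAU * theta_policy + (1 - TAU) * theta_target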
                for target_param, policy_param in zip(
                        target_model.parameters(), policy_model.parameters()):
                    target_param.data.copy_(TAU * policy_param.data +
                                            (1 - TAU) * target_param.data)

        episode += 1
        # episode_score.append(score)
        # end=time.time()
        # print("Running time ( %i episode): %.3f Seconds "%(episode ,end-start))

        if info['ale.lives'] == 0:
            # episode_score.append(score)
            mean_score = score
            episode_true += 1
            score = 0

            # if episode % 20 == 0:
            # mean_score = np.mean(episode_score)
            mean_episode_score.append(mean_score)
            last_100episode_score.append(mean_score)
            # episode_score = []
            logger.info('Frame: ' + str(num_frames) + ' / Episode: ' +
                        str(episode_true) + ' / Average Score : ' +
                        str(int(mean_score)) + '   / epsilon: ' +
                        str(float(epsilon)))
            #plot_score(mean_episode_score, episode_true)
            pickle.dump(mean_episode_score,
                        open('./dqn_Riverraid_mean_scores.pickle', 'wb'))
            if episode_true % 50 == 1:
                logger.info('Frame: ' + str(num_frames) + ' / Episode: ' +
                            str(episode_true) + ' / Average Score : ' +
                            str(int(mean_score)) + '   / epsilon: ' +
                            str(float(epsilon)) +
                            '   / last_100episode_score: ' +
                            str(float(np.mean(last_100episode_score))))

        if episode % 50 == 0:
            torch.save(target_model.state_dict(),
                       './dqn_spaceinvaders_target_model_state_dict.pt')
            torch.save(policy_model.state_dict(),
                       './dqn_spaceinvaders_model_state_dict.pt')

    pass
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, in_channels, action_size, seed):
        """Initialize an Agent object.
        """
        self.in_channels = in_channels
        self.action_size = action_size
        #self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = DQNModel(in_channels, action_size)
        self.qnetwork_target = DQNModel(in_channels, action_size)
    
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        
        self.loss_list = []
    
    def step(self, observation, action, reward, next_observation, done,num_frames):
        # Save experience in replay memory
        self.memory.add(observation, action, reward, next_observation, done)
        self.t_step = num_frames
        # Learn every UPDATE_EVERY time steps.
        if self.t_step % skip_frame == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                #experiences = self.memory.sample()
                self.learn()

    def act(self, observation, eps=0.):
        #Returns actions for given observation as per current policy.
        # scale grayscale pixel values to [0, 1]
        t_observation = torch.from_numpy(observation).double() / 255
        t_observation = t_observation.unsqueeze(0).to(device)
        

        # Epsilon-greedy action selection
        if random.random() > eps:
            action_values = self.qnetwork_local.forward(t_observation)
            action = action_values.argmax(1).data.cpu().numpy().astype(int)[0]
            # argmax over dim 1 of the (1, action_size) output gives the greedy action
        else:
            action = random.sample(range(self.action_size), 1)[0]
        return action

    def learn(self):
        
        observations, actions, rewards, next_observations, dones =  self.memory.sample()
        
        observations = torch.from_numpy(np.array(observations) / 255).double().to(device)
            
        actions = torch.from_numpy(np.array(actions).astype(int)).int().to(device)
        actions = actions.view(actions.shape[0], 1)
            
        rewards = torch.from_numpy(np.array(rewards)).double().to(device)
        rewards = rewards.view(rewards.shape[0], 1)
            
        next_observations = torch.from_numpy(np.array(next_observations) / 255).double().to(device)
        
        dones = torch.from_numpy(np.array(dones).astype(int)).int().to(device)
        dones = dones.view(dones.shape[0], 1)
        
        Q_target_next = self.qnetwork_target.forward(next_observations).max(1)[0].unsqueeze(1)
        Q_target = rewards + gamma * Q_target_next * (1 - dones)  # if done, the bootstrap term is dropped
        # compute the Q_local 
        Q_local = self.qnetwork_local.forward(observations).gather(1, actions.long())
        loss = self.huber_loss(Q_local, Q_target)
        self.qnetwork_local.backward(Q_target,Q_local, "huber",actions)
        self.loss_list.append(loss.cpu().numpy())
        self.qnetwork_local.step()
        #  update target network #
        if self.t_step % UPDATE_FREQUENCY == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = tau*θ_local + (1 - tau)*θ_target
        """
        self.qnetwork_target.soft_update(local_model, tau)
     
    
    
    def huber_loss(self, input, target, beta=1, size_average=True):
        """
        Huber loss: quadratic for small errors and linear for large ones,
        making the update more robust to outliers than squared error.
        """
        n = torch.abs(input - target)
        cond = n < beta
        loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta)
        if size_average:
            return loss.mean()
        return loss.sum()
Example #18
    second_tiger_handle: int

    deer_handle, first_tiger_handle, second_tiger_handle = environment.get_handles()

    environment.reset()
    environment.add_walls(method="random",
                          n=map_size * map_size * wall_density)
    environment.add_agents(deer_handle, method="random", n=deers)
    environment.add_agents(first_tiger_handle, method="random", n=tigers)
    environment.add_agents(second_tiger_handle, method="random", n=tigers)

    view_space: Tuple = environment.get_view_space(first_tiger_handle)
    view_space = (view_space[-1], ) + view_space[:2]
    dqn_model: DQNModel = DQNModel(
        view_space, environment.get_feature_space(first_tiger_handle),
        environment.get_action_space(first_tiger_handle)[0])
    dqn_model.load_state_dict(torch.load(model, map_location=map_location))
    print(dqn_model)

    reward_tiger_1: float = 0.0
    reward_tiger_2: float = 0.0

    survivors: int
    while True:
        first_tiger_actions: ndarray = get_actions(environment, dqn_model,
                                                   first_tiger_handle)
        second_tiger_actions: ndarray = get_actions(environment, dqn_model,
                                                    second_tiger_handle)

        environment.set_action(first_tiger_handle, first_tiger_actions)
Example #19
# Initialize the environment
env = gym.make('Mario-Kart-Luigi-Raceway-v0')

resolution = (120, 160)

actions = [
    [-60, 0, 1, 0, 0],  # left
    [60, 0, 1, 0, 0],   # right
    [0, -80, 0, 1, 0],  # back
    [0, 0, 1, 0, 0],    # go straight
    # [0, 0, 0, 1, 0],  # brake
]

# Initialize the model
model = DQNModel(resolution=resolution,
                 nb_frames=learn_param['nb_frames'],
                 actions=actions)

# print("number of actions: ", len(doom.actions))   # 16

if model_weights:
    model.load_weights(model_weights)
else:
    print("Please provide a model_weights file")

agent = RLAgent(model, **learn_param)

# pick a step number at random to capture a random screenshot
agent.visualize(env)
Example #20
# Load training status

try:
    status = utils.load_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}

# Define the model (DQN or actor-critic, depending on args.algo)

try:
    base_model = utils.load_model(model_dir)
    logger.info("Model successfully loaded\n")
except OSError:
    if args.algo == "dqn":
        base_model = DQNModel(obs_space, envs[0].action_space, args.mem,
                              args.text)
    else:
        base_model = ACModel(obs_space, envs[0].action_space, args.mem,
                             args.text)
    logger.info("Model successfully created\n")
logger.info("{}\n".format(base_model))

if torch.cuda.is_available():
    base_model.cuda()
logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

# Train model

num_frames = status["num_frames"]
total_start_time = time.time()
update = status["update"]