Example #1
def train_model():

    # Initialize the environment
    env = gym.make('Mario-Kart-Luigi-Raceway-v0')

    resolution = (120, 160)

    actions = [
        [-60, 0, 1, 0, 0],   # left
        [60, 0, 1, 0, 0],    # right
        [0, -80, 0, 1, 0],   # back
        [0, 0, 1, 0, 0],     # go straight
        # [0, 0, 0, 1, 0],   # brake
    ]

    # Initialize the model
    model = DQNModel(resolution=resolution,
                     nb_frames=learn_param['nb_frames'],
                     actions=actions)

    # print("number of actions: ", len(doom.actions))   # 16

    if model_weights:
        model.load_weights(model_weights)

    agent = RLAgent(model, **learn_param)

    # Perform reinforcement learning on the scenario
    agent.train(env)
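
train_model() relies on module-level learn_param and model_weights objects that are not part of the snippet. A minimal sketch of what they might look like, purely as an assumption; the real keys and values come from the original project's configuration:

# Hypothetical configuration assumed by train_model(); names and values
# below are illustrative, not taken from the original project.
learn_param = {
    'nb_frames': 4,         # number of stacked frames fed to the network
    'batch_size': 32,
    'gamma': 0.99,          # discount factor
    'epsilon': [1.0, 0.1],  # exploration schedule (start, end)
}

# Path to previously saved weights, or None to train from scratch.
model_weights = None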
Example #2

    def __init__(self, in_channels, action_size, seed):
        """Initialize an Agent object.
        """
        self.in_channels = in_channels
        self.action_size = action_size
        #self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = DQNModel(in_channels, action_size)
        self.qnetwork_target = DQNModel(in_channels, action_size)
    
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        
        self.loss_list = []
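
The constructor refers to ReplayBuffer, BUFFER_SIZE, BATCH_SIZE and UPDATE_EVERY, which are defined elsewhere. A minimal sketch of such a uniform replay buffer, assuming the (action_size, buffer_size, batch_size, seed) signature used above; the constants are placeholders, not the original values:

import random
from collections import deque, namedtuple

# Illustrative constants; the original values are not shown in the snippet.
BUFFER_SIZE = 100_000
BATCH_SIZE = 64
UPDATE_EVERY = 4

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)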
Example #3
    def __init__(self, epsilon=1.0):
        self.next_actionable = 0
        self.scout_locations = {}
        self.rewards = []

        weighted_actions = {
            self.no_op: 1,
            self.standby: 1,
            self.attack: 3,
            self.manage_supply: 5,
            self.adjust_refinery_assignment: 1,
            self.manage_refineries: 1,
            self.manage_barracks: 3,
            self.manage_barracks_tech_labs: 1,
            self.manage_barracks_reactors: 1,
            self.manage_factories: 1,
            self.manage_starports: 1,
            self.train_workers: 3,
            self.train_marines: 7,
            self.train_marauders: 4,
            self.train_hellions: 1,
            self.train_medivacs: 1,
            self.upgrade_cc: 1,
            self.expand: 4,
            self.scout: 1,
            self.calldown_mules: 2,
        }

        self.actions = []
        for action_fn, weight in weighted_actions.items():
            for _ in range(weight):
                self.actions.append(action_fn)

        self.curr_state = None
        self.num_actions = len(self.actions)
        self.dqn = DQNModel(self.actions, eps=epsilon)

        self.iteration = 0

        # <list> [UnitId] specifying military composition.
        self.military_distribution = [
            MARINE,
            MARAUDER,
            HELLION
        ]

        self.tl_tags = []
        self.techlab_research_options = [
            RESEARCH_COMBATSHIELD, 
            RESEARCH_CONCUSSIVESHELLS, 
            BARRACKSTECHLABRESEARCH_STIMPACK
        ]
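
Expanding weighted_actions into a flat list makes a uniform random draw land on each action in proportion to its weight. A small self-contained illustration of the same idea; the string keys here are placeholders for the bound methods above:

import random

weighted_actions = {"no_op": 1, "attack": 3, "train_marines": 7}

# Repetition trick used above: a uniform choice over the expanded list
# picks "train_marines" about 7 times out of 11.
expanded = [action for action, weight in weighted_actions.items()
            for _ in range(weight)]
sample_a = random.choice(expanded)

# Equivalent direct draw with explicit weights.
sample_b = random.choices(list(weighted_actions.keys()),
                          weights=list(weighted_actions.values()), k=1)[0]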
Example #4
    def __init__(self, env, action_size, config):
        self.memory = RingBuffer(int(
            config.config_section_map()['memorysize']))
        self.gamma = float(
            config.config_section_map()['gamma'])  # discount rate
        self.epsilon = float(
            config.config_section_map()['epsilon'])  # exploration rate
        self.epsilon_min = float(config.config_section_map()['epsilonmin'])
        self.epsilon_decay = float(config.config_section_map()['epsilondecay'])
        self.learning_rate = float(config.config_section_map()['learningrate'])
        self.action_size = action_size
        self.env = env
        self.dqn_model = DQNModel(self.learning_rate, action_size)
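
config.config_section_map() is not shown in the snippet; a minimal configparser-based sketch of what such a helper might wrap. The class name, INI path and section name are assumptions:

from configparser import ConfigParser

# Hypothetical stand-in for the config object used above; the real class,
# INI path and section name are not part of the snippet.
class Config:
    def __init__(self, path="agent.ini", section="hyperparameters"):
        self.parser = ConfigParser()
        self.parser.read(path)
        self.section = section

    def config_section_map(self):
        # Return the section as a flat dict of strings,
        # e.g. {'memorysize': '100000', 'gamma': '0.95', ...}
        return dict(self.parser.items(self.section))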
Example #5
def test_result():
    #############
    #   test    #
    #############
    #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    policy_model = DQNModel(4, 18)
    #policy_model.load_state_dict(torch.load('./data/dqn_Riverraid_qnetwork_target_state_dict.pt' ))
    #policy_model.eval()
    env = atari_wrappers.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrappers.wrap_deepmind(env,
                                       clip_rewards=True,
                                       frame_stack=True,
                                       pytorch_img=True)
    policy_model.load_model(
        torch.load('./data/dqn_Riverraid_qnetwork_target_state_dict.pickle'))
    num_episodes = 5
    episode = 1
    score = 0
    ep_score = []
    done = False
    while (episode < num_episodes):
        observation = env.reset()
        done = False
        while not done:

            with torch.no_grad():
                # Convert the stacked frames to a batched float tensor in [0, 1]
                t_observation = torch.from_numpy(np.array(observation)).float() / 255
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                q_value = policy_model(t_observation)
                action = q_value.argmax(1).item()
                env.render()
                time.sleep(0.0005)
                next_observation, reward, done, info = env.step(action)
                score += reward
                observation = next_observation

        if info['ale.lives'] == 0:
            episode += 1
            ep_score.append(score)
            score = 0
    print("Average Score : {}".format(int(np.mean(ep_score))))
    print(ep_score)
Example #6
    def __init__(self,
                 portfolio_size,
                 batch_size,
                 max_experiences,
                 min_experiences,
                 is_eval=False):
        self.portfolio_size = portfolio_size
        self.action_size = 3  # sit, buy, sell
        self.input_shape = (
            self.portfolio_size,
            self.portfolio_size,
        )
        self.is_eval = is_eval

        #replay buffer hyperparameters
        self.expReplayBuffer = {
            's': [],
            'a': [],
            'r': [],
            's2': [],
            'done': []
        }
        self.expReplayBufferSize = 0
        self.batch_size = batch_size  #for replay buffer
        self.max_experiences = max_experiences
        self.min_experiences = min_experiences

        #training hyperparameters
        self.alpha = 0.5
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.05  #decay rate after every iteration

        #models
        self.hidden_units = [100, 50]
        self.train_model = DQNModel(self.input_shape, self.hidden_units,
                                    self.action_size,
                                    self.portfolio_size).get_model()
        self.test_model = self.get_model()
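
The dict-of-lists expReplayBuffer is only initialized here; the methods that fill and sample it are not shown. A minimal sketch of such helpers under that assumption (function names are illustrative, not the original class's methods):

import random

def remember(agent, s, a, r, s2, done):
    # Drop the oldest experience once the buffer reaches max_experiences.
    if agent.expReplayBufferSize >= agent.max_experiences:
        for key in agent.expReplayBuffer:
            agent.expReplayBuffer[key].pop(0)
        agent.expReplayBufferSize -= 1
    for key, value in zip(('s', 'a', 'r', 's2', 'done'), (s, a, r, s2, done)):
        agent.expReplayBuffer[key].append(value)
    agent.expReplayBufferSize += 1

def sample_batch(agent):
    # Assumes at least batch_size (and min_experiences) items are stored.
    ids = random.sample(range(agent.expReplayBufferSize), agent.batch_size)
    return {key: [values[i] for i in ids]
            for key, values in agent.expReplayBuffer.items()}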
Example #7
def run_weights():

    env = gym.make('Mario-Kart-Luigi-Raceway-v0')

    resolution = (120, 160)

    actions = [
        [-60, 0, 1, 0, 0],   # left
        [60, 0, 1, 0, 0],    # right
        [0, -80, 0, 1, 0],   # back
        [0, 0, 1, 0, 0],     # go straight
        # [0, 0, 0, 1, 0],   # brake
    ]

    # Load Model and Weights
    model = DQNModel(resolution=resolution,
                     nb_frames=test_param['nb_frames'],
                     actions=actions)

    model.load_weights(model_weights)

    agent = RLAgent(model, **test_param)

    agent.test(env)
Example #8
# Initialize the environment
env = gym.make('Mario-Kart-Luigi-Raceway-v0')

resolution = (120, 160)

actions = [
    [-60, 0, 1, 0, 0],   # left
    [60, 0, 1, 0, 0],    # right
    [0, -80, 0, 1, 0],   # back
    [0, 0, 1, 0, 0],     # go straight
    # [0, 0, 0, 1, 0],   # brake
]

# Initialize the model
model = DQNModel(resolution=resolution,
                 nb_frames=learn_param['nb_frames'],
                 actions=actions)

# print("number of actions: ", len(doom.actions))   # 16

if model_weights:
    model.load_weights(model_weights)
else:
    print("Please provide a model_weights file")

agent = RLAgent(model, **learn_param)

# Pick a step at random to capture a screenshot
agent.visualize(env)
Example #9
def main():
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("use_cuda: ", use_cuda)
    print("Device: ", device)

    env = atari_wrapper.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrapper.wrap_deepmind(env,
                                      clip_rewards=False,
                                      frame_stack=True,
                                      pytorch_img=True)

    action_space = [a for a in range(env.action_space.n)]
    n_action = len(action_space)

    # DQN Model and optimizer:
    policy_model = DQNModel().to(device)
    target_model = DQNModel().to(device)
    target_model.load_state_dict(policy_model.state_dict())

    optimizer = torch.optim.RMSprop(policy_model.parameters(),
                                    lr=lr,
                                    alpha=alpha)

    # Initialize the Replay Buffer
    replay_buffer = ReplayBuffer(rep_buf_size)

    while len(replay_buffer) < rep_buf_ini:

        observation = env.reset()
        done = False

        while not done:
            with torch.no_grad():
                t_observation = torch.from_numpy(observation).float().to(
                    device)
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                action = random.sample(range(len(action_space)), 1)[0]

            next_observation, reward, done, info = env.step(
                action_space[action])

            replay_buffer.push(observation, action, reward, next_observation,
                               done)
            observation = next_observation

    print('Experience Replay buffer initialized')

    # Use log to record the performance
    logger = logging.getLogger('dqn_Riverraid')
    logger.setLevel(logging.INFO)
    logger_handler = logging.FileHandler('./dqn_Riverraid.log')
    logger.addHandler(logger_handler)

    # Training part
    env.reset()
    score = 0
    episode_score = []
    mean_episode_score = []
    episode_true = 0
    num_frames = 0
    episode = 0
    last_100episode_score = deque(maxlen=100)

    while episode < max_episodes:

        observation = env.reset()
        done = False
        # import time
        # start=time.time()

        while not done:

            with torch.no_grad():

                t_observation = torch.from_numpy(observation).float().to(
                    device) / 255
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                epsilon = epsilon_by_frame(num_frames)
                if random.random() > epsilon:
                    q_value = policy_model(t_observation)
                    action = q_value.argmax(1).data.cpu().numpy().astype(
                        int)[0]
                else:
                    action = random.sample(range(len(action_space)), 1)[0]

            next_observation, reward, done, info = env.step(
                action_space[action])
            num_frames += 1
            score += reward

            replay_buffer.push(observation, action, reward, next_observation,
                               done)
            observation = next_observation

            # Update policy
            if len(replay_buffer) > batch_size and num_frames % skip_frame == 0:
                observations, actions, rewards, next_observations, dones = replay_buffer.sample(
                    batch_size)

                observations = torch.from_numpy(np.array(observations) /
                                                255).float().to(device)

                actions = torch.from_numpy(
                    np.array(actions).astype(int)).float().to(device)
                actions = actions.view(actions.shape[0], 1)

                rewards = torch.from_numpy(
                    np.array(rewards)).float().to(device)
                rewards = rewards.view(rewards.shape[0], 1)

                next_observations = torch.from_numpy(
                    np.array(next_observations) / 255).float().to(device)

                dones = torch.from_numpy(
                    np.array(dones).astype(int)).float().to(device)
                dones = dones.view(dones.shape[0], 1)

                q_values = policy_model(observations)
                next_q_values = target_model(next_observations)

                q_value = q_values.gather(1, actions.long())
                next_q_value = next_q_values.max(1)[0].unsqueeze(1)
                expected_q_value = rewards + gamma * next_q_value * (1 - dones)

                loss = huber_loss(q_value, expected_q_value)

                optimizer.zero_grad()
                loss.backward()

                optimizer.step()

                for target_param, policy_param in zip(
                        target_model.parameters(), policy_model.parameters()):
                    target_param.data.copy_(TAU * policy_param.data +
                                            (1 - TAU) * target_param.data)

        episode += 1
        # episode_score.append(score)
        # end=time.time()
        # print("Running time ( %i episode): %.3f Seconds "%(episode ,end-start))

        if info['ale.lives'] == 0:
            # episode_score.append(score)
            mean_score = score
            episode_true += 1
            score = 0

            # if episode % 20 == 0:
            # mean_score = np.mean(episode_score)
            mean_episode_score.append(mean_score)
            last_100episode_score.append(mean_score)
            # episode_score = []
            logger.info('Frame: ' + str(num_frames) + ' / Episode: ' +
                        str(episode_true) + ' / Average Score : ' +
                        str(int(mean_score)) + '   / epsilon: ' +
                        str(float(epsilon)))
            #plot_score(mean_episode_score, episode_true)
            pickle.dump(mean_episode_score,
                        open('./dqn_Riverraid_mean_scores.pickle', 'wb'))
            if episode_true % 50 == 1:
                logger.info('Frame: ' + str(num_frames) + ' / Episode: ' +
                            str(episode_true) + ' / Average Score : ' +
                            str(int(mean_score)) + '   / epsilon: ' +
                            str(float(epsilon)) +
                            '   / last_100episode_score: ' +
                            str(float(np.mean(last_100episode_score))))

        if episode % 50 == 0:
            torch.save(target_model.state_dict(),
                       './dqn_spaceinvaders_target_model_state_dict.pt')
            torch.save(policy_model.state_dict(),
                       './dqn_spaceinvaders_model_state_dict.pt')

    pass
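
The training loop assumes epsilon_by_frame, huber_loss and hyperparameters such as lr, alpha, gamma, TAU, batch_size, skip_frame, rep_buf_size, rep_buf_ini and max_episodes that are defined elsewhere. A minimal sketch of the two helper functions, with illustrative schedule constants:

import math
import torch.nn.functional as F

# Illustrative exploration schedule; the original constants are not shown.
epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.02, 30000

def epsilon_by_frame(frame_idx):
    # Anneal epsilon exponentially from epsilon_start towards epsilon_final.
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(
        -frame_idx / epsilon_decay)

def huber_loss(q_value, expected_q_value):
    # Smooth L1 (Huber) loss between predicted and target Q-values.
    return F.smooth_l1_loss(q_value, expected_q_value)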
Example #10
import numpy as np
import tensorflow as tf

STATE_SHAPE = [8]
NUM_ACTIONS = 3
# A higher learning rate can be used for simple envs
LEARNING_RATE = 1e-2
fake_states = np.random.random([3] + STATE_SHAPE)
fake_target_states = np.random.random([3] + STATE_SHAPE)

fake_rewards = np.array([100, 100, 100])
fake_dones = np.array([1, 1, 1])

print('Testing action optimization process')
for i_action in range(NUM_ACTIONS):
    fake_actions = np.array(3 * [i_action])

    tf.reset_default_graph()
    model = DQNModel(STATE_SHAPE, NUM_ACTIONS)

    print('Optimizing for action', i_action)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        old_preds = model.predict(sess, fake_states)
        print('Old predictions:\n', old_preds)
        for _ in range(100):
            model.train(sess, LEARNING_RATE, fake_states, fake_target_states,
                        fake_actions, fake_rewards, fake_dones)
        new_preds = model.predict(sess, fake_states)
        print('New predictions:\n', new_preds)

print('Testing target update process')
tf.reset_default_graph()
Example #11
from learner import Learner
from model import DQNModel
import gym
import maze_env  # importing registers the custom Maze-v0 environment with gym

env = gym.make('Maze-v0')
learner = Learner(env, model=DQNModel())
learner.run()
Example #12
import gym
import numpy as np
from model import DQNModel
from policy import EpsGreedyPolicy
from memory import Memory
from agent import DQNAgent
from processor import AtariProcessor

if __name__ == '__main__':

    ENV_NAME = 'Riverraid-v4'
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    model = DQNModel(nb_actions=nb_actions).model
    policy = EpsGreedyPolicy(eps_min=0.1,
                             eps_max=1,
                             eps_test=0.05,
                             nb_steps=1000000)
    memory = Memory(max_len=1000000)
    processor = AtariProcessor()
    dqn = DQNAgent(env,
                   model,
                   policy,
                   memory,
                   processor,
                   gamma=0.99,
                   batch_size=32,
                   target_model_update_steps=10000,
                   nb_episodes_warmup=500)
Example #13
# Init environment
env = gym.make(args.env)
if "Street" not in args.env:
    env.unwrapped.set_difficulty(status["difficulty"], weighted=False)
    env.shaped_reward = args.dense_reward
env.seed(args.seed)

# Get obs space and preprocess function
obs_space, preprocess_obss = utils.get_obss_preprocessor(
    args.env, env.observation_space, model_dir)

# Load model
try:
    policy_net = utils.load_model(model_dir)
    target_net = DQNModel(env.action_space, env=args.env)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    print("Model successfully loaded\n")
except OSError:
    policy_net = DQNModel(env.action_space, env=args.env)
    target_net = DQNModel(env.action_space, env=args.env)
    target_net.load_state_dict(policy_net.state_dict())
    print("Model successfully created\n")

if torch.cuda.is_available():
    policy_net.cuda()
    target_net.cuda()
    target_net.eval()
print("CUDA available: {}\n".format(torch.cuda.is_available()))
Example #14
    second_tiger_handle: int

    deer_handle, first_tiger_handle, second_tiger_handle = environment.get_handles()

    environment.reset()
    environment.add_walls(method="random",
                          n=map_size * map_size * wall_density)
    environment.add_agents(deer_handle, method="random", n=deers)
    environment.add_agents(first_tiger_handle, method="random", n=tigers)
    environment.add_agents(second_tiger_handle, method="random", n=tigers)

    view_space: Tuple = environment.get_view_space(first_tiger_handle)
    view_space = (view_space[-1], ) + view_space[:2]
    dqn_model: DQNModel = DQNModel(
        view_space, environment.get_feature_space(first_tiger_handle),
        environment.get_action_space(first_tiger_handle)[0])
    dqn_model.load_state_dict(torch.load(model, map_location=map_location))
    print(dqn_model)

    reward_tiger_1: float = 0.0
    reward_tiger_2: float = 0.0

    survivors: int
    while True:
        first_tiger_actions: ndarray = get_actions(environment, dqn_model,
                                                   first_tiger_handle)
        second_tiger_actions: ndarray = get_actions(environment, dqn_model,
                                                    second_tiger_handle)

        environment.set_action(first_tiger_handle, first_tiger_actions)
Example #15
    deer_handle: int
    tiger_handle: int
    deer_handle, tiger_handle = gridworld.get_handles()

    def reset_environment():
        gridworld.reset()
        gridworld.add_walls(method="random",
                            n=MAP_SIZE * MAP_SIZE * WALLS_DENSITY)
        gridworld.add_agents(deer_handle, method="random", n=COUNT_DEERS)
        gridworld.add_agents(tiger_handle, method="random", n=COUNT_TIGERS)

    environment: MAgentEnv = MAgentEnv(
        gridworld, tiger_handle, reset_environment_funcion=reset_environment)

    dqn_model: DQNModel = DQNModel(
        environment.single_observation_space.spaces[0].shape,
        environment.single_observation_space.spaces[1].shape,
        gridworld.get_action_space(tiger_handle)[0]).to(device)

    target_net: TargetNet = ptan.agent.TargetNet(dqn_model)
    print(dqn_model)

    action_selector: EpsilonGreedyActionSelector = EpsilonGreedyActionSelector(
        epsilon=PARAMETERS.epsilon_start)
    epsilon_tracker: EpsilonTracker = EpsilonTracker(action_selector,
                                                     PARAMETERS)

    pre_processor: MAgentPreprocessor = MAgentPreprocessor(device)
    dqn_agent: ptan.agent.DQNAgent = ptan.agent.DQNAgent(
        dqn_model, action_selector, device, preprocessor=pre_processor)
    experience_source: ptan.experience.ExperienceSourceFirstLast = ptan.experience.ExperienceSourceFirstLast(
        environment, dqn_agent, PARAMETERS.gamma, vectorized=True)
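
The snippet stops after the experience source is built. A brief continuation sketch, assuming ptan's usual ExperienceReplayBuffer / TargetNet API; the buffer size, warm-up length and batch size below are illustrative, not the original values:

# Illustrative continuation; constants are placeholders.
replay_buffer = ptan.experience.ExperienceReplayBuffer(
    experience_source, buffer_size=100000)

replay_buffer.populate(1000)       # collect initial transitions from the env
batch = replay_buffer.sample(32)   # list of ExperienceFirstLast samples
target_net.sync()                  # copy dqn_model's weights into the target net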
Example #16
# Load training status

try:
    status = utils.load_status(model_dir)
except OSError:
    status = {"num_frames": 0, "update": 0}

# Define actor-critic model

try:
    base_model = utils.load_model(model_dir)
    logger.info("Model successfully loaded\n")
except OSError:
    if args.algo == "dqn":
        base_model = DQNModel(obs_space, envs[0].action_space, args.mem,
                              args.text)
    else:
        base_model = ACModel(obs_space, envs[0].action_space, args.mem,
                             args.text)
    logger.info("Model successfully created\n")
logger.info("{}\n".format(base_model))

if torch.cuda.is_available():
    base_model.cuda()
logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

# Train model

num_frames = status["num_frames"]
total_start_time = time.time()
update = status["update"]