Example #1
File: dqn_agent.py Project: lucms/DQN
    def __init__(self,
                 state_dim,
                 action_dim,
                 buffer_size=int(1e5),
                 batch_size=128,
                 target_update_frequency=1000,
                 gamma=0.99,
                 learning_rate=1e-3,
                 weight_decay=1e-2,
                 hidden_layers=(48, 16)):
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.replay_buffer = ReplayBuffer(buffer_size)
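        # Policy (online) network and target network share the same architecture;
        # target_update_frequency controls how often the target copy is refreshed.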
        self.policy_dqn = DQN(state_dim,
                              action_dim,
                              hidden_layers=hidden_layers)
        self.target_dqn = DQN(state_dim,
                              action_dim,
                              hidden_layers=hidden_layers)
        self.optimizer = torch.optim.Adam(self.policy_dqn.parameters(),
                                          lr=learning_rate,
                                          weight_decay=weight_decay)

        self.gamma = gamma
        self.batch_size = batch_size
        self.target_update_frequency = target_update_frequency
Example #2
    def __init__(self, params, state_size, action_size, seed, hidden_layers):
        """
        Params
        ======
            params: dictionary of hyperparameters
                (batch_size, beta_frames, buffer_size, gamma, lr, p_alpha, p_beta, tau, update_every)
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers: sizes of the hidden layers for the Q-networks
        """
        super(Agent, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.batch_size = int(params["batch_size"])
        self.beta_frames = int(params["beta_frames"])
        self.buffer_size = int(params["buffer_size"])
        self.gamma = params["gamma"]
        self.lr = params["lr"]
        self.p_alpha = params["p_alpha"]
        self.p_beta = params["p_beta"]
        self.tau = params["tau"]
        self.update_every = params["update_every"]

        # Q-Network
        self.qnetwork_local = DQN(state_size, action_size, seed, hidden_layers=hidden_layers).to(device)
        self.qnetwork_target = DQN(state_size, action_size, seed, hidden_layers=hidden_layers).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

        # Replay memory
        self.memory = PriorityReplayBuffer(action_size, self.buffer_size, self.batch_size, seed, self.p_alpha,
                                           self.p_beta, self.beta_frames)
        self.t_step = 0 # Initialize time step (for updating every UPDATE_EVERY steps)
Example #3
    def __init__(self, env, gamma, lr, n_actions, input_dim, no_rnn_hidden, no_rnn_layer, ann_layer,
                mem_size, batch_size, epsilon, 
                eps_min=0.01, eps_dec=5e-6, replace=500, path='tmp'):
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dim = input_dim
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.path = path
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayMemory(mem_size)

        self.q_eval = DQN(input_dim, no_rnn_hidden, no_rnn_layer, ann_layer, n_actions, self.batch_size)
        self.q_next = DQN(input_dim, no_rnn_hidden, no_rnn_layer, ann_layer, n_actions, self.batch_size)

        self.optimizer = torch.optim.Adam(self.q_eval.parameters(), lr=self.lr)
        self.loss = nn.SmoothL1Loss()
        self.last_loss = 0
Example #4
    def __init__(self, *args, **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        self.q_eval = DQN(self.lr, self.n_actions, input_dims=self.input_dims,
                          name=self.env_name+"_"+self.algorithm+"_q_eval", checkpoint_dir=self.checkpoint_dir)

        self.q_policy = DQN(self.lr, self.n_actions, input_dims=self.input_dims,
                            name=self.env_name+"_"+self.algorithm+"_q_policy", checkpoint_dir=self.checkpoint_dir)
Example #5
def main():
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    last_100_game_reward = deque(maxlen=100)

    with tf.Session() as sess:
        mainDQN = DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name="main")
        targetDQN = DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name="target")
        sess.run(tf.global_variables_initializer())

        spend_time = tf.placeholder(tf.float32)
        rr = tf.summary.scalar('reward', spend_time)
        merged = tf.summary.merge_all()
        writer = tf.summary.FileWriter('./board/dqn_not_per', sess.graph)

        # initial copy q_net -> target_net
        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(MAX_EPISODES):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand() < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action by greedily from the Q-network
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)

                if done:  # Penalty
                    reward = -1

                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))

                if done:
                    if len(replay_buffer) > BATCH_SIZE:
                        minibatch = random.sample(replay_buffer, BATCH_SIZE)
                        loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                        sess.run(copy_ops)
                    #if step_count % TARGET_UPDATE_FREQUENCY == 0:
                    #    sess.run(copy_ops)
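                    # Log this episode's step count to TensorBoard (written under the 'reward' scalar tag)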
                    summary = sess.run(merged,
                                       feed_dict={spend_time: step_count})
                    writer.add_summary(summary, episode)
                state = next_state
                step_count += 1

            print("Episode: {}  steps: {}".format(episode, step_count))
Example #6
    def __init__(self, state_size, action_size):
        self.target = DQN(state_size, action_size).to(device)
        self.current = DQN(state_size, action_size).to(device)

        self.loss_fn = torch.nn.MSELoss(reduction='sum')

        self.memory = deque(maxlen = 4000)
        self.batch_size = 128
        
        learning_rate = 0.0025
        self.optimizer = torch.optim.Adam(self.current.parameters(), lr=learning_rate)
Example #7
 def __init__(self):
     self.game_network = DQN(9, 6)
     self.wenz_network = DQN(36, 8)
     self.solo_network = DQN(37, 8)
     self.solo_playing_network = DQN(37, 8)
     self.wenz_playing_network = DQN(36, 8)
     self.match = None
     self.game_memory = {}
     self.card_memory = {}
     self.explore = True
     self.N = 100
Example #8
    def __init__(self):
        
        self.dqn_local = DQN()
        self.batch_size = self.dqn_local.BATCH_SIZE

        print(self.dqn_local.dqn.summary())
        
        self.dqn_target = DQN()

        self.replay_memory = ReplayMemory(self.dqn_local)

        self.temp = 0
Example #9
    def __init__(self):
        # self.config = config
        self.gamma = 0.4

        # self.logger = logging.getLogger("DQNAgent")

        self.screen_width = 600

        # define models (policy and target)
        self.policy_model = DQN()
        self.target_model = DQN()

        # define memory
        self.memory = ReplayMemory()

        # define loss
        self.loss = HuberLoss()

        # define optimizer
        self.optim = torch.optim.Adam(self.policy_model.parameters(), lr=0.01)

        # define environment
        self.env = PyCar()  #TODO
        # self.cartpole = PyCar(self.screen_width)

        # initialize counter
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []

        self.batch_size = 1700

        # set cuda flag
        self.is_cuda = torch.cuda.is_available()

        self.cuda = self.is_cuda

        if self.cuda:
            # print_cuda_statistics()
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # Initialize Target model with policy model state dict
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        self.savepath = "/home/sk002/Desktop/model/"
Example #10
def main():
    max_episode = 5000
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size, name='main')
        targetDQN = DQN(sess, input_size, output_size, name='target')
        tf.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name="taget", src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(max_episode):
            e = 1 / ((episode / 10) + 1)
            done = False
            step_count = 0

            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)

                if done and step_count != 200:
                    reward -= 100

                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    break

            print('Episode[{}] - steps : {}'.format(episode, step_count))
            if step_count > 10000:
                pass

            if episode % 10 == 1:
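                # Every 10 episodes: run 50 minibatch updates, then copy the online network into the target network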
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print('loss : ', loss)
                # copy q_net -> target_net
                sess.run(copy_ops)

        bot_play(mainDQN)
Example #11
    def __init__(self, manager, agent_params):

        self.manager = manager

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.verbose = agent_params["verbose"]

        self.scaling_eps = agent_params["scaling_eps"]

        # Rainbow enhancements
        self.noisy_nets = agent_params["noisy_nets"]
        self.dueling = agent_params["dueling"]
        self.double_dqn = agent_params["double_dqn"]
        self.prioritized = agent_params["prioritized"]

        self.discount = agent_params["discount"]

        self.ep_start = agent_params["ep_start"]
        self.ep = self.ep_start
        self.ep_end = agent_params["ep_end"]
        self.ep_endt = agent_params["ep_endt"]

        self.eval_ep = agent_params["eval_ep"]

        # For running gradient correction
        self.correct_error_magnitude = agent_params["correct_error_magnitude"]
        self.max_error_magnitude = agent_params["max_error_magnitude"]
        self.min_error_divisor = agent_params["min_error_divisor"]
        self.error_mag_beta = agent_params["error_mag_beta"]
        self.error_mag_updates = 0.0
        self.error_mag_biased = 0.0
        self.error_mag = 0.0

        self.adam_lr = agent_params["adam_lr"]
        self.adam_eps = agent_params["adam_eps"]
        self.adam_beta1 = agent_params["adam_beta1"]
        self.adam_beta2 = agent_params["adam_beta2"]

        self.network = DQN(self.manager.gpu, self.manager.in_channels,
                           self.manager.n_actions, 1, self.noisy_nets,
                           self.dueling, False)
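        # Target network mirrors the online network and is initialized with the same weights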
        self.target_network = DQN(self.manager.gpu, self.manager.in_channels,
                                  self.manager.n_actions, 1, self.noisy_nets,
                                  self.dueling, False)
        self.target_network.load_state_dict(self.network.state_dict())
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.adam_lr,
                                    betas=(self.adam_beta1, self.adam_beta2),
                                    eps=self.adam_eps)
Example #12
 def _before_sim_loop(self):
     n_state = self._env.observation_space.shape[0]
     n_action = self._env.action_space.n
     self._algo = DQN(n_state, n_action, self._algo_params)
     self._algo.update_net()
     self._score = 0.0
     self._score_sum = 0.0
Example #13
def train(lr, e_greedy, times=100):
    env = gym.make('CartPole-v0')
    env = env.unwrapped
    rlmodel = DQN(
        n_actions=env.action_space.n,
        n_features=env.observation_space.shape[0],
        learning_rate=lr,
        e_greedy=e_greedy,
        replace_loop=100,
        memory_size=2000,
        e_greedy_increment=0.001,
        show_info=False,
    )
    total_steps = 0
    history = []
    for i_episode in range(times):
        observation = env.reset()
        ep_r = 0
        cur_steps = 0
        while True:
            action = rlmodel.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            reward = get_reward(observation, env)
            rlmodel.store_transition(observation, action, reward, observation_)
            ep_r += reward
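            # Start learning only after 1000 environment steps of replay warm-up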
            if total_steps > 1000:
                rlmodel.learn()
            if done or cur_steps >= 10000:
                history.append(ep_r)
                break
            observation = observation_
            cur_steps += 1
            total_steps += 1
    env.close()
    return rlmodel, history
Example #14
def main():
    env = get_player(
        args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP)
    file_path = "memory.npz"
    rpm = ReplayMemory(
        MEMORY_SIZE,
        IMAGE_SIZE,
        CONTEXT_LEN,
        load_file=True,  # load replay memory data from file
        file_path=file_path)
    act_dim = env.action_space.n

    model = AtariModel(act_dim)
    algorithm = DQN(
        model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE * gpu_num)
    agent = AtariAgent(
        algorithm, act_dim=act_dim, total_step=args.train_total_steps)
    if os.path.isfile('./model.ckpt'):
        logger.info("load model from file")
        agent.restore('./model.ckpt')

    if args.train:
        logger.info("train with memory data")
        run_train_step(agent, rpm)
        logger.info("finish training. Save the model.")
        agent.save('./model.ckpt')
    else:
        logger.info("collect experience")
        collect_exp(env, rpm, agent)
        rpm.save_memory()
        logger.info("finish collecting, save successfully")
Example #15
 def __init__(self, config, actor_idx, starting_port, tensorboard_logger):
     super().__init__(config, actor_idx, starting_port, tensorboard_logger)
     self.dqn = DQN(
         input_shape=(config.width, config.height, config.stacked_frames),
         num_actions=3,
         learning_rate=config.learning_rate,
     )
Example #16
    def __init__(self, poi_info, user_KG, params):
        self.poi_info = poi_info
        self.user_KG = user_KG
        self.visit_counter = 0
        self.ll = params.ll
        self.lc = params.lc
        self.lp = params.lp
        self.poi_cat_dict = poi_info.poi_cat_dict
        self.poi_loc_dict = poi_info.poi_loc_dict
        self.poi_dist_mat = poi_info.poi_dist_mat
        self.cat_sim_mat = poi_info.cat_sim_mat

        self.memory_capacity = params.memory_capacity

        self.environment = Environment(user_KG.s_u.shape[1],
                                       self.poi_info.env_nt_1,
                                       self.poi_info.env_nt_2)

        self.dqn = DQN(self.environment,
                       user_KG.s_u.shape[1] + user_KG.s_KG.x.shape[1],
                       user_KG.s_KG.num_POI,
                       params.memory_capacity,
                       params.lr,
                       params.epsilon,
                       params.batch_size,
                       params.gamma,
                       params.target_replace_iter,
                       mode=params.priority_mode)

        self.predict_POI_index = np.random.randint(user_KG.s_KG.num_POI)

        self.r = reward(params.ll, params.lc, params.lp,
                        self.predict_POI_index, 0, poi_info.poi_cat_dict,
                        poi_info.poi_loc_dict, poi_info.poi_dist_mat,
                        poi_info.cat_sim_mat)
Example #17
    def __init__(self, model_name):
        self.model_name = model_name
        self.action_names = ['A', 'D', 'M', 'L', 'R']
        self.num_actions = len(self.action_names)
        self.memory = deque()

        #self.model = Cnn(self.model_name, self.memory)
        #self.target_model = Cnn(self.model_name, [], target=True)
        self.model = DQN(model_name, self.memory)

        # self.state = np.zeros([1, VISION_F + VISION_B + 1, VISION_W * 2 + 1, 1])
        self.previous_states = np.zeros([1, VISION_F + VISION_B + 1, VISION_W * 2 + 1, 4])
        self.previous_actions = np.zeros([4])
        self.previous_actions.fill(2)
        self.q_values = np.zeros(5)
        self.action = 2

        self.count_states = self.model.get_count_states()

        self.delay_count = 0

        self.epsilon_linear = LinearControlSignal(start_value=EPSILON_GREEDY_START_PROB,
                                                  end_value=EPSILON_GREEDY_END_PROB,
                                                  repeat=False)

        self.advantage = 0
        self.value = 0

        self.score = 0
Example #18
def main():

    #Set up the network for the first time
    features = 263
    h1 = 50
    h2 = 50

    dqn = DQN(features, h1, h2, "models/test_1")

    #Initialize the game
    initialize()
    game = setup_game()
    ai_player = game.current_player

    try:
        while True:
            if game.current_player == ai_player:
                action_choice = look_ahead(game, dqn)
                print("Action chosen was: ", action_choice)
                perform_action(action_choice, ai_player, game)
            else:
                actions = get_actions(game.current_player)
                index = random.randint(0, len(actions)-1)
                perform_action(actions[index], game.current_player, game)


    except GetStateError:
        print("Error with get_state function")
    except GameOver:
        print("Game ended normally")
Example #19
    def __init__(self):
        self.experience_replay = ExperienceReplay('BreakoutDeterministic-v0',
                                                  FLAGS.replay_buffer_size, 84,
                                                  84, 4, self.policy,
                                                  FLAGS.decay_to_epoch)

        config = DQNConfig()
        config.learning_rate = FLAGS.learning_rate
        config.gamma = FLAGS.gamma
        config.decay = FLAGS.decay
        config.momentum = FLAGS.momentum
        config.eps = FLAGS.eps
        config.input_width = FLAGS.image_width
        config.input_height = FLAGS.image_height
        config.skip = FLAGS.skip
        self.dqn = DQN(config, FLAGS.use_huber)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        logger.info('initializing variables...')
        self.sess.run(tf.global_variables_initializer())
        self.update_target()

        self.epoch = 0
        self.decay_epsilon()
Example #20
def main(args):
    """
    Run a trained model for the cartpole problem
    :param args: (ArgumentParser) the input arguments
    """
    env = gym.make("CartPole-v0")
    model = DQN(
        env=env,
        policy_class=MlpPolicy,
        learning_rate=5e-4,
        buffer_size=50000,
        double_q=False,
        prioritized_replay=True,
        dueling=True,
        exploration_fraction=0.2,
        exploration_final_eps=0.02,
        model_path='cartpole_model'
    )
    model = model.load("cartpole_model")

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if not args.no_render:
                env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            episode_rew += rew
        print("Episode reward", episode_rew)
        # No render is only used for automatic testing
        if args.no_render:
            break
Example #21
def replay():
    print('replay ... ')
    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            time.sleep(0.3)

        print('game count: %d score: %d' % (episode + 1, total_reward))
Example #22
File: play.py Project: KyloRen1/Q-Bird
def play(args):
    device = torch.device("cuda" if args.gpu else "cpu")
    env = Environment(draw=True,
                      fps=args.fps,
                      debug=args.debug,
                      dist_to_pipe=args.dist_to_pipe,
                      dist_between_pipes=args.dist_between_pipes,
                      obs_this_pipe=args.obs_this_pipe)

    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    network = DQN(observation_space, action_space)
    network.load_checkpoint(args.checkpoint)

    for _ in range(args.runs):
        state = env.reset()
        total_reward = 0.0
        while True:
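            # Greedy play: always take the action with the highest predicted Q-value (no exploration)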
            state_v = torch.tensor(np.array([state], copy=False)).to(device)
            q_vals_v = network(state_v.float())
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

            next_state, reward, done = env.step(action)
            total_reward += reward
            state = next_state

            if done:
                print("REWARD: ", total_reward)
                break
Example #23
def main():
    value_function = Sequential(Linear(in_features=4, out_features=128),
                                ReLU(),
                                Linear(in_features=128, out_features=128),
                                ReLU(),
                                Linear(in_features=128, out_features=32),
                                ReLU(),
                                Linear(in_features=32, out_features=2)).to(torch.device("cuda:0"))

    optimizer = RMSprop(params=value_function.parameters(),
                        alpha=0.95,
                        lr=0.0001)
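    # Both epsilon and the learning-rate multiplier decay as 0.9999**k; the LR multiplier is floored at 0.1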

    agent = DQN(value_function=value_function,
                optimizer=optimizer,
                lr_scheduler=LambdaLR(optimizer=optimizer,
                                      lr_lambda=lambda e: max(0.9999**e, 0.1)),
                gamma=0.95,
                epsilon_fn=lambda x: 0.9999**x,
                replay_buffer_size=10000,
                replay_batch_size=128,
                start_training_at=1024,
                unfreeze_freq=64,
                device=torch.device("cuda:0"),
                verbose=True)

    run_qlearning(agent, render=True)
Example #24
def main(args):
    if args.gpu:
        ctx = get_extension_context('cudnn', device_id=str(args.device))
        nn.set_default_context(ctx)

    # atari environment
    env = AtariWrapper(gym.make(args.env), args.seed, episodic=True)
    eval_env = AtariWrapper(gym.make(args.env), 50, episodic=False)
    num_actions = env.action_space.n

    # action-value function built with neural network
    model = DQN(q_function, num_actions, args.batch_size, args.gamma, args.lr)
    if args.load is not None:
        nn.load_parameters(args.load)
    model.update_target()

    buffer = ReplayBuffer(args.buffer_size, args.batch_size)

    exploration = LinearlyDecayEpsilonGreedy(num_actions, args.epsilon, 0.1,
                                             args.schedule_duration)

    monitor = prepare_monitor(args.logdir)

    update_fn = update(model, buffer, args.target_update_interval)

    eval_fn = evaluate(eval_env, model, render=args.render)

    train(env, model, buffer, exploration, monitor, update_fn, eval_fn,
          args.final_step, args.update_start, args.update_interval,
          args.save_interval, args.evaluate_interval, ['loss'])
Example #25
def replay():
    print('dqn_setting')
    sess = tf.Session()

    game = Sim(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, VIEW_WIDTH, VIEW_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    # start game
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.Reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()
            print('action_choice : ' + str(action))

            # get data
            state, reward, terminal = game.Update(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            # show the play
            time.sleep(10)

        print('Number of game: %d Score: %d' % (episode + 1, total_reward))
Example #26
def main():
    USE_CUDA = torch.cuda.is_available()

    env = gym.make('CartPole-v0')
    dqn = DQN(env.observation_space.shape[0], env.action_space.n)
    if USE_CUDA:
        dqn = dqn.cuda()
    optimizer = optim.RMSprop(dqn.parameters(),
                              lr=0.00025,
                              momentum=0.95,
                              alpha=0.95,
                              eps=0.01)
    epsilon_schedule = get_epsilon_schedule(start=1.0,
                                            end=0.01,
                                            endt=1000,
                                            learn_start=50)
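    # Epsilon schedule: from 1.0 down to 0.01 by step 1000; learning starts after 50 steps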
    replay_buffer = ReplayBuffer(capacity=1000)
    agent = DQNAgent(env,
                     dqn,
                     optimizer,
                     epsilon_schedule,
                     replay_buffer,
                     discount_factor=0.99,
                     target_update_rate=10,
                     batch_size=32,
                     learn_start=50)

    agent.train(5000)
    total_reward = agent.play(render=True)
    agent.env.close()
    print('Total Reward: ', total_reward)
Example #27
    def __init__(self, state_size, action_size):
        self.model = DQN(state_size, action_size)

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        
        learning_rate = 0.0025
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
Example #28
def trainModel(env, action_size):
    state = env.reset()
    observation_space = len(state)
    agent = DQN(observation_space, action_size)
    target_model_update_counter = 0
    agent.step = 0
    for _ in range(parameters.EPISODES):
        print("Episode number: " + str(_))
        state = env.reset()
        observation_size = len(state)
        state = np.reshape(state, [1, observation_size])
        done = False
        while not done and rclpy.ok():
            agent.step += 1
            target_model_update_counter += 1
            if target_model_update_counter % parameters.TARGET_MODEL_UPDATE_STEP == 0:
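                # Every TARGET_MODEL_UPDATE_STEP agent steps, refresh the target model weights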
                agent.save_load_model_weights()
                target_model_update_counter = 0
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            next_state = np.reshape(next_state, [1, observation_space])
            agent.save_to_memory(state, action, reward, next_state, done)
            state = next_state
            if not done:
                agent.experience_replay()
            sleep(parameters.LOOP_RATE)
        agent.model.save('random_crawl_model.h5')
Example #29
def main():
    # initialize OpenAI Gym env and dqn agent
    env = gym.make(ENV_NAME)
    agent = DQN(env)

    for episode in range(EPISODE):
        # initialize task
        state = env.reset()
        # Train
        for step in range(STEP):
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            # Shaped reward for the agent: -1 on failure, 0.1 per surviving step
            reward_agent = -1 if done else 0.1
            agent.perceive(state, action, reward_agent, next_state, done)
            state = next_state
            if done:
                break
        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:',
                  ave_reward)
            if ave_reward >= 200:
                break
Example #30
def train_dqn(env, args):

    agent = DQN(env, args)
    agent.train()

    total_episodes = args.episodes
    max_steps = 10

    for episode in range(total_episodes):
        print(episode, agent.epsilon, end='\r')

        state = env.reset()
        done = False

        for step in range(max_steps):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)

            agent.push(state, action, reward, next_state, done)
            agent.learn(episode)

            state = next_state

            if done:
                break

        if episode % 5 == 0:
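            # Lengthen the episode step cap by 10 every 5 episodes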
            max_steps += 10

    return agent