def eval_agent(point):
    # TODO: (Federico) This is a mess, find a better way to convert parameters
    # point = [df, lr, memsize, updatefreq, nlayers, nunits]
    model = get_model(point[4], point[5], point[1])
    parameters = {
        "discount_factor": point[0],
        "learning_rate": point[1],
        "memory_size": point[2],
        "target_update_frequency": point[3],
        "train_start": 1000,
        "epsilon": 0.02,
        "batch_size": 32,
        "env": env,
        "full_model": model,
        "model":
        None  #TODO merge full_model and model in a single function that
        # takes aslo the number of layers. I didn't do it because
        # there may be lot of usages in the code I'm not aware of
    }
    print('Evaluating at ' + str(parameters) + ', nlayers: ' + str(point[4]) +
          ', nunits: ' + str(point[5]))
    agent = DQNAgent(parameters)
    agent.train(generate_experiment_name(parameters),
                episode_num=1000,
                solved_score=195)
    res = agent.test(300)
    print('Evaluation result: ' + str(res))
    return res
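One way to address the TODOs above is to give the six-element search point named fields instead of positional indices. A minimal sketch, assuming the same [df, lr, memsize, updatefreq, nlayers, nunits] ordering; the HyperPoint name and from_list helper are hypothetical:

from dataclasses import dataclass

@dataclass
class HyperPoint:
    # Hypothetical container for the search point consumed by eval_agent above.
    discount_factor: float
    learning_rate: float
    memory_size: int
    target_update_frequency: int
    n_layers: int
    n_units: int

    @classmethod
    def from_list(cls, point):
        # Preserves the existing [df, lr, memsize, updatefreq, nlayers, nunits] order.
        return cls(*point)

# e.g. hp = HyperPoint.from_list(point); model = get_model(hp.n_layers, hp.n_units, hp.learning_rate)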
Example #2
 def __init__(self,
              model_class,
              model=None,
              env=None,
              exploration=None,
              gamma=0.99,
              memory_size=100000,
              batch_size=1,
              target_update_frequency=1000,
              saving_dir=None,
              min_mem=10000):
     """
     base class for lstm dqn agent
     :param model_class: sub class of torch.nn.Module. class reference of the model
     :param model: initial model of the policy net. could be None if loading from checkpoint
     :param env: environment
     :param exploration: exploration object. Must have function value(step) which returns e
     :param gamma: gamma
     :param memory_size: size of the memory
     :param batch_size: size of the mini batch for one step update
     :param target_update_frequency: the frequency for updating target net (in steps)
     :param saving_dir: the directory for saving checkpoint
     """
     DQNAgent.__init__(self, model_class, model, env, exploration, gamma,
                       memory_size, batch_size, target_update_frequency,
                       saving_dir)
     self.memory = EpisodicReplayMemory(memory_size)
     self.hidden = None
     self.min_mem = min_mem
Example #3
File: dqn_trainer.py Project: lucms/DQN
    def __init__(self, env: gym.Env,
                 log_frequency=1000,
                 exploration=None,
                 **kwargs):

        self.log_frequency = log_frequency
        self.agent = DQNAgent(action_dim=env.action_space.n, state_dim=env.observation_space.shape[0], **kwargs)
        self.env = env

        # Avoid a mutable default argument
        if exploration is None:
            exploration = {'algorithm': 'epsilon_greedy',
                           'decay': 'linear',
                           'initial_epsilon': 1.0,
                           'final_epsilon': 0.01,
                           'decay_timesteps': 1000}
        self.exploration_config = exploration

        # Parse the exploration dict to make the update_explo_param function
        if self.exploration_config['algorithm'] == 'epsilon_greedy':
            if self.exploration_config['decay'] == 'linear':
                update_term = (self.exploration_config['initial_epsilon'] - self.exploration_config[
                    'final_epsilon']) / self.exploration_config['decay_timesteps']
                self.update_explo_param = (lambda epsilon: epsilon - update_term if epsilon > self.exploration_config[
                    'final_epsilon'] else epsilon)

            elif self.exploration_config['decay'] == 'exponential':
                self.update_explo_param = (lambda epsilon: epsilon * self.exploration_config['epsilon_decay'])
        else:
            raise NotImplementedError
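As a standalone sanity check of the linear schedule built above, the sketch below replays the same update rule with the fallback defaults; epsilon reaches final_epsilon after exactly decay_timesteps steps (values taken from the default exploration dict, the rest is illustrative):

# Minimal reproduction of the linear epsilon decay applied by update_explo_param.
initial_epsilon, final_epsilon, decay_timesteps = 1.0, 0.01, 1000
update_term = (initial_epsilon - final_epsilon) / decay_timesteps  # 0.00099 per step
epsilon = initial_epsilon
for _ in range(decay_timesteps):
    epsilon = epsilon - update_term if epsilon > final_epsilon else epsilon
print(round(epsilon, 5))  # 0.01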
Example #4
def play(env, game, model_path):
    agent = DQNAgent(model_path)

    done, score = False, game.start_score
    observation = env.reset()

    # Initial history
    state = preprocess_frame(observation)
    history = np.stack((state, state, state, state), axis=2)
    history = np.reshape([history], (1, 84, 84, 4))

    while not done:
        env.render()
        time.sleep(0.05)

        # Play action
        action = agent.choose_action(history)
        game_action = get_ingame_action(action)
        observation, reward, done, info = env.step(game_action)

        # Update history
        next_state = preprocess_frame(observation)
        next_state = np.reshape([next_state], (1, 84, 84, 1))
        next_history = np.append(next_state, history[:, :, :, :3], axis=3)
        history = next_history

        reward = np.clip(reward, -1., 1.)
        score += reward
    print("score: ", score)
Example #5
class TestDQNAgent(unittest.TestCase):
    def setUp(self):
        self.state_size = 3
        self.action_size = 5
        fc = nn.Sequential(nn.Linear(self.state_size, 5), nn.ReLU(),
                           nn.Linear(5, 7), nn.ReLU(), nn.Linear(7, 9),
                           nn.ReLU(), nn.Linear(9, self.action_size))
        self.main_model = QNetwork(name="my_network", fc=fc)
        self.target_model = QNetwork(name="my_network", fc=fc)
        self.agent = DQNAgent(main_model=self.main_model,
                              target_network=self.target_model,
                              memory=WeightedReplayBuffer(buffer_size=12,
                                                          batch_size=3))
        self.eps_greediness = 0.01

    def test_allruns(self):
        """ No explosions? """
        # act
        state_value = [random()] * self.agent.state_size
        self.agent.act(state=state_value, eps=self.eps_greediness)

        agent_learned = False
        while not agent_learned:  # I want to force a learning step.
            agent_learned = self.agent.step(
                state=[random()] * self.agent.state_size,
                action=np.random.randint(self.agent.action_size),
                reward=random(),
                next_state=[random()] * self.agent.state_size,
                done=random() > 0.75)
Example #6
    def __init__(self, model_path):
        super(ReversiDisplay, self).__init__()

        self.BORD_PX_SIZE = 480
        self.keylock = False

        self.model_path = model_path
        self.title("リバーシ")
        self.geometry("{}x{}+{}+{}".format(self.BORD_PX_SIZE + 30,
                                           self.BORD_PX_SIZE + 30 + 100,
                                           self.BORD_PX_SIZE, 100))
        self.color = ["", "white", "black"]
        # {tag: position}
        self.tag2pos = {}
        # Conversion from coordinates to tag
        self.z2tag = {}

        # Create the main class
        env = Reversi()
        self.bord_size = env.Board_Size
        self.env = env

        # Create the opponent agent
        self.agent = DQNAgent(env.enable_actions, env.name, env.Board_Size)
        self.agent.load_model(self.model_path)

        # Coordinate labels
        self.numstr = self.make_numstr()
        self.alpstr = self.make_alpstr()
        # Set up some variables
        self.set_variables()
        # Set up game board
        self.set_board()
        # Set up some buttons
        self.set_button()
Example #7
def _train(opponents, train_from_scratch=False, render=False):
    env = pommerman.make('PommeFFACompetition-v0', [])

    # Exploration strategy
    exp_schedule = LinearExploration(env, config.eps_begin,
                                     config.eps_end, config.eps_nsteps)

    # Learning rate schedule
    lr_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                 config.lr_nsteps)

    # Initialize agents.
    dqn_agent = DQNAgent(env, config, exp_schedule, lr_schedule, True, train_from_scratch=train_from_scratch)
    dqn_agent_index = _init_agents(env, exp_schedule, lr_schedule, opponents, dqn_agent)

    t = 1
    while t < config.nsteps_train:
        state = env.reset()

        done = False
        while not done:
            t += 1
            if render:
                env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)

            if reward[dqn_agent_index] == -1 and not done:
                # Stop the episode when the training agent dies.
                dqn_agent.episode_end(-1)
                done = True

    env.close()
Example #8
 def __init__(self):
     self.config = None
     self.memory_debug_path = ROOT + '/memory.txt'
     self.memory_path = ROOT + '/memory.pkl'
     self.memory = []
     self.game_state_strings = []
     self.agent = DQNAgent()
     self.initial_weights = self.agent.NN.model.layers[-1].get_weights()[0]
Example #9
 def trainOneEpisode(self,
                     num_episodes,
                     max_episode_steps=100,
                     save_freq=100,
                     render=False):
     self.hidden = None
     DQNAgent.trainOneEpisode(self, num_episodes, max_episode_steps,
                              save_freq, render)
Example #10
    def __init__(self, state_size, action_size, input_shape, memory_size,
                 replay_start_step, load_model):
        DQNAgent.__init__(self, state_size, action_size, replay_start_step,
                          memory_size)

        self.input_shape = input_shape
        self.initializer = he_normal()
        if load_model is not False:
            self.load_model(load_model)
        else:
            self.__build_model()
Example #11
 def setUp(self):
     self.state_size = 3
     self.action_size = 5
     fc = nn.Sequential(nn.Linear(self.state_size, 5), nn.ReLU(),
                        nn.Linear(5, 7), nn.ReLU(), nn.Linear(7, 9),
                        nn.ReLU(), nn.Linear(9, self.action_size))
     self.main_model = QNetwork(name="my_network", fc=fc)
     self.target_model = QNetwork(name="my_network", fc=fc)
     self.agent = DQNAgent(main_model=self.main_model,
                           target_network=self.target_model,
                           memory=WeightedReplayBuffer(buffer_size=12,
                                                       batch_size=3))
     self.eps_greediness = 0.01
Example #12
    def __init__(self,
                 level_filepath,
                 episodes=30000,
                 initial_epsilon=1.,
                 min_epsilon=0.1,
                 exploration_ratio=0.5,
                 max_steps=2000,
                 render_freq=500,
                 enable_render=True,
                 render_fps=20,
                 save_dir='checkpoints',
                 enable_save=True,
                 save_freq=500,
                 gamma=0.99,
                 batch_size=64,
                 min_replay_memory_size=1000,
                 replay_memory_size=100000,
                 target_update_freq=5,
                 seed=42):
        self.set_random_seed(seed)

        self.episodes = episodes
        self.max_steps = max_steps
        self.epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.exploration_ratio = exploration_ratio
        self.render_freq = render_freq
        self.enable_render = enable_render
        self.render_fps = render_fps
        self.save_dir = save_dir
        self.enable_save = enable_save
        self.save_freq = save_freq

        if enable_save and not os.path.exists(save_dir):
            os.makedirs(save_dir)

        level_loader = LevelLoader(level_filepath)

        self.agent = DQNAgent(level_loader.get_field_size(),
                              gamma=gamma,
                              batch_size=batch_size,
                              min_replay_memory_size=min_replay_memory_size,
                              replay_memory_size=replay_memory_size,
                              target_update_freq=target_update_freq)
        self.env = Snake(level_loader)
        self.summary = Summary()
        self.current_episode = 0
        self.max_average_length = 0

        self.epsilon_decay = (initial_epsilon -
                              min_epsilon) / (exploration_ratio * episodes)
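With the constructor defaults above, the per-episode decrement works out as follows; a minimal standalone check of the same formula:

# epsilon_decay formula evaluated with the default arguments shown above.
initial_epsilon, min_epsilon = 1.0, 0.1
exploration_ratio, episodes = 0.5, 30000
epsilon_decay = (initial_epsilon - min_epsilon) / (exploration_ratio * episodes)
print(epsilon_decay)  # 6e-05, so epsilon reaches min_epsilon after ~15000 episodes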
Example #13
def main():
    logging.getLogger().setLevel(logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default='config/global_config.json')
    parser.add_argument('--num_step', type=int, default=2000)
    parser.add_argument('--ckpt', type=str)
    parser.add_argument('--algo', type=str, default='DQN', choices=['DQN', 'DDQN', 'DuelDQN'], help='choose an algorithm')
    parser.add_argument('--batch_size', type=int, default=32)  # referenced below; default value assumed

    args = parser.parse_args()

    # preparing config
    # # for environment
    config = json.load(open(args.config))
    config["num_step"] = args.num_step
    cityflow_config = json.load(open(config['cityflow_config_file']))
    roadnetFile = cityflow_config['dir'] + cityflow_config['roadnetFile']
    config["lane_phase_info"] = parse_roadnet(roadnetFile)

    # # for agent
    intersection_id = "intersection_1_1"
    config["intersection_id"] = intersection_id
    config["state_size"] = len(config['lane_phase_info'][intersection_id]['start_lane']) + 1  # 1 is for the current phase. [vehicle_count for each start lane] + [current_phase]
    phase_list = config['lane_phase_info'][intersection_id]['phase']
    config["action_size"] = len(phase_list)
    config["batch_size"] = args.batch_size
    
    logging.info(phase_list)

    # build cityflow environment
    env = CityFlowEnv(config)

    # build agent
    agent = DQNAgent(config)
    
    # inference
    agent.load(args.ckpt)
    env.reset()
    state = env.get_state()
    
    for i in range(args.num_step): 
        action = agent.choose_action(state) # index of action
        action_phase = phase_list[action] # actual action
        next_state, reward = env.step(action_phase) # one step

        state = next_state

        # logging
        logging.info("step:{}/{}, action:{}, reward:{}"
                        .format(i, args.num_step, action, reward))
Example #14
    def __init__(self):
        # Hyperparameters / Constants
        self.noOfEpisodes = 400
        self.ReplayMemoryQueueSize = 100000
        self.minReplayMemoryQueueSize = 10000
        self.sampleBatchSize = 1000
        self.epsilon = 1
        self.epsilonDecay = 0.99
        self.minEpsilon = 0.001
        self.discount = 0.99
        self.doRender = False
        self.gameEnv = 'MountainCar-v0'

        # Environment details
        self.env = gym.make(self.gameEnv)
        self.actionDimension = self.env.action_space.n
        self.observationDimension = self.env.observation_space.shape

        # creating own session to use across all the Keras/Tensorflow models we are using
        self.sess = tf.compat.v1.Session()

        # Replay memory to store experiences of the model with the environment
        self.replay_memory = deque(maxlen=self.ReplayMemoryQueueSize)

        # Our models to solve the mountaincar problem.
        self.agent = DQNAgent(self.sess, self.actionDimension, self.observationDimension)
Example #15
def build_graph():
    session = tf.Session()
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    writer = tf.summary.FileWriter("/home/drl/DRL/tensorflow-reinforce/tmp/")

    # Policy parameters for the exploration policy
    epsilon = 0.9
    target_update_rate = 0.1
    dqn_agent = DQNAgent(session,
                         optimizer,
                         q_network,
                         state_dim,
                         num_actions,
                         target_update_rate=target_update_rate,
                         summary_writer=writer)
    # Switch between greedy and exploratory policy
    exploration_policy = EpsilonGreedyPolicy(dqn_agent, num_actions, epsilon)
    # Always take greedy actions according to greedy policy
    greedy_policy = EpsilonGreedyPolicy(dqn_agent, num_actions, 1.0)

    # Sampler (collect trajectories using the present dqn agent)
    num_episodes = 10
    training_sampler = Sampler(exploration_policy,
                               env,
                               num_episodes=num_episodes)
    testing_sampler = Sampler(greedy_policy, env, num_episodes=5)

    # Initializing ReplayBuffer
    buffer_size = 100000
    replay_buffer = ReplayBuffer(buffer_size)

    return dqn_agent, training_sampler, testing_sampler, replay_buffer
Example #16
File: main.py Project: wentaoyuan/dqn
def main(args):
    agent = DQNAgent(args)
    if args.test:
        agent.restore(args.checkpoint)
        rewards = agent.evaluate(args.eval_episodes, args.final_epsilon)
        print('Reward mean: %f   std: %f' % (rewards.mean(), rewards.std()))
    else:
        agent.train(args)
Example #17
def load_from_checkpoint(checkpoint_fname):
    checkpoint_dict = torch.load(checkpoint_fname)

    agent = DQNAgent(checkpoint_dict['config'])
    agent.q_net.load_state_dict(checkpoint_dict['q_net'])
    agent.memory.buffer = checkpoint_dict['buffer']
    loss_hist = checkpoint_dict['loss_hist']
    avg_rewards = checkpoint_dict['avg_rewards']

    return agent, loss_hist, avg_rewards
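load_from_checkpoint implies a checkpoint dict with the keys 'config', 'q_net', 'buffer', 'loss_hist', and 'avg_rewards'. A minimal sketch of a matching save helper, assuming the agent exposes the same attributes it is restored into; the function name and the agent.config attribute are assumptions:

import torch

def save_to_checkpoint(agent, loss_hist, avg_rewards, checkpoint_fname):
    # Hypothetical counterpart to load_from_checkpoint: writes the same keys it reads.
    checkpoint_dict = {
        'config': agent.config,  # assumes the agent keeps its config around
        'q_net': agent.q_net.state_dict(),
        'buffer': agent.memory.buffer,
        'loss_hist': loss_hist,
        'avg_rewards': avg_rewards,
    }
    torch.save(checkpoint_dict, checkpoint_fname)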
Example #18
def main(args):
    with open(args.param, "r") as f:
        config = json.load(f)

    env = gym.make('Freeway-v0')
    env.seed(args.seed)
    env = FrameStack(env, config)

    print('State shape: ', env.observation_space.shape)
    print('Action shape: ', env.action_space.n)
    agent = DQNAgent(state_size=200,
                     action_size=env.action_space.n,
                     config=config)
    #agent_r.load("models-28_11_2020_22:25:27/2000-")
    env = gym.wrappers.Monitor(env,
                               "./vid",
                               video_callable=lambda episode_id: True,
                               force=True)
    #agent.qnetwork_local.load_state_dict(torch.load('checkpoint-score80.47156817885116_epi_125.pth'))
    agent.qnetwork_local.load_state_dict(
        torch.load('search_results/models/eval-{}/_q_net.pth'.format(
            args.agent)))
    agent.encoder.load_state_dict(
        torch.load('search_results/models/eval-{}/_encoder.pth'.format(
            args.agent)))
    n_episodes = 1
    max_t = 3000
    eps = 0
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)

            next_state, reward, done, _ = env.step(action)
            score += reward
            time.sleep(0.01)
            state = next_state
            env.render()
            if done:
                break
        print("Episode {}  Reward {} Steps {}".format(i_episode, score, t))
        env.close()
Example #19
def training(**kwargs):
    # Set logging level
    if kwargs['debug']:
        LOGGER.setLevel(logging.DEBUG)
    else:
        LOGGER.setLevel(logging.INFO)

    agent = DQNAgent(environment=env,
                     action_space=[0, 1, 2, 3, 4, 5, 6, 7],
                     NN_arch=kwargs['NN_arch'],
                     maxIters=kwargs['max_iters'],
                     eta=0.00001,
                     epsilon=0.4,
                     discount=0.95,
                     weights_dir=kwargs['weights_dir'],
                     mem_size=10**5)

    while True:
        agent.learn(replay=kwargs['replay'],
                    frame_skipping=kwargs['frame_skipping'],
                    batch_size=kwargs['batch_size'])
        if agent.numIters > agent.maxIters:
            break

    agent.save(agent.save_path % kwargs['max_iters'])
    # return the agent object
    return agent
Example #20
def train_model(max_episodes=50000):
    """
    Trains a DQN agent to play the CartPole game
    """
    agent = DQNAgent()
    buffer = ReplayBuffer()
    env = gym.make("CartPole-v0")
    for _ in range(100):
        collect_gameplay_experiences(env, agent, buffer)
    for epis in range(max_episodes):  # Train the agent for up to max_episodes episodes of the game
        collect_gameplay_experiences(env, agent, buffer)
        gameplay_experience_batch = buffer.sample_gameplay_batch()
        loss = agent.train(gameplay_experience_batch)
        avg_reward = evaluate_training_result(env, agent)
        if epis % 20 == 0:
            agent.update_target_network()
        print("Episode {}/{} and so far the performance is {} and loss is {}".
              format(epis, max_episodes, avg_reward, loss[0]))
    # env.close()
    print("Training Complete")
    play(env, agent)
Example #21
def run_eval(dir_name: str, episodes: int = 100, render: bool = False) -> List[int]:
    agent_conf = AgentConf()
    env = Tetris()
    agent = DQNAgent(env.get_state_size(),
                     n_neurons=agent_conf.n_neurons, activations=agent_conf.activations,
                     epsilon_stop_episode=agent_conf.epsilon_stop_episode, mem_size=agent_conf.mem_size,
                     discount=agent_conf.discount, replay_start_size=agent_conf.replay_start_size)

    # timestamp_str = "20190730-165821"
    # log_dir = f'logs/tetris-nn={str(agent_conf.n_neurons)}-mem={agent_conf.mem_size}' \
    #     f'-bs={agent_conf.batch_size}-e={agent_conf.epochs}-{timestamp_str}'

    # tetris-20190731-221411-nn=[32, 32]-mem=25000-bs=512-e=1 good

    log_dir = 'logs/' + dir_name

    # load_model
    agent.model = load_model(f'{log_dir}/model.hdf')
    agent.epsilon = 0
    scores = []
    for episode in range(episodes):
        env.reset()
        done = False

        while not done:
            next_states = env.get_next_states()
            best_state = agent.best_state(next_states.values())

            # find the action, that corresponds to the best state
            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break
            _, done = env.hard_drop([best_action[0], 0], best_action[1], render=render)
        scores.append(env.score)
        # print results at the end of the episode
        print(f'episode {episode} => {env.score}')
    return scores
Example #22
def train_speed_agent(coach):
    """
    takes caoch(lstm) to modify the
    target reward function for the 
    agent
    
    """
    score = 0
    coaching_score_keep = []
    coaching_episode_keep = []
    env = gym.make('CartPole-v0')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    coaching = DQNAgent(len(env.reset()), env.action_space.n)
    done = False
    batch_size = 32
    index = 0
    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            #env.render()
            action = coaching.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            index = index + 1
            coaching.remember(state, action, reward, next_state, done, index)
            score = score + reward
            success = determine_sucess(done, score)
            coaching.lstm_data(state, action, reward, next_state, done,
                               success)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, time, coaching.epsilon))
                coaching_score_keep.append(score)
                coaching_episode_keep.append(e)
                score = 0
                break
        if len(coaching.memory) > batch_size:
            coaching.replay(batch_size, coach)
    return coaching, coaching_score_keep, coaching_episode_keep
Example #23
def train_expert():
    """
    craetes an agent that is trained to an optimal policy
    and captures all values required to train lstm
    """
    agent_score_keep = []
    agent_episode_keep = []
    score = 0
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(len(env.reset()), env.action_space.n)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 32
    index = 0
    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            #                env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            a_reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            index = index + 1
            agent.remember(state, action, a_reward, next_state, done, index)
            score = score + reward
            success = determine_sucess(done, score)
            agent.lstm_data(state, action, reward, next_state, done, success)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, score, agent.epsilon))
                agent_score_keep.append(score)
                agent_episode_keep.append(e)
                score = 0
                break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size, None)
    return agent, agent_score_keep, agent_episode_keep
Example #24
    def create(name, env, max_schedule_time=20, verbose=False):
        """Static method to create an agents by name"""
        if name not in AgentFactory.available_agents():
            raise(Exception(f'Unsupported agent: {name}'))
        
        if name == 'baseline':
            from random_agent import RandomAgent
            return RandomAgent(env.action_space)

        if name == 'qlearning':
            from qlearning_td_agent import QLearningTDAgent
            return QLearningTDAgent(jobs_data=jobs_data, epsilon=.4,
                max_schedule_time=max_schedule_time, verbose=verbose)

        if name == 'dqn':
            from dqn_agent import DQNAgent
            return DQNAgent(env.observation_space, env.action_space,
                verbose=verbose)
        return None
Example #25
def process_conversation_POST(state_tracker_id, message):
    state_tracker = None

    if state_tracker_id in StateTracker_Container.keys():
        state_tracker = StateTracker_Container[state_tracker_id][0]
        confirm_obj = StateTracker_Container[state_tracker_id][1]
    else:
        # print("---------------------------------in model")
        state_tracker = StateTracker(database, constants)
        confirm_obj = None
        StateTracker_Container[state_tracker_id] = (state_tracker, confirm_obj)

    user_action, new_confirm_obj = process_message_to_user_request(
        message, state_tracker)
    print("-----------------------------------user action")
    print(user_action)
    # If this is a new user request, reset the state tracker and set confirm back to None
    if user_action['request_slots'] != {}:
        state_tracker.reset()
        confirm_obj = None
    # If there is a new confirm request, overwrite the old one
    if new_confirm_obj != None:
        confirm_obj = new_confirm_obj
    if user_action['intent'] not in ["hello", "other", "done"]:
        dqn_agent = DQNAgent(state_tracker.get_state_size(), constants)
        agent_act = get_agent_response(state_tracker, dqn_agent, user_action)
        StateTracker_Container[state_tracker_id] = (state_tracker, confirm_obj)
        agent_message = response_craft(agent_act, state_tracker, confirm_obj)
    else:
        # to prevent key error
        agent_act = {
            'intent': user_action['intent'],
            'request_slots': [],
            'inform_slots': []
        }
        agent_message = random.choice(
            response_to_user_free_style[user_action['intent']])
        # If the intent is "done", reset and set confirm back to None
        if user_action['intent'] == "done":
            state_tracker.reset()
            StateTracker_Container[state_tracker_id] = (state_tracker, None)
    return agent_message, agent_act
Example #26
def main():
    """
    Main method runs the whole experiment.
    """
    env = UnityEnvironment(file_name=ENV_PATH)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    print_env_state(env=env, brain_name=brain_name, brain=brain)

    agent = DQNAgent(state_size=PARAM.STATE_SIZE,
                     action_size=PARAM.ACTION_SIZE,
                     seed=0)
    # agent = DDQNAgent(state_size=PARAM.STATE_SIZE, action_size=PARAM.ACTION_SIZE, seed=0)
    # agent = DDQNAgentPrioExpReplay(state_size=PARAM.STATE_SIZE, action_size=PARAM.ACTION_SIZE, seed=0)

    if not TRAIN_MODE:
        load_model_into_agent(agent)

    scores = run_agent(agent=agent, env=env, brain_name=brain_name)
    save_score_plot(scores=scores)
Example #27
def experiment_wrapper(feed_units, i, num_episodes, randomize, env_type):
    from supervised_agent import SupervisedAgent
    from supervised_agent_one_step import SupervisedAgentOneStep
    from dqn_agent import DQNAgent
    from deep_exp_hyper_agent import DeepExpHyperAgent
    from deep_exp_agent import DeepExpAgent
    from deep_exp_ids_agent import DeepExpIDSAgent
    import numpy as np

    deep_exp_agents = []
    num_positive = 0
    for feed in feed_units:
        if feed.interest > 0:
            num_positive += 1
    for prior in range(0, 1):
        deep_exp_agents.append(
            DeepExpAgent(
                [k for k in range(len(feed_units))],
                'deep_exploration_{}_{}_{}'.format(num_positive,
                                                   len(feed_units), prior),
                prior_variance=10**prior,
            ))
    agents = ([
        SupervisedAgent(
            [k for k in range(len(feed_units))], 'supervised_{}_{}'.format(
                num_positive, len(feed_units))),
        DQNAgent([k for k in range(len(feed_units))], 'dqn_{}_{}'.format(
            num_positive, len(feed_units))),
    ] + deep_exp_agents)

    cumulative_reward = run_experiment(agents, feed_units, i, num_episodes,
                                       randomize, env_type)

    np.save(
        'ids_experiment_{}_{}_{}_{}_{}'.format(num_positive, len(feed_units),
                                               int(randomize), i, env_type),
        cumulative_reward)
Example #28
def _test(opponents, match_num=20, render=True):
    env = pommerman.make('PommeFFACompetition-v0', [])

    # Exploration strategy
    exp_schedule = LinearExploration(env, 0, 0, 1)

    # Learning rate schedule
    lr_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                 config.lr_nsteps)

    # Initialize agents.
    dqn_agent = DQNAgent(env, config, exp_schedule, lr_schedule, False)
    dqn_agent_index = _init_agents(env, exp_schedule, lr_schedule, opponents, dqn_agent)

    count = 0
    win = 0
    for _ in range(match_num):
        state = env.reset()

        done = False

        while not done:
            if render:
                env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            if reward[0] == 1:
                win += 1
                print('win at episode %d' % count)

            if reward[dqn_agent_index] == -1 and not done:
                # Stop the episode when the testing agent dies.
                done = True
        count += 1
    print(win / count)

    env.close()

Example #29
if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n
    actions = np.arange(nb_actions)
    policy = EpsGreedyQPolicy(eps=1., eps_decay_rate=.999, min_eps=.01)
    memory = Memory(limit=50000, maxlen=1)
    obs = env.reset()
    agent = DQNAgent(actions=actions,
                     memory=memory,
                     update_interval=200,
                     train_interval=1,
                     batch_size=32,
                     observation=obs,
                     input_shape=[len(obs)],
                     policy=policy,
                     obs_processer=obs_processer)

    agent.compile()

    result = []
    nb_epsiodes = 1000
    for episode in range(nb_epsiodes):
        agent.reset()
        observation = env.reset()
        observation = deepcopy(observation)
        agent.observe(observation)
        done = False
Example #30
    W1 = tf.get_variable("W1", [state_dim, 20],
                         initializer=tf.truncated_normal_initializer())
    b1 = tf.get_variable("b1", [20], initializer=tf.constant_initializer(0))
    h1 = tf.nn.relu(tf.matmul(states, W1) + b1)
    W2 = tf.get_variable("W2", [20, num_actions],
                         initializer=tf.truncated_normal_initializer())
    b2 = tf.get_variable("b2", [num_actions],
                         initializer=tf.constant_initializer(0))
    q = tf.matmul(h1, W2) + b2
    return q


dqn_agent = DQNAgent(q_session,
                     q_optimizer,
                     q_network,
                     state_dim,
                     num_actions,
                     target_update_rate=0.01,
                     summary_writer=q_writer,
                     summary_every=q_summary_every)

# Initializing ReplayBuffer
buffer_size = 100000
sample_size = 2**13
replay_buffer = ReplayBuffer(buffer_size)


# Training
def computing_probabilities(batch):
    probabilites = pg_reinforce.compute_action_probabilities(
        batch["next_states"])
    return probabilites
Example #31
    img.set_array(state_t_1)
    plt.axis("off")
    return img,


if __name__ == "__main__":
    # args
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model_path")
    parser.add_argument("-s", "--save", dest="save", action="store_true")
    parser.set_defaults(save=False)
    args = parser.parse_args()

    # environment, agent
    env = CatchBall()
    agent = DQNAgent(env.enable_actions, env.name)
    agent.load_model(args.model_path)

    # variables
    win, lose = 0, 0
    state_t_1, reward_t, terminal = env.observe()

    # animate
    fig = plt.figure(figsize=(env.screen_n_rows / 2, env.screen_n_cols / 2))
    fig.canvas.set_window_title("{}-{}".format(env.name, agent.name))
    img = plt.imshow(state_t_1, interpolation="none", cmap="gray")
    ani = animation.FuncAnimation(fig, animate, init_func=init, interval=(1000 / env.frame_rate), blit=True)

    if args.save:
        # save animation (requires ImageMagick)
        ani_path = os.path.join(
Example #32
import numpy as np

from catch_ball import CatchBall
from dqn_agent import DQNAgent


if __name__ == "__main__":
    # parameters
    n_epochs = 1000

    # environment, agent
    env = CatchBall()
    agent = DQNAgent(env.enable_actions, env.name)

    # variables
    win = 0

    for e in range(n_epochs):
        # reset
        frame = 0
        loss = 0.0
        Q_max = 0.0
        env.reset()
        state_t_1, reward_t, terminal = env.observe()

        while not terminal:
            state_t = state_t_1

            # execute action in environment
            action_t = agent.select_action(state_t, agent.exploration)
            env.execute_action(action_t)