Example #1
def agent_loop(dictionary, lock1, lock2):
    random.seed()
    environment = GridWorldModel()
    agent = Agent(environment)
    agent.Q = dictionary[Q_SHARED_KEY]  # initialize with shared Q

    while environment.step_count < MAX_STEPS_PER_AGENT:
        environment.reset()
        agent.state = environment.get_start_state()
        while True:
            agent.act()
            if environment.step_count % ASYNC_UPDATE_INTERVAL == 0 or environment.is_terminal_state():
                lock1.acquire()
                q = dictionary[Q_SHARED_KEY]
                # Need to write it back, otherwise the proxy won't pick up the changes.
                dictionary[Q_SHARED_KEY] = np.add(q, agent.dQ)
                lock1.release()
                agent.dQ = np.zeros((GridWorldModel.get_number_of_states(),
                                     GridWorldModel.get_number_of_actions()),
                                    dtype=float)
            if environment.is_terminal_state():
                break

    lock2.acquire()
    combined_rewards = dictionary[REWARDS_KEY]
    agents_rewards = np.array(agent.rewards)
    # ...same here
    dictionary[REWARDS_KEY] = np.add(combined_rewards,
                                     agents_rewards[:MAX_STEPS_PER_AGENT])
    lock2.release()
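
The write-back noted in the comment matters because a Manager dict is only a proxy: mutating the NumPy array in place would not be visible to other processes, so the updated array has to be reassigned to the key. A minimal launch sketch under that assumption (the worker count and the REWARDS_KEY shape are illustrative; the constants and GridWorldModel come from the snippet above):

import multiprocessing as mp
import numpy as np

if __name__ == '__main__':
    manager = mp.Manager()
    shared = manager.dict()
    shared[Q_SHARED_KEY] = np.zeros((GridWorldModel.get_number_of_states(),
                                     GridWorldModel.get_number_of_actions()), dtype=float)
    shared[REWARDS_KEY] = np.zeros(MAX_STEPS_PER_AGENT, dtype=float)
    q_lock, rewards_lock = mp.Lock(), mp.Lock()

    workers = [mp.Process(target=agent_loop, args=(shared, q_lock, rewards_lock))
               for _ in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()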
Example #2
def train():
    env = gym.make("CartPole-v1")
    input_space = env.observation_space.shape[0]
    output_space = env.action_space.n
    print(input_space, output_space)
    agent = Agent(input_space, output_space)
    run = 0
    x = []
    y = []
    while run < 100:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, -1])
        step = 0
        while True:
            step += 1  # more steps means the pole stayed upright longer, so the step count doubles as the score
            env.render()
            action = agent.act(state)
            state_next, reward, done, _ = env.step(action)
            reward = reward if not done else -reward  # if the pole fell, make the reward negative
            state_next = np.reshape(state_next, [1, -1])
            agent.add_data(state, action, reward, state_next, done)
            state = state_next
            if done:
                print("Run: " + str(run) + ", exploration: " +
                      str(agent.exploration) + ", score:" + str(step))
                x.append(run)
                y.append(step)
                break
            agent.train_from_buffer()  # train on every step
    plt.plot(x, y)
    plt.show()
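
The Agent class this loop relies on is not shown; a minimal stand-in that satisfies the calls used above (act, add_data, train_from_buffer and an exploration attribute) might look like the sketch below, with a random policy and a plain deque in place of the real Q-network:

import random
from collections import deque

class Agent:
    def __init__(self, input_space, output_space, buffer_size=10000):
        self.output_space = output_space
        self.exploration = 1.0          # the real agent would decay this over time
        self.buffer = deque(maxlen=buffer_size)

    def act(self, state):
        # placeholder: a DQN agent would act epsilon-greedily over predicted Q-values
        return random.randrange(self.output_space)

    def add_data(self, state, action, reward, state_next, done):
        self.buffer.append((state, action, reward, state_next, done))

    def train_from_buffer(self):
        # placeholder: sample a minibatch from self.buffer and fit the network here
        pass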
Example #3
    def run(self, agent: Agent.Agent):
        s = th.tensor(self.env.reset(), dtype=th.float)
        R = 0

        while True:
            self.env.render()

            a = agent.act(s)

            s_, r, done, info = self.env.step(a)
            s_ = th.tensor(s_, dtype=th.float)

            if done:  # terminal state
                s_ = None

            agent.observe((s, a, r, s_))
            agent.replay()

            s = s_
            R += r

            if done:
                break

        print("Total reward:", R)
Example #4
def run_dqn():
    total_reward_history = []
    # start Q-Learning
    agent = Agent(shape=state_shape, num_actions=num_actions)
    # Init memory
    state = env.reset()
    state = img_prcss.preprocess(image=state)
    print("Start adding memory")
    while not agent.memory.is_full():
        action = agent.act_randomly()
        next_state, reward, done, _ = env.step(action)
        next_state = img_prcss.preprocess(image=next_state)
        if done:
            next_state = None
        experience = (state, action, reward, next_state)
        agent.memory.add(experience)
        if done:
            state = env.reset()
        else:
            state = next_state
    print("memory is full")
    # init display
    img_states = env.render(mode='rgb_array')
    img = plt.imshow(img_states)  # only call this once
    for episode in range(NUM_EPISODES):
        # reset env
        state = env.reset()
        state = img_prcss.preprocess(image=state)
        total_reward = 0
        for time in range(NUM_STEPS):
            # 1: get action(t)
            action = agent.act(state, episode)
            # 2: action(t) -> {state(t+1)}
            next_state, reward, done, _ = env.step(action)
            next_state = img_prcss.preprocess(image=next_state)
            if done:
                next_state = None
            # 3: get reward(t)
            total_reward += reward
            # 4: Memory stored as (s(t), a(t), r(t), s(t+1))
            experience = (state, action, reward, next_state)
            agent.memory.add(experience)
            # 5: update target Q-network
            agent.update_target_network()
            # 6: replay experiences and update network weight
            agent.replay()
            # 7: save state
            state = next_state
            # ex: judge go to next episode
            if done:
                total_reward_history.append(total_reward)
                #plt.plot([ep for ep in range(episode+1)], total_reward_history)
                #plt.pause(0.001)
                # ex: logout
                print('Ep:', episode, ', Tm:', time, 'Rwd:', total_reward)
                #env.render()
                break  # go to next episode
            # ex: display
            img_states = env.render(mode='rgb_array')
            img.set_data(img_states)  # just update the data
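Example #5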
def train_agent(env: UnityEnvironment,
                brain_name: str,
                agent: Agent,
                n_episodes: int,
                max_steps: int = 1500) -> list:
    """
    Trains the agent for n episodes
    :param env:
    :param brain_name:
    :param agent:
    :param n_episodes: number of episodes to train
    :param max_steps: max amount of steps
    :return: returns an array containing the score of every episode
    """
    scores: list = []
    # store the last 100 scores into a queue to check if the agent reached the goal
    scores_window = deque(maxlen=100)

    for i_episode in range(1, n_episodes + 1):
        # reset the environment
        env_info = env.reset(train_mode=True)[brain_name]
        agent.reset()
        state = env_info.vector_observations[0]
        score = 0

        # the environment will end the episode after n steps, thus no manual termination of the episode is needed
        for a in range(max_steps):
            action: int = agent.act(state, add_noise=False)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            score += reward

            agent.step((state, action, reward, next_state, done))

            state = next_state
            if done:
                break

        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score

        # print('\rEpisode {}\tavg Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 10 == 0:
            print(
                f"""Episode {i_episode}: Average Score: {np.mean(scores_window):.2f}"""
            )

        if np.mean(scores_window) >= 30.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode, np.mean(scores_window)))
            torch.save(agent.actor_local.state_dict(), 'checkpoint-actor.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint-critic.pth')
            break
    return scores
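
A hedged usage sketch for train_agent, assuming the Unity ML-Agents (unityagents) Reacher build this kind of loop is usually paired with; the build path and the Agent constructor arguments are assumptions, not taken from the snippet:

from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="Reacher.app")   # path to the Unity build (assumed)
brain_name = env.brain_names[0]
env_info = env.reset(train_mode=True)[brain_name]
state_size = len(env_info.vector_observations[0])
action_size = env.brains[brain_name].vector_action_space_size
agent = Agent(state_size=state_size, action_size=action_size, random_seed=0)  # constructor assumed
scores = train_agent(env, brain_name, agent, n_episodes=300)
env.close()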
Example #6
class Train:
    def __init__(self):
        self.sample_batch_size = 128
        self.episodes = 10000
        self.state_size = 150
        self.action_size = 3
        self.agent = Agent(self.state_size, self.action_size)
        self.env = Env([0, 1, 2], [j for j in range(1, self.state_size)],
                       self.agent.x_train[0], self.agent.y_train[0])
        self.label_data = 1.0
        self.label_past = 1.0

    def run(self):
        try:
            for index_episode in range(self.episodes):
                #time_to_begin = random.randint(1,self.state_size)
                index_random_data = random.randint(0, 49)
                seq = self.agent.x_train[index_random_data]
                seq_label = self.agent.y_train[index_random_data]
                self.env.reset(seq, seq_label)
                done = False
                index = 1  #time_to_begin
                while not done and index <= 150:
                    state = self.env.get_sequence_state()
                    action = self.agent.act(state, index)
                    next_state, reward, done = self.env.step(action)
                    next_state = np.reshape(self.env.get_sequence_state(),
                                            (1, 150, 1, 1))
                    self.agent.remember(state, action, reward, next_state,
                                        done)
                    state = next_state
                    index += 1
                if (index_episode % 20 == 0):
                    print("Episode {}".format(index_episode))
                if index_episode % 100 == 0 and index_episode != 0:
                    acc, res, t = self.agent.compute_acc()
                    acc_val, res_val, t_val = self.agent.compute_acc_val()
                    print(
                        "acc_train {} ======> average_time_train {} ======> update {}"
                        .format(acc, np.mean(t), self.agent.update_number))
                    print(
                        "acc_val {} ======> average_time_val {} ======> update {}"
                        .format(acc_val, np.mean(t_val),
                                self.agent.update_number))
                    if acc > 0.9:
                        self.agent.save_weight()

                self.agent.replay(self.sample_batch_size)
                self.agent.target_train()
        finally:
            self.agent.save_model()
Example #7
class Environment(threading.Thread):
    stop_signal = False

    def __init__(self,
                 env,
                 render=False,
                 eps_start=EPS_START,
                 eps_end=EPS_STOP,
                 eps_steps=EPS_STEPS):
        threading.Thread.__init__(self)

        self.render = render
        self.env = env
        self.agent = Agent()

    def runEpisode(self):
        s = self.env.reset()

        R = 0
        while True:
            time.sleep(THREAD_DELAY)  # yield

            if self.render:
                self.env.render()

            a = self.agent.act(s)
            s_, r, done = self.env.step(a)

            if done:  # terminal state
                s_ = None

            self.agent.train(s, a, r, s_)

            s = s_
            R += r

            if done or self.stop_signal:
                break

        print("Total R:", R)

    def run(self):
        while not self.stop_signal:
            self.runEpisode()

    def stop(self):
        self.stop_signal = True
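
A sketch of how such Environment threads are typically orchestrated in an A3C setup; make_env() is a placeholder for the project's environment factory (the class above expects step() to return a 3-tuple), and the module constants EPS_START/EPS_STOP/EPS_STEPS/THREAD_DELAY are assumed to be defined:

import time

envs = [Environment(make_env()) for _ in range(4)]   # make_env() is hypothetical
for e in envs:
    e.start()        # threading.Thread.start() eventually calls run()

time.sleep(30)       # let the agents collect experience and train for a while

for e in envs:
    e.stop()         # raise the stop flag so runEpisode() exits
for e in envs:
    e.join()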
Example #8
def train_the_agent(env, n_episodes=400, max_t=700):
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    state_size = states.shape[1]
    action_size = brain.vector_action_space_size
    print(state_size,action_size)
    agent = Agent(state_size=state_size, action_size=action_size, random_seed=10,sigma=0.05)
    scores_deque = deque(maxlen=100)
    scores = []
    max_score = -np.Inf
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        agent.reset()
        score = 0
        while True:
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(i_episode, np.mean(scores_deque), score),
              end="")
        if i_episode % 100 == 0:
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
    print(scores)
    env.close()
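Example #9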
def watch_agent(env: UnityEnvironment, brain_name: str, agent: Agent) -> None:
    """
    Shows agent simulation
    :param env:
    :param brain_name:
    :param agent:
    :return:
    """
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]  # get the current state
    score = 0  # initialize the score

    while True:
        action = agent.act(state)
        env_info = env.step(action)[brain_name]  # send the action to the environment
        next_state = env_info.vector_observations[0]  # get the next state
        reward = env_info.rewards[0]  # get the reward
        done = env_info.local_done[0]  # see if episode has finished
        score += reward  # update the score
        state = next_state  # roll over the state to next time step
        if done:  # exit loop if episode finished
            break
    print(f"Agent achieved a score of {score}")
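Example #10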
env = Maze_Env(maze, display_width, display_height, MAX_MOVES)

#initialising agent
directory = 'maze2'
agent = Agent(env, alpha=0, dir=None)

#load the final policy
file = 'maze2/policy_final.pickle'
agent.set_policy(file)

#or load an intermediate Q-table
#with open('maze2/Q_table_1000.pickle', 'rb') as f:
#   Q = pickle.load(f)

#testing the agent
for i in range(10):  #running for 10 times
    state = env.reset()
    total_reward = 0
    while True:
        action = agent.act(state, test=True)
        #rendering the environment
        env.render(action)
        time.sleep(0.3)
        state, reward, done = env.step(action)
        total_reward += reward
        if done:
            env.render(action)
            time.sleep(2)
            print(total_reward)
            break
Example #11
episode = 0
running = True
isTrained = False
while running:
    episode += 1
    state = env.reset() #Reset enviroment
    state, _, done, info = env.step(0) #get game details
    while not done:
        beginState = state
        beginState = getState(getBoard(beginState), info["current_piece"], info["next_piece"])
        
        #output [column, rotation] eg. [0,0,0,1,0,0,0,0,0,0], [0,0,1,0]
        netOut = agent.act(beginState) #Act on last state
        print(netOut)
        curPiece = list(info["statistics"].keys()).index(info["current_piece"][0])
        actionArr = getActions(netOut, curPiece)
        #do actions based on what rotation and location network wants piece in
        for action in actionArr:
            _, _, done, info = env.step(action)
        
        #until next piece
        nextPiece = info["next_piece"]
        while info["current_piece"] != nextPiece:
            rawstate, reward, done, info = env.step(5) #Move down
        
        board = getBoard(rawstate)
        state = getState(board, info["current_piece"], info["next_piece"])
        
Example #12
from Agent import Agent
from Brain import Brain
from ConnectFourEnvironment import ConnectFourEnvironment

if __name__ == '__main__':
    env = ConnectFourEnvironment(play_with_rng=False)
    model = './yellow.h5'

    brain = Brain(model)
    agent = Agent(brain=brain)
    while True:
        s = env.reset()
        env.render()
        while not env.is_finished():
            if env.yellows_turn():
                a = agent.act(s)
            else:
                a = input("Choose action (0-6): ")
                a = int(a)

            s_prime, r, done = env.step(a)
            env.render()
            s = s_prime
Example #13
# Main training loop
totalSimSteps = 0
while totalSimSteps < max_steps:

    #Run episodes until the iteration simulation budget runs out
    iterSimSteps = 0
    while iterSimSteps < N:

        # Reset the simulation
        observation = sim.reset()

        # Simulate this episode until done
        while True:
            # Query the agent for action given the state observation
            action = agent.act(sess, observation)

            #Simulate using the action
            #Note: this tutorial does not repeat the same action for two steps,
            #unlike the Run.py script used for the ICML paper results.
            #Repeating the action for multiple steps seems to yield better exploration
            #in most cases, possibly because it reduces high-frequency action noise.
            nextObservation, reward, done, info = sim.step(action[0, :])

            # Save the experience point
            agent.memorize(observation, action, reward, nextObservation, done)
            observation = nextObservation

            # Bookkeeping
            iterSimSteps += 1
Example #14
    env_test = MarketEnv(df, seq_size, foreignScaler=env.scaler)

sess = tf.Session()
agent = Agent(sess, seq_size, n_features, hidden_size=16, a_size=3)
sess.run(tf.global_variables_initializer())

i = 0
reward_history = []
print('Sequence size: ' + str(seq_size))

while True:
    running_reward = 0
    s = env.reset()

    while True:
        a = agent.act(s)
        s_, r, done = env.step(a)

        td_error = agent.critic_learn(np.array([s]), r, np.array([s_]))
        agent.actor_learn(np.array([s]), a, td_error)

        s = s_
        running_reward += r

        if done:
            reward_history.append(running_reward)
            break

    if i % 100 == 0 and i > 0:
        reward_history = reward_history[-100:]
        last_100_episodes_mean = np.mean(reward_history)
Example #15
class Environment:
    '''
    This is the Environment class, it is responsible for 
    spawning a new thread and using an agent to act and train
    '''
    def __init__(self,
                 n_step,
                 gamma,
                 queue_sync,
                 queue_upd,
                 n_processors,
                 g_counter,
                 env,
                 isGlobal=False):  # removed network param
        from Network import Network
        # from   Agent   import Agent
        # import keras

        self.n_step = n_step
        self.gamma = gamma
        self.isGlobal = isGlobal
        self.queue_sync = queue_sync
        self.queue_upd = queue_upd
        self.n_processors = n_processors

        # global and local counter
        self.g_counter = g_counter
        self.counter = 0
        self.lock = Lock()

        # create new random seed for each child process
        np.random.seed()

        if isGlobal:
            self.network = Network(self.gamma, self.n_step, self.queue_sync,
                                   self.n_processors, self.isGlobal)
            self.run_sync_agents()
        else:
            self.run(env)

    # Synchronize the agents as long as they are training. This function is mainly
    # used by the global network.
    def run_sync_agents(self):
        while True:
            # start synchronization with the agents every 'x' timestep
            if self.g_counter.value % 500 == 0:
                while not self.queue_sync.empty():
                    _ = self.queue_sync.get()

                for _ in range(self.n_processors):
                    # send the global network's weights to the agents
                    weights = self.network.get_weights()
                    self.queue_sync.put(self.pickle_weights(weights))
                self.increment_global_counter()
                print('GLOBAL NET: Syncing weights to agents!')
                self.network.model.save('mario_model.h5')

            # update the global network's weights when there is d_w in the queue
            while not self.queue_upd.empty():
                # d_w is the change of the weights
                d_w = self.unpickle_weights(self.queue_upd.get())
                self.network.update_weights(d_w)
                print('GLOBAL NET: Updated weights from an agent!')

            # self.increment_global_counter()

    # initialize the network, environment and the agent and
    # then run the training.
    def run(self, env):
        self.network = Network(self.gamma, self.n_step, self.queue_sync,
                               self.n_processors, self.isGlobal)
        self.env = env
        self.agent = Agent(self.env, self.n_step, self.gamma, self.network)
        self.env.reset()

        self.run_episode()

    def run_episode(self):
        # Reset the env
        s = self.env.reset()
        self.init_env()

        # Reset the agent
        self.agent.init_frames()

        # Act and observe until were done
        while True:
            action_idx, action = self.agent.act()
            s_prim, reward, done, info = self.env.step(action)

            # process the rewards
            done, reward_processed = self.process_reward(reward, info, done)

            if done:
                s_prim = None

            onehot_action = np.zeros(len(self.agent.actions))
            onehot_action[action_idx] = 1
            has_updated = self.agent.train(s, onehot_action, reward_processed,
                                           s_prim)
            #s = s_prim

            if done:  # or self.stop_signal
                self.init_env()
                self.agent.init_frames()
                self.env.change_level(0)
                continue
            else:
                s = s_prim
                self.agent.next_frame(s)

            # increment the local and global counters after each step
            self.increment_local_counter()
            self.increment_global_counter()

            if has_updated:
                w = self.network.get_weights()
                delta_w = w - self.network.get_global_weights()
                self.queue_upd.put(self.pickle_weights(delta_w))

            if self.counter % 100 == 0:
                if not self.queue_sync.empty():
                    new_w = self.unpickle_weights(self.queue_sync.get())
                    self.network.set_weights(new_w)
                    self.network.set_global_weights(np.array(new_w))
                    self.counter = 0  # prevent unnecessarily large numbers

    def init_env(self):
        self.gameInfo = {
            'max_distance': 0,
            'time': 400,
            'score': 0,
            'staleness': 0
        }
        #s = self.env.reset()

    def get_state_dim(self):
        return (self.env.observation_space.shape[0],
                self.env.observation_space.shape[1])

    # increments the local counter
    def increment_local_counter(self):
        self.counter += 1

    # Increment the global counter
    def increment_global_counter(self):
        with self.g_counter.get_lock():
            self.g_counter.value = self.g_counter.value + 1

    # pickle the weights so they can be passed through a queue
    def pickle_weights(self, w):
        # use protocol=-1 to use the latest protocol
        return pickle.dumps(w, protocol=-1)

    def unpickle_weights(self, w):
        return pickle.loads(np.array(w))

    def process_reward(self, reward, info, done):
        ##### TODO: FIX REWARD WHEN RESPAWNING
        r = 0
        if 'distance' in info:

            if info['distance'] > self.gameInfo['max_distance']:
                self.gameInfo['max_distance'] = info['distance']
                self.gameInfo['staleness'] = 0
            else:
                self.gameInfo['staleness'] += 1

            r += reward * 0.5

            # Check time
            if info['time'] < self.gameInfo['time']:
                r -= 0.01

            # Check score
            r += (info['score'] - self.gameInfo['score']) * 0.0001  # tune

            if info['life'] == 0 or self.gameInfo['staleness'] > 200:  # tune staleness
                r -= 1
                done = True

            if done and info['distance'] > 0.97 * 3266:  # 3266 is max_distance @ level 1
                r += 1

            self.gameInfo['time'] = info['time']
            self.gameInfo['score'] = info['score']

            return done, r
        return done, r
Пример #16
0
                  mode=modes[mode],
                  gamma=0,
                  initialMean=-1 * np.ones(actionDim),
                  initialSd=0.25 * np.ones(actionDim),
                  H=5)  #in this simple problem, we may use a smaller H
    tf.global_variables_initializer().run(session=sess)
    agent.init(sess)  # must be called after TensorFlow global variables init

    #always use the same random seed so that all algorithms start from the same initial action distribution
    np.random.seed(0)

    #loop over training iterations
    for iter in range(nIter):
        print("Iter {}".format(iter))
        #query actions
        actions = agent.act(sess, dummyState)
        #compute rewards
        rewards = -np.sum(np.square(actions), axis=1)
        #make the agent memorize the episodes, each episode with just one action
        for idx in range(actions.shape[0]):
            agent.memorize(dummyState[idx], actions[idx, :], rewards[idx],
                           dummyState[idx], True)
        #update agent (trains the value function predictor and policy networks)
        agent.updateWithMemorized(sess)

        #visualize
        if (iter + 1) in plotIters:
            nCols = len(plotIters)
            plotIdx = plotIters.index(iter + 1)
            pp.subplot(nModes, nCols, plotIdx + 1 + mode * nCols)
            pp.cla()
Example #17
        obj = self.object_grid[pos[0]][pos[1]][0]
        self.object_grid[pos[0]][pos[1]][0] = 0
        return obj

    def let_object(self, pos, obj):
        if self.object_grid[pos[0]][pos[1]][0] == 0:
            self.object_grid[pos[0]][pos[1]][0] = copy.deepcopy(obj)
            return True
        else:
            return False

    def show(self):
        for i in self.object_grid:
            print([k[0] for k in i])


if __name__ == "__main__":
    g = Grid(50, 200, 200)
    agents = list()
    for i in range(20):
        a = Agent(10, 0.1, 0.3, 1, g)
        g.place_agent(a)
        agents.append(a)
    print(g.object_grid)
    for step in range(200000):
        for a in agents:
            a.act()
        if step % 1000 == 0:
            print("step {}".format(step))
            g.show()
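Example #18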
                  seed=seed,
                  lr=LR,
                  memory=memory,
                  update_every=UPDATE_EVERY,
                  batch_size=BATCH_SIZE,
                  gamma=GAMMA,
                  TAU=TAU,
                  device=device)

    for i_episode in range(1, n_episodes + 1):
        #state = env.reset()
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            #next_state, reward, done, _ = env.step(action)
            env_info = env.step(action.astype(int))[brain_name]
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            done = env_info.local_done[0]

            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
Example #19
                          batch_size=args.replay_size,
                          input_dimension=INPUT_SIZE_90X,
                          number_of_actions=NUMBER_OF_ACTIONS,
                          alpha=args.alpha,
                          load_weights=args.load)

    EPSILON = args.epsilon

    for episode in tqdm(range(args.ep)):
        state = Env.reset_scene()
        episode_rw = 0.0
        done = 0
        Agent.min_rw = 1000
        Agent.last_rw = 0
        for step in range(args.steps):
            action = Agent.act(state[3], EPSILON)
            vell = Agent.action_to_vel(action)
            reward, next_state = Env.do_step(vell, args.model)
            ##############################
            #print(reward)
            Agent.last_rw = reward
            Agent.min_rw = reward if (reward < Agent.min_rw) else Agent.min_rw
            ##############################
            episode_rw += reward
            done = Env.done()
            if done: break
            Agent.write_memory(state[3], action, reward, done, next_state[3])
            state = next_state

        if len(Agent.memory) >= int(Agent.BATCH_SIZE):
            evall = Agent.replay(args.gamma, args.epochs)
Example #20

# scores = ddpg()
# assert False

agent.actor_local.load_state_dict(torch.load('actor4850_1.pth'))
# agent.critic_local.load_state_dict(torch.load('critic1.pth'))

state_list = np.load('init_state.npy')
fuel_list = []
for ep in range(500):
    total_reward = 0
    fuel = 0
    # state = state_list[ep]
    # state = env.reset(state=state, set_state=True)
    state = env.reset()
    for t in range(200):
        action = agent.act(state, add_noise=False)
        # print(action, type(action))  # debug output
        # assert False                 # debug stop, left disabled so the evaluation loop runs
        fuel += abs(action)
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    print(t, total_reward)
    if t == 199:
        fuel_list.append(fuel)
# np.save('init_state.npy', np.array(state_list))
print(len(fuel_list) / 500, np.mean(fuel_list))
env.close()
Example #21
batch_size = 32
agent = Agent(window_size, batch_size)
data = getStockData("LT.NS")
l = len(data) - 1

episode_count = 200
Buy, Sell, Rewards, Total_Profit = [], [], [], []
Buy_t, Sell_t, Rewards_t, Total_Profit_t = [], [], [], []
for e in tqdm.tqdm(range(episode_count)):
    # print("Episode " + str(e) + "/" + str(episode_count))
    state = getState(data, 0, window_size + 1)
    agent.inventory = []
    total_profit = 0
    done = False
    for t in range(l):
        action = agent.act(state)
        action_prob = agent.actor_local.model.predict(state)

        next_state = getState(data, t + 1, window_size + 1)
        reward = 0
        if action == 1:
            agent.inventory.append(data[t])
            # print("Buy:" + formatPrice(data[t]))
        elif action == 2 and len(agent.inventory) > 0:  # sell
            bought_price = agent.inventory.pop(0)
            reward = max(data[t] - bought_price, 0)
            total_profit += data[t] - bought_price
            # print("sell: " + formatPrice(data[t]) + "| profit: " +
            #       formatPrice(data[t] - bought_price))

        if t == l - 1:
Example #22
def main():
    logger.configure('./{}_logs'.format(C['env_id']))
    for k, v in C.items():
        logger.record_tabular(k, v)
    logger.dump_tabular()

    train_tracker = [0.0]
    eval_tracker = []
    best_reward = 0

    sess = tf.InteractiveSession()
    train_reward = tf.placeholder(tf.float32, name='train_reward')
    eval_reward = tf.placeholder(tf.float32, name='eval_reward')

    train_env = make_env(C['env_id'], C['noop_max'])
    eval_env = make_env(C['env_id'], C['noop_max'])
    agent = Agent(train_env, C)
    sess.run(tf.global_variables_initializer())
    agent.nn.update_target()

    train_summary = tf.summary.scalar('train_rew', train_reward)
    eval_summary = tf.summary.scalar('eval_reward', eval_reward)
    writer = tf.summary.FileWriter('{}{}_summary'.format('./', C['env_id']),
                                   sess.graph)

    train_fs = reset_fs()
    train_s = train_env.reset()
    for it in range(C['iterations']):
        # Training
        train_fs.append(train_s)
        train_a = agent.act(np.transpose(train_fs, (1, 2, 0)))
        ns, train_r, train_d, _ = train_env.step(train_a)
        train_tracker[-1] += train_r
        agent.perceive(train_s, train_a, train_r, float(train_d), it)
        train_s = ns
        if train_d:
            if train_env.env.env.was_real_done:
                if len(train_tracker) % 100 == 0:
                    summary = sess.run(train_summary,
                                       feed_dict={
                                           train_reward:
                                           np.mean(train_tracker[-100:])
                                       })
                    writer.add_summary(summary, it)
                    logger.record_tabular('steps', it)
                    logger.record_tabular('episode', len(train_tracker))
                    logger.record_tabular('epsilon', 100 * agent.epsilon)
                    logger.record_tabular('learning rate', agent.lr)
                    logger.record_tabular('mean 100 episodes',
                                          np.mean(train_tracker[-100:]))
                    logger.dump_tabular()
                train_tracker.append(0.0)
            train_fs = reset_fs()
            train_s = train_env.reset()

        # Evaluation
        if it % C['eval_freq'] == 0:
            for _ in range(C['eval_episodes']):
                temp_video = []
                temp_reward = 0
                eval_tracker.append(0.0)
                eval_fs = reset_fs()
                eval_s = eval_env.reset()
                while True:
                    temp_video.append(eval_s)
                    eval_fs.append(eval_s)
                    eval_a = agent.greedy_act(np.transpose(eval_fs, (1, 2, 0)))
                    eval_s, eval_r, eval_d, _ = eval_env.step(eval_a)
                    eval_tracker[-1] += eval_r

                    if eval_env.env.env.was_real_done:
                        break
                    if eval_d:
                        eval_fs = reset_fs()
                        eval_s = eval_env.reset()

                if eval_tracker[-1] > best_reward:  # Save best video
                    best_reward = eval_tracker[-1]
                    logger.log(
                        'Dump best video reward: {}'.format(best_reward))
                    best_video = temp_video
                    with open('video.pkl', 'wb') as f:
                        pickle.dump(best_video,
                                    f,
                                    protocol=pickle.HIGHEST_PROTOCOL)

            logger.log(
                'Evaluate mean reward: {:.2f}, max reward: {:.2f}, std: {:.2f}'
                .format(np.mean(eval_tracker[-C['eval_episodes']:]),
                        np.max(eval_tracker[-C['eval_episodes']:]),
                        np.std(eval_tracker[-C['eval_episodes']:])))
            summary = sess.run(eval_summary,
                               feed_dict={
                                   eval_reward:
                                   np.mean(eval_tracker[-C['eval_episodes']:])
                               })
            writer.add_summary(summary, it)

    agent.nn.save('./{}_model'.format(C['env_id']))
Example #23
def main(env_name, mode, learning_rate, ppo_epsilon, ppo_ent_l_w, max_steps,
         iter_steps, render, batch_size, history_buffer_size, n_updates,
         verbose, run_suffix):
    suffix = '%s-%s-batch_size=%d,iter_steps=%d' % (mode, env_name, batch_size,
                                                    iter_steps)
    if mode == "PPO":
        suffix += '-epsilon=%.3f-ppo_ent_l_w=%.2f' % (ppo_epsilon, ppo_ent_l_w)
    else:
        suffix += '-H=%d' % history_buffer_size

    print('Starting run for the settings %s' % suffix)

    logger.configure(dir='%s-%s' % (suffix, run_suffix))

    # Init tensorflow
    sess = tf.InteractiveSession()

    # Create environment
    sim = gym.make(env_name)

    # Create the agent
    agent = Agent(
        mode=mode,
        stateDim=sim.observation_space.low.shape[0],
        actionDim=sim.action_space.low.shape[0],
        actionMin=sim.action_space.low,
        actionMax=sim.action_space.high,
        learningRate=learning_rate,
        PPOepsilon=ppo_epsilon,
        PPOentropyLossWeight=ppo_ent_l_w,
        H=history_buffer_size,
        useScaler=True  # makes the agent try to normalize the scale of state observations
    )

    # Finalize initialization
    tf.global_variables_initializer().run(session=sess)
    # print("Initializing agent")
    agent.init(sess)  # Should only be called after the global variables initializer above

    # How many simulation steps to use the same action (larger values than 1 seem to help in MuJoCo agent exploration)
    actionRepeat = 2

    # Main training loop
    totalSimSteps = 0
    nextObservation = None

    iteration = 0
    while totalSimSteps < max_steps:
        #Counter for total simulation steps taken in this iteration
        nSimSteps = 0

        # A list to hold the experience trajectories
        trajectories = []

        #run episodes until budget runs out, computing the average episode reward
        nEpisodes = 0
        averageEpisodeReward = 0
        # print("Collecting experience...")
        while nSimSteps < iter_steps:
            # Reset the episode
            observation = sim.reset()
            done = False
            episodeReward = 0

            # List to hold the experience of this episode
            trajectory = []

            # Simulate this episode until done
            while not done:
                # Query the agent for action
                action = agent.act(sess, observation)

                # Simulate using the action, repeating the same action for actionRepeat steps.
                # Also, compute the total reward received.
                reward = 0
                for _ in range(actionRepeat):
                    nextObservation, stepReward, done, info = sim.step(
                        action[0, :])

                    # Render only the first few episodes of each iteration, if enabled
                    if render and nEpisodes < 5:
                        sim.render()

                    nSimSteps += 1
                    totalSimSteps += 1
                    reward += stepReward
                    episodeReward += stepReward
                    if done:
                        break

                # Save the experience point
                e = Experience(observation, action, reward, nextObservation,
                               done)
                trajectory.append(e)
                observation = nextObservation

            # Episode done, bookkeeping
            trajectories.append(trajectory)
            averageEpisodeReward += episodeReward
            nEpisodes += 1

        #All episodes of this iteration done, print results and update the agent
        averageEpisodeReward /= nEpisodes
        iteration += 1
        print('================ Iteration %d ================' % iteration)
        logger.record_tabular("Total iterations", iteration)
        logger.record_tabular("Total timesteps", totalSimSteps)
        logger.record_tabular("Episode reward mean", averageEpisodeReward)
        logger.record_tabular("Average policy std",
                              agent.getAverageActionStdev())
        logger.dump_tabular()
        agent.update(sess,
                     trajectories,
                     batchSize=batch_size,
                     nBatches=n_updates,
                     verbose=verbose)
    sess.close()
    print('Finished run for the settings %s' % suffix)
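
The Experience record used in the inner loop is not defined in this snippet; a namedtuple along the following lines would satisfy the call site (the field names are assumptions):

from collections import namedtuple

# five positional fields, matching Experience(observation, action, reward, nextObservation, done)
Experience = namedtuple("Experience",
                        ["observation", "action", "reward", "nextObservation", "done"])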
Example #24
class Learning(object):
    """ docstring for Learning """
    def __init__(self,
                 number_of_actions,
                 input_dimension,
                 load,
                 batch_size=25,
                 episodes=10,
                 max_steps=100,
                 epsilon=0,
                 gamma=0.0,
                 alpha=0.0,
                 epsilon_decay=1.0,
                 episodes_decay=30,
                 epochs=1):
        self.episodes = episodes
        self.max_steps = max_steps
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon_decay = epsilon_decay
        self.episodes_decay = episodes_decay
        self.epochs = epochs
        self.agent = Agent(number_of_actions, input_dimension, batch_size,
                           self.alpha, load, 'model_weights.h5')
        self.analyzer = Results()

    """ append a new action in the memory, in form of a tuple, for further replay with a batch """

    def write_memory(self, memory, state_list, action, reward, next_state_list,
                     is_done):
        memory.append((state_list, action, reward, next_state_list, is_done))

    """ replays the memory in a batch, learning from past actions to maximize reward """

    def replay(self):

        mini_batch = random.sample(self.agent.memory,
                                   int(self.agent.batch_size))

        fit = None
        for state_list, action, reward, next_state_list, done in mini_batch:
            target = reward
            if not done:
                target = (reward + self.gamma * (np.amax(
                    self.agent.model.predict([
                        next_state_list[0][1].reshape(
                            1, self.agent.input_dimension,
                            self.agent.input_dimension, 1), next_state_list[1]
                        [1].reshape(1, self.agent.input_dimension,
                                    self.agent.input_dimension, 1),
                        next_state_list[2][1].reshape(
                            1, self.agent.input_dimension,
                            self.agent.input_dimension, 1)
                    ])[0])))

            target_f = self.agent.model.predict([
                state_list[0][1].reshape(1, self.agent.input_dimension,
                                         self.agent.input_dimension, 1),
                state_list[1][1].reshape(1, self.agent.input_dimension,
                                         self.agent.input_dimension, 1),
                state_list[2][1].reshape(1, self.agent.input_dimension,
                                         self.agent.input_dimension, 1)
            ])
            target_f[0][action] = target
            fit = self.agent.model.fit([
                state_list[0][1].reshape(1, self.agent.input_dimension,
                                         self.agent.input_dimension, 1),
                state_list[1][1].reshape(1, self.agent.input_dimension,
                                         self.agent.input_dimension, 1),
                state_list[2][1].reshape(1, self.agent.input_dimension,
                                         self.agent.input_dimension, 1)
            ],
                                       target_f,
                                       epochs=self.epochs,
                                       verbose=0)

        if fit is None:
            return 0
        else:
            return fit

    """ main loop for the learning itself """

    def run(self):
        print("starting everything")
        for episode in range(self.episodes):
            self.agent.controller.start_sim()
            sleep(0.5)
            now = datetime.now()
            #print(str(now) + " starting ep " + str(episode+1) + "\n")
            init = time.time()

            state_list = []
            self.agent.instant_reward = 0.0
            state_list.append(self.agent.vision.get_image(
                1))  #state = (resolution, grayscale, colored RGB)
            state_list.append(self.agent.vision.get_image(
                2))  #state = (resolution, grayscale, colored RGB)
            state_list.append(self.agent.vision.get_image(
                3))  #state = (resolution, grayscale, colored RGB)

            steps_done = None
            for step in range(self.max_steps):
                steps_done = step

                action_taken = self.agent.act(state_list[0], state_list[1],
                                              state_list[2], self.epsilon)
                next_state1, next_state2, next_state3, reward, done = self.agent.do_step(
                    action_taken)  # the images are captured inside do_step
                self.agent.instant_reward += reward
                self.write_memory(self.agent.memory, state_list, action_taken,
                                  reward,
                                  [next_state1, next_state2, next_state3],
                                  done)
                state_list[0] = next_state1
                state_list[1] = next_state2
                state_list[2] = next_state3

                if done:
                    break

            self.analyzer.steps_list.append(step + 1)

            end = time.time()
            self.agent.controller.stop_sim()
            sleep(0.5)
            evall = None
            if len(self.agent.memory) > int(self.agent.batch_size):
                rep_init = time.time()
                evall = self.replay()
                rep_end = time.time()
                now = datetime.now()
                self.analyzer.mse_values.append(
                    evall.history['mean_squared_error'])
                print(
                    str(now) + " mse value: " +
                    str(round(evall.history['mean_squared_error'][0], 2)) +
                    " loss: " + str(round(evall.history['loss'][0], 4)) +
                    " replay " + str(round((rep_end - rep_init) / 60.0, 2)) +
                    " minutes")

            self.analyzer.rewards_list.append(self.agent.instant_reward)
            self.agent.cummulative_reward += self.agent.instant_reward

            if episode > 0 and (episode % self.episodes_decay == 0):
                self.epsilon *= self.epsilon_decay
                now = datetime.now()
                #print str(now) + " epsilon decay"

            if episode > 0 and episode % 10 == 0:
                now = datetime.now()
                #print str(now) + " weights backup..."
                self.agent.model.save_weights('model_weights.h5')

            now = datetime.now()
            print(
                str(now) + " duration " + str(round((end - init) / 60.0, 2)) +
                " min //  ep " + str(episode + 1) + "/" + str(self.episodes) +
                " // steps " + str(step) + " // reward " +
                str(round(self.agent.instant_reward, 2)))

            self.agent.step_lost_counter = 0

        self.agent.controller.stop_sim()
        self.agent.controller.close_connection()
        cv.destroyAllWindows()

        now = datetime.now()
        self.agent.model.save_weights('model_weights.h5')

        now = datetime.now()

        os.chdir("logs")
        dirr = str(now)
        os.mkdir(dirr)

        file = open(os.path.join(os.getcwd(), dirr, "data.txt"), 'w')
        file.write(str(self.analyzer.rewards_list))
        file.write(str(self.analyzer.steps_list))
        file.write(str(self.analyzer.mse_values))
        file.close()

        self.analyzer.plot_media_n(self.analyzer.rewards_list,
                                   self.analyzer.reward_fig, dirr, 10,
                                   "REWARDxEP(media)",
                                   "Reward Media x 10 Episodio",
                                   "Reward Media")
        self.analyzer.plot_raw(self.analyzer.rewards_list,
                               self.analyzer.reward_fig, dirr, "REWARDxEP",
                               "Reward x Episodio", "Reward")
        self.analyzer.plot_raw(self.analyzer.steps_list,
                               self.analyzer.steps_fig, dirr, "STEPS",
                               "Steps Gastos x Episodio", "Steps")
        self.analyzer.plot_raw(self.analyzer.mse_values,
                               self.analyzer.mse_fig,
                               dirr,
                               "MSE",
                               "Mean Squared Error x Episodio",
                               "Valor MSE",
                               normalize=True)
Example #25
class Simulator(object):
    def __init__(self, env, display=True, log_metrics=False, filename="sim"):
        self.env = env
        self.agent = Agent(env.observation_space.shape[0], env.action_space.n)
        self.testing = False
        self.log_metrics = log_metrics
        self.display = display

        if self.log_metrics:
            self.log_filename = os.path.join("logs",
                                             filename + "_cartpole.csv")
            self.log_fields = [
                'episode', 'testing', 'net_reward', 'epsilon', 'gamma', 'alpha'
            ]
            self.log_file = open(self.log_filename, 'w', newline='')
            self.log_writer = csv.DictWriter(self.log_file,
                                             fieldnames=self.log_fields)
            self.log_writer.writeheader()

    def log_trial(self, episode, net_reward):
        if self.log_metrics:
            self.log_writer.writerow({
                'episode': episode,
                'testing': self.testing,
                'net_reward': net_reward,
                'alpha': self.agent.learn_rate,
                'epsilon': self.agent.epsilon,
                'gamma': self.agent.gamma
            })

    def run(self, episodes=5000, n_test=0):
        state = self.env.reset()

        for e in range(episodes):
            state = self.env.reset()
            state = np.reshape(state, [1, 4])
            net_reward = 0.0

            if (e % 100) == 0:
                display = True
            else:
                display = False

            for time_t in range(5000):
                if display:
                    self.env.render()
                action = self.agent.act(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, 4])
                self.agent.remember(state, action, reward, next_state, done)
                state = next_state

                net_reward += reward

                if done:
                    print("episode: {}/{}, e = {}, score = {}".format(
                        e, episodes, self.agent.epsilon, time_t))
                    break
            if e > 32 and self.agent.epsilon > 0.0:
                self.agent.learn(batch_size=32)
            elif self.agent.epsilon == 0.0 and not self.testing:
                self.testing = True
            self.log_trial(e, net_reward)
        if self.log_metrics:
            self.log_file.close()
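
A short usage sketch for the Simulator above, assuming a gym CartPole-v1 environment (consistent with the hard-coded [1, 4] state reshape and the '_cartpole.csv' log name):

import gym

sim = Simulator(gym.make("CartPole-v1"), display=False, log_metrics=True, filename="dqn_run")
sim.run(episodes=1000)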
Example #26
            state = preprocess(game.get_state().screen_buffer)

    for epoch in range(EPOCHS):
        print("\n\nEpoch %d\n-------" % (epoch + 1))

        train_episodes_finished = 0
        train_scores = []

        print("Training...")
        game.new_episode()

        episode_buffer = []
        agent.reset_cell_state()
        state = preprocess(game.get_state().screen_buffer)
        for learning_step in trange(STEPS_PER_EPOCH, leave=False):
            action = agent.act(state)
            reward = game.make_action(actions[action], FRAME_REPEAT)
            done = game.is_episode_finished()
            if not done:
                state_new = preprocess(game.get_state().screen_buffer)
            else:
                state_new = None

            agent.add_transition(state, action, reward, state_new, done)
            state = state_new

            if learning_step % UPDATE_FREQUENCY == 0:
                agent.learn_from_memory()
                updateTarget(targetOps, SESSION)

            if done:
Example #27
from World import World
from Agent import Agent

world = World()
agent = Agent(world)

for e in range(5000):
    done = False
    result = 0
    while not done:
        done, result = agent.act()
    agent.update_policy(result)
    agent.reset()
    #
    # if result > 0:
    #     print("Agent has won :)")
    # else:
    #     print("Agent has lost :(")

agent.display_policy()
Example #28
    SESSION.run(init)

##########################################

if not SKIP_LEARNING:
    time_start = time()
    print("\nFilling out replay memory")
    updateTarget(targetOps, SESSION)

    agent.reset_cell_state()
    state = game.get_state()
    for _ in range(RANDOM_WANDER_STEPS):
        if not LOAD_MODEL:
            action = agent.random_action()
        else:
            action = agent.act(game.get_last_action(), state)
        img_state, reward, done = game.make_action(action)
        if not done:
            state_new = img_state
        else:
            state_new = None

        agent.add_transition(state, action, reward, state_new, done)
        state = state_new

        if done:
            game.reset()
            agent.reset_cell_state()
            state = game.get_state()

    max_avgR = -10000.0
Example #29
MAX_EPISODES = 200


def plot_results():
    plt.subplot(2, 1, 1)
    plt.plot(np.cumsum(np.array(agent.rewards)))
    plt.ylabel('Cumulative Rewards')
    plt.xlabel('Steps')
    plt.subplot(2, 1, 2)
    plt.plot(np.array(agent.number_of_steps_til_reward))
    plt.ylabel('# Steps to Reward')
    plt.xlabel('Episodes')
    plt.show()


# Iterate through a number of episodes (set by MAX_EPISODES) and plot the results.

if __name__ == '__main__':
    environment = GridWorldModel()
    agent = Agent(environment)

    for _ in itertools.repeat(None, MAX_EPISODES):
        environment.reset()
        agent.state = environment.get_start_state()
        while True:
            agent.act()
            if environment.is_terminal_state():
                break

    plot_results()
Example #30
hyp = np.log(np.array([1, 1, 10]))
cov = NormalARD()
gp = GaussianProcess(lik, hyp, cov)
gp2 = GaussianProcess(lik, hyp, cov)

sig = np.ones((3,)) * 0.001
sig2 = np.ones((3,)) * 0.1
start_z = np.array([[0., 0., 0.]])
agent = Agent(gp, reward, sig, start_z)
agent2 = Agent(gp2, reward, sig2, start_z)
fig = plt.figure(figsize=(20,7), dpi=300)
zlim = (-10, 10, -10, 10)
for i in xrange(0, 1000):
    agent.observe()
    agent.decide()
    agent.act()
    agent2.observe()
    agent2.decide()
    agent2.act()

    t = agent.gp.Z[-1].flatten()[-1]
    a = [0] * 4
    a[0] = agent.gp.Z[-1].flatten()[0]
    a[1] = agent.gp.Z[-1].flatten()[1]
    a[2] = agent2.gp.Z[-1].flatten()[0]
    a[3] = agent2.gp.Z[-1].flatten()[1]
    extent = np.max(np.abs(a))
    lim = extent + 3 if extent > 10 else 10
    zlim = (-lim, lim, -lim, lim)
    fig.clf()
    ax1 = fig.add_subplot(1, 3, 1)
Example #31
print("Generating " + str(number_of_samples) + " samples for training")

for sample in tqdm(range(number_of_samples)):
    initial_pos = [np.random.randint(0, 360) for i in range(6)]
    Agent.controller.set_positions(Agent.handlers, initial_pos)

    sleep(0.08)

    initial_states = []
    initial_states.append(Agent.vision.get_image(sensor_number=1))
    initial_states.append(Agent.vision.get_image(sensor_number=2))
    initial_states.append(Agent.vision.get_image(sensor_number=3))

    action = Agent.act(initial_states[0],
                       initial_states[1],
                       initial_states[2],
                       epsilon=11)
    new_state1, new_state2, new_state3, reward, done = Agent.do_step(action)
    new_states = [new_state1, new_state2, new_state3]

    writer.writerow([action, reward, done])

    for image in range(len(new_states)):
        cv.imwrite(
            "/media/leonardo/Seagate Expansion Driver/DTASET_IC/init/" +
            str(sample + 1) + "_" + str(image + 1) + ".png",
            initial_states[image][1])
        cv.imwrite(
            "/media/leonardo/Seagate Expansion Driver/DTASET_IC/end/" +
            str(sample + 1) + "_" + str(image + 1) + ".png",
            new_states[image][1])