def sim_test():
    global env_name, save_name, agent_args, env_real, env_sim, nets
    env_real = env_real.Env_real(False)
    env_sim = env_sim.Env_sim(True)
    GAT_model = nets.GAT_net(env_real, env_sim, GAT_args)
    agent = Agent(env_sim, agent_args)

    episodes = int(10)
    max_steps = 2000

    for episode in range(episodes):
        state = env_sim.reset()
        done = False
        score = 0
        step = 0
        while not done and step <= max_steps:
            action, clipped_action, value, cost_value = agent.get_action(
                state, False)
            # action transformer by GAT
            transformed_next_state = GAT_model.forward_transform(
                state, clipped_action)
            transformed_action = GAT_model.backward_transform(
                state, transformed_next_state)
            state, reward, done, info = env_sim.step(transformed_action)
            print(reward, '\t', info.get('cost', 0))
            score += reward
            step += 1
        print("score :", score)
Example #2
    def thread_func(t_idx):
        global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
                loss_logger, score_logger, graph
        env = gym.make(env_name)
        agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
        step = 0
        episode = 0

        while total_step < total_max_step:
            episode += 1
            #gradient reset & parameter synchronize
            agent.update_parameter(global_agent)
            ###
            start_step = step
            states = []
            actions = []
            rewards = []
            score = 0
            cnt = 0
            state = env.reset()
            while True:
                cnt += 1
                step += 1
                total_step += 1
                action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(action)
                ####### modify reward function #######
                #reward = 200-cnt if done else 0
                reward += 10
                ####### modify reward function #######
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                score += reward
                if done or step - start_step == step_period:
                    ret = 0 if done else agent.get_value(next_state)
                    targets = []
                    for i in range(len(states)):
                        ret = rewards[-i - 1] + gamma * ret
                        targets.append(ret)
                    targets = targets[::-1]
                    p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                        states, actions, targets)
                    global_agent.update_with_gradients(p_grad, v_grad)
                    #loss_logger.write([step-start_step,p_loss,v_loss])
                    if done:
                        break
                    agent.update_parameter(global_agent)
                    start_step = step
                    states = []
                    actions = []
                    rewards = []
                state = next_state
            #score_logger.write([cnt, score])
            if t_idx == 0:
                print(score)
                graph.update(score, p_loss, v_loss, entropy)
                if episode % 100 == 0: global_agent.save()
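
The target computation in thread_func builds n-step discounted returns by walking the stored rewards backwards and then reversing the list. The same computation, factored into a standalone helper for clarity (a sketch equivalent to the loop above, not code from the repository):

def discounted_targets(rewards, bootstrap_value, gamma):
    # Accumulate gamma-discounted returns from the end of the segment,
    # then flip so targets[i] lines up with states[i].
    ret = bootstrap_value
    targets = []
    for r in reversed(rewards):
        ret = r + gamma * ret
        targets.append(ret)
    return targets[::-1]

# e.g. discounted_targets([1.0, 1.0, 1.0], 0.0, 0.99) -> [2.9701, 1.99, 1.0]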
Example #3
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    if env_name == 'DobroHalfCheetah-v0':
        env.unwrapped.initialize(is_render=False)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    p_loss_logger = Logger(save_name, 'p_loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), agent.name)
    episodes = int(5e5)
    save_freq = 1

    save_period = 1000
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period)

    for episode in range(episodes):
        state = env.reset()
        agent.actor_noise.reset()
        done = False
        score = 0
        step = 0

        while not done:
            step += 1
            action = agent.get_action(state, True)
            next_state, reward, done, info = env.step(action)
            agent.replay_memory.append([
                np.array(state, np.float32), action, reward, done,
                np.array(next_state, np.float32)
            ])
            ########################

            if len(agent.replay_memory) > agent.train_start:
                v_loss, p_loss = agent.train()
                v_loss_logger.write([1, v_loss])
                p_loss_logger.write([1, p_loss])
                p_losses.append(p_loss)
                v_losses.append(v_loss)
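                # note: agent.get_value(state, action) is a Q-value estimate; it is appended
                # to the 'entropies' buffer, so the last curve passed to graph.update() is
                # mean Q rather than an entropy.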
                value = agent.get_value(state, action)
                entropies.append(value)
                scores.append(reward)
                graph.update(np.mean(scores), np.mean(p_losses),
                             np.mean(v_losses), np.mean(entropies))
            state = next_state
            score += reward

        print(episode, score, agent.epsilon)
        score_logger.write([step, score])
        if (episode + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            p_loss_logger.save()
            score_logger.save()

    graph.update(0, 0, 0, 0, finished=True)
Example #4
def test():
    agent = Agent(env)
    agent.epsilon = 0.01
    action_low = env.action_space.low[0]
    action_high = env.action_space.high[0]
    episodes = int(1e6)
    avg_Q = deque(maxlen=200)

    for episode in range(episodes):
        state = env.reset()
        done = False

        while not done:
            action = agent.get_action(state)
            a_t = (action/(agent.n_action-1))
            a_t = a_t*(action_high - action_low) + action_low
            state, reward, done, info = env.step([a_t])
            env.render()
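
The two lines that turn the discrete action index into a continuous command rescale index / (n_action - 1) from [0, 1] onto [action_low, action_high]. A small worked check, assuming n_action = 11 and a Pendulum-style torque range of [-2, 2] (neither value is stated in the snippet):

n_action = 11                        # assumed number of discrete bins
action_low, action_high = -2.0, 2.0  # assumed action range

for index in (0, 5, 10):
    a_t = index / (n_action - 1)                      # 0.0, 0.5, 1.0
    a_t = a_t * (action_high - action_low) + action_low
    print(index, a_t)                                 # -2.0, 0.0, 2.0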
Example #5
def test():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    episodes = int(1e6)

    for episode in range(episodes):
        state = env.reset()
        done = False
        score = 0
        while not done:
            action, clipped_action, value = agent.get_action(state, False)
            #action, clipped_action, value = agent.get_action(state, True)
            state, reward, done, info = env.step(clipped_action)
            score += reward
            env.render()
            time.sleep(0.01)
        print("score :", score)
Example #6
def test():
    global env_name
    save_name = env_name.split('-')[0]
    gamma = 0.99
    env = gym.make(env_name)
    env.unwrapped.initialize(is_render=True)
    agent = Agent("global", env, save_name, gamma)
    episodes = int(1e6)

    for episode in range(episodes):
        state = env.reset()
        done = False

        while not done:
            #action = agent.get_action(state, False)
            action = agent.get_action(state, True)
            #if action[0] > 0:
            #    a_t = 1
            #else :
            #    a_t = 0
            state, reward, done, info = env.step(action)
            #state, reward, done, info = env.step(a_t)
            env.render()
Example #7
def test():
    global env_name, agent_args
    save_name = env_name.split('-')[0]
    gamma = 0.99
    env = gym.make(env_name)
    agent = Agent("global", env, save_name, gamma, agent_args)
    episodes = int(1e6)

    for episode in range(episodes):
        state = env.reset()
        done = False

        while not done:
            #action = agent.get_action(state, False)
            action = agent.get_action(state, True)
            print(action)
            #time.sleep(0.01)
            #if action[0] > 0:
            #    a_t = 1
            #else :
            #    a_t = 0
            state, reward, done, info = env.step(action)
            #state, reward, done, info = env.step(a_t)
            env.render()
Example #8
def test():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    if env_name == 'DobroHalfCheetah-v0':
        env.unwrapped.initialize(is_render=True)
    elif env_name == 'HalfCheetahBulletEnv-v0':
        env.render()
    agent = Agent(env, agent_args)

    episodes = int(1e6)
    avg_Q = deque(maxlen=200)

    for episode in range(episodes):
        state = env.reset()
        agent.actor_noise.reset()
        done = False

        while not done:
            #action = agent.get_action(state, False)
            action = agent.get_action(state, True)
            state, reward, done, info = env.step(action)
            print(np.mean(action))
            env.render()
            time.sleep(0.01)
Example #9
def real_test():
    global env_name, save_name, agent_args, env_real
    env_real = env_real.Env_real(True)
    agent = Agent(env_real, agent_args)

    episodes = int(10)
    max_steps = 2000

    for episode in range(episodes):
        input_value = input('Ready? (y/n)')
        if input_value == 'n':
            break
        state = env_real.reset()
        done = False
        score = 0
        step = 0
        while not done and step <= max_steps:
            action, clipped_action, value, cost_value = agent.get_action(
                state, False)
            state, reward, done, info = env_real.step(clipped_action)
            print(reward, '\t', info.get('cost', 0))
            score += reward
            step += 1
        print("score :", score)
Example #10
def train():
    global env, env_name
    env_name = env_name.split('-')[0]
    agent = Agent(env, env_name)
    loss_logger = Logger(env_name, 'loss')
    score_logger = Logger(env_name, 'score')
    action_low = env.action_space.low[0]
    action_high = env.action_space.high[0]
    episodes = int(5e2)
    avg_Q = deque(maxlen=200)

    for episode in range(episodes):
        state = env.reset()
        done = False
        score = 0
        step = 0

        while not done:
            step += 1
            action = agent.get_action(state)
            a_t = (action/(agent.n_action-1))
            a_t = a_t*(action_high - action_low) + action_low
            next_state, reward, done, info = env.step([a_t])

            agent.replay_memory.append([np.array(state, np.float32), action, reward, done, np.array(next_state, np.float32)])
            ########################

            # start training once the replay memory has accumulated enough samples
            if len(agent.replay_memory) > agent.train_start:
                Q, loss = agent.train()
                loss_logger.write([1, loss])
                avg_Q.append(Q)
            state = next_state
            score += reward

        #print(episode, accumulate+100, self.epsilon)
        print(episode, score, agent.epsilon, np.mean(avg_Q))
        agent.update_target_model()
        score_logger.write([step, score])
        if (episode+1)%agent.save_freq == 0:
            agent.save()
            loss_logger.save()
            score_logger.save()
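
agent.update_target_model() is the usual hard target-network synchronization, run here once per episode. Its implementation is not part of this snippet; a minimal sketch assuming Keras-style models (the attribute names are illustrative):

def update_target_model(self):
    # Hard update: copy the online network's weights into the frozen target network.
    self.target_model.set_weights(self.model.get_weights())

A soft (Polyak-averaged) update is the common alternative when the sync happens every training step instead of once per episode.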
Example #11
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    cost_v_loss_logger = Logger(save_name, 'cost_v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    cost_logger = Logger(save_name, 'cost')
    graph = Graph(
        1000, save_name,
        ['score', 'cost', 'value loss', 'cost value loss', 'kl divergence'])
    max_steps = 4000
    max_ep_len = 1000
    episodes = int(max_steps / max_ep_len)
    epochs = 500
    save_freq = 10

    log_length = 10
    p_objectives = deque(maxlen=log_length)
    c_objectives = deque(maxlen=log_length)
    v_losses = deque(maxlen=log_length)
    cost_v_losses = deque(maxlen=log_length)
    kl_divergence = deque(maxlen=log_length)
    scores = deque(maxlen=log_length * episodes)
    costs = deque(maxlen=log_length * episodes)

    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        cost_targets = []
        gaes = []
        cost_gaes = []
        avg_costs = []
        ep_step = 0
        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            cost = 0
            step = 0
            temp_rewards = []
            temp_costs = []
            values = []
            cost_values = []
            while True:
                step += 1
                ep_step += 1
                assert env.observation_space.contains(state)
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
                assert env.action_space.contains(clipped_action)
                next_state, reward, done, info = env.step(clipped_action)

                #for predict cost
                h_dist = hazard_dist(env.hazards_pos, env.world.robot_pos())
                predict_cost = get_cost(h_dist)

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                temp_costs.append(predict_cost)
                values.append(value)
                cost_values.append(cost_value)

                state = next_state
                score += reward
            cost += info.get('cost', 0)  # log the actual cost so it can be compared with the discretized cost

                if done or step >= max_ep_len:
                    break

            if step >= max_ep_len:
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
            else:
                value = 0
                cost_value = 0
                print("done before max_ep_len...")
            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            next_cost_values = cost_values[1:] + [cost_value]
            temp_cost_gaes, temp_cost_targets = agent.get_gaes_targets(
                temp_costs, cost_values, next_cost_values)
            avg_costs.append(np.mean(temp_costs))
            targets += list(temp_targets)
            gaes += list(temp_gaes)
            cost_targets += list(temp_cost_targets)
            cost_gaes += list(temp_cost_gaes)

            score_logger.write([step, score])
            cost_logger.write([step, cost])
            scores.append(score)
            costs.append(cost)

        trajs = [
            states, actions, targets, cost_targets, gaes, cost_gaes, avg_costs
        ]
        v_loss, cost_v_loss, p_objective, cost_objective, kl = agent.train(
            trajs)

        v_loss_logger.write([ep_step, v_loss])
        cost_v_loss_logger.write([ep_step, cost_v_loss])
        kl_logger.write([ep_step, kl])

        p_objectives.append(p_objective)
        c_objectives.append(cost_objective)
        v_losses.append(v_loss)
        cost_v_losses.append(cost_v_loss)
        kl_divergence.append(kl)

        print(np.mean(scores), np.mean(costs), np.mean(v_losses),
              np.mean(cost_v_losses), np.mean(kl_divergence),
              np.mean(c_objectives))
        graph.update([
            np.mean(scores),
            np.mean(costs),
            np.mean(v_losses),
            np.mean(cost_v_losses),
            np.mean(kl_divergence)
        ])
        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            cost_v_loss_logger.save()
            kl_logger.save()
            score_logger.save()
            cost_logger.save()

    graph.update(None, finished=True)
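
agent.get_gaes_targets() is called twice above, once on the rewards and once on the predicted costs, but its body is not shown. A minimal sketch of the standard generalized advantage estimation it presumably performs; gamma and lam are assumed hyperparameters:

import numpy as np

def get_gaes_targets_sketch(rewards, values, next_values, gamma=0.99, lam=0.97):
    # One-step TD residuals for every transition in the episode.
    deltas = np.asarray(rewards) + gamma * np.asarray(next_values) - np.asarray(values)
    gaes = np.zeros(len(deltas))
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        gaes[t] = running
    # Value-function regression targets: advantage estimate plus the baseline.
    targets = gaes + np.asarray(values)
    return gaes, targets

Because the calling code already sets the bootstrap value to 0 when an episode terminates early, no per-step done mask is needed inside the helper.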
Example #12
def train():
    global env_name, save_name, agent_args, env_real, env_sim, nets
    env_real = env_real.Env_real(False)
    env_sim = env_sim.Env_sim(True)
    GAT_model = nets.GAT_net(env_real, env_sim, GAT_args)
    agent = Agent(env_sim, agent_args)

    # wandb.init(project=save_name)
    accum_step = 0
    avg_temp_cost = 0

    v_loss_logger = Logger(save_name, 'v_loss')
    cost_v_loss_logger = Logger(save_name, 'cost_v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    cost_logger = Logger(save_name, 'cost')
    max_steps = 2000
    max_ep_len = 1000
    episodes = int(max_steps / max_ep_len)
    epochs = 2  #50
    save_freq = 1

    log_length = 10
    p_objectives = deque(maxlen=log_length)
    c_objectives = deque(maxlen=log_length)
    v_losses = deque(maxlen=log_length)
    cost_v_losses = deque(maxlen=log_length)
    kl_divergence = deque(maxlen=log_length)
    scores = deque(maxlen=log_length * episodes)
    costs = deque(maxlen=log_length * episodes)

    is_backup = False
    backup_name = '{}/backup.pkl'.format(save_name)
    if os.path.isfile(backup_name):
        #input_value = raw_input('backup file exists. wanna continue the last work?( y/n )')
        #if input_value != 'n':
        #    is_backup = True
        is_backup = True
    if is_backup:
        with open(backup_name, 'rb') as f:
            backup_list = pickle.load(f)
        start_iter = backup_list[0]
    else:
        start_iter = 0
        backup_list = [start_iter]

    for epoch in range(start_iter, epochs):
        #continue?
        print("=" * 20)
        print("Epoch : {}".format(epoch + 1))
        #input_value = raw_input("wanna continue episodes?( y/n )")
        #if input_value == 'n':
        #    break

        states = []
        actions = []
        targets = []
        cost_targets = []
        gaes = []
        cost_gaes = []
        avg_costs = []
        ep_step = 0
        while ep_step < max_steps:
            #input_value = raw_input("ready?")

            state = env_sim.reset()
            done = False
            score = 0
            cost = 0
            step = 0
            temp_rewards = []
            temp_costs = []
            values = []
            cost_values = []
            while True:
                if rospy.is_shutdown():
                    sys.exit()
                step += 1
                ep_step += 1
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
                # action transformer by GAT
                transformed_next_state = GAT_model.forward_transform(
                    state, clipped_action)
                transformed_action = GAT_model.backward_transform(
                    state, transformed_next_state)
                next_state, reward, done, info = env_sim.step(
                    transformed_action)

                predict_cost = info['continuous_cost']

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                temp_costs.append(predict_cost)
                values.append(value)
                cost_values.append(cost_value)

                state = next_state
                score += reward
                cost += info.get('cost', 0)

                if done or step >= max_ep_len:
                    break

            print("step : {}, score : {}".format(step, score))
            if step >= max_ep_len:
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
            else:
                value = 0
                cost_value = 0
                print("done before max_ep_len...")
            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            next_cost_values = cost_values[1:] + [cost_value]
            temp_cost_gaes, temp_cost_targets = agent.get_gaes_targets(
                temp_costs, cost_values, next_cost_values)
            avg_costs.append(np.mean(temp_costs))
            targets += list(temp_targets)
            gaes += list(temp_gaes)
            cost_targets += list(temp_cost_targets)
            cost_gaes += list(temp_cost_gaes)

            score_logger.write([step, score])
            cost_logger.write([step, cost])
            scores.append(score)
            costs.append(cost)

            accum_step += step
            avg_temp_cost = np.mean(temp_costs)
            # wandb.log({'step': accum_step, 'score':score, 'cost':cost, 'avg_temp_cost':avg_temp_cost})

        trajs = [
            states, actions, targets, cost_targets, gaes, cost_gaes, avg_costs
        ]
        v_loss, cost_v_loss, p_objective, cost_objective, kl = agent.train(
            trajs)

        v_loss_logger.write([ep_step, v_loss])
        cost_v_loss_logger.write([ep_step, cost_v_loss])
        kl_logger.write([ep_step, kl])

        p_objectives.append(p_objective)
        c_objectives.append(cost_objective)
        v_losses.append(v_loss)
        cost_v_losses.append(cost_v_loss)
        kl_divergence.append(kl)

        print(np.mean(scores), np.mean(costs), np.mean(v_losses),
              np.mean(cost_v_losses), np.mean(kl_divergence),
              np.mean(c_objectives))
        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            cost_v_loss_logger.save()
            kl_logger.save()
            score_logger.save()
            cost_logger.save()

        #backup
        backup_list[0] = epoch + 1
        with open(backup_name, 'wb') as f:
            pickle.dump(backup_list, f)
Example #13
def train():
    global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
            loss_logger, score_logger, graph, p_losses, v_losses, entropies, scores
    gamma = 0.99
    num_thread = 10
    total_step = 0
    total_max_step = 1e7
    step_period = 1e4  #1e4
    step_period = int(step_period / num_thread)
    save_name = env_name.split('-')[0]

    env = gym.make(env_name)
    env.unwrapped.initialize(is_render=False)
    global_agent = Agent("global", env, save_name, gamma)
    loss_logger = Logger(save_name, 'loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), 'A3C')
    env.close()

    p_losses = deque(maxlen=step_period)
    v_losses = deque(maxlen=step_period)
    entropies = deque(maxlen=step_period)
    scores = deque(maxlen=step_period)

    def thread_func(t_idx):
        global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
                loss_logger, score_logger, graph, p_losses, v_losses, entropies, scores
        env = gym.make(env_name)
        env.unwrapped.initialize(is_render=False)
        agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
        episode = 0
        step = 0

        p_loss = None
        v_loss = None
        entropy = None

        #gradient reset & parameter synchronize
        agent.update_parameter(global_agent)
        start_step = step
        states = []
        actions = []
        rewards = []
        dones = []

        score = 0
        state = env.reset()
        while total_step < total_max_step:
            step += 1
            total_step += 1

            action = agent.get_action(state, True)
            #if action[0] > 0:
            #    a_t = 1
            #else :
            #    a_t = 0
            next_state, reward, done, info = env.step(action)
            #next_state, reward, done, info = env.step(a_t)
            ####### modify reward function #######
            #reward = 200-cnt if done else 0
            #reward /= 10
            ####### modify reward function #######
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            dones.append(done)
            score += reward

            if step - start_step == step_period:
                ret = 0 if done else agent.get_value(next_state)
                targets = []
                for i in range(len(states)):
                    if dones[-i - 1]:
                        ret = 0
                    #elif i > 0:
                    #    ret = agent.get_value(states[-i])
                    ret = rewards[-i - 1] + gamma * ret
                    targets.append(ret)
                targets = targets[::-1]
                p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                    states, actions, targets)
                p_losses.append(p_loss)
                v_losses.append(v_loss)
                entropies.append(entropy)
                global_agent.update_with_gradients(p_grad, v_grad)
                #loss_logger.write([step-start_step,p_loss,v_loss])
                agent.update_parameter(global_agent)
                if t_idx == 0:
                    graph.update(np.mean(scores), np.mean(p_losses),
                                 np.mean(v_losses), np.mean(entropies))

                start_step = step
                states = []
                actions = []
                rewards = []
                dones = []

            state = next_state
            #score_logger.write([cnt, score])
            if done:
                episode += 1
                if t_idx == 0 and episode % 10 == 0:
                    global_agent.save()
                scores.append(score)
                print(t_idx, score)
                score = 0
                state = env.reset()

    threads = []
    for i in range(num_thread):
        threads.append(threading.Thread(target=thread_func, args=(i, )))
        threads[-1].start()

    for thread in threads:
        thread.join()
    graph.update(0, 0, 0, 0, True)
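
One caveat with this threaded setup: total_step is incremented from every worker without synchronization. CPython's GIL keeps the interpreter consistent, but += on a shared global is not atomic, so increments can be lost and the total_max_step cutoff is only approximate. If an exact count matters, a lock-guarded counter is one option (a sketch, not part of the original code):

import threading

step_lock = threading.Lock()
total_step = 0

def bump_total_step():
    # Serialize the read-modify-write so concurrent workers cannot lose increments.
    global total_step
    with step_lock:
        total_step += 1
        return total_step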
Example #14
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    p_loss_logger = Logger(save_name, 'p_loss')
    v_loss_logger = Logger(save_name, 'v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    graph = Graph(
        1000, save_name,
        ['score', 'policy loss', 'value loss', 'kl divergence', 'entropy'])
    episodes = 10
    max_steps = 4000
    max_ep_len = min(1000, env.spec.max_episode_steps)
    epochs = int(1e5)
    save_freq = 10

    save_period = 10
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    kl_divergence = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period * episodes)

    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        next_states = []
        rewards = []
        gaes = []
        ep_step = 0
        #for episode in range(episodes):
        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            step = 0
            temp_rewards = []
            values = []
            while True:
                step += 1
                ep_step += 1
                action, clipped_action, value = agent.get_action(state, True)
                next_state, reward, done, info = env.step(clipped_action)

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                next_states.append(next_state)
                rewards.append(reward)
                values.append(value)

                state = next_state
                score += reward

                if done or step >= max_ep_len:
                    break

            if step >= max_ep_len:
                action, clipped_action, value = agent.get_action(state, True)
            else:  # the episode ended before max_ep_len (the agent terminated), so bootstrap with value = 0
                value = 0
                print("done before max_ep_len...")
            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            targets += list(temp_targets)
            gaes += list(temp_gaes)

            score_logger.write([step, score])
            scores.append(score)

        trajs = [states, actions, targets, next_states, rewards, gaes]
        p_loss, v_loss, kl, entropy = agent.train(trajs)

        p_loss_logger.write([ep_step, p_loss])
        v_loss_logger.write([ep_step, v_loss])
        kl_logger.write([ep_step, kl])
        p_losses.append(p_loss)
        v_losses.append(v_loss)
        kl_divergence.append(kl)
        entropies.append(entropy)

        print(np.mean(scores), np.mean(p_losses), np.mean(v_losses),
              np.mean(kl_divergence), np.mean(entropies))
        graph.update([
            np.mean(scores),
            np.mean(p_losses),
            np.mean(v_losses),
            np.mean(kl_divergence),
            np.mean(entropies)
        ])
        if (epoch + 1) % save_freq == 0:
            agent.save()
            p_loss_logger.save()
            v_loss_logger.save()
            kl_logger.save()
            score_logger.save()

    graph.update(None, finished=True)
Example #15
    def thread_func(t_idx):
        global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
                loss_logger, score_logger, graph, p_losses, v_losses, entropies, scores
        env = gym.make(env_name)
        env.unwrapped.initialize(is_render=False)
        agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
        episode = 0
        step = 0

        p_loss = None
        v_loss = None
        entropy = None

        #gradient reset & parameter synchronize
        agent.update_parameter(global_agent)
        start_step = step
        states = []
        actions = []
        rewards = []
        dones = []

        score = 0
        state = env.reset()
        while total_step < total_max_step:
            step += 1
            total_step += 1

            action = agent.get_action(state, True)
            #if action[0] > 0:
            #    a_t = 1
            #else :
            #    a_t = 0
            next_state, reward, done, info = env.step(action)
            #next_state, reward, done, info = env.step(a_t)
            ####### modify reward function #######
            #reward = 200-cnt if done else 0
            #reward /= 10
            ####### modify reward function #######
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            dones.append(done)
            score += reward

            if step - start_step == step_period:
                ret = 0 if done else agent.get_value(next_state)
                targets = []
                for i in range(len(states)):
                    if dones[-i - 1]:
                        ret = 0
                    #elif i > 0:
                    #    ret = agent.get_value(states[-i])
                    ret = rewards[-i - 1] + gamma * ret
                    targets.append(ret)
                targets = targets[::-1]
                p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                    states, actions, targets)
                p_losses.append(p_loss)
                v_losses.append(v_loss)
                entropies.append(entropy)
                global_agent.update_with_gradients(p_grad, v_grad)
                #loss_logger.write([step-start_step,p_loss,v_loss])
                agent.update_parameter(global_agent)
                if t_idx == 0:
                    graph.update(np.mean(scores), np.mean(p_losses),
                                 np.mean(v_losses), np.mean(entropies))

                start_step = step
                states = []
                actions = []
                rewards = []
                dones = []

            state = next_state
            #score_logger.write([cnt, score])
            if done:
                episode += 1
                if t_idx == 0 and episode % 10 == 0:
                    global_agent.save()
                scores.append(score)
                print(t_idx, score)
                score = 0
                state = env.reset()
Example #16
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name,
                  ['score', 'policy loss', 'Q value loss', 'entropy'])
    max_steps = 4000
    max_ep_len = min(1000, env.spec.max_episode_steps)
    start_training_after_steps = 1000
    step_per_training = 50
    epochs = 1000
    save_freq = 1

    record_length = 10
    p_losses = deque(maxlen=record_length *
                     int(max_ep_len / step_per_training))
    q_losses = deque(maxlen=record_length *
                     int(max_ep_len / step_per_training))
    entropies = deque(maxlen=record_length *
                      int(max_ep_len / step_per_training))
    scores = deque(maxlen=record_length)

    total_step = 0
    for epoch in range(epochs):
        ep_step = 0
        while ep_step < max_steps:
            state = env.reset()
            score = 0
            step = 0
            while True:
                step += 1
                ep_step += 1
                total_step += 1
                action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(action)
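                # hitting max_ep_len is treated as a timeout rather than a true terminal
                # state, so the stored transition keeps bootstrapping from next_state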
                done = False if step >= max_ep_len else done

                agent.replay_memory.append(
                    [state, action, reward,
                     float(done), next_state])

                if len(agent.replay_memory) > start_training_after_steps and (
                        total_step + 1) % step_per_training == 0:
                    for _ in range(step_per_training):
                        p_loss, q_loss, entropy = agent.train()
                    p_losses.append(p_loss)
                    q_losses.append(q_loss)
                    entropies.append(entropy)
                    print(np.mean(scores), np.mean(p_losses),
                          np.mean(q_losses), np.mean(entropies))

                state = next_state
                score += reward

                if done or step >= max_ep_len:
                    break

            score_logger.write([step, score])
            scores.append(score)

            graph.update([
                np.mean(scores),
                np.mean(p_losses),
                np.mean(q_losses),
                np.mean(entropies)
            ])

        if (epoch + 1) % save_freq == 0:
            agent.save()
            score_logger.save()

    graph.update(None, finished=True)
Example #17
def train():
    global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
            loss_logger, score_logger, graph
    gamma = 0.99
    num_thread = 10
    total_step = 0
    total_max_step = 1e6
    step_period = 1e3
    step_period = int(step_period / num_thread)
    save_name = env_name.split('-')[0]

    env = gym.make(env_name)
    global_agent = Agent("global", env, save_name, gamma)
    loss_logger = Logger(save_name, 'loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), 'A3C')
    env.close()

    def thread_func(t_idx):
        global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
                loss_logger, score_logger, graph
        env = gym.make(env_name)
        agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
        step = 0
        episode = 0

        while total_step < total_max_step:
            episode += 1
            #gradient reset & parameter synchronize
            agent.update_parameter(global_agent)
            ###
            start_step = step
            states = []
            actions = []
            rewards = []
            score = 0
            cnt = 0
            state = env.reset()
            while True:
                cnt += 1
                step += 1
                total_step += 1
                action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(action)
                ####### modify reward function #######
                #reward = 200-cnt if done else 0
                reward += 10
                ####### modify reward function #######
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                score += reward
                if done or step - start_step == step_period:
                    ret = 0 if done else agent.get_value(next_state)
                    targets = []
                    for i in range(len(states)):
                        ret = rewards[-i - 1] + gamma * ret
                        targets.append(ret)
                    targets = targets[::-1]
                    p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                        states, actions, targets)
                    global_agent.update_with_gradients(p_grad, v_grad)
                    #loss_logger.write([step-start_step,p_loss,v_loss])
                    if done:
                        break
                    agent.update_parameter(global_agent)
                    start_step = step
                    states = []
                    actions = []
                    rewards = []
                state = next_state
            #score_logger.write([cnt, score])
            if t_idx == 0:
                print(score)
                graph.update(score, p_loss, v_loss, entropy)
                if episode % 100 == 0: global_agent.save()

    threads = []
    for i in range(num_thread):
        threads.append(threading.Thread(target=thread_func, args=(i, )))
        threads[-1].start()

    for thread in threads:
        thread.join()
    graph.update(0, 0, 0, 0, True)
Example #18
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    env.unwrapped.initialize(is_render=False)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    p_loss_logger = Logger(save_name, 'p_loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), agent.name)
    episodes = 10
    epochs = int(1e5)
    save_freq = 10

    save_period = 100
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period * episodes)

    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        ep_step = 0
        for episode in range(episodes):
            state = env.reset()
            done = False
            score = 0
            step = 0
            temp_rewards = []
            while not done:
                step += 1
                ep_step += 1
                action, clipped_action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(clipped_action)

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)

                state = next_state
                score += reward

            score_logger.write([step, score])
            scores.append(score)
            temp_targets = np.zeros_like(temp_rewards)
            ret = 0
            for t in reversed(range(len(temp_rewards))):
                ret = temp_rewards[t] + agent.discount_factor * ret
                temp_targets[t] = ret
            targets += list(temp_targets)

        trajs = [states, actions, targets]
        v_loss, p_objective, kl = agent.train(trajs)

        v_loss_logger.write([ep_step, v_loss])
        p_loss_logger.write([ep_step, p_objective])
        p_losses.append(p_objective)
        v_losses.append(v_loss)
        entropies.append(kl)

        #print(v_loss, p_objective, kl)
        print(np.mean(scores), np.mean(p_losses), np.mean(v_losses),
              np.mean(entropies))
        graph.update(np.mean(scores), np.mean(p_losses), np.mean(v_losses),
                     np.mean(entropies))
        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            p_loss_logger.save()
            score_logger.save()

    graph.update(0, 0, 0, 0, finished=True)