    @staticmethod
    def arg_max(state_action):
        max_index_list = []
        max_value = state_action[0]
        for index, value in enumerate(state_action):
            if value > max_value:
                max_index_list.clear()
                max_value = value
                max_index_list.append(index)
            elif value == max_value:
                max_index_list.append(index)
        return random.choice(max_index_list)

if __name__ == "__main__":
    env = Env()
    agent = QLearningAgent(actions=list(range(env.n_actions)))

    for episode in range(1000):
        state = env.reset()

        while True:
            env.render()

            # take action and proceed one step in the environment
            action = agent.get_action(str(state))
            next_state, reward, done = env.step(action)

            # with sample <s,a,r,s'>, agent learns new q function
            agent.learn(str(state), action, reward, str(next_state))

            state = next_state
            env.print_value_all(agent.q_table)

            # if episode ends, then break
            if done:
                break
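
# A minimal sketch (not from the original source) of the tabular Q-learning
# update that agent.learn(str(state), action, reward, str(next_state)) above
# presumably performs. Class and attribute names (MinimalQLearningAgent,
# q_table, learning_rate, discount_factor) are assumptions for illustration.
from collections import defaultdict

class MinimalQLearningAgent:
    def __init__(self, actions, learning_rate=0.01, discount_factor=0.9):
        self.actions = actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        # q_table maps a state string to one Q-value per action
        self.q_table = defaultdict(lambda: [0.0] * len(actions))

    def learn(self, state, action, reward, next_state):
        # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        current_q = self.q_table[state][action]
        target = reward + self.discount_factor * max(self.q_table[next_state])
        self.q_table[state][action] += self.learning_rate * (target - current_q)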
Example No. 2
                max_index_list.append(index)
        return random.choice(max_index_list)

if __name__ == "__main__":
    env = Env()
    agent = SARSAgent(actions=list(range(env.n_actions)))

    for episode in range(1000):
        # reset environment and initialize state
        state = env.reset()
        # get action of state from agent
        action = agent.get_action(str(state))

        while True:
            env.render()

            # take action and proceed one step in the environment
            next_state, reward, done = env.step(action)
            next_action = agent.get_action(str(next_state))

            # with sample <s,a,r,s',a'>, agent learns new q function
            agent.learn(str(state), action, reward, str(next_state), next_action)

            state = next_state
            action = next_action

            # print q function of all states at screen
            env.print_value_all(agent.q_table)

            # if episode ends, then break
            if done:
                break
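
# A minimal sketch (not from the original source) of the on-policy SARSA update
# that agent.learn(..., next_action) above presumably performs. Names
# (MinimalSarsaAgent, q_table, step_size, discount_factor) are assumptions.
from collections import defaultdict

class MinimalSarsaAgent:
    def __init__(self, actions, step_size=0.01, discount_factor=0.9):
        self.actions = actions
        self.step_size = step_size
        self.discount_factor = discount_factor
        self.q_table = defaultdict(lambda: [0.0] * len(actions))

    def learn(self, state, action, reward, next_state, next_action):
        # Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a))
        current_q = self.q_table[state][action]
        next_q = self.q_table[next_state][next_action]
        self.q_table[state][action] += self.step_size * (
            reward + self.discount_factor * next_q - current_q)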
Example No. 3
def main():
    expert_demo = pickle.load(open('./Expert dataset 1/expert_20x20_1.p', "rb"))
    demonstrations = np.array(expert_demo[0])

    print("demonstrations.shape", demonstrations.shape)

    print(expert_demo[1])
    print(expert_demo[0])
    print(np.array(expert_demo[0]).shape)

    # expert_x = int(expert_demo[1][0])
    # expert_y = int(expert_demo[1][1])

    expert_x = int(expert_demo[0][0])
    expert_y = int(expert_demo[0][1])

    env = Env(expert_x, expert_y)

    # env.seed(args.seed)
    # torch.manual_seed(args.seed)

    num_inputs = 6
    num_actions = 8
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    vdb = VDB(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    vdb_optim = optim.Adam(vdb.parameters(), lr=args.learning_rate)
    
    # load demonstrations

    k = 1
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        vdb.load_state_dict(ckpt['vdb'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        # expert_demo = pickle.load(open('./paper/{}.p'.format((iter+1)%expert_sample_size), "rb"))
        print(iter)
        expert_demo = pickle.load(open('./Expert dataset 1/expert_20x20_{}.p'.format(np.random.randint(1,50)), "rb"))
        tmp = expert_demo.pop(-1)

        demonstrations = np.array(expert_demo)

        print(demonstrations, demonstrations.shape)
        tot_sample_size = len(demonstrations) + 10
        ##########################

        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        # while steps < args.total_sample_size:

        while steps < tot_sample_size:
            # env.delete_graph()
            state = env.reset()
            # time.sleep(1)

            score = 0

            # state = running_state(state)
            state1 = state
            for _ in range((tot_sample_size+1)*2):
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                # sample the continuous action once and reuse it, so the action
                # stored in memory matches the action actually executed
                action = get_action(mu, std)[0]
                action2 = np.argmax(action)
                next_state, reward, done, _ = env.step(action2)

                irl_reward = get_reward(vdb, state, action)

                # ###### for recording a video
                # if iter > 11500 :
                #     time.sleep(0.015)
                # #####
                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                # next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            ##########################
            env.draw_graph()
            env.render()
            ##########################
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), vdb.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_vdb(vdb, memory, vdb_optim, demonstrations, 0, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        # save a checkpoint every 100 iterations
        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'vdb': vdb.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)

    ####
    score_avg = int(score_avg)

    model_path = os.path.join(os.getcwd(), 'save_model')
    if not os.path.isdir(model_path):
        os.makedirs(model_path)

    ckpt_path = os.path.join(model_path, 'ckpt_' + 'last_model' + '.pth.tar')

    save_checkpoint({
        'actor': actor.state_dict(),
        'critic': critic.state_dict(),
        'vdb': vdb.state_dict(),
        'z_filter_n': running_state.rs.n,
        'z_filter_m': running_state.rs.mean,
        'z_filter_s': running_state.rs.sum_square,
        'args': args,
        'score': score_avg
    }, filename=ckpt_path)
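
# A minimal sketch (assumptions, not the original helper) of the adversarial
# IRL reward used as get_reward(vdb, state, action) above: -log D(s, a) from
# the VDB discriminator. It assumes vdb(...) returns the discriminator
# probability first; adjust to the actual VDB forward signature.
import math
import torch

def get_reward_sketch(vdb, state, action):
    state_action = torch.cat([torch.Tensor(state), torch.Tensor(action)])
    with torch.no_grad():
        prob = vdb(state_action.unsqueeze(0))[0]
    return -math.log(float(prob.squeeze()) + 1e-8)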
Example No. 4
    def run(self):
        global episode
        # create the environment
        env = Env(mode=False, show=False)

        step = 0

        while episode < EPISODES:
            done = False

            score = 0
            observe = env.reset()
            next_observe = observe

            # stay idle for a random number (1~30) of initial steps
            for _ in range(random.randint(1, 30)):
                observe = next_observe
                _, _ = env.step(0)
                next_observe = env.render(show=False)

            state = pre_processing(observe)
            history = np.stack(
                (state, state, state, state, state, state, state, state),
                axis=2)
            history = np.reshape([history], (1, 84, 84, 8))

            while not done:
                step += 1
                self.t += 1
                observe = next_observe
                self.image = observe
                action, policy = self.get_action(history)

                # 0: stay, 1: up, 2: right, 3: down, 4: left
                # execute one step in the environment with the selected action
                reward, done = env.step(action)
                next_observe = env.render(show=False)

                # preprocess the state at every timestep (pre_processing may fail)
                try:
                    next_state = pre_processing(observe)
                except Exception:
                    next_state = state
                    print("Preprocessing error caught; reusing previous state")

                next_state = np.reshape([next_state], (1, 84, 84, 1))
                next_history = np.append(next_state,
                                         history[:, :, :, :7],
                                         axis=3)

                # maximum probability of the current policy
                self.avg_p_max += np.amax(
                    self.actor.predict(np.float32(history / 255.)))

                score += reward
                #reward = np.clip(reward, -1., 1.)

                # store the sample
                self.append_sample(history, action, reward)

                history = next_history

                # train when the episode ends or the maximum number of timesteps is reached
                if self.t >= self.t_max or done:
                    self.train_model(done)
                    self.update_local_model()
                    self.t = 0

                if done:
                    # record training statistics for each episode
                    episode += 1
                    print("episode:", episode, "  score:", score, "  step:",
                          step, "  actor:#", self.id)

                    stats = [score, self.avg_p_max / float(step), step]
                    for i in range(len(stats)):
                        self.sess.run(self.update_ops[i],
                                      feed_dict={
                                          self.summary_placeholders[i]:
                                          float(stats[i])
                                      })
                    summary_str = self.sess.run(self.summary_op)
                    self.summary_writer.add_summary(summary_str, episode + 1)
                    self.avg_p_max = 0
                    self.avg_loss = 0
                    step = 0
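
# A minimal sketch (assumptions, not the original helper) of a typical
# pre_processing() producing the 84x84 grayscale frames stacked above. The use
# of scikit-image and the assumption that observe is an RGB array are not
# confirmed by the snippet.
import numpy as np
from skimage.color import rgb2gray
from skimage.transform import resize

def pre_processing_sketch(observe):
    # convert the RGB frame to grayscale, resize to 84x84, scale to uint8 0..255
    return np.uint8(resize(rgb2gray(observe), (84, 84), mode='constant') * 255)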
Example No. 5
if __name__ == "__main__":
    logger = get_logger('td.log')

    env = Env()
    agent = TDAgent(actions=list(range(env.n_actions)))

    MAX_EPISODES = 1000  # maximum number of episodes
    success_cnt = 0
    fail_cnt = 0
    total_step = 0

    for episode in range(MAX_EPISODES):
        state = env.reset()  # start of an episode: reset the environment and set state to the initial state

        while True:
            env.render()  # render the screen

            action = agent.get_action(state)

            next_state, reward, done = env.step(action)

            agent.update(state, next_state, reward, done)

            state = next_state

            total_step += 1

            if done:
                # update the value function for the final state
                agent.update(next_state, next_state, reward, done)
                break
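
# A minimal sketch (assumptions, not the original source) of the TD(0) state-value
# update that agent.update(state, next_state, reward, done) above presumably
# performs. Names (MinimalTDAgent, value_table, step_size, discount_factor) are
# assumptions for illustration.
from collections import defaultdict

class MinimalTDAgent:
    def __init__(self, actions, step_size=0.01, discount_factor=0.9):
        self.actions = actions
        self.step_size = step_size
        self.discount_factor = discount_factor
        self.value_table = defaultdict(float)

    def update(self, state, next_state, reward, done):
        # V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s)); terminal states bootstrap to 0
        bootstrap = 0.0 if done else self.discount_factor * self.value_table[str(next_state)]
        self.value_table[str(state)] += self.step_size * (
            reward + bootstrap - self.value_table[str(state)])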
Example No. 6
    step = 0

    while episode < EPISODES:
        tick = time.perf_counter()  # time.clock() was removed in Python 3.8

        done = False

        score = 0
        observe = env.reset()
        next_observe = observe

        for _ in range(random.randint(1, 20)):
            observe = next_observe
            _, _ = env.step(0)
            next_observe = env.render()

            dt = time.perf_counter() - tick
            time.sleep((0.05 - dt) if (0.05 - dt) > 0 else 0)
            tick = time.perf_counter()

        state = pre_processing(observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 84, 84, 4))

        while not done:
            dt = time.perf_counter() - tick

            step += 1
            observe = next_observe
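
# A minimal sketch (assumption, not from the original source) of the ~20 Hz
# frame-pacing pattern used above, isolated as a helper. time.perf_counter()
# stands in for the removed time.clock().
import time

def sleep_to_next_frame(tick, frame_time=0.05):
    # sleep away whatever remains of the frame budget, then return a fresh tick
    dt = time.perf_counter() - tick
    if frame_time - dt > 0:
        time.sleep(frame_time - dt)
    return time.perf_counter()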
Example No. 7
    while not done:
        # take action
        action = network.forward(state)
        # if args.env == 'mount' and action == 1: action += 1
        next_state, reward, done, _ = env.step(action)

        ep_reward += reward

        # distribute reward to all neurons
        network.distribute_task_reward(reward)

        # must come after distribute
        network.store(False)

        if args.render and e // 250 == 1: env.render()

        state = next_state

    if args.neuron_type == 'DQN':
        # hack to get state,next_state to play nicely in DQN
        action = network.forward(next_state)
        # distribute reward to all neurons
        network.distribute_task_reward(reward)
        # must come after distribute
        network.store(done=True)

    # save episode rewards
    network.end_episode(ep_reward)