import sys
import time

import gym
import numpy as np
import tensorflow as tf

# Project-level helpers (conf, get_dirs, preprocess_conf, ReplayBuffer2,
# SoftPolicyGradient, Statistic, eval_step, action_converter) are assumed to be
# imported from the rest of the repository.


# Variant with a warm-up period, debug variable printing, and early stopping
# once recent episode returns exceed a convergence threshold.
def main(_):
    model_dir, data_dir = get_dirs(conf, ['exp_name'])
    # exp_start_time = datetime.datetime.now().strftime("%A_%b%d-%H%M%S")
    # data_dir = "logs/" + conf.exp_name + "_" + exp_start_time
    preprocess_conf(conf, model_dir)

    env = gym.make(conf.env_name)
    env.seed(conf.random_seed)
    state_shape = env.observation_space.shape
    if isinstance(env.action_space, gym.spaces.Discrete):
        action_shape = env.action_space.n
    else:
        action_shape = env.action_space.shape[0]

    # replay buffer
    buffer = ReplayBuffer2(conf.buffer_size)

    # building agent
    # config = tf.ConfigProto(allow_soft_placement=True)
    # config.gpu_options.allow_growth = True
    config = tf.ConfigProto(intra_op_parallelism_threads=8,
                            inter_op_parallelism_threads=8)
    with tf.Session(config=config) as sess:
        # agent
        agent = SoftPolicyGradient(sess, conf, state_shape, action_shape)

        # statistic
        stat = Statistic(sess, conf, model_dir, data_dir)
        if conf.load_model:
            stat.load_model()

        # debug: list all global variables
        def var_print():
            for var in tf.global_variables():
                print(var)

        print("printing vars:------------------------------------------------")
        var_print()
        print("printing vars:------------------------------------------------")

        start_steps = 1000  # warm-up steps collected before training begins
        episode, global_step, local_step = 0, 0, 0
        epi_rewards = 0
        all_epi_rewards = []  # per-episode returns, used for the convergence check below
        total_Q, Q_loss, pi_loss = [], [], []
        state = env.reset()
        time_begin = time.time()  # wall-clock start, used in the convergence report

        # pbar = tqdm(total=conf.max_steps, dynamic_ncols=True)
        while global_step < conf.max_steps:
            # interaction with environment
            action = agent.sampling_actions(
                [state], is_deterministic=False)[0]  # [-inf, inf]
            next_state, reward, done, info = env.step(
                action_converter(env, action))
            global_step += 1
            local_step += 1
            epi_rewards += reward
            reward *= conf.reward_scale
            buffer.add_transition(state, action, reward, next_state, done)
            state = next_state

            # train step
            if buffer.size() >= conf.batch_size and global_step >= start_steps:
                for i in range(conf.num_train_steps):
                    transitions = buffer.get_transitions(conf.batch_size)
                    Q, single_Q_loss, single_pi_loss = agent.trainer(transitions)
                    total_Q.append(np.mean(Q))
                    Q_loss.append(single_Q_loss)
                    pi_loss.append(single_pi_loss)

            # evaluate step
            if global_step % conf.eval_interval == 0:
                ave_epi_rewards = np.mean(eval_step(env, agent))
                stat.save_step(global_step, ave_epi_rewards)
                print('\n[Evaluation] averaged_epi_rewards: %.3f' % ave_epi_rewards)

            if done:
                # save step
                all_epi_rewards.append(epi_rewards)
                stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                               np.mean(Q_loss), np.mean(pi_loss))
                # pbar.update(local_step)

                # minimum return over the last (up to) 20 episodes
                num_episodes = len(all_epi_rewards)
                window_start = max(num_episodes - 20, 0)
                min_recent_epi_rewards = min(all_epi_rewards[window_start:])
                # pbar.set_description(
                #     'Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f avg_epi_rew %.1f' %
                #     (episode + 1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss),
                #      np.mean(all_epi_rewards[window_start:])))
                print('Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f '
                      '\tmin_recent_epi_rew %.1f' %
                      (episode + 1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss),
                       min_recent_epi_rewards))

                # early stopping: require at least 4 episodes in the window, all of
                # them with a return above the threshold
                threshold = -500.0
                if (num_episodes - window_start) > 3 and min_recent_epi_rewards > threshold:
                    time_end = time.time()
                    print("SHI hyperparameters made the algorithm converge "
                          "(threshold %s) in %.1f s" % (threshold, time_end - time_begin))
                    stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                                   np.mean(Q_loss), np.mean(pi_loss))
                    stat.save_model(global_step)
                    sys.exit()

                episode += 1
                local_step = 0
                epi_rewards = 0
                total_Q, Q_loss, pi_loss = [], [], []
                state = env.reset()
from tqdm import tqdm  # this variant additionally reports progress with tqdm


# Variant that trains as soon as the replay buffer holds one batch and shows
# progress with a tqdm bar instead of the early-stopping check above.
def main(_):
    model_dir, data_dir = get_dirs(conf, ['env_name'])
    preprocess_conf(conf, model_dir)

    env = gym.make(conf.env_name)
    # env.seed(conf.random_seed)
    state_shape = env.observation_space.shape
    if isinstance(env.action_space, gym.spaces.Discrete):
        action_shape = env.action_space.n
    else:
        action_shape = env.action_space.shape[0]

    # replay buffer
    buffer = ReplayBuffer2(conf.buffer_size)

    # building agent
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # agent
        agent = SoftPolicyGradient(sess, conf, state_shape, action_shape)

        # statistic
        stat = Statistic(sess, conf, model_dir, data_dir)
        if conf.load_model:
            stat.load_model()

        episode, global_step, local_step = 0, 0, 0
        epi_rewards = 0
        total_Q, Q_loss, pi_loss = [], [], []
        state = env.reset()

        pbar = tqdm(total=conf.max_steps, dynamic_ncols=True)
        while global_step < conf.max_steps:
            # interaction with environment
            action = agent.sampling_actions(
                [state], is_deterministic=False)[0]  # [-inf, inf]
            next_state, reward, done, info = env.step(
                action_converter(env, action))
            global_step += 1
            local_step += 1
            epi_rewards += reward
            reward *= conf.reward_scale
            buffer.add_transition(state, action, reward, next_state, done)
            state = next_state

            # train step
            if buffer.size() >= conf.batch_size:
                for i in range(conf.num_train_steps):
                    transitions = buffer.get_transitions(conf.batch_size)
                    Q, single_Q_loss, single_pi_loss = agent.trainer(transitions)
                    total_Q.append(np.mean(Q))
                    Q_loss.append(single_Q_loss)
                    pi_loss.append(single_pi_loss)

            # evaluate step
            if global_step % conf.eval_interval == 0:
                ave_epi_rewards = np.mean(eval_step(env, agent))
                stat.save_step(global_step, ave_epi_rewards)
                print('\n[Evaluation] averaged_epi_rewards: %.3f' % ave_epi_rewards)

            if done:
                # save step
                stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                               np.mean(Q_loss), np.mean(pi_loss))
                pbar.update(local_step)
                pbar.set_description(
                    'Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f' %
                    (episode + 1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss)))
                print()

                episode += 1
                local_step = 0
                epi_rewards = 0
                total_Q, Q_loss, pi_loss = [], [], []
                state = env.reset()

        pbar.close()
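# Both versions of main() call two helpers that are not defined in this section:
# action_converter and eval_step. The sketch below is only an assumption about
# what they might look like for a Gym-style environment; the names and call
# signatures match the calls above, but the bodies (including the num_episodes
# default) are hypothetical rather than the repository's actual implementations.
def action_converter(env, action):
    """Hypothetical sketch: map the raw policy output to a valid env action."""
    if isinstance(env.action_space, gym.spaces.Discrete):
        # Treat the output as per-action scores and pick the highest one.
        return int(np.argmax(action))
    # Squash the unbounded output into [-1, 1], then rescale to the Box bounds.
    low, high = env.action_space.low, env.action_space.high
    return low + (np.tanh(action) + 1.0) * 0.5 * (high - low)


def eval_step(env, agent, num_episodes=5):
    """Hypothetical sketch: run deterministic episodes and return their returns."""
    returns = []
    for _ in range(num_episodes):
        state, done, total = env.reset(), False, 0.0
        while not done:
            action = agent.sampling_actions([state], is_deterministic=True)[0]
            state, reward, done, _ = env.step(action_converter(env, action))
            total += reward
        returns.append(total)
    return returns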