Example #1
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf  # TensorFlow 1.x API (tf.Session, tf.set_random_seed)

# Project-specific names used below (simpleEnv, Network, NAF, Statistic, the
# exploration strategies, conf, logger, get_model_dir, preprocess_conf,
# private_settings) are assumed to be importable from the surrounding package.
    def __init__(self, **kwargs):
        # Wrap simpleEnv and cache its action-space dimensions and bounds
        self.env = simpleEnv(**kwargs)
        self.env.reset()
        self.action_space_dimensions = self.env.action_space.shape[0]
        self.action_bounds_high, self.action_bounds_low = self.env.action_space.high, self.env.action_space.low
        # print(self.action_bounds_high, self.action_bounds_low)
        self.action_names = range(self.action_space_dimensions)
        # Per-step logs: one column per action dimension, plus state statistics
        self.history = pd.DataFrame(columns=self.action_names)
        self.state_history_mean = pd.DataFrame()
        self.state_history_std = pd.DataFrame()

        self.counter = -1
        self.session_name = 1
        self.algorithm_name = None
    # fig.tight_layout()
    plt.show()

    # Scatter of initial vs. final objective magnitudes (sign-flipped, since
    # the rewards are negative)
    plt.figure()
    plt.scatter(-np.array(starts),
                -np.array(finals),
                c="g",
                alpha=0.5,
                marker=r'$\clubsuit$',
                label="Luck")
    plt.ylim(0, 3)
    plt.title(label)
    plt.legend()
    plt.show()


env = simpleEnv()

# set random seeds for reproducibility
random_seed = 888
tf.set_random_seed(random_seed)
np.random.seed(random_seed)
env.seed(random_seed)
env.reset()

env_fn = lambda: env

ac_kwargs = dict()  # e.g. dict(hidden_sizes=(16, 16))

# directory_naf = "logging/awake/NAF"
# if not os.path.exists(directory_naf):
#     os.makedirs(directory_naf)
def main(_):
    model_dir = get_model_dir(conf,
                              ['is_train', 'random_seed', 'monitor', 'display', 'log_level'])

    preprocess_conf(conf)

    with tf.Session() as sess:

        env = simpleEnv()

        env.seed(conf.random_seed)
        env.reset()

        assert isinstance(env.observation_space, gym.spaces.Box), \
            "observation space must be continuous"
        assert isinstance(env.action_space, gym.spaces.Box), \
            "action space must be continuous"

        # exploration strategy
        if conf.noise == 'ou':
            strategy = OUExploration(env, sigma=conf.noise_scale)
        elif conf.noise == 'brownian':
            strategy = BrownianExploration(env, conf.noise_scale)
        elif conf.noise == 'linear_decay':
            strategy = LinearDecayExploration(env)
        else:
            raise ValueError('Unknown exploration strategy: %s' % conf.noise)

        # networks
        shared_args = {
            'sess': sess,
            'input_shape': env.observation_space.shape,
            'action_size': env.action_space.shape[0],
            'hidden_dims': conf.hidden_dims,
            'use_batch_norm': conf.use_batch_norm,
            'use_seperate_networks': conf.use_seperate_networks,
            'hidden_w': conf.hidden_w, 'action_w': conf.action_w,
            'hidden_fn': conf.hidden_fn, 'action_fn': conf.action_fn,
            'w_reg': conf.w_reg,
        }

        logger.info("Creating prediction network...")
        pred_network = Network(
            scope='pred_network', **shared_args
        )

        logger.info("Creating target network...")
        target_network = Network(
            scope='target_network', **shared_args
        )
        target_network.make_soft_update_from(pred_network, conf.tau)

        # statistics and running the agent
        stat = Statistic(sess, conf.env_name, model_dir, pred_network.variables, conf.update_repeat)

        agent = NAF(sess, env, strategy, pred_network, target_network, stat,
                    conf.discount, conf.batch_size, conf.learning_rate,
                    conf.max_steps, conf.update_repeat, conf.max_episodes, private_settings=private_settings)

        # states, actions, rewards = env.readTrainingData()

        # agent.pretune(actions,states,rewards)

        agent.run(conf.monitor, conf.display, conf.is_train)

        # plotting
        print('now plotting')
        rewards = env.rewards
        initial_states = env.initial_conditions

        # Per-episode summaries: episode length, final reward, and the
        # (negative RMS) initial objective value
        iterations = []
        finals = []
        starts = []

        # init_states = pd.read_pickle('/Users/shirlaen/PycharmProjects/DeepLearning/spinningup/Environments/initData')

        for i in range(len(rewards)):
            if len(rewards[i]) > 0:
                finals.append(rewards[i][-1])
                starts.append(-np.sqrt(np.mean(np.power(initial_states[i], 2))))
                iterations.append(len(rewards[i]))


        plot_suffix = f', number of iterations: {env.TOTAL_COUNTER}, awake time: {env.TOTAL_COUNTER / 600:.1f} h'

        plt.figure(1)
        plt.subplot(211)
        plt.plot(iterations)
        plt.title('Iterations' + plot_suffix)

        plt.subplot(212)
        plt.plot(finals, 'r--')
        plt.plot(starts, c='lime')
        plt.title('Final reward per episode')  # + plot_suffix

        plt.savefig('progress1')

        plt.show()


        # Scatter of initial vs. final objective magnitudes (sign-flipped,
        # since the rewards are negative)
        plt.figure()
        plt.scatter(-np.array(starts), -np.array(finals), c="g", alpha=0.5, marker=r'$\clubsuit$',
                    label="Luck")
        plt.ylim(0, 21)
        plt.legend()
        plt.show()
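

# The main(_) signature above follows the TensorFlow 1.x tf.app.run() convention.
# A minimal entry point under that assumption -- a sketch, not part of the
# original source:
if __name__ == '__main__':
    tf.app.run()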