def __init__(self, **kwargs):
    # Wrap simpleEnv and cache the action-space dimensions and bounds.
    self.env = simpleEnv(**kwargs)
    self.env.reset()
    self.action_space_dimensions = self.env.action_space.shape[0]
    self.action_bounds_high, self.action_bounds_low = \
        self.env.action_space.high, self.env.action_space.low
    # print(self.action_bounds_high, self.action_bounds_low)
    self.action_names = range(self.env.action_space.shape[0])
    # Per-run bookkeeping: actions taken and state statistics per step.
    self.history = pd.DataFrame(columns=self.action_names)
    self.state_history_mean = pd.DataFrame()
    self.state_history_std = pd.DataFrame()
    self.counter = -1
    self.session_name = 1
    self.algorithm_name = None
    # fig.tight_layout()
    plt.show()

    plt.figure()
    plt.scatter(-np.array(starts), -np.array(finals), c="g", alpha=0.5,
                marker=r'$\clubsuit$', label="Luck")
    plt.ylim(0, 3)
    plt.title(label)
    plt.show()


env = simpleEnv()

# set random seed
random_seed = 888
tf.set_random_seed(random_seed)
np.random.seed(random_seed)
env.seed(random_seed)
env.reset()

env_fn = lambda: env

ac_kwargs = dict()  # dict(hidden_sizes=(16, 16))

# directory_naf = "logging/awake/NAF"
# if not os.path.exists(directory_naf):
#     os.makedirs(directory_naf)
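# Hypothetical usage sketch (an assumption, not shown in this script): `env_fn`
# and `ac_kwargs` follow the call convention of the OpenAI Spinning Up
# algorithms, which accept an environment factory plus actor-critic keyword
# arguments, e.g.
#
#     from spinup import ddpg
#     ddpg(env_fn=env_fn, ac_kwargs=ac_kwargs, seed=random_seed)
#
# The algorithm actually invoked here (and its logging directory, cf. the
# commented-out `directory_naf` block above) is defined elsewhere in the repo.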
def main(_):
    model_dir = get_model_dir(conf,
                              ['is_train', 'random_seed', 'monitor', 'display', 'log_level'])
    preprocess_conf(conf)

    with tf.Session() as sess:
        env = simpleEnv()
        env.seed(conf.random_seed)
        env.reset()

        assert isinstance(env.observation_space, gym.spaces.Box), \
            "observation space must be continuous"
        assert isinstance(env.action_space, gym.spaces.Box), \
            "action space must be continuous"

        # exploration strategy
        if conf.noise == 'ou':
            strategy = OUExploration(env, sigma=conf.noise_scale)
        elif conf.noise == 'brownian':
            strategy = BrownianExploration(env, conf.noise_scale)
        elif conf.noise == 'linear_decay':
            strategy = LinearDecayExploration(env)
        else:
            raise ValueError('Unknown exploration strategy: %s' % conf.noise)

        # networks (shared construction arguments for prediction and target)
        shared_args = {
            'sess': sess,
            'input_shape': env.observation_space.shape,
            'action_size': env.action_space.shape[0],
            'hidden_dims': conf.hidden_dims,
            'use_batch_norm': conf.use_batch_norm,
            'use_seperate_networks': conf.use_seperate_networks,
            'hidden_w': conf.hidden_w,
            'action_w': conf.action_w,
            'hidden_fn': conf.hidden_fn,
            'action_fn': conf.action_fn,
            'w_reg': conf.w_reg,
        }

        logger.info("Creating prediction network...")
        pred_network = Network(scope='pred_network', **shared_args)

        logger.info("Creating target network...")
        target_network = Network(scope='target_network', **shared_args)
        target_network.make_soft_update_from(pred_network, conf.tau)

        # statistics and running the agent
        stat = Statistic(sess, conf.env_name, model_dir,
                         pred_network.variables, conf.update_repeat)

        agent = NAF(sess, env, strategy, pred_network, target_network, stat,
                    conf.discount, conf.batch_size, conf.learning_rate,
                    conf.max_steps, conf.update_repeat, conf.max_episodes,
                    private_settings=private_settings)

        # states, actions, rewards = env.readTrainingData()
        # agent.pretune(actions, states, rewards)

        agent.run(conf.monitor, conf.display, conf.is_train)

    # plotting
    print('now plotting')
    rewards = env.rewards
    initial_states = env.initial_conditions

    iterations = []
    finals = []
    starts = []
    # init_states = pd.read_pickle('/Users/shirlaen/PycharmProjects/DeepLearning/spinningup/Environments/initData')

    for i in range(len(rewards)):
        if len(rewards[i]) > 0:
            # Final reward of the episode, negative RMS of its initial state,
            # and the episode length.
            finals.append(rewards[i][-1])
            starts.append(-np.sqrt(np.mean(np.power(initial_states[i], 2))))
            iterations.append(len(rewards[i]))

    plot_suffix = (f', number of iterations: {env.TOTAL_COUNTER},'
                   f' awake time: {env.TOTAL_COUNTER / 600:.1f} h')

    plt.figure(1)
    plt.subplot(211)
    plt.ylim()
    plt.plot(iterations)
    plt.title('Iterations' + plot_suffix)

    plt.subplot(212)
    plt.plot(finals, 'r--')
    plt.plot(starts, c='lime')
    plt.title('Final reward per episode')  # + plot_suffix)
    plt.savefig('progress1')
    plt.show()

    plt.figure()
    plt.scatter(-np.array(starts), -np.array(finals), c="g", alpha=0.5,
                marker=r'$\clubsuit$', label="Luck")
    plt.ylim(0, 21)
    plt.show()
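# A minimal TF1-style entry point, added here as an assumption: the snippet
# does not show how `conf` is built, but a `main(_)` signature together with
# `tf.Session()` suggests the usual `tf.app.flags` / `tf.app.run()` pattern,
# where `tf.app.run()` parses the command-line flags and then calls `main`.
if __name__ == '__main__':
    tf.app.run()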