os.system( "cp /home/gordon/software/simple-rl/srl/basis_functions/simple_basis_functions.py {0}" .format(results_dir)) os.system( "cp /home/gordon/software/simple-rl/srl/environments/cartpole.py {0}" .format(results_dir)) f_returns = open("{0}{1}".format(results_dir, "/EpisodeReturn.fso"), "w", 1) f_num_steps = open("{0}{1}".format(results_dir, "/NumSteps.fso"), "w", 1) # f_timings = open("{0}{1}".format(results_dir, "/AvgStepTime.fso"), "w", 1) # initialise policy and value functions # policy = PyBrainANNApproximator(actor_config["alpha"]) policy = ANNApproximator(12, "sigmoid") # policy = LinearApprox(actor_config) # basis_functions = PolynomialBasisFunctions(idx=run) basis_functions = BasisFunctions(idx=run) cartpole_environment = CartPole(x=0.05, xdot=0.0, theta=-0.05, thetadot=0.0) # traditional TD(lambda) learning algorithm for the critic td_lambda = TDLinear( len( basis_functions.computeFeatures( [0.0 for _ in range(critic_config["num_input_dims"])])), critic_config["alpha"], CONFIG["gamma"], CONFIG["lambda"])
# f_timings = open("{0}{1}".format(results_dir, "/AvgStepTime.fso"), "w", 1)

basis_functions = BasisFunctions(
    resolution=CONFIG["critic_config"]["rbf_basis_resolution"],
    scalar=CONFIG["critic_config"]["rbf_basis_scalar"],
    num_dims=CONFIG["critic_config"]["number_of_dims_in_state"])

# initialise policy and value functions
# policy = PyBrainANNApproximator(actor_config["alpha"])
if CONFIG["policy_type"] == "linear":
    policy = LinearApprox(actor_config, basis_functions=basis_functions)
    # policy.setParams(np.linspace(-0.2, 0.2, CONFIG["critic_config"]["rbf_basis_resolution"]))
elif CONFIG["policy_type"] == "ann":
    policy = ANNApproximator(
        CONFIG["actor_config"]["num_input_dims"],
        CONFIG["actor_config"]["num_hidden_units"],
        hlayer_activation_func="tanh")
    # policy.setParams(list(np.load("/tmp/policy_params0.npy")))
elif CONFIG["policy_type"] == "synth":
    policy = SynthPolicy()

# Initialise the critic algorithm selected by the CONFIG dict
if CONFIG["critic algorithm"] == "trad":
    td_lambda = TDLinear(
        len(basis_functions.computeFeatures(
            [0.0 for _ in range(critic_config["num_input_dims"])])),
        critic_config["alpha"],
        CONFIG["gamma"],
        CONFIG["lambda"])  # call truncated in the source; closed here for valid syntax
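# The resolution/scalar/num_dims arguments above suggest BasisFunctions lays
# a grid of Gaussian radial basis functions over the (normalised) state
# space. A minimal sketch under that assumption -- the parameter names come
# from the call site, but the internals here are illustrative only:

import itertools
import numpy as np

class GaussianRBFGrid(object):
    """Gaussian RBF features on a uniform grid over [-1, 1]^num_dims."""

    def __init__(self, resolution, scalar, num_dims):
        axes = [np.linspace(-1.0, 1.0, resolution)] * num_dims
        self.centres = np.array(list(itertools.product(*axes)))
        self.scalar = scalar  # controls the width of each basis function

    def computeFeatures(self, state):
        diffs = self.centres - np.asarray(state)
        return np.exp(-self.scalar * np.sum(diffs ** 2, axis=1))

# e.g. features for a 2-D state:
# feats = GaussianRBFGrid(resolution=5, scalar=10.0, num_dims=2).computeFeatures([0.1, -0.3])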
class CartPoleSimulation(object):

    def __init__(self):
        args = sys.argv
        if "-r" in args:
            self.results_dir_name = args[args.index("-r") + 1]
        else:
            self.results_dir_name = "cartpole_run"

        self.position_normaliser = DynamicNormalizer([-2.4, 2.4], [-1.0, 1.0])
        self.position_deriv_normaliser = DynamicNormalizer([-1.75, 1.75], [-1.0, 1.0])
        self.angle_normaliser = DynamicNormalizer([-0.25944, 0.25944], [-1.0, 1.0])
        self.angle_deriv_normaliser = DynamicNormalizer([-1.5, 1.5], [-1.0, 1.0])

        self.angle_dt_moving_window = SlidingWindow(5)
        self.last_150_episode_returns = SlidingWindow(150)
        self.last_action = None
        # self.last_action_greedy = None

    def update_critic(self, reward):
        state_t_value = self.approx_critic.computeOutput(list(self.state_t))
        state_t_p1_value = self.approx_critic.computeOutput(list(self.state_t_plus_1))

        if CONFIG["critic algorithm"] == "ann_trad":
            td_error = reward + (CONFIG["gamma"] * state_t_p1_value) - state_t_value
        elif CONFIG["critic algorithm"] == "ann_true":
            td_error = reward + (CONFIG["gamma"] * state_t_p1_value) - \
                self.approx_critic.computeOutputThetaMinusOne(list(self.state_t))

        prev_critic_weights = self.approx_critic.getParams()
        critic_gradient = self.approx_critic.calculateGradient(list(self.state_t))
        self.traces_policy.updateTrace(
            self.approx_policy.calculateGradient(list(self.state_t)), 1.0)

        p = self.approx_critic.getParams()
        if CONFIG["critic algorithm"] == "ann_trad":
            # standard TD(lambda): accumulate the gradient trace, then step
            # along each stored gradient weighted by its trace value
            self.traces_critic.updateTrace(critic_gradient, 1.0)
            X, T = self.traces_critic.getTraces()
            for x, trace in zip(X, T):
                p += critic_config["alpha"] * td_error * (x * trace)
            # self.approx_critic.setParams(prev_critic_weights + CONFIG["critic_config"]["alpha"] * td_error * critic_gradient)
        elif CONFIG["critic algorithm"] == "ann_true":
            # True TD(lambda)
            self.traces_critic.updateTrace(critic_gradient)
            part_1 = td_error * self.traces_critic.e
            part_2 = critic_config["alpha"] * \
                np.dot((self.approx_critic.computeOutputThetaMinusOne(list(self.state_t)) -
                        state_t_value), critic_gradient)
            p += part_1 + part_2
        self.approx_critic.setParams(p)

        return td_error, critic_gradient, state_t_value, state_t_p1_value

    def update_policy(self, td_error, exploration):
        # CACLA updates the actor only on a positive TD error; the
        # "td lambda" rule updates on every step
        if CONFIG["actor update rule"] == "cacla":
            UPDATE_CONDITION = td_error > 0.0
        elif CONFIG["actor update rule"] == "td lambda":
            UPDATE_CONDITION = True

        if UPDATE_CONDITION:
            # get original values
            params = self.approx_policy.getParams()
            old_action = self.approx_policy.computeOutput(list(self.state_t))
            policy_gradient = self.approx_policy.calculateGradient()

            # now update
            if CONFIG["actor update rule"] == "cacla":
                X, T = self.traces_policy.getTraces()
                p = self.approx_policy.getParams()
                for x, trace in zip(X, T):
                    p += actor_config["alpha"] * (x * trace) * exploration
                self.approx_policy.setParams(p)
            else:
                self.approx_policy.setParams(
                    params + actor_config["alpha"] * (policy_gradient * td_error))

    def run(self):
        # Loop over the number of runs
        for run in range(CONFIG["num_runs"]):
            # Create logging directory and files
            results_dir = "/home/gordon/data/tmp/{0}{1}".format(self.results_dir_name, run)
            if not os.path.exists(results_dir):
                os.makedirs(results_dir)
            filename = os.path.basename(sys.argv[0])
            os.system("cp {0} {1}".format(filename, results_dir))

            f_returns = open("{0}{1}".format(results_dir, "/EpisodeReturn.fso"), "w", 1)
            # f_returns_greedy = open("{0}{1}".format(results_dir, "/GreedyEpisodeReturn.fso"), "w", 1)

            # env_name = 'MountainCarContinuous-v0'
            env_name = 'Pendulum-v0'
            self.env = gym.make(env_name)
            self.env = wrappers.Monitor(self.env, directory="{0}_gym".format(results_dir), force=True)
            self.env.seed(0)
            # self.env_greedy = gym.make(env_name)

            # policies and critics
            self.approx_critic = ANNApproximator(actor_config["num_input_dims"],
                                                 actor_config["num_hidden_units"],
                                                 hlayer_activation_func="tanh")
            self.approx_policy = ANNApproximator(actor_config["num_input_dims"],
                                                 actor_config["num_hidden_units"],
                                                 hlayer_activation_func="tanh")
            prev_critic_gradient = np.zeros(self.approx_critic.getParams().shape)

            # Set up trace objects
            if CONFIG["critic algorithm"] == "ann_trad":
                self.traces_critic = Traces(CONFIG["lambda"], CONFIG["min_trace_value"])
            elif CONFIG["critic algorithm"] == "ann_true":
                self.traces_critic = TrueTraces(critic_config["alpha"], CONFIG["gamma"], CONFIG["lambda"])
            self.traces_policy = Traces(CONFIG["lambda"], CONFIG["min_trace_value"])

            for episode_number in range(CONFIG["num_episodes"]):
                reward_cum = 0.0
                if episode_number % CONFIG["log_actions"] == 0:
                    f_actions = open("{0}{1}".format(results_dir, "/actions{0}.csv".format(episode_number)), "w", 1)

                # reset everything for the next episode
                self.traces_critic.reset()
                self.traces_policy.reset()
                observation = self.env.reset()
                episode_ended_learning = False

                for step_number in range(CONFIG["max_num_steps"]):
                    # Update the state for timestep t
                    self.state_t = deepcopy(observation)
                    action_t_deterministic = self.approx_policy.computeOutput(list(self.state_t))

                    # resample exploration every 5 steps; the perturbed action
                    # is held constant in between
                    if step_number % 5 == 0:
                        exploration = np.random.normal(0.0, CONFIG["exploration_sigma"])
                        action_t = np.clip(action_t_deterministic + exploration, -2, 2)

                    observation, reward, episode_ended_learning, diagnostics = self.env.step([action_t])
                    # Update the state for timestep t + 1, after the action is performed
                    self.state_t_plus_1 = deepcopy(observation)

                    if self.last_action is None:
                        self.last_action = action_t
                    # action_diff = self.last_action - action_t_deterministic
                    # reward = self.env.getReward(action_diff)
                    self.last_action = deepcopy(action_t_deterministic)

                    if not episode_ended_learning:
                        # ---- Critic update ----
                        (td_error, critic_gradient, state_t_value,
                         state_tp1_value) = self.update_critic(reward)

                        # ---- Policy update ----
                        self.update_policy(td_error, exploration)

                        # only log the learning actions whilst learning
                        if episode_number % CONFIG["log_actions"] == 0:
                            logging_list = list(self.state_t)
                            logging_list.append(exploration)
                            logging_list.append(reward)
                            logging_list.append(td_error)
                            logging_list.append(state_t_value)
                            logging_list.append(state_tp1_value)
                            logging_list.append(action_t)
                            logging_list.append(action_t_deterministic)
                            action_logging_format = "{" + "}\t{".join(
                                [str(logging_list.index(el)) for el in logging_list]) + "}\n"
                            f_actions.write(action_logging_format.format(*logging_list))

                        prev_critic_gradient = deepcopy(critic_gradient)
                        reward_cum += reward

                    self.state_t = deepcopy(self.state_t_plus_1)
                    if episode_ended_learning:
                        # episode complete, start a new one
                        break

                # episode either ended early due to failure or completed the max number of steps
                print("Episode ended - Learning {0} {1}".format(episode_number, reward_cum))
                # the original averaged reward_cum_greedy, which is never
                # accumulated with the greedy env disabled; average the
                # learning return instead
                last_150_avg = sum(self.last_150_episode_returns.getWindow(reward_cum)) / 150.0
                f_returns.write("{0}\t{1}\n".format(episode_number, reward_cum))
                if last_150_avg > 1995:
                    break
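# The update_policy method above is the trace-based variant of CACLA
# (van Hasselt & Wiering, 2007). Stripped of traces, the core rule is: when
# the TD error is positive, move the actor's output towards the action that
# was actually explored. A minimal sketch with an illustrative linear actor
# (not the ANNApproximator API):

import numpy as np

def cacla_update(w, phi_s, action_explored, td_error, alpha=0.01):
    """One CACLA actor step for a linear policy a = w . phi(s)."""
    if td_error > 0.0:  # exploration did better than expected
        action_pred = np.dot(w, phi_s)
        # regress the policy towards the explored action
        w = w + alpha * (action_explored - action_pred) * phi_s
    return w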
# Create logging directory and files
results_dir = "/tmp/{0}{1}".format(results_dir_name, run)
if not os.path.exists(results_dir):
    os.makedirs(results_dir)
filename = os.path.basename(sys.argv[0])
os.system("cp {0} {1}".format(filename, results_dir))
os.system("cp /home/gordon/software/simple-rl/srl/basis_functions/simple_basis_functions.py {0}".format(results_dir))
os.system("cp /home/gordon/software/simple-rl/srl/environments/cartpole.py {0}".format(results_dir))

f_returns = open("{0}{1}".format(results_dir, "/EpisodeReturn.fso"), "w", 1)
f_num_steps = open("{0}{1}".format(results_dir, "/NumSteps.fso"), "w", 1)
f_timings = open("{0}{1}".format(results_dir, "/AvgStepTime.fso"), "w", 1)

# initialise policy and value functions
# policy = LinearApprox(actor_config)
policy = ANNApproximator(4, controlled_var[run], hlayer_activation_func="tanh")

# basis_functions = PolynomialBasisFunctions(idx=run)
basis_functions = BasisFunctions(idx=0)

cartpole_environment = CartPoleEnvironment()

# true-online TD(lambda) learning algorithm for the critic
# td_lambda = Traditional_TD_LAMBDA(ANNApproximator(critic_config["alpha"]), CONFIG["lambda"], CONFIG["min_trace_value"])
# tmp_critic_config = deepcopy(critic_config)
# tmp_critic_config["alpha"] = critic_config["alpha"][run]
td_lambda = TrueOnlineTDLambda(basis_functions, critic_config, CONFIG["lambda"], CONFIG["gamma"])
# td_lambda = ANNApproximator(critic_config["alpha"])

# Loop over the number of episodes
for episode_number in range(CONFIG["num_episodes"]):
    global episode_number, results_dir
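# TrueOnlineTDLambda presumably implements the true-online TD(lambda)
# algorithm of van Seijen & Sutton (2014). A self-contained sketch of that
# update for a linear value function (names are illustrative, not the
# simple-rl API). Unlike the accumulating trace above, it uses a "dutch"
# trace plus a correction term involving the previous value estimate v_old:

import numpy as np

def true_online_td_step(theta, z, v_old, phi_s, phi_s_next, reward,
                        alpha=0.1, gamma=0.99, lam=0.8):
    """One true-online TD(lambda) step; returns (theta, z, next v_old)."""
    v = np.dot(theta, phi_s)
    v_next = np.dot(theta, phi_s_next)
    delta = reward + gamma * v_next - v
    # dutch trace in place of the accumulating trace
    z = gamma * lam * z + (1.0 - alpha * gamma * lam * np.dot(z, phi_s)) * phi_s
    theta = theta + alpha * (delta + v - v_old) * z - alpha * (v - v_old) * phi_s
    return theta, z, v_next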
class NessieRlSimulation(object):

    def __init__(self):
        args = sys.argv
        if "-r" in args:
            self.results_dir_name = args[args.index("-r") + 1]
        else:
            self.results_dir_name = "nessie_run"

        self.position_normaliser = DynamicNormalizer([-2.4, 2.4], [-1.0, 1.0])
        self.position_deriv_normaliser = DynamicNormalizer([-1.75, 1.75], [-1.0, 1.0])
        self.angle_normaliser = DynamicNormalizer([-3.14, 3.14], [-1.0, 1.0])
        self.angle_deriv_normaliser = DynamicNormalizer([-0.02, 0.02], [-1.0, 1.0])

        self.angle_dt_moving_window = SlidingWindow(5)
        self.last_150_episode_returns = SlidingWindow(150)

        self.thrusters = Thrusters()
        self.env = ROSBehaviourInterface()
        self.environment_info = EnvironmentInfo()
        self.ounoise = OUNoise()

        self.prev_action = 0.0

    def update_critic(self, reward, update):
        state_t_value = self.approx_critic.computeOutput(list(self.state_t.values()))
        state_t_p1_value = self.approx_critic.computeOutput(list(self.state_t_plus_1.values()))
        # print("state t: {0}".format(state_t_value))
        # print("state tp1: {0}".format(state_t_p1_value))

        if CONFIG["critic algorithm"] == "ann_trad":
            td_error = reward + (CONFIG["gamma"] * state_t_p1_value) - state_t_value
        elif CONFIG["critic algorithm"] == "ann_true":
            td_error = reward + (CONFIG["gamma"] * state_t_p1_value) - \
                self.approx_critic.computeOutputThetaMinusOne(list(self.state_t.values()))

        prev_critic_weights = self.approx_critic.getParams()
        critic_gradient = self.approx_critic.calculateGradient(list(self.state_t.values()))
        self.traces_policy.updateTrace(
            self.approx_policy.calculateGradient(list(self.state_t.values())), 1.0)

        if update:
            p = self.approx_critic.getParams()
            if CONFIG["critic algorithm"] == "ann_trad":
                # standard TD(lambda)
                self.traces_critic.updateTrace(critic_gradient, 1.0)
                X, T = self.traces_critic.getTraces()
                for x, trace in zip(X, T):
                    p += critic_config["alpha"] * td_error * (x * trace)
                # self.approx_critic.setParams(prev_critic_weights + CONFIG["critic_config"]["alpha"] * td_error * critic_gradient)
            elif CONFIG["critic algorithm"] == "ann_true":
                # True TD(lambda)
                self.traces_critic.updateTrace(critic_gradient)
                part_1 = td_error * self.traces_critic.e
                part_2 = critic_config["alpha"] * \
                    np.dot((self.approx_critic.computeOutputThetaMinusOne(list(self.state_t.values())) -
                            state_t_value), critic_gradient)
                p += part_1 + part_2
            self.approx_critic.setParams(p)

        return (td_error, critic_gradient, state_t_value, state_t_p1_value)

    def update_state_t(self):
        raw_angle = deepcopy(self.environment_info.raw_angle_to_goal)
        self.state_t = {
            "angle": self.angle_normaliser.scale_value(raw_angle),
            "angle_deriv": self.prev_angle_dt_t
        }
        self.prev_angle_dt_t = deepcopy(raw_angle)

    def update_state_t_p1(self):
        raw_angle = deepcopy(self.environment_info.raw_angle_to_goal)
        angle_tp1 = self.angle_normaliser.scale_value(raw_angle)
        angle_t = self.state_t["angle"]
        # if (abs(angle_t)) > 0.5:
        #     if angle_t > 0 and angle_tp1 < 0:
        #         angle_change = (1.0 - angle_t) + (-1.0 - angle_tp1)
        #     elif angle_t < 0 and angle_tp1 > 0:
        #         angle_change = (1.0 - angle_tp1) + (-1.0 - angle_t)
        #     else:
        #         angle_change = angle_tp1 - angle_t
        # else:

        # the sign encodes whether the heading error is shrinking (+) or
        # growing (-) between timesteps
        abs_angle_tp1 = np.abs(angle_tp1)
        abs_angle_t = np.abs(angle_t)
        if abs_angle_tp1 > abs_angle_t:
            sign = -1
        else:
            sign = 1
        angle_change = sign * abs(abs_angle_tp1 - abs_angle_t)
        # print("angle t: {0}".format(abs_angle_t))
        # print("angle tp1: {0}".format(abs_angle_tp1))
        # print("angle change: {0}".format(angle_change))

        # smooth the angle derivative over a 5-sample sliding window
        tmp_angle_change = sum(self.angle_dt_moving_window.getWindow(angle_change)) / 5.0

        self.state_t_plus_1 = {
            "angle": self.angle_normaliser.scale_value(raw_angle),
            "angle_deriv": self.angle_deriv_normaliser.scale_value(tmp_angle_change)
        }
        self.prev_angle_dt_t = self.angle_deriv_normaliser.scale_value(tmp_angle_change)

    def update_policy(self, td_error, exploration):
        # CACLA updates the actor only on a positive TD error; the
        # "td lambda" rule updates on every step
        if CONFIG["actor update rule"] == "cacla":
            UPDATE_CONDITION = td_error > 0.0
        elif CONFIG["actor update rule"] == "td lambda":
            UPDATE_CONDITION = True

        if UPDATE_CONDITION:
            # get original values
            params = self.approx_policy.getParams()
            old_action = self.approx_policy.computeOutput(list(self.state_t.values()))
            policy_gradient = self.approx_policy.calculateGradient()

            # now update
            if CONFIG["actor update rule"] == "cacla":
                X, T = self.traces_policy.getTraces()
                p = self.approx_policy.getParams()
                for x, trace in zip(X, T):
                    p += actor_config["alpha"] * (x * trace) * exploration
                self.approx_policy.setParams(p)
            else:
                self.approx_policy.setParams(
                    params + actor_config["alpha"] * (policy_gradient * td_error))

    def run(self):
        # Loop over the number of runs
        if CONFIG["test_policy"]:
            runs = TEST_CONFIG["run_numbers"]
        else:
            runs = range(CONFIG["num_runs"])

        for run in runs:
            if CONFIG["test_policy"]:
                self.results_dir_name = "nessie_validate_{0}".format(TEST_CONFIG["folder"])
                results_to_load_directory = "/tmp/{0}{1}".format(TEST_CONFIG["folder"], run)

            # Create logging directory and files
            results_dir = "/home/gordon/data/tmp/{0}{1}".format(self.results_dir_name, run)
            if not os.path.exists(results_dir):
                os.makedirs(results_dir)
            filename = os.path.basename(sys.argv[0])
            os.system("cp {0} {1}".format(filename, results_dir))
            os.system("cp /home/gordon/rosbuild_ws/ros_simple_rl/src/srl/environments/ros_behaviour_interface.py {0}".format(results_dir))
            os.system("cp /home/gordon/rosbuild_ws/ros_simple_rl/src/utilities/orstein_exploration.py {0}".format(results_dir))
            if CONFIG["test_policy"]:
                os.system("cp {0}/Epi* {1}/LearningEpisodeReturn.fso".format(results_to_load_directory, results_dir))
                os.system("cp {0}/basic* {1}/LearningMainScript.py".format(results_to_load_directory, results_dir))

            f_returns = open("{0}{1}".format(results_dir, "/EpisodeReturn.fso"), "w", 1)

            # policies and critics
            self.approx_critic = ANNApproximator(actor_config["num_input_dims"],
                                                 actor_config["num_hidden_units"],
                                                 hlayer_activation_func="tanh")
            if not CONFIG["generate_initial_weights"]:
                critic_init = "/home/gordon/data/tmp/critic_params_48h.npy"
                self.approx_critic.setParams(list(np.load(critic_init)))
            self.approx_policy = ANNApproximator(actor_config["num_input_dims"],
                                                 actor_config["num_hidden_units"],
                                                 hlayer_activation_func="tanh")
            if not CONFIG["generate_initial_weights"]:
                policy_init = "/home/gordon/data/tmp/initial_2dim_48h_policy_params.npy"
                self.approx_policy.setParams(list(np.load(policy_init)))
            # if CONFIG["test_policy"] is True:
            #     if not os.path.exists("/tmp/{0}".format(results_to_validate)):
            #         continue

            prev_critic_gradient = np.zeros(self.approx_critic.getParams().shape)

            # Set up trace objects
            if CONFIG["critic algorithm"] == "ann_trad":
                self.traces_critic = Traces(CONFIG["lambda"], CONFIG["min_trace_value"])
            elif CONFIG["critic algorithm"] == "ann_true":
                self.traces_critic = TrueTraces(critic_config["alpha"], CONFIG["gamma"], CONFIG["lambda"])
            self.traces_policy = Traces(CONFIG["lambda"], CONFIG["min_trace_value"])

            exploration_sigma = CONFIG["exploration_sigma"]

            for episode_number in range(CONFIG["num_episodes"]):
                if CONFIG["test_policy"] and episode_number not in TEST_CONFIG["episode_numbers"]:
                    # when testing policies, skip any episode whose policy is
                    # not in the list to test
                    continue

                reward_cum = 0.0
                if episode_number % CONFIG["log_actions"] == 0:
                    f_actions = open("{0}{1}".format(results_dir, "/actions{0}.csv".format(episode_number)), "w", 1)

                # If testing a learnt policy, load it
                if CONFIG["test_policy"]:
                    policy_to_load = "{0}/policy_params{1}.npy".format(results_to_load_directory, episode_number)
                    critic_to_load = "{0}/critic_params{1}.npy".format(results_to_load_directory, episode_number)
                    print("policy_to_load: {0}".format(policy_to_load))
                    self.approx_policy.setParams(list(np.load(policy_to_load)))
                    self.approx_critic.setParams(list(np.load(critic_to_load)))

                # reset everything for the next episode
                self.traces_critic.reset()
                self.traces_policy.reset()
                # self.env.nav_reset()
                self.env.reset()
                self.ounoise.reset()
                self.angle_dt_moving_window.reset()
                episode_ended_learning = False

                # if episode_number > 5 and exploration_sigma > 0.1:
                exploration_sigma *= CONFIG["exploration_decay"]
                self.prev_angle_dt_t = 0.0
                self.prev_angle_dt_tp1 = 0.0

                if CONFIG["generate_initial_weights"]:
                    self.approx_policy = ANNApproximator(actor_config["num_input_dims"],
                                                         actor_config["num_hidden_units"],
                                                         hlayer_activation_func="tanh")

                for step_number in range(CONFIG["max_num_steps"]):
                    # Update the state for timestep t
                    self.update_state_t()
                    action_t_deterministic = self.approx_policy.computeOutput(list(self.state_t.values()))

                    # if episode_number > 9:
                    #     control_rate = 0.5
                    # else:
                    control_rate = 3
                    # resample exploration once per control period; the action
                    # is held constant in between
                    if step_number % (control_rate * CONFIG["spin_rate"]) == 0:
                        # exploration = np.random.normal(0.0, exploration_sigma)
                        tmp_action = self.ounoise.get_action(action_t_deterministic)[0]
                        exploration = tmp_action - action_t_deterministic
                        # exploration = self.ounoise.function(action_t_deterministic, 0, 0.2, 0.1)[0]

                    if not CONFIG["generate_initial_weights"] and not CONFIG["test_policy"]:
                        action_t = np.clip(action_t_deterministic + exploration, -10, 10)
                    else:
                        action_t = np.clip(action_t_deterministic, -10, 10)

                    # TODO - investigate what happens with the action!!!
                    self.env.performAction("gaussian_variance", action_t)
                    # TODO - time rather than rospy.sleep?!
                    time.sleep(1.0 / CONFIG["spin_rate"])

                    # Update the state for timestep t + 1, after the action is performed
                    self.update_state_t_p1()

                    to_end = False
                    # if self.state_t["angle"] > 0.9:
                    #     reward = -10
                    #     to_end = True
                    # else:
                    reward = self.env.getReward(self.state_t_plus_1, action_t)

                    if not episode_ended_learning:
                        if not CONFIG["generate_initial_weights"]:
                            # ---- Critic update ----
                            (td_error, critic_gradient, state_t_value,
                             state_tp1_value) = self.update_critic(reward, not CONFIG["test_policy"])

                            if episode_number % CONFIG["log_actions"] == 0:
                                if step_number == 0:
                                    state_keys = list(self.state_t.keys())
                                    state_keys.append("exploration")
                                    state_keys.append("reward")
                                    state_keys.append("tde")
                                    state_keys.append("st")
                                    state_keys.append("stp1")
                                    state_keys.append("explore_action")
                                    state_keys.append("action")
                                    label_logging_format = "#{" + "}\t{".join(
                                        [str(state_keys.index(el)) for el in state_keys]) + "}\n"
                                    f_actions.write(label_logging_format.format(*state_keys))

                                logging_list = list(self.state_t.values())
                                logging_list.append(exploration)
                                logging_list.append(reward)
                                logging_list.append(td_error)
                                logging_list.append(state_t_value)
                                logging_list.append(state_tp1_value)
                                logging_list.append(action_t)
                                logging_list.append(action_t_deterministic)
                                action_logging_format = "{" + "}\t{".join(
                                    [str(logging_list.index(el)) for el in logging_list]) + "}\n"
                                f_actions.write(action_logging_format.format(*logging_list))

                            if not CONFIG["test_policy"]:
                                # ---- Policy update ----
                                self.update_policy(td_error, exploration)

                            prev_critic_gradient = deepcopy(critic_gradient)
                        reward_cum += reward

                    if to_end:
                        reward_cum = -3000  # for logging only
                        break

                # TODO - add check for if episode ended early, i.e. moving average
                """
                episode_ended_learning = self.env.episodeEnded()
                if episode_ended_learning:
                    # episode complete, start a new one
                    break
                """

                # episode either ended early due to failure or completed the max number of steps
                print("Episode ended - Learning {0} {1}".format(episode_number, reward_cum))
                f_returns.write("{0}\t{1}\n".format(episode_number, reward_cum))
                np.save("{0}/policy_params{1}".format(results_dir, episode_number),
                        self.approx_policy.getParams())
                np.save("{0}/critic_params{1}".format(results_dir, episode_number),
                        self.approx_critic.getParams())
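# The OUNoise exploration used in run() above is presumably the standard
# Ornstein-Uhlenbeck process, which yields temporally correlated noise suited
# to slow physical actuators such as thrusters. A minimal sketch, assuming
# the usual formulation (the real OUNoise class may differ; only the
# get_action call shape is taken from the code above):

import numpy as np

class OrnsteinUhlenbeckNoise(object):
    """dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)"""

    def __init__(self, mu=0.0, theta=0.15, sigma=0.2, dt=1.0):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.reset()

    def reset(self):
        # restart the process at its mean for each episode
        self.x = self.mu

    def get_action(self, deterministic_action):
        self.x += self.theta * (self.mu - self.x) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.randn()
        # returned as a sequence to match the [0]-indexing at the call site
        return [deterministic_action + self.x]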