class AgentPolicyGradient:
    def __init__(self, n_x, n_y, learning_rate=0.02, reward_decay=0.99,
                 load_path=None, save_path=None):
        self.PG = PolicyGradient(n_x, n_y,
                                 learning_rate=learning_rate,
                                 reward_decay=reward_decay,
                                 load_path=load_path,
                                 save_path=save_path)

    def choose_action(self, observation):
        return self.PG.choose_action(observation)

    def store_transition(self, s, a, r):
        return self.PG.store_transition(s, a, r)

    def learn(self):
        return self.PG.learn()

    def plot_cost(self):
        import matplotlib
        matplotlib.use('TkAgg')
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.PG.cost_history)), self.PG.cost_history)
        plt.ylabel('Cost Ex')
        plt.xlabel('Training Steps Ex')
        plt.show()

    def crashed(self):
        episode_rewards_sum = sum(self.PG.episode_rewards)
        return episode_rewards_sum < -250

    def episode_reward(self):
        episode_rewards_sum = sum(self.PG.episode_rewards)
        return episode_rewards_sum

    def costs(self):
        return self.PG.costs()
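# All of the examples in this collection eventually call PolicyGradient.learn().
# A minimal sketch of the discounted, normalized return computation that
# REINFORCE-style learn() implementations of this kind typically perform; the
# helper name discount_and_norm_rewards is an assumption, not the snippets' API.
import numpy as np

def discount_and_norm_rewards(episode_rewards, gamma=0.99):
    """Compute normalized discounted returns G_t for one episode."""
    discounted = np.zeros(len(episode_rewards), dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(episode_rewards))):
        running_add = running_add * gamma + episode_rewards[t]
        discounted[t] = running_add
    # Normalize to zero mean / unit variance to reduce gradient variance.
    discounted -= discounted.mean()
    if discounted.std() > 0:
        discounted /= discounted.std()
    return discounted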
                   n_y=env.action_space.n,
                   learning_rate=0.01,
                   reward_decay=0.95,
                   load_path=load_path,
                   save_path=save_path)

for episode in range(EPISODES):  # start of training
    observation = env.reset()
    episode_reward = 0

    while True:
        if RENDER_ENV:
            env.render()

        # 1. Choose an action based on observation
        action = PG.choose_action(observation)

        # 2. Take action in the environment
        observation_, reward, done, info = env.step(action)

        # 3. Store transition for training
        PG.store_transition(observation, action, reward)

        if done:
            episode_rewards_sum = sum(PG.episode_rewards)
            rewards.append(episode_rewards_sum)
            max_reward_so_far = np.amax(rewards)

            print("==========================================")
            print("Episode: ", episode)
            print("Reward: ", episode_rewards_sum)
def simulation():
    users_num = 1
    action_rewards = [10, 9, 1, 1, 1, 1, 1, 1, 1, 1]
    actions = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    observations = [[random.randint(0, i * 10) for i in range(1, 4)]
                    for j in range(1, 101)]

    # number of items to recommend
    K = 2

    load_version = 1
    save_version = load_version + 1
    load_path = "output/weights/topk{}.ckpt".format(load_version)
    save_path = "output/weights/topk{}.ckpt".format(save_version)

    EPISODES = 5000
    RENDER_ENV = True
    rewards = []

    PG = PolicyGradient(n_x=len(observations[0]),
                        n_y=len(actions),
                        s0=observations[random.randint(0, len(observations) - 1)],
                        learning_rate=0.005,
                        reward_decay=1,
                        load_path=None,
                        save_path=save_path,
                        weight_capping_c=2**3,
                        k=K,
                        b_distribution='uniform')

    for episode in range(EPISODES):
        episode_reward = 0
        tic = time.clock()
        done = False

        while True:
            '''
            TODO: initialize the env
            '''
            if RENDER_ENV:
                observation = observations[random.randint(0, len(observations) - 1)]

            # 1. Choose an action based on observation
            # action = PG.uniform_choose_action(observation)
            action = PG.choose_action(observation)

            # 2. Take action in the environment
            observation_, reward = observations[random.randint(0, len(observations) - 1)], \
                action_rewards[action]

            # 4. Store transition for training
            PG.store_transition(observation, action, reward)

            toc = time.clock()
            elapsed_sec = toc - tic
            if elapsed_sec > 120:
                done = True
            if len(PG.episode_observations) > 100:
                done = True

            if done:
                episode_rewards_sum = sum(PG.episode_rewards)
                rewards.append(episode_rewards_sum)
                max_reward_so_far = np.amax(rewards)
                PG.cost_history.append(episode_rewards_sum)

                print("==========================================")
                print("Episode: ", episode)
                print("Seconds: ", elapsed_sec)
                print("Reward: ", episode_rewards_sum)
                print("Max reward so far: ", max_reward_so_far)
                # print(PG.outputs_softmax)
                print("distribution at {} is :{}".format(PG.s0, PG.get_distribution(PG.s0)))

                # 5. Train neural network
                discounted_episode_rewards_norm = PG.learn()
                break

            # Save new observation
            observation = observation_

    PG.plot_cost()

    plt.bar(actions, PG.get_distribution(PG.s0))
    plt.xlabel("action")        # x-axis label
    plt.ylabel("probability")   # y-axis label
    plt.title("top-k correction policy")
    plt.show()
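# The weight_capping_c and k arguments above suggest a top-K off-policy correction
# (capped importance weights multiplied by a top-K factor). A rough sketch of that
# per-sample weight, assuming hypothetical names pi_a / beta_a for the target and
# behaviour policy probabilities of the chosen action (not this PolicyGradient's API):
import numpy as np

def topk_corrected_weight(pi_a, beta_a, k, cap_c):
    importance_weight = np.minimum(pi_a / beta_a, cap_c)  # capped importance sampling
    lambda_k = k * (1.0 - pi_a) ** (k - 1)                # top-K correction multiplier
    return importance_weight * lambda_k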
env.seed(1)
env = env.unwrapped

PG = PolicyGradient(n_actions=env.action_space.n,
                    n_features=env.observation_space.shape[0],
                    lr=0.02,
                    gamma=0.99,
                    output_graph=FLAGS.output_graph)

for i in range(FLAGS.episode):
    s = env.reset()

    while True:
        if RENDER:
            env.render()

        action = PG.choose_action(s)
        s_, r, done, info = env.step(action)
        PG.store_transition(s_, action, r)

        if done:
            episode_rs_sum = sum(PG.ep_rs)

            if 'running_reward' not in globals():
                running_reward = episode_rs_sum
            else:
                running_reward = running_reward * 0.99 + episode_rs_sum * 0.01

            if running_reward > FLAGS.display_threshold:
                RENDER = True

            print('episode:', i, ' reward:', running_reward)

            norm_reward = PG.learn()

            if i == 30:
def simulation():
    users_num = 1
    '''
    action_rewards = {'11':4,'12':1,'13':1,'14':1,'21':1,'22':2,'23':3,'24':16,'31':1,'32':2,'33':3,'34':4}
    observation_action_transfer = {'11':[2],'12':[2],'13':[2],'14':[2],'21':[3],'22':[3],'23':[3],'24':[3],
                                   '31':[1],'32':[1],'33':[3],'34':[3]}
    actions = [1,2,3,4]
    observations = [[1],[2],[3]]
    '''
    action_rewards = {'11': 5, '12': 0, '13': 0, '14': 0, '15': 0, '16': 13,
                      '21': 10, '22': 0, '23': 0, '24': 0, '25': 0, '26': 8}
    observation_action_transfer = {'11': [1, 1], '12': [1, 1], '13': [1, 1], '14': [1, 1], '15': [1, 1], '16': [1, 1],
                                   '21': [1, 1], '22': [1, 1], '23': [1, 1], '24': [1, 1], '25': [1, 1], '26': [0, 1]}
    actions = [1, 2, 3, 4, 5, 6]
    observations = [[0, 1], [1, 1]]

    # number of items to recommend
    K = 2

    load_version = 4
    save_version = load_version + 1
    load_path = "output/weights/topk{}.ckpt".format(load_version)
    save_path = "output/weights/topk{}.ckpt".format(save_version)

    EPISODES = 3000
    RENDER_ENV = True
    rewards = []

    PG = PolicyGradient(n_x=len(observations[0]),
                        n_y=len(actions),
                        s0=observations[-1],
                        learning_rate=0.001,
                        reward_decay=1,
                        load_path=None,
                        save_path=save_path,
                        weight_capping_c=2**3,
                        k=K,
                        b_distribution='uniform')

    for episode in range(EPISODES):
        episode_reward = 0
        tic = time.clock()
        done = False

        while True:
            '''
            TODO: initialize the env
            '''
            if RENDER_ENV:
                observation = PG.episode_observations[-1]
                # print(observation)

            # 1. Choose an action based on observation
            # action = PG.uniform_choose_action(observation)
            action = PG.choose_action(observation)

            # 2. Take action in the environment
            observation_, reward = observation_action_transfer[str(sum(observation)) + str(actions[action])], \
                action_rewards[str(sum(observation)) + str(actions[action])]

            # 4. Store transition for training
            PG.store_transition(observation_, action, reward)
            # print(PG.episode_observations)
            # print(PG.episode_actions)
            # print(PG.episode_rewards)

            toc = time.clock()
            elapsed_sec = toc - tic
            if elapsed_sec > 120:
                done = True
            if len(PG.episode_observations) > 100:
                done = True

            if done:
                episode_rewards_sum = sum(PG.episode_rewards)
                rewards.append(episode_rewards_sum)
                max_reward_so_far = np.amax(rewards)
                PG.cost_history.append(episode_rewards_sum)

                print("==========================================")
                print("Episode: ", episode)
                print("Seconds: ", elapsed_sec)
                print("Reward: ", episode_rewards_sum)
                print("Max reward so far: ", max_reward_so_far)
                # print(PG.outputs_softmax)
                # print(PG.episode_rewards)

                # 5. Train neural network
                print("distribution at {} is :{}".format(observations[0], PG.get_distribution(observations[0])))
                print("distribution at {} is :{}".format(observations[1], PG.get_distribution(observations[1])))
                discounted_episode_rewards_norm = PG.learn()
                break

            # Save new observation
            observation = observation_

    PG.plot_cost()

    plt.bar(actions, PG.get_distribution(observations[0]))
    plt.xlabel("action at state [0, 1]")   # x-axis label
    plt.ylabel("probability")              # y-axis label
    plt.title("policy distribution at state [0, 1]")
    plt.show()

    plt.bar(actions, PG.get_distribution(observations[1]))
    plt.xlabel("action at state [1, 1]")   # x-axis label
    plt.ylabel("probability")              # y-axis label
    plt.title("policy distribution at state [1, 1]")
    plt.show()
def train(self, max_episode=10, max_path_length=200, verbose=0):
    env = self.env
    avg_reward_sum = 0.

    # f_eps = open("episode.csv", "w")
    # write_eps = csv.write(f_eps)

    for e in range(max_episode):
        env._reset()
        observation = env._reset()
        game_over = False
        reward_sum = 0

        inputs = []
        outputs = []
        predicteds = []
        rewards = []

        # f_iter = open("episode_{0}.csv".format(e), "w")
        # write_iter = csv.writer(f_iter)
        f_episode = "episode_{0}.csv".format(e)
        os.system("rm -rf {0}".format(f_episode))

        print(observation[0].shape, observation[1].shape)

        RL = PolicyGradient(
            n_actions=self.env.action_space.n,
            # n_features=observation.shape[0],
            learning_rate=0.02,
            reward_decay=0.995,
            # output_graph=True,
        )

        while not game_over:
            action, aprob = RL.choose_action(observation)

            inputs.append(observation)
            predicteds.append(aprob)

            y = np.zeros([self.env.action_space.n])
            y[action] = 1.
            outputs.append(y)

            observation, reward, actual_reward, game_over, info = self.env._step(action)
            reward_sum += float(actual_reward)
            # rewards.append(float(reward))
            rewards.append(float(reward_sum))

            RL.store_transition(observation, action, rewards)

            # check memory for RNN model
            if len(inputs) > self.max_memory:
                del inputs[0]
                del outputs[0]
                del predicteds[0]
                del rewards[0]

            if verbose > 0:
                if env.actions[action] == "LONG" or env.actions[action] == "SHORT":
                    # if env.actions[action] == "LONG" or env.actions[action] == "SHORT" or env.actions[action] == "HOLD":
                    color = bcolors.FAIL if env.actions[action] == "LONG" else bcolors.OKBLUE
                    print("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], color + env.actions[action] + bcolors.ENDC,
                                                     reward_sum, info["cum"]) +
                          ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])))
                    # write_iter.writerow("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], env.actions[action], reward_sum, info["cum"]) + ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])))
                    os.system("echo %s >> %s" % ("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], env.actions[action],
                                                                            reward_sum, info["cum"]) +
                                                 ("\t".join(["%s:%.2f" % (l, i)
                                                             for l, i in zip(env.actions, aprob.tolist())])),
                                                 f_episode))

        avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
        toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (e, info["code"],
                                              (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) +
                                              ("%.2f" % reward_sum) + bcolors.ENDC,
                                              info["cum"], avg_reward_sum)
        print(toPrint)
        if self.history_filename != None:
            os.system("echo %s >> %s" % (toPrint, self.history_filename))

        discounted_rewards_ = RL.learn()  # train

        dim = len(inputs[0])
        inputs_ = [[] for i in range(dim)]
        for obs in inputs:
            for i, block in enumerate(obs):
                inputs_[i].append(block[0])
        inputs_ = [np.array(inputs_[i]) for i in range(dim)]

        outputs_ = np.vstack(outputs)
        predicteds_ = np.vstack(predicteds)
        rewards_ = np.vstack(rewards)

        print("shape: ", np.shape(rewards), np.shape(discounted_rewards_))

        # outputs_ *= discounted_rewards_
        for i, r in enumerate(zip(rewards, discounted_rewards_)):
            reward, discounted_reward = r

            if verbose > 1:
                # print (outputs_[i],)
                print(outputs_[i])

            if verbose > 0:
                print(predicteds_[i], outputs_[i], reward, discounted_reward)

        print("fit model input.shape %s, output.shape %s" %
              ([inputs_[i].shape for i in range(len(inputs_))], outputs_.shape))

        np.set_printoptions(linewidth=200, suppress=True)
        print("currentTargetIndex:", env.currentTargetIndex)
class runPG():
    n_inputs = 4
    n_outputs = 4    # right and left for each finger
    # n_outputs = 8  # right, left and stop for each finger

    net = 0
    X = 0
    A = np.array([[-1, -1], [-1, 1], [1, -1], [1, 1], [0, -1], [0, 1], [-1, 0], [1, 0]])
    mode = 5
    reward_mode = 2
    R = []

    gripper_closed = False
    stLearning = True
    possible_plot = False

    def __init__(self):
        rospy.init_node('runPG', anonymous=True)

        if self.mode == 5:
            self.n_inputs = 4
        if self.mode == 8:
            self.n_inputs = 8

        self.RL = PolicyGradient(
            n_actions=self.n_outputs,
            n_features=self.n_inputs,
            learning_rate=0.02,
            reward_decay=0.99,
            load_saved_net=False,
            # output_graph=True,
        )

        rospy.Subscriber('/RL/gripper_status', String, self.callbackGripperStatus)
        rospy.Service('/RL/net', net_eval, self.EvalNet)
        rospy.Service('/RL/start_learning', Empty, self.start_learning)
        obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        open_srv = rospy.ServiceProxy('/RL/OpenGripper', Empty)
        close_srv = rospy.ServiceProxy('/RL/CloseGripper', Empty)

        rospy.sleep(3)
        o = open_srv()

        episode_count = 0
        rate = rospy.Rate(15)  # 15hz
        while not rospy.is_shutdown():

            if self.stLearning:
                ## Start episode ##
                episode_count += 1

                # Close gripper
                raw_input("Place object between fingers and press Enter to close gripper...")
                close_srv()
                while not self.gripper_closed:
                    rate.sleep()
                raw_input("Remove table and press Enter to start episode...")

                # Get observation
                obs = np.array(obs_srv().state)

                self.VT = []
                while True:
                    # Choose action
                    action = self.RL.choose_action(obs)

                    # Act
                    suc = move_srv(self.A[action]).success
                    rospy.sleep(0.05)
                    rate.sleep()

                    if suc:
                        # Get observation
                        obs_ = np.array(obs_srv().state)
                        fail = drop_srv().dropped  # Check if dropped - end of episode
                    else:
                        # End episode if overload or angle limits reached
                        rospy.logerr('[RL] Failed to move gripper. Episode declared failed.')
                        fail = True

                    reward, done = self.transition_reward(obs_, fail)

                    self.RL.store_transition(obs, action, reward)

                    obs = obs_

                    if done:
                        ep_rs_sum = sum(self.RL.ep_rs)

                        # NOTE: running_reward is a local here, so unless a module-level
                        # variable with that name exists this check is always True and the
                        # running average is reset every episode.
                        if 'running_reward' not in globals():
                            running_reward = ep_rs_sum
                        else:
                            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01

                        print("*** episode: " + str(episode_count) + ", episode reward: " + str(ep_rs_sum) +
                              ", running reward: " + str(int(running_reward)) + " ***")

                        vt = self.RL.learn()
                        self.R.append(running_reward)
                        self.possible_plot = True
                        break

                    rate.sleep()

            elif self.possible_plot:
                self.plot_sav()
                self.possible_plot = False

            # Open gripper
            if self.gripper_closed:
                o = open_srv()
                rospy.sleep(0.2)

            # self.stLearning = False
            # print(obs_srv().state)
            # rospy.spin()
            rate.sleep()

    def plot_sav(self):
        plt.plot(range(len(self.R)), self.R)  # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()

    def EvalNet(self, msg):
        a = 0
        return {'action': a}

    def callbackGripperStatus(self, msg):
        self.gripper_closed = msg.data == "closed"

    def start_learning(self, msg):
        self.stLearning = not self.stLearning
        return EmptyResponse()

    def transition_reward(self, obs, fail):
        # Keep moving as much as possible
        if self.reward_mode == 1:
            if fail:
                reward = 0.
            else:
                reward = 1.
            done = fail

        # Get to a certain coordinate
        if self.reward_mode == 2:
            if fail:
                reward = -3.
            else:
                reward = -1.
            done = fail
            if obs[0] > 135.:
                raw_input('Reached goal, x = %f.' % obs[0])
                reward = 5.
                done = True

        return reward, done
n_actions = env.action_space.n
n_features = env.observation_space.shape[0]

RL = PolicyGradient(n_actions=n_actions,
                    n_features=n_features,
                    learning_rate=0.02,
                    reward_decay=0.99)

for i_episode in range(3000):
    observation = env.reset()  # cart position, pole angle, cart velocity, pole angular velocity

    while True:
        if RENDER:
            env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)

        if done:
            ep_rs_sum = sum(RL.ep_rs)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01

            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering

            time.sleep(2)
RL = PolicyGradient(
    s_dim=env.observation_space.shape[0],
    a_dim=env.action_space.n,
    learning_rate=0.02,
    reward_decay=0.99,
    # output_graph=True,
)

for i_episode in range(3000):
    s = env.reset()

    while True:
        if RENDER:
            env.render()

        a = RL.choose_action(s)

        s_, r, done, info = env.step(a)

        RL.store_transition(s, a, r)

        if done:
            ep_rs_sum = sum(RL.ep_rs)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01

            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True

            print('episode:', i_episode, "reward:", int(running_reward))
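# The CartPole loops above reference env, RENDER and DISPLAY_REWARD_THRESHOLD without
# showing how they are created. A typical preamble under the classic gym API (the
# concrete values are assumptions, not taken from the original snippets) would be:
import gym

DISPLAY_REWARD_THRESHOLD = 400  # start rendering once the running reward passes this
RENDER = False                  # rendering slows training, so it starts disabled

env = gym.make('CartPole-v0')
env.seed(1)          # reproducible episodes
env = env.unwrapped  # remove the built-in episode step limit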
episode_reward = 0

h = 5
l = 1
pizza_lines = ["TMMMTTT", "MMMMTMM", "TTMTTMT", "TMMTMMM", "TTTTTTM", "TTTTTTM"]

pizza_config = {
    'pizza_lines': pizza_lines,
    'r': R,
    'c': C,
    'l': l,
    'h': h
}

state = env.init(pizza_config)[0]
print("\nPIZZA CONFIG: ", pizza_config)
print("\nSTATE: ", state)
print("\n\nSTATE[0]", state[0])
# state[0]  # get only first value of tuple

for step in range(STEPS):
    if RENDER_ENV:
        env.render()

    # sample one action with the given probability distribution
    # 1. Choose an action based on observation
    action = PG.choose_action(state)

    # 2. Take action in the environment
    state_, reward, done, info = env.step(ACTIONS[action])

    # 3. Store transition for training
    PG.store_transition(preprocess(state), action, reward)

    # Save new state
    # state = state_

    if done:
        episode_rewards_sum = sum(PG.episode_rewards)
        rewards.append(episode_rewards_sum)
        max_reward_so_far = np.amax(rewards)

        print("==========================================")
class runPG():
    n_inputs = 4
    # n_outputs = 4  # right and left for each finger
    n_outputs = 8    # right, left and stop for each finger

    max_episodes = 1200
    max_steps = 2500

    net = 0
    X = 0
    A = np.array([[-1, -1], [1, -1], [-1, 1], [1, 1], [0, -1], [0, 1], [-1, 0], [1, 0]])
    mode = 5
    reward_mode = 3
    R = []
    g = np.array([-35.0, 104.0], dtype='f')  # Goal

    gripper_closed = False
    stLearning = True  # Enable learning
    possible_plot = False

    # For reward mode 3
    prev_dis2goal = 1e9

    def __init__(self):
        rospy.init_node('runPG', anonymous=True)

        if self.mode == 5:
            self.n_inputs = 4
        if self.mode == 8:
            self.n_inputs = 8

        self.RL = PolicyGradient(
            n_actions=self.n_outputs,
            n_features=self.n_inputs,
            learning_rate=0.001,
            reward_decay=0.98,
            load_saved_net=True,
            # output_graph=True,
        )

        rospy.Subscriber('/RL/gripper_status', String, self.callbackGripperStatus)
        rospy.Service('/RL/net', net_eval, self.EvalNet)
        rospy.Service('/RL/start_learning', Empty, self.start_learning)
        obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        reset_srv = rospy.ServiceProxy('/RL/ResetGripper', Empty)
        pub_goal = rospy.Publisher('/RL/Goal', Float32MultiArray, queue_size=10)

        gg = Float32MultiArray()
        gg.data = self.g

        episode_count = 0
        rate = rospy.Rate(100)  # 100hz
        while not rospy.is_shutdown():

            if self.stLearning:
                ## Start episode ##
                episode_count += 1
                self.prev_dis2goal = 1e9

                # Set gripper
                reset_srv()
                while not self.gripper_closed:
                    rate.sleep()

                # Get observation
                obs = np.array(obs_srv().state)

                self.VT = []
                step = 0
                while True:
                    step += 1
                    print('[RL] Step %d in episode %d, distance to goal: %f.' % (step, episode_count, self.prev_dis2goal))

                    pub_goal.publish(gg)

                    # Choose action
                    action = self.RL.choose_action(obs)

                    # Act
                    suc = move_srv(self.A[action]).success
                    rospy.sleep(0.05)
                    rate.sleep()

                    if suc:
                        # Get observation
                        obs_ = np.array(obs_srv().state)
                        fail = drop_srv().dropped  # Check if dropped - end of episode
                    else:
                        # End episode if overload or angle limits reached
                        rospy.logerr('[RL] Failed to move gripper. Episode declared failed.')
                        fail = True

                    reward, done = self.transition_reward(obs_, fail)

                    self.RL.store_transition(obs, action, reward)

                    obs = obs_

                    if step > self.max_steps:
                        done = True

                    if done:
                        ep_rs_sum = sum(self.RL.ep_rs)

                        if 'running_reward' not in globals():
                            running_reward = ep_rs_sum
                        else:
                            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01

                        print("*** episode: " + str(episode_count) + ", episode reward: " + str(ep_rs_sum) +
                              ", running reward: " + str(int(running_reward)) + " ***")

                        vt = self.RL.learn()
                        self.R.append(running_reward)
                        self.possible_plot = True
                        break

                    rate.sleep()

            elif self.possible_plot:
                self.plot_sav()
                self.possible_plot = False

            if self.max_episodes < episode_count:
                self.plot_sav()
                break

            rate.sleep()

    def plot_sav(self):
        plt.plot(range(len(self.R)), self.R)  # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()

    def EvalNet(self, msg):
        a = 0
        return {'action': a}

    def callbackGripperStatus(self, msg):
        self.gripper_closed = msg.data == "closed"

    def start_learning(self, msg):
        self.stLearning = not self.stLearning
        return EmptyResponse()

    def transition_reward(self, obs, fail):
        # Keep moving as much as possible
        if self.reward_mode == 1:
            if fail:
                reward = 0.
            else:
                reward = 1.
            done = fail

        # Cross a line
        if self.reward_mode == 2:
            if fail:
                reward = -3.
            else:
                reward = -1.
            done = fail
            if obs[0] > 40.:
                print('Reached goal, x = %f.' % obs[0])
                reward = 5.
                done = True

        # Get to a certain coordinate
        if self.reward_mode == 3:
            d = np.linalg.norm(self.g - obs[:2])
            if fail or d > self.prev_dis2goal:
                reward = 0.
            else:
                reward = 1.
            done = fail
            if d < 5:
                print('Reached goal, (x,y) = (%f,%f).' % (obs[0], obs[1]))
                reward = 50.
                done = True
            self.prev_dis2goal = d

        return reward, done