def main(grid_size, discount, L):
    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # Recover R from the pre-specified deterministic optimal policy:
    # policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)]
    # Instead, obtain the optimal policy via reinforcement learning:
    policy = find_policy(
        gw.n_states,
        gw.n_actions,
        gw.transition_probability,
        ground_r,
        discount,
    )

    # Need a value function for each basis function.
    feature_matrix = gw.feature_matrix()
    values = []
    for dim in range(feature_matrix.shape[1]):
        reward = feature_matrix[:, dim]
        values.append(
            value(policy, gw.n_states, gw.transition_probability, reward,
                  gw.discount))
    values = np.array(values).T

    rl1, rl2, rl1l2 = linear_irl.large_irl(values, gw.transition_probability,
                                           feature_matrix, gw.n_states,
                                           gw.n_actions, policy, L)
    return ground_r, rl1, rl2, rl1l2
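# Illustrative usage sketch (not from the original source): the grid size,
# discount and regularisation weight below are assumed example values.
# ground_r, rl1, rl2, rl1l2 = main(5, 0.9, 10)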
def main(discount, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    discount: MDP discount factor. float.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    try:
        starttime = datetime.datetime.now()

        path = "/home/ubuntu/Data/KDDI/#201111.CDR-data/vks2564k/slot/"

        id_traj = load.load_directory_trajectory(path)
        print(len(id_traj))

        trajectories = id_traj.values()
        g = load.load_graph_traj(trajectories)
        g.set_start("53397561")
        gw = gridworld.Gridworld(g, discount)
        feature_matrix = gw.feature_matrix(g)

        if not os.path.exists(path + "param/"):
            os.mkdir(path + "param/")

        maxent.t_irl(g, feature_matrix, trajectories, epochs, learning_rate,
                     path + "param/")

        endtime = datetime.datetime.now()
        print("finished reading files with time of " + str(endtime - starttime))
    except Exception:
        print("main class wrong")
        raise
def main(grid_size, discount):
    """
    Run linear programming inverse reinforcement learning on the gridworld
    MDP. Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    """
    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)

    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)]
    r = linear_irl.irl(gw.n_states, gw.n_actions, gw.transition_probability,
                       policy, gw.discount, 1, 5)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
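# Illustrative entry point (a sketch, not from the original source): the
# grid size and discount below are assumed example values for a quick run.
if __name__ == '__main__':
    main(5, 0.2)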
def simulation(trajectories, path, start, count):
    if os.path.exists(path):
        parampath = path + start + "_0_param.csv"
        try:
            g = load.load_graph_traj(trajectories)
            g.set_start(start)
            gw = gridworld.Gridworld(g, 0.9)
            feature_matrix = gw.feature_matrix(g)

            alpha = load_param(parampath)

            r = dict()
            for t in range(12, 48):
                r[t] = dict().fromkeys(g.get_edges(), 0)

            for t in range(12, 48):
                for edge in g.get_edges():
                    if t in alpha.keys():
                        r[t][edge] = feature_matrix[edge].dot(alpha[t])

            # print(r)

            for i in range(count):
                print("****************")
                directory = "/home/t-iho/Result/sim/" + start
                if not os.path.exists(directory):
                    os.mkdir(directory)
                tools.simple_trajectory(
                    g, r, start, "/home/t-iho/Result/sim/" + start + "/",
                    start + "_" + str(i))
        except KeyError:
            return 0
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    trajectories = gw.generate_trajectories(n_trajectories, trajectory_length,
                                            gw.optimal_policy)
    feature_matrix = gw.feature_matrix()
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    r = maxent.irl(feature_matrix, gw.n_actions, discount,
                   gw.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
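# Illustrative entry point (assumed example values, not from the original
# source): a 5x5 grid, discount 0.01, 20 trajectories, 200 epochs and a
# learning rate of 0.01.
if __name__ == '__main__':
    main(5, 0.01, 20, 200, 0.01)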
def simulation(trajectories, path, start, count):
    if os.path.exists(path):
        parampath = path
        try:
            g = load.load_graph_traj(trajectories)
            g.set_start(start)
            gw = gridworld.Gridworld(g, 0.9, "")
            feature_matrix = gw.feature_matrix(g)

            alpha = load.load_param(parampath)
            print(alpha)

            r = dict()
            for t in range(12, 48):
                r[t] = dict().fromkeys(g.get_edges(), 0)

            for t in range(12, 48):
                for edge in g.get_edges():
                    if t in alpha.keys():
                        r[t][edge] = feature_matrix[t][edge].dot(alpha[t])
            print(r)

            for i in range(count):
                print("****************")
                directory = ("/home/ubuntu/Data/PT_Result/100expert_1agent/" +
                             start + "/sim/")
                if not os.path.exists(directory):
                    os.mkdir(directory)
                tools.simple_trajectory(
                    g, r, start,
                    "/home/ubuntu/Data/PT_Result/100expert_1agent/" + start + "/",
                    start + "_" + str(i + 50))
        except KeyError:
            return 0
def training(pos):
    id_trajectory = load.load_trajectory(1000)
    graph_trajectories = tools.choose_trajectory(1000, id_trajectory)
    _graph = load.load_graph_traj(graph_trajectories)
    sample_trajectories = tools.choose_trajectory(100, id_trajectory)
    gw = gridworld.Gridworld(_graph, 0.9)
    feature_matrix = gw.feature_matrix(_graph)
    alpha = maxent.irl(_graph, feature_matrix, sample_trajectories, 1, 0.05)

    path = str("D:/Ubicomp/alpha" + str(pos) + ".txt")
    print(path)
    numpy.savetxt(path, alpha)

    _graph = graph.Graph([], {}, False, False)
    del _graph
    return alpha
def generating(g, id_trajectory, alpha, mesh):
    gw = gridworld.Gridworld(g, 0.9)
    feature_matrix = gw.feature_matrix(g)

    reward = dict()
    for edge in g.get_edges():
        reward[edge] = feature_matrix[edge].dot(alpha)

    mesh_parameter = tools.duration_gaussian(id_trajectory)
    tools.generate_traj(g, reward, mesh, mesh_parameter)
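# Hypothetical end-to-end sketch (not from the original source) of how
# training() and generating() might be chained; the mesh code "53397561"
# and the trajectory counts are assumed placeholders.
# alpha = training(0)
# id_trajectory = load.load_trajectory(1000)
# g = load.load_graph_traj(tools.choose_trajectory(1000, id_trajectory))
# generating(g, id_trajectory, alpha, "53397561")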
def main(epochs, learning_rate, discount, number):
    """
    discount: MDP discount factor. float.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    try:
        starttime = datetime.datetime.now()

        mesh_list = read_list("/home/ubuntu/Data/Tokyo/MeshCode/Tokyo.csv",
                              number)
        print(len(mesh_list))
        print(mesh_list)

        for mesh_id in mesh_list:
            # if not os.path.exists("/home/ubuntu/Data/PT_Result/commuter/test_sim/" + mesh_id + "/"):
            #     os.mkdir("/home/ubuntu/Data/PT_Result/commuter/test_sim/" + mesh_id + "/")
            #
            # if not os.path.exists("/home/ubuntu/Data/PT_Result/commuter/test_param/" + mesh_id + "/"):
            #     os.mkdir("/home/ubuntu/Data/PT_Result/commuter/test_param/" + mesh_id + "/")

            if os.path.exists("/home/ubuntu/Data/pflow_data/pflow-csv/" +
                              mesh_id + "/train_irl.csv"):
                id_traj = load.load_trajectory(
                    "/home/ubuntu/Data/pflow_data/pflow-csv/" + mesh_id +
                    "/train_irl.csv")

                # parameter set numbers
                if len(id_traj) > 200:
                    # for i in range(len(id_traj) / 50):
                    trajectories = random.sample(id_traj.values(), 200)

                    g = load.load_graph_traj(trajectories)
                    g.set_start(mesh_id)
                    print(g.get_start())
                    gw = gridworld.Gridworld(g, discount, "")
                    feature_matrix = gw.feature_matrix(g)

                    # train
                    maxent.t_irl(
                        g, feature_matrix, trajectories, epochs, learning_rate,
                        "/home/ubuntu/Data/PT_Result/param_15/" + mesh_id +
                        "_" + str(1) + "_")

                    fo = open("/home/ubuntu/Data/PT_Result/finished_mesh.csv",
                              "a")
                    fo.write(mesh_id + "\n")
                    fo.close()

        endtime = datetime.datetime.now()
        print("finished reading files with time of " + str(endtime - starttime))
    except Exception:
        print("main class wrong")
        raise
def evaluation(path):
    id_traj = load.load_directory_trajectory(path + "training/")  # validation directory

    files = os.listdir(path + "param/")

    for filename in files:
        parampath = path + "param/" + filename
        if not os.path.isdir(parampath):
            trajectories = random.sample(id_traj.values(), 500)

            g = load.load_graph_traj(trajectories)
            gw = gridworld.Gridworld(g, 0.9)
            feature_matrix = gw.feature_matrix(g)

            t_alpha = {}
            with open(parampath, 'r') as f:
                t = 12
                for line in f:
                    line = line.strip('\n')
                    tokens = line.split(",")
                    param = np.zeros(11)
                    for j in range(11):
                        if len(tokens) > j:
                            param[j] = tokens[j]
                    t_alpha[t] = param.copy()
                    t += 1

            r = dict()
            for t in range(12, 48):
                r[t] = dict().fromkeys(g.get_edges(), 0)

            for edge in g.get_edges():
                for t in range(12, 48):
                    if t in t_alpha.keys():
                        r[t][edge] = feature_matrix[edge].dot(t_alpha[t])

            print("#######################################################")
            policy = irl.value_iteration.find_temporal_policy(
                g, r, 0.9, 46, stochastic=True)
            nll = irl_nll(policy, trajectories)
            m_nll = markov_nll(trajectories)
            print(len(trajectories), nll, m_nll)
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    # trajectories = gw.generate_trajectories(n_trajectories, trajectory_length, gw.optimal_policy)
    trajectories = gw.my_generate_trajectories(n_trajectories,
                                               trajectory_length,
                                               gw.optimal_policy)
    feature_matrix = gw.feature_matrix()
    # feature_matrix = gw.feature_matrix_goalVsOther()
    # feature_matrix = gw.feature_matrix_goalVsOtherTwo()
    # feature_matrix = gw.feature_matrix_goalVsOtherThree()

    # Ground truth given by us, as we know which states are good vs. bad.
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # Reward recovered by the IRL algorithm.
    recovered_reward = maxent.irl(feature_matrix, gw.n_actions, discount,
                                  gw.transition_probability, trajectories,
                                  epochs, learning_rate)

    # Standardise the recovered reward.
    scaler = StandardScaler()
    standardised_reward = scaler.fit_transform(recovered_reward.reshape(-1, 1))
    # print(recovered_reward)
    # print(standardised_reward)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(standardised_reward.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
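# Illustrative entry point (assumed example values, not from the original
# source).
if __name__ == '__main__':
    main(10, 0.9, 20, 200, 0.01)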
def main(grid_size, discount, n_improvements=5):
    """
    Run linear programming inverse reinforcement learning on the gridworld
    MDP. Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    """
    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)

    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    prob_optimal = 0.0
    rewards = []
    policies = []
    for i in range(n_improvements):
        policy = [
            gw.optimal_policy_improving(s, prob_optimal)
            for s in range(gw.n_states)
        ]
        r = linear_irl.irl(gw.n_states, gw.n_actions,
                           gw.transition_probability, policy, gw.discount,
                           1, 5)
        rewards.append(r)
        policies.append(policy)

        # print(r)
        # plt.subplot(1, 2, 1)
        # plt.pcolor(ground_r.reshape((grid_size, grid_size)))
        # plt.colorbar()
        # plt.title("Groundtruth reward")
        # plt.subplot(1, 2, 2)
        # plt.pcolor(r.reshape((grid_size, grid_size)))
        # plt.colorbar()
        # plt.title("Recovered reward")
        # plt.show()

        print(prob_optimal)
        # Use a float step; integer division here would leave prob_optimal at 0.
        prob_optimal += 1.0 / n_improvements

    return rewards, policies
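# Illustrative usage (a sketch with assumed values, not from the original
# source): plot the rewards recovered as the expert policy improves from
# fully random (prob_optimal = 0) towards optimal.
rewards, policies = main(5, 0.2, n_improvements=5)
for i, r in enumerate(rewards):
    plt.subplot(1, len(rewards), i + 1)
    plt.pcolor(r.reshape((5, 5)))
    plt.title("improvement step {}".format(i))
plt.show()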
def simulation(trajectories, path, start, count):
    files = os.listdir(path + "param/")

    for filename in files:
        parampath = path + "param/" + filename
        if not os.path.isdir(parampath):
            # trajectories = random.sample(trajectories, 50)
            try:
                g = load.load_graph_traj(trajectories)
                gw = gridworld.Gridworld(g, 0.9)
                feature_matrix = gw.feature_matrix(g)

                t_alpha = {}
                with open(parampath, 'r') as f:
                    t = 12
                    for line in f:
                        line = line.strip('\n')
                        tokens = line.split(",")
                        param = numpy.zeros(11)
                        for j in range(11):
                            if len(tokens) > j:
                                param[j] = tokens[j]
                        t_alpha[t] = param.copy()
                        t += 1

                r = dict()
                for t in range(12, 48):
                    r[t] = dict().fromkeys(g.get_edges(), 0)

                for edge in g.get_edges():
                    for t in range(12, 48):
                        if t in t_alpha.keys():
                            r[t][edge] = feature_matrix[edge].dot(t_alpha[t])

                for i in range(count):
                    # start = random.choice(initial)
                    tools.generate_temporal_traj(g, r, start, 0.5,
                                                 path + "sim/",
                                                 str(i) + filename[0:2])
            except KeyError:
                return 0
def simulation(path):
    id_traj = load.load_directory_trajectory(path + "slot/")

    files = os.listdir(path + "param/")

    if not os.path.exists(path + "sim/"):
        os.mkdir(path + "sim/")

    for filename in files:
        parampath = path + "param/" + filename
        if not os.path.isdir(parampath):
            trajectories = id_traj.values()

            g = load.load_graph_traj(trajectories)
            gw = gridworld.Gridworld(g, 0.9)
            feature_matrix = gw.feature_matrix(g)

            alpha = load.load_param(parampath)
            print(alpha)

            r = dict()
            for t in range(12, 48):
                r[t] = dict().fromkeys(g.get_edges(), 0)

            for t in range(12, 48):
                for edge in g.get_edges():
                    if t in alpha.keys():
                        r[t][edge] = feature_matrix[t][edge].dot(alpha[t])
            print(r)

            for i in range(10):
                print("****************")
                directory = "/home/ubuntu/Data/KDDI/#201111.CDR-data/abf7380g/sim/"
                if not os.path.exists(directory):
                    os.mkdir(directory)
                tools.simple_trajectory(
                    g, r, "53397561",
                    "/home/ubuntu/Data/KDDI/#201111.CDR-data/abf7380g/sim/",
                    "53397561" + "_" + str(i))

            start = "53397561"
            tools.generate_temporal_traj(g, r, start, 0.5, path + "sim/",
                                         filename[0:2])
def main(discount, epochs, learning_rate, target):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    discount: MDP discount factor. float.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    try:
        starttime = datetime.datetime.now()

        path = "/home/ubuntu/Data/PT_Result/" + target + "/"

        if not os.path.exists(path + "sim/"):
            os.mkdir(path + "sim/")
        if not os.path.exists(path + "param/"):
            os.mkdir(path + "param/")

        if os.path.exists(path + "training/"):
            id_traj = load.load_trajectory(
                "/home/ubuntu/Data/PT_Result/commuter/training/PT_commuter_irl_revised.csv"
            )

            # parameter set numbers
            for i in range(10000):
                trajectories = random.sample(id_traj.values(), 200)
                print(trajectories)

                g = load.load_graph_traj(trajectories)
                gw = gridworld.Gridworld(g, discount)
                feature_matrix = gw.feature_matrix(g)

                # train
                print("training ", path)
                maxent.t_irl(g, feature_matrix, trajectories, epochs,
                             learning_rate, path + "param/" + str(i))

        endtime = datetime.datetime.now()
        print("finished reading files with time of " + str(endtime - starttime))
    except Exception:
        print("main class wrong")
        raise
def main(grid_size, discount, L, trust):
    # L: regularisation coefficient.
    wind = 1 - trust  # Probability that the expert takes a random action.
    trajectory_length = 3 * grid_size  # Maximum trajectory length.

    gw = gridworld.Gridworld(grid_size, wind, discount)
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])  # Ground-truth reward.

    # Stochastic (non-deterministic) policy; works less well:
    # policy = [gw.optimal_policy_stochastic(s) for s in range(gw.n_states)]
    # Deterministic policy; works well:
    # policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)]

    # Obtain the optimal policy via reinforcement learning.
    policy = find_policy(
        gw.n_states,
        gw.n_actions,
        gw.transition_probability,
        ground_r,
        discount,
    )

    rl1, rl2, rl1l2 = linear_irl.irl(gw.n_states, gw.n_actions,
                                     gw.transition_probability, policy,
                                     gw.discount, 1, L)  # Rmax = 1, variable L1 weight.
    return ground_r, rl1, rl2, rl1l2
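# Illustrative comparison (assumed values: 5x5 grid, discount 0.9, L = 10,
# trust = 0.7, all not from the original source) of the ground truth against
# the L1-, L2- and L1+L2-regularised rewards returned by main().
ground_r, rl1, rl2, rl1l2 = main(5, 0.9, 10, 0.7)
for i, (name, r) in enumerate([("Groundtruth", ground_r), ("L1", rl1),
                               ("L2", rl2), ("L1+L2", rl1l2)]):
    plt.subplot(1, 4, i + 1)
    plt.pcolor(r.reshape((5, 5)))
    plt.title(name)
plt.show()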
def main(date, discount, epochs, learning_rate, train=True):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    discount: MDP discount factor. float.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    try:
        starttime = datetime.datetime.now()

        path = "D:/ClosePFLOW/53393575/"

        if not os.path.exists(path + "sim/"):
            os.mkdir(path + "sim/")
        if not os.path.exists(path + "param/"):
            os.mkdir(path + "param/")

        tools.move_files(path)

        if os.path.exists(path + "training/"):
            id_traj = load.load_directory_trajectory(path + "training/")

            # parameter set numbers
            for i in range(26):
                trajectories = random.sample(id_traj.values(), 50)

                g = load.load_graph_traj(trajectories)
                gw = gridworld.Gridworld(g, discount)
                feature_matrix = gw.feature_matrix(g)

                # train
                print("training ", path)
                maxent.t_irl(g, feature_matrix, trajectories, epochs,
                             learning_rate, path + "param/" + str(i))

        endtime = datetime.datetime.now()
        print("finished reading files with time of " + str(endtime - starttime))
    except Exception:
        print("main class wrong")
        raise
def simulation(path):
    id_traj = load.load_directory_trajectory(path + "slot/")

    files = os.listdir(path + "param/")

    if not os.path.exists(path + "sim/"):
        os.mkdir(path + "sim/")

    for filename in files:
        parampath = path + "param/" + filename
        if not os.path.isdir(parampath):
            trajectories = id_traj.values()

            g = load.load_graph_traj(trajectories)
            gw = gridworld.Gridworld(g, 0.9)
            feature_matrix = gw.feature_matrix(g)

            t_alpha = {}
            with open(parampath, 'r') as f:
                t = 12
                for line in f:
                    line = line.strip('\n')
                    tokens = line.split(",")
                    param = numpy.zeros(11)
                    for j in range(11):
                        if len(tokens) > j:
                            param[j] = tokens[j]
                    t_alpha[t] = param.copy()
                    t += 1

            r = dict()
            for t in range(12, 48):
                r[t] = dict().fromkeys(g.get_edges(), 0)

            for edge in g.get_edges():
                for t in range(12, 48):
                    if t in t_alpha.keys():
                        r[t][edge] = feature_matrix[edge].dot(t_alpha[t])

            start = "53397561"
            tools.generate_temporal_traj(g, r, start, 0.5, path + "sim/",
                                         filename[0:2])
def main(grid_size, discount):
    """
    Run large state space linear programming inverse reinforcement learning
    on the gridworld MDP. Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    """
    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)

    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)]

    # Need a value function for each basis function.
    feature_matrix = gw.feature_matrix()
    values = []
    for dim in range(feature_matrix.shape[1]):
        reward = feature_matrix[:, dim]
        values.append(
            value(policy, gw.n_states, gw.transition_probability, reward,
                  gw.discount))
    values = np.array(values)

    r = linear_irl.large_irl(values, gw.transition_probability,
                             feature_matrix, gw.n_states, gw.n_actions,
                             policy)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
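# Illustrative entry point (assumed example values, not from the original
# source).
if __name__ == '__main__':
    main(10, 0.9)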
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, start_state, wind=0.0, algo="maxent",
         mdp="gridworld"):
    """
    Run inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: Start location to generate trajectories from.
    algo: IRL algorithm to run (currently "maxent" and "deep_maxnet" are
        supported).
    """
    sx, sy = start_state
    trajectory_length = 8

    if mdp == "objectworld":
        import irl.mdp.objectworld as objectworld
        ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                     discount)
    elif mdp == "gridworld":
        import irl.mdp.gridworld as gridworld
        ow = gridworld.Gridworld(grid_size, wind, discount)

    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    optimal_v = optimal_value(ow.n_states, ow.n_actions,
                              ow.transition_probability, normalize(ground_r),
                              ow.discount)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s],
                                            random_start=True)
    feature_matrix = ow.feature_matrix()

    print("trajectories = ", trajectories.shape)
    print("epochs = ", epochs)
    print("feature_matrix.shape = ", feature_matrix.shape)
    print("policy.shape = ", policy.shape)

    # ow.plot_grid("value_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
    #                                                epochs, wind),
    #              value=optimal_v)
    ow.plot_grid("policy_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                    epochs, wind),
                 policy=policy, value=optimal_v)

    # maxent.find_svf is needed for both branches below, so import it
    # unconditionally.
    import irl.maxent as maxent
    ground_svf = maxent.find_svf(ow.n_states, trajectories)

    r = []
    if algo == "maxent":
        r = maxent.irl(feature_matrix, ow.n_actions, discount,
                       ow.transition_probability, trajectories, epochs,
                       learning_rate)
    elif algo == "deep_maxnet":
        import irl.deep_maxent as deep_maxent
        l1 = l2 = 0
        structure = (3, 3)
        r = deep_maxent.irl((feature_matrix.shape[1],) + structure,
                            feature_matrix, ow.n_actions, discount,
                            ow.transition_probability, trajectories, epochs,
                            learning_rate, l1=l1, l2=l2)

    recovered_policy = find_policy(ow.n_states, ow.n_actions,
                                   ow.transition_probability, normalize(r),
                                   ow.discount, stochastic=False)
    recovered_v = value(recovered_policy, ow.n_states,
                        ow.transition_probability, normalize(r), ow.discount)

    new_trajectory = ow.generate_trajectories(n_trajectories,
                                              trajectory_length,
                                              lambda s: recovered_policy[s],
                                              True, (sx, sy))
    recovered_svf = maxent.find_svf(ow.n_states, new_trajectory)

    # ow.plot_grid("recovered_value_{}_t{}_e{}_w{}.png".format(
    #     algo, n_trajectories, epochs, wind), value=recovered_v)
    ow.plot_grid("recovered_policy_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
        policy=recovered_policy, value=recovered_v)

    # print("new trajectory")
    # for t in new_trajectory:
    #     for s, a, rw in t:
    #         print(ow.int_to_point(s), ow.actions[a], rw)
    #     print("---------")

    y, x = np.mgrid[-0.5:grid_size + 0.5, -0.5:grid_size + 0.5]

    plt.subplot(111)
    plt.pcolor(x, y, ground_svf.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth SVF")
    plt.savefig("ground_svf_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)

    plt.pcolor(x, y, recovered_svf.reshape((grid_size, grid_size)))
    plt.title("Recovered SVF")
    plt.savefig("recovered_svf_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)

    plt.pcolor(x, y, normalize(ground_r).reshape((grid_size, grid_size)))
    plt.title("Groundtruth reward")
    plt.savefig("ground_reward_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)

    plt.pcolor(x, y, normalize(r).reshape((grid_size, grid_size)))
    plt.title("Recovered reward")
    plt.savefig("recovered_reward_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)
def main():
    root = "/home/ubuntu/Data/pflow_data/pflow-csv/"
    mesh_list = read_list("/home/ubuntu/Data/Tokyo/MeshCode/Tokyo.csv")
    list_dirs = os.walk(root)
    count = 0
    print(mesh_list)

    for root, dirs, files in list_dirs:
        with open("/home/ubuntu/Data/PT_Result/exp1/result.csv", "w") as f:
            for d in dirs:
                if d in mesh_list:
                    file_list = os.listdir(os.path.join(root, d))
                    if len(file_list) > 100 and "train_irl.csv" in file_list:
                        count += 1
                        id_traj = load.load_trajectory(
                            os.path.join(root, d) + "/train_irl.csv")
                        train, validation = train_test_split(
                            list(id_traj.values()), test_size=0.4)

                        g = load.load_graph_traj(train)
                        gw = gridworld.Gridworld(g, 0.9)
                        feature_matrix = gw.feature_matrix(g)
                        path = "/home/ubuntu/Data/PT_Result/exp1/"

                        # train
                        if not os.path.exists(path + "parameter/" + d + "param.csv"):
                            maxent.t_irl(g, feature_matrix, train, 200, 0.2,
                                         path + "parameter/" + d)

                        # simulation
                        t_alpha = read_param(
                            path + "parameter/" + os.listdir(path + "parameter/")[0])

                        r = dict()
                        for t in range(12, 48):
                            r[t] = dict().fromkeys(g.get_edges(), 0)
                        for edge in g.get_edges():
                            for t in range(12, 48):
                                if t in t_alpha.keys():
                                    r[t][edge] = feature_matrix[edge].dot(t_alpha[t])

                        if not os.path.exists(path + "sim/" + d + "/"):
                            os.mkdir(path + "sim/" + d + "/")
                        for i in range(80):
                            tools.generate_temporal_traj(
                                g, r, d, 0.5, path + "sim/" + d + "/",
                                d + "_" + str(i))

                        # markov chain
                        if not os.path.exists(path + "markov/" + d + "/"):
                            os.mkdir(path + "markov/" + d + "/")
                        for i in range(80):
                            pairs = makepairs(train)
                            cfd = nltk.ConditionalFreqDist(pairs)
                            generate(cfd, str(i),
                                     path + "markov/" + d + "/" + str(i) + ".csv",
                                     d)

                        # expansion validation
                        expansion10_trajectory = random.sample(
                            train, int(len(train) * 0.1))
                        diff_list = []
                        for validation_traj in validation:
                            min_dist = sys.maxsize
                            for traj in expansion10_trajectory:
                                dist = traj_dist((traj, validation_traj))
                                if dist < min_dist:
                                    min_dist = dist
                            diff_list.append(min_dist)
                        expansion10_score = np.average(diff_list)

                        expansion50_trajectory = random.sample(
                            train, int(len(train) * 0.5))
                        diff_list = []
                        for validation_traj in validation:
                            min_dist = sys.maxsize
                            for traj in expansion50_trajectory:
                                dist = traj_dist((traj, validation_traj))
                                if dist < min_dist:
                                    min_dist = dist
                            diff_list.append(min_dist)
                        expansion50_score = np.average(diff_list)

                        # validation
                        markov_id_traj = load.load_directory_trajectory(
                            path + "markov/" + d + "/")
                        diff_list = []
                        print(markov_id_traj.keys())
                        for traj in validation:
                            min_dist = sys.maxsize
                            for markov_id in markov_id_traj.keys():
                                dist = traj_dist((traj, markov_id_traj[markov_id]))
                                if dist < min_dist:
                                    min_dist = dist
                            diff_list.append(min_dist)
                        markov_score = np.average(diff_list)

                        sim_id_traj = load.load_directory_trajectory(
                            path + "sim/" + d + "/")
                        diff_list = []
                        for traj in validation:
                            min_dist = sys.maxsize
                            for sim_id in sim_id_traj.keys():
                                dist = traj_dist((traj, sim_id_traj[sim_id]))
                                if dist < min_dist:
                                    min_dist = dist
                            if min_dist > 10:
                                continue
                            diff_list.append(min_dist)
                        sim_score = np.average(diff_list)

                        print(d + "," + str(sim_score) + "," + str(markov_score) +
                              "," + str(expansion10_score) + "," +
                              str(expansion50_score))
                        f.write(d + "," + str(sim_score) + "," + str(markov_score) +
                                "," + str(expansion10_score) + "," +
                                str(expansion50_score))
                        f.write("\n")
            if count > 80:
                f.close()
def main(mesh_id):
    """
    discount: MDP discount factor. float.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    discount = .9
    epochs = 400
    learning_rate = 3
    try:
        starttime = datetime.datetime.now()

        if not os.path.exists("/home/ubuntu/Data/PT_Result/100expert_1agent/" +
                              mesh_id + "/"):
            os.mkdir("/home/ubuntu/Data/PT_Result/100expert_1agent/" +
                     mesh_id + "/")

        if os.path.exists("/home/ubuntu/Data/pflow_data/pflow-csv/" + mesh_id +
                          "/train_irl.csv"):
            id_traj = load.load_trajectory(
                "/home/ubuntu/Data/pflow_data/pflow-csv/" + mesh_id +
                "/train_irl.csv")

            # parameter set numbers
            for i in range(3):
                print(type(list(id_traj.values())))
                trajectories = random.sample(list(id_traj.values()), 100)

                # save out expert data
                writeout.write_trajs(
                    trajectories,
                    "/home/ubuntu/Data/PT_Result/100expert_1agent/" + mesh_id +
                    "/training_data.csv")

                g = load.load_graph_traj(trajectories)
                g.set_start(mesh_id)
                print(g.get_start())
                gw = gridworld.Gridworld(g, discount)
                feature_matrix = gw.feature_matrix(g)

                # train
                maxent.t_irl(g, feature_matrix, trajectories, epochs,
                             learning_rate,
                             "/home/ubuntu/Data/PT_Result/100expert_1agent/" +
                             mesh_id + "/" + str(i + 3) + "_")

                # alpha = load.load_param("/home/ubuntu/Data/PT_Result/100expert_1agent/" + mesh_id + "/" + str(i) +
                #                         "_" + 'param.csv')
                # r = dict()
                # for t in range(12, 48):
                #     r[t] = dict().fromkeys(g.get_edges(), 0)
                #
                # for t in range(12, 48):
                #     for edge in g.get_edges():
                #         if t in alpha.keys():
                #             r[t][edge] = feature_matrix[t][edge].dot(alpha[t])
                #
                # for j in range(20):
                #     tools.simple_trajectory(g, r, mesh_id, "/home/ubuntu/Data/PT_Result/100expert_1agent/" + mesh_id +
                #                             "/", mesh_id + "_" + str(j))

        endtime = datetime.datetime.now()
        print("finished reading files with time of" + str(endtime - starttime))
    except Exception:
        print("main class wrong")
        raise
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    wind = 0.1  # Noise: probability that the expert errs and takes a suboptimal action.
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # Find the optimal policy via reinforcement learning and use it as the
    # expert policy to generate demonstration trajectories.
    policy = find_policy(gw.n_states, gw.n_actions, gw.transition_probability,
                         ground_r, discount)
    trajectories = gw.generate_trajectories(n_trajectories, trajectory_length,
                                            policy, random_start=True)

    # Plot the trajectories before preprocessing.
    paths = []
    for i in trajectories:
        path = [j[0] for j in i]
        paths.append(path)
    draw_path(gw.grid_size, paths, 'Expert trajectories before preprocessing')

    # Preprocess the expert trajectories.
    new_trajectories = pre_treated(gw.n_states, gw.n_actions, trajectories)

    # Plot the trajectories after preprocessing.
    paths = []
    for i in new_trajectories:
        path = [j[0] for j in i]
        paths.append(path)
    draw_path(gw.grid_size, paths, 'Expert trajectories after preprocessing')

    feature_matrix = gw.feature_matrix()
    trajectories = [[(s, a, r) for (s, a, r, _) in trajectory]
                    for trajectory in trajectories]  # Format expected by maxent IRL.
    r1, R1 = maxent.irl(feature_matrix, gw.n_actions, discount,
                        gw.transition_probability, np.array(trajectories),
                        epochs, learning_rate)
    r1 = r1 / max(r1)
    loss1 = []
    for r in R1:
        r = r / max(r)
        loss = abs(r - ground_r).sum()
        loss1.append(loss)

    new_trajectories = [[(s, a, r) for (s, a, r, _) in trajectory]
                        for trajectory in new_trajectories]  # Format expected by maxent IRL.
    feature_matrix = gw.feature_matrix()
    r2, R2 = maxent.irl(feature_matrix, gw.n_actions, discount,
                        gw.transition_probability, np.array(new_trajectories),
                        epochs, learning_rate)
    r2 = r2 / max(r2)
    loss2 = []
    for r in R2:
        r = r / max(r)
        loss = abs(r - ground_r).sum()
        loss2.append(loss)

    # Supervised learning.
    policy_sl = supervised_learning(new_trajectories, policy)

    # Accuracy of the supervised-learning policy.
    equal = 0
    for i in range(len(policy)):
        if policy_sl[i] == policy[i]:
            equal += 1 / len(policy)
    print("Accuracy of the policy learned by supervised learning: {}%".format(
        100 * equal))

    # Generate trajectories from the supervised-learning policy.
    sl_trajectories = gw.generate_trajectories(n_trajectories,
                                               trajectory_length, policy_sl,
                                               random_start=True)
    # Preprocess the supervised-learning trajectories.
    new_sl_trajectories = pre_treated(gw.n_states, gw.n_actions,
                                      sl_trajectories)

    # Plot the supervised-learning trajectories.
    paths = []
    for i in new_sl_trajectories:
        path = [j[0] for j in i]
        paths.append(path)
    draw_path(gw.grid_size, paths,
              'Expert trajectories estimated by the supervised-learning policy')

    new_sl_trajectories = [[(s, a, r) for (s, a, r, _) in trajectory]
                           for trajectory in new_sl_trajectories]
    mix_trajectories = new_trajectories
    for trajectory in new_sl_trajectories:
        for i in new_trajectories:
            if trajectory[-1] == i[-1]:
                mix_trajectories.append(trajectory)
                break

    feature_matrix = gw.feature_matrix()
    r3, R3 = maxent.irl(feature_matrix, gw.n_actions, discount,
                        gw.transition_probability, np.array(mix_trajectories),
                        epochs, learning_rate)
    r3 = r3 / max(r3)
    loss3 = []
    for r in R3:
        r = r / max(r)
        loss = abs(r - ground_r).sum()
        loss3.append(loss)

    # # 2D plots.
    # plt.subplot(1, 3, 1)
    # plt.pcolor(r1.reshape((grid_size, grid_size)))
    # plt.colorbar()
    # plt.title("R recovered without preprocessing")
    # plt.subplot(1, 3, 2)
    # plt.pcolor(r2.reshape((grid_size, grid_size)))
    # plt.colorbar()
    # plt.title("R recovered with preprocessing")
    # plt.subplot(1, 3, 3)
    # plt.pcolor(r3.reshape((grid_size, grid_size)))
    # plt.colorbar()
    # plt.title("R recovered with preprocessing and supervised learning")
    # plt.show()

    # 3D plots. X and Y must have the same length.
    X = range(gw.grid_size)
    Y = range(gw.grid_size)
    Z1 = r1
    Z2 = r2
    Z3 = r3

    # meshgrid squares the lengths of X and Y: if both have length 4, then
    # after meshgrid and ravel both have length 16, one entry per grid point.
    xx, yy = np.meshgrid(X, Y)  # Grid the coordinates.
    X, Y = xx.ravel(), yy.ravel()  # Flatten the matrices.

    # Bar properties.
    height = np.zeros_like(Z1)  # All-zero array shaped like Z: the bar bottoms.
    width = depth = 1  # Bar width and depth.

    # Colour array, same length as Z.
    c = ['y'] * len(Z1)

    # The natural argument order would be X, Y, Z, width, depth, height, but
    # that draws only thin slabs at the bar tops, so Z and height are swapped.
    fig = plt.figure()
    ax = fig.gca(projection='3d')  # 3D axes.
    ax.bar3d(X, Y, height, width, depth, Z1, color=c, shade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('reward_value')
    plt.title("R recovered without preprocessing")
    plt.show()

    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.bar3d(X, Y, height, width, depth, Z2, color=c, shade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('reward_value')
    plt.title("R recovered with preprocessing")
    plt.show()

    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.bar3d(X, Y, height, width, depth, Z3, color=c, shade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('reward_value')
    plt.title("R recovered with preprocessing and supervised learning")
    plt.show()

    # Plot the error curves.
    plt.plot(range(epochs), loss1, color='r', label='without preprocessing')
    plt.plot(range(epochs), loss2, color='g', label='with preprocessing')
    plt.plot(range(epochs), loss3, color='b',
             label='preprocessing + supervised learning')
    plt.legend(loc=1)  # Legend in the upper right.
    plt.xlabel('epochs')
    plt.ylabel('Error')
    plt.title('grid_size=10,discount=0.9')
    plt.plot()
    plt.show()
def main():
    env = GridWorld(5, 5)

    # Define the state matrix.
    state_matrix = np.zeros((5, 5))
    state_matrix[0, 4] = 1
    print("State Matrix:")
    print(state_matrix)

    # Define the reward matrix.
    reward_matrix = np.full((5, 5), 0)
    reward_matrix[0, 4] = 1
    print("Reward Matrix:")
    print(reward_matrix)

    # Define the transition matrix.
    transition_matrix = np.array([[0.7, 0.1, 0.1, 0.1],
                                  [0.1, 0.7, 0.1, 0.1],
                                  [0.1, 0.1, 0.7, 0.1],
                                  [0.1, 0.1, 0.1, 0.7]])

    # Random policy.
    policy_matrix = np.random.randint(low=0, high=4,
                                      size=(5, 5)).astype(np.float32)
    policy_matrix[0, 4] = -1

    # Set the matrices in the world.
    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    state_action_matrix = np.random.random_sample((4, 25))  # Q
    # Init with 1.0e-10 to avoid division by zero.
    running_mean_matrix = np.full((4, 25), 1.0e-10)
    gamma = 0.5
    tot_epoch = 30000
    print_epoch = 3000

    for epoch in range(tot_epoch):
        # Starting a new episode.
        episode_list = list()
        # Reset and return the first observation.
        observation = env.reset(exploring_starts=False)
        # action = np.random.choice(4, 1)
        # action = policy_matrix[observation[0], observation[1]]
        # episode_list.append((observation, action, reward))
        is_starting = True
        for _ in range(1000):
            # Take the action from the action matrix.
            action = policy_matrix[observation[0], observation[1]]
            # If the episode just started then it is necessary to choose a
            # random action (exploring starts).
            if is_starting:
                action = np.random.randint(0, 4)
                is_starting = False
            # Move one step in the environment and get obs and reward.
            new_observation, reward, done = env.step(action)
            # Append the visit to the episode list.
            episode_list.append((observation, action, reward))
            observation = new_observation
            if done:
                break

        # The episode is finished; now estimate the utilities.
        # pdb.set_trace()
        counter = 0
        # Checkup matrix to identify the first visit to a state.
        checkup_matrix = np.zeros((4, 25))
        # This cycle is the implementation of First-Visit MC. For each state
        # stored in the episode list, check whether it is the first visit and
        # then estimate the return.
        for visit in episode_list:
            observation = visit[0]
            action = visit[1]
            # Row-major index into the 25 states of the 5x5 grid.
            col = int(observation[1] + (observation[0] * 5))
            row = int(action)
            if checkup_matrix[row, col] == 0:
                return_value = get_return(episode_list[counter:], gamma)
                running_mean_matrix[row, col] += 1
                state_action_matrix[row, col] += return_value
                checkup_matrix[row, col] = 1
            counter += 1
        # Policy update.
        policy_matrix = update_policy(
            episode_list, policy_matrix,
            state_action_matrix / running_mean_matrix)
        # Printing.
        if epoch % print_epoch == 0:
            print("")
            print("State-Action matrix after " + str(epoch + 1) + " iterations:")
            print(state_action_matrix / running_mean_matrix)
            print("Policy matrix after " + str(epoch + 1) + " iterations:")
            print(policy_matrix)
            print_policy(policy_matrix)

    # Time to check the utility matrix obtained.
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(state_action_matrix / running_mean_matrix)
    print(policy_matrix)

    state_value_matrix = state_action_matrix.max(axis=0)
    print(state_value_matrix)
    policy_matrix[policy_matrix == -1] = 0
    final_policy_list = policy_matrix.reshape(-1).astype(int)
    print(final_policy_list)

    # ## Random State Transition Matrix

    # In[13]:

    random_state_transition_matrix = np.random.rand(25, 4, 25)
    # Normalise over next states so each (state, action) distribution sums to 1.
    random_state_transition_matrix = (
        random_state_transition_matrix /
        random_state_transition_matrix.sum(axis=2)[:, :, None])
    print(random_state_transition_matrix.shape)

    # ## With a handcrafted State Transition Matrix

    fixed_state_transition_matrix = np.load("gw_transition_probability.npy")
    print(fixed_state_transition_matrix.shape)

    # In[20]:

    r_random = irl(n_states=25, n_actions=4,
                   transition_probability=random_state_transition_matrix,
                   policy=final_policy_list, discount=0.2, Rmax=1, l1=5)

    # In[24]:

    fig = plt.figure()
    fig.subplots_adjust(hspace=0.4, wspace=0.4)

    plt.subplot(3, 2, 1)
    plt.pcolor(np.flip(reward_matrix, 0))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(3, 2, 2)
    plt.pcolor(r_random.reshape((5, 5)))
    plt.colorbar()
    plt.title("Recovered reward (RANDOM)")
    # plt.show()

    r_fixed = irl(n_states=25, n_actions=4,
                  transition_probability=fixed_state_transition_matrix,
                  policy=final_policy_list, discount=0.5, Rmax=10, l1=5)

    plt.subplot(3, 2, 3)
    plt.pcolor(np.flip(reward_matrix, 0))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(3, 2, 4)
    plt.pcolor(r_fixed.reshape((5, 5)))
    plt.colorbar()
    plt.title("Recovered reward (FIXED)")
    # plt.show()

    # Part to be ignored: it just uses Matthew's code for verification.
    import irl.linear_irl as linear_irl
    import irl.mdp.gridworld as gridworld

    grid_size = 5
    discount = 0.2
    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)]
    print(policy)
    # final_policy_list = list(final_policy.reshape(-1).astype(int))
    r = linear_irl.irl(gw.n_states, gw.n_actions, gw.transition_probability,
                       policy, gw.discount, 1, 5)
    print(r.shape)
    print(gw.optimal_policy_deterministic)
    print(np.array(policy).reshape(5, 5))

    plt.subplot(3, 2, 5)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(3, 2, 6)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
def main(date, discount, epochs, learning_rate, train=True):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    discount: MDP discount factor. float.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    # This part is used to calculate the uniform reward parameter:
    #
    # id_trajectory = load.load_trajectory(10000)
    # print(tools.motion_model_policy(id_trajectory))
    # for i in range(1000):
    #     graph_trajectories = tools.choose_trajectory(1000, id_trajectory)
    #     g = load.load_graph_traj(graph_trajectories)
    #     sample_trajectories = sample(graph_trajectories, 100)
    #     gw = gridworld.Gridworld(g, 0.9)
    #     feature_matrix = gw.feature_matrix(g)
    #     alpha = maxent.irl(g, feature_matrix, sample_trajectories, 40, 0.05)
    #     path = str("D:/Ubicomp/alpha" + str(i) + ".txt")
    #     numpy.savetxt(path, alpha)

    # This part is used for temporal reward parameter training.
    try:
        starttime = datetime.datetime.now()

        path = "D:/ClosePFLOW/"
        dirs = os.listdir(path)
        for dirname in dirs:
            directory = path + dirname + "/"
            print(directory)
            if not os.path.exists(directory + "sim/"):
                os.mkdir(directory + "sim/")
            tools.move_files(directory)
            if os.path.exists(directory + "training/"):
                id_traj = load.load_directory_trajectory(directory + "training/")
                if (len(id_traj) >= 40 and
                        not os.path.exists(directory + "param.csv")
                    ) or os.path.getsize(directory + "param.csv") > 2038:
                    trajectories = id_traj.values()
                    g = load.load_graph_traj(trajectories)
                    gw = gridworld.Gridworld(g, discount)
                    feature_matrix = gw.feature_matrix(g)

                    # train
                    print("training ", directory)
                    maxent.t_irl(g, feature_matrix, trajectories, epochs,
                                 learning_rate, directory)

        indicator = 0
        i = 0
        while indicator <= 5000:
            sample_id = []
            trajectories = []
            for k in range(indicator, indicator + 100):
                sample_id.append(id_list[k])
            for sid in sample_id:
                trajectories.append(id_traj.get(sid))

            start_state = []
            for traj in trajectories:
                start_state.append(traj[12][0])

            training_data = ("C:/Users/PangYanbo/Desktop/UbiResult/"
                             "TrainingTrajectoriesGroup_" + str(i) + ".csv")
            with open(training_data, "w") as f:
                for k in range(100):
                    for j in range(12, 47):
                        if j in trajectories[k].keys():
                            f.write(str(j) + ',' +
                                    trajectories[k][j][1].get_origin() + ',' +
                                    trajectories[k][j][1].get_destination() +
                                    ',' + trajectories[k][j][1].get_mode() +
                                    '\n')

            # Initialise the environment based on the trajectories.
            g = load.load_graph_traj(trajectories)
            gw = gridworld.Gridworld(g, discount)
            feature_matrix = gw.feature_matrix(g)
            print(g)

            if train:
                # Train the model.
                maxent.t_irl(g, feature_matrix, trajectories, epochs,
                             learning_rate, date)
            else:
                # Simulation.
                for start in start_state:
                    # Read alpha from a saved file.
                    root = "C:/Users/PangYanbo/Desktop/UbiResult/param/"
                    para_list = list(os.path.join(root, name)
                                     for name in os.listdir(root))
                    for filename in para_list:
                        if os.path.isdir(filename):
                            para_list.remove(filename)
                    param_path = random.choice(para_list)
                    agent_id = param_path[43:-4]
                    print(agent_id, param_path)

                    t_alpha = {}
                    with open(param_path, 'r') as f:
                        t = 12
                        for line in f:
                            line = line.strip('\n')
                            tokens = line.split(",")
                            param = numpy.zeros(11)
                            for j in range(11):
                                if len(tokens) > j:
                                    param[j] = tokens[j]
                            t_alpha[t] = param.copy()
                            t += 1

                    r = dict()
                    for t in range(12, 48):
                        r[t] = dict().fromkeys(g.get_edges(), 0)

                    for edge in g.get_edges():
                        for t in range(12, 48):
                            if t in t_alpha.keys():
                                r[t][edge] = feature_matrix[edge].dot(t_alpha[t])

                    tools.generate_temporal_traj(g, r, start, 0.5, i, agent_id)

            i += 1
            indicator += 50

        endtime = datetime.datetime.now()
        print("finished reading files with time of " + str(endtime - starttime))
    except Exception:
        print("something wrong")
        raise