def parameterTest():
    # Sweep Dyna-Q hyperparameters: planning steps (50, 70, 100) crossed with
    # exploration rates (eps = 0.1, 0.2, 0.3), over three sampled grid worlds.
    train50eps1_avgs = []
    train50eps2_avgs = []
    train50eps3_avgs = []
    train70eps1_avgs = []
    train70eps2_avgs = []
    train70eps3_avgs = []
    train100eps1_avgs = []
    train100eps2_avgs = []
    train100eps3_avgs = []
    training_steps = 10000000

    for i in range(0, 3):
        gridWorldModel = GridWorld(m, n, k, debug=False, gamma=1, no_stochastisity=False)
        Q1 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q2 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q3 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q4 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q5 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q6 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q7 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q8 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q9 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        learning_rate = 0.1

        # 50 planning steps per real step
        q1, pi1, episode_steps1 = tabular_dyna_q(gridWorldModel, Q1, learning_rate, training_steps, 50, num_of_episodes=1000, eps=0.1)
        q2, pi2, episode_steps2 = tabular_dyna_q(gridWorldModel, Q2, learning_rate, training_steps, 50, num_of_episodes=1000, eps=0.2)
        q3, pi3, episode_steps3 = tabular_dyna_q(gridWorldModel, Q3, learning_rate, training_steps, 50, num_of_episodes=1000, eps=0.3)
        #eps = range(len(episode_steps1))
        #plt.plot(eps, episode_steps1)
        #plt.plot(eps, episode_steps2)
        #plt.plot(eps, episode_steps3)
        #plt.xlabel('Episodes')
        #plt.ylabel('Steps')
        #plt.title('Steps per Episode')
        #plt.show()

        # 70 planning steps per real step
        q4, pi4, episode_steps4 = tabular_dyna_q(gridWorldModel, Q4, learning_rate, training_steps, 70, num_of_episodes=1000, eps=0.1)
        q5, pi5, episode_steps5 = tabular_dyna_q(gridWorldModel, Q5, learning_rate, training_steps, 70, num_of_episodes=1000, eps=0.2)
        q6, pi6, episode_steps6 = tabular_dyna_q(gridWorldModel, Q6, learning_rate, training_steps, 70, num_of_episodes=1000, eps=0.3)
        #eps = range(len(episode_steps4))
        #plt.plot(eps, episode_steps4)
        #plt.plot(eps, episode_steps5)
        #plt.plot(eps, episode_steps6)
        #plt.xlabel('Episodes')
        #plt.ylabel('Steps')
        #plt.title('Steps per Episode')
        #plt.show()

        # 100 planning steps per real step
        q7, pi7, episode_steps7 = tabular_dyna_q(gridWorldModel, Q7, learning_rate, training_steps, 100, num_of_episodes=1000, eps=0.1)
        q8, pi8, episode_steps8 = tabular_dyna_q(gridWorldModel, Q8, learning_rate, training_steps, 100, num_of_episodes=1000, eps=0.2)
        q9, pi9, episode_steps9 = tabular_dyna_q(gridWorldModel, Q9, learning_rate, training_steps, 100, num_of_episodes=1000, eps=0.3)
        #eps = range(len(episode_steps7))
        #plt.plot(eps, episode_steps7)
        #plt.plot(eps, episode_steps8)
        #plt.plot(eps, episode_steps9)
        #plt.xlabel('Episodes')
        #plt.ylabel('Steps')
        #plt.title('Steps per Episode')
        #plt.show()

        train50eps1_steps = 0
        train50eps2_steps = 0
        train50eps3_steps = 0
        train70eps1_steps = 0
        train70eps2_steps = 0
        train70eps3_steps = 0
        train100eps1_steps = 0
        train100eps2_steps = 0
        train100eps3_steps = 0

        # Evaluate each learned policy on 10 fresh rollouts from the same start cell.
        for j in range(0, 10):
            #print("inst world model...")
            gridWorldModel.reset(start_cell=(m - 1))
            gw1 = copy.deepcopy(gridWorldModel)
            gw2 = copy.deepcopy(gridWorldModel)
            gw3 = copy.deepcopy(gridWorldModel)
            gw4 = copy.deepcopy(gridWorldModel)
            gw5 = copy.deepcopy(gridWorldModel)
            gw6 = copy.deepcopy(gridWorldModel)
            gw7 = copy.deepcopy(gridWorldModel)
            gw8 = copy.deepcopy(gridWorldModel)
            gw9 = copy.deepcopy(gridWorldModel)
            #visualizeGridValueFunc(gw)
            #print("exec sweep policy for episode...")
            train50eps1_steps += exec_policy_for_episode(gw1, pi1)
            train50eps2_steps += exec_policy_for_episode(gw2, pi2)
            train50eps3_steps += exec_policy_for_episode(gw3, pi3)
            train70eps1_steps += exec_policy_for_episode(gw4, pi4)
            train70eps2_steps += exec_policy_for_episode(gw5, pi5)
            train70eps3_steps += exec_policy_for_episode(gw6, pi6)
            train100eps1_steps += exec_policy_for_episode(gw7, pi7)
            train100eps2_steps += exec_policy_for_episode(gw8, pi8)
            train100eps3_steps += exec_policy_for_episode(gw9, pi9)
            #print("rl steps" + str(rl_steps))
            #print("sweep steps" + str(sweep_steps))
            # nn_tour_expected_steps += gw.graph.calc_path_cost(base_line_tour)

        train50eps1_avgs.append(train50eps1_steps / 10)
        train50eps2_avgs.append(train50eps2_steps / 10)
        train50eps3_avgs.append(train50eps3_steps / 10)
        train70eps1_avgs.append(train70eps1_steps / 10)
        train70eps2_avgs.append(train70eps2_steps / 10)
        train70eps3_avgs.append(train70eps3_steps / 10)
        train100eps1_avgs.append(train100eps1_steps / 10)
        train100eps2_avgs.append(train100eps2_steps / 10)
        train100eps3_avgs.append(train100eps3_steps / 10)

    # Grouped bar chart: one group per outer experiment, nine bars per group.
    experiment_nums = ('1', '2', '3')
    y_pos = np.arange(len(experiment_nums))
    bar_width = 0.075
    rects1 = plt.bar(y_pos, train50eps1_avgs, bar_width, color='b', label='train50eps.1')
    rects2 = plt.bar(y_pos + bar_width, train50eps2_avgs, bar_width, color='g', label='train50eps.2')
    rects3 = plt.bar(y_pos + 2 * bar_width, train50eps3_avgs, bar_width, color='r', label='train50eps.3')
    rects4 = plt.bar(y_pos + 3 * bar_width, train70eps1_avgs, bar_width, color='b', label='train70eps.1')
    rects5 = plt.bar(y_pos + 4 * bar_width, train70eps2_avgs, bar_width, color='g', label='train70eps.2')
    rects6 = plt.bar(y_pos + 5 * bar_width, train70eps3_avgs, bar_width, color='r', label='train70eps.3')
    rects7 = plt.bar(y_pos + 6 * bar_width, train100eps1_avgs, bar_width, color='b', label='train100eps.1')
    rects8 = plt.bar(y_pos + 7 * bar_width, train100eps2_avgs, bar_width, color='g', label='train100eps.2')
    rects9 = plt.bar(y_pos + 8 * bar_width, train100eps3_avgs, bar_width, color='r', label='train100eps.3')
    plt.xticks(y_pos + 4 * bar_width, experiment_nums)  # center the tick under the nine-bar group
    plt.ylabel('Average Number of Steps')
    plt.xlabel('Experiment Number')
    plt.title('Average Number of Steps per Combination with Reward 20')
    plt.legend()
    plt.show()
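# exec_policy_for_episode() is used throughout these experiments but is defined elsewhere
# in the repo. A minimal sketch of such a rollout helper is kept commented out below;
# the is_terminal()/current_state/step() interface and the pi.action() call are
# assumptions about the GridWorld/policy API, not the repo's actual signatures:
#
# def exec_policy_for_episode(gw, pi, max_steps=100000):
#     """Roll out policy pi on an already-reset GridWorld and return the step count."""
#     steps = 0
#     state = gw.current_state
#     while not gw.is_terminal(state) and steps < max_steps:
#         action = pi.action(state)
#         state, reward = gw.step(action)
#         steps += 1
#     return steps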
# Initialize an 8x8 gridworld with 2 items
n = 8
m = 8
k = 2

nn_avgs = []
sweep_avgs = []
dyna_avgs = []

# Run for 10 different distributions. Train RL, and then compare on 100 episodes each.
plot_learning_curve = True
for i in range(0, 10):
    gridWorldModel = GridWorld(m, n, k, debug=False, gamma=1, no_stochastisity=False)
    #visualizeGridValueFunc(gridWorldModel)
    visualizeGridProbabilities(gridWorldModel, k, aggregate=True)

    # Testing
    # testRandomPolicy(gridWorldModel)
    eval_pi = testDynaQ(gridWorldModel, plot=plot_learning_curve)
    #parameterTest()

    (nn_avg, sweep_avg, dyna_avg) = compareToBaseLine(gridWorldModel, eval_pi, k)
    nn_avgs.append(nn_avg)
    sweep_avgs.append(sweep_avg)
    dyna_avgs.append(dyna_avg)
    plot_learning_curve = False  # only plot the learning curve for the first instance
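# A possible summary of the collected baselines, mirroring the grouped bar-chart style of
# parameterTest() (sketch, commented out; bar placement and labels are choices made here):
#
# x = np.arange(len(dyna_avgs))
# w = 0.25
# plt.bar(x, nn_avgs, w, label='nearest neighbor tour')
# plt.bar(x + w, sweep_avgs, w, label='sweep policy')
# plt.bar(x + 2 * w, dyna_avgs, w, label='dyna-q policy')
# plt.xticks(x + w, [str(i + 1) for i in range(len(dyna_avgs))])
# plt.xlabel('Grid World Instance')
# plt.ylabel('Average Number of Steps')
# plt.legend()
# plt.show()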
# Hyperparameters for this run.
params.MAX_MEM_SIZE = 50
params.NUM_BOX = 4
params.BRANCH_COUNT = 1
params.MAX_EPISODES = 200000
params.LR_ACTOR = .002
params.NORMALIZE_Q = False
params.ST = 10
params.IC_Lambda = 2.5

optimizer = tf.keras.optimizers.Adam(params.LR_ACTOR)
# optimizer = tfa.optimizers.SWA(optimizer, average_period=4)

env = GridWorld(max_episode=params.MAX_STEPS, max_branch_num=params.BRANCH_COUNT)
env.set_rewards(0, 1., 10., -.1)
env.max_length = 3

# Note: the Adam optimizer above was already built with LR_ACTOR = .002;
# reassigning LR_ACTOR here does not change that optimizer's learning rate.
params.LR_ACTOR = .001
params.DISCOUNT_GAMMA = .2


class Memory(object):
    # Episode buffer for observations, actions and rewards.
    def __init__(self):
        self.ep_obs, self.ep_act, self.ep_rwd = [], [], []
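# Only Memory.__init__ appears in this excerpt. A hypothetical sketch of the methods a
# buffer like this typically needs for policy-gradient updates (method names and array
# shapes are assumptions, kept commented out to avoid clashing with the real definition):
#
#     def store_transition(self, obs, act, rwd):
#         self.ep_obs.append(obs)
#         self.ep_act.append(act)
#         self.ep_rwd.append(rwd)
#
#     def to_arrays(self):
#         return np.vstack(self.ep_obs), np.array(self.ep_act), np.array(self.ep_rwd)
#
#     def reset(self):
#         self.ep_obs, self.ep_act, self.ep_rwd = [], [], []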
    ep += 1
    d = 0
    if (np.sum(_locals['true_reward']) > 2):
        d = 1
    if (np.sum(_locals['true_reward']) > 0):
        p = 2
    print(ep, avgSuccess.add(d))
    logging.info("episode : %d average success rate : ,%.2f" % (ep, avgSuccess.get()))
    n_steps += 1
    return True


# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

env = DummyVecEnv([lambda: GridWorld(50, max_branch_num=1) for _ in range(1)])
model = A2C(MlpPolicy, env, verbose=0, tensorboard_log=None, full_tensorboard_log=False,
            learning_rate=0.001, gamma=0.99)
time_steps = 1e8
model.learn(total_timesteps=int(time_steps), callback=callback)
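# avgSuccess (used in the callback above) is created elsewhere in the repo; from the
# add()/get() calls it appears to track a running success rate. A hypothetical stand-in
# with the same interface (the class name and window size are assumptions):
class RunningAverage(object):
    def __init__(self, window=100):
        self.window = window
        self.values = []

    def add(self, value):
        # Record the newest outcome, keep only the last `window` entries, return the mean.
        self.values.append(value)
        if len(self.values) > self.window:
            self.values.pop(0)
        return self.get()

    def get(self):
        return sum(self.values) / float(len(self.values)) if self.values else 0.0

# e.g. avgSuccess = RunningAverage(window=100)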
params = dotdict({})
params.ILP_VALUE = False
params.HARD_CHOICE = False
params.DBL_SOFTMAX = False
params.REMOVE_REP = False
params.RESET_RANDOM = False
params.MAX_EPISODES = 200000
params.MAX_STEPS = 50
params.EPS_TH = 0
update_freq = 10
params.MAX_MEM_SIZE = 50
params.NUM_BOX = 4
params.BRANCH_COUNT = 1

env = GridWorld(max_episode=params.MAX_STEPS, max_branch_num=params.BRANCH_COUNT)
env.set_rewards(0., 1, 10., -.01)
env.max_length = 3

params.LR_ACTOR = .0005
params.LR_VALUE = .01
params.DISCOUNT_GAMMA = .2
print(params)


# non-CNN version
def img_to_act(x, dim_out):
    state_in_x = tf.layers.dense(x / 255, 30 * COLOR_COUNT, tf.nn.relu,
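# The img_to_act definition above is truncated here. A hypothetical completion, assuming
# it maps the 0-255 observation to dim_out action logits through dense layers (layer
# sizes and the final linear layer are guesses, kept commented out):
#
# def img_to_act(x, dim_out):
#     state_in_x = tf.layers.dense(x / 255, 30 * COLOR_COUNT, tf.nn.relu)
#     hidden = tf.layers.dense(state_in_x, 30, tf.nn.relu)
#     return tf.layers.dense(hidden, dim_out, activation=None)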
k = 2
#gw = GridWorld(m, n, k, debug=False)
#visualizeGridProbabilities(gw, k)
#Q = np.zeros((gridWorldModel._env_spec.nS, gridWorldModel._env_spec.nA))
#dyna_model_training_steps = 50
#learning_rate = 0.1
#q, pi = tabular_dyna_q(gridWorldModel, Q, learning_rate, training_steps, model_training_steps)

sweep_pi = policy.HandMadeSweepPolicy(4, m, n)
episodes_num = 100
start_state = 0
sweep_steps = 0
nn_tour_expected_steps = 0

#for i in tqdm(range(episodes_num)):
gw = GridWorld(m, n, k, debug=True)
visualizeGridProbabilities(gw, k, aggregate=True)
base_line_tour, nn_tour_expected_steps = gw.graph.get_approximate_best_path(start_vertex=m - 1)
print("nearest_neighbor_tour:" + str(base_line_tour))

for i in range(0, episodes_num):
    print("inst world model...")
    gw.reset(start_cell=m - 1)
    visualizeGridValueFunc(gw)
    print("exec sweep policy for episode...")
    sweep_steps += exec_policy_for_episode(gw, sweep_pi)
    print("get nearest neighbor tour...")
    print("get nn tour cost...")
    #nn_tour_expected_steps += gw.graph.calc_path_cost(base_line_tour)
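# Possible follow-up report (sketch, commented out): compare the sweep policy's average
# episode length against the nearest-neighbor tour cost computed above.
#
# print("avg sweep policy steps per episode: " + str(sweep_steps / episodes_num))
# print("nearest neighbor tour expected steps: " + str(nn_tour_expected_steps))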