def path_discounted_returns(env, gamma, num_traj, policy=None, simpy=False, printing=False):
    """Simulate rollouts of ``policy`` in ``env`` and summarize discounted returns.

    Parameters
    ----------
    env : environment (wrapped in ``TfEnv`` if it is not one already)
    gamma : float
        Continuous-time discount rate; each step is weighted by
        ``exp(-gamma * sojourn_time)``.
    num_traj : int
        Number of rollouts to simulate.
    policy : optional
        Policy to roll out. Defaults to a fresh ``test_policy()`` per call.
    simpy : bool
        If True use ``ed_simpy_dec_rollout`` instead of ``ed_dec_rollout``.
    printing : bool
        If True print progress and timing statistics.

    Returns
    -------
    (mean, stderr, returns) : tuple
        Mean per-path discounted return, its standard error over
        ``num_traj`` rollouts, and the list of per-path returns.
    """
    # BUGFIX: the original signature used `policy=test_policy()`, which is
    # evaluated once at import time and silently shared across every call.
    # A None sentinel builds a fresh default policy per call instead.
    if policy is None:
        policy = test_policy()

    if printing:
        print('Simulating %d Rollouts...' % (num_traj))
    start_time = time.time()

    if not isinstance(env, TfEnv):
        env = TfEnv(env)

    paths = []
    rollout_times = []
    if printing:
        bar = progressbar.ProgressBar()
        iterator = bar(range(num_traj))
    else:
        iterator = range(num_traj)

    for i in iterator:
        start_time_r = time.time()
        if simpy:
            paths.append(ed_simpy_dec_rollout(env, policy))
        else:
            paths.append(ed_dec_rollout(env, policy))
        rollout_times.append(time.time() - start_time_r)

    # Each rollout yields a list of per-agent paths; flatten to one list.
    paths = [item for sublist in paths for item in sublist]

    adr = []
    for path in paths:
        t_sojourn = path["offset_t_sojourn"]
        # Continuous-time (SMDP) discounting: per-step factor exp(-gamma * dt).
        discount_gamma = np.exp(-gamma * t_sojourn)
        path_adr = variable_discount_cumsum(path["rewards"], discount_gamma)
        # Element 0 of the discounted cumulative sum is the whole-path return.
        adr.append(path_adr[0])

    elapsed = time.time() - start_time
    if printing:
        print('Time Elapsed %.2f, or %.7f +- %.7f per rollout' %
              (elapsed, mean(rollout_times),
               std(rollout_times) / np.sqrt(num_traj)))
    return mean(adr), std(adr) / np.sqrt(num_traj), adr
)
# NOTE(review): the `)` above closes a statement (likely the load of `obj`)
# that lies outside this chunk — left untouched.
# --- Evaluation script: roll out a loaded policy and report return stats ---
print(obj)
env = obj['env']
policy = obj['policy']
agents = policy
print('Learned Policy')
average_discounted_rewards = []
GAMMA = 0. #math.log(0.9)/(-5.) # decay to 90% in 5 seconds
# With GAMMA == 0 the discount factors below are all exp(0) == 1 (undiscounted).
for i in range(20):
    paths = ed_dec_rollout(env, agents)
    for path in paths:
        t_sojourn = path["offset_t_sojourn"]
        # Continuous-time discount per step: exp(-GAMMA * sojourn_time).
        discount_gamma = np.exp(-GAMMA * t_sojourn)
        path["returns"] = variable_discount_cumsum(
            path["rewards"], discount_gamma)
        # NOTE(review): this appends the *undiscounted* reward sum rather
        # than path["returns"][0]; equivalent only while GAMMA == 0 — confirm.
        average_discounted_rewards.append(sum(path["rewards"]))
    # Progress print every 10th iteration. NOTE(review): nesting inferred
    # from a whitespace-collapsed source; `i` implies outer-loop scope.
    if (i % 10 == 0):
        print('Iteration: ', i)
print(len(average_discounted_rewards))
print(np.mean(average_discounted_rewards), np.std(average_discounted_rewards))

# NOTE(review): mid-file import, apparently unused in this chunk — presumably
# consumed by code past the end of this view; verify before removing.
from fire_smdp_params import test_policy_smarter