def init_transitions(self, horizon):
    """Precompute and cache one transition matrix per attitude for the given horizon."""
    # Value estimates are computed against the true ("neutral") transition function.
    neutral_tmat = generate_transitions(self.states, self.actions, self.true_tran_func)
    realvals, _ = generate_policy(horizon, self.max_time, neutral_tmat, self.rmat)
    for i, (aname, gamma) in enumerate(self.attitudes):
        # Build an attitude-specific transition function from the true values.
        transition_func = self.make_transition_func(aname, gamma, realvals)
        self.transitions[aname] = generate_transitions(self.states, self.actions,
                                                       transition_func, self.max_time)
        # Cache the transition matrix on disk so later runs can skip recomputation.
        fname = self.dirname + "transitions_{0}_{1}_h{2}.p".format(i, aname, horizon)
        self.transitions[aname].dump(fname)
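# Hedged example (not part of the original module): init_transitions() above caches
# each attitude's transition matrix with numpy's ndarray.dump(), which writes a raw
# pickle. A sketch of how such a cached matrix could be reloaded is below. The helper
# name and its arguments are illustrative assumptions; only the file-name pattern
# mirrors the one used above.
import numpy as np

def load_cached_transitions(dirname, attitude_index, attitude_name, horizon):
    """Reload a transition matrix pickled by init_transitions() (illustrative sketch)."""
    fname = dirname + "transitions_{0}_{1}_h{2}.p".format(attitude_index, attitude_name, horizon)
    # ndarray.dump() writes a pickle, so allow_pickle must be enabled on load.
    return np.load(fname, allow_pickle=True)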
def run_sims(dirname, horizons, attitudes, model_type, model, num_episodes):
    """Simulate episodes for each horizon and attitude; write results to CSV and text files."""
    max_time = max(horizons)
    time_based = False
    if model_type == "valopt":
        time_based = True

    # output setup
    csvout = open(dirname + "results_{0}_data.csv".format(num_episodes), 'w+')
    writer = csv.writer(csvout)
    writer.writerow(["Horizon", "AvgEarnings", "StdErr", "AttitudeIndex", "AttitudeName"])
    readableout = open(dirname + "results_{0}_readable.txt".format(num_episodes), 'w+')
    verboseout = open(dirname + "results_{0}_verbose.txt".format(num_episodes), 'w+')
    print_readverb = make_multiprint([readableout, verboseout])

    # variables only used for constvar
    # TODO: allow for >2 actions?
    num_actions = 2
    new_attitudes = []
    policies = []
    if model_type == "constvar":
        for action_idx in range(num_actions):
            policies.append(const_policy(action_idx, max_time, len(model.states)))
        results = np.zeros((len(attitudes) * num_actions, len(horizons), 3))
    else:
        results = np.zeros((len(attitudes), len(horizons), 3))

    for horizon_idx, horizon in enumerate(horizons):
        print_readverb("Horizon: {}".format(horizon))
        if model_type == "constvar":
            # In the `constvar` model, we vary the true transition matrix, rather than
            # the policy, based on beliefs. We take a constant policy (one per action).
            for attitude_idx, attitude in enumerate(attitudes):
                # compute results
                tmat = model.transitions[attitude]
                rmat = model.rewards[attitude]
                for action_idx in range(num_actions):
                    new_att_idx = (attitude_idx * num_actions) + action_idx
                    new_attitude = attitude + str(action_idx)
                    if len(new_attitudes) < len(attitudes) * num_actions:
                        new_attitudes.append(new_attitude)
                    totals, mean, stderr = episode_stats(policies[action_idx], num_episodes,
                                                         max_time, tmat, rmat, time_based)
                    results[new_att_idx][horizon_idx] = [horizon, mean, stderr]
                    # output to files
                    writer.writerow([horizon, mean, stderr, new_att_idx, new_attitude])
                    print_readverb("{0}: mean = {1}, stderr = {2}".format(new_attitude, mean, stderr))
                    print("Totals: {0}".format(totals), file=verboseout)
                    print("Policy for {0}: {1}".format(new_attitude, policies[action_idx]),
                          file=verboseout)
        else:
            if model_type == "valopt":
                model.init_transitions(horizon)
            # note: "neutral" must be an attitude
            neutral_tmat = model.transitions['neutral']
            for attitude_idx, attitude in enumerate(attitudes):
                rmat = model.rewards[attitude]
                if model_type == "const":
                    # TODO: allow const to have something other than 0
                    policy = const_policy(0, max_time, rmat.shape[1])
                else:
                    tmat = model.transitions[attitude]
                    _, policy = generate_policy(horizon, max_time, tmat, rmat, time_based)
                totals, mean, stderr = episode_stats(policy, num_episodes, max_time,
                                                     neutral_tmat, rmat, time_based)
                results[attitude_idx][horizon_idx] = [horizon, mean, stderr]
                # output to files
                writer.writerow([horizon, mean, stderr, attitude_idx, attitude])
                print_readverb("{0}: mean = {1}, stderr = {2}".format(attitude, mean, stderr))
                print("Totals: {0}".format(totals), file=verboseout)
                print("Policy for {0}: {1}".format(attitude, policy), file=verboseout)
                polfname = "policy_{0}_{1}_{2}_{3}.p".format(num_episodes, horizon,
                                                             attitude_idx, attitude)
                # pickling needs a binary-mode file handle
                cPickle.dump((attitude, policy), open(dirname + polfname, 'wb'))

    if len(new_attitudes) > 0:
        attitudes = new_attitudes
    return results, attitudes
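# Hedged sketch (assumption): make_multiprint is called in run_sims() but is not
# defined in this file. One plausible implementation, matching how print_readverb is
# used above, is a closure that echoes the same line to several open file handles.
# The name make_multiprint_sketch marks this as illustrative, not the original helper.
def make_multiprint_sketch(outfiles):
    """Return a print-like callable that writes the same message to every file in outfiles."""
    def _multiprint(msg):
        for out in outfiles:
            out.write(str(msg) + "\n")
    return _multiprint

# Usage would mirror the call above, e.g.:
#   print_readverb = make_multiprint_sketch([readableout, verboseout])
#   print_readverb("Horizon: {}".format(horizon))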