def compute_reward(self, states, actions, resps):
     trajectory = pytorch_model.unwrap(states[:-1, :self.traj_dim])
     saliency_trajectory = pytorch_model.unwrap(states[:-1, self.traj_dim:])
     # print("states shape", trajectory.shape, saliency_trajectory.shape)
     assignments, cps = self.model.get_mode(trajectory, saliency_trajectory)
     rewards = []
     # print(assignments, cps)
     rewarded = False
     for asmt in assignments:
         # if asmt == self.desired_mode:
         #### DANGEROUS LINE ####
         if asmt == self.desired_mode and not rewarded:
             rewards.append(1)
             rewarded = True
         else:
             rewards.append(0)
     rewards.append(0)  # match the number of changepoints
     full_rewards = []
     lcp = 0
     lr = 0
     cps.append(len(trajectory))
     # print(cps, rewards)
     for cp, r in zip(cps, rewards):
         if self.seg_reward:  # reward copied over all time steps
             full_rewards += [r] * (cp - lcp)
         else:
             if r == 1 and cp == 0:
                 r = 0
             full_rewards += [0] * (cp - lcp - 1) + [r]
         lcp = cp
         lr = r
     # print(rewards, cps, full_rewards)
     return pytorch_model.wrap(np.array(full_rewards), cuda=self.cuda)
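To make the expansion loop above easier to follow, here is a small plain-Python sketch (made-up changepoints, no project dependencies) of how the per-changepoint rewards are spread over time steps in both the `seg_reward` and sparse cases:

def expand_rewards(cps, rewards, traj_len, seg_reward, base=0):
    # mirrors the expansion loop above: cps are changepoint indices into the
    # trajectory, rewards has one entry per segment (len(cps) + 1 after the
    # trailing append in compute_reward)
    cps = cps + [traj_len]
    full_rewards, lcp = [], 0
    for cp, r in zip(cps, rewards):
        if seg_reward:                      # copy the reward over the whole segment
            full_rewards += [r] * (cp - lcp)
        else:                               # sparse: reward only at the changepoint
            if r == 1 and cp == 0:
                r = base
            full_rewards += [base] * (cp - lcp - 1) + [r]
        lcp = cp
    return full_rewards

# two changepoints at t=3 and t=7 in a 10-step trajectory; only the segment
# ending at t=7 matched the desired mode
print(expand_rewards([3, 7], [0, 1, 0], 10, seg_reward=False))
# -> [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
print(expand_rewards([3, 7], [0, 1, 0], 10, seg_reward=True))
# -> [0, 0, 0, 1, 1, 1, 1, 0, 0, 0]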
Example #2
 def fc_dqn_network(num_actions, network_type, state):
     """The convolutional network used to compute the agent's Q-values.
 Args:
 num_actions: int, number of actions.
 network_type: namedtuple, collection of expected values to return.
 state: `tf.Tensor`, contains the agent's current state.
 Returns:
 net: _network_type object containing the tensors output by the network.
 """
     q_values = gym_lib._basic_discrete_domain_network(
         pytorch_model.unwrap(minmax[0]), pytorch_model.unwrap(minmax[1]),
         num_actions, state)
     return network_type(q_values)
Example #3
def unwrap_or_none(val):
    if val is not None:
        if isinstance(val, torch.Tensor):
            return pytorch_model.unwrap(val)
        return val
    else:
        return -1.0
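Every snippet on this page relies on a `pytorch_model` helper class that is not shown. A minimal sketch of the interface its calls imply, with `wrap` moving data onto a (possibly CUDA) torch tensor and `unwrap` converting back to numpy; this is an assumed reconstruction, not the project's actual class:

import numpy as np
import torch

class pytorch_model:
    # hypothetical reconstruction: the real class in the project may differ
    @staticmethod
    def wrap(data, dtype=torch.float32, cuda=False):
        # convert numpy arrays / Python scalars / lists to a torch tensor
        tensor = data if isinstance(data, torch.Tensor) else torch.tensor(data, dtype=dtype)
        return tensor.cuda() if cuda else tensor

    @staticmethod
    def unwrap(data):
        # convert a torch tensor back to a numpy array (or pass values through)
        if isinstance(data, torch.Tensor):
            return data.detach().cpu().numpy()
        return data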
Example #4
 def determineChanged(self, states, actions, resps):
     change_indexes, ats, states = self.state_class.determine_delta_target(
         pytorch_model.unwrap(states))
     change = len(change_indexes) > 0
     if change:
         return change, states[0]
     return change, None
Example #5
def remove_mean_batch(imgs, focus, nb_size=(5, 5)):
    in_np = isinstance(imgs, np.ndarray)
    if not in_np:
        imgs = pytorch_model.unwrap(imgs)
    focus = (focus * imgs.shape[2:]).astype(int)
    focus_mean = image_focus_mean(imgs, focus, nb_size)
    imgs = image_focus_subtract(imgs, focus, focus_mean, nb_size)
    return imgs if in_np else torch.from_numpy(imgs).float()
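`image_focus_mean` and `image_focus_subtract` are called here (and in Examples #8 and #25) but never shown. A rough numpy sketch of the neighborhood-mean behavior they are assumed to implement; the names come from the calls above, while the boundary handling is a guess (focus neighborhoods are assumed to stay inside the image):

import numpy as np

def image_focus_mean(imgs, focus, nb_size):
    # assumed behavior: average the nb_size patch around each image's focus point
    h, w = nb_size
    patches = [img[..., r:r + h, c:c + w] for img, (r, c) in zip(imgs, focus)]
    return np.mean(patches, axis=0)

def image_focus_subtract(imgs, focus, focus_mean, nb_size):
    # assumed behavior: subtract the shared mean patch at each focus location
    h, w = nb_size
    imgs = imgs.copy()
    for img, (r, c) in zip(imgs, focus):
        img[..., r:r + h, c:c + w] -= focus_mean
    return imgs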
Example #6
 def forward(self, x, reward):
     '''
     TODO: make use of time_estimator, link up Q vals and action probs
     TODO: clean up cuda = True to something that is actually true
     '''
     x = pytorch_model.unwrap(x)
     # print(reward, x)
     return self.dope_dqn.step(reward[0], x)
Example #7
 def hash(self, state):
     '''
     assuming state of the form [changepoint state dim]
     '''
     state = self.normalize(state)
     basis = []
     for order_vector, val in zip(self.order_vectors, state):
         basis.append(int(pytorch_model.unwrap(torch.exp(-(val - order_vector).pow(2)).argmax()))) # could use any monotonically decreasing function
     return tuple(basis)
Example #8
 def remove_mean_memory(self, imgs, focus):
     in_np = isinstance(imgs, np.ndarray)
     if not in_np:
         imgs = pytorch_model.unwrap(imgs)
     focus = (focus * imgs.shape[2:]).astype(int)
     focus_mean = image_focus_mean(imgs, focus, self.nb_size)
     self.mean = (self.n_mean * self.mean + focus.shape[0] * focus_mean) \
                 / (self.n_mean + focus.shape[0])
     self.n_mean = self.n_mean + focus.shape[0]
     imgs = image_focus_subtract(imgs, focus, self.mean, self.nb_size)
     return imgs if in_np else torch.from_numpy(imgs).float()
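The `self.mean` update above is an incremental weighted mean: a mean over `n_mean` old focus patches combined with a mean over the new batch gives the mean over all patches seen so far. A tiny standalone check of the arithmetic with made-up numbers:

# mean over 10 old samples (2.0) combined with a mean over 5 new samples (5.0)
# equals the mean over all 15 samples
n_mean, mean = 10, 2.0
batch, batch_mean = 5, 5.0
new_mean = (n_mean * mean + batch * batch_mean) / (n_mean + batch)
assert new_mean == (10 * 2.0 + 5 * 5.0) / 15 == 3.0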
Example #9
 def compute_reward(self, states, actions, resps, precomputed=None):
     self.reward_base = -0.01 # TODO: patchwork line
     trajectory = pytorch_model.unwrap(states[:-1,:self.traj_dim])
     saliency_trajectory = pytorch_model.unwrap(states[:-1,self.traj_dim:])
     # print("states shape", trajectory.shape, saliency_trajectory.shape)
     if precomputed is not None:
         assignments, cps = precomputed
     else:
         assignments, cps = self.model.get_mode(trajectory, saliency_trajectory)
     # print(assignments, cps, self.desired_mode)
     rewards = []
     # print(assignments, cps)
     rewarded = False
     for cp, asmt in zip(cps, assignments):
         # print(cp, asmt, asmt == self.desired_mode)
         if asmt == self.desired_mode:
         #### DANGEROUS LINE ####
         # if asmt == self.desired_mode and not rewarded:
             rewards.append(1)
             # rewarded = True
         else:
             rewards.append(self.reward_base)
     rewards = [self.reward_base] + rewards # match the number of changepoints, first value ignored
     full_rewards = []
     lcp = 0
     lr = 0
     cps.append(len(trajectory))
     # print(len(cps), len(rewards), len(assignments), cps, rewards) 
     for cp, r in zip(cps, rewards):
         if self.seg_reward: # reward copied over all time steps
             full_rewards += [r] * (cp - lcp)
         else:
             if r == 1 and cp == 0:
                 r = self.reward_base
             full_rewards +=  [self.reward_base] * (cp-lcp-1) + [r]
         lcp = cp
         lr = r
     # print(full_rewards, trajectory)
     # print(np.concatenate((np.array(full_rewards).reshape(len(full_rewards),1), trajectory), axis=1))
     # print(rewards, cps, full_rewards)
     return pytorch_model.wrap(np.array(full_rewards), cuda=self.cuda)
Example #10
 def compute_reward(self, states, actions, resps):
     rewards = torch.zeros(len(states))
     change_indexes, ats, st = self.state_class.determine_delta_target(
         pytorch_model.unwrap(states))
     if len(change_indexes) > 0:
         dists = np.linalg.norm(self.parameters - st[0])
         rewards[change_indexes[0]] = (self.max_dist -
                                       dists) / self.max_dist
     rewards[states[:, -2] == 79] = -1.0
     if self.cuda:
         rewards = rewards.cuda()
     return rewards
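The reward above is a normalized distance to the target parameter: it approaches 1 as the changed state lands on `self.parameters` and falls toward 0 as the distance nears `max_dist`. A one-line check with made-up numbers:

import numpy as np

parameters, state, max_dist = np.array([2.0, 3.0]), np.array([2.0, 0.0]), 5.0
reward = (max_dist - np.linalg.norm(parameters - state)) / max_dist
print(reward)  # 0.4  (distance 3 out of a maximum of 5)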
Example #11
 def fc_rainbow_network(num_actions, num_atoms, support, network_type,
                        state):
     """Build the deep network used to compute the agent's Q-value distributions.
 Args:
 num_actions: int, number of actions.
 num_atoms: int, the number of buckets of the value function distribution.
 support: tf.linspace, the support of the Q-value distribution.
 network_type: `namedtuple`, collection of expected values to return.
 state: `tf.Tensor`, contains the agent's current state.
 Returns:
 net: _network_type object containing the tensors output by the network.
 """
     print(minmax)
     net = gym_lib._basic_discrete_domain_network(
         pytorch_model.unwrap(minmax[0]),
         pytorch_model.unwrap(minmax[1]),
         num_actions,
         state,
         num_atoms=num_atoms)
     logits = tf.reshape(net, [-1, num_actions, num_atoms])
     probabilities = tf.contrib.layers.softmax(logits)
     q_values = tf.reduce_sum(support * probabilities, axis=2)
     return network_type(q_values, logits, probabilities)
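The last three lines are the standard C51/Rainbow readout: reshape to per-action logits over `num_atoms` buckets, softmax over the atoms, then take the expectation over the support to get scalar Q-values. A small numpy sketch of just that readout with made-up shapes:

import numpy as np

def rainbow_readout(net_out, num_actions, num_atoms, support):
    # net_out: (batch, num_actions * num_atoms) -> q_values: (batch, num_actions)
    logits = net_out.reshape(-1, num_actions, num_atoms)
    exp = np.exp(logits - logits.max(axis=2, keepdims=True))   # softmax over atoms
    probabilities = exp / exp.sum(axis=2, keepdims=True)
    q_values = (support * probabilities).sum(axis=2)            # expectation over support
    return q_values, logits, probabilities

support = np.linspace(-10.0, 10.0, 51)          # 51 atoms, as in C51
q, logits, probs = rainbow_readout(np.random.randn(4, 3 * 51), 3, 51, support)
print(q.shape)  # (4, 3)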
Example #12
def generate_soft_dataset(states, resps, true_environment, reward_fns, args):
    pre_load_weights = args.load_weights
    args.load_weights = True
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    print(args.load_weights)
    environments = option_chain.initialize(args)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    train_models = proxy_environment.models
    head, tail = get_edge(args.train_edge)
    if len(
            environments
    ) > 1:  # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head,
                           state_forms=list(
                               zip(args.state_names, args.state_forms)))
    proxy_environment.initialize(args,
                                 proxy_chain,
                                 reward_fns,
                                 state_class,
                                 behavior_policy=None)

    train_models.initialize(args, len(reward_fns), state_class, num_actions)
    train_models.session(args)
    proxy_environment.duplicate(args)  # assumes that we are loading weights
    args.load_weights = pre_load_weights

    soft_actions = [[] for i in range(train_models.num_options)]
    for oidx in range(train_models.num_options):
        train_models.option_index = oidx
        if args.model_form == 'population':
            train_models.currentModel().use_mean = True
        for i in range(len(states) // 30 + 1):
            state = states[i * 30:(i + 1) * 30]
            resp = resps[i * 30:(i + 1) * 30]
            values, dist_entropy, action_probs, Q_vals = train_models.determine_action(
                pytorch_model.wrap(state, cuda=args.cuda),
                pytorch_model.wrap(resp, cuda=args.cuda))
            # print (action_probs)
            values, action_probs, Q_vals = train_models.get_action(
                values, action_probs, Q_vals)
            soft_actions[oidx] += pytorch_model.unwrap(action_probs).tolist()
    print("soft actions", np.sum(np.array(soft_actions[0]), axis=0))
    for i in range(len(soft_actions)):
        soft_actions[i] = smooth_weight(soft_actions[i], args.weighting_lambda)
    return np.array(soft_actions)
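The inner loop above evaluates the policy in fixed chunks of 30 states so the full dataset never has to be wrapped onto the GPU at once. The same chunked-inference pattern in isolation, with a hypothetical batch size and a stand-in for the model call:

import numpy as np

def batched_apply(fn, data, batch_size=30):
    # apply fn to data in fixed-size chunks and concatenate the results
    outputs = []
    for i in range(len(data) // batch_size + 1):
        chunk = data[i * batch_size:(i + 1) * batch_size]
        if len(chunk) == 0:                 # the last slice can be empty
            continue
        outputs.append(fn(chunk))
    return np.concatenate(outputs, axis=0)

probs = batched_apply(lambda x: x * 0.5, np.ones((95, 4)))
print(probs.shape)  # (95, 4)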
Example #13
    def step(self, action):
        # TODO: action is tensor, might not be safe assumption
        # t = time.time()
        uaction = pytorch_model.unwrap(action.long())
        raw_state, reward, done, info = self.screen.step([uaction])
        # a = time.time()
        # print("screen step", a - t)
        raw_state = np.squeeze(raw_state)
        # raw_state[:10,:] = 0.0
        self.current_raw = raw_state
        raw_factor_state = {'Action': [[0.0, 0.0], (float(uaction), )]}
        self.current_action = action
        self.reward = reward[0]
        self.factor_state = raw_factor_state
        self.last_action = uaction

        # logging
        if len(self.save_path) > 0:
            if self.recycle > 0:
                state_path = os.path.join(
                    self.save_path, str((self.itr % self.recycle) // 2000))
                count = self.itr % self.recycle
            else:
                state_path = os.path.join(self.save_path,
                                          str(self.itr // 2000))
                count = self.itr
            try:
                os.makedirs(state_path)
            except OSError:
                pass
            if self.itr != 0:
                object_dumps = open(
                    os.path.join(self.save_path, "focus_dumps.txt"), 'a')
            else:
                object_dumps = open(
                    os.path.join(self.save_path, "focus_dumps.txt"),
                    'w')  # create file if it does not exist
            for key in raw_factor_state.keys():
                writeable = list(raw_factor_state[key][0]) + list(
                    raw_factor_state[key][1])
                object_dumps.write(
                    key + ":" + " ".join([str(fs) for fs in writeable]) +
                    "\t")  # TODO: attributes are limited to single floats
            object_dumps.write(
                "\n")  # TODO: recycling does not stop object dumping
            object_dumps.close()

            # imio.imsave(os.path.join(state_path, "state" + str(count % 2000) + ".png"), self.current_raw)
            self.itr += 1
        # print("elapsed ", time.time() - t)
        return raw_state, self.factor_state, done
Example #14
    def step(self, action):
        action = int(pytorch_model.unwrap(action[0]))
        # print("action", action)
        if action == 0:  # noop
            pass
        elif action % 2 == 1:
            v = self.current_state[(action - 1) // 2]
            if v != 0:
                ncs = self.current_state.copy()
                ncs[(action - 1) // 2] -= 1
                self.current_state = ncs
        elif action % 2 == 0:
            v = self.current_state[(action - 1) // 2]
            if v != self.num_states - 1:
                ncs = self.current_state.copy()
                ncs[(action - 1) // 2] += 1
                self.current_state = ncs
        done = False
        if self.terminal_state:
            done = True
            for i in range(self.num_dims):
                done = done and ((self.current_state[i] == self.num_states - 1)
                                 or (self.current_state[i] == 0))
            if done:
                self.current_state = self.initial_state
        if len(self.save_path) != 0:
            state_path = os.path.join(self.save_path, str(self.itr // 2000))
            try:
                os.makedirs(state_path)
            except OSError:
                pass
            # imio.imsave(os.path.join(state_path, "state" + str(self.itr % 2000) + ".png"), self.current_state)
            # print(self.save_path, state_path)
            if self.itr != 0:
                object_dumps = open(self.save_path + "/object_dumps.txt", 'a')
            else:
                object_dumps = open(self.save_path + "/object_dumps.txt",
                                    'w')  # create file if it does not exist
            # print("writing", self.save_path + "/object_dumps.txt")
            object_dumps.write("chain:" + str(self.current_state[0]) + "\t\n")
            object_dumps.close()
        self.itr += 1
        if self.itr % 100 == 0:
            self.current_state = self.initial_state
            done = True

        # if done:
        #     self.current_state[0] = 0
        return self.current_state, {"chain": (self.current_state, 1)}, done
Example #15
def construct_tile_order(minmax, normalize, order):
    minvs, maxvs = minmax
    order_vectors = []
    for minv, maxv in zip(minvs, maxvs):
        order_vector = []
        numv = min(order, int(pytorch_model.unwrap(torch.ceil(maxv - minv) + 1))) # TODO: assumes integer differences between states, fix?
        for i in range (numv): 
            if not normalize:
                order_vector.append((minv + i * (maxv - minv) / (max(numv - 1, 1))))
            else:
                order_vector.append((i / max(numv - 1, 1)))
        order_vectors.append(pytorch_model.wrap(np.array(order_vector)).detach())
    for vec in order_vectors:
        vec.requires_grad = False   
    return order_vectors
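Together with the `hash` method in Example #7, these order vectors discretize each state dimension by snapping it to the nearest grid value (the Gaussian argmax in `hash` is equivalent to a nearest-neighbor argmin). A standalone numpy sketch of that pairing with made-up min/max values:

import numpy as np

def construct_tile_order_np(minvs, maxvs, order, normalize=False):
    # evenly spaced grid values per dimension, capped at `order` entries
    order_vectors = []
    for minv, maxv in zip(minvs, maxvs):
        numv = min(order, int(np.ceil(maxv - minv)) + 1)
        if normalize:
            order_vectors.append(np.arange(numv) / max(numv - 1, 1))
        else:
            order_vectors.append(minv + np.arange(numv) * (maxv - minv) / max(numv - 1, 1))
    return order_vectors

def tile_hash(state, order_vectors):
    # nearest grid index per dimension, mirroring Example #7's Gaussian argmax
    return tuple(int(np.argmin((val - ov) ** 2)) for ov, val in zip(order_vectors, state))

ovs = construct_tile_order_np([0.0, 0.0], [4.0, 2.0], order=5)
print(tile_hash([3.2, 0.4], ovs))  # (3, 0)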
Example #16
    def forward(self, img, prev_out=None, ret_numpy=True, ret_extra=False):
        out = img
        for layer in self.layers:
            out = layer(out).detach()
        if prev_out is not None:  # apply prior filter if specified
            pfilter = prior_filter(prev_out, out.size())
            pfilter = self.preprocess(pfilter)
            out = torch.mul(out, pfilter)
        focus_out = self.argmax_xy(out)

        focus_out = focus_out if ret_numpy \
                    else torch.from_numpy(focus_out).float()
        if ret_extra:
            return focus_out, pytorch_model.unwrap(out)
        return focus_out
Example #17
 def __init__(self, args, train_models):
     self.optimizers = []
     self.solutions = []
     self.weight_sharing = args.weight_sharing
     for i in range(len(train_models.models)):
         if args.load_weights and not args.freeze_initial: # TODO: initialize from non-population model
             xinit = pytorch_model.unwrap(train_models.models[i].mean.get_parameters())
             # TODO: parameter for sigma?
             sigma = 0.6#pytorch_model.unwrap(torch.stack([train_models.models[i].networks[j].get_parameters() for j in range(train_models.models[i].num_population)]).var(dim=1).mean())
             print(xinit, sigma)
         else:
             xinit = (np.random.rand(train_models.currentModel().networks[0].count_parameters())-0.5)*2 # initializes [-1,1]
             sigma = 1.0
         cmaes_params = {"popsize": args.num_population} # might be different than the population in the model...
         cmaes = cma.CMAEvolutionStrategy(xinit, sigma, cmaes_params)
         self.optimizers.append(cmaes)
         self.solutions.append(cmaes.ask())
      for i in range(len(train_models.models)):
         self.assign_solutions(train_models, i)
Example #18
 def argmax_xy(self, out):
     out = pytorch_model.unwrap(out)
     batch_size = out.shape[0]
     row_size = out.shape[2]
     col_size = out.shape[3]
     
     if self.argmax_mode == 'first':
         # first argmax
         argmax = np.argmax(out.reshape((batch_size, -1)), axis=1)
     elif self.argmax_mode == 'rand':
         # random argmax for tie-breaking
         out = out.reshape((batch_size, -1))
         out_max = np.max(out, axis=1)
         argmax = np.array([np.random.choice(np.flatnonzero(line == line_max)) 
                            for line, line_max in zip(out, out_max)])
     else:
         raise ValueError('argmax_mode %s invalid'%(self.argmax_mode))
     
     argmax %= row_size * col_size  # in case of multiple filters
     argmax_coor = np.array([np.unravel_index(argmax_i, (row_size, col_size)) 
                             for argmax_i in argmax], dtype=float)
     argmax_coor = argmax_coor / np.array([row_size, col_size])
     return argmax_coor
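A quick standalone run of the `'rand'` branch above: the tie-break samples uniformly among all positions that attain each image's maximum, and the flat index is then folded back into a normalized (row, col) coordinate (inputs are made up):

import numpy as np

out = np.zeros((2, 1, 4, 4))             # (batch, filters, rows, cols)
out[0, 0, 1, 2] = out[0, 0, 3, 3] = 1.0  # two tied maxima in image 0
out[1, 0, 0, 1] = 1.0                    # unique maximum in image 1

flat = out.reshape(2, -1)
flat_max = flat.max(axis=1)
argmax = np.array([np.random.choice(np.flatnonzero(row == m))
                   for row, m in zip(flat, flat_max)])
argmax %= 4 * 4                          # fold multiple filters back onto one map
coords = np.array([np.unravel_index(i, (4, 4)) for i in argmax], dtype=float) / [4, 4]
print(coords)  # e.g. [[0.25 0.5 ] [0.   0.25]] (first row may also be [0.75 0.75])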
Example #19
def train_dopamine(args, save_path, true_environment, train_models,
                   proxy_environment, proxy_chain, reward_classes, state_class,
                   num_actions, behavior_policy):
    print("#######")
    print("Training Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle)
    snum = args.num_stack
    args.num_stack = 1
    proxy_environment.initialize(args, proxy_chain, reward_classes,
                                 state_class, behavior_policy)
    args.num_stack = snum
    if args.save_models:
        save_to_pickle(os.path.join(save_path, "env.pkl"), proxy_environment)
    behavior_policy.initialize(args, num_actions)
    train_models.initialize(args, len(reward_classes), state_class,
                            proxy_environment.action_size)
    proxy_environment.set_models(train_models)
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    raw_state = base_env.getState()
    cp_state = proxy_environment.changepoint_state([raw_state])
    # print("initial_state (s, hs, rs, cps)", state, hist_state, raw_state, cp_state)
    # print(cp_state.shape, state.shape, hist_state.shape, state_class.shape)
    # rollouts = RolloutOptionStorage(args.num_processes, (state_class.shape,), proxy_environment.action_size, cr.flatten().shape[0],
    #     state.shape, hist_state.shape, args.buffer_steps, args.changepoint_queue_len, args.trace_len,
    #     args.trace_queue_len, args.dilated_stack, args.target_stack, args.dilated_queue_len, train_models.currentOptionParam().shape[1:], len(train_models.models), cp_state[0].shape,
    #     args.lag_num, args.cuda)
    option_actions = {
        option.name: collections.Counter()
        for option in train_models.models
    }
    total_duration = 0
    total_elapsed = 0
    true_reward = 0
    ep_reward = 0
    start = time.time()
    fcnt = 0
    final_rewards = list()
    option_counter = collections.Counter()
    option_value = collections.Counter()
    print(hist_state)
    val = None
    train_models.currentModel().begin_episode(pytorch_model.unwrap(hist_state))
    for j in range(args.num_iters):
        raw_actions = []
        last_total_steps, total_steps = 0, 0
        for step in range(args.num_steps):
            # start = time.time()
            fcnt += 1
            total_steps += 1
            current_state, current_resp = proxy_environment.getHistState()
            estate = proxy_environment.getState()
            if args.true_environment:
                reward = pytorch_model.wrap([[base_env.reward]])
            else:
                reward = proxy_environment.computeReward(1)
            true_reward += base_env.reward
            ep_reward += base_env.reward
            # print(current_state, reward[train_models.option_index])
            action = train_models.currentModel().forward(
                current_state,
                pytorch_model.unwrap(reward[train_models.option_index]))
            # print("ap", action)
            action = pytorch_model.wrap([action])
            cp_state = proxy_environment.changepoint_state([raw_state])
            # print(state, action)
            # print("step states (cs, s, cps, act)", current_state, estate, cp_state, action)
            # print("step outputs (val, de, ap, qv, v, ap, qv)", values, dist_entropy, action_probs, Q_vals, v, ap, qv)

            state, raw_state, resp, done, action_list = proxy_environment.step(
                action, model=False
            )  #, render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt)
            # print("step check (al, s)", action_list, state)
            # learning_algorithm.interUpdateModel(step)
            #### logging
            option_actions[train_models.currentName()][int(
                pytorch_model.unwrap(action.squeeze()))] += 1
            #### logging
            # print(train_models.currentModel().dope_rainbow)

            if done:
                # print("reached end")
                print("Episode Reward: ", ep_reward)
                ep_reward = 0
                train_models.currentModel().end_episode(
                    pytorch_model.unwrap(reward[train_models.option_index]))
                state, resp = proxy_environment.getHistState()
                train_models.currentModel().begin_episode(
                    pytorch_model.unwrap(state))
                # print(step)
                break
        # var = [v for v in tf.trainable_variables() if v.name == "Online/fully_connected/weights:0"][0]
        # nval = train_models.currentModel().sess.run(var)
        # if val is not None:
        #     print(var, np.sum(abs(nval - val)), train_models.currentModel().dope_rainbow.eval_mode)
        # val = nval
        current_state = proxy_environment.getHistState()
        # print(state, action)
        # print("step states (cs, s, cps, act)", current_state, estate, cp_state, action)
        # print("step outputs (val, de, ap, qv, v, ap, qv)", values, dist_entropy, action_probs, Q_vals, v, ap, qv)

        cp_state = proxy_environment.changepoint_state([raw_state])
        # print("states and actions (es, cs, a, m)", rollouts.extracted_state, rollouts.current_state, rollouts.actions, rollouts.masks)
        # print("actions and Qvals (qv, vp, ap)", rollouts.Qvals, rollouts.value_preds, rollouts.action_probs)

        total_duration += step + 1
        # print("rewards", rewards)
        # rollouts.insert_rewards(rewards)
        # print(rollouts.extracted_state)
        # print(rewards)
        # rollouts.compute_returns(args, values)
        # print("returns and rewards (rew, ret)", rollouts.rewards, rollouts.returns)
        # print("returns and return queue", rollouts.returns, rollouts.return_queue)
        # print("reward check (cs, rw, rol rw, rt", rollouts.current_state, rewards, rollouts.rewards, rollouts.returns)
        name = train_models.currentName()
        # print(name, rollouts.extracted_state, rollouts.rewards, rollouts.actions)

        #### logging
        option_counter[name] += step + 1
        option_value[name] += true_reward
        #### logging
        if j % args.save_interval == 0 and args.save_models and args.train:  # no point in saving if not training
            print("=========SAVING MODELS==========")
            train_models.save(save_path)  # TODO: implement save_options

        #### logging
        if j % args.log_interval == 0:
            # print("Qvalue and state", pytorch_model.unwrap(Q_vals.squeeze()), pytorch_model.unwrap(current_state.squeeze()))
            # print("probs and state", pytorch_model.unwrap(action_probs.squeeze()), pytorch_model.unwrap(current_state.squeeze()))
            for name in train_models.names():
                if option_counter[name] > 0:
                    print(name, option_value[name] / option_counter[name], [
                        option_actions[name][i] / option_counter[name]
                        for i in range(len(option_actions[name]))
                    ])
                if j % (args.log_interval * 20) == 0:
                    option_value[name] = 0
                    option_counter[name] = 0
                    for i in range(len(option_actions[name])):
                        option_actions[name][i] = 0
            end = time.time()
            total_elapsed += total_duration
            log_stats = "Updates {}, num timesteps {}, FPS {}, reward {}".format(
                j, total_elapsed, int(total_elapsed / (end - start)),
                true_reward / (args.num_steps * args.log_interval))
            print(log_stats)
            true_reward = 0.0
            total_duration = 0
Example #20
def unwrap_or_none(val):
    if val is not None:
        return pytorch_model.unwrap(val)
    else:
        return -1.0
Example #21
def testRL(args,
           save_path,
           true_environment,
           proxy_chain,
           proxy_environment,
           state_class,
           behavior_policy,
           num_actions,
           reward_classes=None):
    print("#######")
    print("Evaluating Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle)
    if reward_classes is not None:
        proxy_environment.reward_fns = reward_classes
    args.changepoint_queue_len = max(args.changepoint_queue_len,
                                     args.num_iters)
    proxy_environment.initialize(args, proxy_chain,
                                 proxy_environment.reward_fns,
                                 proxy_environment.stateExtractor,
                                 behavior_policy)
    print(base_env.save_path)
    behavior_policy.initialize(args, num_actions)
    train_models = proxy_environment.models
    train_models.initialize(args, len(reward_classes), state_class,
                            num_actions)
    proxy_environment.duplicate(args)
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    resp = proxy_environment.getResp()
    print(state.shape)
    raw_state = base_env.getState()
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    cp_state = proxy_environment.changepoint_state([raw_state])
    rollouts = RolloutOptionStorage(
        args.num_processes, (state_class.shape, ),
        proxy_environment.action_size,
        cr.flatten().shape[0], state.shape, hist_state.shape,
        args.buffer_steps, args.changepoint_queue_len, args.trace_len,
        args.trace_queue_len, args.dilated_stack, args.target_stack,
        args.dilated_queue_len,
        train_models.currentOptionParam().shape[1:], len(train_models.models),
        cp_state[0].shape, args.lag_num, args.cuda)
    option_actions = {
        option.name: collections.Counter()
        for option in train_models.models
    }
    total_duration = 0
    start = time.time()
    fcnt = 0
    final_rewards = list()
    option_counter = collections.Counter()
    option_value = collections.Counter()
    raw_states = dict()
    ep_reward = 0
    rollouts.set_parameters(args.num_iters * train_models.num_options)
    # if args.num_iters > rollouts.changepoint_queue_len:
    #     rollouts.set_changepoint_queue(args.num_iters)
    done = False
    for i in range(train_models.num_options):
        train_models.option_index = i
        train_models.currentModel().test = True
        raw_states[train_models.currentName()] = []
        for j in range(args.num_iters):
            fcnt += 1
            raw_actions = []
            rollouts.cuda()
            current_state, current_resp = proxy_environment.getHistState()
            values, dist_entropy, action_probs, Q_vals = train_models.determine_action(
                current_state.unsqueeze(0), current_resp.unsqueeze(0))
            v, ap, qv = train_models.get_action(values, action_probs, Q_vals)
            cp_state = proxy_environment.changepoint_state([raw_state])
            ep_reward += base_env.reward
            # print(ap, qv)
            action = behavior_policy.take_action(ap, qv)
            rollouts.insert(
                False, state, current_state,
                pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda), done,
                current_resp, action, cp_state[0],
                train_models.currentOptionParam(), train_models.option_index,
                None, None, action_probs, Q_vals, values)
            state, raw_state, resp, done, action_list = proxy_environment.step(
                action, model=False
            )  #, render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt)
            raw_states[train_models.currentName()].append(raw_state)
            option_actions[train_models.currentName()][int(
                pytorch_model.unwrap(action.squeeze()))] += 1
            if done:
                print("Episode Reward: ", ep_reward, " ", fcnt)
                ep_reward = 0
                # print("reached end")
            # proxy_environment.determine_swaps(length, needs_rewards=True) # doesn't need to generate rewards

        print(args.num_iters)
        print(action_probs)
        print("Episode Reward: ", ep_reward, " ", fcnt)
        rewards = proxy_environment.computeReward(args.num_iters)
        # print(rewards.shape)
        print(rewards.sum())
        rollouts.insert_rewards(rewards, total_duration)
        total_duration += j
        rollouts.compute_returns(args, values)
        rollouts.cpu()
        save_rols = copy.copy(rollouts)
        save_to_pickle(os.path.join(args.save_dir, "rollouts.pkl"), save_rols)

        reward_total = rollouts.rewards.sum(
            dim=1)[train_models.option_index] / args.num_iters
        print("Rewards for Policy:", reward_total)
Example #22
 def save_actions(self, action_list):
     if len(self.save_path) > 0:
         for i, action in enumerate(action_list):
             self.save_files[i].write(
                 str(int(pytorch_model.unwrap(action.squeeze()))) + '\n')
Example #23
def trainRL(args, save_path, true_environment, train_models,
            learning_algorithm, proxy_environment, proxy_chain, reward_classes,
            state_class, behavior_policy):
    print("#######")
    print("Training Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle,
                      args.single_save_dir)
    proxy_environment.initialize(args, proxy_chain, reward_classes,
                                 state_class, behavior_policy)
    if args.save_models:
        if args.env.find("Atari") != -1:
            screen = base_env.screen
            base_env.screen = None
        save_to_pickle(os.path.join(save_path, "env.pkl"), proxy_environment)
        if args.env.find("Atari") != -1:
            base_env.screen = screen
    behavior_policy.initialize(args, proxy_environment.action_size)
    print(reward_classes[0], reward_classes[0].parameter_minmax)
    if not args.load_weights:
        train_models.initialize(
            args,
            len(reward_classes),
            state_class,
            proxy_environment.action_size,
            parameter_minmax=reward_classes[0].parameter_minmax)
        proxy_environment.set_models(train_models)
    else:
        print("loading weights", len(reward_classes))
        train_models.initialize(
            args,
            len(reward_classes),
            state_class,
            proxy_environment.action_size,
            parameter_minmax=reward_classes[0].parameter_minmax)
        train_models.session(args)
        proxy_environment.duplicate(args)
    train_models.train()
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    learning_algorithm.initialize(args,
                                  train_models,
                                  reward_classes=reward_classes)
    print(proxy_environment.get_names())
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    raw_state = base_env.getState()
    resp = proxy_environment.getResp()
    cp_state = proxy_environment.changepoint_state([raw_state])
    # print("initial_state (s, hs, rs, cps)", state, hist_state, raw_state, cp_state)
    # print(cp_state.shape, state.shape, hist_state.shape, state_class.shape)
    print(args.trace_len, args.trace_queue_len)
    args.buffer_clip = max(args.buffer_clip, args.reward_check)
    rollouts = RolloutOptionStorage(
        args.num_processes, (state_class.shape, ),
        proxy_environment.action_size,
        cr.flatten().shape[0],
        state.shape,
        hist_state.shape,
        args.buffer_steps,
        args.changepoint_queue_len,
        args.trace_len,
        args.trace_queue_len,
        args.dilated_stack,
        args.target_stack,
        args.dilated_queue_len,
        train_models.currentOptionParam().shape[1:],
        len(train_models.models),
        cp_state[0].shape,
        args.lag_num,
        args.cuda,
        return_form=args.return_form)
    option_actions = {
        option.name: collections.Counter()
        for option in train_models.models
    }
    total_duration = 0
    total_elapsed = 0
    true_reward = 0
    ep_reward = 0
    sample_schedule = args.sample_schedule
    start = time.time()
    fcnt = 0
    final_rewards = list()
    average_rewards, average_counts = [], []
    option_counter = collections.Counter()
    option_value = collections.Counter()
    trace_queue = [
    ]  # keep the last states until end of trajectory (or until a reset), and dump when a reward is found
    retest = False
    done = False
    for j in range(args.num_iters):
        rollouts.set_parameters(learning_algorithm.current_duration *
                                args.reward_check)
        # print("set_parameters", state)
        raw_actions = []
        rollouts.cuda()
        last_total_steps, total_steps = 0, 0
        s = time.time()
        for step in range(learning_algorithm.current_duration):

            for m in range(args.reward_check):
                fcnt += 1
                total_steps += 1

                current_state, current_resp = proxy_environment.getHistState()
                estate = proxy_environment.getState()
                values, log_probs, action_probs, Q_vals = train_models.determine_action(
                    current_state.unsqueeze(0),
                    current_resp.unsqueeze(0),
                    use_grad=False)
                v, ap, lp, qv = train_models.get_action(
                    values, action_probs, log_probs, Q_vals)

                # a = time.time()
                # print("choose action", a-s)
                # print(action_probs, Q_vals, ap, lp, qv)
                action = behavior_policy.take_action(ap, qv)
                cp_state = proxy_environment.changepoint_state([raw_state])
                # print(state, action)
                # print("before_insert", state)
                # print(current_state.reshape((4,84,84))[0].cpu().numpy().shape)
                # cv2.imshow('frame',current_state.reshape((4,84,84))[0].cpu().numpy())
                # if cv2.waitKey(1) & 0xFF == ord('q'):
                #     pass

                # print(action, true_environment.paddle.pos, true_environment.ball.vel, true_environment.ball.pos)

                if args.behavior_policy == "dem" or args.visualize:
                    cv2.imshow('frame', raw_state[0].reshape((84, 84)))
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        pass
                # cv2.imshow('frame',raw_state[0].reshape((84,84)))
                # if cv2.waitKey(1) & 0xFF == ord('q'):
                #     pass
                rollouts.insert(
                    retest, state, current_state,
                    pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda),
                    done, current_resp, action, cp_state[0],
                    train_models.currentOptionParam(),
                    train_models.option_index, None, None, action_probs,
                    Q_vals, values)
                rollouts.insert_dilation(proxy_environment.swap)

                retest = False
                # print("step states (cs, ns, cps, act)", current_state, estate, cp_state, action)
                # print("step outputs (val, de, ap, qv, v, ap, qv)", values, dist_entropy, action_probs, Q_vals, v, ap, qv)
                trace_queue.append(
                    (current_state.clone().detach(), action.clone().detach()))
                state, raw_state, resp, done, action_list = proxy_environment.step(
                    action, model=False
                )  #, render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt)
                # print(action_list)
                # s = time.time()
                # print("step time", s-a)
                # print("after step", state)
                true_reward += base_env.reward
                ep_reward += base_env.reward
                if args.reward_form == 'raw':
                    for rc in reward_classes:
                        rc.insert_reward(base_env.reward)
                # print(base_env.reward)
                # print(action_list, action)
                # print("step check (al, s)", action_list, state)
                #### logging
                option_actions[train_models.currentName()][int(
                    pytorch_model.unwrap(action.squeeze()))] += 1
                #### logging
                if done:
                    print("Episode Reward: ", ep_reward, " ", fcnt, j)
                    ep_reward = 0
                    if not args.sample_duration > 0 or (args.done_swapping <=
                                                        j):
                        # print("reached end")
                        # print(step)
                        if args.trace_queue_len > -1:
                            trace_queue = rollouts.insert_trace(trace_queue)

                        trace_queue = []
                        break
                    else:  # need to clear out trace queue
                        trace_queue = rollouts.insert_trace(trace_queue)
                        trace_queue = []
                # time.sleep(.1)
            # print(m, args.reward_check)
            # rl = time.time()
            # print("run loop", start - rl)
            rewards = proxy_environment.computeReward(m + 1)
            # print(rewards, proxy_environment.changepoint_queue)
            # print(rewards.sum())
            # a = time.time()
            # print("reward time", a-s)
            change, target = proxy_environment.determineChanged(m + 1)
            proxy_environment.determine_swaps(
                m + 1, needs_rewards=True)  # doesn't need to generate rewards
            # print("reward time", time.time() - start)
            # print("rewards", torch.sum(rewards))

            # reenter to get next value
            current_state, current_resp = proxy_environment.getHistState()
            values, log_probs, action_probs, Q_vals = train_models.determine_action(
                current_state.unsqueeze(0), current_resp.unsqueeze(0))
            v, ap, lp, qv = train_models.get_action(values, action_probs,
                                                    log_probs, Q_vals)
            action = behavior_policy.take_action(ap, qv)
            trace_queue.append(
                (current_state.clone().detach(), action.clone().detach()))
            cp_state = proxy_environment.changepoint_state([raw_state])
            rollouts.insert(
                retest, state, current_state,
                pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda), done,
                current_resp, action, cp_state[0],
                train_models.currentOptionParam(), train_models.option_index,
                None, None, action_probs, Q_vals,
                values)  # inserting the last state and unused action
            retest = True  # need to re-insert value with true state
            # ########
            rollouts.insert_hindsight_target(change, target)
            rollouts.insert_rewards(args, rewards)
            name = train_models.currentName()
            option_counter[name] += m + 1
            option_value[name] += rewards.sum(dim=1)[train_models.option_index]

            last_total_steps = total_steps
            completed = learning_algorithm.interUpdateModel(
                total_steps, rewards, change, done)
            # rw = time.time()
            # print("rewards", rl - rw, start - rw)

            if completed or (done and not args.sample_duration > 0):
                break

        retest = args.buffer_steps > 0 or args.lag_num > 0  # if we roll, don't retest
        # print("steptime", time.time() - start)
        # start = time.time()
        # print(done)
        # print(rollouts.base_rollouts.extracted_state, rollouts.base_rollouts.rewards)
        # print("rew, state", rollouts.rewards[0,-50:], rollouts.extracted_state[-50:])
        # print("inserttime", time.time() - start)
        # print("states and actions (es, cs, a, m)", rollouts.extracted_state, rollouts.current_state, rollouts.actions, rollouts.masks)
        # print("actions and Qvals (qv, vp, ap)", rollouts.Qvals, rollouts.value_preds, rollouts.action_probs)
        # start = time.time()
        total_duration += total_steps
        # if done:
        #     trace_queue = rollouts.insert_trace(trace_queue)
        #     trace_queue = [] # insert first
        # else:
        #     trace_queue = rollouts.insert_trace(trace_queue)
        # print(rollouts.extracted_state)
        # print(rewards)
        # rollouts.compute_returns(args, values) # don't need to compute returns because they are computed upon reward reception
        # print("returns and rewards (rew, ret)", rollouts.rewards, rollouts.returns)
        # print("returns and return queue", rollouts.returns, rollouts.return_queue)
        # print("reward check (cs, rw, rol rw, rt", rollouts.current_state, rewards, rollouts.rewards, rollouts.returns)

        # print(name, rollouts.extracted_state, rollouts.rewards, rollouts.actions)
        # n = 0
        # for obj in gc.get_objects():
        #     try:
        #         if torch.is_tensor(obj):
        #             n+=1
        #     except:
        #         pass
        # print("learning at", j, n)

        #### logging
        # print(rollouts.base_rollouts.rewards.shape)
        reward_total = rollouts.get_current(
            names=['rewards'])[0][train_models.option_index].sum(dim=0)
        # print("reward_total", reward_total.shape)
        final_rewards.append(reward_total)
        #### logging
        # start = time.time()
        learning_algorithm.step_counter += 1
        if j >= args.warm_up:  # TODO: clean up this to learning algorithm?
            value_loss, action_loss, dist_entropy, output_entropy, entropy_loss, action_log_probs = learning_algorithm.step(
                args, train_models, rollouts)
            if args.dist_interval != -1 and j % args.dist_interval == 0:
                learning_algorithm.distibutional_sparcity_step(
                    args, train_models, rollouts)
                # print("di", time.time() - start)
            if args.correlate_steps > 0 and j % args.diversity_interval == 0:
                loss = learning_algorithm.correlate_diversity_step(
                    args, train_models, rollouts)
                # print("corr", time.time() - start)
            if args.greedy_epsilon_decay > 0 and j % args.greedy_epsilon_decay == 0 and j != 0:
                behavior_policy.epsilon = max(
                    args.min_greedy_epsilon, behavior_policy.epsilon *
                    0.9)  # TODO: more advanced greedy epsilon methods
                # print("eps", time.time() - start)
            if args.sample_schedule > 0 and j % sample_schedule == 0 and j != 0:
                learning_algorithm.sample_duration = (
                    j // args.sample_schedule + 1) * args.sample_duration
                learning_algorithm.reset_current_duration(
                    learning_algorithm.sample_duration, args.reward_check)
                args.changepoint_queue_len = max(
                    learning_algorithm.max_duration,
                    args.changepoint_queue_len)
                sample_schedule = args.sample_schedule * (
                    j // args.sample_schedule + 1
                )  # sum([args.sample_schedule * (i+1) for i in range(j // args.sample_schedule + 1)])
            if args.retest_schedule > 0 and j % args.retest_schedule == 0 and j != 0:
                learning_algorithm.retest += 1
                learning_algorithm.reset_current_duration(
                    learning_algorithm.sample_duration, args.reward_check)
                args.changepoint_queue_len = max(
                    learning_algorithm.max_duration,
                    args.changepoint_queue_len)
                # print("resample", time.time() - start)
            if j > args.done_swapping:
                learning_algorithm.reset_current_duration(
                    learning_algorithm.sample_duration, args.reward_check)
        else:
            value_loss, action_loss, dist_entropy, output_entropy, entropy_loss, action_log_probs = None, None, None, None, None, None
        parameter = proxy_environment.get_next_parameter()
        if args.reward_swapping:
            parameter = completed
        learning_algorithm.updateModel(parameter)
        # s = time.time()
        # print("learning step time", s-a)
        # n = 0
        # for obj in gc.get_objects():
        #     try:
        #         if torch.is_tensor(obj):
        #             n+=1
        #     except:
        #         pass
        # print("objects at", j, n)

        # print("update", time.time() - start)
        # print("learn time", time.time() - rw)
        if j % args.save_interval == 0 and args.save_models and args.train:  # no point in saving if not training
            print("=========SAVING MODELS==========")
            train_models.save(save_path)  # TODO: implement save_options

        #### logging
        if j % args.log_interval == 0:
            print("Qvalue and state", pytorch_model.unwrap(Q_vals.squeeze()),
                  pytorch_model.unwrap(current_state.squeeze()))
            print("probs and state",
                  pytorch_model.unwrap(action_probs.squeeze()),
                  pytorch_model.unwrap(current_state.squeeze()))
            for name in train_models.names():
                if option_counter[name] > 0:
                    print(name, option_value[name] / option_counter[name], [
                        option_actions[name][i] / option_counter[name]
                        for i in range(len(option_actions[name]))
                    ])
                # if j % (args.log_interval * 20) == 0:
                option_value[name] = 0
                option_counter[name] = 0
                for i in range(len(option_actions[name])):
                    option_actions[name][i] = 0
            end = time.time()
            final_rewards = torch.stack(final_rewards).detach()
            average_rewards.append(final_rewards.sum())
            average_counts.append(total_duration)
            acount = np.sum(average_counts)
            best_reward = true_reward
            true_reward = true_reward / total_steps
            mean_reward = true_reward
            if len(base_env.episode_rewards) > 0:
                true_reward = np.median(base_env.episode_rewards)
                mean_reward = np.mean(base_env.episode_rewards)
                best_reward = np.max(base_env.episode_rewards)

            el, vl, al = unwrap_or_none(entropy_loss), unwrap_or_none(
                value_loss), unwrap_or_none(action_loss)
            total_elapsed += total_duration
            log_stats = "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {}, value loss {}, policy loss {}, average_reward {}, true_reward median: {}, mean: {}, max: {}".format(
                j, total_elapsed, int(total_elapsed / (end - start)),
                final_rewards.mean(), np.median(final_rewards.cpu()),
                final_rewards.min(), final_rewards.max(), el, vl, al,
                torch.stack(average_rewards).sum() / acount, true_reward,
                mean_reward, best_reward)
            if acount > 300:
                average_counts.pop(0)
                average_rewards.pop(0)
            true_reward = 0.0
            print(log_stats)
            final_rewards = list()
            total_duration = 0
        #### logging
    if args.save_models and args.train:  # no point in saving if not training
        print("=========SAVING MODELS==========")
        train_models.save(save_path)  # TODO: implement save_options

    proxy_environment.close_files()
Example #24
 def forward(self, x, reward):
     x = pytorch_model.unwrap(x)
     # print(reward[0], x)
     return self.dope_rainbow.step(reward[0], x)
Example #25
def remove_mean(imgs, focus, nb_size=(4, 9)):
    in_np = isinstance(imgs, np.ndarray)
    if not in_np:
        imgs = pytorch_model.unwrap(imgs)
    imgs = np.array(imgs)
    focus = (focus * imgs.shape[2:]).astype(int)
    # the original snippet is truncated here; completion mirrors remove_mean_batch above
    focus_mean = image_focus_mean(imgs, focus, nb_size)
    imgs = image_focus_subtract(imgs, focus, focus_mean, nb_size)
    return imgs if in_np else torch.from_numpy(imgs).float()
Example #26
 def forward(self, img, ret_numpy=False, ret_extra=False):
     # out = img
     out = self.preprocess(img)
     for layer in self.layers:
         out = layer(out)
     return out if not ret_numpy else pytorch_model.unwrap(out)
Example #27
 def precompute(self, states, actions, resps):
     trajectory = pytorch_model.unwrap(states[:-1,:self.traj_dim])
     saliency_trajectory = pytorch_model.unwrap(states[:-1,self.traj_dim:])
     # print("states shape", trajectory.shape, saliency_trajectory.shape)
     assignments, cps = self.model.get_mode(trajectory, saliency_trajectory)
     return assignments, cps
Example #28
    def step(self, action):
        # TODO: action is tensor, might not be safe assumption
        # t = time.time()
        uaction = pytorch_model.unwrap(action.long())
        raw_state, reward, done, infos = self.screen.step([uaction])
        for info in infos:
            if 'episode' in info.keys():
                self.episode_rewards.append(info['episode']['r'])

        # a = time.time()
        # print("screen step", a - t)
        raw_state = np.squeeze(raw_state)
        # raw_state[:10,:] = 0.0
        self.current_raw = raw_state
        factor_state = {'Action': [[0.0, 0.0], (float(uaction), )]}
        self.current_action = action
        self.reward = reward[0]
        # cv2.imshow('frame',raw_state)
        # if cv2.waitKey(10000) & 0xFF == ord('q'):
        #     pass

        if self.focus_model is not None:
            factor_state = self.focus_model.forward(
                pytorch_model.wrap(raw_state.astype(float) / 255.0,
                                   cuda=True).unsqueeze(0).unsqueeze(0),
                ret_numpy=True)
            # t = time.time()
            # print("model step", t - a)
            for key in factor_state.keys():
                factor_state[key] *= 84
                factor_state[key] = (np.squeeze(factor_state[key]), (1.0, ))
        self.factor_state = factor_state
        self.last_action = uaction

        #
        rs = raw_state.copy()
        time_dict = factor_state
        pval = ""
        for k in time_dict.keys():
            if k != 'Action' and k != 'Reward':
                raw_state[int(time_dict[k][0][0]), :] = 255
                raw_state[:, int(time_dict[k][0][1])] = 255
            if k == 'Action' or k == 'Reward':
                pval += k + ": " + str(time_dict[k][1]) + ", "
            else:
                pval += k + ": " + str(time_dict[k][0]) + ", "
        print(pval[:-2])
        raw_state = cv2.resize(raw_state, (336, 336))
        cv2.imshow('frame', raw_state)
        if cv2.waitKey(1) & 0xFF == ord(' ') & 0xFF == ord('c'):
            pass

        # logging
        if len(self.save_path) > 0:
            if self.recycle > 0:
                state_path = os.path.join(
                    self.save_path, str((self.itr % self.recycle) // 2000))
                count = self.itr % self.recycle
            else:
                state_path = os.path.join(self.save_path,
                                          str(self.itr // 2000))
                count = self.itr
            try:
                os.makedirs(state_path)
            except OSError:
                pass
            if self.itr != 0:
                object_dumps = open(
                    os.path.join(self.save_path, "focus_dumps.txt"), 'a')
            else:
                object_dumps = open(
                    os.path.join(self.save_path, "focus_dumps.txt"),
                    'w')  # create file if it does not exist
            for key in factor_state.keys():
                writeable = list(factor_state[key][0]) + list(
                    factor_state[key][1])
                object_dumps.write(
                    key + ":" + " ".join([str(fs) for fs in writeable]) +
                    "\t")  # TODO: attributes are limited to single floats
            object_dumps.write(
                "\n")  # TODO: recycling does not stop object dumping
            object_dumps.close()

            imio.imsave(
                os.path.join(state_path, "state" + str(count % 2000) + ".png"),
                self.current_raw)
            self.itr += 1
        # print("elapsed ", time.time() - t)
        return raw_state, factor_state, done
Example #29
def testRL(args,
           save_path,
           true_environment,
           proxy_chain,
           proxy_environment,
           state_class,
           behavior_policy,
           num_actions,
           reward_classes=None):
    print("#######")
    print("Evaluating Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle)
    if reward_classes is not None:
        proxy_environment.reward_fns = reward_classes
    args.changepoint_queue_len = max(args.changepoint_queue_len,
                                     args.num_iters * args.num_update_model)
    proxy_environment.initialize(args, proxy_chain,
                                 proxy_environment.reward_fns, state_class,
                                 behavior_policy)
    print(base_env.save_path)
    behavior_policy.initialize(args, num_actions)
    train_models = proxy_environment.models
    train_models.initialize(args, len(reward_classes), state_class,
                            num_actions)
    proxy_environment.duplicate(args)
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    resp = proxy_environment.getResp()
    print(state.shape)
    raw_state = base_env.getState()
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    cp_state = proxy_environment.changepoint_state([raw_state])
    rollouts = RolloutOptionStorage(
        args.num_processes, (state_class.shape, ),
        proxy_environment.action_size,
        cr.flatten().shape[0], state.shape, hist_state.shape,
        args.buffer_steps, args.changepoint_queue_len, args.trace_len,
        args.trace_queue_len, args.dilated_stack, args.target_stack,
        args.dilated_queue_len,
        train_models.currentOptionParam().shape[1:], len(train_models.models),
        cp_state[0].shape, args.lag_num, args.cuda)
    option_actions = {
        option.name: collections.Counter()
        for option in train_models.models
    }
    total_duration = 0
    start = time.time()
    fcnt = 0
    final_rewards = list()
    option_counter = collections.Counter()
    option_value = collections.Counter()
    raw_states = dict()
    ep_reward = 0
    rollouts.set_parameters(args.num_iters * args.num_update_model)
    # if args.num_iters > rollouts.changepoint_queue_len:
    #     rollouts.set_changepoint_queue(args.num_iters)
    done = False
    ctr = 0
    raw_indexes = dict()
    for i in range(args.num_iters):
        train_models.option_index = np.random.randint(train_models.num_options)
        train_models.currentModel().test = True
        if train_models.currentName() not in raw_states:
            raw_states[train_models.currentName()] = []
            raw_indexes[train_models.currentName()] = []

        for j in range(args.num_update_model):
            raw_indexes[train_models.currentName()].append(ctr)
            ctr += 1
            fcnt += 1
            raw_actions = []
            rollouts.cuda()
            current_state, current_resp = proxy_environment.getHistState()
            values, log_probs, action_probs, Q_vals = train_models.determine_action(
                current_state.unsqueeze(0), current_resp.unsqueeze(0))
            v, ap, lp, qv = train_models.get_action(values, action_probs,
                                                    log_probs, Q_vals)
            cp_state = proxy_environment.changepoint_state([raw_state])
            ep_reward += base_env.reward
            # print(ap, qv)
            action = behavior_policy.take_action(ap, qv)
            # print(train_models.currentName(), action, qv.squeeze())
            rollouts.insert(
                False, state, current_state,
                pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda), done,
                current_resp, action, cp_state[0],
                train_models.currentOptionParam(), train_models.option_index,
                None, None, action_probs, Q_vals, values)
            state, raw_state, resp, done, action_list = proxy_environment.step(
                action, model=False
            )  #, render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt)
            print(train_models.currentName(), j, action)
            cv2.imshow('frame', raw_state[0])
            if cv2.waitKey(50) & 0xFF == ord('q'):
                break
            raw_states[train_models.currentName()].append(raw_state)
            option_actions[train_models.currentName()][int(
                pytorch_model.unwrap(action.squeeze()))] += 1
            if done:
                print("Episode Reward: ", ep_reward, " ", fcnt)
                ep_reward = 0
                # print("reached end")
            # proxy_environment.determine_swaps(length, needs_rewards=True) # doesn't need to generate rewards
        # rewards = proxy_environment.computeReward(args.num_update_model)
        # print(rewards)
    if len(base_env.episode_rewards) > 0:
        true_reward = np.median(base_env.episode_rewards)
        mean_reward = np.mean(base_env.episode_rewards)
        best_reward = np.max(base_env.episode_rewards)
        print("true reward median: %f, mean: %f, max: %f" %
              (true_reward, mean_reward, best_reward))

    print(args.num_iters)
    print(action_probs)
    print("Episode Reward: ", ep_reward, " ", fcnt)
    print(proxy_environment.reward_fns)
    rewards = proxy_environment.computeReward(args.num_iters *
                                              args.num_update_model)
    # print(rewards.shape)
    # print(rewards.sum())
    rollouts.insert_rewards(args, rewards)
    total_duration += j
    save_rols = copy.deepcopy(rollouts)
    if len(args.save_dir) > 0:
        save_to_pickle(os.path.join(args.save_dir, "rollouts.pkl"), save_rols)

    for i in range(train_models.num_options):
        print(rollouts.base_rollouts.rewards.shape, raw_indexes)
        reward_total = rollouts.base_rollouts.rewards.sum(
            dim=1)[i] / (args.num_iters * args.num_update_model)
        # print(rollouts.base_rollouts.rewards, raw_indexes, rollouts.base_rollouts.rewards.shape)
        reward_adjusted = rollouts.base_rollouts.rewards[
            i,
            np.array(raw_indexes[train_models.models[i].name]) +
            args.num_stack].sum(dim=0) / len(
                raw_indexes[train_models.models[i].name])
        print("Num policy steps:",
              len(raw_indexes[train_models.models[i].name]))
        print("Rewards during Policy:", reward_adjusted)
        print("Rewards for Policy:", reward_total)