Example #1
    def generate_changepoints(self, data, save_dict=False):
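        # Run changepoint detection over `data`; when save_dict is set, print per-segment
        # diagnostics and pickle a {changepoint: segment model} dictionary into record_rollouts.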
        seg_models, changepoints = generate_changepoints([self.model_class],
                                                         self.params, data)

        if save_dict:
            changepoints = np.array(changepoints[::-1], dtype=np.int64)
            for cpt, seg_model in zip(changepoints, seg_models):
                print(cpt)
                print("model: \n", seg_model.A)
                print("data: \n", seg_model.data)
                print("predictions: \n", np.dot(seg_model.data, seg_model.A))
                print("diff: ", seg_model.diff)
                print("log likelihood: ", seg_model.logLikelihood)
            print(changepoints)
            # correlate_data(seg_models, changepoints, action_data, args.num_frames, data)
            # correlate_data(seg_models, changepoints, paddle_data, args.num_frames, data, prox_distance=5)
            cp_dict = {cp: m for cp, m in zip(changepoints, seg_models)}
            # print(cp_dict)
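            # NOTE: relies on a module-level `args` (not a method argument) for num_frames and record_rollouts.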
            cp_dict[-2] = args.num_frames
            # print(cp_dict)
            save_to_pickle(
                os.path.join(args.record_rollouts,
                             'changepoints-' + self.head + '.pkl'), cp_dict)
            # with open(os.path.join(args.record_rollouts, 'changepoints-' + self.head + '.pkl'), 'wb') as fid:
            #     pickle.dump(cp_dict, fid)
        return seg_models, changepoints
Example #2
def train_dopamine(args, save_path, true_environment, train_models,
                   proxy_environment, proxy_chain, reward_classes, state_class,
                   num_actions, behavior_policy):
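    # Dopamine-style training loop: step the current option in the proxy environment,
    # feed rewards to the model via forward/begin_episode/end_episode, and periodically save and log.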
    print("#######")
    print("Training Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle)
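    # Temporarily force num_stack to 1 while initializing the proxy environment, then restore it.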
    snum = args.num_stack
    args.num_stack = 1
    proxy_environment.initialize(args, proxy_chain, reward_classes,
                                 state_class, behavior_policy)
    args.num_stack = snum
    if args.save_models:
        save_to_pickle(os.path.join(save_path, "env.pkl"), proxy_environment)
    behavior_policy.initialize(args, num_actions)
    train_models.initialize(args, len(reward_classes), state_class,
                            proxy_environment.action_size)
    proxy_environment.set_models(train_models)
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    raw_state = base_env.getState()
    cp_state = proxy_environment.changepoint_state([raw_state])
    # print("initial_state (s, hs, rs, cps)", state, hist_state, raw_state, cp_state)
    # print(cp_state.shape, state.shape, hist_state.shape, state_class.shape)
    # rollouts = RolloutOptionStorage(args.num_processes, (state_class.shape,), proxy_environment.action_size, cr.flatten().shape[0],
    #     state.shape, hist_state.shape, args.buffer_steps, args.changepoint_queue_len, args.trace_len,
    #     args.trace_queue_len, args.dilated_stack, args.target_stack, args.dilated_queue_len, train_models.currentOptionParam().shape[1:], len(train_models.models), cp_state[0].shape,
    #     args.lag_num, args.cuda)
    option_actions = {
        option.name: collections.Counter()
        for option in train_models.models
    }
    total_duration = 0
    total_elapsed = 0
    true_reward = 0
    ep_reward = 0
    start = time.time()
    fcnt = 0
    final_rewards = list()
    option_counter = collections.Counter()
    option_value = collections.Counter()
    print(hist_state)
    val = None
    train_models.currentModel().begin_episode(pytorch_model.unwrap(hist_state))
    for j in range(args.num_iters):
        raw_actions = []
        last_total_steps, total_steps = 0, 0
        for step in range(args.num_steps):
            # start = time.time()
            fcnt += 1
            total_steps += 1
            current_state, current_resp = proxy_environment.getHistState()
            estate = proxy_environment.getState()
            if args.true_environment:
                reward = pytorch_model.wrap([[base_env.reward]])
            else:
                reward = proxy_environment.computeReward(1)
            true_reward += base_env.reward
            ep_reward += base_env.reward
            # print(current_state, reward[train_models.option_index])
            action = train_models.currentModel().forward(
                current_state,
                pytorch_model.unwrap(reward[train_models.option_index]))
            # print("ap", action)
            action = pytorch_model.wrap([action])
            cp_state = proxy_environment.changepoint_state([raw_state])
            # print(state, action)
            # print("step states (cs, s, cps, act)", current_state, estate, cp_state, action)
            # print("step outputs (val, de, ap, qv, v, ap, qv)", values, dist_entropy, action_probs, Q_vals, v, ap, qv)

            state, raw_state, resp, done, action_list = proxy_environment.step(
                action, model=False
            )  #, render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt)
            # print("step check (al, s)", action_list, state)
            # learning_algorithm.interUpdateModel(step)
            #### logging
            option_actions[train_models.currentName()][int(
                pytorch_model.unwrap(action.squeeze()))] += 1
            #### logging
            # print(train_models.currentModel().dope_rainbow)

            if done:
                # print("reached end")
                print("Episode Reward: ", ep_reward)
                ep_reward = 0
                train_models.currentModel().end_episode(
                    pytorch_model.unwrap(reward[train_models.option_index]))
                state, resp = proxy_environment.getHistState()
                train_models.currentModel().begin_episode(
                    pytorch_model.unwrap(state))
                # print(step)
                break
        # var = [v for v in tf.trainable_variables() if v.name == "Online/fully_connected/weights:0"][0]
        # nval = train_models.currentModel().sess.run(var)
        # if val is not None:
        #     print(var, np.sum(abs(nval - val)), train_models.currentModel().dope_rainbow.eval_mode)
        # val = nval
        current_state, current_resp = proxy_environment.getHistState()
        # print(state, action)
        # print("step states (cs, s, cps, act)", current_state, estate, cp_state, action)
        # print("step outputs (val, de, ap, qv, v, ap, qv)", values, dist_entropy, action_probs, Q_vals, v, ap, qv)

        cp_state = proxy_environment.changepoint_state([raw_state])
        # print("states and actions (es, cs, a, m)", rollouts.extracted_state, rollouts.current_state, rollouts.actions, rollouts.masks)
        # print("actions and Qvals (qv, vp, ap)", rollouts.Qvals, rollouts.value_preds, rollouts.action_probs)

        total_duration += step + 1
        # print("rewards", rewards)
        # rollouts.insert_rewards(rewards)
        # print(rollouts.extracted_state)
        # print(rewards)
        # rollouts.compute_returns(args, values)
        # print("returns and rewards (rew, ret)", rollouts.rewards, rollouts.returns)
        # print("returns and return queue", rollouts.returns, rollouts.return_queue)
        # print("reward check (cs, rw, rol rw, rt", rollouts.current_state, rewards, rollouts.rewards, rollouts.returns)
        name = train_models.currentName()
        # print(name, rollouts.extracted_state, rollouts.rewards, rollouts.actions)

        #### logging
        option_counter[name] += step + 1
        option_value[name] += true_reward
        #### logging
        if j % args.save_interval == 0 and args.save_models and args.train:  # no point in saving if not training
            print("=========SAVING MODELS==========")
            train_models.save(save_path)  # TODO: implement save_options

        #### logging
        if j % args.log_interval == 0:
            # print("Qvalue and state", pytorch_model.unwrap(Q_vals.squeeze()), pytorch_model.unwrap(current_state.squeeze()))
            # print("probs and state", pytorch_model.unwrap(action_probs.squeeze()), pytorch_model.unwrap(current_state.squeeze()))
            for name in train_models.names():
                if option_counter[name] > 0:
                    print(name, option_value[name] / option_counter[name], [
                        option_actions[name][i] / option_counter[name]
                        for i in range(len(option_actions[name]))
                    ])
                if j % (args.log_interval * 20) == 0:
                    option_value[name] = 0
                    option_counter[name] = 0
                    for i in range(len(option_actions[name])):
                        option_actions[name][i] = 0
            end = time.time()
            total_elapsed += total_duration
            log_stats = "Updates {}, num timesteps {}, FPS {}, reward {}".format(
                j, total_elapsed, int(total_elapsed / (end - start)),
                true_reward / (args.num_steps * args.log_interval))
            print(log_stats)
            true_reward = 0.0
            total_duration = 0
Example #3
    option_determiner_model = ChangepointModels(args, changepoint_model,
                                                transforms, clusters,
                                                determiner)
    option_determiner_model.changepoint_statistics(models, changepoints,
                                                   trajectory,
                                                   correlate_trajectory)

    try:
        os.makedirs(os.path.join(args.changepoint_dir, args.train_edge))
    except OSError:
        pass  # folder already created
    print(args.changepoint_dir)
    reward_fns = []
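    # Build one reward function per changepoint mapping, train it if requested, and pickle it.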
    for i in range(option_determiner_model.determiner.num_mappings):
        reward_function = reward_forms[args.reward_form](
            option_determiner_model, args, i)
        if args.train:
            reward_function.generate_training_set(combined, models,
                                                  np.array(changepoints))
            reward_function.train_rewards(20000)
        save_to_pickle(
            os.path.join(args.changepoint_dir, args.train_edge,
                         "reward__function__" + str(i) + "__rwd.pkl"),
            reward_function)
        reward_fns.append(reward_function)
    # if args.train:
    #     minvar = np.min(np.max([rf.markovModel.variance.tolist() for rf in reward_fns]), axis=0)
    #     print(minvar)
    #     for i, rf in enumerate(reward_fns):
    #         rf.setvar(minvar)
    #         save_to_pickle(os.path.join(args.changepoint_dir, args.train_edge, "reward__function__" + str(i) +"__rwd.pkl"), rf)
def testRL(args,
           save_path,
           true_environment,
           proxy_chain,
           proxy_environment,
           state_class,
           behavior_policy,
           num_actions,
           reward_classes=None):
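    # Evaluate each learned option in turn for num_iters steps and report its average reward.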
    print("#######")
    print("Evaluating Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle)
    if reward_classes is not None:
        proxy_environment.reward_fns = reward_classes
    args.changepoint_queue_len = max(args.changepoint_queue_len,
                                     args.num_iters)
    proxy_environment.initialize(args, proxy_chain,
                                 proxy_environment.reward_fns,
                                 proxy_environment.stateExtractor,
                                 behavior_policy)
    print(base_env.save_path)
    behavior_policy.initialize(args, num_actions)
    train_models = proxy_environment.models
    train_models.initialize(args, len(reward_classes), state_class,
                            num_actions)
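    # NOTE: len(reward_classes) assumes reward_classes was passed in; the default of None would fail here.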
    proxy_environment.duplicate(args)
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    resp = proxy_environment.getResp()
    print(state.shape)
    raw_state = base_env.getState()
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    cp_state = proxy_environment.changepoint_state([raw_state])
    rollouts = RolloutOptionStorage(
        args.num_processes, (state_class.shape, ),
        proxy_environment.action_size,
        cr.flatten().shape[0], state.shape, hist_state.shape,
        args.buffer_steps, args.changepoint_queue_len, args.trace_len,
        args.trace_queue_len, args.dilated_stack, args.target_stack,
        args.dilated_queue_len,
        train_models.currentOptionParam().shape[1:], len(train_models.models),
        cp_state[0].shape, args.lag_num, args.cuda)
    option_actions = {
        option.name: collections.Counter()
        for option in train_models.models
    }
    total_duration = 0
    start = time.time()
    fcnt = 0
    final_rewards = list()
    option_counter = collections.Counter()
    option_value = collections.Counter()
    raw_states = dict()
    ep_reward = 0
    rollouts.set_parameters(args.num_iters * train_models.num_options)
    # if args.num_iters > rollouts.changepoint_queue_len:
    #     rollouts.set_changepoint_queue(args.num_iters)
    done = False
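    # Run each option for num_iters steps, collecting raw states and action counts for logging.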
    for i in range(train_models.num_options):
        train_models.option_index = i
        train_models.currentModel().test = True
        raw_states[train_models.currentName()] = []
        for j in range(args.num_iters):
            fcnt += 1
            raw_actions = []
            rollouts.cuda()
            current_state, current_resp = proxy_environment.getHistState()
            values, dist_entropy, action_probs, Q_vals = train_models.determine_action(
                current_state.unsqueeze(0), current_resp.unsqueeze(0))
            v, ap, qv = train_models.get_action(values, action_probs, Q_vals)
            cp_state = proxy_environment.changepoint_state([raw_state])
            ep_reward += base_env.reward
            # print(ap, qv)
            action = behavior_policy.take_action(ap, qv)
            rollouts.insert(
                False, state, current_state,
                pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda), done,
                current_resp, action, cp_state[0],
                train_models.currentOptionParam(), train_models.option_index,
                None, None, action_probs, Q_vals, values)
            state, raw_state, resp, done, action_list = proxy_environment.step(
                action, model=False
            )  #, render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt)
            raw_states[train_models.currentName()].append(raw_state)
            option_actions[train_models.currentName()][int(
                pytorch_model.unwrap(action.squeeze()))] += 1
            if done:
                print("Episode Reward: ", ep_reward, " ", fcnt)
                ep_reward = 0
                # print("reached end")
            # proxy_environment.determine_swaps(length, needs_rewards=True) # doesn't need to generate rewards

        print(args.num_iters)
        print(action_probs)
        print("Episode Reward: ", ep_reward, " ", fcnt)
        rewards = proxy_environment.computeReward(args.num_iters)
        # print(rewards.shape)
        print(rewards.sum())
        rollouts.insert_rewards(rewards, total_duration)
        total_duration += j
        rollouts.compute_returns(args, values)
        rollouts.cpu()
        save_rols = copy.copy(rollouts)
        save_to_pickle(os.path.join(args.save_dir, "rollouts.pkl"), save_rols)

        reward_total = rollouts.rewards.sum(
            dim=1)[train_models.option_index] / args.num_iters
        print("Rewards for Policy:", reward_total)
Example #5


if __name__ == "__main__":
    # python ChangepointDetection/CHAMP.py --train-edge "Action->Paddle" --record-rollouts data/random/ --champ-parameters "Paddle"
    # python ChangepointDetection/CHAMP.py --train-edge "Paddle->Ball" --record-rollouts data/integrationpaddle/ --champ-parameters "Ball" > integration/ballCHAMP.txt
    args = get_args()
    detector = CHAMPDetector(args.train_edge, args.champ_parameters)
    data = detector.load_obj_dumps(args, dumps_name=args.focus_dumps_name)
    print(data[:100])
    models, changepoints = detector.generate_changepoints(data, save_dict=True)
    save_to_pickle(
        os.path.join(args.record_rollouts,
                     'detector-' + detector.head + '.pkl'), detector)
    # with open(os.path.join(args.record_rollouts, 'detector-' + detector.head + '.pkl'), 'wb') as fid:
    #      pickle.dump(detector, fid)

    # All data must now follow obj_dumps methodology
    # args = get_args()
    # obj_dumps = read_obj_dumps(args.record_rollouts, get_last=100000)
    # paddle_data = np.array(get_individual_data('Paddle', obj_dumps, pos_val_hash=1))
    # ball_data = np.array(get_individual_data('Ball', obj_dumps, pos_val_hash=1))
    # action_data = get_individual_data('Action', obj_dumps, pos_val_hash=2)
    # action_data = np.array(hot_actions(action_data))
    # ### PADDLE DATA ###
    # if args.train_edge == "Paddle":
    #     data = paddle_data[-args.num_frames-2:, :2] # paddle
    # ### BALL DATA ###
Example #6
def testRL(args,
           save_path,
           true_environment,
           proxy_chain,
           proxy_environment,
           state_class,
           behavior_policy,
           num_actions,
           reward_classes=None):
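    # Evaluation variant that samples a random option each iteration, runs it for num_update_model
    # steps while rendering frames with cv2, and reports per-option rewards at the end.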
    print("#######")
    print("Evaluating Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle)
    if reward_classes is not None:
        proxy_environment.reward_fns = reward_classes
    args.changepoint_queue_len = max(args.changepoint_queue_len,
                                     args.num_iters * args.num_update_model)
    proxy_environment.initialize(args, proxy_chain,
                                 proxy_environment.reward_fns, state_class,
                                 behavior_policy)
    print(base_env.save_path)
    behavior_policy.initialize(args, num_actions)
    train_models = proxy_environment.models
    train_models.initialize(args, len(reward_classes), state_class,
                            num_actions)
    proxy_environment.duplicate(args)
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    resp = proxy_environment.getResp()
    print(state.shape)
    raw_state = base_env.getState()
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    cp_state = proxy_environment.changepoint_state([raw_state])
    rollouts = RolloutOptionStorage(
        args.num_processes, (state_class.shape, ),
        proxy_environment.action_size,
        cr.flatten().shape[0], state.shape, hist_state.shape,
        args.buffer_steps, args.changepoint_queue_len, args.trace_len,
        args.trace_queue_len, args.dilated_stack, args.target_stack,
        args.dilated_queue_len,
        train_models.currentOptionParam().shape[1:], len(train_models.models),
        cp_state[0].shape, args.lag_num, args.cuda)
    option_actions = {
        option.name: collections.Counter()
        for option in train_models.models
    }
    total_duration = 0
    start = time.time()
    fcnt = 0
    final_rewards = list()
    option_counter = collections.Counter()
    option_value = collections.Counter()
    raw_states = dict()
    ep_reward = 0
    rollouts.set_parameters(args.num_iters * args.num_update_model)
    # if args.num_iters > rollouts.changepoint_queue_len:
    #     rollouts.set_changepoint_queue(args.num_iters)
    done = False
    ctr = 0
    raw_indexes = dict()
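    # raw_indexes records which global step indices were taken under each option so that
    # per-option rewards can be sliced out of the rollout buffer afterwards.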
    for i in range(args.num_iters):
        train_models.option_index = np.random.randint(train_models.num_options)
        train_models.currentModel().test = True
        if train_models.currentName() not in raw_states:
            raw_states[train_models.currentName()] = []
            raw_indexes[train_models.currentName()] = []

        for j in range(args.num_update_model):
            raw_indexes[train_models.currentName()].append(ctr)
            ctr += 1
            fcnt += 1
            raw_actions = []
            rollouts.cuda()
            current_state, current_resp = proxy_environment.getHistState()
            values, log_probs, action_probs, Q_vals = train_models.determine_action(
                current_state.unsqueeze(0), current_resp.unsqueeze(0))
            v, ap, lp, qv = train_models.get_action(values, action_probs,
                                                    log_probs, Q_vals)
            cp_state = proxy_environment.changepoint_state([raw_state])
            ep_reward += base_env.reward
            # print(ap, qv)
            action = behavior_policy.take_action(ap, qv)
            # print(train_models.currentName(), action, qv.squeeze())
            rollouts.insert(
                False, state, current_state,
                pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda), done,
                current_resp, action, cp_state[0],
                train_models.currentOptionParam(), train_models.option_index,
                None, None, action_probs, Q_vals, values)
            state, raw_state, resp, done, action_list = proxy_environment.step(
                action, model=False
            )  #, render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt)
            print(train_models.currentName(), j, action)
            cv2.imshow('frame', raw_state[0])
            if cv2.waitKey(50) & 0xFF == ord('q'):
                break
            raw_states[train_models.currentName()].append(raw_state)
            option_actions[train_models.currentName()][int(
                pytorch_model.unwrap(action.squeeze()))] += 1
            if done:
                print("Episode Reward: ", ep_reward, " ", fcnt)
                ep_reward = 0
                # print("reached end")
            # proxy_environment.determine_swaps(length, needs_rewards=True) # doesn't need to generate rewards
        # rewards = proxy_environment.computeReward(args.num_update_model)
        # print(rewards)
    if len(base_env.episode_rewards) > 0:
        true_reward = np.median(base_env.episode_rewards)
        mean_reward = np.mean(base_env.episode_rewards)
        best_reward = np.max(base_env.episode_rewards)
        print("true reward median: %f, mean: %f, max: %f" %
              (true_reward, mean_reward, best_reward))

    print(args.num_iters)
    print(action_probs)
    print("Episode Reward: ", ep_reward, " ", fcnt)
    print(proxy_environment.reward_fns)
    rewards = proxy_environment.computeReward(args.num_iters *
                                              args.num_update_model)
    # print(rewards.shape)
    # print(rewards.sum())
    rollouts.insert_rewards(args, rewards)
    total_duration += j
    save_rols = copy.deepcopy(rollouts)
    if len(args.save_dir) > 0:
        save_to_pickle(os.path.join(args.save_dir, "rollouts.pkl"), save_rols)

    for i in range(train_models.num_options):
        print(rollouts.base_rollouts.rewards.shape, raw_indexes)
        reward_total = rollouts.base_rollouts.rewards.sum(
            dim=1)[i] / (args.num_iters * args.num_update_model)
        # print(rollouts.base_rollouts.rewards, raw_indexes, rollouts.base_rollouts.rewards.shape)
        reward_adjusted = rollouts.base_rollouts.rewards[
            i,
            np.array(raw_indexes[train_models.models[i].name]) +
            args.num_stack].sum(dim=0) / len(
                raw_indexes[train_models.models[i].name])
        print("Num policy steps:",
              len(raw_indexes[train_models.models[i].name]))
        print("Rewards during Policy:", reward_adjusted)
        print("Rewards for Policy:", reward_total)
def trainRL(args, save_path, true_environment, train_models,
            learning_algorithm, proxy_environment, proxy_chain, reward_classes,
            state_class, behavior_policy):
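    # Main option-training loop: collect experience in reward_check-sized chunks, compute option
    # rewards and swaps after each chunk, and update the models with the learning algorithm.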
    print("#######")
    print("Training Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle,
                      args.single_save_dir)
    proxy_environment.initialize(args, proxy_chain, reward_classes,
                                 state_class, behavior_policy)
    if args.save_models:
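        # The Atari screen handle is detached before pickling the environment (presumably it is
        # not picklable) and restored afterwards.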
        if args.env.find("Atari") != -1:
            screen = base_env.screen
            base_env.screen = None
        save_to_pickle(os.path.join(save_path, "env.pkl"), proxy_environment)
        if args.env.find("Atari") != -1:
            base_env.screen = screen
    behavior_policy.initialize(args, proxy_environment.action_size)
    print(reward_classes[0], reward_classes[0].parameter_minmax)
    if not args.load_weights:
        train_models.initialize(
            args,
            len(reward_classes),
            state_class,
            proxy_environment.action_size,
            parameter_minmax=reward_classes[0].parameter_minmax)
        proxy_environment.set_models(train_models)
    else:
        print("loading weights", len(reward_classes))
        train_models.initialize(
            args,
            len(reward_classes),
            state_class,
            proxy_environment.action_size,
            parameter_minmax=reward_classes[0].parameter_minmax)
        train_models.session(args)
        proxy_environment.duplicate(args)
    train_models.train()
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    learning_algorithm.initialize(args,
                                  train_models,
                                  reward_classes=reward_classes)
    print(proxy_environment.get_names())
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    raw_state = base_env.getState()
    resp = proxy_environment.getResp()
    cp_state = proxy_environment.changepoint_state([raw_state])
    # print("initial_state (s, hs, rs, cps)", state, hist_state, raw_state, cp_state)
    # print(cp_state.shape, state.shape, hist_state.shape, state_class.shape)
    print(args.trace_len, args.trace_queue_len)
    args.buffer_clip = max(args.buffer_clip, args.reward_check)
    rollouts = RolloutOptionStorage(
        args.num_processes, (state_class.shape, ),
        proxy_environment.action_size,
        cr.flatten().shape[0],
        state.shape,
        hist_state.shape,
        args.buffer_steps,
        args.changepoint_queue_len,
        args.trace_len,
        args.trace_queue_len,
        args.dilated_stack,
        args.target_stack,
        args.dilated_queue_len,
        train_models.currentOptionParam().shape[1:],
        len(train_models.models),
        cp_state[0].shape,
        args.lag_num,
        args.cuda,
        return_form=args.return_form)
    option_actions = {
        option.name: collections.Counter()
        for option in train_models.models
    }
    total_duration = 0
    total_elapsed = 0
    true_reward = 0
    ep_reward = 0
    sample_schedule = args.sample_schedule
    start = time.time()
    fcnt = 0
    final_rewards = list()
    average_rewards, average_counts = [], []
    option_counter = collections.Counter()
    option_value = collections.Counter()
    trace_queue = [
    ]  # keep the last states until end of trajectory (or until a reset), and dump when a reward is found
    retest = False
    done = False
    for j in range(args.num_iters):
        rollouts.set_parameters(learning_algorithm.current_duration *
                                args.reward_check)
        # print("set_parameters", state)
        raw_actions = []
        rollouts.cuda()
        last_total_steps, total_steps = 0, 0
        s = time.time()
        for step in range(learning_algorithm.current_duration):
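            # Each outer step runs up to reward_check environment steps, then computes rewards and option swaps.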

            for m in range(args.reward_check):
                fcnt += 1
                total_steps += 1

                current_state, current_resp = proxy_environment.getHistState()
                estate = proxy_environment.getState()
                values, log_probs, action_probs, Q_vals = train_models.determine_action(
                    current_state.unsqueeze(0),
                    current_resp.unsqueeze(0),
                    use_grad=False)
                v, ap, lp, qv = train_models.get_action(
                    values, action_probs, log_probs, Q_vals)

                # a = time.time()
                # print("choose action", a-s)
                # print(action_probs, Q_vals, ap, lp, qv)
                action = behavior_policy.take_action(ap, qv)
                cp_state = proxy_environment.changepoint_state([raw_state])
                # print(state, action)
                # print("before_insert", state)
                # print(current_state.reshape((4,84,84))[0].cpu().numpy().shape)
                # cv2.imshow('frame',current_state.reshape((4,84,84))[0].cpu().numpy())
                # if cv2.waitKey(1) & 0xFF == ord('q'):
                #     pass

                # print(action, true_environment.paddle.pos, true_environment.ball.vel, true_environment.ball.pos)

                if args.behavior_policy == "dem" or args.visualize:
                    cv2.imshow('frame', raw_state[0].reshape((84, 84)))
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        pass
                # cv2.imshow('frame',raw_state[0].reshape((84,84)))
                # if cv2.waitKey(1) & 0xFF == ord('q'):
                #     pass
                rollouts.insert(
                    retest, state, current_state,
                    pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda),
                    done, current_resp, action, cp_state[0],
                    train_models.currentOptionParam(),
                    train_models.option_index, None, None, action_probs,
                    Q_vals, values)
                rollouts.insert_dilation(proxy_environment.swap)

                retest = False
                # print("step states (cs, ns, cps, act)", current_state, estate, cp_state, action)
                # print("step outputs (val, de, ap, qv, v, ap, qv)", values, dist_entropy, action_probs, Q_vals, v, ap, qv)
                trace_queue.append(
                    (current_state.clone().detach(), action.clone().detach()))
                state, raw_state, resp, done, action_list = proxy_environment.step(
                    action, model=False
                )  #, render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt)
                # print(action_list)
                # s = time.time()
                # print("step time", s-a)
                # print("after step", state)
                true_reward += base_env.reward
                ep_reward += base_env.reward
                if args.reward_form == 'raw':
                    for rc in reward_classes:
                        rc.insert_reward(base_env.reward)
                # print(base_env.reward)
                # print(action_list, action)
                # print("step check (al, s)", action_list, state)
                #### logging
                option_actions[train_models.currentName()][int(
                    pytorch_model.unwrap(action.squeeze()))] += 1
                #### logging
                if done:
                    print("Episode Reward: ", ep_reward, " ", fcnt, j)
                    ep_reward = 0
                    if not args.sample_duration > 0 or (args.done_swapping <=
                                                        j):
                        # print("reached end")
                        # print(step)
                        if args.trace_queue_len > -1:
                            trace_queue = rollouts.insert_trace(trace_queue)

                        trace_queue = []
                        break
                    else:  # need to clear out trace queue
                        trace_queue = rollouts.insert_trace(trace_queue)
                        trace_queue = []
                # time.sleep(.1)
            # print(m, args.reward_check)
            # rl = time.time()
            # print("run loop", start - rl)
            rewards = proxy_environment.computeReward(m + 1)
            # print(rewards, proxy_environment.changepoint_queue)
            # print(rewards.sum())
            # a = time.time()
            # print("reward time", a-s)
            change, target = proxy_environment.determineChanged(m + 1)
            proxy_environment.determine_swaps(
                m + 1, needs_rewards=True)  # doesn't need to generate rewards
            # print("reward time", time.time() - start)
            # print("rewards", torch.sum(rewards))

            # reenter to get next value
            current_state, current_resp = proxy_environment.getHistState()
            values, log_probs, action_probs, Q_vals = train_models.determine_action(
                current_state.unsqueeze(0), current_resp.unsqueeze(0))
            v, ap, lp, qv = train_models.get_action(values, action_probs,
                                                    log_probs, Q_vals)
            action = behavior_policy.take_action(ap, qv)
            trace_queue.append(
                (current_state.clone().detach(), action.clone().detach()))
            cp_state = proxy_environment.changepoint_state([raw_state])
            rollouts.insert(
                retest, state, current_state,
                pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda), done,
                current_resp, action, cp_state[0],
                train_models.currentOptionParam(), train_models.option_index,
                None, None, action_probs, Q_vals,
                values)  # inserting the last state and unused action
            retest = True  # need to re-insert value with true state
            # ########
            rollouts.insert_hindsight_target(change, target)
            rollouts.insert_rewards(args, rewards)
            name = train_models.currentName()
            option_counter[name] += m + 1
            option_value[name] += rewards.sum(dim=1)[train_models.option_index]

            last_total_steps = total_steps
            completed = learning_algorithm.interUpdateModel(
                total_steps, rewards, change, done)
            # rw = time.time()
            # print("rewards", rl - rw, start - rw)

            if completed or (done and not args.sample_duration > 0):
                break

        retest = args.buffer_steps > 0 or args.lag_num > 0  # if we roll, don't retest
        # print("steptime", time.time() - start)
        # start = time.time()
        # print(done)
        # print(rollouts.base_rollouts.extracted_state, rollouts.base_rollouts.rewards)
        # print("rew, state", rollouts.rewards[0,-50:], rollouts.extracted_state[-50:])
        # print("inserttime", time.time() - start)
        # print("states and actions (es, cs, a, m)", rollouts.extracted_state, rollouts.current_state, rollouts.actions, rollouts.masks)
        # print("actions and Qvals (qv, vp, ap)", rollouts.Qvals, rollouts.value_preds, rollouts.action_probs)
        # start = time.time()
        total_duration += total_steps
        # if done:
        #     trace_queue = rollouts.insert_trace(trace_queue)
        #     trace_queue = [] # insert first
        # else:
        #     trace_queue = rollouts.insert_trace(trace_queue)
        # print(rollouts.extracted_state)
        # print(rewards)
        # rollouts.compute_returns(args, values) # don't need to compute returns because they are computed upon reward reception
        # print("returns and rewards (rew, ret)", rollouts.rewards, rollouts.returns)
        # print("returns and return queue", rollouts.returns, rollouts.return_queue)
        # print("reward check (cs, rw, rol rw, rt", rollouts.current_state, rewards, rollouts.rewards, rollouts.returns)

        # print(name, rollouts.extracted_state, rollouts.rewards, rollouts.actions)
        # n = 0
        # for obj in gc.get_objects():
        #     try:
        #         if torch.is_tensor(obj):
        #             n+=1
        #     except:
        #         pass
        # print("learning at", j, n)

        #### logging
        # print(rollouts.base_rollouts.rewards.shape)
        reward_total = rollouts.get_current(
            names=['rewards'])[0][train_models.option_index].sum(dim=0)
        # print("reward_total", reward_total.shape)
        final_rewards.append(reward_total)
        #### logging
        # start = time.time()
        learning_algorithm.step_counter += 1
        if j >= args.warm_up:  # TODO: clean up this to learning algorithm?
            value_loss, action_loss, dist_entropy, output_entropy, entropy_loss, action_log_probs = learning_algorithm.step(
                args, train_models, rollouts)
            if args.dist_interval != -1 and j % args.dist_interval == 0:
                learning_algorithm.distibutional_sparcity_step(
                    args, train_models, rollouts)
                # print("di", time.time() - start)
            if args.correlate_steps > 0 and j % args.diversity_interval == 0:
                loss = learning_algorithm.correlate_diversity_step(
                    args, train_models, rollouts)
                # print("corr", time.time() - start)
            if args.greedy_epsilon_decay > 0 and j % args.greedy_epsilon_decay == 0 and j != 0:
                behavior_policy.epsilon = max(
                    args.min_greedy_epsilon, behavior_policy.epsilon *
                    0.9)  # TODO: more advanced greedy epsilon methods
                # print("eps", time.time() - start)
            if args.sample_schedule > 0 and j % sample_schedule == 0 and j != 0:
                learning_algorithm.sample_duration = (
                    j // args.sample_schedule + 1) * args.sample_duration
                learning_algorithm.reset_current_duration(
                    learning_algorithm.sample_duration, args.reward_check)
                args.changepoint_queue_len = max(
                    learning_algorithm.max_duration,
                    args.changepoint_queue_len)
                sample_schedule = args.sample_schedule * (
                    j // args.sample_schedule + 1
                )  # sum([args.sample_schedule * (i+1) for i in range(j // args.sample_schedule + 1)])
            if args.retest_schedule > 0 and j % args.retest_schedule == 0 and j != 0:
                learning_algorithm.retest += 1
                learning_algorithm.reset_current_duration(
                    learning_algorithm.sample_duration, args.reward_check)
                args.changepoint_queue_len = max(
                    learning_algorithm.max_duration,
                    args.changepoint_queue_len)
                # print("resample", time.time() - start)
            if j > args.done_swapping:
                learning_algorithm.reset_current_duration(
                    learning_algorithm.sample_duration, args.reward_check)
        else:
            value_loss, action_loss, dist_entropy, output_entropy, entropy_loss, action_log_probs = None, None, None, None, None, None
        parameter = proxy_environment.get_next_parameter()
        if args.reward_swapping:
            parameter = completed
        learning_algorithm.updateModel(parameter)
        # s = time.time()
        # print("learning step time", s-a)
        # n = 0
        # for obj in gc.get_objects():
        #     try:
        #         if torch.is_tensor(obj):
        #             n+=1
        #     except:
        #         pass
        # print("objects at", j, n)

        # print("update", time.time() - start)
        # print("learn time", time.time() - rw)
        if j % args.save_interval == 0 and args.save_models and args.train:  # no point in saving if not training
            print("=========SAVING MODELS==========")
            train_models.save(save_path)  # TODO: implement save_options

        #### logging
        if j % args.log_interval == 0:
            print("Qvalue and state", pytorch_model.unwrap(Q_vals.squeeze()),
                  pytorch_model.unwrap(current_state.squeeze()))
            print("probs and state",
                  pytorch_model.unwrap(action_probs.squeeze()),
                  pytorch_model.unwrap(current_state.squeeze()))
            for name in train_models.names():
                if option_counter[name] > 0:
                    print(name, option_value[name] / option_counter[name], [
                        option_actions[name][i] / option_counter[name]
                        for i in range(len(option_actions[name]))
                    ])
                # if j % (args.log_interval * 20) == 0:
                option_value[name] = 0
                option_counter[name] = 0
                for i in range(len(option_actions[name])):
                    option_actions[name][i] = 0
            end = time.time()
            final_rewards = torch.stack(final_rewards).detach()
            average_rewards.append(final_rewards.sum())
            average_counts.append(total_duration)
            acount = np.sum(average_counts)
            best_reward = true_reward
            true_reward = true_reward / total_steps
            mean_reward = true_reward
            if len(base_env.episode_rewards) > 0:
                true_reward = np.median(base_env.episode_rewards)
                mean_reward = np.mean(base_env.episode_rewards)
                best_reward = np.max(base_env.episode_rewards)

            el, vl, al = unwrap_or_none(entropy_loss), unwrap_or_none(
                value_loss), unwrap_or_none(action_loss)
            total_elapsed += total_duration
            log_stats = "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {}, value loss {}, policy loss {}, average_reward {}, true_reward median: {}, mean: {}, max: {}".format(
                j, total_elapsed, int(total_elapsed / (end - start)),
                final_rewards.mean(), np.median(final_rewards.cpu()),
                final_rewards.min(), final_rewards.max(), el, vl, al,
                torch.stack(average_rewards).sum() / acount, true_reward,
                mean_reward, best_reward)
            if acount > 300:
                average_counts.pop(0)
                average_rewards.pop(0)
            true_reward = 0.0
            print(log_stats)
            final_rewards = list()
            total_duration = 0
        #### logging
    if args.save_models and args.train:  # no point in saving if not training
        print("=========SAVING MODELS==========")
        train_models.save(save_path)  # TODO: implement save_options

    proxy_environment.close_files()