def get_states(args,
               true_environment,
               length_constraint=50000,
               raws=None,
               dumps=None):
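    """Build the option chain for args.train_edge and load recorded rollouts.

    Pops the proxy environment off the chain, infers the number of actions from
    the last remaining environment, and extracts states from args.record_rollouts
    with a GetState extractor whose minmax is computed over the dataset.
    """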
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    environments = option_chain.initialize(args)
    print(environments)
    proxy_environment = environments.pop(-1)
    head, tail = get_edge(args.train_edge)
    # a proxy environment exposes reward_fns rather than num_actions, unlike the true environment
    if len(environments) > 1:
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head,
                           state_forms=list(
                               zip(args.state_names, args.state_forms)))
    use_raw = 'raw' in args.state_forms
    state_class.minmax = compute_minmax(state_class, dataset_path)
    states, resps, raws, dumps = load_states(
        state_class.get_state,
        dataset_path,
        length_constraint=length_constraint,
        use_raw=use_raw,
        raws=raws,
        dumps=dumps)
    return states, resps, num_actions, state_class, environments, raws, dumps
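
# Hypothetical usage sketch (call site not shown in the source):
# states, resps, num_actions, state_class, environments, raws, dumps = get_states(
#     args, true_environment)
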
def pretrain(args, true_environment, desired, num_actions, state_class, states,
             resps, targets, criteria, reward_fns):
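    """Pretrain option models on previously extracted states.

    Rebuilds the option chain, reuses the proxy environment's models when
    args.load_weights is set (otherwise builds a fresh MultiOption), and
    delegates the actual optimization to fit().
    """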
    # args = get_args()
    # true_environment = Paddle()
    # true_environment = PaddleNoBlocks()
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    environments = option_chain.initialize(args)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    if args.load_weights:
        train_models = proxy_environment.models
    else:
        train_models = MultiOption(1, models[args.model_form])
    head, tail = get_edge(args.train_edge)
    print(args.state_names, args.state_forms)
    print(state_class.minmax)
    # behavior_policy = EpsilonGreedyProbs()
    save_dir = args.save_graph
    if args.save_graph == "graph":
        save_dir = option_chain.save_dir
    proxy_environment.initialize(args,
                                 proxy_chain,
                                 reward_fns,
                                 state_class,
                                 behavior_policy=None)
    fit(args, save_dir, true_environment, train_models, state_class, desired,
        states, resps, targets, num_actions, criteria, proxy_environment,
        reward_fns)
def generate_soft_dataset(states, resps, true_environment, reward_fns, args):
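    """Label recorded states with soft action probabilities from trained options.

    Temporarily forces args.load_weights so the proxy environment's saved models
    are used, runs each option over the states in mini-batches to collect action
    probabilities, and smooths them with smooth_weight before returning an array.
    """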
    pre_load_weights = args.load_weights
    args.load_weights = True
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    print(args.load_weights)
    environments = option_chain.initialize(args)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    train_models = proxy_environment.models
    head, tail = get_edge(args.train_edge)
    # a proxy environment exposes reward_fns rather than num_actions, unlike the true environment
    if len(environments) > 1:
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head,
                           state_forms=list(
                               zip(args.state_names, args.state_forms)))
    proxy_environment.initialize(args,
                                 proxy_chain,
                                 reward_fns,
                                 state_class,
                                 behavior_policy=None)

    train_models.initialize(args, len(reward_fns), state_class, num_actions)
    train_models.session(args)
    proxy_environment.duplicate(args)  # assumes that we are loading weights
    args.load_weights = pre_load_weights

    soft_actions = [[] for i in range(train_models.num_options)]
    for oidx in range(train_models.num_options):
        train_models.option_index = oidx
        if args.model_form == 'population':
            train_models.currentModel().use_mean = True
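        # evaluate the recorded states in fixed mini-batches of 30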
        for i in range(len(states) // 30 + 1):
            state = states[i * 30:(i + 1) * 30]
            resp = resps[i * 30:(i + 1) * 30]
            values, dist_entropy, action_probs, Q_vals = train_models.determine_action(
                pytorch_model.wrap(state, cuda=args.cuda),
                pytorch_model.wrap(resp, cuda=args.cuda))
            # print (action_probs)
            values, action_probs, Q_vals = train_models.get_action(
                values, action_probs, Q_vals)
            soft_actions[oidx] += pytorch_model.unwrap(action_probs).tolist()
    print("soft actions", np.sum(np.array(soft_actions[0]), axis=0))
    for i in range(len(soft_actions)):
        soft_actions[i] = smooth_weight(soft_actions[i], args.weighting_lambda)
    return np.array(soft_actions)
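
# Hypothetical usage (reward_fns would come from the loaded reward classes, not shown here):
# soft_actions = generate_soft_dataset(states, resps, true_environment, reward_fns, args)
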
                                       frameskip=args.frameskip)
        else:
            true_environment = None  # TODO: implement
    elif args.env == 'SelfBreakout':
        if args.true_environment:
            true_environment = Screen(frameskip=args.frameskip)
        else:
            true_environment = FocusEnvironment(model,
                                                display=args.display_focus)
    elif args.env.find('Atari') != -1:
        true_environment = FocusAtariEnvironment(model,
                                                 args.env[len("Atari"):],
                                                 args.seed, 0, args.save_dir)
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    reward_paths = glob.glob(os.path.join(option_chain.save_dir, "*rwd.pkl"))
    print(reward_paths)
    reward_paths.sort(key=lambda x: int(x.split("__")[2]))

    head, tail = get_edge(args.train_edge)
    if args.reward_form == 'rawdist' and args.env == 'SelfPusher':
        true_environment.use_distance_reward()
        args.reward_form = 'raw'
    if args.reward_form != 'raw':
        reward_classes = [load_from_pickle(pth) for pth in reward_paths]
        for rc in reward_classes:
            if type(rc) == ChangepointMarkovReward:
                rc.markovModel = rc.markovModel.cuda(args.gpu)
    else:
        reward_classes = [RawReward(args)]
Example #5
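This example trains options on a 30-state ChainMDP using three hand-written reward functions (left, center, right); the commented command lines show both the basic/DQN and tabular/TabQ configurations.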
# python rl_template.py --model-form basic --optimizer-form DQN --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 10000 --save-dir data/test
# python rl_template.py --model-form tab --optimizer-form TabQ --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 10000 --save-dir data/test
# with 3 rewards: python rl_template.py --model-form basic --optimizer-form DQN --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 1000 --save-dir data/test --num-update-model 1
# python rl_template.py --model-form tab --optimizer-form TabQ --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 1000 --save-dir data/test --num-update-model 1
args = get_args()
true_environment = ChainMDP(30)
# train_models = MultiOption(1, BasicModel)
reward_classes = [
    RewardLeft(None, args),
    RewardCenter(None, args),
    RewardRight(None, args)
]
train_models = MultiOption(len(reward_classes), models[args.model_form])
learning_algorithm = learning_algorithms[args.optimizer_form]()
option_chain = OptionChain(true_environment, args.record_rollouts,
                           args.train_edge, args)  # here, train_edge should act like a save folder
minmax = (0, 30)
state_class = GetRaw(3, minmax=minmax, state_shape=[1])
behavior_policy = EpsilonGreedyQ()
# behavior_policy = EpsilonGreedyProbs()
proxy_chain = option_chain.initialize(args)  # the last term is None since the last environment is not yet made
proxy_chain.pop(-1)
print(proxy_chain)
trainRL(args,
        option_chain.save_dir,
        true_environment,
        train_models,
        learning_algorithm,