def get_states(args, true_environment, length_constraint=50000, raws=None, dumps=None):
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args)
    environments = option_chain.initialize(args)
    print(environments)
    proxy_environment = environments.pop(-1)
    head, tail = get_edge(args.train_edge)
    if len(environments) > 1:  # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
    use_raw = 'raw' in args.state_forms
    state_class.minmax = compute_minmax(state_class, dataset_path)
    states, resps, raws, dumps = load_states(state_class.get_state, dataset_path,
                                             length_constraint=length_constraint, use_raw=use_raw,
                                             raws=raws, dumps=dumps)
    return states, resps, num_actions, state_class, environments, raws, dumps
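# Illustrative sketch only (not part of the original module): get_states attaches per-dimension
# bounds to state_class.minmax via compute_minmax, presumably so extracted states can be
# normalized downstream. The standalone numpy helpers below show that kind of bookkeeping;
# `example_minmax` and `example_normalize` are hypothetical names, not the project's API.
import numpy as np

def example_minmax(states):
    # per-dimension minimum and maximum over an (N, D) batch of flattened states
    states = np.asarray(states, dtype=float)
    return states.min(axis=0), states.max(axis=0)

def example_normalize(state, minmax):
    # scale a state into [0, 1] using precomputed bounds, guarding zero-width ranges
    mn, mx = minmax
    rng = np.where(mx - mn == 0, 1.0, mx - mn)
    return (np.asarray(state, dtype=float) - mn) / rng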
def pretrain(args, true_environment, desired, num_actions, state_class, states, resps, targets, criteria, reward_fns):
    # args = get_args()
    # true_environment = Paddle()
    # true_environment = PaddleNoBlocks()
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args)
    environments = option_chain.initialize(args)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    if args.load_weights:
        train_models = proxy_environment.models
    else:
        train_models = MultiOption(1, models[args.model_form])
    head, tail = get_edge(args.train_edge)
    print(args.state_names, args.state_forms)
    print(state_class.minmax)
    # behavior_policy = EpsilonGreedyProbs()
    save_dir = args.save_graph
    if args.save_graph == "graph":
        save_dir = option_chain.save_dir
    proxy_environment.initialize(args, proxy_chain, reward_fns, state_class, behavior_policy=None)
    fit(args, save_dir, true_environment, train_models, state_class, desired, states, resps, targets,
        num_actions, criteria, proxy_environment, reward_fns)
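# Illustrative sketch only: the project's fit() is defined elsewhere. Assuming the pretraining
# targets are soft action distributions (as produced by generate_soft_dataset below), a single
# supervised update would look roughly like the standalone torch snippet here. `TinyPolicy` and
# `pretrain_step` are hypothetical stand-ins, not this codebase's classes or its actual objective.
import torch
import torch.nn as nn

class TinyPolicy(nn.Module):
    def __init__(self, state_dim, num_actions):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, 64), nn.ReLU(), nn.Linear(64, num_actions))

    def forward(self, x):
        # log-probabilities over actions
        return torch.log_softmax(self.net(x), dim=-1)

def pretrain_step(model, optimizer, states, targets):
    # cross-entropy between predicted log-probabilities and soft target distributions
    log_probs = model(states)
    loss = -(targets * log_probs).sum(dim=-1).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()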
def generate_soft_dataset(states, resps, true_environment, reward_fns, args):
    pre_load_weights = args.load_weights
    args.load_weights = True
    option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args)
    print(args.load_weights)
    environments = option_chain.initialize(args)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    train_models = proxy_environment.models
    head, tail = get_edge(args.train_edge)
    if len(environments) > 1:  # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
    proxy_environment.initialize(args, proxy_chain, reward_fns, state_class, behavior_policy=None)
    train_models.initialize(args, len(reward_fns), state_class, num_actions)
    train_models.session(args)
    proxy_environment.duplicate(args)  # assumes that we are loading weights
    args.load_weights = pre_load_weights  # restore the original flag
    soft_actions = [[] for i in range(train_models.num_options)]
    for oidx in range(train_models.num_options):
        train_models.option_index = oidx
        if args.model_form == 'population':
            train_models.currentModel().use_mean = True
        # run the recorded states through the current option's policy in minibatches of 30
        for i in range(len(states) // 30 + 1):
            state = states[i * 30:(i + 1) * 30]
            resp = resps[i * 30:(i + 1) * 30]
            if len(state) == 0:  # skip the empty trailing slice when len(states) is a multiple of 30
                continue
            values, dist_entropy, action_probs, Q_vals = train_models.determine_action(
                pytorch_model.wrap(state, cuda=args.cuda),
                pytorch_model.wrap(resp, cuda=args.cuda))
            # print(action_probs)
            values, action_probs, Q_vals = train_models.get_action(values, action_probs, Q_vals)
            soft_actions[oidx] += pytorch_model.unwrap(action_probs).tolist()
    print("soft actions", np.sum(np.array(soft_actions[0]), axis=0))
    for i in range(len(soft_actions)):
        soft_actions[i] = smooth_weight(soft_actions[i], args.weighting_lambda)
    return np.array(soft_actions)
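# Illustrative sketch only: smooth_weight is defined elsewhere in the codebase. If it performs
# standard label smoothing (an assumption), each row of action probabilities is mixed with the
# uniform distribution, weighted by args.weighting_lambda, as below. `example_smooth` is a
# hypothetical name, not the project's function.
import numpy as np

def example_smooth(action_probs, weighting_lambda):
    # mix each probability row with the uniform distribution over actions
    probs = np.asarray(action_probs, dtype=float)
    num_actions = probs.shape[-1]
    return (1.0 - weighting_lambda) * probs + weighting_lambda / num_actions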
                                              frameskip=args.frameskip)
    else:
        true_environment = None  # TODO: implement
elif args.env == 'SelfBreakout':
    if args.true_environment:
        true_environment = Screen(frameskip=args.frameskip)
    else:
        true_environment = FocusEnvironment(model, display=args.display_focus)
elif args.env.find('Atari') != -1:
    true_environment = FocusAtariEnvironment(model, args.env[len("Atari"):], args.seed, 0, args.save_dir)
dataset_path = args.record_rollouts
changepoint_path = args.changepoint_dir
option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args)
reward_paths = glob.glob(os.path.join(option_chain.save_dir, "*rwd.pkl"))
print(reward_paths)
reward_paths.sort(key=lambda x: int(x.split("__")[2]))
head, tail = get_edge(args.train_edge)
if args.reward_form == 'rawdist' and args.env == 'SelfPusher':
    true_environment.use_distance_reward()
    args.reward_form = 'raw'
if args.reward_form != 'raw':
    reward_classes = [load_from_pickle(pth) for pth in reward_paths]
    for rc in reward_classes:
        if type(rc) == ChangepointMarkovReward:
            rc.markovModel = rc.markovModel.cuda(args.gpu)
else:
    reward_classes = [RawReward(args)]
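# Illustrative sketch only: the sort key above pulls the numeric index out of each reward
# filename so that "...__10__rwd.pkl" follows "...__2__rwd.pkl" instead of sorting lexically.
# The filenames below are hypothetical examples of the "*rwd.pkl" pattern, not real saved files.
example_paths = [
    "Action->Paddle__paddle__10__rwd.pkl",
    "Action->Paddle__paddle__2__rwd.pkl",
    "Action->Paddle__paddle__1__rwd.pkl",
]
example_sorted = sorted(example_paths, key=lambda x: int(x.split("__")[2]))
# example_sorted orders the indices 1, 2, 10 (lexical sorting would give 1, 10, 2)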
# python rl_template.py --model-form basic --optimizer-form DQN --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 10000 --save-dir data/test
# python rl_template.py --model-form tab --optimizer-form TabQ --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 10000 --save-dir data/test
# with 3 rewards: python rl_template.py --model-form basic --optimizer-form DQN --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 1000 --save-dir data/test --num-update-model 1
# python rl_template.py --model-form tab --optimizer-form TabQ --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 1000 --save-dir data/test --num-update-model 1
args = get_args()
true_environment = ChainMDP(30)
# train_models = MultiOption(1, BasicModel)
reward_classes = [RewardLeft(None, args), RewardCenter(None, args), RewardRight(None, args)]
train_models = MultiOption(len(reward_classes), models[args.model_form])
learning_algorithm = learning_algorithms[args.optimizer_form]()
option_chain = OptionChain(true_environment, args.record_rollouts, args.train_edge, args)  # here, train_edge should act like a save folder
minmax = (0, 30)
state_class = GetRaw(3, minmax=minmax, state_shape=[1])
behavior_policy = EpsilonGreedyQ()
# behavior_policy = EpsilonGreedyProbs()
proxy_chain = option_chain.initialize(args)  # the last term is None since the last environment is not yet made
proxy_chain.pop(-1)
print(proxy_chain)
trainRL(args, option_chain.save_dir, true_environment, train_models, learning_algorithm,