    def generate_changepoints(self, data, save_dict=False):
        seg_models, changepoints = generate_changepoints([self.model_class], self.params, data)
        if save_dict:
            changepoints = np.array(changepoints[::-1], dtype=np.int64)
            for cpt, seg_model in zip(changepoints, seg_models):
                print(cpt)
                print("model: \n", seg_model.A)
                print("data: \n", seg_model.data)
                print("predictions: \n", np.dot(seg_model.data, seg_model.A))
                print("diff: ", seg_model.diff)
                print("log likelihood: ", seg_model.logLikelihood)
            print(changepoints)
            # correlate_data(seg_models, changepoints, action_data, args.num_frames, data)
            # correlate_data(seg_models, changepoints, paddle_data, args.num_frames, data, prox_distance=5)
            cp_dict = {cp: m for cp, m in zip(changepoints, seg_models)}
            # print(cp_dict)
            cp_dict[-2] = args.num_frames
            # print(cp_dict)
            save_to_pickle(
                os.path.join(args.record_rollouts, 'changepoints-' + self.head + '.pkl'), cp_dict)
            # with open(os.path.join(args.record_rollouts, 'changepoints-' + self.head + '.pkl'), 'wb') as fid:
            #     pickle.dump(cp_dict, fid)
        return seg_models, changepoints
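
# train_dopamine: rolls a Dopamine-style agent (begin_episode / forward / end_episode interface)
# through the proxy environment for args.num_iters iterations of up to args.num_steps steps each,
# feeding it either the true environment reward or the proxy reward, and logging per-option
# action counts, rewards, and timing every args.log_interval iterations.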
def train_dopamine(args, save_path, true_environment, train_models, proxy_environment,
                   proxy_chain, reward_classes, state_class, num_actions, behavior_policy):
    print("#######")
    print("Training Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle)
    snum = args.num_stack
    args.num_stack = 1
    proxy_environment.initialize(args, proxy_chain, reward_classes, state_class, behavior_policy)
    args.num_stack = snum
    if args.save_models:
        save_to_pickle(os.path.join(save_path, "env.pkl"), proxy_environment)
    behavior_policy.initialize(args, num_actions)
    train_models.initialize(args, len(reward_classes), state_class, proxy_environment.action_size)
    proxy_environment.set_models(train_models)
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    raw_state = base_env.getState()
    cp_state = proxy_environment.changepoint_state([raw_state])
    # print("initial_state (s, hs, rs, cps)", state, hist_state, raw_state, cp_state)
    # print(cp_state.shape, state.shape, hist_state.shape, state_class.shape)
    # rollouts = RolloutOptionStorage(args.num_processes, (state_class.shape,), proxy_environment.action_size, cr.flatten().shape[0],
    #     state.shape, hist_state.shape, args.buffer_steps, args.changepoint_queue_len, args.trace_len,
    #     args.trace_queue_len, args.dilated_stack, args.target_stack, args.dilated_queue_len,
    #     train_models.currentOptionParam().shape[1:], len(train_models.models), cp_state[0].shape,
    #     args.lag_num, args.cuda)
    option_actions = {option.name: collections.Counter() for option in train_models.models}
    total_duration = 0
    total_elapsed = 0
    true_reward = 0
    ep_reward = 0
    start = time.time()
    fcnt = 0
    final_rewards = list()
    option_counter = collections.Counter()
    option_value = collections.Counter()
    print(hist_state)
    val = None
    train_models.currentModel().begin_episode(pytorch_model.unwrap(hist_state))
    for j in range(args.num_iters):
        raw_actions = []
        last_total_steps, total_steps = 0, 0
        for step in range(args.num_steps):
            # start = time.time()
            fcnt += 1
            total_steps += 1
            current_state, current_resp = proxy_environment.getHistState()
            estate = proxy_environment.getState()
            if args.true_environment:
                reward = pytorch_model.wrap([[base_env.reward]])
            else:
                reward = proxy_environment.computeReward(1)
            true_reward += base_env.reward
            ep_reward += base_env.reward
            # print(current_state, reward[train_models.option_index])
            action = train_models.currentModel().forward(
                current_state, pytorch_model.unwrap(reward[train_models.option_index]))
            # print("ap", action)
            action = pytorch_model.wrap([action])
            cp_state = proxy_environment.changepoint_state([raw_state])
            # print(state, action)
            # print("step states (cs, s, cps, act)", current_state, estate, cp_state, action)
            # print("step outputs (val, de, ap, qv, v, ap, qv)", values, dist_entropy, action_probs, Q_vals, v, ap, qv)
            state, raw_state, resp, done, action_list = proxy_environment.step(
                action, model=False)  # , render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt
            # print("step check (al, s)", action_list, state)
            # learning_algorithm.interUpdateModel(step)
            #### logging
            option_actions[train_models.currentName()][int(pytorch_model.unwrap(action.squeeze()))] += 1
            #### logging
            # print(train_models.currentModel().dope_rainbow)
            if done:
                # print("reached end")
                print("Episode Reward: ", ep_reward)
                ep_reward = 0
                train_models.currentModel().end_episode(
                    pytorch_model.unwrap(reward[train_models.option_index]))
                state, resp = proxy_environment.getHistState()
                train_models.currentModel().begin_episode(pytorch_model.unwrap(state))
                # print(step)
                break
        # var = [v for v in tf.trainable_variables() if v.name == "Online/fully_connected/weights:0"][0]
        # nval = train_models.currentModel().sess.run(var)
        # if val is not None:
        #     print(var, np.sum(abs(nval - val)), train_models.currentModel().dope_rainbow.eval_mode)
        # val = nval
        current_state = proxy_environment.getHistState()
        # print(state, action)
        # print("step states (cs, s, cps, act)", current_state, estate, cp_state, action)
        # print("step outputs (val, de, ap, qv, v, ap, qv)", values, dist_entropy, action_probs, Q_vals, v, ap, qv)
        cp_state = proxy_environment.changepoint_state([raw_state])
        # print("states and actions (es, cs, a, m)", rollouts.extracted_state, rollouts.current_state, rollouts.actions, rollouts.masks)
        # print("actions and Qvals (qv, vp, ap)", rollouts.Qvals, rollouts.value_preds, rollouts.action_probs)
        total_duration += step + 1
        # print("rewards", rewards)
        # rollouts.insert_rewards(rewards)
        # print(rollouts.extracted_state)
        # print(rewards)
        # rollouts.compute_returns(args, values)
        # print("returns and rewards (rew, ret)", rollouts.rewards, rollouts.returns)
        # print("returns and return queue", rollouts.returns, rollouts.return_queue)
        # print("reward check (cs, rw, rol rw, rt", rollouts.current_state, rewards, rollouts.rewards, rollouts.returns)
        name = train_models.currentName()
        # print(name, rollouts.extracted_state, rollouts.rewards, rollouts.actions)
        #### logging
        option_counter[name] += step + 1
        option_value[name] += true_reward
        #### logging
        if j % args.save_interval == 0 and args.save_models and args.train:  # no point in saving if not training
            print("=========SAVING MODELS==========")
            train_models.save(save_path)  # TODO: implement save_options
        #### logging
        if j % args.log_interval == 0:
            # print("Qvalue and state", pytorch_model.unwrap(Q_vals.squeeze()), pytorch_model.unwrap(current_state.squeeze()))
            # print("probs and state", pytorch_model.unwrap(action_probs.squeeze()), pytorch_model.unwrap(current_state.squeeze()))
            for name in train_models.names():
                if option_counter[name] > 0:
                    print(name, option_value[name] / option_counter[name],
                          [option_actions[name][i] / option_counter[name]
                           for i in range(len(option_actions[name]))])
                if j % (args.log_interval * 20) == 0:
                    option_value[name] = 0
                    option_counter[name] = 0
                    for i in range(len(option_actions[name])):
                        option_actions[name][i] = 0
            end = time.time()
            total_elapsed += total_duration
            log_stats = "Updates {}, num timesteps {}, FPS {}, reward {}".format(
                j, total_elapsed, int(total_elapsed / (end - start)),
                true_reward / (args.num_steps * args.log_interval))
            print(log_stats)
            true_reward = 0.0
            total_duration = 0
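
# The block below builds one reward function per changepoint mapping: it fits changepoint
# statistics with ChangepointModels, optionally trains each reward function on the assembled
# training set, and pickles the results under args.changepoint_dir/args.train_edge.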
option_determiner_model = ChangepointModels(args, changepoint_model, transforms, clusters, determiner)
option_determiner_model.changepoint_statistics(models, changepoints, trajectory, correlate_trajectory)
try:
    os.makedirs(os.path.join(args.changepoint_dir, args.train_edge))
except OSError:
    pass  # folder already created
print(args.changepoint_dir)
reward_fns = []
for i in range(option_determiner_model.determiner.num_mappings):
    reward_function = reward_forms[args.reward_form](option_determiner_model, args, i)
    if args.train:
        reward_function.generate_training_set(combined, models, np.array(changepoints))
        reward_function.train_rewards(20000)
    save_to_pickle(
        os.path.join(args.changepoint_dir, args.train_edge,
                     "reward__function__" + str(i) + "__rwd.pkl"), reward_function)
    reward_fns.append(reward_function)
# if args.train:
#     minvar = np.min(np.max([rf.markovModel.variance.tolist() for rf in reward_fns]), axis=0)
#     print(minvar)
#     for i, rf in enumerate(reward_fns):
#         rf.setvar(minvar)
#         save_to_pickle(os.path.join(args.changepoint_dir, args.train_edge, "reward__function__" + str(i) + "__rwd.pkl"), rf)
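
# testRL: evaluation loop that runs each learned option in turn for args.num_iters steps,
# stores the transitions in a RolloutOptionStorage buffer, computes the proxy rewards and
# returns after the fact, and reports the per-option average reward.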
def testRL(args, save_path, true_environment, proxy_chain, proxy_environment,
           state_class, behavior_policy, num_actions, reward_classes=None):
    print("#######")
    print("Evaluating Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle)
    if reward_classes is not None:
        proxy_environment.reward_fns = reward_classes
    args.changepoint_queue_len = max(args.changepoint_queue_len, args.num_iters)
    proxy_environment.initialize(args, proxy_chain, proxy_environment.reward_fns,
                                 proxy_environment.stateExtractor, behavior_policy)
    print(base_env.save_path)
    behavior_policy.initialize(args, num_actions)
    train_models = proxy_environment.models
    train_models.initialize(args, len(reward_classes), state_class, num_actions)
    proxy_environment.duplicate(args)
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    resp = proxy_environment.getResp()
    print(state.shape)
    raw_state = base_env.getState()
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    cp_state = proxy_environment.changepoint_state([raw_state])
    rollouts = RolloutOptionStorage(
        args.num_processes, (state_class.shape, ), proxy_environment.action_size,
        cr.flatten().shape[0], state.shape, hist_state.shape, args.buffer_steps,
        args.changepoint_queue_len, args.trace_len, args.trace_queue_len,
        args.dilated_stack, args.target_stack, args.dilated_queue_len,
        train_models.currentOptionParam().shape[1:], len(train_models.models),
        cp_state[0].shape, args.lag_num, args.cuda)
    option_actions = {option.name: collections.Counter() for option in train_models.models}
    total_duration = 0
    start = time.time()
    fcnt = 0
    final_rewards = list()
    option_counter = collections.Counter()
    option_value = collections.Counter()
    raw_states = dict()
    ep_reward = 0
    rollouts.set_parameters(args.num_iters * train_models.num_options)
    # if args.num_iters > rollouts.changepoint_queue_len:
    #     rollouts.set_changepoint_queue(args.num_iters)
    done = False
    for i in range(train_models.num_options):
        train_models.option_index = i
        train_models.currentModel().test = True
        raw_states[train_models.currentName()] = []
        for j in range(args.num_iters):
            fcnt += 1
            raw_actions = []
            rollouts.cuda()
            current_state, current_resp = proxy_environment.getHistState()
            values, dist_entropy, action_probs, Q_vals = train_models.determine_action(
                current_state.unsqueeze(0), current_resp.unsqueeze(0))
            v, ap, qv = train_models.get_action(values, action_probs, Q_vals)
            cp_state = proxy_environment.changepoint_state([raw_state])
            ep_reward += base_env.reward
            # print(ap, qv)
            action = behavior_policy.take_action(ap, qv)
            rollouts.insert(
                False, state, current_state,
                pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda), done,
                current_resp, action, cp_state[0], train_models.currentOptionParam(),
                train_models.option_index, None, None, action_probs, Q_vals, values)
            state, raw_state, resp, done, action_list = proxy_environment.step(
                action, model=False)  # , render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt
            raw_states[train_models.currentName()].append(raw_state)
            option_actions[train_models.currentName()][int(pytorch_model.unwrap(action.squeeze()))] += 1
            if done:
                print("Episode Reward: ", ep_reward, " ", fcnt)
                ep_reward = 0
                # print("reached end")
        # proxy_environment.determine_swaps(length, needs_rewards=True) # doesn't need to generate rewards
        print(args.num_iters)
        print(action_probs)
        print("Episode Reward: ", ep_reward, " ", fcnt)
        rewards = proxy_environment.computeReward(args.num_iters)
        # print(rewards.shape)
        print(rewards.sum())
        rollouts.insert_rewards(rewards, total_duration)
        total_duration += j
        rollouts.compute_returns(args, values)
        rollouts.cpu()
        save_rols = copy.copy(rollouts)
        save_to_pickle(os.path.join(args.save_dir, "rollouts.pkl"), save_rols)
        reward_total = rollouts.rewards.sum(dim=1)[train_models.option_index] / args.num_iters
        print("Rewards for Policy:", reward_total)
if __name__ == "__main__":
    # python ChangepointDetection/CHAMP.py --train-edge "Action->Paddle" --record-rollouts data/random/ --champ-parameters "Paddle"
    # python ChangepointDetection/CHAMP.py --train-edge "Paddle->Ball" --record-rollouts data/integrationpaddle/ --champ-parameters "Ball" > integration/ballCHAMP.txt
    args = get_args()
    detector = CHAMPDetector(args.train_edge, args.champ_parameters)
    data = detector.load_obj_dumps(args, dumps_name=args.focus_dumps_name)
    print(data[:100])
    models, changepoints = detector.generate_changepoints(data, save_dict=True)
    save_to_pickle(
        os.path.join(args.record_rollouts, 'detector-' + detector.head + '.pkl'), detector)
    # with open(os.path.join(args.record_rollouts, 'detector-' + detector.head + '.pkl'), 'wb') as fid:
    #     pickle.dump(detector, fid)
    # All data must now follow obj_dumps methodology
    # args = get_args()
    # obj_dumps = read_obj_dumps(args.record_rollouts, get_last=100000)
    # paddle_data = np.array(get_individual_data('Paddle', obj_dumps, pos_val_hash=1))
    # ball_data = np.array(get_individual_data('Ball', obj_dumps, pos_val_hash=1))
    # action_data = get_individual_data('Action', obj_dumps, pos_val_hash=2)
    # action_data = np.array(hot_actions(action_data))
    # ### PADDLE DATA ###
    # if args.train_edge == "Paddle":
    #     data = paddle_data[-args.num_frames-2:, :2]  # paddle
    # ### BALL DATA ###
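
# Variant of testRL that switches to a randomly chosen option every args.num_update_model steps,
# renders frames with OpenCV while evaluating, and reports both the overall reward per option and
# the reward restricted to the steps on which that option was actually active.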
def testRL(args, save_path, true_environment, proxy_chain, proxy_environment,
           state_class, behavior_policy, num_actions, reward_classes=None):
    print("#######")
    print("Evaluating Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle)
    if reward_classes is not None:
        proxy_environment.reward_fns = reward_classes
    args.changepoint_queue_len = max(args.changepoint_queue_len, args.num_iters * args.num_update_model)
    proxy_environment.initialize(args, proxy_chain, proxy_environment.reward_fns,
                                 state_class, behavior_policy)
    print(base_env.save_path)
    behavior_policy.initialize(args, num_actions)
    train_models = proxy_environment.models
    train_models.initialize(args, len(reward_classes), state_class, num_actions)
    proxy_environment.duplicate(args)
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    resp = proxy_environment.getResp()
    print(state.shape)
    raw_state = base_env.getState()
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    cp_state = proxy_environment.changepoint_state([raw_state])
    rollouts = RolloutOptionStorage(
        args.num_processes, (state_class.shape, ), proxy_environment.action_size,
        cr.flatten().shape[0], state.shape, hist_state.shape, args.buffer_steps,
        args.changepoint_queue_len, args.trace_len, args.trace_queue_len,
        args.dilated_stack, args.target_stack, args.dilated_queue_len,
        train_models.currentOptionParam().shape[1:], len(train_models.models),
        cp_state[0].shape, args.lag_num, args.cuda)
    option_actions = {option.name: collections.Counter() for option in train_models.models}
    total_duration = 0
    start = time.time()
    fcnt = 0
    final_rewards = list()
    option_counter = collections.Counter()
    option_value = collections.Counter()
    raw_states = dict()
    ep_reward = 0
    rollouts.set_parameters(args.num_iters * args.num_update_model)
    # if args.num_iters > rollouts.changepoint_queue_len:
    #     rollouts.set_changepoint_queue(args.num_iters)
    done = False
    ctr = 0
    raw_indexes = dict()
    for i in range(args.num_iters):
        train_models.option_index = np.random.randint(train_models.num_options)
        train_models.currentModel().test = True
        if train_models.currentName() not in raw_states:
            raw_states[train_models.currentName()] = []
            raw_indexes[train_models.currentName()] = []
        for j in range(args.num_update_model):
            raw_indexes[train_models.currentName()].append(ctr)
            ctr += 1
            fcnt += 1
            raw_actions = []
            rollouts.cuda()
            current_state, current_resp = proxy_environment.getHistState()
            values, log_probs, action_probs, Q_vals = train_models.determine_action(
                current_state.unsqueeze(0), current_resp.unsqueeze(0))
            v, ap, lp, qv = train_models.get_action(values, action_probs, log_probs, Q_vals)
            cp_state = proxy_environment.changepoint_state([raw_state])
            ep_reward += base_env.reward
            # print(ap, qv)
            action = behavior_policy.take_action(ap, qv)
            # print(train_models.currentName(), action, qv.squeeze())
            rollouts.insert(
                False, state, current_state,
                pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda), done,
                current_resp, action, cp_state[0], train_models.currentOptionParam(),
                train_models.option_index, None, None, action_probs, Q_vals, values)
            state, raw_state, resp, done, action_list = proxy_environment.step(
                action, model=False)  # , render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt
            print(train_models.currentName(), j, action)
            cv2.imshow('frame', raw_state[0])
            if cv2.waitKey(50) & 0xFF == ord('q'):
                break
            raw_states[train_models.currentName()].append(raw_state)
            option_actions[train_models.currentName()][int(pytorch_model.unwrap(action.squeeze()))] += 1
            if done:
                print("Episode Reward: ", ep_reward, " ", fcnt)
                ep_reward = 0
                # print("reached end")
        # proxy_environment.determine_swacurrent_durationps(length, needs_rewards=True) # doesn't need to generate rewards
        # rewards = proxy_environment.computeReward(args.num_update_model)
        # print(rewards)
        if len(base_env.episode_rewards) > 0:
            true_reward = np.median(base_env.episode_rewards)
            mean_reward = np.mean(base_env.episode_rewards)
            best_reward = np.max(base_env.episode_rewards)
            print("true reward median: %f, mean: %f, max: %f" % (true_reward, mean_reward, best_reward))
    print(args.num_iters)
    print(action_probs)
    print("Episode Reward: ", ep_reward, " ", fcnt)
    print(proxy_environment.reward_fns)
    rewards = proxy_environment.computeReward(args.num_iters * args.num_update_model)
    # print(rewards.shape)
    # print(rewards.sum())
    rollouts.insert_rewards(args, rewards)
    total_duration += j
    save_rols = copy.deepcopy(rollouts)
    if len(args.save_dir) > 0:
        save_to_pickle(os.path.join(args.save_dir, "rollouts.pkl"), save_rols)
    for i in range(train_models.num_options):
        print(rollouts.base_rollouts.rewards.shape, raw_indexes)
        reward_total = rollouts.base_rollouts.rewards.sum(dim=1)[i] / (args.num_iters * args.num_update_model)
        # print(rollouts.base_rollouts.rewards, raw_indexes, rollouts.base_rollouts.rewards.shape)
        reward_adjusted = rollouts.base_rollouts.rewards[
            i, np.array(raw_indexes[train_models.models[i].name]) + args.num_stack].sum(dim=0) / len(
                raw_indexes[train_models.models[i].name])
        print("Num policy steps:", len(raw_indexes[train_models.models[i].name]))
        print("Rewards during Policy:", reward_adjusted)
        print("Rewards for Policy:", reward_total)
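
# trainRL: main option-training loop. Each outer iteration collects up to
# learning_algorithm.current_duration * args.reward_check environment steps through the proxy
# environment, inserts them (with hindsight targets and traces) into RolloutOptionStorage,
# computes proxy rewards every args.reward_check steps, and then takes a learning_algorithm.step
# once past the warm-up period, with optional sparsity, diversity, epsilon-decay, and
# sample-schedule updates.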
def trainRL(args, save_path, true_environment, train_models, learning_algorithm,
            proxy_environment, proxy_chain, reward_classes, state_class, behavior_policy):
    print("#######")
    print("Training Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle, args.single_save_dir)
    proxy_environment.initialize(args, proxy_chain, reward_classes, state_class, behavior_policy)
    if args.save_models:
        if args.env.find("Atari") != -1:
            screen = base_env.screen
            base_env.screen = None
        save_to_pickle(os.path.join(save_path, "env.pkl"), proxy_environment)
        if args.env.find("Atari") != -1:
            base_env.screen = screen
    behavior_policy.initialize(args, proxy_environment.action_size)
    print(reward_classes[0], reward_classes[0].parameter_minmax)
    if not args.load_weights:
        train_models.initialize(args, len(reward_classes), state_class,
                                proxy_environment.action_size,
                                parameter_minmax=reward_classes[0].parameter_minmax)
        proxy_environment.set_models(train_models)
    else:
        print("loading weights", len(reward_classes))
        train_models.initialize(args, len(reward_classes), state_class,
                                proxy_environment.action_size,
                                parameter_minmax=reward_classes[0].parameter_minmax)
    train_models.session(args)
    proxy_environment.duplicate(args)
    train_models.train()
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    learning_algorithm.initialize(args, train_models, reward_classes=reward_classes)
    print(proxy_environment.get_names())
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    raw_state = base_env.getState()
    resp = proxy_environment.getResp()
    cp_state = proxy_environment.changepoint_state([raw_state])
    # print("initial_state (s, hs, rs, cps)", state, hist_state, raw_state, cp_state)
    # print(cp_state.shape, state.shape, hist_state.shape, state_class.shape)
    print(args.trace_len, args.trace_queue_len)
    args.buffer_clip = max(args.buffer_clip, args.reward_check)
    rollouts = RolloutOptionStorage(
        args.num_processes, (state_class.shape, ), proxy_environment.action_size,
        cr.flatten().shape[0], state.shape, hist_state.shape, args.buffer_steps,
        args.changepoint_queue_len, args.trace_len, args.trace_queue_len,
        args.dilated_stack, args.target_stack, args.dilated_queue_len,
        train_models.currentOptionParam().shape[1:], len(train_models.models),
        cp_state[0].shape, args.lag_num, args.cuda, return_form=args.return_form)
    option_actions = {option.name: collections.Counter() for option in train_models.models}
    total_duration = 0
    total_elapsed = 0
    true_reward = 0
    ep_reward = 0
    sample_schedule = args.sample_schedule
    start = time.time()
    fcnt = 0
    final_rewards = list()
    average_rewards, average_counts = [], []
    option_counter = collections.Counter()
    option_value = collections.Counter()
    trace_queue = []  # keep the last states until end of trajectory (or until a reset), and dump when a reward is found
    retest = False
    done = False
    for j in range(args.num_iters):
        rollouts.set_parameters(learning_algorithm.current_duration * args.reward_check)
        # print("set_parameters", state)
        raw_actions = []
        rollouts.cuda()
        last_total_steps, total_steps = 0, 0
        s = time.time()
        for step in range(learning_algorithm.current_duration):
            for m in range(args.reward_check):
                fcnt += 1
                total_steps += 1
                current_state, current_resp = proxy_environment.getHistState()
                estate = proxy_environment.getState()
                values, log_probs, action_probs, Q_vals = train_models.determine_action(
                    current_state.unsqueeze(0), current_resp.unsqueeze(0), use_grad=False)
                v, ap, lp, qv = train_models.get_action(values, action_probs, log_probs, Q_vals)
                # a = time.time()
                # print("choose action", a-s)
                # print(action_probs, Q_vals, ap, lp, qv)
                action = behavior_policy.take_action(ap, qv)
                cp_state = proxy_environment.changepoint_state([raw_state])
                # print(state, action)
                # print("before_insert", state)
                # print(current_state.reshape((4,84,84))[0].cpu().numpy().shape)
                # cv2.imshow('frame',current_state.reshape((4,84,84))[0].cpu().numpy())
                # if cv2.waitKey(1) & 0xFF == ord('q'):
                #     pass
                # print(action, true_environment.paddle.pos, true_environment.ball.vel, true_environment.ball.pos)
                if args.behavior_policy == "dem" or args.visualize:
                    cv2.imshow('frame', raw_state[0].reshape((84, 84)))
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        pass
                # cv2.imshow('frame',raw_state[0].reshape((84,84)))
                # if cv2.waitKey(1) & 0xFF == ord('q'):
                #     pass
                rollouts.insert(
                    retest, state, current_state,
                    pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda), done,
                    current_resp, action, cp_state[0], train_models.currentOptionParam(),
                    train_models.option_index, None, None, action_probs, Q_vals, values)
                rollouts.insert_dilation(proxy_environment.swap)
                retest = False
                # print("step states (cs, ns, cps, act)", current_state, estate, cp_state, action)
                # print("step outputs (val, de, ap, qv, v, ap, qv)", values, dist_entropy, action_probs, Q_vals, v, ap, qv)
                trace_queue.append((current_state.clone().detach(), action.clone().detach()))
                state, raw_state, resp, done, action_list = proxy_environment.step(
                    action, model=False)  # , render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt
                # print(action_list)
                # s = time.time()
                # print("step time", s-a)
                # print("after step", state)
                true_reward += base_env.reward
                ep_reward += base_env.reward
                if args.reward_form == 'raw':
                    for rc in reward_classes:
                        rc.insert_reward(base_env.reward)
                        # print(base_env.reward)
                # print(action_list, action)
                # print("step check (al, s)", action_list, state)
                #### logging
                option_actions[train_models.currentName()][int(pytorch_model.unwrap(action.squeeze()))] += 1
                #### logging
                if done:
                    print("Episode Reward: ", ep_reward, " ", fcnt, j)
                    ep_reward = 0
                    if not args.sample_duration > 0 or (args.done_swapping <= j):
                        # print("reached end")
                        # print(step)
                        if args.trace_queue_len > -1:
                            trace_queue = rollouts.insert_trace(trace_queue)
                        trace_queue = []
                        break
                    else:  # need to clear out trace queue
                        trace_queue = rollouts.insert_trace(trace_queue)
                        trace_queue = []
                # time.sleep(.1)
                # print(m, args.reward_check)
            # rl = time.time()
            # print("run loop", start - rl)
            rewards = proxy_environment.computeReward(m + 1)
            # print(rewards, proxy_environment.changepoint_queue)
            # print(rewards.sum())
            # a = time.time()
            # print("reward time", a-s)
            change, target = proxy_environment.determineChanged(m + 1)
            proxy_environment.determine_swaps(m + 1, needs_rewards=True)  # doesn't need to generate rewards
            # print("reward time", time.time() - start)
            # print("rewards", torch.sum(rewards))
            # reenter to get next value
            current_state, current_resp = proxy_environment.getHistState()
            values, log_probs, action_probs, Q_vals = train_models.determine_action(
                current_state.unsqueeze(0), current_resp.unsqueeze(0))
            v, ap, lp, qv = train_models.get_action(values, action_probs, log_probs, Q_vals)
            action = behavior_policy.take_action(ap, qv)
            trace_queue.append((current_state.clone().detach(), action.clone().detach()))
            cp_state = proxy_environment.changepoint_state([raw_state])
            rollouts.insert(
                retest, state, current_state,
                pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda), done,
                current_resp, action, cp_state[0], train_models.currentOptionParam(),
                train_models.option_index, None, None, action_probs, Q_vals,
                values)  # inserting the last state and unused action
            retest = True  # need to re-insert value with true state
            # ########
            rollouts.insert_hindsight_target(change, target)
            rollouts.insert_rewards(args, rewards)
            name = train_models.currentName()
            option_counter[name] += m + 1
            option_value[name] += rewards.sum(dim=1)[train_models.option_index]
            last_total_steps = total_steps
            completed = learning_algorithm.interUpdateModel(total_steps, rewards, change, done)
            # rw = time.time()
            # print("rewards", rl - rw, start - rw)
            if completed or (done and not args.sample_duration > 0):
                break
            retest = args.buffer_steps > 0 or args.lag_num > 0  # if we roll, don't retest
            # print("steptime", time.time() - start)
            # start = time.time()
            # print(done)
        # print(rollouts.base_rollouts.extracted_state, rollouts.base_rollouts.rewards)
        # print("rew, state", rollouts.rewards[0,-50:], rollouts.extracted_state[-50:])
        # print("inserttime", time.time() - start)
        # print("states and actions (es, cs, a, m)", rollouts.extracted_state, rollouts.current_state, rollouts.actions, rollouts.masks)
        # print("actions and Qvals (qv, vp, ap)", rollouts.Qvals, rollouts.value_preds, rollouts.action_probs)
        # start = time.time()
        total_duration += total_steps
        # if done:
        #     trace_queue = rollouts.insert_trace(trace_queue)
        #     trace_queue = []  # insert first
        # else:
        #     trace_queue = rollouts.insert_trace(trace_queue)
        # print(rollouts.extracted_state)
        # print(rewards)
        # rollouts.compute_returns(args, values) # don't need to compute returns because they are computed upon reward reception
        # print("returns and rewards (rew, ret)", rollouts.rewards, rollouts.returns)
        # print("returns and return queue", rollouts.returns, rollouts.return_queue)
        # print("reward check (cs, rw, rol rw, rt", rollouts.current_state, rewards, rollouts.rewards, rollouts.returns)
        # print(name, rollouts.extracted_state, rollouts.rewards, rollouts.actions)
        # n = 0
        # for obj in gc.get_objects():
        #     try:
        #         if torch.is_tensor(obj):
        #             n += 1
        #     except:
        #         pass
        # print("learning at", j, n)
        #### logging
        # print(rollouts.base_rollouts.rewards.shape)
        reward_total = rollouts.get_current(names=['rewards'])[0][train_models.option_index].sum(dim=0)
        # print("reward_total", reward_total.shape)
        final_rewards.append(reward_total)
        #### logging
        # start = time.time()
        learning_algorithm.step_counter += 1
        if j >= args.warm_up:  # TODO: clean up this to learning algorithm?
            value_loss, action_loss, dist_entropy, output_entropy, entropy_loss, action_log_probs = learning_algorithm.step(
                args, train_models, rollouts)
            if args.dist_interval != -1 and j % args.dist_interval == 0:
                learning_algorithm.distibutional_sparcity_step(args, train_models, rollouts)
                # print("di", time.time() - start)
            if args.correlate_steps > 0 and j % args.diversity_interval == 0:
                loss = learning_algorithm.correlate_diversity_step(args, train_models, rollouts)
                # print("corr", time.time() - start)
            if args.greedy_epsilon_decay > 0 and j % args.greedy_epsilon_decay == 0 and j != 0:
                behavior_policy.epsilon = max(args.min_greedy_epsilon,
                                              behavior_policy.epsilon * 0.9)  # TODO: more advanced greedy epsilon methods
            # print("eps", time.time() - start)
            if args.sample_schedule > 0 and j % sample_schedule == 0 and j != 0:
                learning_algorithm.sample_duration = (j // args.sample_schedule + 1) * args.sample_duration
                learning_algorithm.reset_current_duration(learning_algorithm.sample_duration, args.reward_check)
                args.changepoint_queue_len = max(learning_algorithm.max_duration, args.changepoint_queue_len)
                sample_schedule = args.sample_schedule * (j // args.sample_schedule + 1)
                # sum([args.sample_schedule * (i+1) for i in range(j // args.sample_schedule + 1)])
            if args.retest_schedule > 0 and j % args.retest_schedule == 0 and j != 0:
                learning_algorithm.retest += 1
                learning_algorithm.reset_current_duration(learning_algorithm.sample_duration, args.reward_check)
                args.changepoint_queue_len = max(learning_algorithm.max_duration, args.changepoint_queue_len)
            # print("resample", time.time() - start)
            if j > args.done_swapping:
                learning_algorithm.reset_current_duration(learning_algorithm.sample_duration, args.reward_check)
        else:
            value_loss, action_loss, dist_entropy, output_entropy, entropy_loss, action_log_probs = None, None, None, None, None, None
        parameter = proxy_environment.get_next_parameter()
        if args.reward_swapping:
            parameter = completed
        learning_algorithm.updateModel(parameter)
        # s = time.time()
        # print("learning step time", s-a)
        # n = 0
        # for obj in gc.get_objects():
        #     try:
        #         if torch.is_tensor(obj):
        #             n += 1
        #     except:
        #         pass
        # print("objects at", j, n)
        # print("update", time.time() - start)
        # print("learn time", time.time() - rw)
        if j % args.save_interval == 0 and args.save_models and args.train:  # no point in saving if not training
            print("=========SAVING MODELS==========")
            train_models.save(save_path)  # TODO: implement save_options
        #### logging
        if j % args.log_interval == 0:
            print("Qvalue and state", pytorch_model.unwrap(Q_vals.squeeze()),
                  pytorch_model.unwrap(current_state.squeeze()))
            print("probs and state", pytorch_model.unwrap(action_probs.squeeze()),
                  pytorch_model.unwrap(current_state.squeeze()))
            for name in train_models.names():
                if option_counter[name] > 0:
                    print(name, option_value[name] / option_counter[name],
                          [option_actions[name][i] / option_counter[name]
                           for i in range(len(option_actions[name]))])
                # if j % (args.log_interval * 20) == 0:
                option_value[name] = 0
                option_counter[name] = 0
                for i in range(len(option_actions[name])):
                    option_actions[name][i] = 0
            end = time.time()
            final_rewards = torch.stack(final_rewards).detach()
            average_rewards.append(final_rewards.sum())
            average_counts.append(total_duration)
            acount = np.sum(average_counts)
            best_reward = true_reward
            true_reward = true_reward / total_steps
            mean_reward = true_reward
            if len(base_env.episode_rewards) > 0:
                true_reward = np.median(base_env.episode_rewards)
                mean_reward = np.mean(base_env.episode_rewards)
                best_reward = np.max(base_env.episode_rewards)
            el, vl, al = unwrap_or_none(entropy_loss), unwrap_or_none(value_loss), unwrap_or_none(action_loss)
            total_elapsed += total_duration
            log_stats = "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {}, value loss {}, policy loss {}, average_reward {}, true_reward median: {}, mean: {}, max: {}".format(
                j, total_elapsed, int(total_elapsed / (end - start)), final_rewards.mean(),
                np.median(final_rewards.cpu()), final_rewards.min(), final_rewards.max(), el, vl, al,
                torch.stack(average_rewards).sum() / acount, true_reward, mean_reward, best_reward)
            if acount > 300:
                average_counts.pop(0)
                average_rewards.pop(0)
            true_reward = 0.0
            print(log_stats)
            final_rewards = list()
            total_duration = 0
        #### logging
    if args.save_models and args.train:  # no point in saving if not training
        print("=========SAVING MODELS==========")
        train_models.save(save_path)  # TODO: implement save_options
    proxy_environment.close_files()
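
# Note: unwrap_or_none is imported from elsewhere in the repository; a minimal sketch of the
# behavior assumed by the logging code above (an assumption, not the repository's definition):
#
#     def unwrap_or_none(value):
#         # convert a tensor loss to a plain value for logging, passing None through untouched
#         return pytorch_model.unwrap(value) if value is not None else None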