def mpc_step(self, env_model, obs, args, desired_goal):
    # Model-predictive action selection: roll out mpc_sample candidate action
    # sequences for mpc_step steps in the learned dynamics model and return the
    # first action of the sequence whose final state ends up closest to the goal.
    mpc_sample = 10
    mpc_step = 5
    pure_obs_batch = np.array(
        [obs['observation'] for _ in range(mpc_sample)])
    desired_goal_batch = np.array(
        [obs['desired_goal'] for _ in range(mpc_sample)])
    selected = False
    original_acts = None
    for x in range(mpc_step):
        pi_input = np.concatenate([pure_obs_batch, desired_goal_batch], axis=1)
        actions = self.step_batch(pi_input, explore=True, batch_size=mpc_sample)
        if not selected:
            # remember the first actions; one of them is returned at the end
            original_acts = actions.copy()
            selected = True
        # advance the whole batch of states in the learned dynamics model
        pure_obs_batch = step_fake_batch(
            env_model=env_model,
            obs=pure_obs_batch,
            action=actions,
            dims=args.env_param['step_fake_param'],
            distance_threshold=args.distance_threshold,
            args=args,
            batch=mpc_sample)
    # pick the sample whose final achieved goal is closest to the desired goal
    min_id = -1
    min_dis = np.inf
    for x in range(mpc_sample):
        achieved = pure_obs_batch[x][
            args.env_param['start_in_obs']:args.env_param['end_in_obs']]
        dis = goal_distance(desired_goal, achieved)
        if dis < min_dis:
            min_dis = dis
            min_id = x
    return original_acts[min_id]

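# Hedged usage sketch (not part of the original code): one way mpc_step could be
# driven from a rollout loop. `agent`, `env_model`, `env`, and `args` are assumed
# to be set up as in the surrounding training code; the call signature matches
# the method above.
#
#     obs = env.reset()
#     for t in range(args.timesteps):
#         # plan a short model-predictive rollout and take its first action
#         action = agent.mpc_step(env_model, obs, args, obs['desired_goal'])
#         obs, reward, done, info = env.step(action)
#         if done:
#             break
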
def learn(self, args, env, env_test, agent, buffer):
    initial_goals = []
    desired_goals = []
    for i in range(args.episodes):
        obs = self.env_List[i].reset()
        goal_a = obs['achieved_goal'].copy()
        goal_d = obs['desired_goal'].copy()
        initial_goals.append(goal_a.copy())
        desired_goals.append(goal_d.copy())
    self.sampler.update(initial_goals, desired_goals)

    achieved_trajectories = []
    achieved_init_states = []
    for i in range(args.episodes):
        obs = self.env_List[i].get_obs()
        init_state = obs['observation'].copy()
        explore_goal = self.sampler.sample(i)
        self.env_List[i].goal = explore_goal.copy()
        obs = self.env_List[i].get_obs()
        current = Trajectory(obs)
        trajectory = [obs['achieved_goal'].copy()]
        for timestep in range(args.timesteps):
            action = agent.step(obs, explore=True)
            obs, reward, done, info = self.env_List[i].step(action)
            trajectory.append(obs['achieved_goal'].copy())
            if timestep == args.timesteps - 1:
                done = True
            current.store_step(action, obs, reward, done)
            if done:
                break
        achieved_trajectories.append(np.array(trajectory))
        achieved_init_states.append(init_state)
        buffer.store_trajectory(current)
        agent.normalizer_update(buffer.sample_batch())
        if buffer.steps_counter >= args.warmup:
            for _ in range(args.train_batches):
                info = agent.train(buffer.sample_batch())
                args.logger.add_dict(info)
            agent.target_update()

    selection_trajectory_idx = {}
    for i in range(self.args.episodes):
        if goal_distance(achieved_trajectories[i][0],
                         achieved_trajectories[i][-1]) > 0.01:
            selection_trajectory_idx[i] = True
    for idx in selection_trajectory_idx.keys():
        self.achieved_trajectory_pool.insert(
            achieved_trajectories[idx].copy(),
            achieved_init_states[idx].copy())

def __init__(self, args, achieved_trajectory_pool):
    self.args = args
    self.env = make_env(args)
    self.env_test = make_env(args)
    self.dim = np.prod(self.env.reset()['achieved_goal'].shape)
    self.delta = self.env.distance_threshold
    self.length = args.episodes
    init_goal = self.env.reset()['achieved_goal'].copy()
    self.pool = np.tile(init_goal[np.newaxis, :], [self.length, 1]) + np.random.normal(
        0, self.delta, size=(self.length, self.dim))
    self.init_state = self.env.reset()['observation'].copy()
    self.match_lib = gcc_load_lib('learner/cost_flow.c')
    self.achieved_trajectory_pool = achieved_trajectory_pool

    # estimating diameter
    self.max_dis = 0
    for i in range(1000):
        obs = self.env.reset()
        dis = goal_distance(obs['achieved_goal'], obs['desired_goal'])
        if dis > self.max_dis:
            self.max_dis = dis

def compute_reward_direct(self, achieved, goal):
    dis = goal_distance(achieved, goal)
    return -1.0 if dis > self.distance_threshold else 0.0

def compute_reward(self, observation_current, observation_old, goal):
    dis = goal_distance(observation_current['achieved_goal'], goal)
    return -1.0 if dis > self.distance_threshold else 0.0

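# Minimal standalone sketch (illustrative, not repository code): the sparse reward
# rule used by the two functions above, assuming goal_distance is the Euclidean
# (L2) norm and a hypothetical distance_threshold of 0.05.
#
#     import numpy as np
#
#     def goal_distance_sketch(goal_a, goal_b):
#         return np.linalg.norm(goal_a - goal_b, axis=-1)
#
#     distance_threshold = 0.05
#     achieved = np.array([0.03, 0.0, 0.0])
#     desired = np.array([0.0, 0.0, 0.0])
#     reward = -1.0 if goal_distance_sketch(achieved, desired) > distance_threshold else 0.0
#     # reward == 0.0: the achieved goal lies within the threshold of the desired goal
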
def learn(self, args, env, env_test, agent, buffer, write_goals=0):
    # Actual learning cycle takes place here!
    initial_goals = []
    desired_goals = []
    goal_list = []
    # get initial position and goal from the environment for each episode
    for i in range(args.episodes):
        obs = self.env_List[i].reset()
        goal_a = obs['achieved_goal'].copy()
        goal_d = obs['desired_goal'].copy()
        initial_goals.append(goal_a.copy())
        desired_goals.append(goal_d.copy())

    # If HGG has not been stopped yet, perform the crucial HGG update step here:
    # by updating the sampler, a set of intermediate goals is computed and stored in the sampler,
    # based on distance to the target goal distribution, similarity of initial states and expected
    # reward (see paper), via bipartite matching.
    if not self.stop:
        self.sampler.update(initial_goals, desired_goals)
    if self.stop:
        buffer.stop_trade_off = True

    achieved_trajectories = []
    achieved_init_states = []
    explore_goals = []
    test_goals = []
    inside = []
    left_dis_total = 0
    for i in range(args.episodes):
        obs = self.env_List[i].get_obs()
        init_state = obs['observation'].copy()
        # If HGG has not been stopped yet, sample from the goals provided by the update step;
        # if it has been stopped, the goal to explore is simply the one generated by the environment.
        if not self.stop:
            explore_goal = self.sampler.sample(i)
        else:
            explore_goal = desired_goals[i]
        left_dis_total += self.sampler.get_graph_goal_distance(
            explore_goal, desired_goals[i])
        # store goals in explore_goals list to check later whether goals are within the goal space
        explore_goals.append(explore_goal)
        test_goal = self.env.generate_goal()
        if test_goal.shape[-1] == 7:
            test_goal = test_goal[3:]  # for some hand tasks
        test_goals.append(test_goal)

        # Perform HER training by interacting with the environment
        self.env_List[i].goal = explore_goal.copy()
        if write_goals != 0 and len(goal_list) < write_goals:
            goal_list.append(explore_goal.copy())
        obs = self.env_List[i].get_obs()
        current = Trajectory(obs)
        trajectory = [obs['achieved_goal'].copy()]
        for timestep in range(args.timesteps):
            # get action from the DDPG policy
            action = agent.step(obs, explore=True)
            # feed action to the environment, get observation and reward
            obs, reward, done, info = self.env_List[i].step(action)
            trajectory.append(obs['achieved_goal'].copy())
            if timestep == args.timesteps - 1:
                done = True
            current.store_step(action, obs, reward, done)
            if done:
                break

        achieved_trajectories.append(np.array(trajectory))
        achieved_init_states.append(init_state)
        # The trajectory is stored in the replay buffer; the replay buffer can be normal or EBP
        buffer.store_trajectory(current)
        agent.normalizer_update(buffer.sample_batch())
        if buffer.steps_counter >= args.warmup:
            for _ in range(args.train_batches):
                # train with hindsight goals (HER step)
                info = agent.train(buffer.sample_batch())
                args.logger.add_dict(info)
            # update target network
            agent.target_update()

    if left_dis_total == 0:
        buffer.dis_balance = 1000  # maximum
    else:
        buffer.dis_balance = args.balance_eta * pow(
            2.71, (-left_dis_total / args.episodes) /
            (args.balance_sigma * args.balance_sigma))

    selection_trajectory_idx = {}
    for i in range(self.args.episodes):
        # only add trajectories with movement to the trajectory pool --> use the default L2 distance measure!
        if goal_distance(achieved_trajectories[i][0],
                         achieved_trajectories[i][-1]) > 0.01:
            selection_trajectory_idx[i] = True
    for idx in selection_trajectory_idx.keys():
        self.achieved_trajectory_pool.insert(
            achieved_trajectories[idx].copy(),
            achieved_init_states[idx].copy())

    # Unless this is the first call: check which of the explore goals are inside the target goal space.
    # The target goal space is represented by a sample of test_goals generated directly from the
    # environment. An explore goal is considered inside the target goal space if it is closer than the
    # distance_threshold to one of the test goals (i.e. it would yield a non-negative reward if that
    # test goal were to be achieved).
    if self.learn_calls > 0:
        assert len(explore_goals) == len(test_goals)
        for ex in explore_goals:
            is_inside = 0
            for te in test_goals:
                # TODO: check: originally with self.sampler.get_graph_goal_distance,
                # now trying with goal_distance (L2)
                if goal_distance(ex, te) <= self.env.env.env.distance_threshold:
                    is_inside = 1
            inside.append(is_inside)
        assert len(inside) == len(test_goals)
        inside_sum = 0
        for i in inside:
            inside_sum += i
        # If more than stop_hgg_threshold (e.g. 0.9) of the explore goals are inside the target goal
        # space, stop HGG and continue with normal HER.
        # By default, stop_hgg_threshold is disabled (set to a value > 1).
        average_inside = inside_sum / len(inside)
        self.args.logger.info("Average inside: {}".format(average_inside))
        if average_inside > self.stop_hgg_threshold:
            self.stop = True
            self.args.logger.info("Continue with normal HER")

    self.learn_calls += 1

    return goal_list if len(goal_list) > 0 else None

def update(self, initial_goals, desired_goals):
    if self.achieved_trajectory_pool.counter == 0:
        self.pool = copy.deepcopy(desired_goals)
        return

    achieved_pool, achieved_pool_init_state = self.achieved_trajectory_pool.pad()
    candidate_goals = []
    candidate_edges = []
    candidate_id = []

    agent = self.args.agent
    achieved_value = []
    for i in range(len(achieved_pool)):
        obs = [
            goal_concat(achieved_pool_init_state[i], achieved_pool[i][j])
            for j in range(achieved_pool[i].shape[0])
        ]
        feed_dict = {agent.raw_obs_ph: obs}
        value = agent.sess.run(agent.q_pi, feed_dict)[:, 0]
        value = np.clip(value, -1.0 / (1.0 - self.args.gamma), 0)
        achieved_value.append(value.copy())

    n = 0
    graph_id = {'achieved': [], 'desired': []}
    for i in range(len(achieved_pool)):
        n += 1
        graph_id['achieved'].append(n)
    for i in range(len(desired_goals)):
        n += 1
        graph_id['desired'].append(n)
    n += 1
    self.match_lib.clear(n)

    for i in range(len(achieved_pool)):
        self.match_lib.add(0, graph_id['achieved'][i], 1, 0)
    for i in range(len(achieved_pool)):
        for j in range(len(desired_goals)):
            # use graph_goal_distance here!
            if self.args.graph:
                size = achieved_pool[i].shape[0]
                res_1 = np.zeros(size)
                for k in range(size):
                    res_1[k] = self.get_graph_goal_distance(
                        achieved_pool[i][k], desired_goals[j])
                res = res_1 - achieved_value[i] / (
                    self.args.hgg_L / self.max_dis / (1 - self.args.gamma))
            elif self.args.route and self.args.env == 'FetchPickObstacle-v1':
                size = achieved_pool[i].shape[0]
                res_1 = np.zeros(size)
                for k in range(size):
                    res_1[k] = self.get_route_goal_distance(
                        achieved_pool[i][k], desired_goals[j])
                res = res_1 - achieved_value[i] / (
                    self.args.hgg_L / self.max_dis / (1 - self.args.gamma))
            else:
                # original HGG formulation: plain Euclidean distance
                res = np.sqrt(
                    np.sum(np.square(achieved_pool[i] - desired_goals[j]),
                           axis=1)) - achieved_value[i] / (
                               self.args.hgg_L / self.max_dis /
                               (1 - self.args.gamma))
            # distance of initial positions: take the L2 norm as before
            match_dis = np.min(res) + goal_distance(
                achieved_pool[i][0], initial_goals[j]) * self.args.hgg_c
            match_idx = np.argmin(res)
            edge = self.match_lib.add(graph_id['achieved'][i],
                                      graph_id['desired'][j], 1,
                                      c_double(match_dis))
            candidate_goals.append(achieved_pool[i][match_idx])
            candidate_edges.append(edge)
            candidate_id.append(j)
    for i in range(len(desired_goals)):
        self.match_lib.add(graph_id['desired'][i], n, 1, 0)

    match_count = self.match_lib.cost_flow(0, n)
    assert match_count == self.length

    explore_goals = [0] * self.length
    for i in range(len(candidate_goals)):
        if self.match_lib.check_match(candidate_edges[i]) == 1:
            explore_goals[candidate_id[i]] = candidate_goals[i].copy()
    assert len(explore_goals) == self.length
    self.pool = np.array(explore_goals)

def update(self, initial_goals, desired_goals):
    if self.achieved_trajectory_pool.counter == 0:
        self.pool = copy.deepcopy(desired_goals)
        return

    achieved_pool, achieved_pool_init_state = self.achieved_trajectory_pool.pad()
    candidate_goals = []
    candidate_edges = []
    candidate_id = []

    agent = self.args.agent
    achieved_value = []
    for i in range(len(achieved_pool)):
        obs = [
            goal_concat(achieved_pool_init_state[i], achieved_pool[i][j])
            for j in range(achieved_pool[i].shape[0])
        ]
        feed_dict = {agent.raw_obs_ph: obs}
        value = agent.sess.run(agent.q_pi, feed_dict)[:, 0]
        value = np.clip(value, -1.0 / (1.0 - self.args.gamma), 0)
        achieved_value.append(value.copy())

    n = 0
    graph_id = {'achieved': [], 'desired': []}
    for i in range(len(achieved_pool)):
        n += 1
        graph_id['achieved'].append(n)
    for i in range(len(desired_goals)):
        n += 1
        graph_id['desired'].append(n)
    n += 1
    self.match_lib.clear(n)

    for i in range(len(achieved_pool)):
        self.match_lib.add(0, graph_id['achieved'][i], 1, 0)
    for i in range(len(achieved_pool)):
        for j in range(len(desired_goals)):
            res = np.sqrt(
                np.sum(np.square(achieved_pool[i] - desired_goals[j]),
                       axis=1)) - achieved_value[i] / (
                           self.args.hgg_L / self.max_dis /
                           (1 - self.args.gamma))
            match_dis = np.min(res) + goal_distance(
                achieved_pool[i][0], initial_goals[j]) * self.args.hgg_c
            match_idx = np.argmin(res)
            edge = self.match_lib.add(graph_id['achieved'][i],
                                      graph_id['desired'][j], 1,
                                      c_double(match_dis))
            candidate_goals.append(achieved_pool[i][match_idx])
            candidate_edges.append(edge)
            candidate_id.append(j)
    for i in range(len(desired_goals)):
        self.match_lib.add(graph_id['desired'][i], n, 1, 0)

    match_count = self.match_lib.cost_flow(0, n)
    assert match_count == self.length

    explore_goals = [0] * self.length
    for i in range(len(candidate_goals)):
        if self.match_lib.check_match(candidate_edges[i]) == 1:
            explore_goals[candidate_id[i]] = candidate_goals[i].copy()
    assert len(explore_goals) == self.length
    self.pool = np.array(explore_goals)

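# Illustrative sketch (an assumed reading of the cost used in both update variants
# above, not repository code): for one achieved trajectory and one desired goal g,
# the per-step cost is
#     d(g_hat_t, g) - V(s_0, g_hat_t) / (hgg_L / max_dis / (1 - gamma)),
# i.e. distance to the desired goal minus a value-scaled bonus; the matched
# candidate goal is the trajectory point minimising this cost.
#
#     import numpy as np
#
#     def per_step_cost(traj_goals, desired_goal, values, hgg_L, max_dis, gamma):
#         # traj_goals: (T, dim) achieved goals along one trajectory
#         # values: (T,) clipped Q-values for those goals
#         dist = np.linalg.norm(traj_goals - desired_goal, axis=1)
#         return dist - values / (hgg_L / max_dis / (1.0 - gamma))
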
def learn(self, args, env, env_test, agent, buffer, write_goals=0):
    # Actual learning cycle takes place here!
    initial_goals = []
    desired_goals = []
    episodes = args.episodes // 5
    # get initial position and goal from the environment for each episode
    for i in range(episodes):
        obs = self.env_List[i].reset()
        goal_a = obs['achieved_goal'].copy()
        goal_d = obs['desired_goal'].copy()
        initial_goals.append(goal_a.copy())
        desired_goals.append(goal_d.copy())

    goal_list = []
    achieved_trajectories = []
    achieved_init_states = []
    explore_goals = []
    test_goals = []
    inside = []
    for i in range(episodes):
        obs = self.env_List[i].get_obs()
        init_state = obs['observation'].copy()
        sampler = MatchSampler(args, self.env_List[i])
        loop = train_goalGAN(agent, initialize_GAN(env=self.env_List[i]), sampler, 5, True)
        next(loop)
        if not self.stop:
            explore_goal = sampler.sample()
        else:
            explore_goal = desired_goals[i]
        # store goals in explore_goals list to check later whether goals are within the goal space
        explore_goals.append(explore_goal)
        test_goals.append(self.env.generate_goal())

        # Perform HER training by interacting with the environment
        self.env_List[i].goal = explore_goal.copy()
        if write_goals != 0 and len(goal_list) < write_goals:
            goal_list.append(explore_goal.copy())
        current = None
        trajectory = None
        for iters in range(NUM):
            if iters < 20:
                obs = self.env_List[i].get_obs()
                current = Trajectory(obs)
                trajectory = [obs['achieved_goal'].copy()]
            has_success = False
            for timestep in range(args.timesteps // SCALE):
                # get action from the DDPG policy
                action = agent.step(obs, explore=True)
                # feed action to the environment, get observation and reward
                obs, reward, done, info = self.env_List[i].step(action)
                is_success = reward == 0
                if iters < 20:
                    trajectory.append(obs['achieved_goal'].copy())
                    current.store_step(action, obs, reward, done)
                if is_success and not has_success:
                    has_success = True
                    if len(sampler.successes_per_goal) > 0:
                        sampler.successes_per_goal[tuple(self.env_List[i].goal)].append(is_success)
                if timestep == args.timesteps // SCALE - 1:
                    if len(sampler.successes_per_goal) > 0:
                        sampler.successes_per_goal[tuple(self.env_List[i].goal)].append(is_success)
            next(loop)
            sampler.reset()
            if iters < 20:
                achieved_trajectories.append(np.array(trajectory))
                achieved_init_states.append(init_state)
                # The trajectory is stored in the replay buffer; the replay buffer can be normal or EBP
                buffer.store_trajectory(current)
                agent.normalizer_update(buffer.sample_batch())
            if buffer.steps_counter >= args.warmup:
                for _ in range(args.train_batches):
                    # train with hindsight goals (HER step)
                    info = agent.train(buffer.sample_batch())
                    args.logger.add_dict(info)
                # update target network
                agent.target_update()

    selection_trajectory_idx = {}
    for i in range(episodes):
        # only add trajectories with movement to the trajectory pool --> use the default L2 distance measure!
        if goal_distance(achieved_trajectories[i][0],
                         achieved_trajectories[i][-1]) > 0.01:
            selection_trajectory_idx[i] = True
    for idx in selection_trajectory_idx.keys():
        self.achieved_trajectory_pool.insert(achieved_trajectories[idx].copy(),
                                             achieved_init_states[idx].copy())

    # Unless this is the first call: check which of the explore goals are inside the target goal space.
    # The target goal space is represented by a sample of test_goals generated directly from the
    # environment. An explore goal is considered inside the target goal space if it is closer than the
    # distance_threshold to one of the test goals (i.e. it would yield a non-negative reward if that
    # test goal were to be achieved).
    if self.learn_calls > 0:
        assert len(explore_goals) == len(test_goals)
        for ex in explore_goals:
            is_inside = 0
            for te in test_goals:
                # TODO: check: originally with self.sampler.get_graph_goal_distance,
                # now trying with goal_distance (L2)
                if goal_distance(ex, te) <= self.env.env.env.distance_threshold:
                    is_inside = 1
            inside.append(is_inside)
        assert len(inside) == len(test_goals)
        inside_sum = 0
        for i in inside:
            inside_sum += i
        # If more than stop_hgg_threshold (e.g. 0.9) of the explore goals are inside the target goal
        # space, stop HGG and continue with normal HER.
        # By default, stop_hgg_threshold is disabled (set to a value > 1).
        average_inside = inside_sum / len(inside)
        self.args.logger.info("Average inside: {}".format(average_inside))
        if average_inside > self.stop_hgg_threshold:
            self.stop = True
            self.args.logger.info("Continue with normal HER")

    self.learn_calls += 1

    return goal_list if len(goal_list) > 0 else None

def learn(self, args, env, env_test, agent, buffer, buffer_fake=None,
          env_model=None, fake=False, test=False):
    self.current_trajs = {"eps": [], "obs": [], "goal": []}
    self.hist_trajs = {"eps": [], "obs": [], "goal": []}

    initial_goals = []
    desired_goals = []
    for i in range(args.episodes):
        obs = self.env_List[i].reset()
        goal_a = obs['achieved_goal'].copy()
        goal_d = obs['desired_goal'].copy()
        initial_goals.append(goal_a.copy())
        desired_goals.append(goal_d.copy())

    if args.goal_generator:
        self.sampler.update(initial_goals, desired_goals)

    achieved_trajectories = []
    achieved_init_states = []
    for i in range(args.episodes):
        obs = self.env_List[i].get_obs()
        init_state = obs['observation'].copy()
        # decide whether to use the goal generator
        if args.goal_generator:
            # generate goal by HGG or GoalGAN
            explore_goal = self.sampler.sample(i)
            # replace the goal given by the environment
            self.env_List[i].goal = explore_goal.copy()

        # initialization for interaction with the environment
        obs = self.env_List[i].get_obs()
        current = Trajectory(obs)
        trajectory = [obs['achieved_goal'].copy()]
        for timestep in range(args.timesteps):
            action = agent.step(obs, explore=True)
            self.dynamic_buffer.add(obs['observation'].copy(), 'st')
            obss = obs.copy()
            obs, reward, done, info = self.env_List[i].step(action)
            self.dynamic_buffer.add(action.copy(), 'at')
            self.dynamic_buffer.add(obs['observation'].copy(), 'stpo')
            trajectory.append(obs['achieved_goal'].copy())
            if buffer.steps_counter >= args.warmup:
                for _ in range(self.args.training_freq):
                    batch_real = buffer.sample_batch(batch_size=12, sample_for_mb=False)
                    batch_fake = buffer_fake.sample_batch(batch_size=244)
                    batch_new = {
                        'obs': batch_real['obs'] + batch_fake['obs'],
                        'obs_next': batch_real['obs_next'] + batch_fake['obs_next'],
                        'acts': batch_real['acts'] + batch_fake['acts'],
                        'rews': batch_real['rews'] + batch_fake['rews']
                    }
                    info = agent.train(batch_new)
                    args.logger.add_dict(info)
                agent.target_update()
            if timestep == args.timesteps - 1:
                done = True
            current.store_step(action, obs, reward, done)
            if done:
                break

        # dynamics model training
        if args.fgi or args.model_based_training:
            # calculate the delta state (s_{t+1} minus s_t), which is a common trick
            if self.dynamic_buffer.dynamic_buffer_number <= 1000000:  # if len(self.st) <= 20000:
                _st = np.array(
                    self.dynamic_buffer.data['st']
                    [:self.dynamic_buffer.dynamic_buffer_number].copy())
                _at = np.array(
                    self.dynamic_buffer.data['at']
                    [:self.dynamic_buffer.dynamic_buffer_number].copy())
                _stpo = np.array(
                    self.dynamic_buffer.data['stpo']
                    [:self.dynamic_buffer.dynamic_buffer_number].copy())
                target = _stpo - _st
                inputs = np.concatenate([_st, _at], axis=1)
                outputs = np.array(target)
            else:
                _st = []
                _at = []
                _stpo = []
                target = []
                inds = np.random.randint(
                    0, self.dynamic_buffer.dynamic_buffer_number, size=1000000)
                for x in range(1000000):
                    _st.append(self.dynamic_buffer.data['st'][inds[x]].copy())
                    _at.append(self.dynamic_buffer.data['at'][inds[x]].copy())
                    target.append(
                        (self.dynamic_buffer.data['stpo'][inds[x]] -
                         self.dynamic_buffer.data['st'][inds[x]]).copy())
                _st = np.array(_st)
                _at = np.array(_at)
                target = np.array(target)
                inputs = np.concatenate([_st, _at], axis=1)
                outputs = np.array(target)

            if len(self.model_loss) > 0 and self.model_loss[-1] < 0.03:
                los = env_model.train(inputs=inputs,
                                      targets=outputs,
                                      holdout_ratio=0.2,
                                      batch_size=256,
                                      max_epochs=10)
            else:
                los = env_model.train(inputs=inputs,
                                      targets=outputs,
                                      holdout_ratio=0.2,
                                      batch_size=256,
                                      max_epochs=None)
            del _st
            del _at
            del _stpo
            del inputs
            del outputs
            self.model_loss.append(los['val_loss'])

        # update buffer and normalizer
        achieved_trajectories.append(np.array(trajectory))
        achieved_init_states.append(init_state)
        buffer.store_trajectory(current)
        if buffer.steps_counter > args.warmup:
            agent.normalizer_update(buffer.sample_batch())

        # generate fake data
        if buffer.steps_counter > args.warmup - 1 and args.model_based_training:
            print('extending...')
            extend_length = self.args.extend_length
            self.extend_traj(extend_length=extend_length,
                             env_model=env_model,
                             buffer=buffer,
                             agent=agent,
                             buffer_fake=buffer_fake)
            print('extend over.')

    # update achieved_trajectories for the HGG sampler
    selection_trajectory_idx = {}
    for i in range(self.args.episodes):
        if goal_distance(achieved_trajectories[i][0],
                         achieved_trajectories[i][-1]) > 0.01:
            selection_trajectory_idx[i] = True
    for idx in selection_trajectory_idx.keys():
        self.achieved_trajectory_pool.insert(
            achieved_trajectories[idx].copy(),
            achieved_init_states[idx].copy())