def print_model_stats(pre_trained_reward_network, test_batch_size, sess):
    # read the data
    test = load_data_from(os.path.join('supervised_data', 'test'), max_read=10 * test_batch_size)
    # report how many test samples were loaded
    print len(test)
    # shuffle the test data
    random.shuffle(test)
    openrave_manager = OpenraveManager(
        0.001, PotentialPoint.from_config(pre_trained_reward_network.config))
    sess.run(tf.global_variables_initializer())

    # run test for one (random) batch
    random.shuffle(test)
    test_batch = oversample_batch(test, 0, test_batch_size)
    test_batch, test_rewards, test_status = get_batch_and_labels(test_batch, openrave_manager)
    reward_prediction, status_prediction = pre_trained_reward_network.make_prediction(
        *([sess] + test_batch))
    # see what happens for different reward classes:
    goal_rewards_stats, collision_rewards_stats, other_rewards_stats = compute_stats_per_class(
        test_status, test_rewards, status_prediction, reward_prediction)
    print 'before loading weights'
    print 'goal mean_error {} max_error {} accuracy {}'.format(*goal_rewards_stats)
    print 'collision mean_error {} max_error {} accuracy {}'.format(*collision_rewards_stats)
    print 'other mean_error {} max_error {} accuracy {}'.format(*other_rewards_stats)

    # load weights
    pre_trained_reward_network.load_weights(sess)

    # run test for one (random) batch
    random.shuffle(test)
    test_batch = oversample_batch(test, 0, test_batch_size)
    test_batch, test_rewards, test_status = get_batch_and_labels(test_batch, openrave_manager)
    reward_prediction, status_prediction = pre_trained_reward_network.make_prediction(
        *([sess] + test_batch))
    # see what happens for different reward classes:
    goal_rewards_stats, collision_rewards_stats, other_rewards_stats = compute_stats_per_class(
        test_status, test_rewards, status_prediction, reward_prediction)
    print 'after loading weights'
    print 'goal mean_error {} max_error {} accuracy {}'.format(*goal_rewards_stats)
    print 'collision mean_error {} max_error {} accuracy {}'.format(*collision_rewards_stats)
    print 'other mean_error {} max_error {} accuracy {}'.format(*other_rewards_stats)
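# Usage sketch (illustrative assumption, not part of the original file): the helper above expects an
# already-constructed pre-trained reward network exposing .config, .make_prediction and .load_weights,
# plus an active TensorFlow session, e.g.:
# with tf.Session() as sess:
#     print_model_stats(pre_trained_reward_network, test_batch_size=1000, sess=sess)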
def __init__(self, config):
    self.action_step_size = config['openrave_rl']['action_step_size']
    self.goal_sensitivity = config['openrave_rl']['goal_sensitivity']
    self.keep_alive_penalty = config['openrave_rl']['keep_alive_penalty']
    self.truncate_penalty = config['openrave_rl']['truncate_penalty']
    self.openrave_manager = OpenraveManager(
        config['openrave_rl']['segment_validity_step'],
        PotentialPoint.from_config(config))
    self.current_joints = None
    self.goal_joints = None
    self.start_joints = None
    self.traj = None
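# Minimal config sketch (values are illustrative assumptions, not taken from the repository)
# showing the 'openrave_rl' keys this constructor reads:
# config = {
#     'openrave_rl': {
#         'action_step_size': 0.05,
#         'goal_sensitivity': 0.01,
#         'keep_alive_penalty': 0.01,
#         'truncate_penalty': 0.0,
#         'segment_validity_step': 0.001,
#     }
# }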
def produce_transitions(data_dir, cache_dir):
    print "producing transition data from original trajectories at {}".format(data_dir)
    assert os.path.exists(data_dir)
    if os.path.exists(cache_dir):
        print "found cache dir at {}, assuming all transitions are present there (if not delete the directory)".format(cache_dir)
        return
    print "cache not found, creating cache at: {}".format(cache_dir)
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    files = [file for file in os.listdir(data_dir) if file.endswith(".path_pkl")]
    assert len(files) > 0
    target_point = PotentialPoint.from_config(config)[-1]
    for file in files:
        print "loading file {}".format(file)
        with bz2.BZ2File(os.path.join(data_dir, file), "r") as compressed_file:
            paths = pickle.load(compressed_file)
        print "asserting step sizes match"
        step_size = config["openrave_rl"]["action_step_size"] + 0.00001
        for (traj, _) in paths:
            for i in range(len(traj) - 1):
                assert (
                    np.linalg.norm(np.array(traj[i]) - np.array(traj[i + 1])) < step_size)
        print "creating transitions"
        transitions = []
        for (traj, poses_trajectory) in paths:
            goal_joints = traj[-1]
            goal_pose = poses_trajectory[-1][target_point.tuple]
            for i in range(len(traj) - 1):
                joints = traj[i]
                next_joints = traj[i + 1]
                transition = (joints[1:], next_joints[1:], goal_joints[1:], goal_pose)
                transitions.append(transition)
        transition_file = os.path.join(cache_dir, file + ".transitions_cache")
        print "writing transitions file {}".format(transition_file)
        with open(transition_file, "w") as pickle_file:
            pickle.dump(transitions, pickle_file)
        # with bz2.BZ2File(transition_file, 'w') as compressed_file:
        #     pickle.dump(transitions, compressed_file)
    print "cache created at {}".format(cache_dir)
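# A minimal read-back sketch (illustration only; the file name below is hypothetical):
# transition_file = os.path.join(cache_dir, 'example.path_pkl.transitions_cache')
# with open(transition_file, 'r') as pickle_file:
#     transitions = pickle.load(pickle_file)
# # each transition is (joints[1:], next_joints[1:], goal_joints[1:], goal_pose), as built above
# joints, next_joints, goal_joints, goal_pose = transitions[0]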
def __init__(self, config):
    self.action_step_size = config['openrave_rl']['action_step_size']
    self.goal_sensitivity = config['openrave_rl']['goal_sensitivity']
    self.challenging_trajectories_only = config['openrave_planner']['challenging_trajectories_only']
    self.planner_iterations_start = config['openrave_planner']['planner_iterations_start']
    self.planner_iterations_increase = config['openrave_planner']['planner_iterations_increase']
    self.planner_iterations_decrease = config['openrave_planner']['planner_iterations_decrease']
    self.max_planner_iterations = self.planner_iterations_start
    self.openrave_manager = OpenraveManager(
        config['openrave_rl']['segment_validity_step'],
        PotentialPoint.from_config(config))
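# Minimal sketch (illustrative assumptions only) of the additional 'openrave_planner' keys read here:
# config['openrave_planner'] = {
#     'challenging_trajectories_only': True,
#     'planner_iterations_start': 100,
#     'planner_iterations_increase': 10,
#     'planner_iterations_decrease': 1,
# }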
def run_motion_planner():
    result = None
    openrave_manager = OpenraveManager(
        config['openrave_rl']['segment_validity_step'],
        PotentialPoint.from_config(config))
    for start_joints, goal_joints, workspace_id, _ in queries:
        params_file_path = image_cache.items[workspace_id].full_filename
        openrave_manager.set_params(params_file_path)
        for i in range(repeat):
            start_time = datetime.datetime.now()
            traj = openrave_manager.plan(start_joints, goal_joints, None)
            # assert traj is not None
            end_time = datetime.datetime.now()
            time_diff = end_time - start_time
            if result is None:
                result = time_diff
            else:
                result += time_diff
    return result
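# Sketch (assumption, not in the original script): converting the accumulated timedelta returned by
# run_motion_planner into an average planning time per call, using the same queries/repeat globals.
# total_planning_time = run_motion_planner()
# average_seconds = total_planning_time.total_seconds() / (len(queries) * repeat)
# print 'average planning time per call: {} seconds'.format(average_seconds)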
scenario = 'hard'
model_name = '2019_01_25_10_09_04'
number_of_imitation_files = 3
sphere_limitation = 1000

imitation_data_path = os.path.abspath(os.path.expanduser(
    os.path.join('~/ModelBasedDDPG/imitation_data', scenario)))
rl_trajectories_data_path = os.path.abspath(os.path.expanduser(
    os.path.join('~/ModelBasedDDPG/', scenario, 'trajectories', model_name)))

# load configuration
config_path = os.path.join(os.getcwd(), 'config/config.yml')
with open(config_path, 'r') as yml_file:
    config = yaml.load(yml_file)

# load the workspace
openrave_manager = OpenraveManager(
    config['openrave_rl']['segment_validity_step'], PotentialPoint.from_config(config))


def process_poses(target_poses, x_coordinate_range=(0.0, 0.13), z_coordinate_range=(0.3, 0.45)):
    return [p for p in target_poses
            if x_coordinate_range[0] <= p[0] <= x_coordinate_range[1]
            and z_coordinate_range[0] <= p[1] <= z_coordinate_range[1]]


def process_rl_files(data_dir, trajectory_limitation):
    steps_offset = 40
    steps_increase = 2000
    trajectories_seen = 0
    result = []
    while trajectories_seen < trajectory_limitation:
        global_step_dir = os.path.join(data_dir, '{}'.format(steps_offset))
        steps_offset += steps_increase
        for dirpath, dirnames, filenames in os.walk(global_step_dir):
collision_samples = 10000
# show_close_to_goal = True
show_close_to_goal = False
close_to_goal_samples = 10000
show_pose_action_direction_arrow = True
show_goal_end_effector_pose = True

# load configuration
config_path = os.path.join(os.getcwd(), 'config/config.yml')
with open(config_path, 'r') as yml_file:
    config = yaml.load(yml_file)

# load the workspace
openrave_manager = OpenraveManager(
    config['openrave_rl']['segment_validity_step'], PotentialPoint.from_config(config))
params_file = os.path.abspath(os.path.expanduser(
    os.path.join('~/ModelBasedDDPG/scenario_params', scenario, 'params.pkl')))
openrave_manager.load_params(WorkspaceParams.load_from_file(params_file))
openrave_manager.robot.SetDOFValues([0.0] + goal_joints, [0, 1, 2, 3, 4])
openrave_manager.get_initialized_viewer()

red_color = np.array([1.0, 0.0, 0.0])
yellow_color = np.array([1.0, 1.0, 0.0])
green_color = np.array([0.0, 1.0, 0.0])


def create_sphere(id, radius, openrave_manager):
    body = RaveCreateKinBody(openrave_manager.env, '')
                    np.array(traj[i]) - np.array(traj[i + 1])) < step_size
        paths_file = os.path.join(cache_dir, file + '.paths_cache')
        print 'writing paths file {}'.format(paths_file)
        with open(paths_file, 'w') as pickle_file:
            pickle.dump(paths, pickle_file)
    print 'cache created at {}'.format(cache_dir)


train_original_dir = os.path.join('imitation_data', scenario, 'train')
train_transitions_dir = os.path.join('imitation_data_transitions', scenario, 'train')
train_transitions_dir = os.path.join(
    train_transitions_dir, PotentialPoint.from_config(config)[-1].str)
produce_transitions(train_original_dir, train_transitions_dir)
train_paths_dir = os.path.join('imitation_data_paths', scenario, 'train')
produce_paths(train_original_dir, train_paths_dir)

test_original_dir = os.path.join('imitation_data', scenario, 'test')
test_transitions_dir = os.path.join('imitation_data_transitions', scenario, 'test')
test_transitions_dir = os.path.join(
    test_transitions_dir, PotentialPoint.from_config(config)[-1].str)
produce_transitions(test_original_dir, test_transitions_dir)
test_paths_dir = os.path.join('imitation_data_paths', scenario, 'test')
produce_paths(test_original_dir, test_paths_dir)


def get_files(paths_dir, transitions_dir, max_files=None):
def __init__(self, config, is_rollout_agent, image_shape=(55, 111), number_of_joints=4,
             pose_dimensions=2, pre_trained_reward=None, name_prefix=None):
    self.name_prefix = os.getpid() if name_prefix is None else name_prefix
    self.config = config
    self.potential_points = PotentialPoint.from_config(config)

    # input related data
    self.image_shape = image_shape
    self.number_of_joints = number_of_joints
    self.pose_dimensions = pose_dimensions

    # generate inputs
    all_inputs = self._create_inputs()
    self.joints_inputs = all_inputs[0]
    self.workspace_image_inputs = all_inputs[1]
    self.goal_joints_inputs = all_inputs[2]
    self.goal_pose_inputs = all_inputs[3]

    # images for vision
    self.images_3d = None
    if self.workspace_image_inputs is not None:
        self.images_3d = tf.expand_dims(self.workspace_image_inputs, axis=-1)

    # since we take partial derivatives w.r.t subsets of the parameters, we always need to remember which
    # parameters are currently being added. note that this also causes the model to be non thread safe,
    # therefore the creation must happen sequentially

    # online actor network
    variable_count = len(tf.trainable_variables())
    actor_results = self._create_actor_network(self.joints_inputs, is_online=True, reuse_flag=False)
    self.online_action = actor_results[0]
    online_actor_tanh = actor_results[1]
    self.online_actor_params = tf.trainable_variables()[variable_count:]

    # create placeholders and assign ops to set these weights manually (used by rollout agents)
    self.online_actor_parameter_weights_placeholders = {
        var.name: tf.placeholder(tf.float32, var.get_shape())
        for var in self.online_actor_params
    }
    self.online_actor_parameters_assign_ops = [
        tf.assign(var, self.online_actor_parameter_weights_placeholders[var.name])
        for var in self.online_actor_params
    ]

    # target actor network
    variable_count = len(tf.trainable_variables())
    actor_results = self._create_actor_network(self.joints_inputs, is_online=False, reuse_flag=False)
    self.target_action = actor_results[0]
    self.target_actor_params = tf.trainable_variables()[variable_count:]

    # create placeholders and assign ops to set these weights manually (used by rollout agents)
    self.target_actor_parameter_weights_placeholders = {
        var.name: tf.placeholder(tf.float32, var.get_shape())
        for var in self.target_actor_params
    }
    self.target_actor_parameters_assign_ops = [
        tf.assign(var, self.target_actor_parameter_weights_placeholders[var.name])
        for var in self.target_actor_params
    ]

    # this is as much as a rollout agent needs
    if is_rollout_agent:
        return

    tau = self.config['model']['tau']
    gamma = self.config['model']['gamma']
    use_reward_model = self.config['model']['use_reward_model']

    self.forward_model_next_state, self.forward_model_action, forward_model_tanh = None, None, None
    if use_reward_model:
        # deterministic value of the next state (from current state, executing the online action)
        self.forward_model_next_state = self._next_state_model() if use_reward_model else None

        # online actor network for the result of the forward model
        variable_count = len(tf.trainable_variables())
        actor_results = self._create_actor_network(
            self.forward_model_next_state, is_online=True, reuse_flag=True)
        self.forward_model_action = actor_results[0]
        forward_model_tanh = actor_results[1]
        # make sure no new parameters were added
        assert variable_count == len(tf.trainable_variables())

    # periodically update target actor with online actor weights
    self.update_actor_target_params = [
        self.target_actor_params[i].assign(
            tf.multiply(self.online_actor_params[i], tau) +
            tf.multiply(self.target_actor_params[i], 1. - tau))
        for i in range(len(self.target_actor_params))
    ]

    # create inputs for the critic and reward network when using a constant action
    self.action_inputs = tf.placeholder(
        tf.float32, (None, self.number_of_joints), name='action_inputs')

    # online critic for predicting the q value for a specific joints+action pair
    variable_count = len(tf.trainable_variables())
    self.online_q_value_fixed_action = self._create_critic_network(
        self.joints_inputs, self.action_inputs, is_online=True, reuse_flag=False,
        add_regularization_loss=True)
    online_critic_params = tf.trainable_variables()[variable_count:]

    # online critic for predicting the q value for the actor update.
    # if using a reward model, the joint inputs are given by the forward model and so are the actions.
    # in regular ddpg, the joints inputs are given by the current state inputs, and the actions are the
    # policy on these joints.
    variable_count = len(tf.trainable_variables())
    self.online_q_value_under_policy = self._create_critic_network(
        joints_input=self.forward_model_next_state if use_reward_model else self.joints_inputs,
        action_input=self.forward_model_action if use_reward_model else self.online_action,
        is_online=True, reuse_flag=True, add_regularization_loss=False)
    # make sure no new parameters were added
    assert variable_count == len(tf.trainable_variables())

    # target critic network, predicting the q value of the current state under the target policy
    variable_count = len(tf.trainable_variables())
    self.target_q_value_under_policy = self._create_critic_network(
        self.joints_inputs, self.target_action, is_online=False, reuse_flag=False,
        add_regularization_loss=False)
    target_critic_params = tf.trainable_variables()[variable_count:]

    # periodically update target critic with online critic weights
    self.update_critic_target_params = [
        target_critic_params[i].assign(
            tf.multiply(online_critic_params[i], tau) +
            tf.multiply(target_critic_params[i], 1. - tau))
        for i in range(len(target_critic_params))
    ]

    self.fixed_action_reward, self.fixed_action_termination, self.online_action_reward, \
        self.online_action_termination = None, None, None, None
    if use_reward_model:
        assert pre_trained_reward is not None
        variable_count = len(tf.trainable_variables())
        # reward network to predict the immediate reward of a given action
        self.fixed_action_reward, fixed_action_status = pre_trained_reward.create_reward_network(
            self.joints_inputs, self.action_inputs, self.goal_joints_inputs,
            self.goal_pose_inputs, self.images_3d)
        self.fixed_action_termination = self._compute_termination_from_status(fixed_action_status)
        # reward network to predict the immediate reward of the online policy action
        self.online_action_reward, online_action_status = pre_trained_reward.create_reward_network(
            self.joints_inputs, self.online_action, self.goal_joints_inputs,
            self.goal_pose_inputs, self.images_3d)
        self.online_action_termination = self._compute_termination_from_status(online_action_status)
        assert variable_count == len(tf.trainable_variables())

    # the label to use to train the online critic network
    self.scalar_label = tf.placeholder(tf.float32, [None, 1])
    batch_size = tf.cast(tf.shape(self.joints_inputs)[0], tf.float32)

    # critic optimization
    critic_prediction_loss = tf.losses.mean_squared_error(
        self.scalar_label, self.online_q_value_fixed_action)
    critic_regularization = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    critic_regularization_loss = tf.add_n(
        critic_regularization) if len(critic_regularization) > 0 else 0.0
    self.critic_total_loss = critic_prediction_loss + critic_regularization_loss

    self.critic_initial_gradients_norm, self.critic_clipped_gradients_norm, self.optimize_critic = \
        self._optimize_by_loss(
            self.critic_total_loss, online_critic_params,
            self.config['critic']['learning_rate'], self.config['critic']['gradient_limit'])

    # summaries for the critic optimization
    self.critic_optimization_summaries = tf.summary.merge([
        tf.summary.scalar('critic_prediction_loss', critic_prediction_loss),
        tf.summary.scalar('critic_regularization_loss', critic_regularization_loss),
        tf.summary.scalar('critic_total_loss', self.critic_total_loss),
        tf.summary.scalar('critic_gradients_norm_initial', self.critic_initial_gradients_norm),
        tf.summary.scalar('critic_gradients_norm_clipped', self.critic_clipped_gradients_norm),
        tf.summary.scalar('critic_mean_prediction', tf.reduce_mean(self.online_q_value_fixed_action)),
        tf.summary.histogram('critic_prediction_distribution', self.online_q_value_fixed_action),
    ])

    # when training the actor we derive the advantage w.r.t mu's network params (mu is the online policy)
    if use_reward_model:
        # advantage is r(s, mu(s)) + \gamma * q(f(s, mu(s)), mu(f(s, mu(s))))
        include_next_state = (1.0 - self.online_action_termination)
        # include_next_state = 1.0
        self.actor_loss = -(
            self.online_action_reward +
            # this is actually the policy on the forward model output
            gamma * self.online_q_value_under_policy * include_next_state
        )
    else:
        # advantage is q(s, mu(s))
        self.actor_loss = -self.online_q_value_under_policy
    self.actor_loss = tf.reduce_sum(self.actor_loss)

    # if we have extra losses for the actor:
    tanh_loss_summary = None
    if self.config['action_predictor']['tanh_preactivation_loss_coefficient'] > 0.0:
        tanh_preactivation_loss = tf.losses.mean_squared_error(
            tf.zeros_like(online_actor_tanh), online_actor_tanh)
        if use_reward_model:
            forward_model_tanh_preactivation_loss = tf.losses.mean_squared_error(
                tf.zeros_like(forward_model_tanh), forward_model_tanh)
            tanh_preactivation_loss += forward_model_tanh_preactivation_loss
        tanh_preactivation_loss *= self.config['action_predictor']['tanh_preactivation_loss_coefficient']
        self.actor_loss += tanh_preactivation_loss
        tanh_loss_summary = tf.summary.scalar('tanh_preactivation_loss', tanh_preactivation_loss)

    # divide by the batch size
    self.actor_loss = tf.div(self.actor_loss, batch_size)

    self.actor_initial_gradients_norm, self.actor_clipped_gradients_norm, self.optimize_actor = \
        self._optimize_by_loss(
            self.actor_loss, self.online_actor_params,
            self.config['actor']['learning_rate'], self.config['actor']['gradient_limit'])

    # summaries for the optimization
    merge_list = [
        tf.summary.scalar('actor_gradients_norm_initial', self.actor_initial_gradients_norm),
        tf.summary.scalar('actor_gradients_norm_clipped', self.actor_clipped_gradients_norm),
        tf.summary.scalar('actor_total_loss', self.actor_loss),
    ]
    if tanh_loss_summary is not None:
        merge_list.append(tanh_loss_summary)
    self.actor_optimization_summaries = tf.summary.merge(merge_list)
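# Numeric sketch of the soft (Polyak) target update implemented by update_actor_target_params and
# update_critic_target_params above: target <- tau * online + (1 - tau) * target.
# The values below are illustrative assumptions, e.g. tau = 0.001:
# online_w, target_w = np.array([1.0, 2.0]), np.array([0.0, 0.0])
# target_w = 0.001 * online_w + (1.0 - 0.001) * target_w  # -> [0.001, 0.002]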
number_of_unzippers = config['general']['number_of_unzippers']
train = Oversampler(train_data_dir, batch_size, oversample_goal, oversample_collision,
                    number_of_unzippers=number_of_unzippers)
test = Oversampler(test_data_dir, batch_size, oversample_goal, oversample_collision,
                   number_of_unzippers=number_of_unzippers)

# get openrave manager
openrave_manager = OpenraveManager(0.001, PotentialPoint.from_config(config))

# set summaries and saver dir
summaries_dir = os.path.join('reward', 'tensorboard')
train_summary_writer = tf.summary.FileWriter(os.path.join(summaries_dir, 'train_' + model_name))
test_summary_writer = tf.summary.FileWriter(os.path.join(summaries_dir, 'test_' + model_name))
saver_dir = os.path.join('reward', 'model', model_name)
if not os.path.exists(saver_dir):
    os.makedirs(saver_dir)

# save the config
config_copy_path = os.path.join(saver_dir, 'config.yml')
with open(config_copy_path, 'w') as config_copy_file:
    yaml.dump(config, config_copy_file)
import tensorflow as tf
import os
import yaml

from openrave_manager import OpenraveManager
from potential_point import PotentialPoint

is_gpu = tf.test.is_gpu_available()

config_path = os.path.join(os.getcwd(), 'config/config.yml')
with open(config_path, 'r') as yml_file:
    config = yaml.load(yml_file)

potential_points = PotentialPoint.from_config(config)
openrave_manager = OpenraveManager(0.01, potential_points)
random_joints = openrave_manager.get_random_joints()

print 'has gpu result {}'.format(is_gpu)
print 'random joints result {}'.format(random_joints)
# import matplotlib.pyplot as plt
#
# fig = plt.figure()
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter([t[0] for t in transformed], [t[1] for t in transformed], [t[2] for t in transformed])
# ax.set_xlabel('X Label')
# ax.set_ylabel('Y Label')
# ax.set_zlabel('Z Label')
#
# plt.show()
#
# print 'here'

if __name__ == "__main__":
    potential_points = [PotentialPoint(t) for t in [(4, 0.0, 0.0), (5, 0.0, 0.0)]]
    m = OpenraveManager(0.01, potential_points)
    joints0 = [0.0] * 5
    res1 = m.get_potential_points_poses(joints0)
    res2 = m.get_links_poses(joints0)
    print res1[potential_points[0].tuple] == res2[m.links_names[potential_points[0].link]]
    print res1[potential_points[1].tuple] == res2[m.links_names[potential_points[1].link]]
    res3 = m.get_potential_points_jacobians(joints0)
    res4 = m.get_links_jacobians(joints0)
    print res3[potential_points[0].tuple] == res4[m.links_names[potential_points[0].link]]
    print res3[potential_points[1].tuple] == res4[m.links_names[potential_points[1].link]]