def main(args):
    policy_net = Geom_DQN(args.indim, args.outdim).to(args.device)
    policy_net.load_state_dict(torch.load(args.weight_expert)['policy_net_1'])
    policy_net.eval()

    normalizer = Geom_Normalizer(args.indim, device=args.device)
    normalizer.restore_state(args.norm_expert)
    tactile_normalizer = Multimodal_Normalizer(num_inputs=args.tactile_indim,
                                               device=args.device)
    tactile_normalizer.restore_state(args.tactile_normalizer)
    action_space = ActionSpace(dp=0.06, df=10)

    # Create robot, reset simulation and grasp handle
    model = load_model_from_path(args.model_path)
    sim = MjSim(model)
    sim_param = SimParameter(sim)
    sim.step()
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None

    # Load all velcro parameters
    model_dir = os.path.dirname(args.model_path)
    param_path = os.path.join(model_dir, 'uniform_sample.pkl')
    velcro_params = pickle.load(open(param_path, 'rb'))

    robot = RobotSim(sim, viewer, sim_param, args.render, args.break_thresh)
    velcro_util = VelcroUtil(robot, robot.mj_sim_param)
    traj_logger = TrajLogger(args, robot, velcro_util, policy_net, normalizer,
                             tactile_normalizer)

    all_trajectories = []
    all_success = [None for i in range(len(velcro_params))]
    all_time = [None for i in range(len(velcro_params))]

    for i in range(len(velcro_params)):
        geom_type, origin_offset, euler, radius = velcro_params[i]
        change_sim(robot.mj_sim, geom_type, origin_offset, euler, radius)
        performance = {'time': [], 'success': [], 'action_hist': [0, 0, 0, 0, 0, 0]}
        min_time = args.max_iter
        all_trajectories.append(None)

        for j in range(args.num_try):
            robot.reset_simulation()
            ret = robot.grasp_handle()
            performance, expert_traj = traj_logger.test_network(performance)
            # Keep the fastest trajectory seen for this velcro configuration
            if performance['time'][-1] <= min_time:
                all_trajectories[i] = expert_traj
                min_time = performance['time'][-1]
            all_success[i] = performance['success'][j]
            all_time[i] = performance['time'][-1]

        print('\n\nFinished trajectory {}'.format(i))
        print('Velcro parameters are: {} {} {} {}'.format(
            geom_type, origin_offset, euler, radius))
        print(performance)
        success = np.array(performance['success'])
        time = np.array(performance['time'])
        print('Successfully opened the velcro in: {}% of cases'.format(
            100 * np.sum(success) / len(performance['success'])))
        print('Average time to open: {}'.format(np.average(time[success > 0])))
        print('Action histogram for the test is: {}'.format(performance['action_hist']))

    print('\nCollected {} successful expert trajectories in total'.format(
        np.sum(np.array(all_success))))
    print('Total success and time: {}, {}'.format(all_success, all_time))

    output = {'args': args, 'traj': all_trajectories, 'success': all_success,
              'all_time': all_time}
    output_path = args.result_dir + 'oneshot_expert_traj.pkl'
    write_results(output_path, output)
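# A minimal sketch of reading the expert-trajectory file written above,
# assuming write_results pickles its `output` dict ('results/' stands in for
# a hypothetical args.result_dir value):
import pickle

with open('results/oneshot_expert_traj.pkl', 'rb') as f:
    data = pickle.load(f)
print('{} trajectories, {} successes'.format(
    len(data['traj']), sum(s or 0 for s in data['success'])))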
class RobotEnv:
    def __init__(self, args):
        self.args = args
        self.ACTIONS = ['left', 'right', 'forward', 'backward', 'up', 'down']  # 'open', 'close'
        self.gripping_force = args.grip_force
        self.break_threshold = args.break_thresh
        self.action_space = ActionSpace(dp=0.06, df=10)
        self.broken_so_far = 0
        self.robot = None
        self.model_params = None

    def reset(self):
        args = self.args
        model, self.model_params = init_model(args.model_path, geom=False)
        sim = MjSim(model)
        sim.step()
        if args.render:
            viewer = MjViewer(sim)
        else:
            viewer = None
        sim_param = SimParameter(sim)
        robot = RobotSim(sim, viewer, sim_param, args.render, self.break_threshold)
        self.robot = robot
        robot.reset_simulation()
        ret = robot.grasp_handle()
        if not ret:
            return None

        self.obs_space = Observation(robot.get_gripper_jpos(),
                                     robot.get_num_broken_buffer(args.hap_sample),
                                     robot.get_main_touch_buffer(args.hap_sample))
        # Get current observation
        self.broken_so_far = 0
        self.obs_space.update(robot.get_gripper_jpos(),                     # 6
                              robot.get_num_broken_buffer(args.hap_sample),  # 30x1
                              robot.get_main_touch_buffer(args.hap_sample))  # 30x12
        obs = self.obs_space.get_state()
        reward = 0
        done = False
        info = {'slippage': False}
        return obs, reward, done, info

    def step(self, action):
        robot = self.robot
        args = self.args
        delta = self.action_space.get_action(self.ACTIONS[action])['delta'][:3]
        target_position = np.add(robot.get_gripper_jpos()[:3], np.array(delta))
        target_pose = np.hstack((target_position, robot.get_gripper_jpos()[3:]))
        robot.move_joint(target_pose, True, self.gripping_force,
                         hap_sample=args.hap_sample)

        # Get reward
        done, num = robot.update_tendons()
        slippage = robot.check_slippage()
        if num > self.broken_so_far:
            reward = num - self.broken_so_far
            self.broken_so_far = num
        else:
            reward = 0
        if slippage:
            done = True
        info = {'slippage': slippage}

        self.obs_space.update(robot.get_gripper_jpos(),                     # 6
                              robot.get_num_broken_buffer(args.hap_sample),  # 30x2
                              robot.get_main_touch_buffer(args.hap_sample))  # 30x7
        obs = self.obs_space.get_state()
        return obs, reward, done, info
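# A minimal random-rollout sketch against the RobotEnv API above (assumes an
# `args` namespace providing the fields RobotEnv reads: model_path, render,
# grip_force, break_thresh, hap_sample):
import random

env = RobotEnv(args)
ret = env.reset()
if ret is not None:  # reset returns None when the initial grasp fails
    obs, reward, done, info = ret
    total_reward = 0
    for _ in range(200):  # cap the rollout; step() itself enforces no limit
        action = random.randrange(len(env.ACTIONS))
        obs, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    print('tendons broken:', total_reward, 'slippage:', info['slippage'])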
def main(args):
    if not os.path.isdir(args.result_dir):
        os.makedirs(args.result_dir)

    conv_net = ConvNet(args.outdim, args.depth).to(args.device)
    if os.path.exists(args.weight_convnet):
        checkpoint = torch.load(args.weight_convnet)
        conv_net.load_state_dict(checkpoint['conv_net'])

    # Create robot, reset simulation and grasp handle
    model = load_model_from_path(args.model_path)
    sim = MjSim(model)
    sim_param = SimParameter(sim)
    sim.step()
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None
    robot = RobotSim(sim, viewer, sim_param, args.render, args.break_thresh)

    # Load all velcro parameters
    # model_dir = os.path.dirname(args.model_path)
    # param_path = os.path.join(model_dir, 'uniform_sample.pkl')
    param_path = '/home/jc/research/corl2019_learningHaptics/tests/test_xmls/case_{}.pickle'.format(args.case)
    velcro_params = pickle.load(open(param_path, 'rb'))
    if args.shuffle:
        random.shuffle(velcro_params)

    velcro_util = VelcroUtil(robot, sim_param)
    state_space = Observation(
        robot.get_gripper_jpos(),    # 6
        velcro_util.break_center(),  # 6
        velcro_util.break_norm())
    action_space = ActionSpace(dp=0.06, df=10)

    performance = {
        'time': [],
        'success': [],
        'num_broken': [],
        'tendon_hist': [0, 0, 0, 0, 0]
    }

    for n, item in enumerate(velcro_params):
        geom_type, origin_offset, euler, radius = item
        print('\n\nTest {} Velcro parameters are: {}, {}, {}, {}'.format(
            n, geom_type, origin_offset, euler, radius))
        change_sim(robot.mj_sim, geom_type, origin_offset, euler, radius)
        robot.reset_simulation()
        ret = robot.grasp_handle()
        broken_so_far = 0

        for t in range(args.max_iter):
            # Take an image and predict the normal directions.
            # Get image and normalize it.
            img = robot.get_img(args.img_w, args.img_h, 'c1', args.depth)
            if args.depth:
                depth = norm_depth(img[1])
                img = norm_img(img[0])
                img_norm = np.empty((4, args.img_w, args.img_h))
                img_norm[:3, :, :] = img
                img_norm[3, :, :] = depth
            else:
                img_norm = norm_img(img)
            torch_img = torch.from_numpy(img_norm).float().to(args.device).unsqueeze(0)
            pred = conv_net.forward(torch_img).detach().cpu()
            fl_norm = pred[0][3:6].numpy()
            break_dir_norm = pred[0][6:9].numpy()
            # Normalize these vectors
            fl_norm = fl_norm / la.norm(fl_norm)
            break_dir_norm = break_dir_norm / la.norm(break_dir_norm)

            # Choose action and get the action direction vector.
            # ACTIONS and action_vec are module-level constants (the six action
            # names and their unit direction vectors).
            action_direction = args.act_mag * (-0.5 * fl_norm + 0.5 * break_dir_norm)
            action_key = (action_vec @ action_direction).argmax()
            action_direction = action_space.get_action(ACTIONS[action_key])['delta'][:3]
            gripper_pose = robot.get_gripper_jpos()[:3]

            # Perform action
            target_position = np.add(robot.get_gripper_jpos()[:3], action_direction)
            target_pose = np.hstack((target_position, robot.get_gripper_jpos()[3:]))
            robot.move_joint(target_pose, True, 300, hap_sample=30)

            # Check tendons and slippage
            done, num = robot.update_tendons()
            failure = robot.check_slippage()
            if num > broken_so_far:
                broken_so_far = num

            if done or failure:
                ratio_broken = float(num) / float(NUM_TENDON)
                if ratio_broken < 0.2:
                    performance['tendon_hist'][0] += 1
                elif ratio_broken < 0.4:
                    performance['tendon_hist'][1] += 1
                elif ratio_broken < 0.6:
                    performance['tendon_hist'][2] += 1
                elif ratio_broken < 0.8:
                    performance['tendon_hist'][3] += 1
                else:
                    performance['tendon_hist'][4] += 1
                performance['num_broken'].append(num)
                if done:
                    performance['success'].append(1)
                    performance['time'].append(t + 1)
                if failure:
                    performance['success'].append(0)
                    performance['time'].append(t + 1)
                break

            if t == args.max_iter - 1:
                # Exceeded max iterations
                performance['success'].append(0)
                performance['time'].append(args.max_iter)
                ratio_broken = float(num) / float(NUM_TENDON)
                performance['num_broken'].append(num)
                if ratio_broken < 0.2:
                    performance['tendon_hist'][0] += 1
                elif ratio_broken < 0.4:
                    performance['tendon_hist'][1] += 1
                elif ratio_broken < 0.6:
                    performance['tendon_hist'][2] += 1
                elif ratio_broken < 0.8:
                    performance['tendon_hist'][3] += 1
                else:
                    performance['tendon_hist'][4] += 1

        # Print episode performance
        print('Success: {}, time: {}, num_broken: {} '.format(
            performance['success'][-1], performance['time'][-1],
            performance['num_broken'][-1]))

    print('Finished opening velcro with haptics test \n')
    success = np.array(performance['success'])
    time = np.array(performance['time'])
    print('Successfully opened the velcro in: {}% of cases'.format(
        100 * np.sum(success) / len(performance['success'])))
    print('Average time to open: {}'.format(np.average(time[success > 0])))

    out_fname = 'vision_case{}.txt'.format(args.case)
    with open(os.path.join(args.result_dir, out_fname), 'w+') as f:
        f.write('Time: {}\n'.format(performance['time']))
        f.write('Success: {}\n'.format(performance['success']))
        f.write('Successfully opened the velcro in: {}% of cases\n'.format(
            100 * np.sum(success) / len(performance['success'])))
        f.write('Average time to open: {}\n'.format(np.average(time[success > 0])))
        f.write('Num_broken: {}\n'.format(performance['num_broken']))
        f.write('Tendon histogram: {}\n'.format(performance['tendon_hist']))
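# The five-way tendon-ratio histogram above recurs in several test scripts in
# this section. A small helper sketch that factors it out (tendon_hist_index
# is a hypothetical name, not part of the original code):
def tendon_hist_index(num_broken, num_tendon):
    """Map a broken-tendon ratio to one of five bins: [0,.2), [.2,.4), [.4,.6), [.6,.8), [.8,1]."""
    ratio = float(num_broken) / float(num_tendon)
    return min(int(ratio / 0.2), 4)

# Usage: performance['tendon_hist'][tendon_hist_index(num, NUM_TENDON)] += 1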
class TrajLogger:
    def __init__(self, args, robot, velcro_util, tactile_obs_space):
        self.args = args
        self.ACTIONS = ['left', 'right', 'forward', 'backward', 'up', 'down']
        self.action_space = ActionSpace(dp=2.5 * args.act_mag, df=10)
        self.robot = robot
        self.velcro_util = velcro_util
        self.tactile_obs_space = tactile_obs_space
        self.normalizer = Multimodal_Normalizer(num_inputs=args.indim,
                                                device=args.device)
        self.normalizer.restore_state(args.normalizer_file)

    def select_action(self):
        # With probability (1 - p) follow the expert, otherwise act randomly.
        sample = random.random()
        p_threshold = self.args.p
        if sample > p_threshold:
            return self.expert_action()
        else:
            action = random.randrange(6)
            return self.action_space.get_action(self.ACTIONS[action])['delta'][:3]

    def expert_action(self):
        # Blend the front-line normal and the breaking direction into a move direction.
        norms = self.velcro_util.break_norm()
        centers = self.velcro_util.break_center()
        fl_center = centers[:3]
        fs_center = centers[3:]
        fl_norm = norms[:3]
        fs_norm = norms[3:6]
        break_dir_norm = norms[6:9]
        action_direction = self.args.act_mag * (-0.5 * fl_norm + 0.5 * break_dir_norm)
        return action_direction

    def test_network(self, performance):
        args = self.args
        max_iterations = args.max_iter
        broken_so_far = 0
        expert_traj = []

        for t in range(max_iterations):
            delta = self.select_action()

            # Sample tactile feedback and breakline info
            tactile_obs = self.tactile_obs_space.get_state()
            tactile_obs = self.normalizer.normalize(tactile_obs)
            expert_traj.append({
                'tactile': tactile_obs[:366],
                'norm': self.velcro_util.break_norm(),
                'center': self.velcro_util.break_center(),
                'gpos': self.robot.get_gripper_jpos()[:3]
            })

            # Perform action
            target_position = np.add(self.robot.get_gripper_jpos()[:3], np.array(delta))
            target_pose = np.hstack((target_position, self.robot.get_gripper_jpos()[3:]))
            self.robot.move_joint(target_pose, True, args.grip_force,
                                  hap_sample=args.hap_sample)
            self.tactile_obs_space.update(
                self.robot.get_gripper_jpos(),                     # 6
                self.robot.get_all_touch_buffer(args.hap_sample))  # 30x12

            # Get reward
            done, num = self.robot.update_tendons()
            failure = self.robot.check_slippage()
            if num > broken_so_far:
                broken_so_far = num
            if done:
                performance['success'].append(1)
                performance['time'].append(t + 1)
                return performance, expert_traj
            if failure:
                performance['success'].append(0)
                performance['time'].append(t + 1)
                return performance, expert_traj

        # Exceeded max iterations
        performance['success'].append(0)
        performance['time'].append(max_iterations)
        return performance, expert_traj
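# A worked numeric sketch of the expert-action blend in expert_action above,
# with made-up unit vectors (fl_norm: front-line surface normal,
# break_dir_norm: breaking direction) and an assumed act_mag of 0.06:
import numpy as np

act_mag = 0.06
fl_norm = np.array([0.0, 0.0, 1.0])         # hypothetical front-line normal
break_dir_norm = np.array([1.0, 0.0, 0.0])  # hypothetical breaking direction

# Push along the breaking direction while pulling against the front-line normal.
action_direction = act_mag * (-0.5 * fl_norm + 0.5 * break_dir_norm)
print(action_direction)  # -> [ 0.03  0.   -0.03]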
def train(self):
    args = self.args
    self.load_all()
    print_variable = {'loss': None}

    start_epoch = 0
    if args.weight_policy:
        if os.path.exists(args.weight_policy):
            checkpoint = torch.load(args.weight_policy)
            start_epoch = checkpoint['epoch']
            self.steps_done = start_epoch * args.num_steps
            with open(os.path.join(os.path.dirname(args.weight_policy),
                                   'results_imitation.pkl'), 'rb') as file:
                plot_dict = pickle.load(file)
                print_variable['loss'] = plot_dict['loss']

    self.set_lr()
    action_space = ActionSpace(dp=0.06, df=10)
    BCELoss = nn.BCELoss(reduction='sum')

    for ii in range(start_epoch, start_epoch + args.epochs):
        start_time = time.time()
        epoch_loss = []

        # Sample expert trajectories and labeled actions
        batch, labels = self.memory.sample(args.batch_size, args.time_step)
        # Encode labels
        onehot_labels = self.encode_label(labels)
        torch_labels = torch.tensor(onehot_labels).float().to(args.device)

        for step in range(args.num_steps):
            # Forward the networks and predict an action
            states = self.extract_ft(self.conv_net, batch)
            hidden_batch, cell_batch = self.policy_net.init_hidden_states(
                args.batch_size, args.device)
            prediction, h = self.policy_net.forward(states,
                                                    batch_size=args.batch_size,
                                                    time_step=args.time_step,
                                                    hidden_state=hidden_batch,
                                                    cell_state=cell_batch)
            ground_truth = torch_labels[:, args.time_step - 1, :].squeeze(1)
            loss = BCELoss(prediction, ground_truth)

            # Print and plot
            detached_loss = loss.detach().cpu()
            if print_variable['loss'] is not None:
                print_variable['loss'] = torch.cat(
                    (print_variable['loss'], detached_loss.unsqueeze(0)))
            else:
                print_variable['loss'] = detached_loss.unsqueeze(0)
            self.plot_variables(self.figure, print_variable,
                                'Training Imitation Agent')
            epoch_loss.append(detached_loss.item())

            # Optimize the model
            self.policy_optimizer.zero_grad()
            self.conv_optimizer.zero_grad()
            loss.backward()
            for param in self.policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            for param in self.conv_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.policy_optimizer.step()
            self.conv_optimizer.step()

        # End of training epoch
        print("Epoch {} took {}s, average loss of this epoch: {}\n\n".format(
            ii, time.time() - start_time, np.average(epoch_loss)))

        # Save checkpoints every few iterations
        if ii % args.save_freq == 0:
            policy_save_path = os.path.join(
                args.output_dir, 'policy_checkpoint_' + str(ii) + '.pth')
            torch.save({
                'epoch': ii,
                'policy_net': self.policy_net.state_dict(),
                'policy_optimizer': self.policy_optimizer.state_dict(),
            }, policy_save_path)
            conv_save_path = os.path.join(
                args.output_dir, 'conv_checkpoint_' + str(ii) + '.pth')
            torch.save({
                'epoch': ii,
                'conv_net': self.conv_net.state_dict(),
                'conv_optimizer': self.conv_optimizer.state_dict(),
            }, conv_save_path)

    savefig_path = os.path.join(args.output_dir, 'training_imitation')
    self.figure[0].savefig(savefig_path, format='png')
    print('Training done')
    plt.show()
    return print_variable
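# The per-parameter gradient clamp above can also be written with PyTorch's
# built-in utility; a behavior-equivalent sketch (clip_grad_value_ clamps each
# gradient to [-clip_value, clip_value] and skips parameters without grads):
import itertools
import torch.nn as nn

def clip_gradients(*nets, clip_value=1.0):
    params = itertools.chain(*(net.parameters() for net in nets))
    nn.utils.clip_grad_value_(params, clip_value)

# Usage, between loss.backward() and optimizer.step():
# clip_gradients(self.policy_net, self.conv_net)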
def train_POMDP(self):
    args = self.args
    ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # corl2019
    PARENT_DIR = os.path.dirname(ROOT_DIR)  # research

    # Create the output directory if it does not exist
    output_dir = os.path.join(PARENT_DIR, 'multistep_pomdp', args.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    # Write args to file
    with open(os.path.join(output_dir, 'args.txt'), 'w+') as f:
        json.dump(args.__dict__, f, indent=2)

    # Create our policy nets
    self.policy_net_1 = DRQN(args.indim, args.outdim).to(args.device)
    self.policy_net_2 = DRQN(args.indim, args.outdim).to(args.device)
    self.policy_net_3 = DRQN(args.indim, args.outdim).to(args.device)

    # Set up the optimizers
    self.optimizer_1 = optim.RMSprop(self.policy_net_1.parameters())
    self.optimizer_2 = optim.RMSprop(self.policy_net_2.parameters())
    self.optimizer_3 = optim.RMSprop(self.policy_net_3.parameters())
    self.memory = RecurrentMemory(800)
    self.steps_done = 0

    # Set up the state normalizer
    normalizer = Multimodal_Normalizer(num_inputs=args.indim, device=args.device)

    print_variables = {'durations': [], 'rewards': [], 'loss': []}
    start_episode = 0
    if args.checkpoint_file:
        if os.path.exists(args.checkpoint_file):
            checkpoint = torch.load(args.checkpoint_file)
            self.policy_net_1.load_state_dict(checkpoint['policy_net_1'])
            self.policy_net_2.load_state_dict(checkpoint['policy_net_2'])
            self.policy_net_3.load_state_dict(checkpoint['policy_net_3'])
            self.optimizer_1.load_state_dict(checkpoint['optimizer_1'])
            self.optimizer_2.load_state_dict(checkpoint['optimizer_2'])
            self.optimizer_3.load_state_dict(checkpoint['optimizer_3'])
            start_episode = checkpoint['epochs']
            self.steps_done = checkpoint['steps_done']
            with open(os.path.join(os.path.dirname(args.checkpoint_file),
                                   'results_pomdp.pkl'), 'rb') as file:
                plot_dict = pickle.load(file)
                print_variables['durations'] = plot_dict['durations']
                print_variables['rewards'] = plot_dict['rewards']
    if args.normalizer_file:
        if os.path.exists(args.normalizer_file):
            normalizer.restore_state(args.normalizer_file)
    if args.memory:
        if os.path.exists(args.memory):
            self.memory.load(args.memory)

    action_space = ActionSpace(dp=0.06, df=10)

    # Create robot, reset simulation and grasp handle
    model = load_model_from_path(args.model_path)
    sim = MjSim(model)
    sim_param = SimParameter(sim)
    sim.step()
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None
    robot = RobotSim(sim, viewer, sim_param, args.render, self.break_threshold)

    # Main training loop
    for ii in range(start_episode, args.epochs):
        start_time = time.time()
        self.steps_done += 1
        act_sequence = []
        if args.sim:
            sim_params = init_model(robot.mj_sim)
        robot.reset_simulation()
        ret = robot.grasp_handle()
        if not ret:
            continue

        # Local memory for current episode
        localMemory = []

        # Get current observation
        hidden_state_1, cell_state_1 = self.policy_net_1.init_hidden_states(
            batch_size=1, device=args.device)
        hidden_state_2, cell_state_2 = self.policy_net_2.init_hidden_states(
            batch_size=1, device=args.device)
        hidden_state_3, cell_state_3 = self.policy_net_3.init_hidden_states(
            batch_size=1, device=args.device)
        observation_space = TactileObs(
            robot.get_gripper_xpos(),                     # 24
            robot.get_all_touch_buffer(args.hap_sample))  # 30 x 7
        broken_so_far = 0

        for t in count():
            if not args.quiet and t % 50 == 0:
                print("Running training episode: {}, iteration: {}".format(ii, t))

            # Select action
            observation = observation_space.get_state()
            if args.position:
                observation = observation[6:]
            if args.shear:
                indices = np.ones(len(observation), dtype=bool)
                indices[6:166] = False
                observation = observation[indices]
            if args.force:
                observation = observation[:166]
            normalizer.observe(observation)
            observation = normalizer.normalize(observation)
            action, hidden_state_1, cell_state_1 = self.select_action(
                observation, hidden_state_1, cell_state_1)

            # Record actions in this epoch
            act_sequence.append(action)

            # Perform action
            delta = action_space.get_action(self.ACTIONS[action])['delta'][:3]
            target_position = np.add(robot.get_gripper_jpos()[:3], np.array(delta))
            target_pose = np.hstack((target_position, robot.get_gripper_jpos()[3:]))
            if args.sim:
                robot.move_joint(target_pose, True, self.gripping_force,
                                 hap_sample=args.hap_sample)

            # Get reward
            done, num = robot.update_tendons()
            failure = robot.check_slippage()
            if num > broken_so_far:
                reward = num - broken_so_far
                broken_so_far = num
            else:
                reward = 0
            # # Add a movement reward
            # reward -= 0.05 * np.linalg.norm(target_position - robot.get_gripper_jpos()[:3]) / np.linalg.norm(delta)

            # Observe new state
            observation_space.update(
                robot.get_gripper_xpos(),                     # 24
                robot.get_all_touch_buffer(args.hap_sample))  # 30x7

            # Set max number of iterations
            if t >= self.max_iter:
                done = True

            # Check if done
            if not done and not failure:
                next_state = observation_space.get_state()
                if args.position:
                    next_state = next_state[6:]
                if args.shear:
                    indices = np.ones(len(next_state), dtype=bool)
                    indices[6:166] = False
                    next_state = next_state[indices]
                if args.force:
                    next_state = next_state[:166]
                normalizer.observe(next_state)
                next_state = normalizer.normalize(next_state)
            else:
                next_state = None

            # Push new Transition into memory
            localMemory.append(Transition(observation, action, next_state, reward))

            # Optimize the model
            if t % 10 == 0:
                loss = self.optimize_model()
                # if loss:
                #     print_variables['loss'].append(loss.item())

            # If we are done, reset the model
            if done or failure:
                self.memory.push(localMemory)
                if failure:
                    print_variables['durations'].append(self.max_iter)
                else:
                    print_variables['durations'].append(t)
                print_variables['rewards'].append(broken_so_far)
                plot_variables(self.figure, print_variables, "Training POMDP")
                print("Model parameters: {}".format(sim_params))
                print("Actions in this epoch are: {}".format(act_sequence))
                print("Epoch {} took {}s, total number broken: {}\n\n".format(
                    ii, time.time() - start_time, broken_so_far))
                break

        # Save checkpoints every few iterations
        if ii % args.save_freq == 0:
            save_path = os.path.join(output_dir, 'checkpoint_model_' + str(ii) + '.pth')
            torch.save({
                'epochs': ii,
                'steps_done': self.steps_done,
                'policy_net_1': self.policy_net_1.state_dict(),
                'policy_net_2': self.policy_net_2.state_dict(),
                'policy_net_3': self.policy_net_3.state_dict(),
                'optimizer_1': self.optimizer_1.state_dict(),
                'optimizer_2': self.optimizer_2.state_dict(),
                'optimizer_3': self.optimizer_3.state_dict(),
            }, save_path)
            self.memory.save_memory(os.path.join(output_dir, 'memory.pickle'))

    if args.savefig_path:
        now = dt.datetime.now()
        self.figure[0].savefig(
            args.savefig_path + '{}_{}_{}.png'.format(now.month, now.day, now.hour),
            format='png')

    print('Training done')
    plt.show()
    return print_variables
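# The position/shear/force slicing above is applied twice per step here and
# again in train_MDP below. A helper sketch that factors it out
# (mask_observation is a hypothetical name; the index ranges follow the code
# above: the first 6 entries are gripper pose, entries 6:166 are shear channels):
import numpy as np

def mask_observation(obs, args):
    if args.position:   # drop gripper pose
        obs = obs[6:]
    if args.shear:      # drop shear channels
        keep = np.ones(len(obs), dtype=bool)
        keep[6:166] = False
        obs = obs[keep]
    if args.force:      # keep pose + normal-force channels only
        obs = obs[:166]
    return obs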
def train_MDP(self):
    args = self.args

    # Create the output directory if it does not exist
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    # Create our policy nets and a target net
    self.policy_net_1 = DQN(args.indim, args.outdim).to(args.device)
    self.policy_net_2 = DQN(args.indim, args.outdim).to(args.device)
    self.policy_net_3 = DQN(args.indim, args.outdim).to(args.device)
    self.target_net = DQN(args.indim, args.outdim).to(args.device)
    self.target_net.load_state_dict(self.policy_net_1.state_dict())
    self.target_net.eval()

    # Set up the optimizers
    self.optimizer_1 = optim.RMSprop(self.policy_net_1.parameters(), args.lr)
    self.optimizer_2 = optim.RMSprop(self.policy_net_2.parameters(), args.lr)
    self.optimizer_3 = optim.RMSprop(self.policy_net_3.parameters(), args.lr)
    self.memory = ReplayMemory(500000)
    self.steps_done = 0

    # Set up the state normalizer
    normalizer = Normalizer(args.indim, device=args.device)

    print_variables = {'durations': [], 'rewards': [], 'loss': []}

    # Load old checkpoint if provided
    start_episode = 0
    if args.checkpoint_file:
        if os.path.exists(args.checkpoint_file):
            checkpoint = torch.load(args.checkpoint_file)
            self.policy_net_1.load_state_dict(checkpoint['model_state_dict'])
            self.policy_net_2.load_state_dict(checkpoint['model_state_dict'])
            self.policy_net_3.load_state_dict(checkpoint['model_state_dict'])
            self.target_net.load_state_dict(checkpoint['model_state_dict'])
            start_episode = checkpoint['epoch']
            self.steps_done = start_episode
            self.optimizer_1.load_state_dict(checkpoint['optimizer_state_dict'])
            self.optimizer_2.load_state_dict(checkpoint['optimizer_state_dict'])
            self.optimizer_3.load_state_dict(checkpoint['optimizer_state_dict'])
            with open(os.path.join(os.path.dirname(args.checkpoint_file),
                                   'results_geom_mdp.pkl'), 'rb') as file:
                plot_dict = pickle.load(file)
                print_variables['durations'] = plot_dict['durations']
                print_variables['rewards'] = plot_dict['rewards']
    if args.normalizer_file:
        if os.path.exists(args.normalizer_file):
            normalizer.restore_state(args.normalizer_file)

    action_space = ActionSpace(dp=0.06, df=10)

    # Main training loop
    for ii in range(start_episode, args.epochs):
        start_time = time.time()
        if args.sim:
            # Create robot, reset simulation and grasp handle
            model, model_params = init_model(args.model_path)
            sim = MjSim(model)
            sim.step()
            if args.render:
                viewer = MjViewer(sim)
            else:
                viewer = None
            sim_param = SimParameter(sim)
            robot = RobotSim(sim, viewer, sim_param, args.render,
                             self.breaking_threshold)
            robot.reset_simulation()
            ret = robot.grasp_handle()
            if not ret:
                continue

        # Get current state
        state_space = Observation(
            robot.get_gripper_jpos(),
            robot.get_shear_buffer(args.hap_sample),
            robot.get_all_touch_buffer(args.hap_sample))
        broken_so_far = 0

        for t in count():
            if not args.quiet and t % 20 == 0:
                print("Running training episode: {}, iteration: {}".format(ii, t))

            # Select action
            state = state_space.get_state()
            if args.position:
                state = state[6:]
            if args.shear:
                indices = np.ones(len(state), dtype=bool)
                indices[6:166] = False
                state = state[indices]
            if args.force:
                state = state[:166]
            normalizer.observe(state)
            state = normalizer.normalize(state)
            action = self.select_action(state)

            # Perform action
            delta = action_space.get_action(self.ACTIONS[action])['delta'][:3]
            target_position = np.add(state_space.get_current_position(),
                                     np.array(delta))
            target_pose = np.hstack((target_position, robot.get_gripper_jpos()[3:]))
            if args.sim:
                robot.move_joint(target_pose, True, self.gripping_force,
                                 hap_sample=args.hap_sample)

            # Get reward
            done, num = robot.update_tendons()
            failure = robot.check_slippage()
            if num > broken_so_far:
                reward = num - broken_so_far
                broken_so_far = num
            else:
                reward = 0
            # # Add a movement reward
            # reward -= 0.1 * np.linalg.norm(target_position - robot.get_gripper_jpos()[:3]) / np.linalg.norm(delta)

            # Observe new state
            state_space.update(
                robot.get_gripper_jpos(),
                robot.get_shear_buffer(args.hap_sample),
                robot.get_all_touch_buffer(args.hap_sample))

            # Set max number of iterations
            if t >= self.max_iter:
                done = True

            # Check if done
            if not done and not failure:
                next_state = state_space.get_state()
                if args.position:
                    next_state = next_state[6:]
                if args.shear:
                    indices = np.ones(len(next_state), dtype=bool)
                    indices[6:166] = False
                    next_state = next_state[indices]
                if args.force:
                    next_state = next_state[:166]
                normalizer.observe(next_state)
                next_state = normalizer.normalize(next_state)
            else:
                next_state = None

            # Push new Transition into memory
            self.memory.push(state, action, next_state, reward)

            # Optimize the model
            loss = self.optimize_model()
            # if loss:
            #     print_variables['loss'].append(loss.item())

            # If we are done, reset the model
            if done or failure:
                if failure:
                    print_variables['durations'].append(self.max_iter)
                else:
                    print_variables['durations'].append(t)
                print_variables['rewards'].append(broken_so_far)
                plot_variables(self.figure, print_variables, 'Training MDP')
                print("Model parameters: {}".format(model_params))
                print("Epoch {} took {}s, total number broken: {}\n\n".format(
                    ii, time.time() - start_time, broken_so_far))
                break

        # Update the target network every 10 episodes
        if ii % 10 == 0:
            self.target_net.load_state_dict(self.policy_net_1.state_dict())

        # Save checkpoints every few iterations
        if ii % args.save_freq == 0:
            save_path = os.path.join(args.output_dir,
                                     'checkpoint_model_' + str(ii) + '.pth')
            torch.save({
                'epoch': ii,
                'model_state_dict': self.target_net.state_dict(),
                'optimizer_state_dict': self.optimizer_1.state_dict(),
            }, save_path)
            # Save normalizer state for inference
            normalizer.save_state(os.path.join(args.output_dir,
                                               'normalizer_state.pickle'))

    if args.savefig_path:
        now = dt.datetime.now()
        self.figure[0].savefig(
            args.savefig_path + '{}_{}_{}'.format(now.month, now.day, now.hour),
            format='png')

    print('Training done')
    plt.show()
    return print_variables
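# train_MDP hard-syncs the target network every 10 episodes. A common
# alternative is a Polyak (soft) update after each optimizer step; a minimal
# sketch under that assumption (tau is a hypothetical smoothing coefficient,
# not part of the original code):
import torch

def soft_update(target_net, policy_net, tau=0.005):
    with torch.no_grad():
        for tp, pp in zip(target_net.parameters(), policy_net.parameters()):
            tp.data.mul_(1.0 - tau).add_(tau * pp.data)

# Usage: soft_update(self.target_net, self.policy_net_1)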
class TrajLogger:
    def __init__(self, args, robot, velcro_util):
        self.args = args
        self.ACTIONS = ['left', 'right', 'forward', 'backward', 'up', 'down']
        self.action_space = ActionSpace(dp=2.5 * args.act_mag, df=10)
        self.robot = robot
        self.velcro_util = velcro_util

    def select_action(self):
        # With probability (1 - p) follow the expert, otherwise act randomly.
        sample = random.random()
        p_threshold = self.args.p
        if sample > p_threshold:
            return self.expert_action()
        else:
            action = random.randrange(6)
            return self.action_space.get_action(self.ACTIONS[action])['delta'][:3]

    def expert_action(self):
        norms = self.velcro_util.break_norm()
        centers = self.velcro_util.break_center()
        fl_center = centers[:3]
        fs_center = centers[3:]
        fl_norm = norms[:3]
        fs_norm = norms[3:6]
        break_dir_norm = norms[6:9]
        action_direction = self.args.act_mag * (-0.5 * fl_norm + 0.5 * break_dir_norm)
        return action_direction

    def test_network(self, performance):
        args = self.args
        max_iterations = args.max_iter
        broken_so_far = 0
        expert_traj = []

        for t in range(max_iterations):
            delta = self.select_action()

            # Sample images and breakline info
            sample = random.random()
            if sample < args.sample_ratio:
                img = self.robot.get_img(args.img_w, args.img_h, 'c1', args.depth)
                if args.depth:
                    depth = norm_depth(img[1])
                    img = norm_img(img[0])
                    img_norm = np.empty((4, args.img_w, args.img_h))
                    img_norm[:3, :, :] = img
                    img_norm[3, :, :] = depth
                else:
                    img_norm = norm_img(img)
                expert_traj.append({
                    'image': img_norm,
                    'norm': self.velcro_util.break_norm(),
                    'center': self.velcro_util.break_center(),
                    'gpos': self.robot.get_gripper_jpos()[:3]
                })

            # Perform action
            target_position = np.add(self.robot.get_gripper_jpos()[:3], np.array(delta))
            target_pose = np.hstack((target_position, self.robot.get_gripper_jpos()[3:]))
            self.robot.move_joint(target_pose, True, args.grip_force,
                                  hap_sample=args.hap_sample)

            # Get reward
            done, num = self.robot.update_tendons()
            failure = self.robot.check_slippage()
            if num > broken_so_far:
                broken_so_far = num
            if done:
                performance['success'].append(1)
                performance['time'].append(t + 1)
                return performance, expert_traj
            if failure:
                performance['success'].append(0)
                performance['time'].append(t + 1)
                return performance, expert_traj

        # Exceeded max iterations
        performance['success'].append(0)
        performance['time'].append(max_iterations)
        return performance, expert_traj
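# The RGB(-D) normalize-and-stack pattern in test_network above also appears in
# the vision test script and the POMDP trainer below. A helper sketch that
# factors it out (to_net_input is a hypothetical name; it assumes the existing
# norm_img/norm_depth utilities and that get_img returns an (rgb, depth) pair
# when depth=True):
import numpy as np

def to_net_input(robot, args):
    img = robot.get_img(args.img_w, args.img_h, 'c1', args.depth)
    if args.depth:
        rgb = norm_img(img[0])
        depth = norm_depth(img[1])
        out = np.empty((4, args.img_w, args.img_h))
        out[:3, :, :] = rgb    # normalized color channels
        out[3, :, :] = depth   # normalized depth channel
        return out
    return norm_img(img)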
def train_POMDP(self):
    args = self.args

    # Create the output directory if it does not exist
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    # Create our policy nets and conv nets
    self.policy_net_1 = DRQN(args.indim, args.outdim).to(args.device)
    self.policy_net_2 = DRQN(args.indim, args.outdim).to(args.device)
    self.conv_net_1 = ConvNet(args.ftdim, args.depth).to(args.device)
    self.conv_net_2 = ConvNet(args.ftdim, args.depth).to(args.device)

    # Set up the optimizers
    self.policy_optimizer_1 = optim.RMSprop(self.policy_net_1.parameters(), lr=args.lr)
    self.policy_optimizer_2 = optim.RMSprop(self.policy_net_2.parameters(), lr=args.lr)
    self.conv_optimizer_1 = optim.RMSprop(self.conv_net_1.parameters(), lr=1e-5)
    self.conv_optimizer_2 = optim.RMSprop(self.conv_net_2.parameters(), lr=1e-5)
    self.memory = RecurrentMemory(70)
    self.steps_done = 0

    # Set up the state normalizer (tactile inputs only)
    normalizer = Multimodal_Normalizer(num_inputs=args.indim - args.ftdim,
                                       device=args.device)

    print_variables = {'durations': [], 'rewards': [], 'loss': []}
    start_episode = 0
    if args.checkpoint_file:
        if os.path.exists(args.checkpoint_file):
            checkpoint = torch.load(args.checkpoint_file)
            self.policy_net_1.load_state_dict(checkpoint['policy_net_1'])
            self.policy_net_2.load_state_dict(checkpoint['policy_net_2'])
            self.conv_net_1.load_state_dict(checkpoint['conv_net_1'])
            self.conv_net_2.load_state_dict(checkpoint['conv_net_2'])
            self.policy_optimizer_1.load_state_dict(checkpoint['policy_optimizer_1'])
            self.policy_optimizer_2.load_state_dict(checkpoint['policy_optimizer_2'])
            self.conv_optimizer_1.load_state_dict(checkpoint['conv_optimizer_1'])
            self.conv_optimizer_2.load_state_dict(checkpoint['conv_optimizer_2'])
            start_episode = checkpoint['epoch']
            self.steps_done = checkpoint['steps_done']
            with open(os.path.join(os.path.dirname(args.checkpoint_file),
                                   'results_pomdp.pkl'), 'rb') as file:
                plot_dict = pickle.load(file)
                print_variables['durations'] = plot_dict['durations']
                print_variables['rewards'] = plot_dict['rewards']
    if args.normalizer_file:
        if os.path.exists(args.normalizer_file):
            normalizer.restore_state(args.normalizer_file)
    if args.memory:
        if os.path.exists(args.memory):
            self.memory.load(args.memory)
    if args.weight_conv:
        checkpoint = torch.load(args.weight_conv)
        self.conv_net_1.load_state_dict(checkpoint['conv_net'])
        self.conv_optimizer_1.load_state_dict(checkpoint['conv_optimizer'])
        self.conv_net_2.load_state_dict(checkpoint['conv_net'])
        self.conv_optimizer_2.load_state_dict(checkpoint['conv_optimizer'])

    action_space = ActionSpace(dp=0.06, df=10)

    # Create robot, reset simulation and grasp handle
    model = load_model_from_path(args.model_path)
    sim = MjSim(model)
    sim_param = SimParameter(sim)
    sim.step()
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None
    robot = RobotSim(sim, viewer, sim_param, args.render, self.break_threshold)
    tactile_obs_space = TactileObs(
        robot.get_gripper_jpos(),                     # 6
        robot.get_all_touch_buffer(args.hap_sample))  # 30 x 12

    # Main training loop
    for ii in range(start_episode, args.epochs):
        start_time = time.time()
        act_sequence = []
        velcro_params = init_model(robot.mj_sim)
        robot.reset_simulation()
        ret = robot.grasp_handle()
        if not ret:
            continue

        # Local memory for current episode
        localMemory = []

        # Get current observation
        hidden_state_1, cell_state_1 = self.policy_net_1.init_hidden_states(
            batch_size=1, device=args.device)
        hidden_state_2, cell_state_2 = self.policy_net_2.init_hidden_states(
            batch_size=1, device=args.device)
        broken_so_far = 0

        for t in count():
            if not args.quiet and t % 50 == 0:
                print("Running training episode: {}, iteration: {}".format(ii, t))

            # Select action
            tactile_obs = tactile_obs_space.get_state()
            normalizer.observe(tactile_obs)
            tactile_obs = normalizer.normalize(tactile_obs)
            # Get image and normalize it
            img = robot.get_img(args.img_w, args.img_h, 'c1', args.depth)
            if args.depth:
                depth = norm_depth(img[1])
                img = norm_img(img[0])
                img_norm = np.empty((4, args.img_w, args.img_h))
                img_norm[:3, :, :] = img
                img_norm[3, :, :] = depth
            else:
                img_norm = norm_img(img)
            observation = [tactile_obs, img_norm]
            action, hidden_state_1, cell_state_1 = self.select_action(
                observation, hidden_state_1, cell_state_1)

            # Record actions in this epoch
            act_sequence.append(action)

            # Perform action
            delta = action_space.get_action(self.ACTIONS[action])['delta'][:3]
            target_position = np.add(robot.get_gripper_jpos()[:3], np.array(delta))
            target_pose = np.hstack((target_position, robot.get_gripper_jpos()[3:]))
            robot.move_joint(target_pose, True, self.gripping_force,
                             hap_sample=args.hap_sample)

            # Get reward
            done, num = robot.update_tendons()
            failure = robot.check_slippage()
            if num > broken_so_far:
                reward = num - broken_so_far
                broken_so_far = num
            else:
                reward = 0

            # Observe new state
            tactile_obs_space.update(
                robot.get_gripper_jpos(),                     # 6
                robot.get_all_touch_buffer(args.hap_sample))  # 30x12

            # Set max number of iterations
            if t >= self.max_iter:
                done = True

            # Check if done
            if not done and not failure:
                next_tactile_obs = tactile_obs_space.get_state()
                normalizer.observe(next_tactile_obs)
                next_tactile_obs = normalizer.normalize(next_tactile_obs)
                # Get image and normalize it
                next_img = robot.get_img(args.img_w, args.img_h, 'c1', args.depth)
                if args.depth:
                    next_depth = norm_depth(next_img[1])
                    next_img = norm_img(next_img[0])
                    next_img_norm = np.empty((4, args.img_w, args.img_h))
                    next_img_norm[:3, :, :] = next_img
                    next_img_norm[3, :, :] = next_depth
                else:
                    next_img_norm = norm_img(next_img)
                next_state = [next_tactile_obs, next_img_norm]
            else:
                next_state = None

            # Push new Transition into memory
            localMemory.append(Transition(observation, action, next_state, reward))

            # Optimize the model
            if self.steps_done % 10 == 0:
                self.optimize()

            # If we are done, reset the model
            if done or failure:
                self.memory.push(localMemory)
                if failure:
                    print_variables['durations'].append(self.max_iter)
                else:
                    print_variables['durations'].append(t)
                print_variables['rewards'].append(broken_so_far)
                plot_variables(self.figure, print_variables, "Training POMDP")
                print("Model parameters: {}".format(velcro_params))
                print("Actions in this epoch are: {}".format(act_sequence))
                print("Epoch {} took {}s, total number broken: {}\n\n".format(
                    ii, time.time() - start_time, broken_so_far))
                break

        # Save checkpoints every few iterations
        if ii % args.save_freq == 0:
            save_path = os.path.join(args.output_dir,
                                     'checkpoint_model_' + str(ii) + '.pth')
            torch.save({
                'epochs': ii,
                'steps_done': self.steps_done,
                'conv_net_1': self.conv_net_1.state_dict(),
                'conv_net_2': self.conv_net_2.state_dict(),
                'policy_net_1': self.policy_net_1.state_dict(),
                'policy_net_2': self.policy_net_2.state_dict(),
                'conv_optimizer_1': self.conv_optimizer_1.state_dict(),
                'conv_optimizer_2': self.conv_optimizer_2.state_dict(),
                'policy_optimizer_1': self.policy_optimizer_1.state_dict(),
                'policy_optimizer_2': self.policy_optimizer_2.state_dict(),
            }, save_path)
            # Save normalizer state for inference
            normalizer.save_state(os.path.join(args.output_dir,
                                               'normalizer_state.pickle'))
            self.memory.save_memory(os.path.join(args.output_dir, 'memory.pickle'))

    if args.savefig_path:
        now = dt.datetime.now()
        self.figure[0].savefig(
            args.savefig_path + '{}_{}_{}.png'.format(now.month, now.day, now.hour),
            format='png')

    print('Training done')
    plt.show()
    return print_variables
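# To resume the run above from its saved artifacts, point the existing flags at
# them (flag names assume a conventional argparse mapping of the args.*
# attributes used in this function; the script name and paths are hypothetical):
#
#   python train_pomdp.py \
#       --checkpoint_file out/checkpoint_model_100.pth \
#       --normalizer_file out/normalizer_state.pickle \
#       --memory out/memory.pickle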
def test_network(args, policy_net, tactile_net, normalizer, robot, obs_space,
                 performance):
    hidden_state, cell_state = policy_net.init_hidden_states(1, args.device)
    action_space = ActionSpace(dp=0.06, df=10)
    ACTIONS = ['left', 'right', 'forward', 'backward', 'up', 'down']

    broken_so_far = 0
    t = 0
    action = 4
    collision = 0

    while t < args.max_iter:
        if args.position:
            multistep_obs = np.empty((0, args.indim - 6))
        elif args.force:
            multistep_obs = np.empty((0, args.indim - 390))
        else:
            multistep_obs = np.empty((0, args.indim))
        prev_action = action

        for k in range(args.len_ub):
            # Observe tactile features and stack them
            tactile_obs = obs_space.get_state()
            normalizer.observe(tactile_obs)
            tactile_obs = normalizer.normalize(tactile_obs)
            if args.position:
                tactile_obs = tactile_obs[6:]
            elif args.force:
                tactile_obs = tactile_obs[:6]
            multistep_obs = np.vstack((multistep_obs, tactile_obs))

            # Current jpos
            current_pos = robot.get_gripper_jpos()[:3]

            # Perform action
            delta = action_space.get_action(ACTIONS[action])['delta'][:3]
            target_position = np.add(robot.get_gripper_jpos()[:3], np.array(delta))
            target_pose = np.hstack((target_position, robot.get_gripper_jpos()[3:]))
            robot.move_joint(target_pose, True, args.grip_force,
                             hap_sample=args.hap_sample)

            # Check collision number
            collision += robot.feedback_buffer['collision']

            # Observe new state
            obs_space.update(
                robot.get_gripper_xpos(),                     # 24
                robot.get_all_touch_buffer(args.hap_sample))  # 30x6

            # Stop repeating the action early if the gripper barely moved
            displacement = la.norm(robot.get_gripper_jpos()[:3] - current_pos)
            if displacement / 0.06 < 0.7:
                break

        # Feed the stitched multi-step tactile observation into the tactile net
        # to generate a tactile feature, then pick the next action
        action, hidden_state, cell_state = select_action(
            args, multistep_obs, policy_net, tactile_net, hidden_state, cell_state)

        # Get reward
        done, num = robot.update_tendons()
        failure = robot.check_slippage()
        if num > broken_so_far:
            broken_so_far = num
        t += k + 1

        if done or failure:
            ratio_broken = float(num) / float(NUM_TENDON)
            if ratio_broken < 0.2:
                performance['tendon_hist'][0] += 1
            elif ratio_broken < 0.4:
                performance['tendon_hist'][1] += 1
            elif ratio_broken < 0.6:
                performance['tendon_hist'][2] += 1
            elif ratio_broken < 0.8:
                performance['tendon_hist'][3] += 1
            else:
                performance['tendon_hist'][4] += 1
            performance['num_broken'].append(num)
            if done:
                performance['success'].append(1)
                performance['time'].append(t + 1)
            if failure:
                performance['success'].append(0)
                performance['time'].append(t + 1)
            performance['collision'].append(collision)
            return performance

    # Exceeded max iterations
    performance['success'].append(0)
    performance['time'].append(args.max_iter)
    ratio_broken = float(num) / float(NUM_TENDON)
    performance['num_broken'].append(num)
    if ratio_broken < 0.2:
        performance['tendon_hist'][0] += 1
    elif ratio_broken < 0.4:
        performance['tendon_hist'][1] += 1
    elif ratio_broken < 0.6:
        performance['tendon_hist'][2] += 1
    elif ratio_broken < 0.8:
        performance['tendon_hist'][3] += 1
    else:
        performance['tendon_hist'][4] += 1
    performance['collision'].append(collision)
    return performance
def test_network(self, performance):
    args = self.args
    max_iterations = args.max_iter

    # Get current state
    state_space = Observation(
        self.robot.get_gripper_jpos(),     # 6
        self.velcro_util.break_center(),   # 6
        self.velcro_util.break_norm())     # 12
    tactile_obs_space = TactileObs(
        self.robot.get_gripper_jpos(),                     # 6
        self.robot.get_all_touch_buffer(args.hap_sample))  # 30 x 12
    action_space = ActionSpace(dp=0.06, df=10)
    broken_so_far = 0
    expert_traj = {'image': None, 'tactile': [], 'action': [], 'position': []}

    # Record the initial camera image for this trajectory
    img = self.robot.get_img(args.img_w, args.img_h, 'c1', args.depth)
    if args.depth:
        depth = norm_depth(img[1])
        img = norm_img(img[0])
        img_norm = np.empty((4, args.img_w, args.img_h))
        img_norm[:3, :, :] = img
        img_norm[3, :, :] = depth
    else:
        img_norm = norm_img(img)
    expert_traj['image'] = img_norm

    for t in range(max_iterations):
        # Observe state and normalize
        state = state_space.get_state()
        # self.normalizer.observe(state[:12])
        state[:12] = self.normalizer.normalize(state[:12])
        action = self.select_action(self.policy_net, state)
        performance['action_hist'][action] += 1

        # Record tactile observation, gripper position and the chosen action
        tactile_obs_space.update(
            self.robot.get_gripper_jpos(),                     # 6
            self.robot.get_all_touch_buffer(args.hap_sample))
        tactile_obs = tactile_obs_space.get_state()
        tactile_obs = self.tactile_normalizer.normalize(tactile_obs)
        expert_traj['tactile'].append(tactile_obs.tolist())
        expert_traj['action'].append(action)
        expert_traj['position'].append(self.robot.get_gripper_jpos()[:3].tolist())

        # Perform action
        delta = action_space.get_action(self.ACTIONS[action])['delta'][:3]
        target_position = np.add(self.robot.get_gripper_jpos()[:3], np.array(delta))
        target_pose = np.hstack((target_position, self.robot.get_gripper_jpos()[3:]))
        self.robot.move_joint(target_pose, True, args.grip_force,
                              hap_sample=args.hap_sample)

        # Get reward
        done, num = self.robot.update_tendons()
        failure = self.robot.check_slippage()
        if num > broken_so_far:
            broken_so_far = num

        if not done and not failure:
            # Observe new state
            state_space.update(
                self.robot.get_gripper_jpos(),     # 6
                self.velcro_util.break_center(),   # 6
                self.velcro_util.break_norm())     # 12
        else:
            if done:
                performance['success'].append(1)
                performance['time'].append(t + 1)
            if failure:
                performance['success'].append(0)
                performance['time'].append(t + 1)
            return performance, expert_traj

    # Exceeded max iterations
    performance['success'].append(0)
    performance['time'].append(max_iterations)
    return performance, expert_traj
def main(args):
    # Create robot, reset simulation and grasp handle
    model = load_model_from_path(args.model_path)
    sim = MjSim(model)
    sim_param = SimParameter(sim)
    sim.step()
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None
    robot = RobotSim(sim, viewer, sim_param, args.render, args.break_thresh)

    # Load all velcro parameters (note: the loop below iterates the module-level
    # VELCRO_PARAMS constant, not the pickle loaded here)
    model_dir = os.path.dirname(args.model_path)
    param_path = os.path.join(model_dir, 'uniform_sample.pkl')
    velcro_params = pickle.load(open(param_path, 'rb'))

    velcro_util = VelcroUtil(robot, sim_param)
    state_space = Observation(
        robot.get_gripper_jpos(),    # 6
        velcro_util.break_center(),  # 6
        velcro_util.break_norm())
    action_space = ActionSpace(dp=0.06, df=10)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.clear()

    for item in VELCRO_PARAMS:
        geom_type, origin_offset, euler, radius = item
        print('Velcro parameters are: {}, {}, {}, {}'.format(
            geom_type, origin_offset, euler, radius))
        change_sim(robot.mj_sim, geom_type, origin_offset, euler, radius)
        robot.reset_simulation()
        ret = robot.grasp_handle()
        broken_so_far = 0
        ax.clear()

        for i in range(args.max_iter):
            # Check velcro breakline geometry
            norms = velcro_util.break_norm()
            centers = velcro_util.break_center()
            fl_center = centers[:3]
            fs_center = centers[3:]
            fl_norm = norms[:3]
            fs_norm = norms[3:6]
            break_dir_norm = norms[6:9]
            action_direction = args.act_mag * (-0.5 * fl_norm + 0.5 * break_dir_norm)
            gripper_pose = robot.get_gripper_jpos()[:3]

            # Perform action
            target_position = np.add(robot.get_gripper_jpos()[:3], action_direction)
            target_pose = np.hstack((target_position, robot.get_gripper_jpos()[3:]))
            robot.move_joint(target_pose, True, 300, hap_sample=30)

            # Plot breakline centers and normals
            ax.scatter(fl_center[0], fl_center[1], fl_center[2], c='r', marker='o')
            ax.scatter(fs_center[0], fs_center[1], fs_center[2], c='r', marker='o')
            X, Y, Z = lines(fl_center, fl_norm)
            ax.plot(X, Y, Z, c='b')
            X, Y, Z = lines(fs_center, fs_norm)
            ax.plot(X, Y, Z, c='b')

            # Plot gripper position and action direction
            ax.scatter(gripper_pose[0], gripper_pose[1], gripper_pose[2],
                       c='r', marker='v')
            X, Y, Z = lines(gripper_pose, action_direction / args.act_mag)
            ax.plot(X, Y, Z, c='g')
            plt.pause(0.001)

            # Check tendons and slippage
            done, num = robot.update_tendons()
            failure = robot.check_slippage()
            if num > broken_so_far:
                broken_so_far = num
            if done or failure:
                break
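# `lines` is called above but not defined in this section. A plausible sketch,
# stated as an assumption: it returns the coordinate arrays of a short segment
# starting at `center` and extending along `norm`, in the (X, Y, Z) form that
# ax.plot expects (the 0.1 segment length is a guess):
import numpy as np

def lines(center, norm, length=0.1):
    end = np.asarray(center) + length * np.asarray(norm)
    return ([center[0], end[0]],
            [center[1], end[1]],
            [center[2], end[2]])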
def test_network(args, policy_net, normalizer, robot, state_space, performance):
    hidden_state, cell_state = policy_net.init_hidden_states(args.batch_size,
                                                             args.device)
    action_space = ActionSpace(dp=0.06, df=10)
    ACTIONS = ['left', 'right', 'forward', 'backward', 'up', 'down']

    broken_so_far = 0
    t = 0
    action = 4
    collision = 0

    while t < args.max_iter:
        # Observe state and normalize
        state = state_space.get_state()
        normalizer.observe(state)
        state = normalizer.normalize(state)
        torch_observation = torch.from_numpy(state).float().to(args.device).unsqueeze(0)
        model_out = policy_net(torch_observation, batch_size=1, time_step=1,
                               hidden_state=hidden_state, cell_state=cell_state)
        out = model_out[0]
        hidden_state = model_out[1][0]
        cell_state = model_out[1][1]
        action_key = int(torch.argmax(out[0]))
        action = select_action(action_key)
        performance['action_hist'][action] += 1

        # Perform action
        delta = action_space.get_action(ACTIONS[action_key])['delta'][:3]
        target_position = np.add(robot.get_gripper_jpos()[:3], np.array(delta))
        target_pose = np.hstack((target_position, robot.get_gripper_jpos()[3:]))
        robot.move_joint(target_pose, True, args.grip, hap_sample=args.hap_sample)

        # Get reward
        done, num = robot.update_tendons()
        failure = robot.check_slippage()

        # Observe new state
        state_space.update(
            robot.get_gripper_xpos(),                     # 24
            robot.get_all_touch_buffer(args.hap_sample))  # 30x6

        if num > broken_so_far:
            broken_so_far = num
        t += 1

        if done or failure:
            ratio_broken = float(num) / float(NUM_TENDON)
            if ratio_broken < 0.2:
                performance['tendon_hist'][0] += 1
            elif ratio_broken < 0.4:
                performance['tendon_hist'][1] += 1
            elif ratio_broken < 0.6:
                performance['tendon_hist'][2] += 1
            elif ratio_broken < 0.8:
                performance['tendon_hist'][3] += 1
            else:
                performance['tendon_hist'][4] += 1
            performance['num_broken'].append(num)
            if done:
                performance['success'].append(1)
                performance['time'].append(t + 1)
            if failure:
                performance['success'].append(0)
                performance['time'].append(t + 1)
            performance['collision'].append(collision)
            return performance

    # Exceeded max iterations
    performance['success'].append(0)
    performance['time'].append(args.max_iter)
    ratio_broken = float(num) / float(NUM_TENDON)
    performance['num_broken'].append(num)
    if ratio_broken < 0.2:
        performance['tendon_hist'][0] += 1
    elif ratio_broken < 0.4:
        performance['tendon_hist'][1] += 1
    elif ratio_broken < 0.6:
        performance['tendon_hist'][2] += 1
    elif ratio_broken < 0.8:
        performance['tendon_hist'][3] += 1
    else:
        performance['tendon_hist'][4] += 1
    performance['collision'].append(collision)
    return performance
def train_POMDP(self):
    args = self.args
    ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # corl2019
    PARENT_DIR = os.path.dirname(ROOT_DIR)  # research

    # Create the output directory if it does not exist
    output_dir = os.path.join(PARENT_DIR, 'multistep_pomdp', args.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    # Write args to file
    with open(os.path.join(output_dir, 'args.txt'), 'w+') as f:
        json.dump(args.__dict__, f, indent=2)

    # Create our policy nets and tactile nets
    self.policy_net_1 = DRQN(args.ftdim, args.outdim).to(args.device)
    self.policy_net_2 = DRQN(args.ftdim, args.outdim).to(args.device)
    if args.position:
        self.tactile_net_1 = TactileNet(args.indim - 6, args.ftdim).to(args.device)
        self.tactile_net_2 = TactileNet(args.indim - 6, args.ftdim).to(args.device)
    elif args.force:
        self.tactile_net_1 = TactileNet(args.indim - 390, args.ftdim).to(args.device)
        self.tactile_net_2 = TactileNet(args.indim - 390, args.ftdim).to(args.device)
    else:
        self.tactile_net_1 = TactileNet(args.indim, args.ftdim).to(args.device)
        self.tactile_net_2 = TactileNet(args.indim, args.ftdim).to(args.device)

    # Set up the optimizers
    self.policy_optimizer_1 = optim.RMSprop(self.policy_net_1.parameters(), lr=args.lr)
    self.policy_optimizer_2 = optim.RMSprop(self.policy_net_2.parameters(), lr=args.lr)
    self.tactile_optimizer_1 = optim.RMSprop(self.tactile_net_1.parameters(), lr=args.lr)
    self.tactile_optimizer_2 = optim.RMSprop(self.tactile_net_2.parameters(), lr=args.lr)
    self.memory = RecurrentMemory(800)
    self.steps_done = 0

    # Set up the state normalizer
    normalizer = Multimodal_Normalizer(num_inputs=args.indim, device=args.device)

    print_variables = {'durations': [], 'rewards': [], 'loss': []}
    start_episode = 0
    if args.weight_policy:
        if os.path.exists(args.weight_policy):
            checkpoint = torch.load(args.weight_policy)
            self.policy_net_1.load_state_dict(checkpoint['policy_net_1'])
            self.policy_net_2.load_state_dict(checkpoint['policy_net_2'])
            self.policy_optimizer_1.load_state_dict(checkpoint['policy_optimizer_1'])
            self.policy_optimizer_2.load_state_dict(checkpoint['policy_optimizer_2'])
            start_episode = checkpoint['epochs']
            self.steps_done = checkpoint['steps_done']
            with open(os.path.join(os.path.dirname(args.weight_policy),
                                   'results_pomdp.pkl'), 'rb') as file:
                plot_dict = pickle.load(file)
                print_variables['durations'] = plot_dict['durations']
                print_variables['rewards'] = plot_dict['rewards']
    if args.normalizer_file:
        if os.path.exists(args.normalizer_file):
            normalizer.restore_state(args.normalizer_file)
    if args.memory:
        if os.path.exists(args.memory):
            self.memory.load(args.memory)
    if args.weight_tactile:
        checkpoint = torch.load(args.weight_tactile)
        self.tactile_net_1.load_state_dict(checkpoint['tactile_net_1'])
        self.tactile_optimizer_1.load_state_dict(checkpoint['tactile_optimizer_1'])
        self.tactile_net_2.load_state_dict(checkpoint['tactile_net_2'])
        self.tactile_optimizer_2.load_state_dict(checkpoint['tactile_optimizer_2'])

    action_space = ActionSpace(dp=0.06, df=10)

    # Create robot, reset simulation and grasp handle
    model = load_model_from_path(args.model_path)
    sim = MjSim(model)
    sim_param = SimParameter(sim)
    sim.step()
    if args.render:
        viewer = MjViewer(sim)
    else:
        viewer = None
    robot = RobotSim(sim, viewer, sim_param, args.render, self.break_threshold)
    tactile_obs_space = TactileObs(
        robot.get_gripper_xpos(),                     # 24
        robot.get_all_touch_buffer(args.hap_sample))  # 30 x 6

    # Main training loop
    for ii in range(start_episode, args.epochs):
        self.steps_done += 1
        start_time = time.time()
        act_sequence = []
        act_length = []
        velcro_params = init_model(robot.mj_sim)
        robot.reset_simulation()
        ret = robot.grasp_handle()
        if not ret:
            continue

        # Local memory for current episode
        localMemory = []

        # Get current observation
        hidden_state_1, cell_state_1 = self.policy_net_1.init_hidden_states(
            batch_size=1, device=args.device)
        hidden_state_2, cell_state_2 = self.policy_net_2.init_hidden_states(
            batch_size=1, device=args.device)
        broken_so_far = 0

        # Pick a random action initially (note: randrange(0, 5) samples
        # indices 0-4, so the episode never starts with 'down')
        action = random.randrange(0, 5)
        current_state = None
        next_state = None
        t = 0

        while t < args.max_iter:
            if not args.quiet and t == 0:
                print("Running training episode: {}".format(ii))

            if args.position:
                multistep_obs = np.empty((0, args.indim - 6))
            elif args.force:
                multistep_obs = np.empty((0, args.indim - 390))
            else:
                multistep_obs = np.empty((0, args.indim))
            prev_action = action

            for k in range(args.len_ub):
                # Observe tactile features and stack them
                tactile_obs = tactile_obs_space.get_state()
                normalizer.observe(tactile_obs)
                tactile_obs = normalizer.normalize(tactile_obs)
                if args.position:
                    tactile_obs = tactile_obs[6:]
                elif args.force:
                    tactile_obs = tactile_obs[:6]
                multistep_obs = np.vstack((multistep_obs, tactile_obs))

                # Current jpos
                current_pos = robot.get_gripper_jpos()[:3]

                # Perform action
                delta = action_space.get_action(self.ACTIONS[action])['delta'][:3]
                target_position = np.add(robot.get_gripper_jpos()[:3], np.array(delta))
                target_pose = np.hstack((target_position, robot.get_gripper_jpos()[3:]))
                robot.move_joint(target_pose, True, self.gripping_force,
                                 hap_sample=args.hap_sample)

                # Observe new state
                tactile_obs_space.update(
                    robot.get_gripper_xpos(),                     # 24
                    robot.get_all_touch_buffer(args.hap_sample))  # 30x6

                # Stop repeating the action early if the gripper barely moved
                displacement = la.norm(robot.get_gripper_jpos()[:3] - current_pos)
                if displacement / 0.06 < 0.7:
                    break

            # Feed the stitched multi-step tactile observation into the tactile
            # net to generate a tactile feature, then pick the next action
            action, hidden_state_1, cell_state_1 = self.select_action(
                multistep_obs, hidden_state_1, cell_state_1)

            if t == 0:
                next_state = multistep_obs.copy()
            else:
                current_state = next_state.copy()
                next_state = multistep_obs.copy()

            # Record actions in this epoch
            act_sequence.append(prev_action)
            act_length.append(k)

            # Get reward
            done, num = robot.update_tendons()
            failure = robot.check_slippage()
            if num > broken_so_far:
                reward = num - broken_so_far
                broken_so_far = num
            else:
                if failure:
                    reward = -20
                else:
                    reward = 0
            t += k + 1

            # Set max number of iterations
            if t >= self.max_iter:
                done = True
            if done or failure:
                next_state = None

            # Push new Transition into memory
            if t > k + 1:
                localMemory.append(
                    Transition(current_state, prev_action, next_state, reward))

            # Optimize the model
            if self.steps_done % 10 == 0:
                self.optimize()

            # If we are done, reset the model
            if done or failure:
                self.memory.push(localMemory)
                if failure:
                    print_variables['durations'].append(self.max_iter)
                else:
                    print_variables['durations'].append(t)
                print_variables['rewards'].append(broken_so_far)
                plot_variables(self.figure, print_variables, "Training POMDP")
                print("Model parameters: {}".format(velcro_params))
                print("{} of Actions in this epoch are: {} \n Action length are: {}"
                      .format(len(act_sequence), act_sequence, act_length))
                print("Epoch {} took {}s, total number broken: {}\n\n".format(
                    ii, time.time() - start_time, broken_so_far))
                break

        # Save checkpoints every few iterations
        if ii % args.save_freq == 0:
            save_path = os.path.join(output_dir, 'policy_' + str(ii) + '.pth')
            torch.save({
                'epochs': ii,
                'steps_done': self.steps_done,
                'policy_net_1': self.policy_net_1.state_dict(),
                'policy_net_2': self.policy_net_2.state_dict(),
                'policy_optimizer_1': self.policy_optimizer_1.state_dict(),
                'policy_optimizer_2': self.policy_optimizer_2.state_dict(),
            }, save_path)
            save_path = os.path.join(output_dir, 'tactile_' + str(ii) + '.pth')
            torch.save({
                'tactile_net_1': self.tactile_net_1.state_dict(),
                'tactile_net_2': self.tactile_net_2.state_dict(),
                'tactile_optimizer_1': self.tactile_optimizer_1.state_dict(),
                'tactile_optimizer_2': self.tactile_optimizer_2.state_dict(),
            }, save_path)
            write_results(os.path.join(output_dir, 'results_pomdp.pkl'),
                          print_variables)
            self.memory.save_memory(os.path.join(output_dir, 'memory.pickle'))

    if args.savefig_path:
        now = dt.datetime.now()
        self.figure[0].savefig(
            args.savefig_path + '{}_{}_{}.png'.format(now.month, now.day, now.hour),
            format='png')

    print('Training done')
    plt.show()
    return print_variables