def learning_method(self, gamma=0.9, alpha=0.1, epsilon=1e-5, display=False, lambda_=None):
    """Q-learning: behave epsilon-greedily, but bootstrap from the greedy action."""
    self.state = self.env.reset()
    s0 = self.state
    if display:
        self.env.render()
    # a0 = self.perform_policy(s0, epsilon)
    # print(self.action_t.name)
    time_in_episode, total_reward = 0, 0
    is_done = False
    while not is_done:
        # Choose the behavior action epsilon-greedily and step the environment
        a0 = self.perform_policy(s0, epsilon)
        s1, r1, is_done, info, total_reward = self.act(a0)
        if display:
            self.env.render()
        # Off-policy target: evaluate the greedy action in the successor state
        self.policy = greedy_policy
        a1 = greedy_policy(self.A, s1, self.Q)
        old_q = get_dict(self.Q, s0, a0)
        q_prime = get_dict(self.Q, s1, a1)
        td_target = r1 + gamma * q_prime
        # alpha = alpha / num_episode
        new_q = old_q + alpha * (td_target - old_q)
        set_dict(self.Q, new_q, s0, a0)
        # s0, a0 = s1, a1
        s0 = s1
        time_in_episode += 1
    if display:
        print(self.experience.last_episode)
    return time_in_episode, total_reward
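# --- Hedged sketch (not from the original source): minimal stand-ins for the
# helpers learning_method relies on. The real get_dict/set_dict/greedy_policy
# live elsewhere in the project; this is only one plausible implementation,
# assuming Q is a plain dict keyed by "state_action" strings.
def get_dict(Q, s, a):
    # Return Q(s, a), defaulting to 0.0 for unseen state-action pairs
    return Q.get("{}_{}".format(s, a), 0.0)

def set_dict(Q, value, s, a):
    # Store the updated action value
    Q["{}_{}".format(s, a)] = value

def greedy_policy(A, s, Q):
    # Pick the action with the highest estimated value in state s
    return max(A, key=lambda a: get_dict(Q, s, a))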
import time

import cv2
import numpy as np
import torch

# Trainer, utils, and _get_pc are project-local helpers.
def prediction_process(args, action_queue, experience_queue, work, ready,
                       can_predict, should_reset, iteration, path_queue):
    # Set up the model
    ts = time.time()
    first = True
    reward = 5.0
    discount_factor = 0.5
    image_path, depth_path, pc_path, vis_path, mixed_paths, feat_paths = path_queue.get()
    trainer = Trainer(reward, discount_factor, False, args.primitive_lr, args.densenet_lr)
    trainer.behavior_net.load_state_dict(torch.load(args.model))
    trainer.target_net.load_state_dict(trainer.behavior_net.state_dict())
    ready.value = True
    cv2.namedWindow("prediction")
    print("[Prediction Thread] Load model took %f seconds. Start prediction thread" % (time.time() - ts))
    while work.value:
        if should_reset.value:
            print("[Prediction Thread] Receive reset command")
            if first:
                print("[Prediction Thread] Already in initial state, abort reset request...")
                should_reset.value = False
                ready.value = True
                continue
            ts = time.time()
            ready.value = False
            # Reload the behavior network and fetch the paths for the new run
            trainer.behavior_net.load_state_dict(torch.load(args.model))
            first = True
            image_path, depth_path, pc_path, vis_path, mixed_paths, feat_paths = path_queue.get()
            print("[Prediction Thread] Reset complete! Took {} seconds".format(time.time() - ts))
            should_reset.value = False
            ready.value = True
            continue
        if not first:
            # Busy-wait until an experience arrives, a reset is requested, or work ends
            while experience_queue.empty() and not should_reset.value and work.value:
                pass
            if not experience_queue.empty():
                print("[Prediction Thread] Got experience, updating network...")
                transition = experience_queue.get()
                color = cv2.imread(transition.color)
                depth = np.load(transition.depth)
                next_color = cv2.imread(transition.next_color)
                next_depth = np.load(transition.next_depth)
                pixel_index = transition.pixel_idx
                td_target = trainer.get_label_value(transition.reward, next_color, next_depth,
                                                    transition.is_empty, pixel_index[0])
                trainer.backprop(color, depth, pixel_index, td_target, 1.0, 1, True, True)
        if can_predict.value:
            if first:
                first = False
            print("[Prediction Thread] Start prediction")
            # Grab a point cloud, build heightmaps, and run the forward pass
            pc_response = _get_pc(iteration.value, True, pc_path)
            color, depth, points = utils.get_heightmap(pc_response.pc, image_path, depth_path, iteration.value)
            suck_1_prediction, suck_2_prediction, grasp_prediction = trainer.forward(color, depth, is_volatile=True)
            heatmaps, mixed_imgs = utils.save_heatmap_and_mixed(suck_1_prediction, suck_2_prediction,
                                                                grasp_prediction, feat_paths, mixed_paths,
                                                                color, iteration.value)
            # Greedy action over the three affordance maps
            action, action_str, pixel_index, angle = utils.greedy_policy(suck_1_prediction,
                                                                         suck_2_prediction,
                                                                         grasp_prediction)
            visual_img = utils.draw_image(mixed_imgs[pixel_index[0]], False, pixel_index,
                                          vis_path + "vis_{:06}.jpg".format(iteration.value))
            cv2.imshow("prediction", cv2.resize(visual_img, None, fx=2, fy=2))
            cv2.waitKey(33)
            utils.print_action(action_str, pixel_index, points[pixel_index[1], pixel_index[2]])
            action_queue.put([action, action_str, points[pixel_index[1], pixel_index[2]], angle, pixel_index])
            can_predict.value = False
    print("[Prediction Thread] Prediction thread stop")
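# --- Hedged usage sketch (assumed, not from the original source): how a main
# process might launch prediction_process. The queue/flag names mirror the
# signature above; args is whatever argparse namespace the project builds, and
# paths is the (image, depth, pc, vis, mixed, feat) tuple it expects.
import multiprocessing as mp

def launch_prediction(args, paths):
    action_queue = mp.Queue()
    experience_queue = mp.Queue()
    path_queue = mp.Queue()
    path_queue.put(paths)
    # 'b' (signed char) shared values are used as boolean flags; 'i' is an int counter
    work = mp.Value('b', True)
    ready = mp.Value('b', False)
    can_predict = mp.Value('b', True)
    should_reset = mp.Value('b', False)
    iteration = mp.Value('i', 0)
    p = mp.Process(target=prediction_process,
                   args=(args, action_queue, experience_queue, work, ready,
                         can_predict, should_reset, iteration, path_queue))
    p.start()
    return p, action_queue, experience_queue, work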
grasp_name = mixed_path + "grasp_{:06}_idx_{}.jpg".format(iteration, rotate_idx)
cv2.imwrite(grasp_name, grasp_mixed_idx)
print("[{:.6f}]: suck max: \033[0;34m{}\033[0m | grasp max: \033[0;35m{}\033[0m".format(
    time.time(), np.max(suck_predictions), np.max(grasp_predictions)))
explore = -1  # None

# Policy decider
if not testing:  # Train
    if not grasp_only:
        explore, action, action_str, pixel_index, angle = \
            utils.epsilon_greedy_policy(epsilon_, suck_predictions, grasp_predictions)
    else:
        explore, action, action_str, pixel_index, angle = \
            utils.grasp_epsilon_greedy_policy(epsilon_, grasp_predictions)
else:  # Test
    if not grasp_only:
        action, action_str, pixel_index, angle = utils.greedy_policy(suck_predictions, grasp_predictions)
    else:  # Grasp-only
        action = 0
        action_str = 'grasp'
        pixel_index, angle = utils.grasp_only_policy(grasp_predictions)
explore_list.append(explore)
if explore == 1:
    print("Use exploring...")
del suck_predictions, grasp_predictions, state_feat
print("[%f]: Take action: \033[0;31m%s\033[0m at \033[0;32m(%d, %d)\033[0m with theta \033[0;33m%f\033[0m"
      % (time.time(), action_str, pixel_index[1], pixel_index[2], angle))

# Draw color + heatmap + motion
visual_img = None
if action:  # SUCK
    visual_img = utils.draw_image(suck_mixed, action, pixel_index)
else:  # GRASP
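# --- Hedged sketch (assumption, not the project's utils.epsilon_greedy_policy):
# the epsilon-greedy choice above plausibly reduces to the rule below, exploring
# with probability epsilon_ and otherwise acting greedily on the pixel-wise
# affordance maps. The real helper also returns action_str and a grasp angle.
import numpy as np

def epsilon_greedy_pixel(epsilon_, suck_predictions, grasp_predictions):
    explore = 1 if np.random.uniform() < epsilon_ else 0
    if explore:
        # Explore: pick a uniformly random primitive and pixel
        action = np.random.randint(2)  # 1 = suck, 0 = grasp
        maps = suck_predictions if action else grasp_predictions
        pixel_index = [np.random.randint(s) for s in maps.shape]
    else:
        # Exploit: pick the primitive/pixel with the highest predicted value
        action = int(np.max(suck_predictions) > np.max(grasp_predictions))
        maps = suck_predictions if action else grasp_predictions
        pixel_index = list(np.unravel_index(np.argmax(maps), maps.shape))
    return explore, action, pixel_index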
print "Forward past: {} seconds".format(time.time() - ts) heatmaps, mixed_imgs = utils.save_heatmap_and_mixed( suck_1_prediction, suck_2_prediction, grasp_prediction, feat_paths, mixed_paths, color, iteration) # Standarize predictions to avoid bias between them #suck_1_prediction = utils.standarization(suck_1_prediction);suck_2_prediction = utils.standarization(suck_2_prediction) #grasp_prediction = utils.standarization(grasp_prediction) # SELECT ACTION if not testing: # Train explore, action, action_str, pixel_index, angle = utils.epsilon_greedy_policy( epsilon_, suck_1_prediction, suck_2_prediction, grasp_prediction, depth, diff_path, iteration, specific_tool) else: # Testing action, action_str, pixel_index, angle = utils.greedy_policy( suck_1_prediction, suck_2_prediction, grasp_prediction, specific_tool) explore = False explore_list.append(explore) target_list.append(pixel_index) position_list.append(points[pixel_index[1], pixel_index[2]]) del suck_1_prediction, suck_2_prediction, grasp_prediction utils.print_action(action_str, pixel_index, points[pixel_index[1], pixel_index[2]]) # Save (color heightmap + prediction heatmap + motion primitive and corresponding position), then show it visual_img = utils.draw_image( mixed_imgs[pixel_index[0]], explore, pixel_index, vis_path + "vis_{:06}.jpg".format(iteration)) cv2.imshow("prediction", cv2.resize(visual_img, None, fx=2, fy=2)) cv2.waitKey(33)