if torch.cuda.is_available():
    print("cuda is available :D")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

USE_VALUE_NET = False
STATE_IS_IMAGE = True
CONTINUE_TRAINING = True
MEMORY_SIZE = 1000000

# create the target and policy networks
policy_net = dqn.DQN().to(device)
target_net1 = dqn.DQN().to(device)
target_net2 = dqn.DQN().to(device)
value_net = valuenet.ValueNet().to(device)
value_net_trainer = valuenet.ValueNetTrainer(value_net)

print("number of parameters: ",
      sum(p.numel() for p in policy_net.parameters() if p.requires_grad))

target_net1.load_state_dict(policy_net.state_dict())
target_net1.eval()
target_net2.load_state_dict(policy_net.state_dict())
target_net2.eval()

# default Xavier init for conv and linear layers
for m in policy_net.modules():
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain('relu'))

# if the model file exists, load it to continue training
if os.path.isfile("vrep_arm_model.pt") and CONTINUE_TRAINING:
    policy_net.load_state_dict(torch.load('vrep_arm_model.pt'))
    print("loaded existing model file")
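# For reference, a minimal sketch (an assumption, not shown in this file) of how the
# "vrep_arm_model.pt" checkpoint resumed above could be written out by the training
# loop; the helper name save_checkpoint is hypothetical.
def save_checkpoint(net, path="vrep_arm_model.pt"):
    # persist only the state_dict so it can be restored with load_state_dict above
    torch.save(net.state_dict(), path)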
def collect_data(self,
                 use_scripted_policy=True,
                 visualize=False,
                 n_files=None,
                 start_at=None,
                 epsilon=0.0,
                 dt=50e-3,
                 maxtime=20,
                 dryrun=False,
                 memory_capacity=5000):
    t = 0
    if torch.cuda.is_available():
        print("cuda is available :D")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    USE_VALUE_NET = False
    STATE_IS_IMAGE = True
    MEMORY_SIZE = memory_capacity
    GZIP_COMPRESSION_LEVEL = 3

    self.s.set_visualize(visualize)

    # create the target and policy networks
    policy_net = dqn.DQN().to(device)

    # default Xavier init for conv and linear layers
    for m in policy_net.modules():
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain('relu'))

    # if a trained model file exists, load it
    if os.path.isfile("vrep_arm_model.pt"):
        policy_net.load_state_dict(torch.load('vrep_arm_model.pt'))
        print("loaded existing model file")

    target_net1 = dqn.DQN().to(device)
    target_net2 = dqn.DQN().to(device)
    value_net = valuenet.ValueNet().to(device)
    value_net_trainer = valuenet.ValueNetTrainer(value_net)

    print("number of parameters: ",
          sum(p.numel() for p in policy_net.parameters() if p.requires_grad))

    target_net1.load_state_dict(policy_net.state_dict())
    target_net1.eval()
    target_net2.load_state_dict(policy_net.state_dict())
    target_net2.eval()

    br = brain.Brain(
        simulator=self.s,  # only to access the scripted policy
        policy_net=policy_net,
        target_net1=target_net1,
        target_net2=target_net2,
        memory_size=MEMORY_SIZE,
        value_net_trainer=value_net_trainer,
        state_is_image=STATE_IS_IMAGE,
        use_value_net=USE_VALUE_NET)

    # ========================================================================
    # collect transitions for up to num_episodes episodes
    # ========================================================================
    from itertools import count

    total_reached = 0
    reached = 0
    MAX_TIME = maxtime
    FRAME_SKIP = 1
    MAX_FILES = 6
    FILE_PREFIX = "dataset/replay_"
    num_file = 0
    PRINT_EVERY = 1

    br.memory = dqn.ReplayMemory(capacity=MEMORY_SIZE)

    if not use_scripted_policy:
        FILE_PREFIX = "dataset_online/replay_"
        MAX_FILES = int(MAX_FILES / 2)  # used to be divided by 2, but nvm for now

    num_episodes = 2000000
    if n_files is not None:
        MAX_FILES = n_files
    if start_at is not None:
        num_file += start_at
        MAX_FILES += start_at

    start_time = time.time()
    total_episode_reward = 0
    for i_episode in range(num_episodes):
        episode_reward = 0
        if i_episode % PRINT_EVERY == 0:
            print("recording: episode", i_episode)

        # initialize the environment and state
        self.s.reset()
        target_x, target_y, target_z = self.s.randomly_place_target()
        img_state, numerical_state = self.s.get_robot_state()

        error = np.random.normal(0, 0.1)
        error = 0  # the random scripted-policy error is currently disabled

        for t in count():
            # select and perform an action: either the scripted exploration
            # policy or an epsilon-greedy choice from the policy network
            img_state = torch.Tensor(img_state)
            numerical_state = torch.Tensor(numerical_state)

            last_img_state = img_state
            last_numerical_state = numerical_state

            if use_scripted_policy:
                action = br.select_action_scripted_exploration(thresh=1.0, error=error)
            else:
                action = br.select_action_epsilon_greedy(img_state, numerical_state, epsilon)

            self.s.set_control(action.view(-1).cpu().numpy())
            self.s.step()
            img_state, numerical_state = self.s.get_robot_state()

            # get the reward and detect whether the task is done
            reward_number, done = self.s.get_reward_and_done(numerical_state)
            reward = torch.tensor([reward_number], device=device)
            episode_reward += (br.GAMMA**t) * reward_number
            if done and reward_number > 0:  # reached the target on its own
                # print("data collector: episode reached at timestep", t)
                reached += 1

            if t > MAX_TIME:  # terminate the episode if it does not finish in time
                # print("data collector: episode timeout")
                done = True

            # observe the new state
            if not done:
                state_img_tensor = torch.Tensor(img_state)
                state_numerical_tensor = torch.Tensor(numerical_state)
            else:
                state_img_tensor = None
                state_numerical_tensor = None

            # store the transition in memory; the states are ndarrays, so
            # convert them to tensors (the action and reward already are)
            if t % FRAME_SKIP == 0:
                br.memory.push(torch.Tensor(last_img_state),
                               torch.Tensor(last_numerical_state),
                               action.view(-1).float(),
                               state_img_tensor,
                               state_numerical_tensor,
                               reward)

            if done:  # visualize and break
                break

        total_episode_reward += episode_reward

        if i_episode % 10 == 0:
            time_per_ep = (time.time() - start_time) / 10.0
            start_time = time.time()
            print("reached target", reached, "/ 10 times, memory:",
                  len(br.memory), "/", MEMORY_SIZE, ",",
                  (100.0 * len(br.memory) / MEMORY_SIZE), "% full,",
                  time_per_ep, "sec/ep")
            total_reached += reached
            reached = 0

        if len(br.memory) >= br.memory.capacity:
            # the buffer is full: save it to disk and start a new one
            filename = FILE_PREFIX + str(num_file).zfill(2) + ".gz"
            if not dryrun:
                print("> saving file into", filename)
                with gzip.GzipFile(filename, 'wb',
                                   compresslevel=GZIP_COMPRESSION_LEVEL) as handle:
                    cPickle.dump(br.memory, handle,
                                 protocol=cPickle.HIGHEST_PROTOCOL)
                print("> saving completed")
            else:
                print("> data collector: dryrun, not saving memory")
            num_file += 1
            if num_file >= MAX_FILES:
                print("> data_collector: all files collected, closing")
                print("> total success rate:", (total_reached * 1.0 / i_episode))
                print("> mean episode reward:", (total_episode_reward * 1.0 / i_episode))
                return (total_reached * 1.0 / i_episode,
                        total_episode_reward * 1.0 / i_episode)
            br.memory = dqn.ReplayMemory(capacity=MEMORY_SIZE)
            print("> data_collector: memory is full, saved as: " + filename)
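# A minimal sketch, assuming the same gzip + cPickle format used when saving above, of
# how a recorded replay file could be read back for offline training; the helper name
# load_replay_file is hypothetical.
def load_replay_file(filename):
    # inverse of the dump above: returns the pickled dqn.ReplayMemory object
    with gzip.GzipFile(filename, 'rb') as handle:
        return cPickle.load(handle)

# e.g. memory = load_replay_file("dataset/replay_00.gz")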
def __init__(self,
             name,
             memory,
             maxtime=20,
             dt=0.05,
             port=19991,
             visualize=False,
             use_scripted_policy=False,
             epsilon=0.0,
             vrep_file_name='ik_sawyer.ttt'):
    print("> DCThread: launching thread")
    threading.Thread.__init__(self)
    self.printer = logger.Printer("DCThread")

    self.r = vrep_sawyer.VrepSawyer(dt,
                                    headless_mode=True,
                                    port_num=port,
                                    vrep_file_name=vrep_file_name)
    self.s = simulator.Simulator(self.r,
                                 dt,
                                 target_x=0,
                                 target_y=0,
                                 target_z=0,
                                 visualize=False)

    if torch.cuda.is_available():
        self.printer.print_to_screen("> DCThread: cuda is available :D")
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.name = name
    self.memory = memory
    self.use_scripted_policy = use_scripted_policy
    self.epsilon = epsilon

    USE_VALUE_NET = False
    STATE_IS_IMAGE = True
    MEMORY_SIZE = 1000000

    self.s.set_visualize(visualize)

    # create the target and policy networks
    policy_net = dqn.DQN().to(self.device)
    policy_net.load_state_dict(torch.load('vrep_arm_model.pt'))
    self.printer.print_to_screen("> DCThread: loaded existing model file")

    target_net1 = dqn.DQN().to(self.device)
    target_net2 = dqn.DQN().to(self.device)
    value_net = valuenet.ValueNet().to(self.device)
    value_net_trainer = valuenet.ValueNetTrainer(value_net)

    num_params = sum(p.numel() for p in policy_net.parameters() if p.requires_grad)
    self.printer.print_to_screen("> DCThread: number of parameters: " + str(num_params))

    target_net1.load_state_dict(policy_net.state_dict())
    target_net1.eval()
    target_net2.load_state_dict(policy_net.state_dict())
    target_net2.eval()

    self.br = brain.Brain(
        simulator=self.s,  # only to access the scripted policy
        policy_net=policy_net,
        target_net1=target_net1,
        target_net2=target_net2,
        memory_size=MEMORY_SIZE,
        value_net_trainer=value_net_trainer,
        state_is_image=STATE_IS_IMAGE,
        use_value_net=USE_VALUE_NET)

    self.MAX_TIME = maxtime
    self.FRAME_SKIP = 1

    # attach the shared memory to the brain
    self.br.memory = self.memory

    self.current_success_rate = None
    self.current_reward = None
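# A minimal usage sketch, assuming the class above is named DCThread and that its
# run() method performs the episode-collection loop: several collector threads can
# share one replay memory, each driving its own V-REP instance on a separate port.
#
#   shared_memory = dqn.ReplayMemory(capacity=1000000)
#   workers = [DCThread(name="dc%d" % i, memory=shared_memory, port=19991 + i)
#              for i in range(2)]
#   for w in workers:
#       w.start()
#   for w in workers:
#       w.join()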