def __init__(self, num_experts, lr=0, cam_centers=None, gating_capacity=1):
    self.num_experts = num_experts
    self.lr = lr  # learning rate

    if cam_centers is None:
        cam_centers = torch.zeros(num_experts, 3)
    cam_centers = cam_centers.cuda()

    # setup gating network
    self.model_g = Gating(num_experts, gating_capacity)
    self.model_g = self.model_g.cuda()
    self.model_g.train()
    self.optimizer_g = optim.Adam(self.model_g.parameters(), lr=lr)

    # setup expert networks
    self.experts = []
    self.expert_opts = []
    for i in range(0, num_experts):
        model_e = Expert(cam_centers[i])
        model_e = model_e.cuda()
        model_e.train()
        optimizer_e = optim.Adam(model_e.parameters(), lr=lr)
        self.experts.append(model_e)
        self.expert_opts.append(optimizer_e)
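# A minimal, self-contained sketch (not the code above) of how a gating network
# can weight expert outputs at inference time in a mixture-of-experts setup.
# ToyGating/ToyExpert are hypothetical stand-ins for the Gating/Expert modules
# constructed above; shapes and layer sizes are illustrative assumptions.
import torch
import torch.nn as nn


class ToyExpert(nn.Module):
    def __init__(self, in_dim=8, out_dim=3):
        super().__init__()
        self.fc = nn.Linear(in_dim, out_dim)

    def forward(self, x):
        return self.fc(x)


class ToyGating(nn.Module):
    def __init__(self, in_dim=8, num_experts=4):
        super().__init__()
        self.fc = nn.Linear(in_dim, num_experts)

    def forward(self, x):
        return torch.softmax(self.fc(x), dim=-1)  # per-sample expert weights


num_experts = 4
experts = [ToyExpert() for _ in range(num_experts)]
gating = ToyGating(num_experts=num_experts)

x = torch.randn(2, 8)
weights = gating(x)                                    # (batch, num_experts)
outputs = torch.stack([e(x) for e in experts], dim=1)  # (batch, num_experts, 3)
prediction = (weights.unsqueeze(-1) * outputs).sum(dim=1)  # gated combination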
def observeExpert():  # Feature sum for one run of the optimal policy
    """Main function, runs the experiment."""
    expert = Expert(int(sys.argv[1]))  # initialise an expert with a certain policy from the pre-trained ones
    env = init_env()  # initialise an environment
    featureSum = [0, 0, 0]  # feature expectations is a 1x3 vector
    counter = 1  # counter to calculate the average over #runs of the policy

    expert.start()
    state, reward = env.reset()
    while not env.terminal:
        action = expert.step(state, reward)
        state, reward = env.update(action)
        feat = featuresFromState(state)
        featureSum = [sum(x) for x in zip(*[featureSum, feat])]
        counter += 1
    expert.end(reward)
    return featureSum
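# Hedged sketch: observeExpert above accumulates a feature sum for a single
# episode; feature expectations are usually estimated by averaging such sums
# over many runs. observe_fn stands for the function above; the run count is
# an illustrative assumption, not a value from the original script.
def estimate_feature_expectations(observe_fn, num_runs=100):
    """Average the per-episode feature sums returned by observe_fn."""
    totals = [0.0, 0.0, 0.0]
    for _ in range(num_runs):
        feat_sum = observe_fn()
        totals = [t + f for t, f in zip(totals, feat_sum)]
    return [t / num_runs for t in totals]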
def __init__(self, saver, model, global_step):
    super(Trainer, self).__init__()

    self._exp = Expert()
    self._net = model
    self._update_global_step_op = tf.assign_add(global_step, 1)
    self._enough_history = False

    optimizer_class = getattr(tf.train, FLAGS.optimizer)
    optimizer = optimizer_class(learning_rate=FLAGS.learning_rate)

    self._update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    loss_key = 'loss' if not FLAGS.learn_mapper else 'estimate_loss'

    with tf.control_dependencies(self._update_ops):
        gradients, variables = zip(
            *optimizer.compute_gradients(model.output_tensors[loss_key]))
        if FLAGS.grad_clip > 0:
            gradients_constrained, _ = tf.clip_by_global_norm(
                gradients, FLAGS.grad_clip)
        else:
            gradients_constrained = gradients
        self._gradient_names = [
            v.name for g, v in zip(gradients_constrained, variables)
            if g is not None
        ]
        self._gradient_summary_op = [
            tf.reduce_mean(tf.abs(g))
            for g in gradients_constrained if g is not None
        ]
        self._train_op = optimizer.apply_gradients(
            zip(gradients_constrained, variables), global_step=global_step)
        with tf.control_dependencies([self._train_op]):
            self._train_loss = model.output_tensors[loss_key]

    self._writer = Proc._build_writer()
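# Hedged sketch of the gradient-clipping pattern used above (TF1-style graph
# code): tf.clip_by_global_norm rescales the whole gradient list when its joint
# norm exceeds the threshold. Variable names are illustrative, not from the
# trainer above; requires TF1 (or tf.compat.v1 with eager execution disabled).
import tensorflow as tf

w = tf.Variable([1.0, 2.0])
loss = tf.reduce_sum(tf.square(w))
grads = tf.gradients(loss, [w])
clipped_grads, global_norm = tf.clip_by_global_norm(grads, clip_norm=1.0)
train_op = tf.train.GradientDescentOptimizer(0.1).apply_gradients(
    zip(clipped_grads, [w]))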
class DaggerThread(object):
    def __init__(self, config, global_network, thread_index,
                 network_scope="network", scene_scope="scene",
                 task_scope="task"):
        self.thread_index = thread_index
        self.config = config
        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        self.local_network = global_network
        self.env = Environment({
            'scene_name': self.scene_scope,
            'terminal_state_id': int(self.task_scope)
        })
        self.env.reset()
        self.expert = Expert(self.env)
        self.local_t = 0
        self.episode_length = 0
        self.first_iteration = True  # first iteration of DAgger
        # training dataset
        self.states = []
        self.actions = []
        self.targets = []

    def choose_action_label_smooth(self, expected_action, epsilon):
        """ P(k) = (1 - epsilon) * P_e + epsilon * 1/N """
        pi_values = [epsilon / float(self.config.action_size)
                     ] * self.config.action_size
        pi_values[expected_action] += 1 - epsilon
        return pi_values

    def choose_action_greedy(self, pi_values):
        # greedy choice since this is supervised learning
        return np.argmax(pi_values, axis=0)

    def choose_action(self, pi_values):
        values = []
        s = 0.0
        for rate in pi_values:
            s += rate
            values.append(s)
        r = random.random() * s
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    def add_summary(self, writer, value_dict):
        if writer is None or len(value_dict) == 0:
            return
        value = [
            tf.Summary.Value(tag=k, simple_value=v)
            for k, v in value_dict.items()
        ]
        summary = tf.Summary(value=value)
        writer.add_summary(summary,
                           global_step=self.local_network.get_global_step())
        logging.debug("writing summary %s" % (str(summary)))

    def train(self, session, writer):
        assert len(self.states) == len(self.actions), \
            "data count of action and state mismatch"
        s = self.states
        a = self.actions
        n_total = len(s)
        assert n_total > 0, "null dataset"
        t = [self.env.s_target] * n_total
        if n_total > self.config.batch_size:
            data = list(zip(s, a))
            np.random.shuffle(data)
            s, a = zip(*data)
        local_t = self.local_t
        scope = self.scene_scope + '/' + self.task_scope
        for epoch in range(self.config.max_epochs):
            train_loss, train_accuracy = self.local_network.run_epoch(
                session, self.scopes, s, t, a, True, writer)
            global_step = self.local_network.get_global_step()
            logging.info(
                "%(scope)s:t=%(local_t)d "
                "train_step=%(global_step)d loss=%(train_loss)f acc=%(train_accuracy)f"
                % locals())
        return

    def process(self, sess, global_t, summary_writer):
        start_local_t = self.local_t
        # draw experience with the current policy or the expert policy
        terminal = False
        for i in range(self.config.local_t_max):
            if self.first_iteration:
                # use expert policy before any training
                expert_action = action = self.expert.get_next_action()
                expert_lsr_pi = self.choose_action_label_smooth(
                    expert_action, self.config.lsr_epsilon)
            else:
                expert_action = self.expert.get_next_action()
                expert_lsr_pi = self.choose_action_label_smooth(
                    expert_action, self.config.lsr_epsilon)
                pi_ = self.local_network.run_policy(sess, self.env.s_t,
                                                    self.env.s_target,
                                                    self.scopes)
                action = self.choose_action(pi_)
                logging.debug(
                    "action=%(action)d expert_action=%(expert_action)d "
                    "expert_lsr_pi=%(expert_lsr_pi)s pi_=%(pi_)s" % locals())
            self.states.insert(0, self.env.s_t)
            self.actions.insert(0, expert_lsr_pi)
            self.env.step(action)
            self.env.update()
            terminal = True if self.episode_length > self.config.max_steps_per_e \
                else self.env.terminal
            self.episode_length += 1
            self.local_t += 1
            if terminal:
                logging.info(
                    "[episode end] time %d | thread #%d | scene %s | "
                    "target #%s expert:%s episode length = %d\n" %
                    (global_t, self.thread_index, self.scene_scope,
                     self.task_scope,
                     "T" if self.first_iteration else "F",
                     self.episode_length))
                summary_values = {
                    "episode_length_input": float(self.episode_length),
                }
                if not self.first_iteration:
                    # record the agent's score only
                    self.add_summary(summary_writer, summary_values)
                self.episode_length = 0
                self.env.reset()
                break
        # train policy network with the gathered labels
        self.train(sess, summary_writer)
        self.first_iteration = False
        return self.local_t - start_local_t

    def evaluate(self, sess, n_episodes, expert_agent=False):
        ep_lengths = []
        ep_collisions = []
        accuracies = []
        for i in range(n_episodes):
            self.env.reset()
            terminal = False
            step = 0
            n_collision = 0
            while not terminal:
                if expert_agent:
                    action = self.expert.get_next_action()
                else:
                    expert_action = self.expert.get_next_action()
                    pi_ = self.local_network.run_policy(
                        sess, self.env.s_t, self.env.s_target, self.scopes)
                    action = self.choose_action(pi_)
                    accuracies.append(1.0 if expert_action == action else 0.0)
                    logging.debug(
                        "action=%(action)d expert_action=%(expert_action)d pi_=%(pi_)s"
                        % locals())
                self.env.step(action)
                self.env.update()
                terminal = self.env.terminal
                if step > self.config.max_steps_per_e:
                    terminal = True
                    logging.debug("episode %(i)d hits max steps" % locals())
                n_collision += int(self.env.collided)
                step += 1
            logging.debug("episode %(i)d ends with %(step)d steps" % locals())
            ep_lengths.append(step)
            ep_collisions.append(n_collision)
        return ep_lengths, ep_collisions, accuracies
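# Hedged illustration of the label smoothing performed by
# choose_action_label_smooth above: the expert's one-hot action is mixed with a
# uniform distribution over the action set. The numbers are made up for the
# example, not taken from any config above.
def label_smooth(expert_action, epsilon, action_size):
    pi = [epsilon / float(action_size)] * action_size
    pi[expert_action] += 1.0 - epsilon
    return pi

# e.g. 4 actions, expert chose action 2, epsilon = 0.2:
# label_smooth(2, 0.2, 4) -> [0.05, 0.05, 0.85, 0.05], which still sums to 1
assert abs(sum(label_smooth(2, 0.2, 4)) - 1.0) < 1e-9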
max_action = float(env.action_space.high[0])

# Initialize policy
if args.policy_name == "TD3":
    policy = TD3.TD3(state_dim, action_dim, max_action)
elif args.policy_name == "OurDDPG":
    policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
elif args.policy_name == "DDPG":
    policy = DDPG.DDPG(state_dim, action_dim, max_action)
elif args.policy_name == "ExpertDDPG":
    policy = ExpertDDPG.ExpertDDPG(state_dim, action_dim, max_action)

replay_buffer = utils.ReplayBuffer()

### expert 6/28
expert = Expert(args.expert_dir)
value_expert = expert.value()  ### compute the expert's value 6/28

all_episode_reward = []
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True

# Evaluate untrained policy
reward_gd, reward_pred = evaluate_policy(policy, expert_value=value_expert)
value_step = [total_timesteps]
value_true = [reward_gd]
value_pred = [reward_pred]
def __call__(self, lock, history, sess, coord):
    assert isinstance(history, deque)
    assert isinstance(sess, tf.Session)
    assert isinstance(coord, tf.train.Coordinator)

    history_lock = lock

    env = environment.get_game_environment(
        self._maps,
        multiproc=FLAGS.multiproc,
        random_goal=FLAGS.random_goal,
        random_spawn=FLAGS.random_spawn,
        apple_prob=FLAGS.apple_prob,
        episode_length=FLAGS.episode_length)
    exp = Expert()

    with sess.as_default(), sess.graph.as_default():
        while not coord.should_stop():
            try:
                if not self._eval:
                    train_global_step, np_global_step, model_version = sess.run([
                        self._train_global_step,
                        self._update_explore_global_step_op,
                        self._model_version
                    ])
                    if model_version != train_global_step:
                        self._update_graph(sess)
                    random_rate = FLAGS.supervision_rate * np.exp(
                        -train_global_step / FLAGS.decay)
                    if FLAGS.learn_mapper:
                        random_rate = 2
                else:
                    np_global_step = sess.run(
                        self._update_explore_global_step_op)
                    random_rate = 0

                env.reset()
                obs, info = env.observations()

                episode = dict()
                episode['act'] = [np.argmax(exp.get_optimal_action(info))]
                episode['obs'] = [self._merge_depth(obs, info['depth'])]
                episode['ego'] = [[0., 0., 0.]]
                episode['est'] = [
                    exp.get_free_space_map(info,
                                           estimate_size=FLAGS.estimate_size)
                ]
                episode['gol'] = [
                    exp.get_goal_map(info, estimate_size=FLAGS.estimate_size)
                ]
                episode['rwd'] = [0.]
                episode['inf'] = [deepcopy(info)]

                estimate_map_list = [
                    np.zeros((1, FLAGS.estimate_size, FLAGS.estimate_size, 3))
                    for _ in range(FLAGS.estimate_scale)
                ]
                old_estimate_map_list = estimate_map_list

                for _ in range(FLAGS.episode_size):
                    prev_info = deepcopy(episode['inf'][-1])
                    optimal_action = exp.get_optimal_action(prev_info)

                    expand_dim = lambda x: np.array([[x[-1]]])
                    feed_data = {
                        'sequence_length': np.array([1]),
                        'visual_input': expand_dim(episode['obs']),
                        'egomotion': expand_dim(episode['ego']),
                        'reward': expand_dim(episode['rwd']),
                        'space_map': expand_dim(episode['est']),
                        'goal_map': expand_dim(episode['gol']),
                        'estimate_map_list': estimate_map_list,
                        'optimal_action': expand_dim(episode['act']),
                        'optimal_estimate': expand_dim(episode['est']),
                        'is_training': False
                    }
                    feed_dict = prepare_feed_dict(self._net.input_tensors,
                                                  feed_data)

                    results = sess.run(
                        [self._net.output_tensors['action']] +
                        self._net.intermediate_tensors['estimate_map_list'],
                        feed_dict=feed_dict)

                    predict_action = np.squeeze(results[0])
                    old_estimate_map_list = estimate_map_list
                    estimate_map_list = [m[0] for m in results[1:]]

                    if np.random.rand() < random_rate and not self._eval:
                        dagger_action = optimal_action
                    else:
                        dagger_action = predict_action

                    action = np.argmax(dagger_action)
                    obs, reward, terminal, info = env.step(action)

                    if not terminal:
                        episode['act'].append(np.argmax(optimal_action))
                        episode['obs'].append(
                            self._merge_depth(obs, info['depth']))
                        episode['ego'].append(
                            environment.calculate_egomotion(
                                prev_info['POSE'], info['POSE']))
                        episode['est'].append(
                            exp.get_free_space_map(
                                info, estimate_size=FLAGS.estimate_size))
                        episode['gol'].append(
                            exp.get_goal_map(
                                info, estimate_size=FLAGS.estimate_size))
                        episode['rwd'].append(deepcopy(reward))
                        episode['inf'].append(deepcopy(info))
                    else:
                        break

                if not self._eval:
                    history.append(episode)

                if np_global_step % FLAGS.save_every == 0 or self._eval:
                    feed_data = {
                        'sequence_length': np.array([1]),
                        'visual_input': expand_dim(episode['obs']),
                        'egomotion': expand_dim(episode['ego']),
                        'reward': expand_dim(episode['rwd']),
                        'space_map': expand_dim(episode['est']),
                        'goal_map': expand_dim(episode['gol']),
                        'estimate_map_list': old_estimate_map_list,
                        'optimal_action': expand_dim(episode['act']),
                        'optimal_estimate': expand_dim(episode['est']),
                        'is_training': False
                    }
                    feed_dict = prepare_feed_dict(self._net.input_tensors,
                                                  feed_data)

                    summary_ops = (self._estimate_maps + self._goal_maps +
                                   self._reward_maps + self._value_maps)
                    results = sess.run(summary_ops, feed_dict=feed_dict)

                    estimate_maps_images = results[:len(self._estimate_maps)]
                    results = results[len(self._estimate_maps):]
                    goal_maps_images = results[:len(self._goal_maps)]
                    results = results[len(self._goal_maps):]
                    fused_maps_images = results[:len(self._reward_maps)]
                    results = results[len(self._reward_maps):]
                    value_maps_images = results[:len(self._value_maps)]
                    results = results[len(self._value_maps):]
                    assert len(results) == 0

                    postfix = '_eval' if self._eval else ''
                    self._writer.add_summary(
                        self._build_map_summary(estimate_maps_images,
                                                episode['est'],
                                                goal_maps_images,
                                                fused_maps_images,
                                                value_maps_images, postfix),
                        global_step=np_global_step)

                    # summary_text = ','.join('{}[{}]-{}={}'.format(key, idx, step, value)
                    #                         for step, info in enumerate(episode['inf'])
                    #                         for key in ('GOAL.LOC', 'SPAWN.LOC', 'POSE', 'env_name')
                    #                         for idx, value in enumerate(info[key]))
                    # step_episode_summary = sess.run(self._step_history_op,
                    #                                 feed_dict={self._step_history: summary_text})
                    # self._writer.add_summary(step_episode_summary, global_step=np_global_step)

                    self._writer.add_summary(
                        self._build_trajectory_summary(episode['rwd'],
                                                       episode['inf'], exp,
                                                       random_rate, postfix),
                        global_step=np_global_step)

                if self._eval and FLAGS.total_steps <= np_global_step:
                    coord.request_stop()
            except Exception as e:
                print(e)
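# Hedged sketch of the DAgger supervision schedule used above: the probability
# of following the expert's action decays exponentially with the training step.
# The constants below are placeholders, not the FLAGS values of the code above.
import numpy as np

def supervision_probability(step, supervision_rate=1.0, decay=1000.0):
    """Probability of following the expert instead of the learned policy."""
    return supervision_rate * np.exp(-step / decay)

# e.g. step 0 -> 1.0, step 1000 -> ~0.37, step 3000 -> ~0.05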
    policy = TD3.TD3(state_dim, action_dim, max_action)
elif args.policy_name == "OurDDPG":
    policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
elif args.policy_name == "DDPG":
    policy = DDPG.DDPG(state_dim, action_dim, max_action)
elif args.policy_name == "ExpertDDPG":
    policy = ExpertDDPG.ExpertDDPG(state_dim, action_dim, max_action)

policy_contrast = ExpertDDPG.ExpertDDPG(
    state_dim, action_dim, max_action)  ### baseline that does not use the expert, for comparison 6/28

replay_buffer = utils.ReplayBuffer()
replay_buffer_contrast = utils.ReplayBuffer()  ### the two policies must not share a replay buffer 6/28

### expert 6/28
expert_dir = './expert_data/'
expert = Expert(expert_dir)

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
done_contrast = True
expert_flag = True  ### decides whether the expert policy is currently used 6/28

# Evaluate untrained policy
evaluations = [(total_timesteps, evaluate_policy(policy, policy_contrast))]  ### tuple 6/28

while total_timesteps < args.max_timesteps:
    '''################### without expert #####################
    if done_contrast:
print(sc._conf.getAll())
#sc=None

(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

inputs = Input(shape=x_train.shape[1:])

# Load initial experts
experts = []
for i in range(5):
    tempExpert = Expert(x_train, y_train, x_test, y_test, 32, str(i + 1), inputs)
    experts.append(tempExpert.expertModel)

# Storage dir for MoE weights
moe_weights_file = '../lib/weights/moe_full'

# Convert class vectors to binary class matrices.
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

# Create MoE model and train it with the experts loaded above
moeModel = Mixture(x_train, y_train, x_test, y_test, experts, inputs, sc)
moeModel.train_init(datagen, moe_weights_file)
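# Hedged illustration of the label conversion performed by to_categorical above:
# integer class labels become one-hot rows. A pure-numpy stand-in for clarity,
# not the Keras utility itself.
import numpy as np

def one_hot(labels, num_classes):
    out = np.zeros((len(labels), num_classes), dtype=np.float32)
    out[np.arange(len(labels)), labels] = 1.0
    return out

# e.g. one_hot([0, 2], 3) -> [[1, 0, 0], [0, 0, 1]]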
class BehaviorCloning():
    """Behavior Cloning class.

    Attributes:
        config : configuration object
        envname : environment name
    """

    def __init__(self, config):
        self.config = config
        self.envname = config.bc.envname
        self.expert = None

    def train(self, imitation_mode=ImitationMode.bc):
        """Training for Behavior Cloning.

        1. Set hyper parameters
        2. Load the expert data
        3. Calculate the number of features and actions
        4. Create the Behavior Cloning model
        5. Train the Behavior Cloning model
        """
        # Hyper parameters
        epochs = self.config.bc.epochs
        batch_size = self.config.bc.batch_size
        display_step = self.config.bc.display_step
        keep_prob = self.config.bc.keep_prob

        # checkpoint configuration
        checkpoint_dir = self.config.bc.checkpoint_dir

        # load expert data
        expert_data_loader = DataLoader()
        self.x_train, self.y_train, \
            self.x_valid, self.y_valid, \
            self.x_test, self.y_test = expert_data_loader.load(self.envname)

        # calculate the number of features and actions
        self.num_features = self.x_train.shape[1]
        self.num_actions = self.y_train.shape[1]

        # Training phase
        print('Training...')
        with tf.Session() as sess:
            # tensorboard logger
            self.tb_logger = self.get_tb_logger(sess, self.envname,
                                                imitation_mode)
            # build model
            self.model, continue_train = self.create_model(
                sess, self.num_features, self.num_actions, imitation_mode)

            # if it is not necessary to train the model, run the test only
            if not continue_train:
                self.test(sess, expert_data_loader)
                return

            # Training cycle
            self.num_timesteps = self.x_train.shape[0]
            num_steps = self.num_timesteps // batch_size

            epoch = 1
            global_step = 1
            early_stop = self.reset_early_stop()
            loss_list = []
            v_loss_list = []
            last_v_loss = 0

            while epoch <= epochs:
                print('epoch ', epoch)
                self.x_train, self.y_train = \
                    util.shuffle_dataset(self.x_train, self.y_train)
                self.num_timesteps = self.x_train.shape[0]
                num_steps = self.num_timesteps // batch_size

                # mini batch iterations
                for step in range(0, num_steps):
                    start_idx = step * batch_size
                    end_idx = (step + 1) * batch_size
                    x_obs = self.x_train[start_idx:end_idx]
                    x_actions = self.y_train[start_idx:end_idx]

                    loss, log_loss, _ = \
                        self.model.update(sess, x_obs, x_actions, keep_prob)
                    if loss == 0:
                        print(step, "loss is zero")

                    if global_step % display_step == 0:
                        # validation
                        v_loss, log_v_loss = self.model.validate(
                            sess, self.x_valid, self.y_valid)
                        print("step " + str(global_step) +
                              ", train loss " + "{:.5f}".format(loss) +
                              ", validation loss " + "{:.5f}".format(v_loss))

                        # tensorboard logging
                        self.tb_logger.add_summary(log_loss, global_step)
                        self.tb_logger.add_summary(log_v_loss, global_step)

                        # early stopping
                        early_stop = self.check_early_stop(v_loss, last_v_loss)

                        # make loss lists for plotting
                        last_v_loss = v_loss
                        loss_list.append(loss)
                        v_loss_list.append(v_loss)

                        if early_stop:
                            break

                    global_step += 1

                if early_stop:
                    break

                epoch += 1
                # if the loss is greater than the threshold, increase epochs
                epochs = self.check_epochs(epochs, epoch, loss)

                if imitation_mode == ImitationMode.DAgger:
                    self.add_experience(sess, expert_data_loader)
                    self.num_timesteps = self.x_train.shape[0]
                    num_steps = self.num_timesteps // batch_size

            print("step " + str(global_step) +
                  ", train loss " + "{:.5f}".format(loss) +
                  ", validation loss " + "{:.5f}".format(v_loss))

            # Save model
            self.model.save(sess, checkpoint_dir, self.envname, global_step,
                            imitation_mode)

            # show loss plot
            self.show_train_graph(loss_list, v_loss_list)

            # test policy
            self.test(sess, expert_data_loader)

    def create_model(self, sess, num_features, num_actions,
                     imitation_mode=ImitationMode.bc):
        # model configuration
        learning_rate = self.config.bc.learning_rate
        hidden_list = self.config.model.hidden_list[self.envname]

        # create a model
        model = Model(num_features, hidden_list, num_actions, learning_rate)
        continue_train = True

        # checkpoint configuration
        checkpoint_dir = self.config.bc.checkpoint_dir
        restore = self.config.bc.restore
        restore_file = self.config.bc.restore_file

        # initialize or restore the model
        if restore == ModelInit.new:
            # Initializing the variables
            sess.run(tf.global_variables_initializer())
        elif restore == ModelInit.restore_test:
            model.restore(sess, checkpoint_dir, restore_file, imitation_mode)
            continue_train = False
        elif restore == ModelInit.restore_train:
            model.restore(sess, checkpoint_dir, restore_file, imitation_mode)
            # need to develop training from restored time steps

        return model, continue_train

    def show_train_graph(self, loss_list, v_loss_list):
        plt.plot(loss_list)
        plt.plot(v_loss_list)
        plt.xlabel("Steps")
        plt.ylabel("Loss")
        plt.show()

    def summary_returns(self, returns, title):
        time_steps = returns.shape[0]
        return_mean = np.mean(returns)
        return_std = np.std(returns)
        print()
        print(title, " Return Summary:")
        print("Rollouts : ", time_steps)
        print("Mean     : ", return_mean)
        print("Stdev    : ", return_std)

    def reset_early_stop(self):
        self.early_stop_count = 0
        return False

    def check_early_stop(self, loss, last_loss):
        early_stop_threshold = self.config.bc.early_stop_threshold
        early_stop_count_threshold = self.config.bc.early_stop_count_threshold

        diff = loss - last_loss
        if abs(diff) < early_stop_threshold:
            self.early_stop_count += 1
            if self.early_stop_count >= early_stop_count_threshold:
                print("v_loss - last_v_loss ", diff,
                      "early_stop_count", self.early_stop_count)
                return True
            return False
        return self.reset_early_stop()

    def check_epochs(self, epochs, epoch, loss):
        threshold = self.config.bc.loss_convergence_threshold
        if epoch > epochs and abs(loss) > threshold:
            max_epochs = self.config.bc.max_epochs
            new_epochs = epochs + int(epochs * 0.1)
            return min([new_epochs, max_epochs])
        return epochs

    def get_tb_logger(self, sess, envname, imitation_mode=ImitationMode.bc):
        log_dir = self.config.bc.log_dir
        imitation_mode_str = util.imitation_mode_str[imitation_mode]
        log_path = os.path.join(log_dir, envname, imitation_mode_str)
        if not os.path.exists(log_path):
            os.makedirs(log_path)
        return tf.summary.FileWriter(log_path, sess.graph)

    def test(self, sess, expert_data_loader):
        self.test_policy(sess)

        # rollout bc policy
        num_rollouts = self.config.bc.num_rollouts
        max_steps = self.config.bc.max_steps
        experience = self.rollout_policy(sess, self.envname, max_steps,
                                         num_rollouts)

        self.summary_returns(expert_data_loader.returns, "Expert")
        self.summary_returns(experience['returns'], "Imitation Learning")
        return experience

    def test_policy(self, sess):
        print('Testing...')
        num_timesteps = self.x_test.shape[0]
        batch_size = self.config.bc.batch_size
        display_step = self.config.bc.test_display_step
        num_steps = num_timesteps // batch_size
        loss_list = []

        # mini batch iterations
        for step in range(1, num_steps + 1):
            start_idx = step * batch_size
            end_idx = (step + 1) * batch_size
            x_obs = self.x_test[start_idx:end_idx]
            x_actions = self.y_test[start_idx:end_idx]

            if step % display_step == 1:
                # Calculate batch loss and accuracy
                loss, log_test_loss = self.model.test(sess, x_obs, x_actions)
                print("step " + str(step) +
                      ", test loss " + "{:.5f}".format(loss))
                self.tb_logger.add_summary(log_test_loss, step)
                loss_list.append(loss)

    def rollout_policy(self, sess, envname, max_steps, num_rollouts=10,
                       render=True):
        observations = []
        actions = []
        returns = []

        env = gym.make(envname)
        for i in range(num_rollouts):
            if render:
                print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                observations.append(obs)
                actions.append(
                    self.model.predict(sess, np.expand_dims(obs, axis=0))[0])
                obs, r, done, _ = env.step(actions[-1])
                totalr += r
                steps += 1
                if render:
                    env.render()
                if steps >= max_steps:
                    break
            returns.append(totalr)

        experience = {
            'observations': np.array(observations),
            'actions': np.array(np.squeeze(actions)),
            'returns': np.array(returns)
        }
        return experience

    def add_experience(self, sess, expert_data_loader):
        # rollout policy
        num_rollouts = self.config.bc.num_rollouts
        max_steps = self.config.bc.max_steps
        experience = self.rollout_policy(sess, self.envname, max_steps,
                                         num_rollouts, False)
        experience = self.ask_to_expert(experience)

        self.x_train = np.concatenate(
            (self.x_train, experience['observations']))
        self.y_train = np.concatenate((self.y_train, experience['actions']))
        '''
        self.x_train, self.y_train, \
        self.x_valid, self.y_valid, \
        self.x_test, self.y_test = \
            expert_data_loader.add_experience(experience)
        '''

    def ask_to_expert(self, experience):
        # expert policy
        if self.expert is None:
            self.expert = Expert(self.config)
            self.policy_fn = self.expert.load_expert_policy(self.envname)

        batch_size = self.config.bc.batch_size
        observations = experience['observations']
        actions = experience['actions']
        num_timesteps = observations.shape[0]
        num_steps = num_timesteps // batch_size

        for step in range(0, num_steps):
            start_idx = step * batch_size
            end_idx = (step + 1) * batch_size
            actions[start_idx:end_idx] = \
                self.policy_fn(observations[start_idx:end_idx, :])

        experience['actions'] = actions
        return experience
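# Hedged, self-contained sketch of the DAgger step that add_experience and
# ask_to_expert above implement: roll out the current policy, relabel the
# visited states with the expert's actions, and aggregate them into the
# training set. The callables and array shapes are illustrative assumptions,
# not the classes defined above.
import numpy as np

def dagger_iteration(policy_fn, expert_fn, env_rollout, x_train, y_train):
    """One DAgger iteration: collect states under the learner, label with the expert."""
    observations = env_rollout(policy_fn)      # states visited by the learner
    expert_actions = expert_fn(observations)   # expert labels for those states
    x_train = np.concatenate((x_train, observations))
    y_train = np.concatenate((y_train, expert_actions))
    return x_train, y_train                    # retrain the policy on the aggregate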
                    help="Number of epochs at init")
parser.add_argument("--batch_size", type=int, default=32,
                    help="Size of the minibatch")
args = parser.parse_args()

args.cuda = torch.cuda.is_available()
# pylint: disable=E1101
args.device = torch.device("cuda" if args.cuda else "cpu")
# pylint: enable=E1101

# Data
data = translated_gaussian_dataset(args.batch_size, args)

# Model
experts = [Expert(args).to(args.device) for i in range(args.num_experts)]
discriminator = Discriminator(args).to(args.device)
# initialize_experts(experts, data, args)

discriminator_opt = torch.optim.Adam(discriminator.parameters())
expert_opt = []
for e in experts:
    expert_opt.append(torch.optim.Adam(e.parameters()))

for n in range(args.num_epoch):
    train_icm(experts, expert_opt, discriminator, discriminator_opt, data, args)
    print([e(torch.Tensor(np.array([[0.0, 0.0]]))) for e in experts])
            feed_dict[v] = data[k]
        else:
            for t, d in zip(v, data[k]):
                feed_dict[t] = d.astype(t.dtype.as_numpy_dtype)
    return feed_dict


if __name__ == "__main__":
    estimate_size = 256
    estimate_scale = 3
    episode_size = 360

    net = CMAP(image_size=(episode_size, episode_size, 3))
    exp = Expert()
    env = get_game_environment(width=str(episode_size),
                               height=str(episode_size))

    while True:
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            env.reset()
            obs = env.observations()
            obs["pose.loc"] = obs["DEBUG.POS.TRANS"]
            print("Init player loc:", obs["pose.loc"][:2])
            print("Init player node(row, col):", exp.player_node(obs))
    for idx, image, focallength, gt_pose, gt_coords, gt_expert in trainset_loader:
        gt_coords = gt_coords[0]
        gt_coords = gt_coords.view(3, -1)

        coord_mask = gt_coords.abs().sum(0) > 0
        gt_coords = gt_coords[:, coord_mask]

        mean += gt_coords.sum(1)
        count += int(coord_mask.sum())

    mean /= count

    print("Done. Mean: %.2f, %.2f, %.2f\n" % (mean[0], mean[1], mean[2]))

    model = Expert(mean)

else:
    # === large, connected environment, perform clustering ==================
    from cluster_dataset import ClusterDataset
    trainset = ClusterDataset("training",
                              num_clusters=opt.clusters,
                              cluster=opt.expert)
    trainset_loader = torch.utils.data.DataLoader(trainset,
                                                  shuffle=True,
                                                  num_workers=6)

    model = Expert(trainset.cam_centers[opt.expert])

model.cuda()
model.train()

model_file = 'expert_e%d_%s.net' % (opt.expert, opt.session)
    from dataset import RoomDataset
    trainset = RoomDataset("training", scene=opt.expert)
else:
    # === large, connected environment, perform clustering ==================
    from cluster_dataset import ClusterDataset
    trainset = ClusterDataset("training",
                              num_clusters=opt.clusters,
                              cluster=opt.expert)

trainset_loader = torch.utils.data.DataLoader(trainset,
                                              shuffle=True,
                                              num_workers=6)

model = Expert(torch.zeros((3, )))
model.load_state_dict(
    torch.load('expert_e%d_%s.net' % (opt.expert, opt.session)))
print("Successfully loaded model.")

model.cuda()
model.train()

model_file = 'expert_e%d_%s_refined.net' % (opt.expert, opt.session)

optimizer = optim.Adam(model.parameters(), lr=opt.learningrate)
scheduler = optim.lr_scheduler.StepLR(optimizer,
                                      step_size=opt.lrssteps,
                                      gamma=opt.lrsgamma)
def main():
    PATH_TO_LOGGING = '/home/mirshad7/habitat_imitation_learning/logger'
    save_model_path = '/home/mirshad7/hierarchical_imitation/learning_module/checkpoint'
    writer = SummaryWriter(PATH_TO_LOGGING)

    # EncoderCNN architecture
    CNN_fc_hidden1 = 256
    CNN_embed_dim = 150   # latent dim extracted by 2D CNN
    dropout_p_CNN = 0.3   # dropout probability
    pose_feature_dim = 72

    # DecoderRNN architecture
    RNN_hidden_layers = 3
    RNN_hidden_nodes = 100
    RNN_FC_dim = 50
    output_dim = 6
    dropout_p_RNN = 0.3

    # Detect devices
    img_x = 224
    img_y = 224
    use_cuda = torch.cuda.is_available()                  # check if GPU exists
    device = torch.device("cuda" if use_cuda else "cpu")  # use CPU or GPU

    params = {
        'lr': 1e-4,
        'batch_size': 15,
        'epochs': 30,
        'model': 'enoder_decoder'
    }

    # Expert params
    num_scenes = 72
    num_episodes_per_scene = 10
    min_distance = 2
    max_distance = 18
    val_split = 0.2
    data_path_train = 'data/datasets/pointnav/gibson/v1/all/training_batch_0.json.gz'
    data_path_val = 'data/datasets/pointnav/gibson/v1/val/val.json.gz'
    scene_dir = 'data/scene_datasets/'
    mode = "exact_gradient"
    config_path = "configs/tasks/pointnav_gibson.yaml"
    num_traj_train = num_scenes * num_episodes_per_scene
    num_traj_val = int(num_traj_train * val_split)

    dataloader_params = {
        'batch_size': params['batch_size'],
        'shuffle': True,
        'num_workers': 0,
        'pin_memory': True
    } if use_cuda else {}
    log_interval = 3  # interval for displaying training info

    transform = transforms.Compose([
        transforms.Resize([img_x, img_y]),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    expert_train = Expert(data_path_train, scene_dir, mode, config_path, transform)
    images_train, actions_train = expert_train.read_observations_and_actions(
        num_traj_train, min_distance, max_distance)
    expert_val = Expert(data_path_val, scene_dir, mode, config_path, transform)
    images_val, actions_val = expert_val.read_observations_and_actions(
        num_traj_val, min_distance, max_distance)

    # Define datasets here
    train_set = Dataset_RNN(images_train, actions_train)
    val_set = Dataset_RNN(images_val, actions_val)
    train_loader = data.DataLoader(train_set, **dataloader_params,
                                   collate_fn=pad_collate, drop_last=True)
    val_loader = data.DataLoader(val_set, **dataloader_params,
                                 collate_fn=pad_collate, drop_last=True)

    print("==================================================================================")
    print("                            ...DATA LOADING DONE....                             ")
    print("                           ...STARTING TRAIN LOOP....                            ")
    print("==================================================================================")

    # Create model
    cnn_encoder = CNNEncoder(fc_hidden1=CNN_fc_hidden1,
                             CNN_embed_dim=CNN_embed_dim,
                             drop_p=dropout_p_CNN).to(device)
    rnn_decoder = DecoderRNN(embed_dim=CNN_embed_dim,
                             h_RNN_layers=RNN_hidden_layers,
                             num_hidden=RNN_hidden_nodes,
                             h_FC_dim=RNN_FC_dim,
                             drop_prob=dropout_p_RNN,
                             num_classes=output_dim).to(device)

    crnn_params = list(cnn_encoder.fc1.parameters()) + list(cnn_encoder.bn1.parameters()) + \
        list(cnn_encoder.fc2.parameters()) + list(rnn_decoder.parameters())
    optimizer = torch.optim.Adam(crnn_params, lr=params['lr'])
    criterion = nn.CrossEntropyLoss(ignore_index=-1)

    # train model
    for epoch in range(params['epochs']):
        train(log_interval, [cnn_encoder, rnn_decoder], criterion, device,
              train_loader, optimizer, epoch, params['batch_size'], output_dim,
              params, writer)
        validate(log_interval, [cnn_encoder, rnn_decoder], criterion, device,
                 val_loader, epoch, params['batch_size'], output_dim, params,
                 writer)