def __init__(self,
             config,
             global_network,
             thread_index,
             network_scope="network",
             scene_scope="scene",
             task_scope="task"):
    self.thread_index = thread_index
    self.config = config
    self.network_scope = network_scope
    self.scene_scope = scene_scope
    self.task_scope = task_scope
    self.scopes = [network_scope, scene_scope, task_scope]
    self.local_network = global_network
    self.env = Environment({
        'scene_name': self.scene_scope,
        'terminal_state_id': int(self.task_scope)
    })
    self.env.reset()
    self.expert = Expert(self.env)
    self.local_t = 0
    self.episode_length = 0
    self.first_iteration = True  # first iteration of Dagger
    # training dataset
    self.states = []
    self.actions = []
    self.targets = []
Example #2
    def ask_to_expert(self, experience):
        # expert policy
        if self.expert is None:
            self.expert = Expert(self.config)
            self.policy_fn = self.expert.load_expert_policy(self.envname)

        batch_size = self.config.bc.batch_size
        observations = experience['observations']
        actions = experience['actions']
        num_timesteps = observations.shape[0]
        num_steps = num_timesteps // batch_size

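        # DAgger-style relabeling: overwrite the rolled-out actions with the
        # expert policy's actions for every visited observation, batch by batch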
        for step in range(0, num_steps):
            start_idx = step * batch_size
            end_idx = (step + 1) * batch_size
            actions[start_idx:end_idx] = \
                self.policy_fn(observations[start_idx:end_idx, :])

        experience['actions'] = actions
        return experience
Example #3
    def __init__(self, num_experts, lr=0, cam_centers=None, gating_capacity=1):

        self.num_experts = num_experts
        self.lr = lr  # learning rate

        if cam_centers is None:
            cam_centers = torch.zeros(num_experts, 3)

        cam_centers = cam_centers.cuda()

        # setup gating network
        self.model_g = Gating(num_experts, gating_capacity)
        self.model_g = self.model_g.cuda()
        self.model_g.train()
        self.optimizer_g = optim.Adam(self.model_g.parameters(), lr=lr)

        # setup expert networks
        self.experts = []
        self.expert_opts = []

        for i in range(0, num_experts):

            model_e = Expert(cam_centers[i])
            model_e = model_e.cuda()
            model_e.train()
            optimizer_e = optim.Adam(model_e.parameters(), lr=lr)

            self.experts.append(model_e)
            self.expert_opts.append(optimizer_e)
Example #4
def observeExpert():  #Feature sum for one run of the optimal policy
    """ Main function, runs the experiment. """
    expert = Expert(
        int(sys.argv[1])
    )  # initialise an expert with a certain policy from the pre-trained ones
    env = init_env()  # initialise an environment
    featureSum = [0, 0, 0]  # feature expectations is a 1x3 vector
    counter = 1  # counter to calculate the average over runs of the policy
    expert.start()
    state, reward = env.reset()
    while not env.terminal:
        action = expert.step(state, reward)
        state, reward = env.update(action)
        feat = featuresFromState(state)
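        # element-wise add this step's features to the running feature sum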
        featureSum = [sum(x) for x in zip(*[featureSum, feat])]
        counter += 1
    expert.end(reward)
    return featureSum
Example #6
    def __init__(self, saver, model, global_step):
        super(Trainer, self).__init__()

        self._exp = Expert()
        self._net = model
        self._update_global_step_op = tf.assign_add(global_step, 1)
        self._enough_history = False

        optimizer_class = getattr(tf.train, FLAGS.optimizer)
        optimizer = optimizer_class(learning_rate=FLAGS.learning_rate)
        self._update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        loss_key = 'loss' if not FLAGS.learn_mapper else 'estimate_loss'

        with tf.control_dependencies(self._update_ops):
            gradients, variables = zip(
                *optimizer.compute_gradients(model.output_tensors[loss_key]))
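            # optionally clip gradients by global norm (FLAGS.grad_clip) before applying them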
            if FLAGS.grad_clip > 0:
                gradients_constrained, _ = tf.clip_by_global_norm(
                    gradients, FLAGS.grad_clip)
            else:
                gradients_constrained = gradients
            self._gradient_names = [
                v.name for g, v in zip(gradients_constrained, variables)
                if g is not None
            ]
            self._gradient_summary_op = [
                tf.reduce_mean(tf.abs(g)) for g in gradients_constrained
                if g is not None
            ]
            self._train_op = optimizer.apply_gradients(zip(
                gradients_constrained, variables),
                                                       global_step=global_step)

        with tf.control_dependencies([self._train_op]):
            self._train_loss = model.output_tensors[loss_key]

        self._writer = Proc._build_writer()
Example #7
class DaggerThread(object):
    def __init__(self,
                 config,
                 global_network,
                 thread_index,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):
        self.thread_index = thread_index
        self.config = config
        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]
        self.local_network = global_network
        self.env = Environment({
            'scene_name': self.scene_scope,
            'terminal_state_id': int(self.task_scope)
        })
        self.env.reset()
        self.expert = Expert(self.env)
        self.local_t = 0
        self.episode_length = 0
        self.first_iteration = True  # first iteration of Dagger
        # training dataset
        self.states = []
        self.actions = []
        self.targets = []

    def choose_action_label_smooth(self, expected_action, epsilon):
        """ P(k) =  (1-epsilon) * P_e +  e * 1/N """
        pi_values = [epsilon / float(self.config.action_size)
                     ] * self.config.action_size
        pi_values[expected_action] += 1 - epsilon
        return pi_values

    def choose_action_greedy(self, pi_values):
        # greedy algorithm since this is supervised learning
        return np.argmax(pi_values, axis=0)

    def choose_action(self, pi_values):
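        # sample an index in proportion to pi_values (inverse-CDF sampling over
        # the cumulative sum, so the input need not be normalised)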
        values = []
        s = 0.0
        for rate in pi_values:
            s += rate
            values.append(s)
        r = random.random() * s
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    def add_summary(self, writer, value_dict):
        if writer is None or len(value_dict) == 0:
            return
        value = [
            tf.Summary.Value(tag=k, simple_value=v)
            for k, v in value_dict.items()
        ]
        summary = tf.Summary(value=value)
        writer.add_summary(summary,
                           global_step=self.local_network.get_global_step())
        logging.debug("writing summary %s" % (str(summary)))

    def train(self, session, writer):
        assert len(self.states) == len(
            self.actions), "data count of action and state mismatch"
        s = self.states
        a = self.actions
        n_total = len(s)
        assert n_total > 0, "null dataset"
        t = [self.env.s_target] * n_total
        if n_total > self.config.batch_size:
            data = list(zip(s, a))
            np.random.shuffle(data)
            s, a = zip(*data)
        local_t = self.local_t
        scope = self.scene_scope + '/' + self.task_scope
        for epoch in range(self.config.max_epochs):
            train_loss, train_accuracy = self.local_network.run_epoch(
                session, self.scopes, s, t, a, True, writer)
            global_step = self.local_network.get_global_step()
            logging.info(
                "%(scope)s:t=%(local_t)d "
                "train_step=%(global_step)d loss=%(train_loss)f acc=%(train_accuracy)f"
                % locals())
        return

    def process(self, sess, global_t, summary_writer):
        start_local_t = self.local_t
        # draw experience with current policy or expert policy
        terminal = False
        for i in range(self.config.local_t_max):
            if self.first_iteration:
                # use expert policy before any training
                expert_action = action = self.expert.get_next_action()
                expert_lsr_pi = self.choose_action_label_smooth(
                    expert_action, self.config.lsr_epsilon)
            else:
                expert_action = self.expert.get_next_action()
                expert_lsr_pi = self.choose_action_label_smooth(
                    expert_action, self.config.lsr_epsilon)
                pi_ = self.local_network.run_policy(sess, self.env.s_t,
                                                    self.env.s_target,
                                                    self.scopes)
                action = self.choose_action(pi_)
                logging.debug(
                    "action=%(action)d expert_action=%(expert_action)d "
                    "expert_lsr_pi=%(expert_lsr_pi)s pi_=%(pi_)s" % locals())
            self.states.insert(0, self.env.s_t)
            self.actions.insert(0, expert_lsr_pi)
            self.env.step(action)
            self.env.update()
            terminal = True if self.episode_length > self.config.max_steps_per_e else self.env.terminal
            self.episode_length += 1
            self.local_t += 1
            if terminal:
                logging.info(
                    "[episode end] time %d | thread #%d | scene %s | target #%s expert:%s episode length = %d\n"
                    % (global_t, self.thread_index, self.scene_scope,
                       self.task_scope, "T" if self.first_iteration else "F",
                       self.episode_length))
                summary_values = {
                    "episode_length_input": float(self.episode_length),
                }
                if not self.first_iteration:
                    # record agent's score only
                    self.add_summary(summary_writer, summary_values)
                self.episode_length = 0
                self.env.reset()
                break
        # train policy network with gained labels
        self.train(sess, summary_writer)
        self.first_iteration = False
        return self.local_t - start_local_t

    def evaluate(self, sess, n_episodes, expert_agent=False):
        ep_lengths = []
        ep_collisions = []
        accuracies = []
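        # accuracies holds, per step, whether the learned policy picked the same
        # action as the expert (only collected when expert_agent is False)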
        for i in range(n_episodes):
            self.env.reset()
            terminal = False
            step = 0
            n_collision = 0
            while not terminal:
                if expert_agent:
                    action = self.expert.get_next_action()
                else:
                    expert_action = self.expert.get_next_action()
                    pi_ = self.local_network.run_policy(
                        sess, self.env.s_t, self.env.s_target, self.scopes)
                    action = self.choose_action(pi_)
                    accuracies.append(1.0 if expert_action == action else 0.0)
                    logging.debug(
                        "action=%(action)d expert_action=%(expert_action)d pi_=%(pi_)s"
                        % locals())
                self.env.step(action)
                self.env.update()
                terminal = self.env.terminal
                if step > self.config.max_steps_per_e:
                    terminal = True
                    logging.debug("episode %(i)d hits max steps" % locals())
                n_collision += int(self.env.collided)
                step += 1
            logging.debug("episode %(i)d ends with %(step)d steps" % locals())
            ep_lengths.append(step)
            ep_collisions.append(n_collision)
        return ep_lengths, ep_collisions, accuracies
Example #8
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "ExpertDDPG":
        policy = ExpertDDPG.ExpertDDPG(state_dim, action_dim, max_action)

    replay_buffer = utils.ReplayBuffer()

    ### expert 6/28
    expert = Expert(args.expert_dir)
    value_expert = expert.value()  ### compute the expert's value 6/28

    all_episode_reward = []

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True

    # Evaluate untrained policy
    reward_gd, reward_pred = evaluate_policy(policy, expert_value=value_expert)
    value_step = [total_timesteps]
    value_true = [reward_gd]
    value_pred = [reward_pred]
Example #9
    def __call__(self, lock, history, sess, coord):
        assert isinstance(history, deque)
        assert isinstance(sess, tf.Session)
        assert isinstance(coord, tf.train.Coordinator)

        history_lock = lock

        env = environment.get_game_environment(
            self._maps,
            multiproc=FLAGS.multiproc,
            random_goal=FLAGS.random_goal,
            random_spawn=FLAGS.random_spawn,
            apple_prob=FLAGS.apple_prob,
            episode_length=FLAGS.episode_length)
        exp = Expert()

        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                try:
                    if not self._eval:
                        train_global_step, np_global_step, model_version = sess.run(
                            [
                                self._train_global_step,
                                self._update_explore_global_step_op,
                                self._model_version
                            ])

                        if model_version != train_global_step:
                            self._update_graph(sess)

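                        # supervision (expert-action) probability decays
                        # exponentially with the training step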
                        random_rate = FLAGS.supervision_rate * np.exp(
                            -train_global_step / FLAGS.decay)
                        if FLAGS.learn_mapper:
                            random_rate = 2
                    else:
                        np_global_step = sess.run(
                            self._update_explore_global_step_op)
                        random_rate = 0

                    env.reset()
                    obs, info = env.observations()

                    episode = dict()
                    episode['act'] = [np.argmax(exp.get_optimal_action(info))]
                    episode['obs'] = [self._merge_depth(obs, info['depth'])]
                    episode['ego'] = [[0., 0., 0.]]
                    episode['est'] = [
                        exp.get_free_space_map(
                            info, estimate_size=FLAGS.estimate_size)
                    ]
                    episode['gol'] = [
                        exp.get_goal_map(info,
                                         estimate_size=FLAGS.estimate_size)
                    ]
                    episode['rwd'] = [0.]
                    episode['inf'] = [deepcopy(info)]

                    estimate_map_list = [
                        np.zeros(
                            (1, FLAGS.estimate_size, FLAGS.estimate_size, 3))
                        for _ in range(FLAGS.estimate_scale)
                    ]
                    old_estimate_map_list = estimate_map_list

                    for _ in range(FLAGS.episode_size):
                        prev_info = deepcopy(episode['inf'][-1])
                        optimal_action = exp.get_optimal_action(prev_info)

                        expand_dim = lambda x: np.array([[x[-1]]])
                        feed_data = {
                            'sequence_length': np.array([1]),
                            'visual_input': expand_dim(episode['obs']),
                            'egomotion': expand_dim(episode['ego']),
                            'reward': expand_dim(episode['rwd']),
                            'space_map': expand_dim(episode['est']),
                            'goal_map': expand_dim(episode['gol']),
                            'estimate_map_list': estimate_map_list,
                            'optimal_action': expand_dim(episode['act']),
                            'optimal_estimate': expand_dim(episode['est']),
                            'is_training': False
                        }
                        feed_dict = prepare_feed_dict(self._net.input_tensors,
                                                      feed_data)

                        results = sess.run(
                            [self._net.output_tensors['action']] + self._net.
                            intermediate_tensors['estimate_map_list'],
                            feed_dict=feed_dict)

                        predict_action = np.squeeze(results[0])
                        old_estimate_map_list = estimate_map_list
                        estimate_map_list = [m[0] for m in results[1:]]

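                        # DAgger-style mixing: follow the expert's action with
                        # probability random_rate, otherwise the network's prediction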
                        if np.random.rand() < random_rate and not self._eval:
                            dagger_action = optimal_action
                        else:
                            dagger_action = predict_action

                        action = np.argmax(dagger_action)
                        obs, reward, terminal, info = env.step(action)

                        if not terminal:
                            episode['act'].append(np.argmax(optimal_action))
                            episode['obs'].append(
                                self._merge_depth(obs, info['depth']))
                            episode['ego'].append(
                                environment.calculate_egomotion(
                                    prev_info['POSE'], info['POSE']))
                            episode['est'].append(
                                exp.get_free_space_map(
                                    info, estimate_size=FLAGS.estimate_size))
                            episode['gol'].append(
                                exp.get_goal_map(
                                    info, estimate_size=FLAGS.estimate_size))
                            episode['rwd'].append(deepcopy(reward))
                            episode['inf'].append(deepcopy(info))
                        else:
                            break

                    if not self._eval:
                        history.append(episode)

                    if np_global_step % FLAGS.save_every == 0 or self._eval:
                        feed_data = {
                            'sequence_length': np.array([1]),
                            'visual_input': expand_dim(episode['obs']),
                            'egomotion': expand_dim(episode['ego']),
                            'reward': expand_dim(episode['rwd']),
                            'space_map': expand_dim(episode['est']),
                            'goal_map': expand_dim(episode['gol']),
                            'estimate_map_list': old_estimate_map_list,
                            'optimal_action': expand_dim(episode['act']),
                            'optimal_estimate': expand_dim(episode['est']),
                            'is_training': False
                        }
                        feed_dict = prepare_feed_dict(self._net.input_tensors,
                                                      feed_data)

                        summary_ops = self._estimate_maps + self._goal_maps + self._reward_maps + self._value_maps
                        results = sess.run(summary_ops, feed_dict=feed_dict)

                        estimate_maps_images = results[:len(self._estimate_maps
                                                            )]
                        results = results[len(self._estimate_maps):]
                        goal_maps_images = results[:len(self._goal_maps)]
                        results = results[len(self._goal_maps):]
                        fused_maps_images = results[:len(self._reward_maps)]
                        results = results[len(self._reward_maps):]
                        value_maps_images = results[:len(self._value_maps)]
                        results = results[len(self._value_maps):]

                        assert len(results) == 0

                        postfix = '_eval' if self._eval else ''

                        self._writer.add_summary(self._build_map_summary(
                            estimate_maps_images, episode['est'],
                            goal_maps_images, fused_maps_images,
                            value_maps_images, postfix),
                                                 global_step=np_global_step)

                        # summary_text = ','.join('{}[{}]-{}={}'.format(key, idx, step, value)
                        #                         for step, info in enumerate(episode['inf'])
                        #                         for key in ('GOAL.LOC', 'SPAWN.LOC', 'POSE', 'env_name')
                        #                         for idx, value in enumerate(info[key]))
                        # step_episode_summary = sess.run(self._step_history_op,
                        #                                 feed_dict={self._step_history: summary_text})
                        # self._writer.add_summary(step_episode_summary, global_step=np_global_step)
                        self._writer.add_summary(
                            self._build_trajectory_summary(
                                episode['rwd'], episode['inf'], exp,
                                random_rate, postfix),
                            global_step=np_global_step)

                    if self._eval and FLAGS.total_steps <= np_global_step:
                        coord.request_stop()
                except Exception as e:
                    print(e)
Example #10
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "ExpertDDPG":
        policy = ExpertDDPG.ExpertDDPG(state_dim, action_dim, max_action)
        policy_contrast = ExpertDDPG.ExpertDDPG(
            state_dim, action_dim, max_action)  ### baseline that does not use the expert, for comparison 6/28

    replay_buffer = utils.ReplayBuffer()
    replay_buffer_contrast = utils.ReplayBuffer()  ### the two policies must not share the same replay buffer 6/28

    ### expert 6/28
    expert_dir = './expert_data/'
    expert = Expert(expert_dir)

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    done_contrast = True
    expert_flag = True  ### decides whether the expert policy is currently used 6/28

    # Evaluate untrained policy
    evaluations = [(total_timesteps, evaluate_policy(policy, policy_contrast))
                   ]  ### tuple 6/28

    while total_timesteps < args.max_timesteps:
        '''################### without expert #####################
        if done_contrast: 
Example #11
print(sc._conf.getAll())
#sc=None

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

inputs = Input(shape=x_train.shape[1:])

#Load init experts
experts = []
for i in range(5):
    tempExpert = Expert(x_train,y_train,x_test,y_test, 32, str(i + 1), inputs)
    experts.append(tempExpert.expertModel)

#Storage dir for MoE weights
moe_weights_file='../lib/weights/moe_full'

# Convert class vectors to binary class matrices.
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

#Create MoE model and train it with two experts
moeModel = Mixture(x_train, y_train, x_test, y_test, experts, inputs, sc)
moeModel.train_init(datagen, moe_weights_file)


Example #12
class BehaviorCloning():
    """Behavior Cloning Classes.

    Attributes:
        config : configuration object
        envname : environment name

    """
    def __init__(self, config):
        self.config = config
        self.envname = config.bc.envname
        self.expert = None

    def train(self, imitation_mode=ImitationMode.bc):
        """Training for Behavior Cloning

        1. Set hyper parameters
        2. Load the expert data
        3. Calculate the number of features and actions
        4. Create the Behavior Cloning model
        5. Train the Behavior Cloning model

        """
        # Hyper parameters
        epochs = self.config.bc.epochs
        batch_size = self.config.bc.batch_size
        display_step = self.config.bc.display_step
        keep_prob = self.config.bc.keep_prob

        # checkpoint configuration
        checkpoint_dir = self.config.bc.checkpoint_dir

        # load expert data
        expert_data_loader = DataLoader()

        self.x_train, self.y_train, \
        self.x_valid, self.y_valid, \
        self.x_test, self.y_test = expert_data_loader.load(self.envname)

        # calculate the number of features and actions
        self.num_features = self.x_train.shape[1]
        self.num_actions = self.y_train.shape[1]

        # Training Phase
        print('Training...')
        with tf.Session() as sess:
            # tensorboard logger
            self.tb_logger = self.get_tb_logger(sess, self.envname,
                                                imitation_mode)

            # build model
            self.model, continue_train = self.create_model(
                sess, self.num_features, self.num_actions, imitation_mode)

            # if not necessary to train the model, do test
            if not continue_train:
                self.test(sess, expert_data_loader)
                return

            # Training cycle
            self.num_timesteps = self.x_train.shape[0]
            num_steps = self.num_timesteps // batch_size
            epoch = 1
            global_step = 1
            early_stop = self.reset_early_stop()
            loss_list = []
            v_loss_list = []
            last_v_loss = 0
            while epoch <= epochs:
                print('epoch ', epoch)
                self.x_train, self.y_train = \
                    util.shuffle_dataset(self.x_train, self.y_train)

                self.num_timesteps = self.x_train.shape[0]
                num_steps = self.num_timesteps // batch_size
                # mini batch iterations
                for step in range(0, num_steps):
                    start_idx = step * batch_size
                    end_idx = (step + 1) * batch_size
                    x_obs = self.x_train[start_idx:end_idx]
                    x_actions = self.y_train[start_idx:end_idx]

                    loss, log_loss, _ = \
                        self.model.update(sess, x_obs, x_actions, keep_prob)
                    if loss == 0: print(step, "loss is zero")
                    if global_step % display_step == 0:
                        # validation
                        v_loss, log_v_loss = self.model.validate(
                            sess, self.x_valid, self.y_valid)

                        print("step " + str(global_step) + \
                              ", train loss " + "{:.5f}".format(loss) + \
                              ", validation loss " + "{:.5f}".format(v_loss))

                        # tensorboard logging
                        self.tb_logger.add_summary(log_loss, global_step)
                        self.tb_logger.add_summary(log_v_loss, global_step)

                        # early stopping
                        early_stop = self.check_early_stop(v_loss, last_v_loss)
                        # make loss list for plotting
                        last_v_loss = v_loss
                        loss_list.append(loss)

                        v_loss_list.append(v_loss)
                        if early_stop: break

                    global_step += 1

                if early_stop: break
                epoch += 1
                # if loss is greater than the threshold, increase # epochs
                epochs = self.check_epochs(epochs, epoch, loss)
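                # DAgger: roll out the current policy, let the expert relabel the
                # visited observations, and append them to the training set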
                if imitation_mode == ImitationMode.DAgger:
                    self.add_experience(sess, expert_data_loader)
                    self.num_timesteps = self.x_train.shape[0]
                    num_steps = self.num_timesteps // batch_size

            print("step " + str(global_step) + \
                  ", train loss " + "{:.5f}".format(loss) + \
                  ", validation loss " + "{:.5f}".format(v_loss))

            # Save Model
            self.model.save(sess, checkpoint_dir, self.envname, global_step,
                            imitation_mode)

            # show loss plot
            self.show_train_graph(loss_list, v_loss_list)

            # test policy
            self.test(sess, expert_data_loader)

    def create_model(self,
                     sess,
                     num_features,
                     num_actions,
                     imitation_mode=ImitationMode.bc):

        # model configuration
        learning_rate = self.config.bc.learning_rate
        hidden_list = self.config.model.hidden_list[self.envname]

        # create a model
        model = Model(num_features, hidden_list, num_actions, learning_rate)
        continue_train = True

        # checkpoint configuration
        checkpoint_dir = self.config.bc.checkpoint_dir
        restore = self.config.bc.restore
        restore_file = self.config.bc.restore_file

        # initialize or restore the model
        if restore == ModelInit.new:
            # Initializing the variables
            sess.run(tf.global_variables_initializer())

        elif restore == ModelInit.restore_test:
            model.restore(sess, checkpoint_dir, restore_file, imitation_mode)
            continue_train = False

        elif restore == ModelInit.restore_train:
            model.restore(sess, checkpoint_dir, restore_file, imitation_mode)
            # need to develop training from restored time steps

        return model, continue_train

    def show_train_graph(self, loss_list, v_loss_list):
        plt.plot(loss_list)
        plt.plot(v_loss_list)
        plt.xlabel("Steps")
        plt.ylabel("Loss")
        plt.show()

    def summary_returns(self, returns, title):
        time_steps = returns.shape[0]
        return_mean = np.mean(returns)
        return_std = np.std(returns)

        print()
        print(title, " Return Summary:")
        print("Rollouts : ", time_steps)
        print("Mean : ", return_mean)
        print("Stdev : ", return_std)

    def reset_early_stop(self):
        self.early_stop_count = 0
        return False

    def check_early_stop(self, loss, last_loss):
        early_stop_threshold = self.config.bc.early_stop_threshold
        early_stop_count_threshold = self.config.bc.early_stop_count_threshold
        diff = loss - last_loss

        if abs(diff) < early_stop_threshold:
            self.early_stop_count += 1

            if self.early_stop_count >= early_stop_count_threshold:
                print("v_loss - last_v_loss ", diff, "early_stop_count",
                      self.early_stop_count)
                return True

            return False

        return self.reset_early_stop()

    def check_epochs(self, epochs, epoch, loss):

        threshold = self.config.bc.loss_convergence_threshold
        if epoch > epochs and abs(loss) > threshold:
            max_epochs = self.config.bc.max_epochs
            new_epochs = epochs + int(epochs * 0.1)
            return min([new_epochs, max_epochs])

        return epochs

    def get_tb_logger(self, sess, envname, imitation_mode=ImitationMode.bc):

        log_dir = self.config.bc.log_dir
        imitation_mode_str = util.imitation_mode_str[imitation_mode]
        log_path = os.path.join(log_dir, envname, imitation_mode_str)
        if not os.path.exists(log_path):
            os.makedirs(log_path)

        return tf.summary.FileWriter(log_path, sess.graph)

    def test(self, sess, expert_data_loader):
        self.test_policy(sess)

        # rollout bc policy
        num_rollouts = self.config.bc.num_rollouts
        max_steps = self.config.bc.max_steps
        experience = self.rollout_policy(sess, self.envname, max_steps,
                                         num_rollouts)

        self.summary_returns(expert_data_loader.returns, "Expert")
        self.summary_returns(experience['returns'], "Imitation Learning")

        return experience

    def test_policy(self, sess):
        print('Testing...')

        num_timesteps = self.x_test.shape[0]
        batch_size = self.config.bc.batch_size
        display_step = self.config.bc.test_display_step

        num_steps = num_timesteps // batch_size
        loss_list = []

        # mini batch iterations
        for step in range(1, num_steps + 1):
            start_idx = step * batch_size
            end_idx = (step + 1) * batch_size
            x_obs = self.x_test[start_idx:end_idx]
            x_actions = self.y_test[start_idx:end_idx]

            if step % display_step == 1:
                # Calculate batch loss and accuracy
                loss, log_test_loss = self.model.test(sess, x_obs, x_actions)

                print("step " + str(step) + \
                      ", test loss " + "{:.5f}".format(loss))

                self.tb_logger.add_summary(log_test_loss, step)
                loss_list.append(loss)

    def rollout_policy(self,
                       sess,
                       envname,
                       max_steps,
                       num_rollouts=10,
                       render=True):
        observations = []
        actions = []
        returns = []

        env = gym.make(envname)
        for i in range(num_rollouts):
            if render: print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                observations.append(obs)
                actions.append(
                    self.model.predict(sess, np.expand_dims(obs, axis=0))[0])
                obs, r, done, _ = env.step(actions[-1])
                totalr += r
                steps += 1
                if render: env.render()
                if steps >= max_steps: break

            returns.append(totalr)

        experience = {
            'observations': np.array(observations),
            'actions': np.array(np.squeeze(actions)),
            'returns': np.array(returns)
        }
        return experience

    def add_experience(self, sess, expert_data_loader):
        # rollout policy
        num_rollouts = self.config.bc.num_rollouts
        max_steps = self.config.bc.max_steps
        experience = self.rollout_policy(sess, self.envname, max_steps,
                                         num_rollouts, False)
        experience = self.ask_to_expert(experience)

        self.x_train = np.concatenate(
            (self.x_train, experience['observations']))

        self.y_train = np.concatenate((self.y_train, experience['actions']))
        '''
        self.x_train, self.y_train, \
        self.x_valid, self.y_valid, \
        self.x_test, self.y_test = \
            expert_data_loader.add_experience(experience)
        '''

    def ask_to_expert(self, experience):
        # expert policy
        if self.expert is None:
            self.expert = Expert(self.config)
            self.policy_fn = self.expert.load_expert_policy(self.envname)

        batch_size = self.config.bc.batch_size
        observations = experience['observations']
        actions = experience['actions']
        num_timesteps = observations.shape[0]
        num_steps = num_timesteps // batch_size

        for step in range(0, num_steps):
            start_idx = step * batch_size
            end_idx = (step + 1) * batch_size
            actions[start_idx:end_idx] = \
                self.policy_fn(observations[start_idx:end_idx, :])

        experience['actions'] = actions
        return experience
Example #13
File: train.py  Project: yidingjiang/icm
                        help="Number of epochs at init")
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Size of the minibatch")

    args = parser.parse_args()
    args.cuda = torch.cuda.is_available()
    # pylint: disable=E1101
    args.device = torch.device("cuda" if args.cuda else "cpu")
    # pylint: enable=E1101

    # Data
    data = translated_gaussian_dataset(args.batch_size, args)

    # Model
    experts = [Expert(args).to(args.device) for i in range(args.num_experts)]
    discriminator = Discriminator(args).to(args.device)

    # initialize_experts(experts, data, args)

    discriminator_opt = torch.optim.Adam(discriminator.parameters())
    expert_opt = []
    for e in experts:
        expert_opt.append(torch.optim.Adam(e.parameters()))

    for n in range(args.num_epoch):
        train_icm(experts, expert_opt, discriminator, discriminator_opt, data,
                  args)
        print([e(torch.Tensor(np.array([[0.0, 0.0]]))) for e in experts])
Example #14
                feed_dict[v] = data[k]
        else:
            for t, d in zip(v, data[k]):
                feed_dict[t] = d.astype(t.dtype.as_numpy_dtype)

    return feed_dict


if __name__ == "__main__":

    estimate_size = 256
    estimate_scale = 3
    episode_size = 360

    net = CMAP(image_size=(episode_size, episode_size, 3))
    exp = Expert()
    env = get_game_environment(width=str(episode_size),
                               height=str(episode_size))

    while True:

        with tf.Session() as sess:

            sess.run(tf.global_variables_initializer())

            env.reset()

            obs = env.observations()
            obs["pose.loc"] = obs["DEBUG.POS.TRANS"]
            print("Init player loc:", obs["pose.loc"][:2])
            print("Init player node(row, col):", exp.player_node(obs))
Example #15
	for idx, image, focallength, gt_pose, gt_coords, gt_expert in trainset_loader:

		gt_coords = gt_coords[0]
		gt_coords = gt_coords.view(3, -1)

		coord_mask = gt_coords.abs().sum(0) > 0
		gt_coords = gt_coords[:, coord_mask]

		mean += gt_coords.sum(1)
		count += int(coord_mask.sum())

	mean /= count

	print("Done. Mean: %.2f, %.2f, %.2f\n" % (mean[0], mean[1], mean[2]))

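	# create the expert network, initialised with the mean ground-truth scene coordinate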
	model = Expert(mean)

else:

	# === large, connected environment, perform clustering ==================
	from cluster_dataset import ClusterDataset
	trainset = ClusterDataset("training", num_clusters=opt.clusters, cluster=opt.expert)
	trainset_loader = torch.utils.data.DataLoader(trainset, shuffle=True, num_workers=6)

	model = Expert(trainset.cam_centers[opt.expert])


model.cuda()
model.train()

model_file = 'expert_e%d_%s.net' % (opt.expert, opt.session)
Example #16
    from dataset import RoomDataset
    trainset = RoomDataset("training", scene=opt.expert)

else:

    # === large, connected environment, perform clustering ==================
    from cluster_dataset import ClusterDataset
    trainset = ClusterDataset("training",
                              num_clusters=opt.clusters,
                              cluster=opt.expert)

trainset_loader = torch.utils.data.DataLoader(trainset,
                                              shuffle=True,
                                              num_workers=6)

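# build the expert with a placeholder (zero) center, then restore its pre-trained weights from the checkpoint below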
model = Expert(torch.zeros((3, )))
model.load_state_dict(
    torch.load('expert_e%d_%s.net' % (opt.expert, opt.session)))

print("Successfully loaded model.")

model.cuda()
model.train()

model_file = 'expert_e%d_%s_refined.net' % (opt.expert, opt.session)

optimizer = optim.Adam(model.parameters(), lr=opt.learningrate)
scheduler = optim.lr_scheduler.StepLR(optimizer,
                                      step_size=opt.lrssteps,
                                      gamma=opt.lrsgamma)
Example #17
def main():

    PATH_TO_LOGGING = '/home/mirshad7/habitat_imitation_learning/logger'
    save_model_path = '/home/mirshad7/hierarchical_imitation/learning_module/checkpoint'
    writer = SummaryWriter(PATH_TO_LOGGING)

    # EncoderCNN architecture
    CNN_fc_hidden1 = 256
    CNN_embed_dim = 150  # latent dim extracted by 2D CNN
    dropout_p_CNN = 0.3  # dropout probability
    pose_feature_dim = 72

    # DecoderRNN architecture
    RNN_hidden_layers = 3
    RNN_hidden_nodes = 100
    RNN_FC_dim = 50
    output_dim = 6
    dropout_p_RNN = 0.3

    # Detect devices
    img_x = 224
    img_y = 224
    use_cuda = torch.cuda.is_available()  # check if GPU exists
    device = torch.device("cuda" if use_cuda else "cpu")  # use CPU or GPU
    params = {
        'lr': 1e-4,
        'batch_size': 15,
        'epochs': 30,
        'model': 'enoder_decoder'
    }

    #Expert Params
    num_scenes = 72
    num_episodes_per_scene = 10
    min_distance = 2
    max_distance = 18
    val_split = 0.2
    data_path_train = 'data/datasets/pointnav/gibson/v1/all/training_batch_0.json.gz'
    data_path_val = 'data/datasets/pointnav/gibson/v1/val/val.json.gz'
    scene_dir = 'data/scene_datasets/'
    mode = "exact_gradient"
    config_path = "configs/tasks/pointnav_gibson.yaml"

    num_traj_train = num_scenes * num_episodes_per_scene
    num_traj_val = int(num_traj_train * val_split)

    dataloader_params = {
        'batch_size': params['batch_size'],
        'shuffle': True,
        'num_workers': 0,
        'pin_memory': True
    } if use_cuda else {}
    log_interval = 3  # interval for displaying training info
    transform = transforms.Compose([
        transforms.Resize([img_x, img_y]),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    expert_train = Expert(data_path_train, scene_dir, mode, config_path,
                          transform)
    images_train, actions_train = expert_train.read_observations_and_actions(
        num_traj_train, min_distance, max_distance)

    expert_val = Expert(data_path_val, scene_dir, mode, config_path, transform)
    images_val, actions_val = expert_val.read_observations_and_actions(
        num_traj_val, min_distance, max_distance)

    #Define dataset here
    train_set = Dataset_RNN(images_train, actions_train)
    val_set = Dataset_RNN(images_val, actions_val)
    train_loader = data.DataLoader(train_set,
                                   **dataloader_params,
                                   collate_fn=pad_collate,
                                   drop_last=True)
    val_loader = data.DataLoader(val_set,
                                 **dataloader_params,
                                 collate_fn=pad_collate,
                                 drop_last=True)

    print(
        "=================================================================================="
    )
    print(
        "                    ...DATA LOADING DONE....                                      "
    )
    print(
        "                    ...STARTING TRAIN LOOP....                                      "
    )
    print(
        "=================================================================================="
    )

    # Create model
    cnn_encoder = CNNEncoder(fc_hidden1=CNN_fc_hidden1,
                             CNN_embed_dim=CNN_embed_dim,
                             drop_p=dropout_p_CNN).to(device)
    rnn_decoder = DecoderRNN(embed_dim=CNN_embed_dim,
                             h_RNN_layers=RNN_hidden_layers,
                             num_hidden=RNN_hidden_nodes,
                             h_FC_dim=RNN_FC_dim,
                             drop_prob=dropout_p_RNN,
                             num_classes=output_dim).to(device)

    crnn_params = list(cnn_encoder.fc1.parameters()) + list(cnn_encoder.bn1.parameters()) + \
              list(cnn_encoder.fc2.parameters()) + list(rnn_decoder.parameters())

    optimizer = torch.optim.Adam(crnn_params, lr=params['lr'])
    criterion = nn.CrossEntropyLoss(ignore_index=-1)

    #train model
    for epoch in range(params['epochs']):
        train(log_interval, [cnn_encoder, rnn_decoder], criterion, device,
              train_loader, optimizer, epoch, params['batch_size'], output_dim,
              params, writer)
        validate(log_interval, [cnn_encoder, rnn_decoder], criterion, device,
                 val_loader, epoch, params['batch_size'], output_dim, params,
                 writer)