Example No. 1
def thread_memory(config, memory_queue, batch_queue, update_p_queue,
                  priority_environment_queue):
    memory = ReplayMemory(config, memory_queue, batch_queue, update_p_queue,
                          priority_environment_queue)
    memory.loop()
    return
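# A minimal usage sketch (not from the original source): thread_memory is
# typically launched in its own worker process (or thread), fed by
# multiprocessing queues; `config` is assumed to be whatever configuration
# object this ReplayMemory expects, and launch_memory_worker is a hypothetical helper.
from multiprocessing import Process, Queue

def launch_memory_worker(config):
    memory_queue, batch_queue = Queue(), Queue()
    update_p_queue, priority_environment_queue = Queue(), Queue()
    worker = Process(target=thread_memory,
                     args=(config, memory_queue, batch_queue,
                           update_p_queue, priority_environment_queue))
    worker.start()
    return worker, memory_queue, batch_queue, update_p_queue, priority_environment_queue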
Example No. 2
resize_shape = (1, 30, 90)  # resized observation shape used for training
FPS = 10  # limits the game screenshot frame rate

# Instantiate the game environment
env = DinoGame(reshape=resize_shape)
# Image input shape and action dimension
obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Create the policy model and the target model; the target model is not trained
policyQ = Model(obs_dim, action_dim)
targetQ = Model(obs_dim, action_dim)
targetQ.eval()

# Experience replay buffer
rpm = ReplayMemory(memory_size)
# Optimizer
optimizer = paddle.optimizer.Adam(parameters=policyQ.parameters(),
                                  learning_rate=learning_rate)


# Evaluate the model
def evaluate():
    total_reward = 0
    obs = env.reset()
    last_time = time.time()
    while True:
        obs = np.expand_dims(obs, axis=0)
        obs = paddle.to_tensor(obs, dtype='float32')
        action = targetQ(obs)
        action = paddle.argmax(action).numpy()[0]
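        # (The snippet is cut off here. A hedged continuation of the evaluation
        # loop, assuming DinoGame follows the Gym-style step() API returning
        # obs, reward, done, info -- a sketch, not the original code.)
        next_obs, reward, done, _ = env.step(action)
        total_reward += reward
        obs = next_obs
        # throttle screenshots to roughly the configured FPS
        while time.time() - last_time < 1.0 / FPS:
            time.sleep(0.001)
        last_time = time.time()
        if done:
            return total_reward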
Example No. 3
    def __init__(self, dimO, dimA, num_layer, num_nodes):
        self.dimA = dimA[0]
        dimA = list(dimA)
        dimO = list(dimO)
        if num_layer == 2:
            if num_nodes == 1:
                import ddpg_nets_dm_conv2_1
                nets = ddpg_nets_dm_conv2_1
            elif num_nodes == 2:
                import ddpg_nets_dm_conv2_2
                nets = ddpg_nets_dm_conv2_2
        elif num_layer == 3:
            if num_nodes == 1:
                import ddpg_nets_dm_conv3_1
                nets = ddpg_nets_dm_conv3_1
            elif num_nodes == 2:
                import ddpg_nets_dm_conv3_2
                nets = ddpg_nets_dm_conv3_2

        tau = FLAGS.tau
        discount = FLAGS.discount
        pl2norm = FLAGS.pl2norm
        l2norm = FLAGS.l2norm
        plearning_rate = FLAGS.prate
        learning_rate = FLAGS.rate
        outheta = FLAGS.outheta
        ousigma = FLAGS.ousigma

        # init replay memory
        self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA)
        # start tf session
        self.sess = tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=FLAGS.thread,
            log_device_placement=False,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True)))

        # create tf computational graph
        #
        self.theta_p = nets.theta_p(dimO, dimA, FLAGS.l1size, FLAGS.l2size)
        self.theta_q = nets.theta_q(dimO, dimA, FLAGS.l1size, FLAGS.l2size)
        # self.thetaq_cvx_ = [v for v in self.theta_q
        #                     if 'conv' in v.name]
        # self.makeCvx = [v.assign(-tf.abs(v)) for v in self.thetaq_cvx_]
        # self.proj = [v.assign(tf.minimum(v, 0)) for v in self.thetaq_cvx_]
        self.theta_pt, update_pt = exponential_moving_averages(self.theta_p, tau)
        self.theta_qt, update_qt = exponential_moving_averages(self.theta_q, tau)

        obs = tf.placeholder(tf.float32, [None] + dimO, "obs")
        act_test = nets.policy(obs, self.theta_p)

        # explore
        self.epsilon = 1
        self.noise = np.zeros(self.dimA)
        self.noise -= FLAGS.outheta*self.noise - \
                              FLAGS.ousigma*npr.randn(self.dimA)
        act_expl = act_test + self.epsilon * self.noise
        #self.epsilon -= 1/5000000

        # test
        q = nets.qfunction(obs, act_test, self.theta_q)
        # training

        # q optimization
        act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train")
        rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew")
        obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2")
        term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2")

        # policy loss
        act_train_policy = nets.policy(obs, self.theta_p)
        q_train_policy = nets.qfunction(obs, act_train_policy, self.theta_q)
        meanq = tf.reduce_mean(q_train_policy, 0)
        wd_p = tf.add_n([pl2norm * tf.nn.l2_loss(var) for var in self.theta_p])  # weight decay
        loss_p = -meanq + wd_p
        # policy optimization
        optim_p = tf.train.AdamOptimizer(learning_rate=plearning_rate, epsilon=1e-4)
        grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p)
        optimize_p = optim_p.apply_gradients(grads_and_vars_p)
        with tf.control_dependencies([optimize_p]):
            train_p = tf.group(update_pt)

        # q
        q_train = nets.qfunction(obs, act_train, self.theta_q)
        # q targets
        act2 = nets.policy(obs2, theta=self.theta_pt)
        q2 = nets.qfunction(obs2, act2, theta=self.theta_qt)
        q_target = tf.stop_gradient(tf.where(term2, rew, rew + discount * q2))
        # q_target = tf.stop_gradient(rew + discount * q2)
        # q loss
        td_error = q_train - q_target
        ms_td_error = tf.reduce_mean(tf.square(td_error), 0)
        wd_q = tf.add_n([l2norm * tf.nn.l2_loss(var) for var in self.theta_q])  # weight decay
        loss_q = ms_td_error + wd_q
        # q optimization
        optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-4)
        grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)
        with tf.control_dependencies([optimize_q]):
            train_q = tf.group(update_qt)

        summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.outdir, 'board'), self.sess.graph)
        summary_list = []
        summary_list.append(tf.summary.scalar('Qvalue', tf.reduce_mean(q_train)))
        summary_list.append(tf.summary.scalar('loss', ms_td_error))
        summary_list.append(tf.summary.scalar('reward', tf.reduce_mean(rew)))

        # tf functions
        with self.sess.as_default():
            self._act_test = Fun(obs, act_test)
            self._act_expl = Fun(obs, act_expl)
            #self._reset = Fun([], self.ou_reset)
            self._train = Fun([obs, act_train, rew, obs2, term2], [train_p, train_q, loss_q], summary_list, summary_writer)

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=1)
        ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf")
        if ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.initialize_all_variables())
            #self.sess.run(self.makeCvx)

        self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)
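# For reference, a minimal sketch of the exponential_moving_averages helper used
# above to build the target networks (an assumption -- the original helper is not
# shown in this snippet):
def exponential_moving_averages(theta, tau=0.001):
    ema = tf.train.ExponentialMovingAverage(decay=1.0 - tau)
    update = ema.apply(theta)                    # op that refreshes the running averages
    averages = [ema.average(v) for v in theta]   # slowly-tracking target parameters
    return averages, update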
Example No. 4
def train(args, net, env):
    # Begin tf session
    with tf.Session() as sess:
        # Initialize variables
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

        # load from previous save
        if len(args.ckpt_name) > 0:
            saver.restore(sess, os.path.join(args.save_dir, args.ckpt_name))

        # Load data
        shift = sess.run(net.shift)
        scale = sess.run(net.scale)
        shift_u = sess.run(net.shift_u)
        scale_u = sess.run(net.scale_u)

        replay_memory = ReplayMemory(args, shift, scale, shift_u, scale_u, env,
                                     net, sess)

        # Store normalization parameters
        sess.run(tf.assign(net.shift, replay_memory.shift_x))
        sess.run(tf.assign(net.scale, replay_memory.scale_x))
        sess.run(tf.assign(net.shift_u, replay_memory.shift_u))
        sess.run(tf.assign(net.scale_u, replay_memory.scale_u))

        #Function to evaluate loss on validation set
        def val_loss(kl_weight):
            replay_memory.reset_batchptr_val()
            loss = 0.0
            for b in range(replay_memory.n_batches_val):
                # Get inputs
                batch_dict = replay_memory.next_batch_val()
                x = batch_dict["states"]
                u = batch_dict['inputs']

                # Construct inputs for network
                feed_in = {}
                feed_in[net.x] = np.reshape(
                    x, (2 * args.batch_size * args.seq_length, args.state_dim))
                feed_in[net.u] = u
                if args.kl_weight > 0.0:
                    feed_in[net.kl_weight] = kl_weight
                else:
                    feed_in[net.kl_weight] = 1.0

                # Find loss
                feed_out = net.cost
                cost = sess.run(feed_out, feed_in)
                loss += cost

            return loss / replay_memory.n_batches_val

        # Initialize variable to track validation score over time
        old_score = 1e9
        count_decay = 0
        decay_epochs = []

        # Define temperature for annealing kl_weight
        T = args.anneal_time * replay_memory.n_batches_train
        count = 0

        # Loop over epochs
        for e in range(args.num_epochs):
            visualize_predictions(args, sess, net, replay_memory, env, e)

            # Initialize loss
            loss = 0.0
            rec_loss = 0.0
            kl_loss = 0.0
            loss_count = 0
            replay_memory.reset_batchptr_train()

            # Loop over batches
            for b in range(replay_memory.n_batches_train):
                start = time.time()
                count += 1

                # Update kl_weight
                if e < args.start_kl:
                    kl_weight = 1e-3
                else:
                    count += 1
                    kl_weight = min(args.kl_weight,
                                    1e-3 + args.kl_weight * count / float(T))

                # Get inputs
                batch_dict = replay_memory.next_batch_train()
                x = batch_dict["states"]
                u = batch_dict['inputs']

                # Construct inputs for network
                feed_in = {}
                feed_in[net.x] = np.reshape(
                    x, (2 * args.batch_size * args.seq_length, args.state_dim))
                feed_in[net.u] = u
                feed_in[net.kl_weight] = kl_weight

                # Find loss and perform training operation
                feed_out = [
                    net.cost, net.loss_reconstruction, net.kl_loss, net.train
                ]
                out = sess.run(feed_out, feed_in)

                # Update and display cumulative losses
                loss += out[0]
                rec_loss += out[1]
                kl_loss += out[2]
                loss_count += 1

                end = time.time()

                # Print loss
                if (e * replay_memory.n_batches_train +
                        b) % 100 == 0 and b > 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                      .format(e * replay_memory.n_batches_train + b, args.num_epochs * replay_memory.n_batches_train,
                              e, loss/loss_count, end - start))
                    print("{}/{} (epoch {}), rec_loss = {:.3f}, time/batch = {:.3f}" \
                      .format(e * replay_memory.n_batches_train + b, args.num_epochs * replay_memory.n_batches_train,
                              e, rec_loss/loss_count, end - start))
                    print("{}/{} (epoch {}), kl_loss = {:.3f}, time/batch = {:.3f}" \
                      .format(e * replay_memory.n_batches_train + b, args.num_epochs * replay_memory.n_batches_train,
                              e, kl_loss/loss_count, end - start))

                    print('')
                    loss = 0.0
                    rec_loss = 0.0
                    kl_loss = 0.0
                    loss_count = 0

            # Evaluate loss on validation set
            score = val_loss(args.kl_weight * (e >= args.start_kl))
            print('Validation Loss: {0:f}'.format(score))

            # Set learning rate
            if (old_score - score) < 0.01 and e != args.start_kl:
                count_decay += 1
                decay_epochs.append(e)
                if len(decay_epochs) >= 3 and np.sum(
                        np.diff(decay_epochs)[-2:]) == 2:
                    break
                print('setting learning rate to ',
                      args.learning_rate * (args.decay_rate**count_decay))
                sess.run(
                    tf.assign(
                        net.learning_rate,
                        args.learning_rate * (args.decay_rate**count_decay)))
                if args.learning_rate * (args.decay_rate**count_decay) < 1e-5:
                    break
            print('learning rate is set to ',
                  args.learning_rate * (args.decay_rate**count_decay))
            old_score = score

            # Save model every epoch
            checkpoint_path = os.path.join(args.save_dir,
                                           args.save_name + '.ckpt')
            saver.save(sess, checkpoint_path, global_step=e)
            print("model saved to {}".format(checkpoint_path))
Example No. 5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num-envs', type=int, default=32)
    parser.add_argument('--t-max', type=int, default=1)
    parser.add_argument('--learning-rate', type=float, default=0.0002)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--steps-per-epoch', type=int, default=100000)
    parser.add_argument('--testing', type=int, default=0)
    parser.add_argument('--continue-training', type=int, default=4)
    parser.add_argument('--epoch-num', type=int, default=20)
    parser.add_argument('--start-epoch', type=int, default=20)
    parser.add_argument('--testing-epoch', type=int, default=0)
    parser.add_argument('--save-log', type=str, default='log')
    parser.add_argument('--signal-num', type=int, default=4)
    parser.add_argument('--toxin', type=int, default=0)
    parser.add_argument('--a1-AC-folder', type=str, default='basic/a1_Qnet')
    parser.add_argument('--a2-AC-folder', type=str, default='basic/a2_Qnet')
    parser.add_argument('--eps-start', type=float, default=1.0)
    parser.add_argument('--replay-start-size', type=int, default=50000)
    parser.add_argument('--decay-rate', type=int, default=50000)
    parser.add_argument('--replay-memory-size', type=int, default=1000000)
    parser.add_argument('--eps-min', type=float, default=0.1)

    args = parser.parse_args()
    config = Config(args)
    t_max = args.t_max
    q_ctx = config.ctx
    steps_per_epoch = args.steps_per_epoch
    np.random.seed(args.seed)
    start_epoch = args.start_epoch
    testing_epoch = args.testing_epoch
    save_log = args.save_log
    epoch_num = args.epoch_num
    epoch_range = range(epoch_num)
    signal_num = args.signal_num
    toxin = args.toxin
    a1_Qnet_folder = args.a1_AC_folder
    a2_Qnet_folder = args.a2_AC_folder

    freeze_interval = 10000
    update_interval = 5
    replay_memory_size = args.replay_memory_size
    discount = 0.99
    replay_start_size = args.replay_start_size
    history_length = 1
    eps_start = args.eps_start
    eps_min = args.eps_min
    eps_decay = (eps_start - eps_min) / args.decay_rate
    eps_curr = eps_start
    freeze_interval /= update_interval
    minibatch_size = 32

    testing = args.testing
    testing = True if testing == 1 else False
    continue_training = args.continue_training
    continue_training = True if continue_training == 1 else False

    rewards = {
        "positive": 1.0,
        "negative": -1.0,
        "tick": -0.002,
        "loss": -2.0,
        "win": 2.0
    }

    game = HunterWorld(width=256,
                       height=256,
                       num_preys=10,
                       draw=False,
                       num_hunters=2,
                       num_toxins=toxin)
    env = PLE(game,
              fps=30,
              force_fps=True,
              display_screen=False,
              reward_values=rewards,
              resized_rows=80,
              resized_cols=80,
              num_steps=3)

    action_set = env.get_action_set()
    action_map1 = []
    for action in action_set[0].values():
        action_map1.append(action)

    action_map2 = []
    for action in action_set[1].values():
        action_map2.append(action)
    action_num = len(action_map1)

    replay_memory1 = ReplayMemory(state_dim=(74, ),
                                  history_length=history_length,
                                  memory_size=replay_memory_size,
                                  replay_start_size=replay_start_size,
                                  state_dtype='float32')

    a1_target1 = Qnetwork(actions_num=action_num,
                          q_ctx=q_ctx,
                          isTrain=False,
                          batch_size=1,
                          dir=dir,
                          folder=a1_Qnet_folder)
    a1_target32 = Qnetwork(actions_num=action_num,
                           q_ctx=q_ctx,
                           isTrain=False,
                           batch_size=32,
                           dir=dir,
                           folder=a1_Qnet_folder)
    Qnet1 = Qnetwork(actions_num=action_num,
                     q_ctx=q_ctx,
                     isTrain=True,
                     batch_size=32,
                     dir=dir,
                     folder=a1_Qnet_folder)

    a2_target1 = Qnetwork(actions_num=action_num,
                          q_ctx=q_ctx,
                          isTrain=False,
                          batch_size=1,
                          dir=dir,
                          folder=a2_Qnet_folder)
    a2_target32 = Qnetwork(actions_num=action_num,
                           q_ctx=q_ctx,
                           isTrain=False,
                           batch_size=32,
                           dir=dir,
                           folder=a2_Qnet_folder)
    Qnet2 = Qnetwork(actions_num=action_num,
                     q_ctx=q_ctx,
                     isTrain=True,
                     batch_size=32,
                     dir=dir,
                     folder=a2_Qnet_folder)

    training_steps = 0
    total_steps = 0
    if testing:
        env.force_fps = False
        env.game.draw = True
        env.display_screen = True
        Qnet1.load_params(testing_epoch)
        Qnet2.load_params(testing_epoch)
    elif continue_training:
        epoch_range = range(start_epoch, epoch_num + start_epoch)
        Qnet1.load_params(start_epoch - 1)
        Qnet2.load_params(start_epoch - 1)
        logging_config(logging, dir, save_log, file_name)
    else:
        logging_config(logging, dir, save_log, file_name)

    copyTargetQNetwork(Qnet1.model, a1_target1.model)
    copyTargetQNetwork(Qnet1.model, a1_target32.model)
    copyTargetQNetwork(Qnet2.model, a2_target1.model)
    copyTargetQNetwork(Qnet2.model, a2_target32.model)

    logging.info('args=%s' % args)
    logging.info('config=%s' % config.__dict__)
    print_params(logging, Qnet1.model)
    print_params(logging, Qnet2.model)

    for epoch in epoch_range:
        steps_left = steps_per_epoch
        episode = 0
        epoch_reward = 0
        start = time.time()
        env.reset_game()
        while steps_left > 0:
            episode += 1
            episode_loss = 0.0
            episode_q_value = 0.0
            episode_update_step = 0
            episode_action_step = 0
            episode_reward = 0
            episode_step = 0
            collisions = 0.0
            time_episode_start = time.time()
            env.reset_game()
            next_ob = env.get_states()
            while not env.game_over():
                if replay_memory1.size >= history_length and replay_memory1.size > replay_start_size:
                    do_exploration = (np.random.rand() < eps_curr)
                    eps_curr = max(eps_curr - eps_decay, eps_min)
                    if do_exploration:
                        action1 = np.random.randint(action_num)
                        action2 = np.random.randint(action_num)
                    else:
                        current_state1 = next_ob[0].reshape(1, 74)
                        current_state2 = next_ob[1].reshape(1, 74)
                        state1 = nd.array(
                            current_state1.reshape((1, ) +
                                                   current_state1.shape),
                            ctx=q_ctx)
                        state2 = nd.array(
                            current_state2.reshape((1, ) +
                                                   current_state2.shape),
                            ctx=q_ctx)
                        a1_target1.model.forward(mx.io.DataBatch([state1], []))
                        a2_target1.model.forward(mx.io.DataBatch([state2], []))
                        q_value1 = a1_target1.model.get_outputs()[0].asnumpy(
                        )[0]
                        q_value2 = a2_target1.model.get_outputs()[0].asnumpy(
                        )[0]
                        action1 = numpy.argmax(q_value1)
                        action2 = numpy.argmax(q_value2)
                        episode_q_value += q_value1[action1]
                        episode_q_value += q_value2[action2]
                        episode_action_step += 1
                else:
                    action1 = np.random.randint(action_num)
                    action2 = np.random.randint(action_num)

                next_ob, reward, terminal_flag = env.act(
                    [action_map1[action1], action_map2[action2]])
                replay_memory1.append(next_ob[0], action1, reward[0],
                                      terminal_flag)

                total_steps += 1
                sum_reward = sum(reward)
                episode_reward += sum_reward
                if sum_reward < 0:
                    collisions += 1
                episode_step += 1

                if total_steps % update_interval == 0 and replay_memory1.size > replay_start_size:
                    training_steps += 1

                    state_batch1, actions1, rewards1, nextstate_batch1, terminate_flags1 = replay_memory1.sample(
                        batch_size=minibatch_size)
                    state_batch2, actions2, rewards2, nextstate_batch2, terminate_flags2 = replay_memory1.sample(
                        batch_size=minibatch_size)

                    state_batch1 = nd.array(state_batch1, ctx=q_ctx)
                    actions_batch1 = nd.array(actions1, ctx=q_ctx)
                    reward_batch1 = nd.array(rewards1, ctx=q_ctx)
                    terminate_flags1 = nd.array(terminate_flags1, ctx=q_ctx)

                    state_batch2 = nd.array(state_batch2, ctx=q_ctx)
                    actions_batch2 = nd.array(actions2, ctx=q_ctx)
                    reward_batch2 = nd.array(rewards2, ctx=q_ctx)
                    terminate_flags2 = nd.array(terminate_flags2, ctx=q_ctx)

                    a1_target32.model.forward(
                        mx.io.DataBatch(
                            [nd.array(nextstate_batch1, ctx=q_ctx)], []))
                    Qvalue1 = a1_target32.model.get_outputs()[0]

                    y_batch1 = reward_batch1 + nd.choose_element_0index(
                        Qvalue1, nd.argmax_channel(Qvalue1)) * (
                            1.0 - terminate_flags1) * discount

                    Qnet1.model.forward(mx.io.DataBatch(
                        [state_batch1, actions_batch1, y_batch1], []),
                                        is_train=True)
                    Qnet1.model.backward()
                    Qnet1.model.update()

                    a2_target32.model.forward(
                        mx.io.DataBatch(
                            [nd.array(nextstate_batch2, ctx=q_ctx)], []))
                    Qvalue2 = a2_target32.model.get_outputs()[0]

                    y_batch2 = reward_batch2 + nd.choose_element_0index(
                        Qvalue2, nd.argmax_channel(Qvalue2)) * (
                            1.0 - terminate_flags2) * discount

                    Qnet2.model.forward(mx.io.DataBatch(
                        [state_batch2, actions_batch2, y_batch2], []),
                                        is_train=True)
                    Qnet2.model.backward()
                    Qnet2.model.update()

                    if training_steps % 10 == 0:
                        loss1 = 0.5 * nd.square(
                            nd.choose_element_0index(
                                Qnet1.model.get_outputs()[0], actions_batch1) -
                            y_batch1)
                        loss2 = 0.5 * nd.square(
                            nd.choose_element_0index(
                                Qnet2.model.get_outputs()[0], actions_batch2) -
                            y_batch2)
                        episode_loss += nd.sum(loss1).asnumpy()
                        episode_loss += nd.sum(loss2).asnumpy()
                        episode_update_step += 1

                    if training_steps % freeze_interval == 0:
                        copyTargetQNetwork(Qnet1.model, a1_target1.model)
                        copyTargetQNetwork(Qnet1.model, a1_target32.model)
                        copyTargetQNetwork(Qnet2.model, a2_target1.model)
                        copyTargetQNetwork(Qnet2.model, a2_target32.model)

            steps_left -= episode_step
            time_episode_end = time.time()
            epoch_reward += episode_reward
            info_str = "Epoch:%d, Episode:%d, Steps Left:%d/%d/%d, Reward:%f, fps:%f, Exploration:%f" \
                       % (epoch, episode, steps_left, episode_step, steps_per_epoch, episode_reward,
                          episode_step / (time_episode_end - time_episode_start), eps_curr)

            info_str += ", Collision:%f/%d " % (collisions / episode_step,
                                                collisions)

            if episode_update_step > 0:
                info_str += ", Avg Loss:%f/%d" % (episode_loss /
                                                  episode_update_step,
                                                  episode_update_step * 10)
            if episode_action_step > 0:
                info_str += ", Avg Q Value:%f/%d " % (
                    episode_q_value / episode_action_step, episode_action_step)

            if episode % 1 == 0:
                logging.info(info_str)
                print(info_str)

        end = time.time()
        fps = steps_per_epoch / (end - start)
        Qnet1.save_params(epoch)
        Qnet2.save_params(epoch)
        logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d" %
                     (epoch, fps, epoch_reward / float(episode), episode))
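# The y_batch construction above is the standard DQN backup
# y = r + gamma * max_a Q_target(s', a) * (1 - terminal); a NumPy illustration
# with made-up numbers (not from the original source):
import numpy as np
gamma = 0.99
reward = np.array([1.0, 0.0])
q_target_next = np.array([[0.2, 0.7], [0.4, 0.1]])   # Q_target(s', .) for a batch of 2
terminal = np.array([0.0, 1.0])
y = reward + gamma * q_target_next.max(axis=1) * (1.0 - terminal)
print(y)   # [1.693 0.   ]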
Example No. 6
if args.random_seed:
    random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
    env = ALEEnvironment(args.game, args)
    logger.info("Using ALE Environment")
elif args.environment == 'gym':
    logger.handlers.pop()
    env = GymEnvironment(args.game, args)
    logger.info("Using Gym Environment")
else:
    assert False, "Unknown environment " + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DeepQNetwork(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
    logger.info("Loading weights from %s" % args.load_weights)
    net.load_weights(args.load_weights)

if args.play_games:
    logger.info("Playing for %d game(s)" % args.play_games)
    # Set env mode test so that loss of life is not considered as terminal
    env.setMode('test')
    stats.reset()
    agent.play(args.play_games)
    stats.write(0, "play")
Example No. 7
    parser.add_argument('--train_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=10000)
    parser.add_argument('--log_freq', type=int, default=1000)
    args = parser.parse_args()

    # create environment and add standard wrappers
    env = gym.make(args.env_name)

    # create main model and target model.
    model = create_model(env.observation_space, env.action_space, args)
    target_model = create_model(env.observation_space, env.action_space, args)
    # copy main model weights to target
    update_target(model, target_model)

    # create replay memory
    replay_memory = ReplayMemory(args.replay_size, env.observation_space.shape)

    # statistics
    loss = 0
    qmean = 0
    rewards = []
    lengths = []
    episode_num = 0
    episode_reward = 0
    episode_length = 0
    num_iterations = 0

    # reset the environment
    obs = env.reset()
    # loop for args.num_timesteps steps
    for t in range(args.num_timesteps):
Example No. 8
from agent import Agent
from model_utils import saveModel, loadModel

# Environment Settings
env = gym.make('LunarLander-v2')
env.seed(0)

state_space = env.observation_space.shape[0]
action_space = env.action_space.n
print('State shape: ', state_space)
print('Number of actions: ', action_space)

# ReplayMemory Settings
capacity = 5000
batch_size = 64
replayMemory = ReplayMemory(capacity, batch_size)

# Strategy Settings
strategy = EpsilonGreedyStrategy(1, 0, 1E-3)

# Deep Learning Model
model = Model(state_space, action_space)
path = './checkpoint.pth.tar'  # To save/load the model

pathExist = os.path.isfile(path)
pathExist and input(
    'The file {} will be replaced, do you wish to continue? (if not, press ctrl+c)'
    .format(path))
# if you want to load a model, uncomment the lines below
not pathExist and print('{} not found'.format(path))
model, _, _, _ = loadModel(
Example No. 9
def policy_var(policy, mcts, n=200):
    states, actions, qvalues = mcts.memory.sample(n)
    states = torch.FloatTensor(states).to(device)
    actions, _, mean = policy.sample(states)
    return (actions - mean).norm(2) / n


policy = GaussianPolicy(state_dim, action_dim).to(device=device)
if args.double_Q:
    critic = DoubleQNetwork(state_dim, action_dim,
                            args.hidden_size).to(device=device)
else:
    critic = QNetwork(state_dim, action_dim,
                      args.hidden_size).to(device=device)

memory = ReplayMemory(20000, args.seed, state_dim, action_dim)
traj_memory = TrajReplayMemory(200000, args.seed, state_dim, action_dim)
alg = ActorCriticMCTS(policy, critic, env, memory, traj_memory, args)
__ = 0
while len(memory) < 300:
    state = env.reset()
    ps, pa, pr, _ = alg.Interaction(state,
                                    done=False,
                                    steps=0,
                                    max_steps=args.max_interact_steps,
                                    rand_act=True)
    parse_path(ps, pa, pr, memory)
    parse_path_to_traj(ps,
                       pa,
                       pr,
                       traj_memory,
Example No. 10
        total_samples = 0
        # Put the network in evaluation mode (disables training-only behaviour such as dropout and batch-norm updates)
        self.eval()


poi_vals = env.set_poi_values()
if not os.path.exists(args.save_foldername): os.makedirs(args.save_foldername)

if args.algo == "NAF":
    agent = NAF(args.gamma, args.tau, args.num_hnodes,
                args.autoencoder_output_length, env.action_space, args)
else:
    agent = DDPG(args.gamma, args.tau, args.num_hnodes,
                 args.autoencoder_output_length, env.action_space, args)

memory = ReplayMemory(args.buffer_size)
ounoise = OUNoise(env.action_space.shape[0])

model1 = AutoEncoder(40)
model1.load_state_dict(torch.load('DDDPG_4_10_20_LS_20.pth'))
model1 = model1.to(device)
model1.eval()

episode_rewards_list = []
rover_path_list = []
poi_pos_list = []
poi_status_list = []

for i_episode in range(args.num_episodes):
    joint_state = utils.to_tensor(np.array(
        env.reset()))  # reset the environment
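# OUNoise above is an Ornstein-Uhlenbeck process; a minimal sketch of the usual
# update x += theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1) (an assumption --
# the original OUNoise class is not shown in this snippet):
import numpy as np

class SimpleOUNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.x = np.ones(size) * mu

    def sample(self):
        dx = (self.theta * (self.mu - self.x) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.x.shape))
        self.x = self.x + dx
        return self.x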
Example No. 11
policy_net = DQN(H, W, n_actions).to(device)
target_net = DQN(H, W, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

writer = SummaryWriter()

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

optimizer = optim.RMSprop(policy_net.parameters(), lr=LEARNING_RATE)
# optimizer = optim.Adam(policy.parameters(), lr=LEARNING_RATE)

MEMORY_CAPACITY = 100000
memory = ReplayMemory(MEMORY_CAPACITY)


def get_screen():
    screen = env.render(render_scale=1, complete=False, store=False)
    H, W = screen.shape
    # screen = np.reshape(screen,(1, H, W))
    # print(screen.shape)
    screen = np.ascontiguousarray(screen, dtype=np.float32)
    screen = torch.from_numpy(screen)
    screen = transform(screen.unsqueeze(0))
    # pdb.set_trace()
    return screen.to(device)


def preprocess_frame(frame):
Example No. 12
    def setUp(self):
        self.heap = BinaryHeap()
        self.replayMemory = ReplayMemory(10, 32, 4, 84, 84)
Example No. 13
    def __init__(self, env):

        self.env = env
        tf.reset_default_graph()
        self.sess = tf.Session()

        # A few starter hyperparameters
        self.gamma = 0.99
        self.h1 = 64
        self.h2 = 64
        self.h3 = 64
        self.l2_reg = 1e-6
        self.max_episode_step = 1000
        self.update_slow_target_every = 100
        self.batch_size = 1024
        self.eps_start = 1.0
        self.epsilon_end = 0.05
        self.epsilon_decay_length = 1e5
        self.epsilon_decay_exp = 0.97
        self.num_episodes = 0
        self.num_steps = 0
        self.epsilon_linear_step = (
            self.eps_start - self.epsilon_end) / self.epsilon_decay_length
        # memory
        self.replay_memory = ReplayMemory(1e6)
        # Perhaps you want to have some samples in the memory before starting to train?
        self.min_replay_size = 2000

        # define your training operations here...
        self.observation_input = tf.placeholder(
            tf.float32, shape=[None] + list(self.env.observation_space.shape))
        self.target_input = tf.placeholder(
            dtype=tf.float32,
            shape=[None] + list(self.env.observation_space.shape)
        )  # input to slow target network

        with tf.variable_scope('q_network') as scope:
            self.q_values = self.build_model(self.observation_input)

        with tf.variable_scope('target_network') as scope:
            self.target_q_values = self.build_model(self.observation_input,
                                                    False)

        self.q_network_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_network')
        self.q_target_network_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_network')

        # update values for slowly-changing target network to match current critic network
        update_slow_target_ops = []
        for i, slow_target_var in enumerate(self.q_target_network_vars):
            update_slow_target_op = slow_target_var.assign(
                self.q_network_vars[i])
            update_slow_target_ops.append(update_slow_target_op)

        self.update_slow_target_op = tf.group(*update_slow_target_ops,
                                              name='update_slow_target')

        # define your update operations here...
        self.saver = tf.train.Saver(tf.trainable_variables())
        self.target = tf.placeholder(tf.float32, shape=[None])

        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        #The per-action Q-value selection below is taken from https://github.com/dennybritz/reinforcement-learning/tree/master/DQN
        gather_indices = tf.range(self.batch_size) * tf.shape(
            self.q_values)[1] + self.actions
        self.action_predictions = tf.gather(tf.reshape(self.q_values, [-1]),
                                            gather_indices)
        self.loss = tf.losses.huber_loss(
            self.target, self.action_predictions
        )  #tf.squared_difference(self.target, self.action_predictions)

        #Adding a regularization term for the weights
        for var in self.q_network_vars:
            if not 'bias' in var.name:
                self.loss += self.l2_reg * 0.5 * tf.nn.l2_loss(var)
        #self.loss = (self.target-self.action_predictions)**2
        #self.losses = tf.reduce_mean(self.loss)
        self.minimizer = tf.train.AdamOptimizer(learning_rate=1e-6).minimize(
            self.loss
        )  #tf.train.GradientDescentOptimizer(1e-5).minimize(self.losses)
        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter(LOGDIR)
        self.writer.add_graph(self.sess.graph)
        self.count = 0

        # Summaries for Tensorboard
        tf.summary.scalar("loss", self.loss)
        #tf.summary.scalar("loss_hist", self.losses),
        tf.summary.histogram("q_values_hist", self.q_values),
        tf.summary.scalar("max_q_value", tf.reduce_max(self.q_values))
        self.summ = tf.summary.merge_all()
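# The gather_indices trick above flattens the [batch, num_actions] Q table and
# picks out the Q-value of each taken action; a NumPy illustration with toy values:
import numpy as np
q_values = np.array([[1.0, 2.0, 3.0],
                     [4.0, 5.0, 6.0]])   # batch of 2 states, 3 actions
actions = np.array([2, 0])
flat_idx = np.arange(q_values.shape[0]) * q_values.shape[1] + actions
print(q_values.reshape(-1)[flat_idx])    # [3. 4.]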
Example No. 14
    def __init__(self,
                 simulator,
                 gamma=0.99,
                 mem_size=int(1e5),
                 lr=9e-4,
                 batch_size=32,
                 ode_tol=1e-3,
                 ode_dim=20,
                 enc_hidden_to_latent_dim=20,
                 latent_dim=10,
                 eps_decay=1e-4,
                 weight_decay=1e-3,
                 model=None,
                 timer_type='',
                 latent_policy=False,
                 obs_normal=False,
                 exp_id=0,
                 trained_model_path='',
                 ckpt_path='',
                 traj_data_path='',
                 logger=None):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.exp_id = exp_id
        self.simulator = simulator
        self.batch_size = batch_size
        self.memory_traj_train = ReplayMemory(mem_size, Trajectory)
        self.memory_traj_test = ReplayMemory(mem_size // 10, Trajectory)
        self.input_dim = self.simulator.num_states + self.simulator.num_actions
        self.output_dim = self.simulator.num_states
        self.latent_dim = latent_dim
        self.ckpt_path = ckpt_path
        self.logger = logger
        self.rms = RunningStats(dim=self.simulator.num_states,
                                device=self.device) if obs_normal else None

        # policy and replay buffer
        assert not (model == 'free' and latent_policy)
        if 'HalfCheetah' in repr(simulator) or 'Swimmer' in repr(
                simulator) or 'Hopper' in repr(simulator):
            self.policy = PolicyDDPG(state_dim=self.simulator.num_states,
                                     action_dim=self.simulator.num_actions,
                                     device=self.device,
                                     gamma=gamma,
                                     latent=latent_policy)
            self.memory_trans = ReplayMemory(mem_size, Transition)
        else:
            state_dim = self.simulator.num_states + latent_dim if latent_policy else self.simulator.num_states
            self.policy = PolicyDQN(state_dim=state_dim,
                                    action_dim=self.simulator.num_actions,
                                    device=self.device,
                                    gamma=gamma,
                                    latent=latent_policy)
            self.memory_trans = PrioritizedReplayMemory(mem_size, Transition)

        # model
        min_t, max_t, max_time_length, is_cont = simulator.get_time_info()
        timer_choice = Timer if timer_type == 'fool' else MLPTimer
        timer = timer_choice(input_dim=self.input_dim + self.latent_dim,
                             output_dim=1 if is_cont else max_t - min_t + 1,
                             min_t=min_t,
                             max_t=max_t,
                             max_time_length=max_time_length,
                             device=self.device).to(self.device)

        # ode network
        if 'ode' in model:
            gen_ode_func = ODEFunc(
                ode_func_net=utils.create_net(latent_dim,
                                              latent_dim,
                                              n_layers=2,
                                              n_units=ode_dim,
                                              nonlinear=nn.Tanh)).to(
                                                  self.device)
            diffq_solver = DiffeqSolver(gen_ode_func,
                                        'dopri5',
                                        odeint_rtol=ode_tol,
                                        odeint_atol=ode_tol / 10)

        # encoder
        if model == 'vae-rnn' or model == 'latent-ode':
            encoder = Encoder_z0_RNN(
                latent_dim,
                self.input_dim,
                hidden_to_z0_units=enc_hidden_to_latent_dim,
                device=self.device).to(self.device)
            z0_prior = Normal(
                torch.tensor([0.]).to(self.device),
                torch.tensor([1.]).to(self.device))

        # decoder
        decoder = Decoder(latent_dim, self.output_dim,
                          n_layers=0).to(self.device)

        if model == 'free' or model == 'rnn':
            self.model = VanillaGRU(input_dim=self.input_dim,
                                    latent_dim=latent_dim,
                                    eps_decay=eps_decay,
                                    decoder=decoder,
                                    timer=timer,
                                    device=self.device).to(self.device)
        elif model == 'deltaT-rnn':
            self.model = DeltaTGRU(input_dim=self.input_dim,
                                   latent_dim=latent_dim,
                                   eps_decay=eps_decay,
                                   decoder=decoder,
                                   timer=timer,
                                   device=self.device).to(self.device)
        elif model == 'decay-rnn':
            self.model = ExpDecayGRU(input_dim=self.input_dim,
                                     latent_dim=latent_dim,
                                     eps_decay=eps_decay,
                                     decoder=decoder,
                                     timer=timer,
                                     device=self.device).to(self.device)
        elif model == 'ode-rnn':
            self.model = ODEGRU(input_dim=self.input_dim,
                                latent_dim=latent_dim,
                                eps_decay=eps_decay,
                                decoder=decoder,
                                diffeq_solver=diffq_solver,
                                timer=timer,
                                device=self.device).to(self.device)
        elif model == 'vae-rnn':
            self.model = VAEGRU(input_dim=self.input_dim,
                                latent_dim=latent_dim,
                                eps_decay=eps_decay,
                                encoder_z0=encoder,
                                decoder=decoder,
                                z0_prior=z0_prior,
                                timer=timer,
                                device=self.device).to(self.device)
        elif model == 'latent-ode':
            self.model = LatentODE(input_dim=self.input_dim,
                                   latent_dim=latent_dim,
                                   eps_decay=eps_decay,
                                   encoder_z0=encoder,
                                   decoder=decoder,
                                   diffeq_solver=diffq_solver,
                                   z0_prior=z0_prior,
                                   timer=timer,
                                   device=self.device).to(self.device)
        else:
            raise NotImplementedError

        if trained_model_path:
            self.model.load_state_dict(
                torch.load(trained_model_path,
                           map_location=self.device)['model_state_dict'])

        if traj_data_path:
            self.load_traj_buffer(traj_data_path)

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=lr,
                                    weight_decay=weight_decay)
Example No. 15
def main():
    sess = tf.Session()
    K.set_session(sess)
    env = gym.make("MountainCarContinuous-v0")

    #Parameters
    memory_size = 100000
    batch_size = 32
    tau = 0.001
    lr_actor = 0.0001
    lr_critic = 0.001
    discount_factor = 0.99
    episodes = 1001
    time_steps = 501
    collect_experience = 50000
    save_frequency = 250
    ep_reward = []
    training = False

    #Noise object
    noise = OUNoise(env.action_space)
    #Initialize actor and critic objects
    actor = Actor(env, sess, lr_actor, tau)

    #Uncomment the following line to save the actor model architecture as a json file.
    #It only needs to be saved once.

    # actor.save_model_architecture("Actor_model_architecture.json")
    critic = Critic(env, sess, lr_critic, tau, discount_factor)

    #Initialize replay memory of size defined by memory_size
    replay_memory = ReplayMemory(memory_size)

    #Toggle between true and false for debugging purposes. For training it is always true
    run = True
    if run:
        #Loop over the number of episodes. At each new episode reset the environment, reset the noise
        #state and set the total episode reward to 0
        for episode in range(episodes):
            state = env.reset()
            noise.reset()
            episode_reward = 0

            #Loop over the number of steps in an episode
            for time in range(time_steps):
                #Uncomment the following line if you want to visualize the mountain car during training.
                #Can also be trained without visualization for the case where we are using
                #position and velocities as state variables.

                # env.render()

                #Predict an action from the actor model using the current state
                action = actor.predict_action(state.reshape((1, 2)))[0]

                #Add Ornstein-Uhlenbeck noise to the predicted action to encourage exploration of the environment
                exploratory_action = noise.get_action(action, time)

                #Take the noisy action to enter the next state
                next_state, reward, done, _ = env.step(exploratory_action)

                #Predict the action to be taken given the next_state. This next state action is predicted
                #using the actor's target model
                next_action = actor.predict_next_action(
                    next_state.reshape((1, 2)))[0]

                #Append this experience sample to the replay memory
                replay_memory.append(state, exploratory_action, reward,
                                     next_state, next_action, done)

                #Only start training when there are a minimum number of experience samples available in
                #memory
                if replay_memory.count() == collect_experience:
                    training = True
                    print('Start training')

                #When training:
                if training:
                    # 1)first draw a random batch of samples from the replay memory
                    batch = replay_memory.sample(batch_size)
                    # 2) using this sample calculate dQ/dA from the critic model
                    grads = critic.calc_grads(batch)
                    # 3) calculate dA/dTheta from the actor using the same batch
                    # 4) multiply dA/dTheta by negative dQ/dA to get dJ/dTheta
                    # 5) Update actor weights such that dJ/dTheta is maximized
                    # 6) The above operation is easily performed by minimizing the value obtained in (4)
                    t_grads = actor.train(batch, grads)

                    # update critic weights by minimizing the bellman loss. Use actor target to compute
                    # next action in the next state (already computed and stored in replay memory)
                    # in order to compute TD target
                    critic.train(batch)

                    #After each weight update of the actor and critic online model perform soft updates
                    # of their targets so that they can smoothly and slowly track the online model's
                    #weights
                    actor.update_target()
                    critic.update_target()

                #Add each step reward to the episode reward
                episode_reward += reward

                #Set current state as next state
                state = next_state

                #If target reached before the max allowed time steps, break the inner for loop
                if done:
                    break

            #Store episode reward
            ep_reward.append([episode, episode_reward])

            #Print info for each episode to track training progress
            print(
                "Completed in {} steps.... episode: {}/{}, episode reward: {} "
                .format(time, episode, episodes, episode_reward))

            #Save model's weights and episode rewards after each save_frequency episode
            if training and (episode % save_frequency) == 0:
                print('Data saved at episode:', episode)
                actor.save_weights(
                    './Model/DDPG_actor_model_{}.h5'.format(episode))
                pickle.dump(
                    ep_reward,
                    open('./Rewards/rewards_{}.dump'.format(episode), 'wb'))

        # Close the mountain car environment
        env.close()
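# The soft target updates invoked above (actor.update_target / critic.update_target)
# conventionally blend online and target weights as
# theta_target <- tau * theta_online + (1 - tau) * theta_target.
# A Keras-style sketch under that assumption (not the original implementation):
def soft_update(online_model, target_model, tau):
    online_weights = online_model.get_weights()
    target_weights = target_model.get_weights()
    target_model.set_weights([tau * w + (1.0 - tau) * tw
                              for w, tw in zip(online_weights, target_weights)])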
Example No. 16
        Q.load_state_dict(weights['Q'])
        Q_targ.load_state_dict(weights['Q_targ'])

    #Learn params
    gamma = 0.99

    #Hyperparams
    frame_count = 40000
    eps_decay_time = 0.5

    eps_start = 1
    eps_end = 0.05
    l = -math.log(eps_end) / (frame_count * eps_decay_time)

    replay_mem = ReplayMemory(
        max_size=500000, alpha=0.5,
        eps=0.0) if prev_state is None else prev_state['replay_mem']

    episode_depth = 10000

    batch_size = 32

    Q_targ_update_freq = 300

    save_freq = frame_count / 100

    #Episode loop
    curr_eps = eps_start if prev_state is None else prev_state['end_eps']

    episode_num = 0 if prev_state is None else (prev_state['end_episode'] + 1)
    curr_frame_count = 0 if prev_state is None else prev_state[
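# A quick sanity check of the exponential epsilon schedule implied by `l` above:
# with eps(t) = eps_start * exp(-l * t) and eps_start = 1, epsilon reaches eps_end
# after frame_count * eps_decay_time frames (toy check, not from the original source).
import math
frame_count, eps_decay_time, eps_end = 40000, 0.5, 0.05
l = -math.log(eps_end) / (frame_count * eps_decay_time)
print(math.exp(-l * frame_count * eps_decay_time))   # ~0.05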
Example No. 17
    def __init__(self,
                 dqn,
                 num_actions,
                 gamma=0.99,
                 learning_rate=0.00025,
                 replay_start_size=50000,
                 epsilon_start=1.0,
                 epsilon_end=0.01,
                 epsilon_steps=1000000,
                 update_freq=4,
                 target_copy_freq=30000,
                 replay_memory_size=1000000,
                 frame_history=4,
                 batch_size=32,
                 error_clip=1,
                 restore_network_file=None,
                 double=True):
        self.dqn = dqn
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
        inp_shape = [None] + list(self.dqn.get_input_shape()) + [frame_history]
        inp_dtype = self.dqn.get_input_dtype()
        assert type(inp_dtype) is str
        self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
        self.inp_terminated = tf.placeholder(tf.bool, [None])
        self.inp_reward = tf.placeholder(tf.float32, [None])
        self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
        self.gamma = gamma
        with tf.variable_scope('online'):
            mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [
                frame_history
            ]
            mask = tf.reshape(self.inp_mask, mask_shape)
            masked_input = self.inp_frames * mask
            self.q_online = self.dqn.construct_q_network(masked_input)
        with tf.variable_scope('target'):
            mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [
                frame_history
            ]
            sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
            masked_sp_input = self.inp_sp_frames * sp_mask
            self.q_target = self.dqn.construct_q_network(masked_sp_input)

        if double:
            with tf.variable_scope('online', reuse=True):
                self.q_online_prime = self.dqn.construct_q_network(
                    masked_sp_input)
            self.maxQ = tf.gather_nd(
                self.q_target,
                tf.transpose([
                    tf.range(0, 32, dtype=tf.int32),
                    tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32)
                ], [1, 0]))
        else:
            self.maxQ = tf.reduce_max(self.q_target, reduction_indices=1)

        self.r = tf.sign(self.inp_reward)
        use_backup = tf.cast(tf.logical_not(self.inp_terminated),
                             dtype=tf.float32)
        self.y = self.r + use_backup * gamma * self.maxQ
        self.delta = tf.reduce_sum(self.inp_actions * self.q_online,
                                   reduction_indices=1) - self.y
        self.error = tf.where(
            tf.abs(self.delta) < error_clip, 0.5 * tf.square(self.delta),
            error_clip * tf.abs(self.delta))
        self.loss = tf.reduce_sum(self.error)
        self.g = tf.gradients(self.loss, self.q_online)
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                              decay=0.95,
                                              centered=True,
                                              epsilon=0.01)
        self.train_op = optimizer.minimize(self.loss,
                                           var_list=th.get_vars('online'))
        self.copy_op = th.make_copy_op('online', 'target')
        self.saver = tf.train.Saver(var_list=th.get_vars('online'))

        self.replay_buffer = ReplayMemory(self.dqn.get_input_shape(),
                                          self.dqn.get_input_dtype(),
                                          replay_memory_size, frame_history)
        self.frame_history = frame_history
        self.replay_start_size = replay_start_size
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_end
        self.epsilon_steps = epsilon_steps
        self.epsilon_delta = (self.epsilon -
                              self.epsilon_min) / self.epsilon_steps
        self.update_freq = update_freq
        self.target_copy_freq = target_copy_freq
        self.action_ticker = 1

        self.num_actions = num_actions
        self.batch_size = batch_size

        self.sess.run(tf.initialize_all_variables())

        if restore_network_file is not None:
            self.saver.restore(self.sess, restore_network_file)
            print('Restored network from file')
        self.sess.run(self.copy_op)
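# The double-DQN branch above selects the greedy action with the online network
# and evaluates it with the target network:
# y = r + gamma * Q_target(s', argmax_a Q_online(s', a)) for non-terminal s'.
# A NumPy illustration with toy numbers (not from the original source):
import numpy as np
gamma = 0.99
q_online_next = np.array([[0.1, 0.9], [0.8, 0.2]])   # Q_online(s', .)
q_target_next = np.array([[0.5, 0.3], [0.6, 0.4]])   # Q_target(s', .)
greedy = q_online_next.argmax(axis=1)                # actions picked by the online net
max_q = q_target_next[np.arange(2), greedy]          # evaluated by the target net
y = np.array([1.0, 0.0]) + gamma * max_q * np.array([1.0, 0.0])   # reward + backup * (1 - terminal)
print(y)   # [1.297 0.   ]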
Example No. 18
                      args.sample_size,
                      sample_ratio=0.5,
                      is_valid=True,
                      need_feat=args.history)
train = BatchProvider(train_iter,
                      train_lst,
                      True,
                      args.sample_size,
                      sample_ratio=0.5,
                      need_feat=args.history)
N = args.num_id

cmcs, ap, cmcn, vscores, vturns = [[], [], [], []], [], [1, 5, 10, 20], [], []

iterations = args.num_examples
memory = ReplayMemory(replay_size=args.memory_size, alpha=args.pr_alpha)
epsilon = 1.0
final_epsilon = args.final_epsilon
rand_ep, fix_ep = 0, int(args.num_epoches * args.exp_ratio)
epsilon_shr = (epsilon - final_epsilon) / (fix_ep - rand_ep) / iterations

max_penalty = 1

frf = open(('figurelog/%s' % args.mode), 'w')

for e in xrange(args.num_epoches):
    if args.verbose:
        print 'Epoch', e
    for batch in xrange(iterations):
        if args.verbose:
            print 'Epoch', e, 'batch', batch
Example No. 19
def start():
	torch.cuda.empty_cache()

	rospy.init_node('deepracer_controller_mpc', anonymous=True)
	
	pose_sub2 = rospy.Subscriber("/gazebo/model_states_drop",ModelStates,get_vehicle_state)
	# x_sub1 = rospy.Subscriber("/move_base_simple/goal",PoseStamped,get_clicked_point)
	lidar_sub2 = rospy.Subscriber("/scan", LaserScan, get_lidar_data)
	pose_sub = message_filters.Subscriber("/gazebo/model_states_drop", ModelStates)
	lidar_sub = message_filters.Subscriber("/scan", LaserScan)
	ts = message_filters.ApproximateTimeSynchronizer([pose_sub,lidar_sub],10,0.1,allow_headerless=True)
	ts.registerCallback(filtered_data)
	target_point = [10, 8.5]
	env =  DeepracerGym(target_point)
	
	# while not rospy.is_shutdown():
	# 	time.sleep(1)
	# 	print('---------------------------',check_env(env))	

	# max_time_step = 3000
	# max_eposide = 1
	# e = 0
	# while not rospy.is_shutdown():
	# 	time.sleep(1) #Do not remove this 
	# 	state = env.reset()
	# 	env.stop_car()
	# 	time.sleep(1)        
	# 	while(e < max_eposide):
	# 		e += 1  
	# 		# state = env.reset()          
	# 		for _ in range(max_time_step):
	# 			action = np.array([0.1,-1])
	# 			n_state,reward,done,info = env.step(action)
	# 			# display(n_state[2])
	# 			time.sleep(0.01)
	# 			print(n_state[2],end='\r')
	# 			if done:
	# 				state = env.reset()                   
	# 				break
	# 	return True
	
	# rospy.spin()

	while not rospy.is_shutdown():
		# Training Script
		rospy.sleep(1) #Do not remove this 
		state = env.reset() #Do not remove this 
		torch.manual_seed(args.seed)
		np.random.seed(args.seed)

		agent = SAC(env.observation_space.shape[0], env.action_space, args)

		#Pretrained Agent
		# actor_path = "models/sac_actor_<DeepracerGym instance>_"
		# critic_path = "models/sac_critic_<DeepracerGym instance>_"
		# agent.load_model(actor_path, critic_path)

		# Memory
		memory = ReplayMemory(args.replay_size, args.seed)
		# Tensorboard
		writer = SummaryWriter('runs/{}_SAC_{}_{}_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), 'DeepracerGym',
															 args.policy, "autotune" if args.automatic_entropy_tuning else ""))
		total_numsteps = 0
		updates = 0
		num_goal_reached = 0

		for i_episode in itertools.count(1):
			# print("New episode")
			episode_reward = 0
			episode_steps = 0
			done = False
			state = env.reset()
			
			while not done:
				start_time = time.time()
				if args.start_steps > total_numsteps:
					action = env.action_space.sample()  # Sample random action
				else:
					action = agent.select_action(state)  # Sample action from policy
				rospy.sleep(0.02)

				next_state, reward, done, _ = env.step(action) # Step
				if (reward > 9) and (episode_steps > 1): #Count the number of times the goal is reached
					num_goal_reached += 1 

				episode_steps += 1
				total_numsteps += 1
				episode_reward += reward
				if episode_steps > args.max_episode_length:
					done = True

				# Ignore the "done" signal if it comes from hitting the time horizon.
				# (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
				mask = 1 if episode_steps == args.max_episode_length else float(not done)
				# mask = float(not done)
				memory.push(state, action, reward, next_state, mask) # Append transition to memory

				state = next_state
				print(done)

			# if i_episode % UPDATE_EVERY == 0: 
			if len(memory) > args.batch_size:
				# Number of updates per step in environment
				for i in range(args.updates_per_step*args.max_episode_length):
					# Update parameters of all the networks
					critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory, args.batch_size, updates)

					writer.add_scalar('loss/critic_1', critic_1_loss, updates)
					writer.add_scalar('loss/critic_2', critic_2_loss, updates)
					writer.add_scalar('loss/policy', policy_loss, updates)
					writer.add_scalar('loss/entropy_loss', ent_loss, updates)
					writer.add_scalar('entropy_temprature/alpha', alpha, updates)
					updates += 1

			if total_numsteps > args.num_steps:
				break

			if (episode_steps > 1):
				writer.add_scalar('reward/train', episode_reward, i_episode)
				writer.add_scalar('reward/episode_length',episode_steps, i_episode)
				writer.add_scalar('reward/num_goal_reached',num_goal_reached, i_episode)

			print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}".format(i_episode, total_numsteps, episode_steps, round(episode_reward, 2)))
			print("Number of Goals Reached: ",num_goal_reached)

		print('----------------------Training Ending----------------------')
		env.stop_car()

		agent.save_model("corridor_straight", suffix = "1")
		return True

	rospy.spin()
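
# Editor's note: the following is not part of the original example. It is a minimal
# sketch of the "mask" convention used in the training loop above: the bootstrap
# term is kept (mask = 1) when the episode ends only because it hit
# max_episode_length, and dropped (mask = 0) on a true terminal state.
def bootstrap_mask(done, episode_steps, max_episode_length):
    if episode_steps == max_episode_length:
        return 1.0           # timeout: not a real terminal, keep bootstrapping
    return float(not done)   # 0.0 on a true terminal, 1.0 otherwise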
Exemplo n.º 20
0
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Multi-agent DDPG')
    # add argument
    parser.add_argument('--grid_size', default=100, type=int, help='the size of a grid world')
    parser.add_argument('--n_actions', default=7, type=int, help='total number of actions an agent can take')
    parser.add_argument('--filename', default='../data/pr.txt', type=str, help='Pick-up probability file')
    parser.add_argument('--n_agents', default=4, type=int, help='the number of agent play in the environment')
    parser.add_argument('--runs', default=1, type=int, help='the number of times to run the game')
    parser.add_argument('--aggre', default=False, help='the number of times run the game')

    # parser args
    args = parser.parse_args()
    env = GridWorld(args=args, terminal_time=1000, reward_stay=-0.1, reward_hitwall=-1, reward_move=-0.1, reward_pick=10)

    # Create memory
    memory = ReplayMemory(buffer=50000, batchSize=500)

    # Create a network
    dqn = DQN(memory=memory)
    # Evaluating......
    print('\nCollecting experience...')
    text_file = open("../results/output_dqn.txt", "w")

    for i_episode in range(4000):
        s, done = env.reset()
        a = torch.LongTensor(args.n_agents)
        ep_r = 0
        while True:
            for i in range(args.n_agents):
                (x, y) = s[i]['loc']
                one_hot_state = torch.Tensor(100 * 100)
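
# Editor's note: the following is not part of the original example, which is cut off
# here. The loop above appears to be building a one-hot encoding of each agent's
# (x, y) cell on the 100x100 grid; a minimal PyTorch sketch under that assumption:
import torch

def one_hot_grid_state(x, y, grid_size=100):
    state = torch.zeros(grid_size * grid_size)
    state[x * grid_size + y] = 1.0
    return state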
Exemplo n.º 21
0
 def __init__(self, memory_entry_size):
     self.discount = .99
     self.double_q = True
     self.memory_entry_size = memory_entry_size
     self.memory = ReplayMemory(self.memory_entry_size)
Exemplo n.º 22
0
def train(config_filepath, save_dir, device, visualize_interval):
    conf = load_toml_config(config_filepath)
    data_dir, log_dir = create_save_dir(save_dir)
    # Save config file
    shutil.copyfile(config_filepath,
                    os.path.join(save_dir, os.path.basename(config_filepath)))
    device = torch.device(device)

    # Set up log metrics
    metrics = {
        'episode': [],
        'episodic_step': [],
        'collected_total_samples': [],
        'reward': [],
        'q_loss': [],
        'policy_loss': [],
        'alpha_loss': [],
        'alpha': [],
        'policy_switch_epoch': [],
        'policy_switch_sample': [],
        'test_episode': [],
        'test_reward': [],
    }

    policy_switch_samples = conf.policy_switch_samples if hasattr(
        conf, "policy_switch_samples") else None
    total_collected_samples = 0

    # Create environment
    env = make_env(conf.environment, render=False)

    # Instantiate modules
    # memory = ReplayBuffer(int(conf.replay_buffer_capacity), env.observation_space.shape, env.action_space.shape)
    memory = ReplayMemory(conf.replay_buffer_capacity)
    agent = getattr(agents, conf.agent_type)(env.observation_space,
                                             env.action_space,
                                             device=device,
                                             **conf.agent)

    # Load checkpoint if specified in config
    if conf.checkpoint != '':
        ckpt = torch.load(conf.checkpoint, map_location=device)
        metrics = ckpt['metrics']
        agent.load_state_dict(ckpt['agent'])
        memory.load_state_dict(ckpt['memory'])
        policy_switch_samples = ckpt['policy_switch_samples']
        total_collected_samples = ckpt['total_collected_samples']

    def save_checkpoint():
        # Save checkpoint
        ckpt = {
            'metrics': metrics,
            'agent': agent.state_dict(),
            'memory': memory.state_dict(),
            'policy_switch_samples': policy_switch_samples,
            'total_collected_samples': total_collected_samples
        }
        path = os.path.join(data_dir, 'checkpoint.pth')
        torch.save(ckpt, path)

        # Save agent model only
        model_ckpt = {'agent': agent.state_dict()}
        model_path = os.path.join(data_dir, 'model.pth')
        torch.save(model_ckpt, model_path)

        # Save metrics only
        metrics_ckpt = {'metrics': metrics}
        metrics_path = os.path.join(data_dir, 'metrics.pth')
        torch.save(metrics_ckpt, metrics_path)

    # Train agent
    init_episode = 0 if len(
        metrics['episode']) == 0 else metrics['episode'][-1] + 1
    pbar = tqdm.tqdm(range(init_episode, conf.episodes))
    reward_moving_avg = None
    agent_update_count = 0
    for episode in pbar:
        episodic_reward = 0
        o = env.reset()
        q1_loss, q2_loss, policy_loss, alpha_loss, alpha = None, None, None, None, None

        for t in range(conf.horizon):
            if total_collected_samples <= conf.random_sample_num:  # Select random actions at the beginning of training.
                h = env.action_space.sample()
            elif memory.step <= conf.random_sample_num:  # Select actions from random latent variable soon after inserting a new subpolicy.
                h = agent.select_action(o, random=True)
            else:
                h = agent.select_action(o)

            a = agent.post_process_action(
                o, h)  # Convert abstract action h to actual action a

            o_next, r, done, _ = env.step(a)
            total_collected_samples += 1
            episodic_reward += r
            memory.push(o, h, r, o_next, done)
            o = o_next

            if memory.step > conf.random_sample_num:
                # Update agent
                batch_data = memory.sample(conf.agent_update_batch_size)
                q1_loss, q2_loss, policy_loss, alpha_loss, alpha = agent.update_parameters(
                    batch_data, agent_update_count)
                agent_update_count += 1

            if done:
                break

        # Describe and save episodic metrics
        reward_moving_avg = (
            1. - MOVING_AVG_COEF
        ) * reward_moving_avg + MOVING_AVG_COEF * episodic_reward if reward_moving_avg else episodic_reward
        pbar.set_description(
            "EPISODE {} (total samples {}, subpolicy samples {}) --- Step {}, Reward {:.1f} (avg {:.1f})"
            .format(episode, total_collected_samples, memory.step, t,
                    episodic_reward, reward_moving_avg))
        metrics['episode'].append(episode)
        metrics['reward'].append(episodic_reward)
        metrics['episodic_step'].append(t)
        metrics['collected_total_samples'].append(total_collected_samples)
        if episode % visualize_interval == 0:
            # Visualize metrics
            lineplot(metrics['episode'][-len(metrics['reward']):],
                     metrics['reward'], 'REWARD', log_dir)
            reward_avg = np.array(metrics['reward']) / np.array(
                metrics['episodic_step'])
            lineplot(metrics['episode'][-len(reward_avg):], reward_avg,
                     'AVG_REWARD', log_dir)
            lineplot(
                metrics['collected_total_samples'][-len(metrics['reward']):],
                metrics['reward'],
                'SAMPLE-REWARD',
                log_dir,
                xaxis='sample')

        # Save metrics for agent update
        if q1_loss is not None:
            metrics['q_loss'].append(np.mean([q1_loss, q2_loss]))
            metrics['policy_loss'].append(policy_loss)
            metrics['alpha_loss'].append(alpha_loss)
            metrics['alpha'].append(alpha)
            if episode % visualize_interval == 0:
                lineplot(metrics['episode'][-len(metrics['q_loss']):],
                         metrics['q_loss'], 'Q_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['policy_loss']):],
                         metrics['policy_loss'], 'POLICY_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha_loss']):],
                         metrics['alpha_loss'], 'ALPHA_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha']):],
                         metrics['alpha'], 'ALPHA', log_dir)

        # Insert new subpolicy layer and reset memory if a specific amount of samples is collected
        if policy_switch_samples and len(
                policy_switch_samples
        ) > 0 and total_collected_samples >= policy_switch_samples[0]:
            print(
                "----------------------\nInser new policy\n----------------------"
            )
            agent.insert_subpolicy()
            memory.reset()
            metrics['policy_switch_epoch'].append(episode)
            metrics['policy_switch_sample'].append(total_collected_samples)
            policy_switch_samples = policy_switch_samples[1:]

        # Test a policy
        if episode % conf.test_interval == 0:
            test_rewards = []
            for _ in range(conf.test_times):
                episodic_reward = 0
                obs = env.reset()
                for t in range(conf.horizon):
                    h = agent.select_action(obs, eval=True)
                    a = agent.post_process_action(obs, h)
                    obs_next, r, done, _ = env.step(a)
                    episodic_reward += r
                    obs = obs_next

                    if done:
                        break

                test_rewards.append(episodic_reward)

            test_reward_avg, test_reward_std = np.mean(test_rewards), np.std(
                test_rewards)
            print("   TEST --- ({} episodes) Reward {:.1f} (pm {:.1f})".format(
                conf.test_times, test_reward_avg, test_reward_std))
            metrics['test_episode'].append(episode)
            metrics['test_reward'].append(test_rewards)
            lineplot(metrics['test_episode'][-len(metrics['test_reward']):],
                     metrics['test_reward'], "TEST_REWARD", log_dir)

        # Save checkpoint
        if episode % conf.checkpoint_interval == 0:
            save_checkpoint()

    # Save the final model
    torch.save({'agent': agent.state_dict()},
               os.path.join(data_dir, 'final_model.pth'))
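
# Editor's note: the following is not part of the original example. It is a minimal
# sketch of the episodic-reward moving average computed above; `coef` stands in for
# the MOVING_AVG_COEF constant defined elsewhere in the original script. As in the
# original, the average is (re)seeded with the raw reward whenever it is still falsy.
def update_reward_moving_avg(avg, episodic_reward, coef=0.05):
    return (1.0 - coef) * avg + coef * episodic_reward if avg else episodic_reward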
Exemplo n.º 23
0
def main():
    agent = SAC(state_dim, env.action_space, device, hidden_size, lr, gamma,
                tau, alpha)
    replay_buffer = ReplayMemory(args.capacity, args.seed)

    if args.train: print("Train True")
    if args.load:
        print("Load True")
        # agent.load_model(actor_path="./models_hard1/actor.pth", critic_path="./models_hard1/critic.pth")
        agent.load_model()

    updates = 0
    avg_reward = 0.
    total_steps = 0
    count_1500 = 0
    time_start = time.time()
    scores_deque = deque(maxlen=100)
    avg_scores_array = []

    for i in range(args.iteration):
        ep_r = 0
        ep_s = 0
        done = False
        state = env.reset()
        while not done:
            action = []
            if total_steps < start_steps and not args.load:
                action = env.action_space.sample()
            else:
                use_eval = False
                if args.render:
                    use_eval = True
                else:
                    if i % (test_ep * 2) >= test_ep:
                        use_eval = True
                action = agent.select_action(state, use_eval)

            next_state, reward, done, info = env.step(action)

            reward = reward * reward_scale

            ep_r += reward
            ep_s += 1
            total_steps += 1
            if args.render and i >= args.render_interval: env.render()

            mask = 1 if (ep_s == 1600) else float(not done)
            if args.train:
                replay_buffer.push(state, action, reward, next_state, mask)

            state = next_state

        if i % (test_ep * 2) >= test_ep:
            avg_reward += ep_r
            writer.add_scalar('reward/test', ep_r, i)
        if i % (test_ep * 2) == test_ep * 2 - 1:
            avg_reward /= test_ep
            writer.add_scalar('reward/test_avg', avg_reward, i / 2)
            avg_reward = 0.

        if args.train:
            for upi in range(ep_s):
                if args.load:
                    if len(replay_buffer) >= 10000:
                        agent.update_parameters(replay_buffer, batch_size,
                                                updates, writer)
                        updates += 1
                if not args.load and len(replay_buffer) >= update_start_steps:
                    agent.update_parameters(replay_buffer, batch_size, updates,
                                            writer)
                    updates += 1

        writer.add_scalar('reward/train', ep_r, i)

        s = int(time.time() - time_start)
        print("Ep.: {}, Total Steps: {}, Ep.Steps: {}, Score: {:.2f}, Time: {:02}:{:02}:{:02}".\
            format(i, total_steps, ep_s, ep_r, \
                  s//3600, s%3600//60, s%60))

        if ep_r >= 1500:
            count_1500 += 1
            if count_1500 == 200:
                agent.save_model()
                break

        if args.train:
            if ep_r > 1400:
                agent.save_model()

            if i % 20 == 0:
                agent.save_model()

    env.close()
Exemplo n.º 24
0
    def __init__(self, args):
        '''Constructor'''
        self.WARM_UP = 0
        self.QUALIFYING = 1
        self.RACE = 2
        self.UNKNOWN = 3
        self.stage = args.stage
        
        self.parser = msgParser.MsgParser()
        self.state = carState.CarState()
        self.control = carControl.CarControl()

        self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0]
        self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0]
        self.num_inputs = 19
        self.num_steers = len(self.steers)
        self.num_speeds = len(self.speeds)
        self.num_actions = self.num_steers + self.num_speeds
        
        self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args)
        self.mem = ReplayMemory(args.replay_size, self.num_inputs, args)
        self.minibatch_size = args.batch_size

        if args.load_weights:
            self.net.load_weights(args.load_weights)
        self.save_weights_prefix = args.save_weights_prefix
        self.pretrained_network = args.pretrained_network

        self.steer_lock = 0.785398
        self.max_speed = 100

        self.algorithm = args.algorithm
        self.device = args.device
        self.mode = args.mode
        self.maxwheelsteps = args.maxwheelsteps
        
        self.enable_training = args.enable_training
        self.enable_exploration = args.enable_exploration

        self.total_train_steps = 0
        self.exploration_decay_steps = args.exploration_decay_steps
        self.exploration_rate_start = args.exploration_rate_start
        self.exploration_rate_end = args.exploration_rate_end

        self.show_sensors = args.show_sensors
        self.show_qvalues = args.show_qvalues

        self.episode = 0
        self.onRestart()
        
        if self.show_sensors:
            from sensorstats import Stats
            self.stats = Stats(inevery=8)
        
        if self.show_qvalues:
            from plotq import PlotQ
            self.plotq = PlotQ(self.num_steers, self.num_speeds)

        if self.device == 'wheel':
            from wheel import Wheel
            self.wheel = Wheel(args.joystick_nr, args.autocenter, args.gain, args.min_force, args.max_force)
Exemplo n.º 25
0
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = int(env.action_space.high[0])

    # Initialize and load policy
    actor_path = "models/DDPG_actor_{}_{}.pkl".format(args.env_name,
                                                      args.buffer_type)
    critic_path = "models/DDPG_critic_{}_{}.pkl".format(
        args.env_name, args.buffer_type)
    policy = ddpg.DDPG(state_dim, action_dim, max_action)
    policy.load(actor_path, critic_path)

    # Initialize buffer
    memory = ReplayMemory(args.replay_size)

    evaluations = []

    total_timesteps = 0
    episode_num = 0
    done = True

    while total_timesteps < args.replay_size:

        if done:

            if total_timesteps != 0:
                print("Total T: %d Episode Num: %d Episode T: %d Reward: %f" %
                      (total_timesteps, episode_num, episode_timesteps,
                       episode_reward))
Exemplo n.º 26
0
 def empty_replay(self):
     return ReplayMemory(30, [1, 1, 1], 5, 200, np.random.RandomState(456))
Exemplo n.º 27
0
torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)
# path = 'models/sac_CHANGE_LineFollower-v0_normal'
# agent.load_model(path.replace('CHANGE', 'actor'),
#                  path.replace('CHANGE', 'critic'))

# Tensorboard
writer = SummaryWriter('runs/{}_SAC_{}_{}_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
    args.policy, "autotune" if args.automatic_entropy_tuning else ""))

# Memory
memory = ReplayMemory(args.replay_size, args.seed)

# Training Loop
total_numsteps = 0
updates = 0
did_it = False
for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    episode = []
    state = env.reset()
    if did_it:
        did_it = False
    while not done:
        if args.start_steps > total_numsteps:
Exemplo n.º 28
0
    def __init__(self,
                 scenario_tag=None,
                 model_savefile=None,
                 run_id_string=None,
                 network_class="DQNNet",
                 write_summaries=True,
                 tf_logdir="tensorboard_logs",
                 epochs=100,
                 train_steps_per_epoch=1000000,
                 test_episodes_per_epoch=100,
                 run_tests=True,
                 initial_epsilon=1.0,
                 final_epsilon=0.0000,
                 epsilon_decay_steps=10e07,
                 epsilon_decay_start_step=2e05,
                 frozen_steps=5000,
                 batchsize=32,
                 memory_capacity=10000,
                 update_pattern=(4, 4),
                 prioritized_memory=False,
                 enable_progress_bar=True,
                 save_interval=1,
                 writer_max_queue=10,
                 writer_flush_secs=120,
                 dynamic_frameskips=None,
                 **settings):

        if prioritized_memory:
            # TODO maybe some day ...
            raise NotImplementedError(
                "Prioritized memory not implemented. Maybe some day.")

        if dynamic_frameskips:
            if isinstance(dynamic_frameskips, (list, tuple)):
                self.frameskips = list(dynamic_frameskips)
            elif isinstance(dynamic_frameskips, int):
                self.frameskips = list(range(1, dynamic_frameskips + 1))
        else:
            self.frameskips = [None]

        self.update_pattern = update_pattern
        self.write_summaries = write_summaries
        self._settings = settings
        self.run_id_string = run_id_string
        self.train_steps_per_epoch = train_steps_per_epoch
        self._run_tests = test_episodes_per_epoch > 0 and run_tests
        self.test_episodes_per_epoch = test_episodes_per_epoch
        self._epochs = np.float32(epochs)

        self.doom_wrapper = VizdoomWrapper(**settings)
        misc_len = self.doom_wrapper.misc_len
        img_shape = self.doom_wrapper.img_shape
        self.use_misc = self.doom_wrapper.use_misc
        self.actions_num = self.doom_wrapper.actions_num
        self.replay_memory = ReplayMemory(img_shape,
                                          misc_len,
                                          batch_size=batchsize,
                                          capacity=memory_capacity)
        self.network = getattr(networks, network_class)(
            actions_num=self.actions_num * len(self.frameskips),
            img_shape=img_shape,
            misc_len=misc_len,
            **settings)

        self.batchsize = batchsize
        self.frozen_steps = frozen_steps

        self.save_interval = save_interval

        self._model_savefile = model_savefile
        ## TODO move summaries somewhere so they are consistent between dqn and asyncs
        if self.write_summaries:
            assert tf_logdir is not None
            create_directory(tf_logdir)

            self.scores_placeholder, summaries = setup_vector_summaries(
                scenario_tag + "/scores")
            self._summaries = tf.summary.merge(summaries)
            self._train_writer = tf.summary.FileWriter(
                "{}/{}/{}".format(tf_logdir, self.run_id_string, "train"),
                flush_secs=writer_flush_secs,
                max_queue=writer_max_queue)
            self._test_writer = tf.summary.FileWriter(
                "{}/{}/{}".format(tf_logdir, self.run_id_string, "test"),
                flush_secs=writer_flush_secs,
                max_queue=writer_max_queue)
        else:
            self._train_writer = None
            self._test_writer = None
            self._summaries = None
        self.steps = 0
        # TODO epoch as tf variable?
        self._epoch = 1

        # Epsilon
        self.epsilon_decay_rate = (initial_epsilon -
                                   final_epsilon) / epsilon_decay_steps
        self.epsilon_decay_start_step = epsilon_decay_start_step
        self.initial_epsilon = initial_epsilon
        self.final_epsilon = final_epsilon

        self.enable_progress_bar = enable_progress_bar
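
# Editor's note: the following is not part of the original class. It is a minimal
# sketch of the linear epsilon schedule implied by the fields above: epsilon stays
# at initial_epsilon until epsilon_decay_start_step, then decreases by
# epsilon_decay_rate per step, never going below final_epsilon.
def epsilon_at(step, initial_epsilon, final_epsilon, decay_rate, decay_start_step):
    if step <= decay_start_step:
        return initial_epsilon
    return max(final_epsilon,
               initial_epsilon - decay_rate * (step - decay_start_step))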
Exemplo n.º 29
0
    def __init__(self, dimO, dimA):
        dimA = list(dimA)
        dimO = list(dimO)

        nets = nets_dm

        # init replay memory
        self.rm = ReplayMemory(rm_size,
                               dimO,
                               dimA,
                               dtype=np.__dict__[rm_dtype])
        self.rrrm = ReplayMemory(rm_size,
                                 dimO,
                                 dimA,
                                 dtype=np.__dict__[rm_dtype])
        # start tf session
        self.sess = tf.Session(
            config=tf.ConfigProto(inter_op_parallelism_threads=threads,
                                  log_device_placement=False,
                                  allow_soft_placement=True))

        # create tf computational graph
        #
        self.theta_p = nets.theta_p(dimO, dimA)
        self.theta_q = nets.theta_q(dimO, dimA)
        self.theta_pt, update_pt = exponential_moving_averages(
            self.theta_p, tau)
        self.theta_qt, update_qt = exponential_moving_averages(
            self.theta_q, tau)

        obs = tf.placeholder(tf.float32, [None] + dimO, "obs")
        act_test, sum_p = nets.policy(obs, self.theta_p)

        # explore
        noise_init = tf.zeros([1] + dimA)
        noise_var = tf.Variable(noise_init)
        self.ou_reset = noise_var.assign(noise_init)
        noise = noise_var.assign_sub(
            (FLAGS.ou_theta) * noise_var -
            tf.random_normal(dimA, stddev=FLAGS.ou_sigma))
        act_expl = act_test + noise

        # test
        q, sum_q = nets.qfunction(obs, act_test, self.theta_q)
        # training
        # policy loss
        meanq = tf.reduce_mean(q, 0)
        wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var)
                         for var in self.theta_p])  # weight decay
        loss_p = -meanq + wd_p
        # policy optimization
        optim_p = tf.train.AdamOptimizer(learning_rate=lrp)
        grads_and_vars_p = optim_p.compute_gradients(loss_p,
                                                     var_list=self.theta_p)
        optimize_p = optim_p.apply_gradients(grads_and_vars_p)
        with tf.control_dependencies([optimize_p]):
            train_p = tf.group(update_pt)

        # q optimization
        act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA,
                                   "act_train")
        rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew")
        term = tf.placeholder(tf.bool, [FLAGS.bsize], "term")
        obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2")
        # q
        q_train, sum_qq = nets.qfunction(obs, act_train, self.theta_q)
        # q targets
        act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt)
        q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt)
        q_target = tf.stop_gradient(tf.select(term, rew, rew + discount * q2))
        # q_target = tf.stop_gradient(rew + discount * q2)
        # q loss
        td_error = q_train - q_target
        ms_td_error = tf.reduce_mean(tf.square(td_error), 0)
        wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var)
                         for var in self.theta_q])  # weight decay
        loss_q = ms_td_error + wd_q
        # q optimization
        optim_q = tf.train.AdamOptimizer(learning_rate=lrq)
        grads_and_vars_q = optim_q.compute_gradients(loss_q,
                                                     var_list=self.theta_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)
        with tf.control_dependencies([optimize_q]):
            train_q = tf.group(update_qt)

        # logging
        log_obs = [] if dimO[0] > 20 else [
            tf.histogram_summary("obs/" + str(i), obs[:, i])
            for i in range(dimO[0])
        ]
        log_act = [] if dimA[0] > 20 else [
            tf.histogram_summary("act/inf" + str(i), act_test[:, i])
            for i in range(dimA[0])
        ]
        log_act2 = [] if dimA[0] > 20 else [
            tf.histogram_summary("act/train" + str(i), act_train[:, i])
            for i in range(dimA[0])
        ]
        log_misc = [sum_p, sum_qq, tf.histogram_summary("td_error", td_error)]
        log_grad = [
            grad_histograms(grads_and_vars_p),
            grad_histograms(grads_and_vars_q)
        ]
        log_train = log_obs + log_act + log_act2 + log_misc + log_grad

        # initialize tf log writer
        self.writer = tf.train.SummaryWriter(FLAGS.outdir + "/tf",
                                             self.sess.graph,
                                             flush_secs=20)

        # init replay memory for recording episodes
        max_ep_length = 10000
        self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, rm_dtype)

        # tf functions
        with self.sess.as_default():
            self._act_test = Fun(obs, act_test)
            self._act_expl = Fun(obs, act_expl)
            self._reset = Fun([], self.ou_reset)
            self._train_q = Fun([obs, act_train, rew, term, obs2], [train_q],
                                log_train, self.writer)
            self._train_p = Fun([obs], [train_p], log_train, self.writer)
            self._train = Fun([obs, act_train, rew, term, obs2],
                              [train_p, train_q], log_train, self.writer)

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=1)
        ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf")
        if ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.initialize_all_variables())

        self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)
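
# Editor's note: the following is not part of the original example. It is a minimal
# NumPy sketch of the Ornstein-Uhlenbeck exploration-noise update used above
# (noise <- noise - theta * noise + sigma * N(0, 1)), the same recurrence the
# noise_var.assign_sub(...) line implements in TensorFlow. The default theta and
# sigma are placeholders for FLAGS.ou_theta / FLAGS.ou_sigma.
import numpy as np

def ou_step(noise, theta=0.15, sigma=0.2, rng=np.random):
    return noise - theta * noise + sigma * rng.standard_normal(noise.shape)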
Exemplo n.º 30
0
def train(active_mv):

    senv = ShapeNetEnv(FLAGS)
    replay_mem = ReplayMemory(FLAGS)

    #### for debug
    #a = np.array([[1,0,1],[0,0,0]])
    #b = np.array([[1,0,1],[0,1,0]])
    #print('IoU: {}'.format(replay_mem.calu_IoU(a, b)))
    #sys.exit()
    #### for debug

    log_string('====== Starting burning in memories ======')
    burn_in(senv, replay_mem)
    log_string('====== Done. {} trajectories burnt in ======'.format(
        FLAGS.burn_in_length))

    #epsilon = FLAGS.init_eps
    K_single = np.asarray([[420.0, 0.0, 112.0], [0.0, 420.0, 112.0],
                           [0.0, 0.0, 1]])
    K_list = np.tile(K_single[None, None, ...],
                     (1, FLAGS.max_episode_length, 1, 1))

    ### burn in(pretrain) for MVnet
    if FLAGS.burn_in_iter > 0:
        for i in xrange(FLAGS.burn_in_iter):
            mvnet_input = replay_mem.get_batch_list(FLAGS.batch_size)
            tic = time.time()
            out_stuff = active_mv.run_step(mvnet_input,
                                           mode='burnin',
                                           is_training=True)
            burnin_log(i, out_stuff, time.time() - tic)

    rollout_obj = Rollout(active_mv, senv, replay_mem, FLAGS)

    for i_idx in xrange(FLAGS.max_iter):

        t0 = time.time()

        rollout_obj.go(i_idx, verbose=True, add_to_mem=True, mode='random')
        t1 = time.time()

        replay_mem.enable_gbl()
        mvnet_input = replay_mem.get_batch_list(FLAGS.batch_size)
        t2 = time.time()

        out_stuff = active_mv.run_step(mvnet_input,
                                       mode='train_mv',
                                       is_training=True)
        replay_mem.disable_gbl()
        t3 = time.time()

        train_log(i_idx, out_stuff, (t0, t1, t2, t3))

        active_mv.train_writer.add_summary(out_stuff.merged_train, i_idx)

        if i_idx % FLAGS.save_every_step == 0 and i_idx > 0:
            save(active_mv, i_idx, i_idx, i_idx)

        if i_idx % FLAGS.test_every_step == 0 and i_idx > 0:
            evaluate(active_mv, FLAGS.test_episode_num, replay_mem, i_idx,
                     rollout_obj)