Example #1
def main(test=False):
    if test:
        dqn = DQN()
        dqn.test(test_case_count=10000, load_dir='models/dqn.pkl')
    else:
        dqn = DQN()
        env = Env()
        # dqn.load("models/pretrained.pkl")
        print('\nCollecting experience...')
        for i_episode in range(60000):
            s = env.reset()
            ep_r = 0
            for _count in range(4):
                root_action, leaf_action = dqn.choose_action(s)

                # take action
                s_, r, done = env.step(root_action, leaf_action)

                dqn.store_transition(s, (root_action, leaf_action), r, s_)

                ep_r += r
                if dqn.memory_counter > MEMORY_CAPACITY:
                    dqn.learn()

                if done:
                    break
                s = s_
            # print('ep_r:', ep_r)

            if i_episode % 1000 == 1:
                dqn.test()

        dqn.save('models/dqn_final_no_pretrain.pkl')
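The snippet relies on a DQN class that exposes choose_action, store_transition, memory_counter and learn, plus a MEMORY_CAPACITY constant, none of which are shown here. A minimal sketch of the replay-memory side of such a class, assuming flat 1-D states; the field names below are illustrative, not the original implementation:

import numpy as np

MEMORY_CAPACITY = 2000  # placeholder; the real constant is defined elsewhere in the original module


class ReplayMemorySketch:
    """Hypothetical replay memory matching the calls used in the example above."""

    def __init__(self, state_dim):
        # one row per transition: state, root action, leaf action, reward, next state
        self.memory = np.zeros((MEMORY_CAPACITY, state_dim * 2 + 3))
        self.memory_counter = 0

    def store_transition(self, s, action, r, s_):
        root_action, leaf_action = action
        row = np.hstack((s, [root_action, leaf_action, r], s_))
        # overwrite the oldest entry once the buffer is full
        index = self.memory_counter % MEMORY_CAPACITY
        self.memory[index, :] = row
        self.memory_counter += 1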
Example #2
def test_attack():
    agent = Agent(args.img_stack, device)
    agent.load_param()
    env = Env(args.seed, args.img_stack, args.action_repeat)

    # load adv input, by default general attack perturbation
    delta_s = np.load('param/adv_general.npy')
    if args.attack_type != 'general':
        file_path = 'param/adv_' + args.attack_type
        if args.attack_type == 'patch':
            file_path += '_' + args.patch_type
        file_path += '.npy'
        delta_s = np.load(file_path)
    # show adv
    fig = plt.figure(figsize=(8, 8))
    plt.title('Stack of ' + str(args.img_stack) +
              ' adversarial signals seen by Agent')
    plt.axis('off')
    columns, rows = args.img_stack // 2, args.img_stack // 2
    for i in range(1, columns * rows + 1):
        # denormalize while showing the image
        img = (delta_s[i - 1] + 1) * 128
        fig.add_subplot(rows, columns, i)
        plt.imshow(img, cmap='gray')
    plt.show()

    for i_ep in range(10):
        score = 0
        state = env.reset()

        for t in range(1000):
            # render the attack only within this step window of the 1000-step episode
            attack_render = [30, 40]
            if attack_render[0] <= t <= attack_render[1]:
                if t in attack_render:
                    s_with_ds = (state + delta_s)
                    # clip the image limits and denormalize for displaying
                    s_with_ds = np.clip(s_with_ds, -1, 0.9921875)
                    s_with_ds = (s_with_ds + 1) * 128
                    title = ('Attack Started' if t == attack_render[0]
                             else 'Attack Ended')
                    title += ' (showing first frame of 4 frames visible to policy)'
                    plt.imshow(s_with_ds[0], cmap='gray')
                    plt.axis('off')
                    plt.title(title)
                    plt.show()
                state += delta_s

            action = agent.select_action(state)
            state_, reward, done, die = env.step(action *
                                                 np.array([2., 1., 1.]) +
                                                 np.array([-1., 0., 0.]))
            if args.render:
                env.render()
            score += reward
            state = state_
            if done:
                break

        print('Ep {}\tScore: {:.2f}\t'.format(i_ep, score))
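The call env.step(action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.])) suggests the policy outputs (steer, gas, brake) in [0, 1]^3 and the environment expects CarRacing-style bounds of [-1, 1] for steering and [0, 1] for gas and brake. A quick check of that mapping (the example values are arbitrary):

import numpy as np

# assumed: the policy outputs (steer, gas, brake), each in [0, 1]
raw_action = np.array([0.5, 1.0, 0.0])
env_action = raw_action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.])
print(env_action)  # steering 0.0 (centred), gas 1.0, brake 0.0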
Example #3
def run_agent():
    agent = Agent(args.img_stack, device)
    agent.load_param()
    env = Env(args.seed, args.img_stack, args.action_repeat)

    state = env.reset()

    # Prepare attack
    attack = AdvAttack(args.attack_type)
    attack.initialize_perturbation(state.shape)
    attack.load_networks()

    for i_ep in range(50):
        score = 0
        state = env.reset()

        for t in range(1000):
            action = agent.select_action(state)
            # update buffer for training the attack
            attack.update_buffer(state)

            # write to tensorboard
            input_imgs_to_net = torch.tensor(
                (attack.buffer['s'] + attack.buffer['d_s']))
            input_imgs_grid = make_grid(input_imgs_to_net[0].reshape(
                4, 1, 96, 96))
            writer.add_image('Four stack of input state with adversarial',
                             input_imgs_grid)
            writer.add_graph(attack.net, input_imgs_to_net)
            writer.close()

            # train attack
            attack.train()

            state_, reward, done, die = env.step(action *
                                                 np.array([2., 1., 1.]) +
                                                 np.array([-1., 0., 0.]))
            if args.render:
                env.render()
            score += reward
            state = state_
            if done or die:
                break

        print('Ep {}\tScore: {:.2f}\t'.format(i_ep, score))
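In this example the TensorBoard image, graph and close calls all run on every environment step. Assuming the writer is a torch.utils.tensorboard SummaryWriter, a more common pattern is to log at an interval and close the writer once after training; a rough sketch, where the log directory and function name are made up:

import torch
from torch.utils.tensorboard import SummaryWriter
from torchvision.utils import make_grid

writer = SummaryWriter(log_dir='runs/adv_attack')  # hypothetical log directory


def log_adversarial_input(writer, attack, step):
    # log the perturbed 4-frame stack at a chosen interval instead of every step
    imgs = torch.tensor(attack.buffer['s'] + attack.buffer['d_s'])
    grid = make_grid(imgs[0].reshape(4, 1, 96, 96))
    writer.add_image('Four stack of input state with adversarial', grid,
                     global_step=step)

# the writer would then be closed once, after the outer episode loop:
# writer.close()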
Example #4
def main():
    env = Env(enable_draw=True, base_fix=False)
    agent = Agent(env)

    time_horizon = 10
    com_pos = np.array([0.0, 0, 0.1])
    rpy = np.zeros(3)
    com_vel = np.zeros(3)
    base_ang_vel = np.zeros(3)
    target_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
    target_x = target_x.reshape((-1, 1))
    target_u = np.array([0, 0, env.model.mass * 0.25 * 9.8] * 4).reshape(
        (12, 1))
    init_u_list = np.array([target_u for i in range(time_horizon)])

    state = env.reset()
    t = 0
    while t < 10:
        com_pos = env.model.com_pos
        rpy = env.model.base_rpy
        com_vel = env.model.base_vel
        base_ang_vel = np.matmul(env.model.base_rot.T, env.model.base_ang_vel)
        init_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
        init_x = init_x.reshape((-1, 1))

        delta_time_list = np.array([0.01] * time_horizon)
        foot_pos_list = np.array(
            [env.model.foot_pos_list for i in range(time_horizon + 1)])
        contact_phi_list = np.array([[1, 1, 1, 1]
                                     for i in range(time_horizon + 1)])

        target_x_list = np.array([target_x for i in range(time_horizon + 1)])
        target_u_list = np.array([target_u for i in range(time_horizon)])

        action, u_list = agent.get_action(init_x, init_u_list, delta_time_list,
                                          foot_pos_list, contact_phi_list,
                                          target_x_list, target_u_list)
        init_u_list = deepcopy(u_list)

        state = env.step(action)

        #time.sleep(env.time_step)
        t += env.time_step
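The MPC target above packs [com_pos, rpy, com_vel, base_ang_vel] into a 12-dimensional state and one 3-D ground-reaction force per leg into a 12-dimensional control; env.model.mass * 0.25 * 9.8 asks each of the four legs to carry a quarter of the robot's weight. A small self-contained check of that control target (the mass value is a placeholder):

import numpy as np

mass = 12.0  # hypothetical robot mass in kg
g = 9.8

# one (fx, fy, fz) target per leg, stacked into a 12x1 control vector
per_leg_force = [0.0, 0.0, mass * 0.25 * g]  # each leg supports m*g/4 vertically
target_u = np.array(per_leg_force * 4).reshape((12, 1))
print(target_u[2::3].ravel())  # four vertical forces that sum to m*g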
Example #5
def play_greedy_game(verbose=True):
    """
    This function plays a Tichu game with four "greedy" players.
    Uses greedyAgent. 
    This is an Agent with very simple heuristic play moves.
    Always tries to win a stack except opponent is leading.
    Raises an Exception if 10 consecutive false moves are made.
    (This should not happen when environment and greedyAgent is bugfree.)
    """
    agent = greedyAgent()
    env = Env(train_mode=not (verbose))
    state, rewards, done, active_player = env.reset()
    conseq_active_counter = 0
    cummulative_reward = [0, 0, 0, 0]
    while True:
        my_state = state[active_player]
        action = agent.act(my_state)
        last_active = active_player
        if not env.game.players[active_player].finished:
            cummulative_reward[active_player] += rewards[active_player]
        state, rewards, done, active_player = env.step(active_player, action)
        new_active = active_player
        if last_active == new_active:
            conseq_active_counter += 1
        else:
            conseq_active_counter = 0
        if done:
            if verbose:
                print('-----')
            for i in range(4):
                cummulative_reward[i] += rewards[i]
                if verbose:
                    print('Cumulative reward of player {}: {}'.format(
                        i, cummulative_reward[i]))
            return
        if conseq_active_counter > 10:
            raise Exception(
                "Active counter exceeded. Possible infinite loop detected.")
Example #6
def main():
    config = read_config("config.yaml")
    agent_config = config['Agent']
    network_config = agent_config['Network']
    training_config = config['Training']
    files_config = config['Files']
    eval_config = config['Evaluation']

    print('\t\t --------------------------------------------')
    print('\t\t ------  Parameters of the experiment  ------')
    print('\t\t --------------------------------------------\n')

    print('## Agent params')
    print('Agent : ' + agent_config['name'])
    print('Gamma : ', agent_config['gamma'])
    print('')

    print('## Network Params')
    print('Network used : ' + network_config['name'])
    print('Number of filters : ', network_config['n_filters'])
    print('activation function : ' + network_config['activation'])
    print('state embedding size : ', network_config['state_embedding_size'])
    print('')

    print('## Training params')
    print('Number of iteration : ', training_config['n_iter'])
    print('Learning rate : ', network_config['lr'])
    print('Number of games per iteration : ', training_config['n_games'])
    print('Number of workers : ', training_config['n_workers'])
    print('Batch size : ', training_config['batch_size'])
    print('Buffer size : ', training_config['buffer_size'])
    print('')

    print('## Evaluation params')
    print('Number of games per iteration : ', eval_config['n_games'])
    print('Number of workers : ', eval_config['n_workers'])
    print('')

    sleep(2.0)

    # Init files and tensorboard
    model_name = agent_config['name']
    checkpoints_dir = os.path.join(model_name, files_config['checkpoints_dir'])
    tensorboard_log_dir = os.path.join(model_name,
                                       files_config['tensorboard_log_dir'])
    results_log_path = os.path.join(model_name,
                                    files_config['results_log_path'])

    # fix random seed
    if config['Seed'] is None:
        np.random.seed(seed=42)
    else:
        np.random.seed(int(config['Seed']))

    print('\n\n')
    env = Env()

    # if train from scratch
    if training_config["init_checkpoint"] == 0:
        # initialize dir for tensorboard
        flush_or_create(tensorboard_log_dir)
        # initialize dir for checkpoints
        flush_or_create(checkpoints_dir)
        # init agent and network from scratch
        agent = ActorCriticAgent(agent_config, network_config, checkpoints_dir,
                                 tensorboard_log_dir)
        # initialize iteration number
        start = 0

    # else restart training from last checkpoint
    else:
        agent = ActorCriticAgent(agent_config,
                                 network_config,
                                 checkpoints_dir,
                                 tensorboard_log_dir,
                                 restore=True)
        # resume from the configured checkpoint iteration (assumed to equal init_checkpoint)
        latest_checkpoint = training_config["init_checkpoint"]
        print('\nnetwork restored from checkpoint # ', latest_checkpoint)
        print('')
        start = latest_checkpoint

    # initialize the summary writer and results log file
    log_file = open(results_log_path,
                    "wb+")  # open log file to write in during evaluation

    display_every = training_config["display_every"]
    n_games_train = training_config["n_games"]
    n_workers_train = training_config["n_workers"]
    T_update_net = training_config["T_update_net"]
    T_update_target_net = training_config["T_update_target_net"]
    n_games_eval = eval_config["n_games"]
    n_workers_eval = eval_config["n_workers"]
    prefill_buffer = training_config["prefill_buffer"]
    # gamma = agent_config['gamma']

    summary_dict = dict({})
    data_buffer = Buffer(capacity=training_config['buffer_size'])

    logger = logging.getLogger(__name__)

    if prefill_buffer:
        # populate buffer with initial data from random games
        print('\nPopulating Buffer ... \n')
        populate_buffer(agent, n_workers_train, data_buffer)

    print('\n\n')
    print('Starting training\n\n')
    batch_size = training_config['batch_size']
    for it in tqdm(np.arange(start, training_config["n_iter"]),
                   desc="parallel gameplay iterations"):
        # play games to generate data and train the network
        env.reset()
        try:
            agent.train(env, n_games_train, data_buffer, batch_size,
                        n_workers_train, display_every, T_update_net)
        except Exception as error:
            print('\n\n#### AN ERROR OCCURRED WHILE TRAINING ####\n\n')
            agent.net.summary_writer.close()
            agent.net.sess.close()
            log_file.close()
            logger.error(error)
            raise
        agent.net.save_checkpoint(checkpoints_dir, it=it + 1)

        # play games with latest checkpoint and track average final reward
        results = agent.evaluate(env, n_games_eval, n_workers_eval)
        # save results
        pickle.dump(results, log_file)
        print('')

    agent.net.summary_writer.close()
    agent.net.sess.close()
    log_file.close()
    print('End of training')
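The keys read in main() imply a config.yaml with roughly the structure below, shown here as the Python dict that read_config would presumably return; only the key names are taken from the code, all values are placeholders:

# structure assumed from the keys accessed in main(); values are placeholders
config = {
    'Seed': None,  # or an int
    'Agent': {
        'name': 'ActorCritic',
        'gamma': 0.99,
        'Network': {
            'name': 'convnet',
            'n_filters': 64,
            'activation': 'relu',
            'state_embedding_size': 128,
            'lr': 1e-3,
        },
    },
    'Training': {
        'n_iter': 100, 'n_games': 64, 'n_workers': 8,
        'batch_size': 32, 'buffer_size': 10000,
        'init_checkpoint': 0, 'display_every': 10,
        'T_update_net': 10, 'T_update_target_net': 100,
        'prefill_buffer': True,
    },
    'Evaluation': {'n_games': 32, 'n_workers': 8},
    'Files': {
        'checkpoints_dir': 'checkpoints',
        'tensorboard_log_dir': 'tensorboard',
        'results_log_path': 'results.log',
    },
}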
Example #7
def main():
    env = Env(enable_draw=True, base_fix=False)
    agent = Agent(env)

    delta_time = 0.025
    time_horizon = 10
    com_pos = np.array([0.0, 0, 0.25])
    rpy = np.zeros(3)
    com_vel = np.zeros(3)
    base_ang_vel = np.zeros(3)
    target_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
    target_x = target_x.reshape((-1, 1))
    target_u = np.array([0, 0, env.model.mass * 0.25 * 9.8] * 4).reshape(
        (12, 1))
    init_u_list = np.array([target_u for i in range(time_horizon)])

    temp_length = int(0.3 / delta_time)
    temp_contact_phi_list = [[0, 1, 1, 0]] * temp_length + [[
        1, 1, 1, 1
    ]] * temp_length + [[1, 0, 0, 1]] * temp_length + [[1, 1, 1, 1]
                                                       ] * temp_length
    total_contact_phi_list = np.array([[1, 1, 1, 1]] * temp_length +
                                      temp_contact_phi_list * 1000)

    state = env.reset()
    t = 0
    last_t = 0
    while t < 100:

        if last_t == 0 or t - last_t >= delta_time:
            last_t = t
            com_pos = env.model.com_pos
            print(com_pos)
            rpy = env.model.base_rpy
            com_vel = env.model.base_vel
            base_ang_vel = np.matmul(env.model.base_rot.T,
                                     env.model.base_ang_vel)
            init_x = np.concatenate([com_pos, rpy, com_vel, base_ang_vel])
            init_x = init_x.reshape((-1, 1))

            delta_time_list = np.array([delta_time] * time_horizon)
            foot_pos_list = np.array(
                [env.model.foot_pos_list for i in range(time_horizon + 1)])
            contact_phi_list = total_contact_phi_list[:time_horizon + 1]
            total_contact_phi_list = total_contact_phi_list[1:]

            target_x_list = np.array(
                [target_x for i in range(time_horizon + 1)])
            target_u_list = np.array([target_u for i in range(time_horizon)])

            action, u_list = agent.get_action(init_x, init_u_list,
                                              delta_time_list, foot_pos_list,
                                              contact_phi_list, target_x_list,
                                              target_u_list)
            init_u_list = deepcopy(u_list)
            for leg_idx in range(4):
                if contact_phi_list[0, leg_idx] == 0.0:
                    action[leg_idx * 3:(leg_idx + 1) * 3] = [0, 0, -3.0]

        state = env.step(action, contact_phi_list[0, :])

        t += env.time_step
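total_contact_phi_list is a precomputed gait schedule (1 = stance, 0 = swing, one flag per leg) that the loop consumes in receding-horizon fashion: each MPC step looks at the next time_horizon + 1 rows and then drops the first one. A tiny illustration of that sliding window with a shortened, made-up schedule:

import numpy as np

time_horizon = 3  # shortened for illustration
# 1 = leg in stance, 0 = leg in swing; diagonal leg pairs alternate
schedule = np.array([[1, 1, 1, 1]] * 2 + [[0, 1, 1, 0]] * 2 + [[1, 0, 0, 1]] * 2)

for step in range(3):
    window = schedule[:time_horizon + 1]  # contacts the MPC sees at this step
    schedule = schedule[1:]               # slide the window by one entry
    print(step, window.tolist())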
Example #8
def test_env_state():
    env = Env()
    state, _, _, _ = env.reset()
    assert np.shape(state) == (4, 4, 3)
    for i in range(4):
        assert sum(state[i][0][2]) == 14
Example #9
              output_count=env.actions_count,
              hidden_count=settings.NN_HIDDEN_COUNT)

    history = History()

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        gradBuffer = sess.run(tf.trainable_variables())

        for i, grad in enumerate(gradBuffer):
            gradBuffer[i] = 0

        state = env.reset()

        old_action = None

        for iter_num in range(settings.TRAIN_ITERATIONS):
            action = sess.run(net.chosen_action,
                              feed_dict={net.layer_input: [state]})[0]

            # if action == old_action and action != 0:
            #     action += 1

            old_action = action

            logger.debug(
                f'Iter {iter_num}: action={action} (avg reward={np.mean(history.get_rewards()[-100:])})'
            )
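The excerpt stops before the update step, but gradBuffer = sess.run(tf.trainable_variables()) followed by zeroing is a common TF 1.x trick: it yields buffers shaped like the network's variables into which per-episode policy gradients can be accumulated before a batched update. A minimal self-contained illustration of just that trick, assuming TF 1.x graph mode as in the excerpt; this is not the original training code:

import numpy as np
import tensorflow as tf  # assumes TF 1.x-style graph mode, as in the excerpt

w = tf.Variable(np.ones((2, 2), dtype=np.float32))
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    # fetch current variable values only to get buffers with the right shapes...
    gradBuffer = sess.run(tf.trainable_variables())
    # ...then zero them so per-episode gradients can be accumulated into them
    for i, grad in enumerate(gradBuffer):
        gradBuffer[i] = grad * 0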