Example #1
import copy
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf

# CVS, KFoldCV, DQNAgent, BehaviorCloning, AdvantageLearner, FQE and the
# module-level `config` dict are provided by the surrounding project.


def one_step(seed):
    np.random.seed(seed)
    tf.random.set_seed(seed)

    path = 'data/mh/trajs_mh.pkl'
    nfolds = 5
    n_splits = 5
    ckpts = (np.arange(10) + 1) * 5000  # checkpoints at 5k, 10k, ..., 50k training steps

    num_actions = 5
    # offline DQN configuration
    config['online'] = False
    config['hiddens'] = [64, 64]
    config['double'] = False
    config['dueling'] = False
    config['lr'] = 5e-4
    config['decay_steps'] = 50000
    config['max_training_steps'] = 50000
    config['training_steps_to_checkpoint'] = 5000
    config['training_steps_to_eval'] = 100000

    index = pd.MultiIndex.from_product([np.arange(nfolds), ckpts])
    columns = ['dqn', 'dml', 'sale']
    rets = pd.DataFrame(index=index, columns=columns)

    print('-' * 20, 'start', '-' * 20)
    cvs = CVS(path, n_splits=nfolds, random_state=seed)
    cvs.split()
    for fold in range(nfolds):
        train_path = cvs.train_paths[fold] + 'trajs.pkl'
        kf = KFoldCV(train_path,
                     n_trajs=None,
                     n_splits=n_splits,
                     shuffle=False,
                     random_state=seed)
        kf.split()

        print('-' * 20, 'training agent', '-' * 20)
        # agent
        config['persistent_directory'] = kf.agent_path
        config['checkpoint_path'] = kf.ckpt_path
        agent = DQNAgent(num_actions=num_actions, config=config)
        agent.learn()

        print('-' * 20, 'training agents', '-' * 20)
        # agent_1, ..., agent_K
        for idx in range(kf.n_splits):
            config_idx = copy.deepcopy(config)
            config_idx['persistent_directory'] = kf.agent_paths[idx]
            config_idx['checkpoint_path'] = kf.ckpt_paths[idx]
            agent_idx = DQNAgent(num_actions=num_actions, config=config_idx)
            agent_idx.learn()

        # held-out trajectories of this fold, used for fitted Q evaluation (FQE)
        test_path = cvs.test_paths[fold] + 'trajs.pkl'
        with open(test_path, 'rb') as f:
            trajs = pickle.load(f)

        print('-' * 20, 'behavior cloning', '-' * 20)
        # behavior cloning
        bc = BehaviorCloning(num_actions=num_actions)
        states = np.array(
            [transition[0] for traj in kf.trajs for transition in traj])
        actions = np.array(
            [transition[1] for traj in kf.trajs for transition in traj])
        bc.train(states, actions)

        for ckpt in ckpts:
            print('-' * 20, 'ckpt: ', ckpt, '-' * 20)
            agent = DQNAgent(num_actions=num_actions, config=config)
            agent.load(kf.ckpt_path + 'dqn_{}.ckpt'.format(ckpt))

            agents = []
            for idx in range(kf.n_splits):
                config_idx = copy.deepcopy(config)
                config_idx['persistent_directory'] = kf.agent_paths[idx]
                config_idx['checkpoint_path'] = kf.ckpt_paths[idx]
                agent_idx = DQNAgent(num_actions=num_actions,
                                     config=config_idx)
                agent_idx.load(kf.ckpt_paths[idx] + 'dqn_{}.ckpt'.format(ckpt))
                agents.append(agent_idx)
            # per-state Q-value estimates from the cross-fitted agents and the BC model
            states, qvalues, qtildes = kf.update_q(agents, bc)

            print('-' * 20, 'adv learner', '-' * 20)
            # advantages: center the Q-values across actions, then fit an advantage learner
            advs1 = qvalues - qvalues.mean(axis=1, keepdims=True)
            agent1 = AdvantageLearner(num_actions=num_actions)
            agent1._train(states, advs1)

            advs2 = qtildes - qtildes.mean(axis=1, keepdims=True)
            agent2 = AdvantageLearner(num_actions=num_actions)
            agent2._train(states, advs2)

            print('-' * 20, 'fqe on dqn & dml & sale', '-' * 20)
            fqe_dqn = FQE(agent.greedy_actions, num_actions=num_actions)
            fqe_dqn.train(trajs)
            fqe_dml = FQE(agent1.greedy_actions, num_actions=num_actions)
            fqe_dml.train(trajs)
            fqe_sale = FQE(agent2.greedy_actions, num_actions=num_actions)
            fqe_sale.train(trajs)

            rets.loc[(fold, ckpt), 'dqn'] = fqe_dqn.values
            rets.loc[(fold, ckpt), 'dml'] = fqe_dml.values
            rets.loc[(fold, ckpt), 'sale'] = fqe_sale.values

    return rets
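
# Usage sketch (illustrative, not part of the original snippet): run one_step for a
# few seeds and average the FQE value estimates per checkpoint; assumes the stored
# FQE `.values` entries are scalar returns.
seeds = [0, 1, 2]
results = pd.concat([one_step(s) for s in seeds], keys=seeds)
# mean estimated return per checkpoint for each policy (dqn / dml / sale)
print(results.astype(float).groupby(level=-1).mean())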
Example #2
import copy
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# KFoldCV, BehaviorCloning, DQNAgent, set_config, compare_within_ckpt and the
# module-level `config` dict are provided by the surrounding project.


def one_round_run(replica,
                  strategy='random',
                  agent_name='dqn',
                  n_trajs=200,
                  n_splits=2,
                  config_modify_func=None):

    path = 'data/{}_{}_{}/{}/trajs_qr_dqn.pkl'.format(
        agent_name, n_splits, n_trajs, strategy
    )
    kf = KFoldCV(path,
                 n_trajs=n_trajs,
                 n_splits=n_splits,
                 shuffle=True,
                 random_state=123456789,
                 first=False)
    kf.split()
    working_directory = kf.working_directory
    print('Working directory path: {}'.format(kf.working_directory))
    print('Checkpoint paths: {}'.format(kf.ckpt_paths))

    print('Behavior Cloning...')
    bc = BehaviorCloning(num_actions=4, hiddens=[256, 256], activation='relu', lr=5e-4)
    states  = np.array([transition[0] for traj in kf.trajs for transition in traj])
    actions = np.array([transition[1] for traj in kf.trajs for transition in traj])
    bc.train(states, actions)

    print('Single Agent Training...')
    if config_modify_func is not None:
        config_modify_func(kf=kf)
    else:
        set_config(kf=kf)
    print('With Config: {}'.format(config))
    
    agent = DQNAgent(name='LunarLander-v2', num_actions=4, config=config)
    agent.learn()

    rewards = pd.Series(agent.eval_episode_rewards)
    steps = pd.Series(agent.eval_episode_steps)

    fig, axes = plt.subplots(2, 2, figsize=(18, 8))

    axes[0][0].plot(rewards.rolling(100, min_periods=20).mean())
    axes[0][0].set_title('mean reward')
    axes[0][1].plot(rewards.rolling(100, min_periods=20).max())
    axes[0][1].set_title('max reward')
    axes[1][0].plot(steps.rolling(100, min_periods=20).mean())
    axes[1][0].set_title('mean step')
    axes[1][1].plot(steps.rolling(100, min_periods=20).max())
    axes[1][1].set_title('max step')

    file_path = 'dqn_{}_single_rc{}_dt{}.jpg'.format(
        strategy, replica, datetime.now().strftime('%Y%m%d_%H-%M-%S'))
    pic_dir = os.path.join(working_directory, 'pic')
    if not os.path.isdir(pic_dir):
        os.mkdir(pic_dir)
    plt.savefig(os.path.join(pic_dir, file_path))

    print('Save Single Agent Training...')
    file_name = 'offline-single-lr{}-decay_step{}-max_training_steps{}'.format(
        config['lr'], config['decay_steps'], config['max_training_steps'])
    save_weight_dir = os.path.join(working_directory, 'model_save')
    if not os.path.isdir(save_weight_dir):
        os.mkdir(save_weight_dir)
    file_path = '{}.weights'.format(file_name)
    # save the single agent's weights under working_directory/model_save
    agent.save(os.path.join(save_weight_dir, file_path))
    
    print('Cross Validating...')
    for idx in range(kf.n_splits):
        config_idx = copy.deepcopy(config)
        config_idx['persistent_directory'] = kf.agent_paths[idx]
        config_idx['checkpoint_path'] = kf.ckpt_paths[idx]
        
        agent_idx = DQNAgent(name='LunarLander-v2', num_actions=4, config=config_idx)
        agent_idx.learn()
        
        rewards = pd.Series(agent_idx.eval_episode_rewards)
        steps = pd.Series(agent_idx.eval_episode_steps)

        fig, axes = plt.subplots(2, 2, figsize=(18, 8))

        axes[0][0].plot(rewards.rolling(100, min_periods=20).mean())
        axes[0][0].set_title('mean reward')
        axes[0][1].plot(rewards.rolling(100, min_periods=20).max())
        axes[0][1].set_title('max reward')
        axes[1][0].plot(steps.rolling(100, min_periods=20).mean())
        axes[1][0].set_title('mean step')
        axes[1][1].plot(steps.rolling(100, min_periods=20).max())
        axes[1][1].set_title('max step')
        
        file_path = 'dqn_{}_cv_rc{}_dt{}.jpg'.format(
            strategy, replica, datetime.now().strftime('%Y%m%d_%H-%M-%S'))
        plt.savefig(os.path.join(pic_dir, file_path))
    
    rs = compare_within_ckpt(kf, bc, config, working_directory,
                             strategy=strategy,
                             num_trajectories=n_trajs,
                             agent_name=agent_name,
                             num_kf=n_splits,
                             replica=replica)
    return rs
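
# Usage sketch (illustrative, not part of the original snippet): repeat the pipeline
# for a few replicas with the default 'random' behavior strategy.
for replica in range(3):
    one_round_run(replica,
                  strategy='random',
                  agent_name='dqn',
                  n_trajs=200,
                  n_splits=2)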