예제 #1
0
파일: test_lols.py 프로젝트: yyht/macarico
def test2():
    # aggrevate
    print
    print '# test sequence labeler on mod data with AggreVaTe'
    n_types = 10
    n_labels = 4

    data = macarico.util.make_sequence_mod_data(100, 5, n_types, n_labels)
    data = [Example(x, y, n_labels) for x, y in data]

    tRNN = TransitionRNN(
        [RNNFeatures(n_types)],
        [AttendAt()],
        n_labels,
    )
    policy = LinearPolicy(tRNN, n_labels)

    p_rollin_ref = stochastic(ExponentialAnnealing(0.99))
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        Learner=lambda: AggreVaTe(HammingLossReference(), policy, p_rollin_ref
                                  ),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step],
        n_epochs=4,
        train_eval_skip=1,
    )
예제 #2
0
def test0():
    print
    print '# test sequence labeler on mod data with DAgger'
    n_types = 10
    n_labels = 4

    data = [
        Example(x, y, n_labels)
        for x, y in macarico.util.make_sequence_mod_data(
            100, 5, n_types, n_labels)
    ]

    tRNN = Actor([RNNFeatures(n_types, output_field='mytok_rnn')],
                 [AttendAt(field='mytok_rnn')], n_labels)
    policy = LinearPolicy(tRNN, n_labels)

    p_rollin_ref = stochastic(ExponentialAnnealing(0.99))
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        Learner=lambda: DAgger(HammingLossReference(), policy, p_rollin_ref),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step],
        n_epochs=4,
        train_eval_skip=1,
    )
예제 #3
0
파일: test_lols.py 프로젝트: yyht/macarico
def test1():
    n_types = 10
    n_labels = 4
    print
    print '# test sequence labeler on mod data with LOLS'
    data = macarico.util.make_sequence_mod_data(20, 6, n_types, n_labels)
    data = [Example(x, y, n_labels) for x, y in data]

    tRNN = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
    policy = LinearPolicy(tRNN, n_labels)
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    p_rollin_ref = stochastic(ExponentialAnnealing(0.9))
    p_rollout_ref = stochastic(ExponentialAnnealing(0.9))

    macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        learning_alg=lambda ex: LOLS.lols(ex, HammingLoss,
                                          HammingLossReference(), policy,
                                          p_rollin_ref, p_rollout_ref),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step, p_rollout_ref.step],
        train_eval_skip=1,
    )
예제 #4
0
def test_wsj():
    print
    print '# test on wsj subset'
    from macarico.data import nlp_data
    tr,de,te,vocab,label_id = \
      nlp_data.read_wsj_pos('data/wsj.pos', n_tr=50, n_de=50, n_te=0)

    n_types = len(vocab)
    n_labels = len(label_id)

    print 'n_train: %s, n_dev: %s, n_test: %s' % (len(tr), len(de), len(te))
    print 'n_types: %s, n_labels: %s' % (n_types, n_labels)

    tRNN = TransitionRNN([RNNFeatures(n_types, rnn_type='RNN')], [AttendAt()],
                         n_labels)
    policy = LinearPolicy(tRNN, n_labels)

    p_rollin_ref = stochastic(ExponentialAnnealing(0.9))
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    macarico.util.trainloop(
        training_data=tr,
        dev_data=de,
        policy=policy,
        Learner=lambda: DAgger(HammingLossReference(), policy, p_rollin_ref),
        #        Learner         = lambda: MaximumLikelihood(HammingLossReference(), policy),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step],
        n_epochs=10,
        #        train_eval_skip = None,
    )
예제 #5
0
def test1(learning_method, exploration):
    print
    print '# testing learning_method=%d exploration=%d' % (learning_method,
                                                           exploration)
    print
    n_types = 10
    n_labels = 4
    data = macarico.util.make_sequence_mod_data(100, 6, n_types, n_labels)
    data = [Example(x, y, n_labels) for x, y in data]

    tRNN = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
    policy = LinearPolicy(tRNN, n_labels)
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.001)

    p_rollin_ref = stochastic(ExponentialAnnealing(0.9))
    p_rollout_ref = stochastic(ExponentialAnnealing(0.99999))

    macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        Learner=lambda: BanditLOLS(
            HammingLossReference(),
            policy,
            p_rollin_ref,
            p_rollout_ref,
            learning_method,  # LEARN_IPS, LEARN_DR, LEARN_BIASED
            exploration,
        ),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step, p_rollout_ref.step],
        train_eval_skip=10,
    )
예제 #6
0
def test_restore(n_types, n_labels, data, model):
    actor = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
    policy = LinearPolicy(actor, n_labels)
    print 'evaluating new model: %g' % \
        macarico.util.evaluate(data, policy, HammingLoss())
    policy.load_state_dict(model)
    print 'evaluating restored model: %g' % \
        macarico.util.evaluate(data, policy, HammingLoss())
예제 #7
0
파일: test_mdp.py 프로젝트: yyht/macarico
def test1(LEARNER=LearnerOpts.DAGGER):
    print
    print 'Running test 1 with learner=%s' % LEARNER
    print '======================================================='

    n_states = 3
    n_actions = 2

    tRNN = TransitionRNN([mdp.MDPFeatures(n_states, noise_rate=0.5)],
                         [AttendAt(lambda _: 0, 's')], n_actions)
    policy = LinearPolicy(tRNN, n_actions)

    p_rollin_ref = stochastic(ExponentialAnnealing(0.99))
    p_rollout_ref = stochastic(ExponentialAnnealing(1))

    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    test_mdp, pi_ref = make_ross_mdp()

    if LEARNER == LearnerOpts.DAGGER:
        learner = lambda: DAgger(pi_ref, policy, p_rollin_ref)
    elif LEARNER == LearnerOpts.TWISTED:
        learner = lambda: TwistedDAgger(pi_ref, policy, p_rollin_ref)
    elif LEARNER == LearnerOpts.MAXLIK:
        learner = lambda: MaximumLikelihood(pi_ref, policy)
    elif LEARNER == LearnerOpts.AGGREVATE:
        learner = lambda: AggreVaTe(pi_ref, policy, p_rollin_ref)
    elif LEARNER == LearnerOpts.LOLS:
        learner = None

    losses = []
    for epoch in xrange(101):
        optimizer.zero_grad()
        if learner is not None:
            l = learner()
            env = test_mdp.mk_env()
            res = env.run_episode(l)
            loss = mdp.MDPLoss()(test_mdp, env)
            l.update(loss)
        elif LEARNER == LearnerOpts.LOLS:
            lols(test_mdp, mdp.MDPLoss, pi_ref, policy, p_rollin_ref,
                 p_rollout_ref)

        optimizer.step()
        p_rollin_ref.step()
        p_rollout_ref.step()

        env = test_mdp.mk_env()
        res = env.run_episode(policy)
        loss = mdp.MDPLoss()(test_mdp, env)
        losses.append(loss)
        if epoch % 20 == 0:
            print epoch, sum(losses[-100:]) / len(losses[-100:]), '\t', res
예제 #8
0
def test2():
    print()
    print('blackjack')
    print()
    ex = Blackjack()
    run_environment(
        ex,
        lambda: TransitionBOW(
            [BlackjackFeatures()],  #ex.width, ex.height)],
            [AttendAt(lambda _: 0, 'blackjack')],
            ex.n_actions),
        BlackjackLoss(),
    )
예제 #9
0
def test1():
    print()
    print('pendulum')
    print()
    ex = Pendulum()
    run_environment(
        ex,
        lambda: TransitionRNN(
            [PendulumFeatures()],  #ex.width, ex.height)],
            [AttendAt(lambda _: 0, 'pendulum')],
            ex.n_actions),
        PendulumLoss(),
    )
예제 #10
0
def test1(use_bootstrap):
    n_types = 10
    n_labels = 4
    print
    print '# test sequence labeler on mod data with Reslope and', (
        'bootstrap' if use_bootstrap else 'boltzmann'), 'exploration'
    data = macarico.util.make_sequence_mod_data(3000, 6, n_types, n_labels)
    data = [Example(x, y, n_labels) for x, y in data]

    if not use_bootstrap:
        tRNN = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
        policy = LinearPolicy(tRNN, n_labels)
    else:
        rnns = [
            TransitionRNN([RNNFeatures(n_types)], [AttendAt()],
                          n_labels,
                          h_name='h%d' % i) for i in xrange(5)
        ]
        policy = BootstrapPolicy(rnns, n_labels)

    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    p_ref = stochastic(ExponentialAnnealing(0.9))

    macarico.util.trainloop(
        training_data   = data[:2048],
        dev_data        = data[2048:],
        policy          = policy,
        Learner         = lambda: Reslope(HammingLossReference(), policy, p_ref,
                                          exploration=BanditLOLS.EXPLORE_BOOTSTRAP if use_bootstrap else \
                                                      BanditLOLS.EXPLORE_BOLTZMANN
        ),
        losses          = HammingLoss(),
        optimizer       = optimizer,
        run_per_epoch   = [p_ref.step],
        train_eval_skip = 1,
        bandit_evaluation = True,
        n_epochs = 1,
    )
예제 #11
0
def test0():
    print()
    print('micro pocman')
    print()
    ex = MicroPOCMAN()
    run_environment(
        ex,
        lambda: TransitionBOW(
            [LocalPOCFeatures(history_length=4)],  #ex.width, ex.height)],
            [AttendAt(lambda _: 0, 'poc')],
            4),
        POCLoss(),
    )
예제 #12
0
def test():
    print('')
    print('Cart Pole')
    print('')
    ex = CartPoleEnv()
    run_environment(
        ex,
        lambda: TransitionBOW([CartPoleFeatures(
        )], [AttendAt(lambda _: 0, 'cartpole')], ex.n_actions),
        CartPoleLoss(),
        rl_alg=reinforce,
        n_epochs=100,
        lr=0.1,
    )
예제 #13
0
def test3():
    print()
    print('hex')
    print()
    board_size = 3
    ex = Hex(Hex.BLACK, board_size)
    run_environment(
        ex,
        lambda: TransitionBOW(
            [HexFeatures(board_size)],  #ex.width, ex.height)],
            [AttendAt(lambda _: 0, 'hex')],
            ex.n_actions),
        HexLoss(),
    )
예제 #14
0
def test():
    print('')
    print('Proximal Policy Optimization')
    print('')
    args = parse_arguments()
    if args.task == 'mountaincar':
        print('Mountain Car')
        ex = MountainCar()
        run_ppo(
            ex,
            lambda dy_model:
            TransitionBOW(
                          [MountainCarFeatures()],
                          [AttendAt(lambda _: 0, 'mountain_car')],
                          ex.n_actions),
            MountainCarLoss(),
            args.eps,
            args.learner,
        )
    elif args.task == 'cartpole':
        print('Cart Pole')
        ex = CartPoleEnv()
        run_ppo(
            ex,
            lambda dy_model:
            TransitionBOW(
                          [CartPoleFeatures()],
                          [AttendAt(lambda _: 0, 'cartpole')],
                          ex.n_actions),
            CartPoleLoss(),
            args.eps,
            args.learner,
        )
    else:
        print('Unsupported Task!')
        exit(-1)
예제 #15
0
def test1(learning_method, exploration):
    print
    print '# testing learning_method=%d exploration=%d' % (learning_method,
                                                           exploration)
    print
    n_types = 10
    n_labels = 2
    data = macarico.util.make_sequence_mod_data(100, 1, n_types, n_labels)
    data = [Example(x, y, n_labels) for x, y in data]

    bag_size = 5
    tRNN = [
        TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
        for i in range(bag_size)
    ]
    policy = BootstrapPolicy(tRNN, n_labels)
    #policy = LinearPolicy(tRNN[0], n_labels)
    #print 'policy=', policy
    #print 'parameters=', list(policy.parameters())
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    p_rollin_ref = stochastic(ExponentialAnnealing(0.9))
    p_rollout_ref = stochastic(ExponentialAnnealing(0.99999))

    macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        Learner=lambda: BanditLOLS(
            HammingLossReference(),
            policy,
            p_rollin_ref,
            p_rollout_ref,
            learning_method,
            exploration,
        ),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_batch=[p_rollin_ref.step, p_rollout_ref.step],
        train_eval_skip=1,
        n_epochs=2,
    )
예제 #16
0
def run_train(n_types, n_labels, data):
    actor = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
    policy = LinearPolicy(actor, n_labels)
    print 'training'
    _, model = macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        Learner=lambda: MaximumLikelihood(HammingLossReference(), policy),
        losses=HammingLoss(),
        optimizer=torch.optim.Adam(policy.parameters(), lr=0.01),
        n_epochs=2,
        train_eval_skip=1,
        returned_parameters='best',
    )
    print 'evaluating learned model: %g' % \
        macarico.util.evaluate(data, policy, HammingLoss())
    policy.load_state_dict(model)
    print 'evaluating learned model: %g' % \
        macarico.util.evaluate(data, policy, HammingLoss())
    return model
예제 #17
0
def test_rl(environment_name, n_epochs=10000):
    print('rl', environment_name)
    tasks = {
        'pocman': (pocman.MicroPOCMAN, pocman.LocalPOCFeatures, pocman.POCLoss, pocman.POCReference),
        'cartpole': (cartpole.CartPoleEnv, cartpole.CartPoleFeatures, cartpole.CartPoleLoss, None),
        'blackjack': (blackjack.Blackjack, blackjack.BlackjackFeatures, blackjack.BlackjackLoss, None),
        'hex': (hexgame.Hex, hexgame.HexFeatures, hexgame.HexLoss, None),
        'gridworld': (gridworld.make_default_gridworld, gridworld.LocalGridFeatures, gridworld.GridLoss, None),
        'pendulum': (pendulum.Pendulum, pendulum.PendulumFeatures, pendulum.PendulumLoss, None),
        'car': (car.MountainCar, car.MountainCarFeatures, car.MountainCarLoss, None),
        'mdp': (lambda: synth.make_ross_mdp()[0], lambda: mdp.MDPFeatures(3), mdp.MDPLoss, lambda: synth.make_ross_mdp()[1]),
    }
              
    mk_env, mk_fts, loss_fn, ref = tasks[environment_name]
    env = mk_env()
    features = mk_fts()
    
    attention = AttendAt(features, position=lambda _: 0)
    actor = BOWActor([attention], env.n_actions)
    policy = CSOAAPolicy(actor, env.n_actions)
    learner = Reinforce(policy)
    print(learner)
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.001)
    losses, objs = [], []
    for epoch in range(1, 1+n_epochs):
        optimizer.zero_grad()
        env = mk_env()
        env.run_episode(learner)
        loss_val = loss_fn()(env.example)
        obj = learner.get_objective(loss_val)
        if not isinstance(obj, float):
            obj.backward()
            optimizer.step()
            obj = obj.data[0]
        losses.append(loss_val)
        objs.append(obj)
        #losses.append(loss)
        if epoch%100 == 0 or epoch==n_epochs:
            print(epoch, np.mean(losses[-500:]), np.mean(objs[-500:]))
예제 #18
0
def test4():
    print '\n===\n=== test4: big grid world, global features\n==='
    ex = make_big_gridworld()
    run_gridworld(
        ex, lambda: TransitionBOW([GlobalGridFeatures(ex.width, ex.height)],
                                  [AttendAt(lambda _: 0, 'grid')], 4))
예제 #19
0
def test3():
    print '\n===\n=== test3: p_step_success=0.8, but local features only\n==='
    ex = make_default_gridworld(p_step_success=0.8, start_random=True)
    run_gridworld(
        ex, lambda: TransitionBOW([LocalGridFeatures(ex.width, ex.height)],
                                  [AttendAt(lambda _: 0, 'grid')], 4))
예제 #20
0
def test2():
    print '\n===\n=== test2: p_step_success=0.8 and per_step_cost=0.1\n==='
    ex = make_default_gridworld(per_step_cost=0.1, p_step_success=0.8)
    run_gridworld(
        ex, lambda: TransitionBOW([GlobalGridFeatures(ex.width, ex.height)],
                                  [AttendAt(lambda _: 0, 'grid')], 4))
예제 #21
0
def test0():
    print '\n===\n=== test0: p_step_success=1.0\n==='
    ex = make_default_gridworld(p_step_success=1.0)
    run_gridworld(
        ex, lambda: TransitionBOW([GlobalGridFeatures(ex.width, ex.height)],
                                  [AttendAt(lambda _: 0, 'grid')], 4))
예제 #22
0
def test1(task=0, LEARNER=LearnerOpts.DAGGER):
    print
    print 'Running test 1 (v%d) with learner=%s' % (task, LEARNER)
    print '======================================================='

    if task == 0:
        print 'Sequence reversal task, easy version'
        data = macarico.util.make_sequence_reversal_data(100, 5, 5)
        foci = [AttendAt(lambda s: s.N - s.n - 1)]
    elif task == 1:
        print 'Sequence reversal task, hard version'
        data = macarico.util.make_sequence_reversal_data(1000, 5, 5)
        foci = [AttendAt()]
    elif task == 2:
        print 'Sequence reversal task, multi-focus version'
        data = macarico.util.make_sequence_reversal_data(100, 5, 5)
        foci = [AttendAt(), AttendAt(lambda s: s.N - s.n - 1)]
    elif task == 3:
        print 'Memoryless task, add-one mod K'
        data = macarico.util.make_sequence_mod_data(50, 5, 10, 3)
        foci = [AttendAt()]
    elif task == 4:
        print 'Matti-style data'
        data = make_matti_data(1000, 20, 2, 0.05)
        foci = [AttendAt()]

    n_types = 1 + max({x for X, _ in data for x in X})
    n_labels = 1 + max({y for _, Y in data for y in Y})

    data = [Example(x, y, n_labels) for x, y in data]

    random.shuffle(data)
    m = len(data) // 2
    train = data[:m]
    dev = data[m:]

    print 'n_train: %s, n_dev: %s' % (len(train), len(dev))
    print 'n_types: %s, n_labels: %s' % (n_types, n_labels)
    print 'learner:', LEARNER
    print

    tRNN = Actor([RNNFeatures(n_types)], foci, n_labels)
    policy = LinearPolicy(tRNN, n_labels)

    baseline = EWMA(0.8)
    p_rollin_ref = stochastic(ExponentialAnnealing(0.5))
    p_rollout_ref = stochastic(ExponentialAnnealing(0.5))

    if LEARNER == LearnerOpts.AC:
        from macarico.lts.reinforce import AdvantageActorCritic, LinearValueFn
        baseline = LinearValueFn(policy.features)
        policy.vfa = baseline  # adds params to policy via nn.module

    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    if LEARNER == LearnerOpts.DAGGER:
        learner = lambda: DAgger(HammingLossReference(), policy, p_rollin_ref)
    elif LEARNER == LearnerOpts.TWISTED:
        learner = lambda: TwistedDAgger(HammingLossReference(), policy,
                                        p_rollin_ref)
    elif LEARNER == LearnerOpts.MAXLIK:
        learner = lambda: MaximumLikelihood(HammingLossReference(), policy)
    elif LEARNER == LearnerOpts.AC:
        learner = lambda: AdvantageActorCritic(policy, baseline)
    elif LEARNER == LearnerOpts.REINFORCE:
        learner = lambda: Reinforce(policy, baseline)
    elif LEARNER == LearnerOpts.BANDITLOLS:
        learner = lambda: BanditLOLS(HammingLossReference(
        ), policy, p_rollin_ref, p_rollout_ref, BanditLOLS.LEARN_DR, BanditLOLS
                                     .EXPLORE_UNIFORM, baseline)

    macarico.util.trainloop(
        training_data=train,
        dev_data=dev,
        policy=policy,
        Learner=learner,
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step, p_rollout_ref.step],
        n_epochs=10,
        train_eval_skip=1,
    )
예제 #23
0
def test(teacher=True, imitation=True):
    n_card_types = 3
    n_epochs = 500000
    print('Concentration: n_card_types', n_card_types, 'teacher', teacher,
          'imitation', imitation)

    env = Concentration(n_card_types=n_card_types,
                        random_deck_per_episode=True)

    if teacher:
        features = ConcentrationSmartFeatures(n_card_types, cheat=False)
        features = macarico.Torch(features, 20, [
            nn.Linear(features.dim, 20),
            nn.ReLU(),
        ])
        attention = AttendAt(features, position=lambda _: 0)
        actor = BOWActor([attention], env.n_actions)

    else:  # student
        features = ConcentrationPOFeatures()
        attention = AttendAt(features, position=lambda _: 0)
        actor = RNNActor([attention], env.n_actions, d_hid=50)

    policy = CSOAAPolicy(actor, env.n_actions)
    reference = ConcentrationReference()

    learner = DAgger(policy, reference) if imitation else Reinforce(policy)
    loss_fn = ConcentrationLoss()

    print(learner)

    ref_losses = []
    for epoch in range(1000):
        env.run_episode(reference)
        ref_losses.append(loss_fn(env.example))
    print('average reference loss %g' % np.mean(ref_losses))

    rnd_losses = []
    for epoch in range(1000):
        env.run_episode(lambda s: np.random.choice(list(s.actions)))
        rnd_losses.append(loss_fn(env.example))
    print('average random loss %g' % np.mean(rnd_losses))

    optimizer = torch.optim.Adam(policy.parameters(), lr=0.001)
    losses, objs = [], []
    best_loss = None
    for epoch in range(1, 1 + n_epochs):
        optimizer.zero_grad()
        output = env.run_episode(learner)
        loss_val = loss_fn(env.example)
        obj = learner.get_objective(loss_val)
        if not isinstance(obj, float):
            obj.backward()
            optimizer.step()
            obj = obj.item()

        losses.append(loss_val)
        #env.run_episode(policy)
        #losses.append(loss_fn(env.example))

        objs.append(obj)
        #losses.append(loss)
        if epoch % 1000 == 0 or epoch == n_epochs:
            loss = np.mean(losses[-500:])
            if best_loss is None or loss < best_loss[0]:
                best_loss = (loss, epoch)
            print(epoch, 'losses', loss, 'objective',
                  np.mean(objs[-500:]), 'best_loss', best_loss, 'init_losses',
                  np.mean(losses[:1000]), sum(env.example.costs), env.card_seq)
            if loss <= 0.99 * np.mean(ref_losses):
                break
예제 #24
0
def build_random_learner(n_types, n_actions, ref, loss_fn, require_attention):
    # compute base features
    features = np.random.choice([lambda: EmbeddingFeatures(n_types),
                                 lambda: BOWFeatures(n_types)])()

    # optionally run RNN or CNN
    features = np.random.choice([lambda: features,
                                 lambda: RNN(features,
                                             cell_type=np.random.choice(['RNN', 'GRU', 'LSTM', 'QRNN']),
                                             bidirectional=np.random.random() < 0.5),
                                 lambda: DilatedCNN(features)])()

    # maybe some nn magic
    if np.random.random() < 0.5:
        features = macarico.Torch(features,
                                  50, # final dimension, too hard to tell from list of layers :(
                                  [nn.Linear(features.dim, 50),
                                   nn.Tanh(),
                                   nn.Linear(50, 50),
                                   nn.Tanh()])

    # compute some attention
    if require_attention is not None:
        attention = [require_attention(features)]
    else:
        attention = [np.random.choice([lambda: AttendAt(features, 'n'), # or `lambda s: s.n`
                                       lambda: AverageAttention(features),
                                       lambda: FrontBackAttention(features),
                                       lambda: SoftmaxAttention(features)])()] # note: softmax doesn't work with BOWActor
        if np.random.random() < 0.2:
            attention.append(AttendAt(features, lambda s: s.N-s.n))

    # build an actor
    if any((isinstance(x, SoftmaxAttention) for x in attention)):
        actor = RNNActor(attention, n_actions)
    else:
        actor = np.random.choice([lambda: RNNActor(attention,
                                                   n_actions,
                                                   d_actemb=np.random.choice([None,5]),
                                                   cell_type=np.random.choice(['RNN', 'GRU', 'LSTM'])),
                               lambda: BOWActor(attention, n_actions, act_history_length=3, obs_history_length=2)])()

    # do something fun: add a torch module in the middle
    if np.random.random() < 0.5:
        actor = macarico.Torch(actor,
                               27, # final dimension, too hard to tell from list of layers :(
                               [nn.Linear(actor.dim, 27),
                                nn.Tanh()])

    # build the policy
    policy = np.random.choice([lambda: CSOAAPolicy(actor, n_actions, 'huber'),
                            lambda: CSOAAPolicy(actor, n_actions, 'squared'),
                            lambda: WMCPolicy(actor, n_actions, 'huber'),
                            lambda: WMCPolicy(actor, n_actions, 'hinge'),
                            lambda: WMCPolicy(actor, n_actions, 'multinomial'),
                           ])()
    parameters = policy.parameters()

    # build the learner
    if np.random.random() < 0.1: # A2C
        value_fn = LinearValueFn(actor)
        learner = A2C(policy, value_fn)
        parameters = list(parameters) + list(value_fn.parameters())
    else:
        learner = np.random.choice([BehavioralCloning(policy, ref),
                                 DAgger(policy, ref), #, ExponentialAnnealing(0.99))
                                 Coaching(policy, ref, policy_coeff=0.1),
                                 AggreVaTe(policy, ref),
                                 Reinforce(policy),
                                 BanditLOLS(policy, ref),
                                 LOLS(policy, ref, loss_fn)])
    
    return policy, learner, parameters