def test2():  # AggreVaTe
    print()
    print('# test sequence labeler on mod data with AggreVaTe')
    n_types = 10
    n_labels = 4

    data = macarico.util.make_sequence_mod_data(100, 5, n_types, n_labels)
    data = [Example(x, y, n_labels) for x, y in data]

    tRNN = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
    policy = LinearPolicy(tRNN, n_labels)
    p_rollin_ref = stochastic(ExponentialAnnealing(0.99))
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        Learner=lambda: AggreVaTe(HammingLossReference(), policy, p_rollin_ref),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step],
        n_epochs=4,
        train_eval_skip=1,
    )
def test0():
    print()
    print('# test sequence labeler on mod data with DAgger')
    n_types = 10
    n_labels = 4

    data = [Example(x, y, n_labels)
            for x, y in macarico.util.make_sequence_mod_data(100, 5, n_types, n_labels)]

    tRNN = Actor([RNNFeatures(n_types, output_field='mytok_rnn')],
                 [AttendAt(field='mytok_rnn')],
                 n_labels)
    policy = LinearPolicy(tRNN, n_labels)
    p_rollin_ref = stochastic(ExponentialAnnealing(0.99))
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        Learner=lambda: DAgger(HammingLossReference(), policy, p_rollin_ref),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step],
        n_epochs=4,
        train_eval_skip=1,
    )
def test1():
    n_types = 10
    n_labels = 4

    print()
    print('# test sequence labeler on mod data with LOLS')
    data = macarico.util.make_sequence_mod_data(20, 6, n_types, n_labels)
    data = [Example(x, y, n_labels) for x, y in data]

    tRNN = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
    policy = LinearPolicy(tRNN, n_labels)
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)
    p_rollin_ref = stochastic(ExponentialAnnealing(0.9))
    p_rollout_ref = stochastic(ExponentialAnnealing(0.9))

    macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        learning_alg=lambda ex: LOLS.lols(ex, HammingLoss, HammingLossReference(),
                                          policy, p_rollin_ref, p_rollout_ref),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step, p_rollout_ref.step],
        train_eval_skip=1,
    )
def test_wsj():
    print()
    print('# test on wsj subset')
    from macarico.data import nlp_data
    tr, de, te, vocab, label_id = \
        nlp_data.read_wsj_pos('data/wsj.pos', n_tr=50, n_de=50, n_te=0)

    n_types = len(vocab)
    n_labels = len(label_id)

    print('n_train: %s, n_dev: %s, n_test: %s' % (len(tr), len(de), len(te)))
    print('n_types: %s, n_labels: %s' % (n_types, n_labels))

    tRNN = TransitionRNN([RNNFeatures(n_types, rnn_type='RNN')],
                         [AttendAt()],
                         n_labels)
    policy = LinearPolicy(tRNN, n_labels)
    p_rollin_ref = stochastic(ExponentialAnnealing(0.9))
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    macarico.util.trainloop(
        training_data=tr,
        dev_data=de,
        policy=policy,
        Learner=lambda: DAgger(HammingLossReference(), policy, p_rollin_ref),
        # Learner=lambda: MaximumLikelihood(HammingLossReference(), policy),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step],
        n_epochs=10,
        # train_eval_skip=None,
    )
def test1(learning_method, exploration):
    print()
    print('# testing learning_method=%d exploration=%d' %
          (learning_method, exploration))
    print()
    n_types = 10
    n_labels = 4

    data = macarico.util.make_sequence_mod_data(100, 6, n_types, n_labels)
    data = [Example(x, y, n_labels) for x, y in data]

    tRNN = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
    policy = LinearPolicy(tRNN, n_labels)
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.001)
    p_rollin_ref = stochastic(ExponentialAnnealing(0.9))
    p_rollout_ref = stochastic(ExponentialAnnealing(0.99999))

    macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        Learner=lambda: BanditLOLS(HammingLossReference(),
                                   policy,
                                   p_rollin_ref,
                                   p_rollout_ref,
                                   learning_method,  # LEARN_IPS, LEARN_DR, LEARN_BIASED
                                   exploration),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step, p_rollout_ref.step],
        train_eval_skip=10,
    )
def test_restore(n_types, n_labels, data, model):
    actor = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
    policy = LinearPolicy(actor, n_labels)
    print('evaluating new model: %g' %
          macarico.util.evaluate(data, policy, HammingLoss()))
    policy.load_state_dict(model)
    print('evaluating restored model: %g' %
          macarico.util.evaluate(data, policy, HammingLoss()))
def test1(LEARNER=LearnerOpts.DAGGER):
    print()
    print('Running test 1 with learner=%s' % LEARNER)
    print('=======================================================')
    n_states = 3
    n_actions = 2

    tRNN = TransitionRNN([mdp.MDPFeatures(n_states, noise_rate=0.5)],
                         [AttendAt(lambda _: 0, 's')],
                         n_actions)
    policy = LinearPolicy(tRNN, n_actions)

    p_rollin_ref = stochastic(ExponentialAnnealing(0.99))
    p_rollout_ref = stochastic(ExponentialAnnealing(1))
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    test_mdp, pi_ref = make_ross_mdp()

    if LEARNER == LearnerOpts.DAGGER:
        learner = lambda: DAgger(pi_ref, policy, p_rollin_ref)
    elif LEARNER == LearnerOpts.TWISTED:
        learner = lambda: TwistedDAgger(pi_ref, policy, p_rollin_ref)
    elif LEARNER == LearnerOpts.MAXLIK:
        learner = lambda: MaximumLikelihood(pi_ref, policy)
    elif LEARNER == LearnerOpts.AGGREVATE:
        learner = lambda: AggreVaTe(pi_ref, policy, p_rollin_ref)
    elif LEARNER == LearnerOpts.LOLS:
        learner = None

    losses = []
    for epoch in range(101):
        optimizer.zero_grad()
        if learner is not None:
            l = learner()
            env = test_mdp.mk_env()
            res = env.run_episode(l)
            loss = mdp.MDPLoss()(test_mdp, env)
            l.update(loss)
        elif LEARNER == LearnerOpts.LOLS:
            lols(test_mdp, mdp.MDPLoss, pi_ref, policy,
                 p_rollin_ref, p_rollout_ref)

        optimizer.step()
        p_rollin_ref.step()
        p_rollout_ref.step()

        env = test_mdp.mk_env()
        res = env.run_episode(policy)
        loss = mdp.MDPLoss()(test_mdp, env)
        losses.append(loss)
        if epoch % 20 == 0:
            print(epoch, sum(losses[-100:]) / len(losses[-100:]), '\t', res)
def test2():
    print()
    print('blackjack')
    print()
    ex = Blackjack()
    run_environment(
        ex,
        lambda: TransitionBOW([BlackjackFeatures()],  # ex.width, ex.height
                              [AttendAt(lambda _: 0, 'blackjack')],
                              ex.n_actions),
        BlackjackLoss(),
    )
def test1():
    print()
    print('pendulum')
    print()
    ex = Pendulum()
    run_environment(
        ex,
        lambda: TransitionRNN([PendulumFeatures()],  # ex.width, ex.height
                              [AttendAt(lambda _: 0, 'pendulum')],
                              ex.n_actions),
        PendulumLoss(),
    )
def test1(use_bootstrap):
    n_types = 10
    n_labels = 4

    print()
    print('# test sequence labeler on mod data with Reslope and',
          'bootstrap' if use_bootstrap else 'boltzmann', 'exploration')
    data = macarico.util.make_sequence_mod_data(3000, 6, n_types, n_labels)
    data = [Example(x, y, n_labels) for x, y in data]

    if not use_bootstrap:
        tRNN = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
        policy = LinearPolicy(tRNN, n_labels)
    else:
        rnns = [TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels,
                              h_name='h%d' % i)
                for i in range(5)]
        policy = BootstrapPolicy(rnns, n_labels)

    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)
    p_ref = stochastic(ExponentialAnnealing(0.9))

    macarico.util.trainloop(
        training_data=data[:2048],
        dev_data=data[2048:],
        policy=policy,
        Learner=lambda: Reslope(HammingLossReference(), policy, p_ref,
                                exploration=BanditLOLS.EXPLORE_BOOTSTRAP if use_bootstrap
                                            else BanditLOLS.EXPLORE_BOLTZMANN),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_ref.step],
        train_eval_skip=1,
        bandit_evaluation=True,
        n_epochs=1,
    )
def test0():
    print()
    print('micro pocman')
    print()
    ex = MicroPOCMAN()
    run_environment(
        ex,
        lambda: TransitionBOW([LocalPOCFeatures(history_length=4)],  # ex.width, ex.height
                              [AttendAt(lambda _: 0, 'poc')],
                              4),
        POCLoss(),
    )
def test():
    print('')
    print('Cart Pole')
    print('')
    ex = CartPoleEnv()
    run_environment(
        ex,
        lambda: TransitionBOW([CartPoleFeatures()],
                              [AttendAt(lambda _: 0, 'cartpole')],
                              ex.n_actions),
        CartPoleLoss(),
        rl_alg=reinforce,
        n_epochs=100,
        lr=0.1,
    )
def test3():
    print()
    print('hex')
    print()
    board_size = 3
    ex = Hex(Hex.BLACK, board_size)
    run_environment(
        ex,
        lambda: TransitionBOW([HexFeatures(board_size)],  # ex.width, ex.height
                              [AttendAt(lambda _: 0, 'hex')],
                              ex.n_actions),
        HexLoss(),
    )
def test():
    print('')
    print('Proximal Policy Optimization')
    print('')
    args = parse_arguments()
    if args.task == 'mountaincar':
        print('Mountain Car')
        ex = MountainCar()
        run_ppo(
            ex,
            lambda dy_model: TransitionBOW([MountainCarFeatures()],
                                           [AttendAt(lambda _: 0, 'mountain_car')],
                                           ex.n_actions),
            MountainCarLoss(),
            args.eps,
            args.learner,
        )
    elif args.task == 'cartpole':
        print('Cart Pole')
        ex = CartPoleEnv()
        run_ppo(
            ex,
            lambda dy_model: TransitionBOW([CartPoleFeatures()],
                                           [AttendAt(lambda _: 0, 'cartpole')],
                                           ex.n_actions),
            CartPoleLoss(),
            args.eps,
            args.learner,
        )
    else:
        print('Unsupported Task!')
        exit(-1)
def test1(learning_method, exploration):
    print()
    print('# testing learning_method=%d exploration=%d' %
          (learning_method, exploration))
    print()
    n_types = 10
    n_labels = 2

    data = macarico.util.make_sequence_mod_data(100, 1, n_types, n_labels)
    data = [Example(x, y, n_labels) for x, y in data]

    bag_size = 5
    tRNN = [TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
            for i in range(bag_size)]
    policy = BootstrapPolicy(tRNN, n_labels)
    #policy = LinearPolicy(tRNN[0], n_labels)
    #print('policy=', policy)
    #print('parameters=', list(policy.parameters()))

    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)
    p_rollin_ref = stochastic(ExponentialAnnealing(0.9))
    p_rollout_ref = stochastic(ExponentialAnnealing(0.99999))

    macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        Learner=lambda: BanditLOLS(HammingLossReference(),
                                   policy,
                                   p_rollin_ref,
                                   p_rollout_ref,
                                   learning_method,
                                   exploration),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_batch=[p_rollin_ref.step, p_rollout_ref.step],
        train_eval_skip=1,
        n_epochs=2,
    )
def run_train(n_types, n_labels, data):
    actor = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
    policy = LinearPolicy(actor, n_labels)

    print('training')
    _, model = macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        Learner=lambda: MaximumLikelihood(HammingLossReference(), policy),
        losses=HammingLoss(),
        optimizer=torch.optim.Adam(policy.parameters(), lr=0.01),
        n_epochs=2,
        train_eval_skip=1,
        returned_parameters='best',
    )

    print('evaluating learned model: %g' %
          macarico.util.evaluate(data, policy, HammingLoss()))
    policy.load_state_dict(model)
    print('evaluating restored best model: %g' %
          macarico.util.evaluate(data, policy, HammingLoss()))
    return model
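# A minimal driver tying run_train and test_restore together. This is an
# illustrative sketch, not part of the original test file: it assumes the same
# module-level imports as the functions above (macarico.util, Example, etc.)
# and reuses the mod-data setup from the other sequence-labeling tests.
def _run_train_then_restore():
    n_types, n_labels = 10, 4
    data = [Example(x, y, n_labels)
            for x, y in macarico.util.make_sequence_mod_data(100, 5, n_types, n_labels)]
    model = run_train(n_types, n_labels, data)    # returns the 'best' parameters
    test_restore(n_types, n_labels, data, model)  # a fresh policy should match after loading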
def test_rl(environment_name, n_epochs=10000):
    print('rl', environment_name)
    tasks = {
        'pocman': (pocman.MicroPOCMAN, pocman.LocalPOCFeatures, pocman.POCLoss, pocman.POCReference),
        'cartpole': (cartpole.CartPoleEnv, cartpole.CartPoleFeatures, cartpole.CartPoleLoss, None),
        'blackjack': (blackjack.Blackjack, blackjack.BlackjackFeatures, blackjack.BlackjackLoss, None),
        'hex': (hexgame.Hex, hexgame.HexFeatures, hexgame.HexLoss, None),
        'gridworld': (gridworld.make_default_gridworld, gridworld.LocalGridFeatures, gridworld.GridLoss, None),
        'pendulum': (pendulum.Pendulum, pendulum.PendulumFeatures, pendulum.PendulumLoss, None),
        'car': (car.MountainCar, car.MountainCarFeatures, car.MountainCarLoss, None),
        'mdp': (lambda: synth.make_ross_mdp()[0],
                lambda: mdp.MDPFeatures(3),
                mdp.MDPLoss,
                lambda: synth.make_ross_mdp()[1]),
    }

    mk_env, mk_fts, loss_fn, ref = tasks[environment_name]
    env = mk_env()
    features = mk_fts()
    attention = AttendAt(features, position=lambda _: 0)
    actor = BOWActor([attention], env.n_actions)
    policy = CSOAAPolicy(actor, env.n_actions)
    learner = Reinforce(policy)
    print(learner)

    optimizer = torch.optim.Adam(policy.parameters(), lr=0.001)
    losses, objs = [], []
    for epoch in range(1, 1 + n_epochs):
        optimizer.zero_grad()
        env = mk_env()
        env.run_episode(learner)
        loss_val = loss_fn()(env.example)
        obj = learner.get_objective(loss_val)
        if not isinstance(obj, float):
            obj.backward()
            optimizer.step()
            obj = obj.item()
        losses.append(loss_val)
        objs.append(obj)
        #losses.append(loss)
        if epoch % 100 == 0 or epoch == n_epochs:
            print(epoch, np.mean(losses[-500:]), np.mean(objs[-500:]))
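# Example invocation (any key of the `tasks` dict above works); the epoch
# count here is illustrative:
#     test_rl('gridworld', n_epochs=2000)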
def test4():
    print('\n===\n=== test4: big grid world, global features\n===')
    ex = make_big_gridworld()
    run_gridworld(
        ex,
        lambda: TransitionBOW([GlobalGridFeatures(ex.width, ex.height)],
                              [AttendAt(lambda _: 0, 'grid')],
                              4))
def test3():
    print('\n===\n=== test3: p_step_success=0.8, but local features only\n===')
    ex = make_default_gridworld(p_step_success=0.8, start_random=True)
    run_gridworld(
        ex,
        lambda: TransitionBOW([LocalGridFeatures(ex.width, ex.height)],
                              [AttendAt(lambda _: 0, 'grid')],
                              4))
def test2():
    print('\n===\n=== test2: p_step_success=0.8 and per_step_cost=0.1\n===')
    ex = make_default_gridworld(per_step_cost=0.1, p_step_success=0.8)
    run_gridworld(
        ex,
        lambda: TransitionBOW([GlobalGridFeatures(ex.width, ex.height)],
                              [AttendAt(lambda _: 0, 'grid')],
                              4))
def test0():
    print('\n===\n=== test0: p_step_success=1.0\n===')
    ex = make_default_gridworld(p_step_success=1.0)
    run_gridworld(
        ex,
        lambda: TransitionBOW([GlobalGridFeatures(ex.width, ex.height)],
                              [AttendAt(lambda _: 0, 'grid')],
                              4))
def test1(task=0, LEARNER=LearnerOpts.DAGGER):
    print()
    print('Running test 1 (v%d) with learner=%s' % (task, LEARNER))
    print('=======================================================')

    if task == 0:
        print('Sequence reversal task, easy version')
        data = macarico.util.make_sequence_reversal_data(100, 5, 5)
        foci = [AttendAt(lambda s: s.N - s.n - 1)]
    elif task == 1:
        print('Sequence reversal task, hard version')
        data = macarico.util.make_sequence_reversal_data(1000, 5, 5)
        foci = [AttendAt()]
    elif task == 2:
        print('Sequence reversal task, multi-focus version')
        data = macarico.util.make_sequence_reversal_data(100, 5, 5)
        foci = [AttendAt(), AttendAt(lambda s: s.N - s.n - 1)]
    elif task == 3:
        print('Memoryless task, add-one mod K')
        data = macarico.util.make_sequence_mod_data(50, 5, 10, 3)
        foci = [AttendAt()]
    elif task == 4:
        print('Matti-style data')
        data = make_matti_data(1000, 20, 2, 0.05)
        foci = [AttendAt()]

    n_types = 1 + max({x for X, _ in data for x in X})
    n_labels = 1 + max({y for _, Y in data for y in Y})

    data = [Example(x, y, n_labels) for x, y in data]
    random.shuffle(data)
    m = len(data) // 2
    train = data[:m]
    dev = data[m:]

    print('n_train: %s, n_dev: %s' % (len(train), len(dev)))
    print('n_types: %s, n_labels: %s' % (n_types, n_labels))
    print('learner:', LEARNER)
    print()

    tRNN = Actor([RNNFeatures(n_types)], foci, n_labels)
    policy = LinearPolicy(tRNN, n_labels)

    baseline = EWMA(0.8)
    p_rollin_ref = stochastic(ExponentialAnnealing(0.5))
    p_rollout_ref = stochastic(ExponentialAnnealing(0.5))

    if LEARNER == LearnerOpts.AC:
        from macarico.lts.reinforce import AdvantageActorCritic, LinearValueFn
        baseline = LinearValueFn(policy.features)
        policy.vfa = baseline  # adds params to policy via nn.Module

    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    if LEARNER == LearnerOpts.DAGGER:
        learner = lambda: DAgger(HammingLossReference(), policy, p_rollin_ref)
    elif LEARNER == LearnerOpts.TWISTED:
        learner = lambda: TwistedDAgger(HammingLossReference(), policy, p_rollin_ref)
    elif LEARNER == LearnerOpts.MAXLIK:
        learner = lambda: MaximumLikelihood(HammingLossReference(), policy)
    elif LEARNER == LearnerOpts.AC:
        learner = lambda: AdvantageActorCritic(policy, baseline)
    elif LEARNER == LearnerOpts.REINFORCE:
        learner = lambda: Reinforce(policy, baseline)
    elif LEARNER == LearnerOpts.BANDITLOLS:
        learner = lambda: BanditLOLS(HammingLossReference(), policy,
                                     p_rollin_ref, p_rollout_ref,
                                     BanditLOLS.LEARN_DR,
                                     BanditLOLS.EXPLORE_UNIFORM,
                                     baseline)

    macarico.util.trainloop(
        training_data=train,
        dev_data=dev,
        policy=policy,
        Learner=learner,
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step, p_rollout_ref.step],
        n_epochs=10,
        train_eval_skip=1,
    )
def test(teacher=True, imitation=True):
    n_card_types = 3
    n_epochs = 500000
    print('Concentration: n_card_types', n_card_types,
          'teacher', teacher, 'imitation', imitation)

    env = Concentration(n_card_types=n_card_types, random_deck_per_episode=True)

    if teacher:
        features = ConcentrationSmartFeatures(n_card_types, cheat=False)
        features = macarico.Torch(features, 20,
                                  [nn.Linear(features.dim, 20),
                                   nn.ReLU()])
        attention = AttendAt(features, position=lambda _: 0)
        actor = BOWActor([attention], env.n_actions)
    else:  # student
        features = ConcentrationPOFeatures()
        attention = AttendAt(features, position=lambda _: 0)
        actor = RNNActor([attention], env.n_actions, d_hid=50)

    policy = CSOAAPolicy(actor, env.n_actions)
    reference = ConcentrationReference()
    learner = DAgger(policy, reference) if imitation else Reinforce(policy)
    loss_fn = ConcentrationLoss()
    print(learner)

    ref_losses = []
    for epoch in range(1000):
        env.run_episode(reference)
        ref_losses.append(loss_fn(env.example))
    print('average reference loss %g' % np.mean(ref_losses))

    rnd_losses = []
    for epoch in range(1000):
        env.run_episode(lambda s: np.random.choice(list(s.actions)))
        rnd_losses.append(loss_fn(env.example))
    print('average random loss %g' % np.mean(rnd_losses))

    optimizer = torch.optim.Adam(policy.parameters(), lr=0.001)
    losses, objs = [], []
    best_loss = None
    for epoch in range(1, 1 + n_epochs):
        optimizer.zero_grad()
        output = env.run_episode(learner)
        loss_val = loss_fn(env.example)
        obj = learner.get_objective(loss_val)
        if not isinstance(obj, float):
            obj.backward()
            optimizer.step()
            obj = obj.item()
        losses.append(loss_val)
        #env.run_episode(policy)
        #losses.append(loss_fn(env.example))
        objs.append(obj)
        #losses.append(loss)
        if epoch % 1000 == 0 or epoch == n_epochs:
            loss = np.mean(losses[-500:])
            if best_loss is None or loss < best_loss[0]:
                best_loss = (loss, epoch)
            print(epoch, 'losses', loss, 'objective', np.mean(objs[-500:]),
                  'best_loss', best_loss,
                  'init_losses', np.mean(losses[:1000]),
                  sum(env.example.costs), env.card_seq)
            if loss <= 0.99 * np.mean(ref_losses):
                break
def build_random_learner(n_types, n_actions, ref, loss_fn, require_attention):
    # compute base features
    features = np.random.choice([lambda: EmbeddingFeatures(n_types),
                                 lambda: BOWFeatures(n_types)])()

    # optionally run RNN or CNN
    features = np.random.choice([lambda: features,
                                 lambda: RNN(features,
                                             cell_type=np.random.choice(['RNN', 'GRU', 'LSTM', 'QRNN']),
                                             bidirectional=np.random.random() < 0.5),
                                 lambda: DilatedCNN(features)])()

    # maybe some nn magic
    if np.random.random() < 0.5:
        features = macarico.Torch(features,
                                  50,  # final dimension, too hard to tell from list of layers :(
                                  [nn.Linear(features.dim, 50),
                                   nn.Tanh(),
                                   nn.Linear(50, 50),
                                   nn.Tanh()])

    # compute some attention
    if require_attention is not None:
        attention = [require_attention(features)]
    else:
        attention = [np.random.choice([lambda: AttendAt(features, 'n'),  # or `lambda s: s.n`
                                       lambda: AverageAttention(features),
                                       lambda: FrontBackAttention(features),
                                       lambda: SoftmaxAttention(features)])()]  # note: softmax doesn't work with BOWActor
        if np.random.random() < 0.2:
            attention.append(AttendAt(features, lambda s: s.N - s.n))

    # build an actor
    if any((isinstance(x, SoftmaxAttention) for x in attention)):
        actor = RNNActor(attention, n_actions)
    else:
        actor = np.random.choice([lambda: RNNActor(attention,
                                                   n_actions,
                                                   d_actemb=np.random.choice([None, 5]),
                                                   cell_type=np.random.choice(['RNN', 'GRU', 'LSTM'])),
                                  lambda: BOWActor(attention, n_actions,
                                                   act_history_length=3,
                                                   obs_history_length=2)])()

    # do something fun: add a torch module in the middle
    if np.random.random() < 0.5:
        actor = macarico.Torch(actor,
                               27,  # final dimension, too hard to tell from list of layers :(
                               [nn.Linear(actor.dim, 27),
                                nn.Tanh()])

    # build the policy
    policy = np.random.choice([lambda: CSOAAPolicy(actor, n_actions, 'huber'),
                               lambda: CSOAAPolicy(actor, n_actions, 'squared'),
                               lambda: WMCPolicy(actor, n_actions, 'huber'),
                               lambda: WMCPolicy(actor, n_actions, 'hinge'),
                               lambda: WMCPolicy(actor, n_actions, 'multinomial')])()
    parameters = policy.parameters()

    # build the learner
    if np.random.random() < 0.1:  # A2C
        value_fn = LinearValueFn(actor)
        learner = A2C(policy, value_fn)
        parameters = list(parameters) + list(value_fn.parameters())
    else:
        learner = np.random.choice([BehavioralCloning(policy, ref),
                                    DAgger(policy, ref),  # , ExponentialAnnealing(0.99))
                                    Coaching(policy, ref, policy_coeff=0.1),
                                    AggreVaTe(policy, ref),
                                    Reinforce(policy),
                                    BanditLOLS(policy, ref),
                                    LOLS(policy, ref, loss_fn)])

    return policy, learner, parameters
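# Hypothetical smoke test for build_random_learner (illustrative only, not from
# the original file): draw one random policy/learner pair for a toy
# sequence-labeling setup and check that its parameters can be handed to an
# optimizer. Passing the HammingLoss class (rather than an instance) as loss_fn
# is an assumption here, mirroring how LOLS.lols is invoked in the tests above.
def _smoke_test_build_random_learner():
    policy, learner, parameters = build_random_learner(
        n_types=10, n_actions=4,
        ref=HammingLossReference(), loss_fn=HammingLoss,
        require_attention=None)
    optimizer = torch.optim.Adam(parameters, lr=0.001)
    print(policy)
    print(learner)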