示例#1
0
def learning_algorithm(args):
    name = args.algorithm
    learning_rate = float(args.learning_rate)
    momentum = args.momentum
    clipping_threshold = args.clipping
    clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
    if name == 'adam':
        adam = Adam(learning_rate=learning_rate)
        step_rule = CompositeRule([adam, clipping])
        learning_rate = adam.learning_rate
    elif name == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        step_rule = CompositeRule([clipping, rms_prop])
        learning_rate = rms_prop.learning_rate
    elif name == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        step_rule = CompositeRule([clipping, sgd_momentum])
        learning_rate = sgd_momentum.learning_rate
    elif name == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        step_rule = CompositeRule([clipping, sgd])
        learning_rate = sgd.learning_rate
    else:
        raise NotImplementedError
    return step_rule, learning_rate
示例#2
0
 def algorithm(self):
     if self._algorithm is None:
         self._algorithm = GradientDescent(cost=self.cost,
                                           parameters=self.parameters,
                                           step_rule=CompositeRule(
                                               self.step_rules))
     return self._algorithm
示例#3
0
def test_composite_rule():
    rule = CompositeRule([StepClipping(4), Scale(0.1)])
    gradients = {0: shared_floatx(3.0), 1: shared_floatx(4.0)}
    result, _ = rule.compute_steps(gradients)
    assert_allclose(result[0].eval(), 12 / 50.0)
    assert_allclose(result[1].eval(), 16 / 50.0)

    class RuleWithUpdates(StepRule):
        def __init__(self, updates):
            self.updates = updates

        def compute_steps(self, previous_steps):
            return previous_steps, self.updates

    rule = CompositeRule([RuleWithUpdates([(1, 2)]),
                          RuleWithUpdates([(3, 4)])])
    assert rule.compute_steps(None)[1] == [(1, 2), (3, 4)]
def learning_algorithm(args):
    name = args.algorithm
    learning_rate = float(args.learning_rate)
    momentum = args.momentum
    clipping_threshold = args.clipping
    if name == 'adam':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        adam = Adam(learning_rate=learning_rate)
        step_rule = CompositeRule([adam, clipping])
    elif name == 'rms_prop':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        rms_prop = RMSProp(learning_rate=learning_rate)
        step_rule = CompositeRule([clipping, rms_prop])
    else:
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        step_rule = CompositeRule([clipping, sgd_momentum])
    return step_rule
示例#5
0
def learning_algorithm(args):
    name = args.algorithm
    learning_rate = float(args.learning_rate)
    momentum = args.momentum
    clipping_threshold = args.clipping
    if name == 'adam':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        adam = Adam(learning_rate=learning_rate)
        # [adam, clipping] means 'step clipping'
        # [clipping, adam] means 'gradient clipping'
        step_rule = CompositeRule([adam, clipping])
    elif name == 'rms_prop':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        rms_prop = RMSProp(learning_rate=learning_rate)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, rms_prop, rm_non_finite])
    else:
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, sgd_momentum, rm_non_finite])
    return step_rule
示例#6
0
def learning_algorithm(learning_rate,
                       momentum=0.0,
                       clipping_threshold=100,
                       algorithm='sgd'):
    if algorithm == 'adam':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        adam = Adam(learning_rate=learning_rate)
        # [adam, clipping] means 'step clipping'
        # [clipping, adam] means 'gradient clipping'
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        rms_prop = RMSProp(learning_rate=learning_rate)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, rms_prop, rm_non_finite])
    elif algorithm == 'sgd':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, sgd_momentum, rm_non_finite])
    else:
        raise NotImplementedError
    return step_rule
def learning_algorithm(args):
    name = args.algorithm
    learning_rate = float(args.learning_rate)
    momentum = args.momentum
    clipping_threshold = args.clipping
    if name == 'adam':
        adam = Adam(learning_rate=learning_rate)
        step_rule = adam
    elif name == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate, decay_rate=0.9)
        step_rule = CompositeRule([StepClipping(1.), rms_prop])
    else:
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        step_rule = sgd_momentum
    return step_rule
示例#8
0
    def get_optimizer(self):
        optimizer_dict = self.config['optimizer'].copy()
        lr = optimizer_dict['learning_rate']
        optimizer_param_dict = {k: optimizer_dict[k] for k \
                                in set(optimizer_dict.keys()) - set(['type', 'learning_rate'])}

        if len(optimizer_param_dict.keys()) > 0:
            optimizer = self.Optimizer(learning_rate=lr,
                                       **optimizer_param_dict)
        else:
            optimizer = self.Optimizer(lr)

        if dmconfig.floatX == 'float16':
            print "WARNING: float16 makes the training unstable, inserting RemoveNotFinite in optimizer"
            optimizer = CompositeRule([optimizer, RemoveNotFinite()])

        return optimizer
def test_composite_rule():
    rule = CompositeRule([StepClipping(4), Scale(0.1)])
    gradients = {0: shared_floatx(3.0), 1: shared_floatx(4.0)}
    result, _ = rule.compute_steps(gradients)
    assert_allclose(result[0].eval(), 12 / 50.0)
    assert_allclose(result[1].eval(), 16 / 50.0)

    class RuleWithUpdates(StepRule):
        def __init__(self, updates):
            self.updates = updates

        def compute_steps(self, previous_steps):
            return previous_steps, self.updates

    rule = CompositeRule([RuleWithUpdates([(1, 2)]),
                          RuleWithUpdates([(3, 4)])])
    assert rule.compute_steps(None)[1] == [(1, 2), (3, 4)]
示例#10
0
def test_composite_rule():
    rule = CompositeRule([GradientClipping(4), SteepestDescent(0.1)])
    gradients = {0: shared_floatx(3.0), 1: shared_floatx(4.0)}
    result = rule.compute_steps(gradients)
    assert_allclose(result[0].eval(), -12 / 50.0)
    assert_allclose(result[1].eval(), -16 / 50.0)

    class RuleWithUpdates(StepRule):
        def __init__(self, updates):
            self.updates = updates

        def additional_updates(self):
            return self.updates

    rule = CompositeRule([RuleWithUpdates([(1, 2)]),
                          RuleWithUpdates([(3, 4)])])
    assert rule.additional_updates() == [(1, 2), (3, 4)]
示例#11
0
    def train(self, data_file, output_data_file, n_epochs=0):

        training_data = dataset.T_H5PYDataset(data_file, which_sets=('train',))
        test_data = dataset.T_H5PYDataset(data_file, which_sets=('test',))

        session = Session(root_url='http://localhost:5006')

        if self.MainLoop is None:
            step_rules = [RMSProp(learning_rate=0.2, decay_rate=0.95), StepClipping(1)]

            algorithm = GradientDescent(cost=self.Cost,
                                        parameters=self.ComputationGraph.parameters,
                                        step_rule=CompositeRule(step_rules),
                                        on_unused_sources='ignore')

            train_stream = DataStream.default_stream(
                training_data, iteration_scheme=SequentialScheme(
                    training_data.num_examples, batch_size=100))

            test_stream = DataStream.default_stream(
                test_data, iteration_scheme=SequentialScheme(
                    test_data.num_examples, batch_size=100))

            self.MainLoop = MainLoop(
                model=Model(self.Cost),
                data_stream=train_stream,
                algorithm=algorithm,
                extensions=[
                    FinishAfter(after_n_epochs=n_epochs),
                    Printing(),
                    Checkpoint(output_data_file, every_n_epochs=50),
                    TrainingDataMonitoring([self.Cost], after_batch=True, prefix='train'),
                    DataStreamMonitoring([self.Cost], after_batch=True, data_stream=test_stream, prefix='test'),
                    Plot(output_data_file, channels=[['train_cost', 'test_cost']])
                ])

        self.MainLoop.run()
示例#12
0
def ali_algorithm(discriminator_loss, discriminator_parameters,
                  discriminator_step_rule, generator_loss,
                  generator_parameters, generator_step_rule):
    """Instantiates a training algorithm for ALI.

    Parameters
    ----------
    discriminator_loss : tensor variable
        Discriminator loss.
    discriminator_parameters : list
        Discriminator parameters.
    discriminator_step_rule : :class:`blocks.algorithms.StepRule`
        Discriminator step rule.
    generator_loss : tensor variable
        Generator loss.
    generator_parameters : list
        Generator parameters.
    generator_step_rule : :class:`blocks.algorithms.StepRule`
        Generator step rule.
    """
    gradients = OrderedDict()
    gradients.update(
        zip(discriminator_parameters,
            theano.grad(discriminator_loss, discriminator_parameters)))
    gradients.update(
        zip(generator_parameters,
            theano.grad(generator_loss, generator_parameters)))
    step_rule = CompositeRule([
        Restrict(discriminator_step_rule, discriminator_parameters),
        Restrict(generator_step_rule, generator_parameters)
    ])
    return GradientDescent(cost=generator_loss + discriminator_loss,
                           gradients=gradients,
                           parameters=discriminator_parameters +
                           generator_parameters,
                           step_rule=step_rule,
                           on_unused_sources='warn')
示例#13
0
def setup_algorithms(cost, cg, method, type="ff"):
    """Setup training algorithm.

    Parameters
    ----------
    cost : expression
        cost expression
    cg : ComputationGraph
        Computation graph
    method : string
        training method: SGD, momentum SGD, AdaGrad, RMSprop
    learning_rate : float
        learning rate for learning method

    Returns
    -------
    algorithm : GradientDescent
        Gradient Descent algorithm based on different optimization method
    """
    if method == "sgd":
        step_rule = Scale(learning_rate=0.01)
    elif method == "momentum":
        step_rule = Momentum(learning_rate=0.01, momentum=0.95)
    elif method == "adagrad":
        step_rule = AdaGrad()
    elif method == "rmsprop":
        step_rule = RMSProp()

    if type == "RNN":
        step_rule = CompositeRule([StepClipping(1.0), step_rule])

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=step_rule)

    return algorithm
示例#14
0
    def train(self, training_data):

        step_rules = [Adam(), StepClipping(1.0)]

        algorithm = GradientDescent(
            cost=self.Cost,
            parameters=self.ComputationGraph.parameters,
            step_rule=CompositeRule(step_rules))

        train_stream = DataStream.default_stream(
            training_data,
            iteration_scheme=SequentialScheme(training_data.num_examples,
                                              batch_size=20))

        main = MainLoop(model=Model(self.Cost),
                        data_stream=train_stream,
                        algorithm=algorithm,
                        extensions=[
                            FinishAfter(),
                            Printing(),
                            Checkpoint('trainingdata.tar', every_n_epochs=10)
                        ])

        main.run()
示例#15
0
def main(save_to, num_epochs,
         weight_decay=0.0001, noise_pressure=0, subset=None, num_batches=None,
         batch_size=None, histogram=None, resume=False):
    output_size = 10

    prior_noise_level = -10
    noise_step_rule = Scale(1e-6)
    noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX))
    convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True,
            noise_rate=noise_rate,
            prior_noise_level=prior_noise_level)

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
            .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                  .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                  .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)

    # train_cost, train_error_rate, train_components = train_cg.outputs

    with batch_normalization(convnet):
        with training_noise(convnet):
            train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(y.flatten(),
                train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(y.flatten(),
                train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                train_error_rate, train_components])
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                for p, m in population_updates]

    # for annealing
    nit_penalty = theano.shared(numpy.asarray(noise_pressure, dtype=theano.config.floatX))
    nit_penalty.name = 'nit_penalty'

    # Compute noise rates for training graph
    train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables)
    train_mean_log_sigma = tensor.concatenate([n.flatten() for n in train_logsigma]).mean()
    train_mean_log_sigma.name = 'mean_log_sigma'
    train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables)
    train_nit_rate = tensor.concatenate([n.flatten() for n in train_nits]).mean()
    train_nit_rate.name = 'nit_rate'
    train_nit_regularization = nit_penalty * train_nit_rate
    train_nit_regularization.name = 'nit_regularization'

    # Apply regularization to the cost
    trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])(
            train_cg.parameters)
    mask_parameters = [p for p in trainable_parameters
            if get_brick(p).name == 'mask']
    noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters)
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    nonmask_weights = [p for p in weights if get_brick(p).name != 'mask']
    l2_norm = sum([(W ** 2).sum() for W in nonmask_weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = weight_decay * l2_norm
    l2_regularization.name = 'l2_regularization'

    # testversion
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + l2_regularization + train_nit_regularization
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
        which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 128
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test,
        iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # Create a step rule that reduces the learning rate of noise
    scale_mask = Restrict(noise_step_rule, mask_parameters)
    step_rule = CompositeRule([scale_mask, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=trainable_parameters,
        step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    #,
    #    theano_func_kwargs={
    #        'mode': NanGuardMode(
    #            nan_is_error=True, inf_is_error=True, big_is_error=True)})

    exp_name = save_to.replace('.%d', '')

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),     # Warm up with 0.01 learning rate
                      (50, 0.1),     # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  EpochSchedule(noise_step_rule.learning_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4)
                  ]),
                  EpochSchedule(noise_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4),
                      # (6, 3e-4),
                      # (8, 1e-3), # Causes nit rate to jump
                      # (10, 3e-3),
                      # (12, 1e-2),
                      # (15, 3e-2),
                      # (19, 1e-1),
                      # (24, 3e-1),
                      # (30, 1)
                  ]),
                  NoiseExtension(
                      noise_parameters=noise_parameters),
                  NoisyDataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      noise_parameters=noise_parameters,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate, train_nit_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       train_nit_regularization,
                       momentum.learning_rate,
                       train_mean_log_sigma,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                      # after_epoch=True),
                  Plot('Training performance for ' + exp_name,
                      channels=[
                          ['train_cost_with_regularization',
                           'train_cost_without_regularization',
                           'train_nit_regularization',
                           'train_l2_regularization'],
                          ['train_error_rate'],
                          ['train_total_gradient_norm'],
                          ['train_mean_log_sigma'],
                      ],
                      every_n_batches=17),
                  Plot('Test performance for ' + exp_name,
                      channels=[[
                          'train_error_rate',
                          'test_error_rate',
                          ]],
                      after_epoch=True),
                  EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            parameters=cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(exp_name, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
示例#16
0
y_hat, cost, cells = nn_fprop(x, y, vocab_size, hidden_size, num_layers, model)

# COST
cg = ComputationGraph(cost)

if dropout > 0:
    # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015)
    inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(cg.variables)
    cg = apply_dropout(cg, inputs, dropout)
    cost = cg.outputs[0]

# Learning algorithm
step_rules = [RMSProp(learning_rate=learning_rate, decay_rate=decay_rate),
              StepClipping(step_clipping)]
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules))

# Extensions
gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
step_norm = aggregation.mean(algorithm.total_step_norm)
monitored_vars = [cost, gradient_norm, step_norm]

dev_monitor = DataStreamMonitoring(variables=[cost], after_epoch=True,
                                   before_first_epoch=True, data_stream=dev_stream, prefix="dev")
train_monitor = TrainingDataMonitoring(variables=monitored_vars, after_batch=True,
                                       before_first_epoch=True, prefix='tra')

extensions = [dev_monitor, train_monitor, Timing(), Printing(after_batch=True),
              FinishAfter(after_n_epochs=nepochs),
              saveload.Load(load_path),
              saveload.Checkpoint(last_path),
示例#17
0
def main(name, epochs, batch_size, learning_rate, attention, n_iter, enc_dim,
         dec_dim, z_dim):

    if name is None:
        tag = "watt" if attention else "woatt"
        name = "%s-t%d-enc%d-dec%d-z%d" % (tag, n_iter, enc_dim, dec_dim,
                                           z_dim)

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print()

    #------------------------------------------------------------------------

    x_dim = 28 * 28
    img_height, img_width = (28, 28)

    rnninits = {
        'weights_init': Orthogonal(),
        #'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': Orthogonal(),
        #'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    prior_mu = T.zeros([z_dim])
    prior_log_sigma = T.zeros([z_dim])

    if attention:
        read_N = 4
        write_N = 6
        read_dim = 2 * read_N**2

        reader = AttentionReader(x_dim=x_dim,
                                 dec_dim=dec_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=read_N,
                                 **inits)
        writer = AttentionWriter(input_dim=dec_dim,
                                 output_dim=x_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=read_N,
                                 **inits)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

    encoder = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Tanh()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc",
                      **inits)
    decoder_mlp = MLP([Tanh()], [z_dim, 4 * dec_dim], name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    for brick in [
            reader, writer, encoder, decoder, encoder_mlp, decoder_mlp,
            q_sampler
    ]:
        brick.allocate()
        brick.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    # This is one iteration
    def one_iteration(c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec,
                      x):
        x_hat = x - T.nnet.sigmoid(c)
        r = reader.apply(x, x_hat, h_dec)
        i_enc = encoder_mlp.apply(T.concatenate([r, h_dec], axis=1))
        h_enc, c_enc = encoder.apply(states=h_enc,
                                     cells=c_enc,
                                     inputs=i_enc,
                                     iterate=False)
        z_mean, z_log_sigma, z = q_sampler.apply(h_enc)
        i_dec = decoder_mlp.apply(z)
        h_dec, c_dec = decoder.apply(states=h_dec,
                                     cells=c_dec,
                                     inputs=i_dec,
                                     iterate=False)
        c = c + writer.apply(h_dec)
        return c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec

    outputs_info = [
        T.zeros([batch_size, x_dim]),  # c
        T.zeros([batch_size, enc_dim]),  # h_enc
        T.zeros([batch_size, enc_dim]),  # c_enc
        T.zeros([batch_size, z_dim]),  # z_mean
        T.zeros([batch_size, z_dim]),  # z_log_sigma
        T.zeros([batch_size, z_dim]),  # z
        T.zeros([batch_size, dec_dim]),  # h_dec
        T.zeros([batch_size, dec_dim]),  # c_dec
    ]

    outputs, scan_updates = theano.scan(fn=one_iteration,
                                        sequences=[],
                                        outputs_info=outputs_info,
                                        non_sequences=[x],
                                        n_steps=n_iter)

    c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec = outputs

    kl_terms = (prior_log_sigma - z_log_sigma + 0.5 *
                (tensor.exp(2 * z_log_sigma) +
                 (z_mean - prior_mu)**2) / tensor.exp(2 * prior_log_sigma) -
                0.5).sum(axis=-1)

    x_recons = T.nnet.sigmoid(c[-1, :, :])
    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            #StepClipping(3.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        x_recons_t = T.nnet.sigmoid(c[t, :, :])
        recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        recons_term_t = recons_term_t.mean()
        recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t, recons_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [["train_nll_bound", "test_nll_bound"],
                     ["train_kl_term_%d" % t for t in range(n_iter)],
                     ["train_recons_term_%d" % t for t in range(n_iter)],
                     ["train_total_gradient_norm", "train_total_step_norm"]]

    #------------------------------------------------------------

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])
    #mnist_train = MNIST("train", binary=True, sources=['features'])
    #mnist_test = MNIST("test", binary=True, sources=['features'])

    main_loop = MainLoop(
        model=None,
        data_stream=ForceFloatX(
            DataStream(mnist_train,
                       iteration_scheme=SequentialScheme(
                           mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(
                    DataStream(mnist_test,
                               iteration_scheme=SequentialScheme(
                                   mnist_test.num_examples, batch_size))),
                updates=scan_updates,
                prefix="test"),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()
n_entities = 550
embed_size = 200

ctx_lstm_size = [256]
ctx_skip_connections = True

question_lstm_size = [256]
question_skip_connections = True

attention_mlp_hidden = [100]
attention_mlp_activations = [Tanh()]

out_mlp_hidden = []
out_mlp_activations = []

step_rule = CompositeRule([
    RMSProp(decay_rate=0.95, learning_rate=5e-5),
    BasicMomentum(momentum=0.9)
])

dropout = 0.2
w_noise = 0.

valid_freq = 1000
save_freq = 1000
print_freq = 100

weights_init = IsotropicGaussian(0.01)
biases_init = Constant(0.)
示例#19
0
def main(config, tr_stream, dev_stream, use_bokeh=False, the_task=None, the_track=None):

    config['the_task'] = the_task
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        # end_embed is dimension of word embedding matrix in encoder; enc_nhids number of hidden units in encoder GRU
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, config['use_attention'], cost_type=config['error_fct'])
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)
    testVar = decoder.getTestVar(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)
   
    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    my_rng = numpy.random.RandomState(config['rng_value']) 
    if config['identity_init']:
      encoder.weights_init = decoder.weights_init = Identity()
    else:
      encoder.weights_init = decoder.weights_init = IsotropicGaussian(
          config['weight_scale'])
      encoder.rng = decoder.rng = my_rng
    
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    encoder.bidir.prototype.rng = my_rng
    decoder.transition.weights_init = Orthogonal()
    decoder.transition.rng = my_rng
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff'], seed=my_rng)

    cost = cg.outputs[0]

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))
    


    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))


    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)    

    # Set extensions
    logger.info("Initializing extensions")
    # this is ugly code and done, because I am not sure if the order of the extensions is important
    if 'track2' in config['saveto']: # less epochs for track 2, because of more data
      if config['early_stopping']:
	extensions = [
	    FinishAfter(after_n_epochs=config['finish_after']/2),
	    #FinishAfter(after_n_batches=config['finish_after']),
	    TrainingDataMonitoring([cost], after_batch=True),
	    Printing(after_batch=True),
	    CheckpointNMT(config['saveto'],
			  every_n_batches=config['save_freq'])
	]
      else:
	extensions = [
	    FinishAfter(after_n_epochs=config['finish_after']/2),
	    #FinishAfter(after_n_batches=config['finish_after']),
	    TrainingDataMonitoring([cost], after_batch=True),
	    Printing(after_batch=True),
	    CheckpointNMT(config['saveto'],
			  every_n_batches=config['save_freq'])
	]
    else:
      if config['early_stopping']:
	extensions = [
	    FinishAfter(after_n_epochs=config['finish_after']),
	    #FinishAfter(after_n_batches=config['finish_after']),
	    TrainingDataMonitoring([cost], after_batch=True),
	    Printing(after_batch=True),
	    CheckpointNMT(config['saveto'],
			  every_n_batches=config['save_freq'])
	]
      else:
	extensions = [
	    FinishAfter(after_n_epochs=config['finish_after']),
	    #FinishAfter(after_n_batches=config['finish_after']),
	    TrainingDataMonitoring([cost], after_batch=True),
	    Printing(after_batch=True),
	    CheckpointNMT(config['saveto'],
			  every_n_batches=config['save_freq'])
	]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    
    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    #every_n_batches=1,
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=8))
                    #src_vocab_size=config['src_vocab_size']))
    
    # Add early stopping based on bleu
    if config['val_set'] is not None:
        logger.info("Building accuracy validator")
        extensions.append(
            AccuracyValidator(sampling_input, samples=samples, config=config,
                          model=search_model, data_stream=dev_stream,
                          after_training=True,
                          #after_epoch=True))
                          every_n_epochs=5))
    else:
        logger.info("No validation set given for this language")
    
    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))
       
    
    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()])
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )
    
    # Train!
    main_loop.run()
示例#20
0
def mainPredict(config, data_to_predict_stream, use_ensemble, lang=None, et_version=False, use_bokeh=False, the_track=None):
    # Create Theano variables
    assert the_track != None
    
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, cost_type=config['error_fct'])
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))
    
    
    # Set extensions
    logger.info("Initializing (empty) extensions")
    extensions = [
    ]

    logger.info("Building sampling model")
    sampling_representation = encoder.apply(
        sampling_input, tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)

    _, samples = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[1]))  # generated[1] is next_outputs
    
    # Reload the model (as this is prediction, it is 100% necessary):
    if config['reload']:
        #extensions.append(LoadOnlyBestModel(config['saveto'])) # without early stopping use LoadOnlyModel here!
        extensions.append(LoadOnlyModel(config['saveto'])) # without early stopping use LoadOnlyModel here!
    else:
        raise Exception('No model available for prediction! (Check config[\'reload\'] variable)')

    
    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()])
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=search_model,
        algorithm=algorithm,
        #algorithm=None,
        data_stream=data_to_predict_stream,
        extensions=extensions
    )

    predictByHand(main_loop, decoder, data_to_predict_stream, use_ensemble, lang, et_version, config, the_track=the_track)
    blocks.config.config.default_seed = args.seed

    if args.continue_from:
        from blocks.serialization import load
        main_loop = load(args.continue_from)
        main_loop.run()
        sys.exit(0)

    graphs, extensions, updates = construct_graphs(args, nclasses,
                                                   sequence_length)
    #graph, extension, update = construct_graphs(args, nclasses, sequence_length)

    ### optimization algorithm definition
    step_rule = CompositeRule([
        StepClipping(1.),
        #Momentum(learning_rate=args.learning_rate, momentum=0.9),
        RMSProp(learning_rate=args.learning_rate, decay_rate=0.5),
    ])

    algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                                parameters=graphs["training"].parameters,
                                step_rule=step_rule)
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]

    # step monitor (after epoch to limit the log size)
    step_channels = []
    step_channels.extend([
        algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
        for name, param in model.get_parameter_dict().items()
示例#22
0
        logger.info('    %20s %s' % (value.get_value().shape, value.name))
        parameters_size += reduce(operator.mul, value.get_value().shape, 1)
    logger.info('Total number of parameters: %d in %d matrices' %
                (parameters_size, len(cg.parameters)))

    if hasattr(config, 'step_rule'):
        step_rule = config.step_rule
    else:
        step_rule = AdaDelta()

    logger.info("Fuel seed: %d" % fuel.config.default_seed)
    logger.info("Blocks seed: %d" % blocks.config.default_seed)

    params = cg.parameters
    algorithm = GradientDescent(cost=cost,
                                step_rule=CompositeRule(
                                    [RemoveNotFinite(), step_rule]),
                                parameters=params)

    plot_vars = [['valid_' + x.name for x in valid_monitored] +
                 ['train_' + x.name for x in valid_monitored]]
    logger.info('Plotted variables: %s' % str(plot_vars))

    dump_path = os.path.join('model_data', model_name) + '.pkl'
    logger.info('Dump path: %s' % dump_path)

    extensions = [
        TrainingDataMonitoring(monitored, prefix='train',
                               every_n_batches=1000),
        DataStreamMonitoring(valid_monitored,
                             valid_stream,
                             prefix='valid',
示例#23
0
def main(name, dataset, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim, oldmodel):

    image_size, data_train, data_valid, data_test = datasets.get_data(dataset)

    train_stream = Flatten(
        DataStream(data_train,
                   iteration_scheme=SequentialScheme(data_train.num_examples,
                                                     batch_size)))
    valid_stream = Flatten(
        DataStream(data_valid,
                   iteration_scheme=SequentialScheme(data_valid.num_examples,
                                                     batch_size)))
    test_stream = Flatten(
        DataStream(data_test,
                   iteration_scheme=SequentialScheme(data_test.num_examples,
                                                     batch_size)))

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = img_height * img_width

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    if attention != "":
        read_N, write_N = attention.split(',')

        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * read_N**2

        reader = AttentionReader(x_dim=x_dim,
                                 dec_dim=dec_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=read_N,
                                 **inits)
        writer = AttentionWriter(input_dim=dec_dim,
                                 output_dim=x_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=write_N,
                                 **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

        attention_tag = "full"

    #----------------------------------------------------------------------

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation. E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = time.strftime("%Y%m%d-%H%M%S") + "-" + name
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print("               dataset: %s" % dataset)
    print("          subdirectory: %s" % subdir)
    print("         learning rate: %g" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print("            batch size: %d" % batch_size)
    print("                epochs: %d" % epochs)
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc",
                      **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec",
                      **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(n_iter,
                     reader=reader,
                     encoder_mlp=encoder_mlp,
                     encoder_rnn=encoder_rnn,
                     sampler=q_sampler,
                     decoder_mlp=decoder_mlp,
                     decoder_rnn=decoder_rnn,
                     writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    #x_recons = 1. + x
    x_recons, kl_terms = draw.reconstruct(x)
    #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100)
    #x_recons = x_recons[-1,:,:]

    #samples = draw.sample(100)
    #x_recons = samples[-1, :, :]
    #x_recons = samples[-1, :, :]

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    #algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_epoch=True),
            #            DataStreamMonitoring(
            #                monitors,
            #                valid_stream,
            ##                updates=scan_updates,
            #                prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
                #                updates=scan_updates,
                prefix="test"),
            Checkpoint(name,
                       before_training=False,
                       after_epoch=True,
                       save_separately=['log', 'model']),
            #Checkpoint(image_size=image_size, save_subdir=subdir, path=pickle_file, before_training=False, after_epoch=True, save_separately=['log', 'model']),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
示例#24
0
                                   data_stream=data_stream_val,
                                   prefix='validation',
                                   after_epoch=True)

monitor_test = DataStreamMonitoring(variables=[error],
                                    data_stream=data_stream_test,
                                    prefix='test',
                                    after_epoch=True)

learning_rate = 0.00008
n_epochs = 100
algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            on_unused_sources='ignore',
                            step_rule=CompositeRule([
                                StepClipping(10.),
                                Adam(learning_rate),
                            ]))
main_loop = MainLoop(model=Model(cost),
                     data_stream=data_stream_train,
                     algorithm=algorithm,
                     extensions=[
                         Timing(),
                         FinishAfter(after_n_epochs=n_epochs), monitor,
                         monitor_val, monitor_test,
                         saveSnapshot(
                             '/home/xuehongyang/checkpoints_read/snapshot',
                             save_main_loop=False,
                             after_epoch=True,
                             save_separately=['log', 'model']),
                         ProgressBar(),
                         Printing(every_n_batches=500),
示例#25
0
def main(args):
    """Run experiment. """
    lr_tag = float_tag(args.learning_rate)

    x_dim, train_stream, valid_stream, test_stream = datasets.get_streams(
        args.data, args.batch_size)

    #------------------------------------------------------------
    # Setup model
    deterministic_act = Tanh
    deterministic_size = 1.

    if args.method == 'vae':
        sizes_tag = args.layer_spec.replace(",", "-")
        layer_sizes = [int(i) for i in args.layer_spec.split(",")]
        layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1]

        name = "%s-%s-%s-lr%s-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag, args.n_samples, sizes_tag)

        if args.activation == "tanh":
            hidden_act = Tanh()
        elif args.activation == "logistic":
            hidden_act = Logistic()
        elif args.activation == "relu":
            hidden_act = Rectifier()
        else:
            raise "Unknown hidden nonlinearity %s" % args.hidden_act

        model = VAE(x_dim=x_dim,
                    hidden_layers=layer_sizes,
                    hidden_act=hidden_act,
                    z_dim=z_dim,
                    batch_norm=args.batch_normalization)
        model.initialize()
    elif args.method == 'dvae':
        sizes_tag = args.layer_spec.replace(",", "-")
        layer_sizes = [int(i) for i in args.layer_spec.split(",")]
        layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1]

        name = "%s-%s-%s-lr%s-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag, args.n_samples, sizes_tag)

        if args.activation == "tanh":
            hidden_act = Tanh()
        elif args.activation == "logistic":
            hidden_act = Logistic()
        elif args.activation == "relu":
            hidden_act = Rectifier()
        else:
            raise "Unknown hidden nonlinearity %s" % args.hidden_act

        model = DVAE(x_dim=x_dim,
                     hidden_layers=layer_sizes,
                     hidden_act=hidden_act,
                     z_dim=z_dim,
                     batch_norm=args.batch_normalization)
        model.initialize()
    elif args.method == 'rws':
        sizes_tag = args.layer_spec.replace(",", "-")
        qbase = "" if not args.no_qbaseline else "noqb-"

        name = "%s-%s-%s-%slr%s-dl%d-spl%d-%s" % \
            (args.data, args.method, args.name, qbase, lr_tag, args.deterministic_layers, args.n_samples, sizes_tag)

        p_layers, q_layers = create_layers(args.layer_spec, x_dim,
                                           args.deterministic_layers,
                                           deterministic_act,
                                           deterministic_size)

        model = ReweightedWakeSleep(
            p_layers,
            q_layers,
            qbaseline=(not args.no_qbaseline),
        )
        model.initialize()
    elif args.method == 'bihm-rws':
        sizes_tag = args.layer_spec.replace(",", "-")
        name = "%s-%s-%s-lr%s-dl%d-spl%d-%s" % \
            (args.data, args.method, args.name, lr_tag, args.deterministic_layers, args.n_samples, sizes_tag)

        p_layers, q_layers = create_layers(args.layer_spec, x_dim,
                                           args.deterministic_layers,
                                           deterministic_act,
                                           deterministic_size)

        model = BiHM(
            p_layers,
            q_layers,
            l1reg=args.l1reg,
            l2reg=args.l2reg,
        )
        model.initialize()
    elif args.method == 'continue':
        import cPickle as pickle
        from os.path import basename, splitext

        with open(args.model_file, 'rb') as f:
            m = pickle.load(f)

        if isinstance(m, MainLoop):
            m = m.model

        model = m.get_top_bricks()[0]
        while len(model.parents) > 0:
            model = model.parents[0]

        assert isinstance(model, (BiHM, ReweightedWakeSleep, VAE))

        mname, _, _ = basename(args.model_file).rpartition("_model.pkl")
        name = "%s-cont-%s-lr%s-spl%s" % (mname, args.name, lr_tag,
                                          args.n_samples)
    else:
        raise ValueError("Unknown training method '%s'" % args.method)

    #------------------------------------------------------------

    x = tensor.matrix('features')

    #------------------------------------------------------------
    # Testset monitoring

    train_monitors = []
    valid_monitors = []
    test_monitors = []
    for s in [1, 10, 100, 1000]:
        log_p, log_ph = model.log_likelihood(x, s)
        log_p = -log_p.mean()
        log_ph = -log_ph.mean()
        log_p.name = "log_p_%d" % s
        log_ph.name = "log_ph_%d" % s

        #train_monitors += [log_p, log_ph]
        #valid_monitors += [log_p, log_ph]
        test_monitors += [log_p, log_ph]

    #------------------------------------------------------------
    # Z estimation
    #for s in [100000]:
    #    z2 = tensor.exp(model.estimate_log_z2(s)) / s
    #    z2.name = "z2_%d" % s
    #
    #    valid_monitors += [z2]
    #    test_monitors += [z2]

    #------------------------------------------------------------
    # Gradient and training monitoring

    if args.method in ['vae', 'dvae']:
        log_p_bound, gradients = model.get_gradients(x, args.n_samples)
        log_p_bound = -log_p_bound.mean()
        log_p_bound.name = "log_p_bound"
        cost = log_p_bound

        train_monitors += [
            log_p_bound,
            named(model.kl_term.mean(), 'kl_term'),
            named(model.recons_term.mean(), 'recons_term')
        ]
        valid_monitors += [
            log_p_bound,
            named(model.kl_term.mean(), 'kl_term'),
            named(model.recons_term.mean(), 'recons_term')
        ]
        test_monitors += [
            log_p_bound,
            named(model.kl_term.mean(), 'kl_term'),
            named(model.recons_term.mean(), 'recons_term')
        ]
    else:
        log_p, log_ph, gradients = model.get_gradients(x, args.n_samples)
        log_p = -log_p.mean()
        log_ph = -log_ph.mean()
        log_p.name = "log_p"
        log_ph.name = "log_ph"
        cost = log_ph

        train_monitors += [log_p, log_ph]
        valid_monitors += [log_p, log_ph]

    #------------------------------------------------------------
    # Detailed monitoring
    """
    n_layers = len(p_layers)

    log_px, w, log_p, log_q, samples = model.log_likelihood(x, n_samples)

    exp_samples = []
    for l in xrange(n_layers):
        e = (w.dimshuffle(0, 1, 'x')*samples[l]).sum(axis=1)
        e.name = "inference_h%d" % l
        e.tag.aggregation_scheme = aggregation.TakeLast(e)
        exp_samples.append(e)

    s1 = samples[1]
    sh1 = s1.shape
    s1_ = s1.reshape([sh1[0]*sh1[1], sh1[2]])
    s0, _ = model.p_layers[0].sample_expected(s1_)
    s0 = s0.reshape([sh1[0], sh1[1], s0.shape[1]])
    s0 = (w.dimshuffle(0, 1, 'x')*s0).sum(axis=1)
    s0.name = "inference_h0^"
    s0.tag.aggregation_scheme = aggregation.TakeLast(s0)
    exp_samples.append(s0)

    # Draw P-samples
    p_samples, _, _ = model.sample_p(100)
    #weights = model.importance_weights(samples)
    #weights = weights / weights.sum()

    for i, s in enumerate(p_samples):
        s.name = "psamples_h%d" % i
        s.tag.aggregation_scheme = aggregation.TakeLast(s)

    #
    samples = model.sample(100, oversample=100)

    for i, s in enumerate(samples):
        s.name = "samples_h%d" % i
        s.tag.aggregation_scheme = aggregation.TakeLast(s)
    """
    cg = ComputationGraph([cost])

    #------------------------------------------------------------

    if args.step_rule == "momentum":
        step_rule = Momentum(args.learning_rate, 0.95)
    elif args.step_rule == "rmsprop":
        step_rule = RMSProp(args.learning_rate)
    elif args.step_rule == "adam":
        step_rule = Adam(args.learning_rate)
    else:
        raise "Unknown step_rule %s" % args.step_rule

    #parameters = cg.parameters[:4] + cg.parameters[5:]
    parameters = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        parameters=parameters,
        gradients=gradients,
        step_rule=CompositeRule([
            #StepClipping(25),
            step_rule,
            #RemoveNotFinite(1.0),
        ]))

    #------------------------------------------------------------

    train_monitors += [
        aggregation.mean(algorithm.total_gradient_norm),
        aggregation.mean(algorithm.total_step_norm)
    ]

    #------------------------------------------------------------

    # Live plotting?
    plotting_extensions = []
    if args.live_plotting:
        plotting_extensions = [
            PlotManager(
                name,
                [
                    Plotter(channels=[[
                        "valid_%s" % cost.name, "valid_log_p"
                    ], ["train_total_gradient_norm", "train_total_step_norm"]],
                            titles=[
                                "validation cost",
                                "norm of training gradient and step"
                            ]),
                    DisplayImage(
                        [
                            WeightDisplay(model.p_layers[0].mlp.
                                          linear_transformations[0].W,
                                          n_weights=100,
                                          image_shape=(28, 28))
                        ]
                        #ImageDataStreamDisplay(test_stream, image_shape=(28,28))]
                    )
                ])
        ]

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            ProgressBar(),
            TrainingDataMonitoring(
                train_monitors, prefix="train", after_epoch=True),
            DataStreamMonitoring(
                valid_monitors, data_stream=valid_stream, prefix="valid"),
            DataStreamMonitoring(test_monitors,
                                 data_stream=test_stream,
                                 prefix="test",
                                 after_epoch=False,
                                 after_training=True,
                                 every_n_epochs=10),
            #SharedVariableModifier(
            #    algorithm.step_rule.components[0].learning_rate,
            #    half_lr_func,
            #    before_training=False,
            #    after_epoch=False,
            #    after_batch=False,
            #    every_n_epochs=half_lr),
            TrackTheBest('valid_%s' % cost.name),
            Checkpoint(name + ".pkl", save_separately=['log', 'model']),
            FinishIfNoImprovementAfter('valid_%s_best_so_far' % cost.name,
                                       epochs=args.patience),
            FinishAfter(after_n_epochs=args.max_epochs),
            Printing()
        ] + plotting_extensions)
    main_loop.run()
示例#26
0
def construct_main_loop(name, task_name, patch_shape, batch_size,
                        n_spatial_dims, n_patches, max_epochs, patience_epochs,
                        learning_rate, gradient_limiter, hyperparameters,
                        **kwargs):
    task = tasks.get_task(**hyperparameters)
    hyperparameters["n_channels"] = task.n_channels

    extensions = []

    # let theta noise decay as training progresses
    for key in "location_std scale_std".split():
        hyperparameters[key] = theano.shared(hyperparameters[key], name=key)
        extensions.append(
            util.ExponentialDecay(hyperparameters[key],
                                  hyperparameters["%s_decay" % key],
                                  after_batch=True))

    print "constructing graphs..."
    graphs, outputs, updates = construct_graphs(task=task, **hyperparameters)

    print "setting up main loop..."

    from blocks.model import Model
    model = Model(outputs["train"]["cost"])

    from blocks.algorithms import GradientDescent, CompositeRule, StepClipping, Adam, RMSProp
    from extensions import Compressor
    if gradient_limiter == "clip":
        limiter = StepClipping(1.)
    elif gradient_limiter == "compress":
        limiter = Compressor()
    else:
        raise ValueError()

    algorithm = GradientDescent(
        cost=outputs["train"]["cost"],
        parameters=graphs["train"].parameters,
        step_rule=CompositeRule([limiter,
                                 Adam(learning_rate=learning_rate)]))
    algorithm.add_updates(updates["train"])

    extensions.extend(
        construct_monitors(algorithm=algorithm,
                           task=task,
                           model=model,
                           graphs=graphs,
                           outputs=outputs,
                           updates=updates,
                           **hyperparameters))

    from blocks.extensions import FinishAfter, Printing, ProgressBar, Timing
    from blocks.extensions.stopping import FinishIfNoImprovementAfter
    from blocks.extensions.training import TrackTheBest
    from blocks.extensions.saveload import Checkpoint
    from dump import DumpBest, LightCheckpoint, PrintingTo, DumpGraph, DumpLog
    extensions.extend([
        TrackTheBest("valid_error_rate", "best_valid_error_rate"),
        FinishIfNoImprovementAfter("best_valid_error_rate",
                                   epochs=patience_epochs),
        FinishAfter(after_n_epochs=max_epochs),
        DumpBest("best_valid_error_rate", name + "_best.zip"),
        Checkpoint(hyperparameters["checkpoint_save_path"],
                   on_interrupt=False,
                   every_n_epochs=10,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True),
        ProgressBar(),
        Timing(),
        Printing(),
        PrintingTo(name + "_log"),
        DumpGraph(name + "_grad_graph")
    ])

    from blocks.main_loop import MainLoop
    main_loop = MainLoop(data_stream=task.get_stream("train"),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    from tabulate import tabulate
    print "parameter sizes:"
    print tabulate(
        (key, "x".join(map(str,
                           value.get_value().shape)), value.get_value().size)
        for key, value in main_loop.model.get_parameter_dict().items())

    return main_loop
def train_language_model(new_training_job, config, save_path, params,
                         fast_start, fuel_server, seed):
    c = config
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    data, lm, retrieval = initialize_data_and_model(config)

    # full main loop can be saved...
    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    # or only state (log + params) which can be useful not to pickle embeddings
    state_path = os.path.join(save_path, 'training_state.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')
    best_tar_path = os.path.join(save_path, "best_model.tar")

    words = tensor.ltensor3('words')
    words_mask = tensor.matrix('words_mask')
    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', batch_size=4,
                            max_length=5).get_epoch_iterator())
        words.tag.test_value = test_value_data[0]
        words_mask.tag.test_value = test_value_data[1]

    costs, updates = lm.apply(words, words_mask)
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            cg.set_parameter_values(load_parameters(src))

    length = rename(words.shape[1], 'length')
    perplexity, = VariableFilter(name='perplexity')(cg)
    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    monitored_vars = [length, cost] + perplexities
    if c['dict_path']:
        num_definitions, = VariableFilter(name='num_definitions')(cg)
        monitored_vars.extend([num_definitions])

    parameters = cg.get_parameter_dict()
    trained_parameters = parameters.values()
    saved_parameters = parameters.values()
    if c['embedding_path']:
        logger.debug("Exclude word embeddings from the trained parameters")
        trained_parameters = [
            p for p in trained_parameters
            if not p == lm.get_def_embeddings_params()
        ]
        saved_parameters = [
            p for p in saved_parameters
            if not p == lm.get_def_embeddings_params()
        ]

    if c['cache_size'] != 0:
        logger.debug("Enable fake recursivity for looking up embeddings")
        trained_parameters = [
            p for p in trained_parameters if not p == lm.get_cache_params()
        ]

    logger.info("Cost parameters" + "\n" + pprint.pformat([
        " ".join(
            (key, str(parameters[key].get_value().shape),
             'trained' if parameters[key] in trained_parameters else 'frozen'))
        for key in sorted(parameters.keys())
    ],
                                                          width=120))

    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum']))
    algorithm = GradientDescent(cost=cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    if c['cache_size'] != 0:
        algorithm.add_updates(updates)

    train_monitored_vars = list(monitored_vars)
    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)

    word_emb_RMS, = VariableFilter(name='word_emb_RMS')(cg)
    main_rnn_in_RMS, = VariableFilter(name='main_rnn_in_RMS')(cg)
    train_monitored_vars.extend([word_emb_RMS, main_rnn_in_RMS])

    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    # We use a completely random seed on purpose. With Fuel server
    # it's currently not possible to restore the state of the training
    # stream. That's why it's probably better to just have it stateless.
    stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None
    training_stream = data.get_stream('train',
                                      batch_size=c['batch_size'],
                                      max_length=c['max_length'],
                                      seed=stream_seed)
    valid_stream = data.get_stream('valid',
                                   batch_size=c['batch_size_valid'],
                                   max_length=c['max_length'],
                                   seed=stream_seed)
    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)

    validation = DataStreamMonitoring(monitored_vars,
                                      valid_stream,
                                      prefix="valid").set_conditions(
                                          before_first_epoch=not fast_start,
                                          on_resumption=True,
                                          every_n_batches=c['mon_freq_valid'])
    track_the_best = TrackTheBest(validation.record_name(perplexity),
                                  choose_best=min).set_conditions(
                                      on_resumption=True,
                                      after_epoch=True,
                                      every_n_batches=c['mon_freq_valid'])

    # don't save them the entire main loop to avoid pickling everything
    if c['fast_checkpoint']:
        load = (LoadNoUnpickling(state_path,
                                 load_iteration_state=True,
                                 load_log=True).set_conditions(
                                     before_training=not new_training_job))
        cp_args = {
            'save_main_loop': False,
            'save_separately': ['log', 'iteration_state'],
            'parameters': saved_parameters
        }

        checkpoint = Checkpoint(state_path,
                                before_training=not fast_start,
                                every_n_batches=c['save_freq_batches'],
                                after_training=not fast_start,
                                **cp_args)

        if c['checkpoint_every_n_batches']:
            intermediate_cp = IntermediateCheckpoint(
                state_path,
                every_n_batches=c['checkpoint_every_n_batches'],
                after_training=False,
                **cp_args)
    else:
        load = (Load(main_loop_path, load_iteration_state=True,
                     load_log=True).set_conditions(
                         before_training=not new_training_job))
        cp_args = {
            'save_separately': ['iteration_state'],
            'parameters': saved_parameters
        }

        checkpoint = Checkpoint(main_loop_path,
                                before_training=not fast_start,
                                every_n_batches=c['save_freq_batches'],
                                after_training=not fast_start,
                                **cp_args)

        if c['checkpoint_every_n_batches']:
            intermediate_cp = IntermediateCheckpoint(
                main_loop_path,
                every_n_batches=c['checkpoint_every_n_batches'],
                after_training=False,
                **cp_args)

    checkpoint = checkpoint.add_condition(
        ['after_batch', 'after_epoch'],
        OnLogRecord(track_the_best.notification_name), (best_tar_path, ))

    extensions = [
        load,
        StartFuelServer(original_training_stream,
                        stream_path,
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq_train'])
    ]

    if retrieval:
        extensions.append(
            RetrievalPrintStats(retrieval=retrieval,
                                every_n_batches=c['mon_freq_train'],
                                before_training=not fast_start))

    extensions.extend([
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']),
        validation, track_the_best, checkpoint
    ])
    if c['checkpoint_every_n_batches']:
        extensions.append(intermediate_cp)
    extensions.extend([
        DumpTensorflowSummaries(save_path,
                                every_n_batches=c['mon_freq_train'],
                                after_training=True),
        Printing(on_resumption=True, every_n_batches=c['mon_freq_train']),
        FinishIfNoImprovementAfter(track_the_best.notification_name,
                                   iterations=50 * c['mon_freq_valid'],
                                   every_n_batches=c['mon_freq_valid']),
        FinishAfter(after_n_batches=c['n_batches'])
    ])

    logger.info("monitored variables during training:" + "\n" +
                pprint.pformat(train_monitored_vars, width=120))
    logger.info("monitored variables during valid:" + "\n" +
                pprint.pformat(monitored_vars, width=120))

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
示例#28
0
def main(config,
         tr_stream,
         dev_stream,
         source_vocab,
         target_vocab,
         use_bokeh=False):

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    initial_context = tensor.matrix('initial_context')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])

    # let user specify the target transition class name in config,
    # eval it and pass to decoder
    target_transition_name = config.get(
        'target_transition', 'GRUInitialStateWithInitialStateSumContext')
    target_transition = eval(target_transition_name)

    logger.info('Using target transition: {}'.format(target_transition_name))
    decoder = InitialContextDecoder(config['trg_vocab_size'],
                                    config['dec_embed'], config['dec_nhids'],
                                    config['enc_nhids'] * 2,
                                    config['context_dim'], target_transition)

    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask, initial_context)

    cost.name = 'decoder_cost'

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    # TODO: validate performance with/without regularization
    if config.get('l2_regularization', False) is True:
        l2_reg_alpha = config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)

        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

        # why do we need to name the cost variable? Where did the original name come from?
        cost.name = 'decoder_cost_cost'

    cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions

    # TODO: add checking for existing model and loading
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Create the theano variables that we need for the sampling graph
    sampling_input = tensor.lmatrix('input')
    sampling_context = tensor.matrix('context_input')

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config.get('bleu_script',
                                                 None) is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))

        generated = decoder.generate(sampling_input, sampling_representation,
                                     sampling_context)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(
                model=search_model,
                data_stream=tr_stream,
                hook_samples=config['hook_samples'],
                every_n_batches=config['sampling_freq'],
                src_vocab=source_vocab,
                trg_vocab=target_vocab,
                src_vocab_size=config['src_vocab_size'],
            ))

    # Add early stopping based on bleu
    if config.get('bleu_script', None) is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input,
                          sampling_context,
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Add early stopping based on Meteor
    if config.get('meteor_directory', None) is not None:
        logger.info("Building meteor validator")
        extensions.append(
            MeteorValidator(sampling_input,
                            sampling_context,
                            samples=samples,
                            config=config,
                            model=search_model,
                            data_stream=dev_stream,
                            src_vocab=source_vocab,
                            trg_vocab=target_vocab,
                            normalize=config['normalized_bleu'],
                            every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[[
                     'decoder_cost', 'validation_set_bleu_score',
                     'validation_set_meteor_score'
                 ]],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # if there is dropout or random noise, we need to use the output of the modified graph
    if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0:
        algorithm = GradientDescent(cost=cg.outputs[0],
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))
    else:
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
示例#29
0
print("Cost graph is built", file=sys.stderr)

model = Model(cost)
parameters = model.get_parameter_dict()

for brick in model.get_top_bricks():  #{
    brick.initialize()
#}

cg = ComputationGraph(cost)

algo = GradientDescent(cost=cost,
                       parameters=cg.parameters,
                       step_rule=CompositeRule(
                           [StepClipping(10.0),
                            Scale(0.01)]))
#algo = RMSProp(learning_rate=1.0, decay_rate=0.9)

max_length = chars.shape[0].copy(name="max_length")
observables = [
    batch_size, max_length, algo.total_step_norm, algo.total_gradient_norm,
    cost
]

# Construct the main loop and start training!
average_monitoring = TrainingDataMonitoring(observables,
                                            prefix="average",
                                            every_n_batches=10)

checkpoint_after = n_epochs / 5
示例#30
0
                                    x_mask,
                                    attended=mlp_context.apply(context))
cost = cost_matrix.sum() / x_mask.sum()
cost.name = "sequence_log_likelihood"

cg = ComputationGraph(cost)
model = Model(cost)

#################
# Algorithm
#################

algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=CompositeRule(
                                [StepClipping(10.0),
                                 Adam(lr)]))

train_monitor = TrainingDataMonitoring(variables=[cost],
                                       after_epoch=True,
                                       prefix="train")

extensions = extensions = [
    train_monitor,
    TrackTheBest('train_sequence_log_likelihood'),
    Printing(after_epoch=True)
]

main_loop = MainLoop(model=model,
                     data_stream=data_stream,
                     algorithm=algorithm,
def main():
    nclasses = 27

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--length", type=int, default=180)
    parser.add_argument("--num-epochs", type=int, default=100)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=1e-3)
    parser.add_argument("--epsilon", type=float, default=1e-5)
    parser.add_argument("--num-hidden", type=int, default=1000)
    parser.add_argument("--baseline", action="store_true")
    parser.add_argument("--initialization",
                        choices="identity glorot orthogonal uniform".split(),
                        default="identity")
    parser.add_argument("--initial-gamma", type=float, default=1e-1)
    parser.add_argument("--initial-beta", type=float, default=0)
    parser.add_argument("--cluster", action="store_true")
    parser.add_argument("--activation",
                        choices=list(activations.keys()),
                        default="tanh")
    parser.add_argument("--optimizer",
                        choices="sgdmomentum adam rmsprop",
                        default="rmsprop")
    parser.add_argument("--continue-from")
    parser.add_argument("--evaluate")
    parser.add_argument("--dump-hiddens")
    args = parser.parse_args()

    np.random.seed(args.seed)
    blocks.config.config.default_seed = args.seed

    if args.continue_from:
        from blocks.serialization import load
        main_loop = load(args.continue_from)
        main_loop.run()
        sys.exit(0)

    graphs, extensions, updates = construct_graphs(args, nclasses)

    ### optimization algorithm definition
    if args.optimizer == "adam":
        optimizer = Adam(learning_rate=args.learning_rate)
    elif args.optimizer == "rmsprop":
        optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9)
    elif args.optimizer == "sgdmomentum":
        optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99)
    step_rule = CompositeRule([
        StepClipping(1.),
        optimizer,
    ])
    algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                                parameters=graphs["training"].parameters,
                                step_rule=step_rule)
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]

    # step monitor
    step_channels = []
    step_channels.extend([
        algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
        for name, param in model.get_parameter_dict().items()
    ])
    step_channels.append(
        algorithm.total_step_norm.copy(name="total_step_norm"))
    step_channels.append(
        algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
    step_channels.extend(graphs["training"].outputs)
    logger.warning("constructing training data monitor")
    extensions.append(
        TrainingDataMonitoring(step_channels,
                               prefix="iteration",
                               after_batch=True))

    # parameter monitor
    extensions.append(
        DataStreamMonitoring([
            param.norm(2).copy(name="parameter.norm:%s" % name)
            for name, param in model.get_parameter_dict().items()
        ],
                             data_stream=None,
                             after_epoch=True))

    validation_interval = 500
    # performance monitor
    for situation in "training inference".split():
        if situation == "inference" and not args.evaluate:
            # save time when we don't need the inference graph
            continue

        for which_set in "train valid test".split():
            logger.warning("constructing %s %s monitor" %
                           (which_set, situation))
            channels = list(graphs[situation].outputs)
            extensions.append(
                DataStreamMonitoring(channels,
                                     prefix="%s_%s" % (which_set, situation),
                                     every_n_batches=validation_interval,
                                     data_stream=get_stream(
                                         which_set=which_set,
                                         batch_size=args.batch_size,
                                         num_examples=10000,
                                         length=args.length)))

    extensions.extend([
        TrackTheBest("valid_training_error_rate",
                     "best_valid_training_error_rate"),
        DumpBest("best_valid_training_error_rate", "best.zip"),
        FinishAfter(after_n_epochs=args.num_epochs),
        #FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50),
        Checkpoint("checkpoint.zip",
                   on_interrupt=False,
                   every_n_epochs=1,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True)
    ])

    if not args.cluster:
        extensions.append(ProgressBar())

    extensions.extend([
        Timing(),
        Printing(every_n_batches=validation_interval),
        PrintingTo("log"),
    ])
    main_loop = MainLoop(data_stream=get_stream(which_set="train",
                                                batch_size=args.batch_size,
                                                length=args.length,
                                                augment=True),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    if args.dump_hiddens:
        dump_hiddens(args, main_loop)
        return

    if args.evaluate:
        evaluate(args, main_loop)
        return

    main_loop.run()