Example #1
    def apply(self, input_lb, input_un, target):
        batch_size = input_lb.shape[0]
        get_labeled = lambda x: x[:batch_size] if x is not None else x
        input = T.concatenate([input_lb, input_un], axis=0)
        self.layer_dims = {0: self.input_dim}
        self.lr = self.shared(self.default_lr, 'learning_rate', role=None)
        top = len(self.layers) - 1

        clean = self.encoder(input, noise_std=[0])
        corr = self.encoder(input, noise_std=self.noise_std)

        ests, costs = self.decoder(clean, corr, batch_size)

        # Costs
        y = target.flatten()

        costs.class_clean = CategoricalCrossEntropy().apply(
            y, get_labeled(clean.h[top]))
        costs.class_clean.name = 'CE_clean'

        costs.class_corr = CategoricalCrossEntropy().apply(
            y, get_labeled(corr.h[top]))
        costs.class_corr.name = 'CE_corr'

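        # Total cost: supervised cross-entropy on the corrupted pass plus the
        # per-layer denoising costs, each weighted by denoising_cost_x.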
        costs.total = costs.class_corr * 1.0
        for i in range(len(self.layers)):
            costs.total += costs.denois[i] * self.denoising_cost_x[i]
        costs.total.name = 'Total_cost'

        self.costs = costs

        # Classification error
        mr = MisclassificationRate()
        self.error = mr.apply(y, get_labeled(clean.h[top])) * np.float32(100.)
        self.error.name = 'Error_rate'
Example #2
    def __init__(self, config):
        self.X = T.tensor4("features")
        c = config

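        # Two conv/conv/pool blocks, then flatten into a linear layer with a
        # softmax output.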
        seq = BrickSequence(
            input_dim=(3, 32, 32),
            bricks=[
                conv3(c['n_l1']),
                conv3(c['n_l2']),
                max_pool(),
                conv3(c['n_l3']),
                conv3(c['n_l4']),
                max_pool(),
                #conv3(10),
                #conv3(10),
                Flattener(),
                linear(c['n_l5']),
                Softmax()
            ])

        seq.initialize()

        self.pred = seq.apply(self.X)
        self.Y = T.imatrix("targets")

        self.cost = CategoricalCrossEntropy().apply(self.Y.flatten(),
                                                    self.pred)
        self.cost.name = "cost"

        self.accur = 1.0 - MisclassificationRate().apply(
            self.Y.flatten(), self.pred)
        self.accur.name = "accur"
Example #3
def main(save_to, num_epochs):
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
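    # Pick the two weight matrices out of the graph and add L2 weight decay.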
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST(("train", ))
    mnist_test = MNIST(("test", ))

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate],
                             Flatten(DataStream.default_stream(
                                 mnist_test,
                                 iteration_scheme=SequentialScheme(
                                     mnist_test.num_examples, 500)),
                                     which_sources=('features', )),
                             prefix="test"),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        Printing()
    ]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(
            Plot('MNIST example',
                 channels=[[
                     'test_final_cost',
                     'test_misclassificationrate_apply_error_rate'
                 ], ['train_total_gradient_norm']]))

    main_loop = MainLoop(algorithm,
                         Flatten(DataStream.default_stream(
                             mnist_train,
                             iteration_scheme=SequentialScheme(
                                 mnist_train.num_examples, 50)),
                                 which_sources=('features', )),
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
Example #4
def build_model(images, labels):
    
    # Construct a bottom convolutional sequence
    bottom_conv_sequence = convolutional_sequence((3,3), 16, (160, 160))
    bottom_conv_sequence._push_allocation_config()
    
    # Flatten layer
    flattener = Flattener()

    # Construct a top MLP
    conv_out_dim = numpy.prod(bottom_conv_sequence.get_dim('output'))
    #top_mlp = MLP([Rectifier(name='non_linear_9'), Softmax(name='non_linear_11')], [conv_out_dim, 1024, 10], weights_init=IsotropicGaussian(), biases_init=Constant(0))
    top_mlp = BatchNormalizedMLP(
        [Rectifier(name='non_linear_9'), Softmax(name='non_linear_11')],
        [conv_out_dim, 1024, 10],
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0))
    
    # Construct feedforward sequence
    ss_seq = FeedforwardSequence(
        [bottom_conv_sequence.apply, flattener.apply, top_mlp.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()
    
    prediction = ss_seq.apply(images)
    cost_noreg = CategoricalCrossEntropy().apply(labels.flatten(), prediction)

    # add regularization
    selector = Selector([top_mlp])
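    # get_parameters('W') returns the weight matrices keyed by brick path,
    # which is how they are looked up below.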
    Ws = selector.get_parameters('W')
    mlp_brick_name = 'batchnormalizedmlp'
    W0 = Ws['/%s/linear_0.W' % mlp_brick_name]
    W1 = Ws['/%s/linear_1.W' % mlp_brick_name]

    cost = cost_noreg + .01 * (W0 ** 2).mean() + .01 * (W1 ** 2).mean()


    return cost
Example #5
def build_model(images, labels):

    # Construct a bottom convolutional sequence
    bottom_conv_sequence = convolutional_sequence((3, 3), 64, (150, 150))
    bottom_conv_sequence._push_allocation_config()

    # Flatten layer
    flattener = Flattener()

    # Construct a top MLP
    conv_out_dim = numpy.prod(bottom_conv_sequence.get_dim('output'))
    top_mlp = MLP([
        LeakyRectifier(name='non_linear_9'),
        LeakyRectifier(name='non_linear_10'),
        Softmax(name='non_linear_11')
    ], [conv_out_dim, 2048, 612, 10],
                  weights_init=IsotropicGaussian(),
                  biases_init=Constant(1))

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence(
        [bottom_conv_sequence.apply, flattener.apply, top_mlp.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()

    prediction = ss_seq.apply(images)
    cost = CategoricalCrossEntropy().apply(labels.flatten(), prediction)

    return cost
Example #6
def setup_model():
    # shape: T x B x F
    input_ = T.tensor3('features')
    # shape: B
    target = T.lvector('targets')
    model = LSTMAttention(input_dim=10000,
                          dim=500,
                          mlp_hidden_dims=[2000, 500, 4],
                          batch_size=100,
                          image_shape=(100, 100),
                          patch_shape=(28, 28),
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
    model.initialize()
    h, c = model.apply(input_)
    classifier = MLP([Rectifier(), Softmax()], [500, 100, 10],
                     weights_init=IsotropicGaussian(0.01),
                     biases_init=Constant(0))
    classifier.initialize()

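    # Classify from the LSTM's final hidden state.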
    probabilities = classifier.apply(h[-1])
    cost = CategoricalCrossEntropy().apply(target, probabilities)
    error_rate = MisclassificationRate().apply(target, probabilities)

    return cost, error_rate
Example #7
    def __init__(self, split_idx, domain_classifier, **kwargs):
        super(DomainUnsupervisedCost, self).__init__(**kwargs)
        batchwise_split = BatchwiseSplit(split_idx)
        gradient_reversal = GradientReversal()
        cost = CategoricalCrossEntropy()

        self.children = [
            gradient_reversal, domain_classifier, batchwise_split, cost
        ]
Example #8
    def apply(self, input_, target):
        mlp = MLP(self.non_lins,
                  self.dims,
                  weights_init=IsotropicGaussian(0.01),
                  biases_init=Constant(0),
                  name=self.name)
        mlp.initialize()
        probs = mlp.apply(T.flatten(input_, outdim=2))
        probs.name = 'probs'
        cost = CategoricalCrossEntropy().apply(target.flatten(), probs)
        cost.name = "CE"
        self.outputs = {}
        self.outputs['probs'] = probs
        self.outputs['cost'] = cost
Example #9
def main(save_to, num_epochs):
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHTS])(cg.variables)
    cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST("train")
    mnist_test = MNIST("test")

    algorithm = GradientDescent(cost=cost,
                                step_rule=SteepestDescent(learning_rate=0.1))
    main_loop = MainLoop(
        mlp,
        DataStream(mnist_train,
                   iteration_scheme=SequentialScheme(mnist_train.num_examples,
                                                     50)),
        algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=num_epochs),
            DataStreamMonitoring([cost, error_rate],
                                 DataStream(mnist_test,
                                            iteration_scheme=SequentialScheme(
                                                mnist_test.num_examples, 500)),
                                 prefix="test"),
            TrainingDataMonitoring([
                cost, error_rate,
                aggregation.mean(algorithm.total_gradient_norm)
            ],
                                   prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(save_to),
            Plot('MNIST example',
                 channels=[[
                     'test_final_cost',
                     'test_misclassificationrate_apply_error_rate'
                 ], ['train_total_gradient_norm']]),
            Printing()
        ])
    main_loop.run()
Example #10
File: cifar_mlp.py Project: oplatek/ALI
def main(save_to, num_epochs, batch_size):
    mlp = MLP([Tanh(), Tanh(), Tanh(), Softmax()], [3072, 4096, 1024, 512, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tt.tensor4('features', dtype='float32')
    y = tt.vector('label', dtype='int32')

    probs = mlp.apply(x.reshape((-1, 3072)))
    cost = CategoricalCrossEntropy().apply(y, probs)
    error_rate = MisclassificationRate().apply(y, probs)

    cg = ComputationGraph([cost])
    ws = VariableFilter(roles=[WEIGHT])(cg.variables)
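    # L2 penalty over every weight matrix found in the computation graph.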
    cost = cost + .00005 * sum(([(w**2).sum() for w in ws]))
    cost.name = 'final_cost'

    train_dataset = Cifar10Dataset(data_dir='/home/belohlavek/data/cifar10',
                                   is_train=True)
    valid_dataset = Cifar10Dataset(data_dir='/home/belohlavek/data/cifar10',
                                   is_train=False)

    train_stream = train_dataset.get_stream(batch_size)
    valid_stream = valid_dataset.get_stream(batch_size)

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Adam(learning_rate=0.001))
    extensions = [
        Timing(),
        LogExtension('/home/belohlavek/ALI/mlp.log'),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate], valid_stream, prefix="test"),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        Printing()
    ]

    main_loop = MainLoop(algorithm,
                         train_stream,
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
Example #11
def setup_model():
    # shape: T x B x F
    input_ = T.tensor3('features')
    # shape: B
    target = T.lvector('targets')
    model = LSTMAttention(dim=256,
                          mlp_hidden_dims=[256, 4],
                          batch_size=100,
                          image_shape=(64, 64),
                          patch_shape=(16, 16),
                          weights_init=Glorot(),
                          biases_init=Constant(0))
    model.initialize()
    h, c, location, scale = model.apply(input_)
    classifier = MLP([Rectifier(), Softmax()], [256 * 2, 200, 10],
                     weights_init=Glorot(),
                     biases_init=Constant(0))
    model.h = h
    model.c = c
    model.location = location
    model.scale = scale
    classifier.initialize()

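    # Classify from the concatenation of the final hidden and cell states
    # (2 * 256 features).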
    probabilities = classifier.apply(T.concatenate([h[-1], c[-1]], axis=1))
    cost = CategoricalCrossEntropy().apply(target, probabilities)
    error_rate = MisclassificationRate().apply(target, probabilities)
    model.cost = cost

    location_x_0_avg = T.mean(location[0, :, 0])
    location_x_0_avg.name = 'location_x_0_avg'
    location_x_10_avg = T.mean(location[10, :, 0])
    location_x_10_avg.name = 'location_x_10_avg'
    location_x_20_avg = T.mean(location[-1, :, 0])
    location_x_20_avg.name = 'location_x_20_avg'

    scale_x_0_avg = T.mean(scale[0, :, 0])
    scale_x_0_avg.name = 'scale_x_0_avg'
    scale_x_10_avg = T.mean(scale[10, :, 0])
    scale_x_10_avg.name = 'scale_x_10_avg'
    scale_x_20_avg = T.mean(scale[-1, :, 0])
    scale_x_20_avg.name = 'scale_x_20_avg'

    monitorings = [
        error_rate, location_x_0_avg, location_x_10_avg, location_x_20_avg,
        scale_x_0_avg, scale_x_10_avg, scale_x_20_avg
    ]
    model.monitorings = monitorings

    return model
Example #12
def test_dataset_evaluators():
    X = theano.tensor.vector('X')
    Y = theano.tensor.vector('Y')

    data = [
        numpy.arange(1, 7, dtype=theano.config.floatX).reshape(3, 2),
        numpy.arange(11, 17, dtype=theano.config.floatX).reshape(3, 2)
    ]
    data_stream = IterableDataset(dict(X=data[0],
                                       Y=data[1])).get_example_stream()

    validator = DatasetEvaluator([
        CrossEntropy(requires=[X, Y], name="monitored_cross_entropy0"),
        # to test two same quantities and make sure that state will be reset
        CrossEntropy(requires=[X, Y], name="monitored_cross_entropy1"),
        CategoricalCrossEntropy().apply(X, Y),
    ])
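    # evaluate() runs over the whole stream and returns the aggregated
    # values keyed by quantity name.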
    values = validator.evaluate(data_stream)
    numpy.testing.assert_allclose(values['monitored_cross_entropy1'],
                                  values['categoricalcrossentropy_apply_cost'])
Example #13
def test_softmax_vector():
    x = tensor.matrix('x')
    y = tensor.lvector('y')

    softmax_out = Softmax().apply(x)
    cost = CategoricalCrossEntropy().apply(y, softmax_out)

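    # The Softmax brick can also compute the cross-entropy directly from the
    # pre-softmax scores; the test checks it matches the two-step version.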
    cost_stable = Softmax().categorical_cross_entropy(y, x)

    softmax_cost_func = function([x, y], cost)
    softmax_cost_stable_func = function([x, y], cost_stable)

    batch_size = 100
    x_size = 10

    rng = numpy.random.RandomState(1)
    x_val = rng.randn(batch_size, x_size).astype(theano.config.floatX)
    y_val = rng.randint(low=0, high=x_size, size=(batch_size))
    softmax_cost = softmax_cost_func(x_val, y_val)
    softmax_cost_stable = softmax_cost_stable_func(x_val, y_val)

    assert_allclose(softmax_cost, softmax_cost_stable)
Example #14
def test_softmax_matrix():
    x = tensor.matrix('x')
    y = tensor.matrix('y')

    softmax_out = Softmax().apply(x)
    cost = CategoricalCrossEntropy().apply(y, softmax_out)

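    # Same comparison, but with target distributions (each row of y sums to
    # one) instead of integer labels.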
    cost_stable = Softmax().categorical_cross_entropy(y, x)

    softmax_cost_func = function([x, y], cost)
    softmax_cost_stable_func = function([x, y], cost_stable)

    batch_size = 2
    x_size = 2

    rng = numpy.random.RandomState(1)
    x_val = rng.randn(batch_size, x_size).astype(theano.config.floatX)
    y_val_us = rng.uniform(size=(batch_size,
                                 x_size)).astype(theano.config.floatX)
    y_val = y_val_us / numpy.expand_dims(y_val_us.sum(axis=1), axis=1)
    softmax_cost = softmax_cost_func(x_val, y_val)
    softmax_cost_stable = softmax_cost_stable_func(x_val, y_val)

    assert_allclose(softmax_cost, softmax_cost_stable, rtol=1e-5)
Example #15
probs = Softmax().apply(n5)

statistics_list=[(M1,S1,a1), (M2,S2,a2), (M3,S3,a3), (M4,S4,a4), (M5,S5,a5)]

# Initialize the variables:
# for each pair (M, S) in statistics_list,
# compute M and S over the whole training data.

if normalization == 'bn2':
    for m,s,var in statistics_list:
        var.tag.aggregation_scheme = MeanAndVariance(var, var.shape[0], axis = 0)
        init_mn, init_var = DatasetEvaluator([var]).evaluate(stream_train)[var.name]
        m.set_value(init_mn.astype(floatX))
        s.set_value(sqrt(init_var).astype(floatX))

cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
cost.name = 'cost'
error_rate = MisclassificationRate().apply(y.flatten(), probs)
error_rate.name = 'error_rate'

cg = ComputationGraph([cost])
    
parameters = cg.parameters
# add gradient descent to M,S
if normalization == 'bn2':
    for m,s,var in statistics_list:
        parameters.extend([m,s])

algorithm = GradientDescent(
    cost=cost, parameters=parameters, step_rule=Adam(0.01))
Example #16
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme
from fuel.transformers import Flatten

# Construct the model
mlp = MLP(activations=[Tanh(), Softmax()],
          dims=[784, 100, 10],
          weights_init=IsotropicGaussian(0.01),
          biases_init=Constant(0))
mlp.initialize()

# Calculate the loss function
x = T.matrix('features')
y = T.lmatrix('targets')
y_hat = mlp.apply(x)
cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
error_rate = MisclassificationRate().apply(y.flatten(), y_hat)

# load training data using Fuel
mnist_train = MNIST("train")
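# Flatten the (1, 28, 28) image batches into 784-dimensional vectors to
# match the MLP's input dimension.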
train_stream = Flatten(
    DataStream.default_stream(dataset=mnist_train,
                              iteration_scheme=SequentialScheme(
                                  mnist_train.num_examples, 128)), )

# load testing data
mnist_test = MNIST("test")
test_stream = Flatten(
    DataStream.default_stream(dataset=mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 1024)), )
Example #17
def main():
    feature_maps = [20, 50]
    mlp_hiddens = [50]
    conv_sizes = [5, 5]
    pool_sizes = [3, 3]
    save_to = "DvC.pkl"
    batch_size = 500
    image_size = (32, 32)
    output_size = 2
    learningRate = 0.1
    num_epochs = 10
    num_batches = None
    host_plot = 'http://*****:*****'

    extensions.append(
        Plot('%s %s @ %s' %
             ('CNN ', datetime.datetime.now(), socket.gethostname()),
             channels=[['valid_cost', 'valid_error_rate'],
                       ['train_total_gradient_norm']],
             after_epoch=True,
             server_url=host_plot))

    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         stream_data_train,
                         model=model,
                         extensions=extensions)

    main_loop.run()
Example #18
def main(save_to,
         num_epochs,
         feature_maps=None,
         mlp_hiddens=None,
         conv_sizes=None,
         pool_sizes=None,
         batch_size=500):
    if feature_maps is None:
        feature_maps = [20, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations,
                    1,
                    image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info(
        "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        logging.info("Layer {} dim: {} {} {}".format(i,
                                                     *layer.get_dim('output')))

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = named_copy(CategoricalCrossEntropy().apply(y.flatten(), probs),
                      'cost')
    error_rate = named_copy(MisclassificationRate().apply(y.flatten(), probs),
                            'error_rate')

    cg = ComputationGraph([cost, error_rate])

    mnist_train = MNIST(("train", ))
    mnist_train_stream = DataStream.default_stream(
        mnist_train,
        iteration_scheme=ShuffledScheme(mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test", ))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size))

    # Train with simple SGD
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate],
                             mnist_test_stream,
                             prefix="test"),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        ProgressBar(),
        Printing()
    ]

    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         mnist_train_stream,
                         model=model,
                         extensions=extensions)

    main_loop.run()
Example #19
File: run.py Project: chargen/net-intent
def main(save_to, num_epochs,
         regularization=0.0003, subset=None, num_batches=None,
         histogram=None, resume=False):
    batch_size = 500
    output_size = 10
    convnet = create_lenet_5()
    layers = convnet.layers

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    components = (ComponentwiseCrossEntropy().apply(y.flatten(), probs)
            .copy(name='components'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))
    confusion = (ConfusionMatrix().apply(y.flatten(), probs)
                  .copy(name='confusion'))
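    # Sum the confusion matrix over batches instead of averaging it.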
    confusion.tag.aggregation_scheme = Sum(confusion)

    cg = ComputationGraph([cost, error_rate, components])

    # Apply regularization to the cost
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    l2_norm = sum([(W ** 2).sum() for W in weights])
    l2_norm.name = 'l2_norm'
    cost = cost + regularization * l2_norm
    cost.name = 'cost_with_regularization'

    if subset:
        start = 30000 - subset // 2
        mnist_train = MNIST(("train",), subset=slice(start, start+subset))
    else:
        mnist_train = MNIST(("train",))
    mnist_train_stream = DataStream.default_stream(
        mnist_train, iteration_scheme=ShuffledScheme(
            mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test",))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=ShuffledScheme(
            mnist_test.num_examples, batch_size))

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=AdaDelta(decay_rate=0.99))

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  DataStreamMonitoring(
                      [cost, error_rate, confusion],
                      mnist_test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate, l2_norm,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=components,
            parameters=cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(save_to, True, True))

    model = Model(cost)

    main_loop = MainLoop(
        algorithm,
        mnist_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
Example #20
    def create_model(self, symbols_num = 500):

        # Hyperparameters

        # The dimension of the hidden state of the GRUs in each direction.
        hidden_states = self.args.encoder_hidden_dims
        # Dimension of the word-embedding space
        embedding_dims = self.args.source_embeddings_dim


        ###################
        # Declaration of the Theano variables that come from the data stream
        ###################

        # The context document.
        context_bt = tt.lmatrix('context')
        # Context document mask, used to distinguish the real symbols of the sequence from the padding symbols at the end
        context_mask_bt = tt.matrix('context_mask')

        # The question
        question_bt = tt.lmatrix('question')
        question_mask_bt = tt.matrix('question_mask')

        # The correct answer
        y = tt.lmatrix('answer')
        y = y[:,0] # originally answers are in a 2d matrix, here we convert it to a vector

        # The candidates among which the answer is selected
        candidates_bi = tt.lmatrix("candidates")
        candidates_bi_mask = tt.matrix("candidates_mask")



        ###################
        # Network's components
        ###################

        # Lookup table with randomly initialized word embeddings
        lookup = LookupTable(symbols_num, embedding_dims, weights_init=Uniform(width=0.2))

        # bidirectional encoder that translates context
        context_encoder = self.create_bidi_encoder("context_encoder", embedding_dims, hidden_states)

        # bidirectional encoder for question
        question_encoder = self.create_bidi_encoder("question_encoder", embedding_dims, hidden_states)

        # Initialize the components (where not done upon creation)
        lookup.initialize()



        ###################
        # Wiring the components together
        #
        # Where present, the 3 letters at the end of the variable name identify its dimensions:
        # b ... position of the example within the batch
        # t ... position of the word within the document/question
        # f ... features of the embedding vector
        ###################

        ### Read the context document
        # Map token indices to word embeddings
        context_embedding_tbf = lookup.apply(context_bt.T)

        # Read the embedded context document using the bidirectional GRU and produce the contextual embedding of each word
        memory_encoded_btf = context_encoder.apply(context_embedding_tbf, context_mask_bt.T).dimshuffle(1,0,2)
        memory_encoded_btf.name = "memory_encoded_btf"

        ### Correspondingly, read the query
        x_embedded_tbf = lookup.apply(question_bt.T)
        x_encoded_btf = question_encoder.apply(x_embedded_tbf, question_mask_bt.T).dimshuffle(1,0,2)
        # The query encoding is a concatenation of the final states of the forward and backward GRU encoder
        x_forward_encoded_bf = x_encoded_btf[:,-1,0:hidden_states]
        x_backward_encoded_bf = x_encoded_btf[:,0,hidden_states:hidden_states*2]
        query_representation_bf = tt.concatenate([x_forward_encoded_bf,x_backward_encoded_bf],axis=1)

        # Compute the attention on each word in the context as a dot product of its contextual embedding and the query
        mem_attention_presoft_bt = tt.batched_dot(query_representation_bf, memory_encoded_btf.dimshuffle(0,2,1))

        # TODO is this pre-masking necessary?
        mem_attention_presoft_masked_bt = tt.mul(mem_attention_presoft_bt,context_mask_bt)

        # Normalize the attention using softmax
        mem_attention_bt = SoftmaxWithMask(name="memory_query_softmax").apply(mem_attention_presoft_masked_bt,context_mask_bt)

        if self.args.weighted_att:
            # compute weighted attention over original word vectors
            att_weighted_responses_bf = theano.tensor.batched_dot(mem_attention_bt, context_embedding_tbf.dimshuffle(1,0,2))


            # compare desired response to all candidate responses
            # select relevant candidate answer words
            candidates_embeddings_bfi = lookup.apply(candidates_bi).dimshuffle(0,2,1)

            # convert it to output symbol probabilities
            y_hat_presoft = tt.batched_dot(att_weighted_responses_bf, candidates_embeddings_bfi)
            y_hat = SoftmaxWithMask(name="output_softmax").apply(y_hat_presoft,candidates_bi_mask)

        else:
            # Sum the attention of each candidate word across the whole context document;
            # this is the key innovation of the model.

            # TODO: Get rid of sentence-by-sentence processing?
            # TODO: Rewrite into matrix notation instead of scans?
            def sum_prob_of_word(word_ix, sentence_ixs, sentence_attention_probs):
                word_ixs_in_sentence = tt.eq(sentence_ixs,word_ix).nonzero()[0]
                return sentence_attention_probs[word_ixs_in_sentence].sum()

            def sum_probs_single_sentence(candidate_indices_i, sentence_ixs_t, sentence_attention_probs_t):
                result, updates = theano.scan(
                    fn=sum_prob_of_word,
                    sequences=[candidate_indices_i],
                    non_sequences=[sentence_ixs_t, sentence_attention_probs_t])
                return result

            def sum_probs_batch(candidate_indices_bt,sentence_ixs_bt, sentence_attention_probs_bt):
                result, updates = theano.scan(
                    fn=sum_probs_single_sentence,
                    sequences=[candidate_indices_bt, sentence_ixs_bt, sentence_attention_probs_bt],
                    non_sequences=None)
                return result

            # Sum the attention of each candidate word across the whole context document
            y_hat = sum_probs_batch(candidates_bi, context_bt, mem_attention_bt)
        y_hat.name = "y_hat"

        # We use the convention that ground truth is always at index 0, so the following are the target answers
        y = y.zeros_like()

        # We use Cross Entropy as the training objective
        cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
        cost.name = "cost"


        predicted_response_index = tt.argmax(y_hat,axis=1)
        accuracy = tt.eq(y,predicted_response_index).mean()
        accuracy.name = "accuracy"

        return cost, accuracy, mem_attention_bt, y_hat, context_bt, candidates_bi, candidates_bi_mask, y, context_mask_bt, question_bt, question_mask_bt
Example #21
########## hyper parameters###########################################
# We push initialization config to set different initialization schemes
convnet.push_initialization_config()

convnet.layers[0].weights_init = Uniform(width=0.2)
convnet.layers[1].weights_init = Uniform(width=0.2)
convnet.layers[2].weights_init = Uniform(width=0.2)
convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=0.2)
convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=0.2)
convnet.initialize()

#########################################################
#Generate output and error signal
predict = convnet.apply(x)
cost = CategoricalCrossEntropy().apply(y.flatten(), predict).copy(name='cost')
error = MisclassificationRate().apply(y.flatten(), predict)
# Little trick to plot the error rate in two different plots (the same
# variable cannot be used twice in the plots, for an unknown reason)
error_rate = error.copy(name='error_rate')
error_rate2 = error.copy(name='error_rate2')
cg = ComputationGraph([cost, error_rate])
########### ALGORITHM of training#############
algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=Adam(learning_rate=0.0005))
extensions = [
    Timing(),
    FinishAfter(after_n_epochs=num_epochs),
    DataStreamMonitoring([cost, error_rate, error_rate2],
                         stream_valid,
                         prefix="valid"),
Example #22
def main(save_to, num_epochs,
         weight_decay=0.0001, noise_pressure=0, subset=None, num_batches=None,
         batch_size=None, histogram=None, resume=False):
    output_size = 10

    prior_noise_level = -10
    noise_step_rule = Scale(1e-6)
    noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX))
    convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True,
            noise_rate=noise_rate,
            prior_noise_level=prior_noise_level)

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
            .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                  .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                  .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)

    # train_cost, train_error_rate, train_components = train_cg.outputs

    with batch_normalization(convnet):
        with training_noise(convnet):
            train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(y.flatten(),
                train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(y.flatten(),
                train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                train_error_rate, train_components])
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
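    # Keep an exponential moving average of the batch-normalization
    # population statistics (decay 0.9).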
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                for p, m in population_updates]

    # for annealing
    nit_penalty = theano.shared(numpy.asarray(noise_pressure, dtype=theano.config.floatX))
    nit_penalty.name = 'nit_penalty'

    # Compute noise rates for training graph
    train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables)
    train_mean_log_sigma = tensor.concatenate([n.flatten() for n in train_logsigma]).mean()
    train_mean_log_sigma.name = 'mean_log_sigma'
    train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables)
    train_nit_rate = tensor.concatenate([n.flatten() for n in train_nits]).mean()
    train_nit_rate.name = 'nit_rate'
    train_nit_regularization = nit_penalty * train_nit_rate
    train_nit_regularization.name = 'nit_regularization'

    # Apply regularization to the cost
    trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])(
            train_cg.parameters)
    mask_parameters = [p for p in trainable_parameters
            if get_brick(p).name == 'mask']
    noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters)
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    nonmask_weights = [p for p in weights if get_brick(p).name != 'mask']
    l2_norm = sum([(W ** 2).sum() for W in nonmask_weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = weight_decay * l2_norm
    l2_regularization.name = 'l2_regularization'

    # Test version of the cost
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + l2_regularization + train_nit_regularization
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
        which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 128
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test,
        iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # Create a step rule that reduces the learning rate of noise
    scale_mask = Restrict(noise_step_rule, mask_parameters)
    step_rule = CompositeRule([scale_mask, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=trainable_parameters,
        step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    #,
    #    theano_func_kwargs={
    #        'mode': NanGuardMode(
    #            nan_is_error=True, inf_is_error=True, big_is_error=True)})

    exp_name = save_to.replace('.%d', '')

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),     # Warm up with 0.01 learning rate
                      (50, 0.1),     # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  EpochSchedule(noise_step_rule.learning_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4)
                  ]),
                  EpochSchedule(noise_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4),
                      # (6, 3e-4),
                      # (8, 1e-3), # Causes nit rate to jump
                      # (10, 3e-3),
                      # (12, 1e-2),
                      # (15, 3e-2),
                      # (19, 1e-1),
                      # (24, 3e-1),
                      # (30, 1)
                  ]),
                  NoiseExtension(
                      noise_parameters=noise_parameters),
                  NoisyDataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      noise_parameters=noise_parameters,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate, train_nit_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       train_nit_regularization,
                       momentum.learning_rate,
                       train_mean_log_sigma,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                      # after_epoch=True),
                  Plot('Training performance for ' + exp_name,
                      channels=[
                          ['train_cost_with_regularization',
                           'train_cost_without_regularization',
                           'train_nit_regularization',
                           'train_l2_regularization'],
                          ['train_error_rate'],
                          ['train_total_gradient_norm'],
                          ['train_mean_log_sigma'],
                      ],
                      every_n_batches=17),
                  Plot('Test performance for ' + exp_name,
                      channels=[[
                          'train_error_rate',
                          'test_error_rate',
                          ]],
                      after_epoch=True),
                  EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            parameters=cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(exp_name, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
Example #23
def main(save_to, model, train, test, num_epochs, input_size=(150, 150),
         learning_rate=0.01, batch_size=50, num_batches=None,
         flatten_stream=False):
    """
    save_to : where to save the trained model
    model : the model given as input; it must already be initialised (works with convnet and mlp)
    input_size : the shape the input image is resized to (before flattening is applied if flatten_stream is True)
    """
    if flatten_stream:
        x = tensor.matrix('image_features')
    else:
        x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')

    #Data augmentation
    #insert data augmentation here 
    
    #Generating stream
    train_stream = DataStream.default_stream(
        train,
        iteration_scheme=ShuffledScheme(train.num_examples, batch_size)
    )

    test_stream = DataStream.default_stream(
        test,
        iteration_scheme=ShuffledScheme(test.num_examples, batch_size)
    )
    
    
    #Reshaping procedure
    #Add a crop option in scikitresize so that the image is not deformed
    
    #Resize to desired square shape
    train_stream = ScikitResize(train_stream, input_size, which_sources=('image_features',))
    test_stream = ScikitResize(test_stream, input_size, which_sources=('image_features',))
    
    #Flattening the stream
    if flatten_stream is True:
        train_stream = Flatten(train_stream, which_sources=('image_features',))
        test_stream = Flatten(test_stream, which_sources=('image_features',))
    
    # Apply input to model
    probs = model.apply(x)
    
    #Defining cost and various indices to watch
    #print(probs)
    #cost = SquaredError().apply(y.flatten(),probs)

    cost = CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost')
    error_rate = MisclassificationRate().apply(y.flatten(), probs).copy(
            name='error_rate')

    #Building Computation Graph
    cg = ComputationGraph([cost, error_rate])

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=learning_rate))
    
    #Defining extensions
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  TrainingDataMonitoring([cost, error_rate,aggregation.mean(algorithm.total_gradient_norm)], prefix="train", every_n_batches=5),
                  DataStreamMonitoring([cost, error_rate],test_stream,prefix="test", every_n_batches=25),
                  Checkpoint(save_to),
                  ProgressBar(),
                  Printing(every_n_batches=5)]

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.


    model = Model(cost)

    main_loop = MainLoop(
        algorithm,
        train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()
Example #24
    def start(self):
        xx = T.matrix('features', config.floatX)
        yy = T.imatrix('targets')
        zm = BNUM*(xx.shape[0]//BNUM)
        x = xx[:zm].reshape((BNUM, zm//BNUM, xx.shape[1])).dimshuffle(1, 0, 2)
        y = yy[:zm].reshape((BNUM, zm//BNUM)).dimshuffle(1, 0)
        # x = xx[:zm].reshape((zm//16, 16, xx.shape[1]))
        # y = yy[:zm].reshape((zm//16, 16))

        DIMS = [108*5, 200, 200, 200, LABEL]
        NUMS = [1, 1, 1, 1, 1]
        # DIMS = [108*5, 48]
        # NUMS = [1, 1]
        FUNCS = [
            Rectifier, 
            Rectifier, 
            Rectifier, 
            # Rectifier, 
            # Rectifier, 
            # Maxout(num_pieces=5),
            # Maxout(num_pieces=5),
            # Maxout(num_pieces=5),
            # SimpleRecurrent,
            # SimpleRecurrent,
            # SimpleRecurrent,
            # LSTM,
            # LSTM,
            # LSTM,

            # SequenceGenerator,

            # Softmax,
            None,
        ]

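        # Helper that builds layer i: a Linear projection feeding the chosen
        # brick (plain activation, SimpleRecurrent, or LSTM), with Gaussian
        # weights of std sdim ** -0.5.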
        def lllistool(i, inp, func):
            if func == LSTM:
                NUMS[i+1] *= 4
            sdim = DIMS[i]
            if func == SimpleRecurrent or func == LSTM:
                sdim = DIMS[i] + DIMS[i+1]
            l = Linear(input_dim=DIMS[i], output_dim=DIMS[i+1] * NUMS[i+1], 
                       weights_init=IsotropicGaussian(std=sdim**(-0.5)), 
                       biases_init=IsotropicGaussian(std=sdim**(-0.5)),
                       name='Lin{}'.format(i))
            l.initialize()
            if func == SimpleRecurrent:
                gong = func(dim=DIMS[i+1], activation=Rectifier(), weights_init=IsotropicGaussian(std=sdim**(-0.5)))
                gong.initialize()
                ret = gong.apply(l.apply(inp))
            elif func == LSTM:
                gong = func(dim=DIMS[i+1], activation=Tanh(), weights_init=IsotropicGaussian(std=sdim**(-0.5)))
                gong.initialize()
                print(inp)
                ret, _ = gong.apply(
                    l.apply(inp), 
                    T.zeros((inp.shape[1], DIMS[i+1])),
                    T.zeros((inp.shape[1], DIMS[i+1])),
                )
            elif func == SequenceGenerator:
                gong = func(
                    readout=None, 
                    transition=SimpleRecurrent(dim=100, activation=Rectifier(), weights_init=IsotropicGaussian(std=0.1)))
                ret = None
            elif func == None:
                ret = l.apply(inp)
            else:
                gong = func()
                ret = gong.apply(l.apply(inp))
            return ret

        oup = x
        for i in range(len(DIMS)-1):
            oup = lllistool(i, oup, FUNCS[i])
        y_hat = oup

        y_rsp = y.reshape((y.shape[0]*y.shape[1],))
        y_dsf_rsp = y.dimshuffle(1, 0).reshape((y.shape[0]*y.shape[1],))
        yh_rsp = y_hat.reshape((y_hat.shape[0]*y_hat.shape[1], y_hat.shape[2]))
        yh_dsf_rsp = y_hat.dimshuffle(1, 0, 2).reshape((y_hat.shape[0]*y_hat.shape[1], y_hat.shape[2]))
        sfmx = Softmax().apply(yh_rsp)

        # cost = CategoricalCrossEntropy().apply(y, y_hat).astype(config.floatX)

        # j, wlh = Yimumu(y_hat, y)
        # cost = CategoricalCrossEntropy().apply(y_rsp, sfmx) + j
        cost = CategoricalCrossEntropy().apply(y_rsp, sfmx)
        # cost_p = cost_p.astype(config.floatX)
        # cost = CTC_cost(y, y_hat)
        cost = cost.astype(config.floatX)

        cg = ComputationGraph(cost)
        # cg_p = ComputationGraph(cost_p)
        orig_cg = cg
        ips = VariableFilter(roles=[INPUT])(cg.variables)
        ops = VariableFilter(roles=[OUTPUT])(cg.variables)
        # print(ips, ops)
        # cg = apply_dropout(cg, ips[0:2:1], 0.2)
        # cg = apply_dropout(cg, ips[2:-2:1], 0.5)
        # cost = cg.outputs[0].astype(config.floatX)

        cost.name = 'cost'

        mps = theano.shared(np.array([ph2id(ph48239(id2ph(t))) for t in range(48)]))
        # yh_dsf_rsp = theano.printing.Print('YapYapYap')(yh_dsf_rsp)
        # z_hat = T.argmax(yh_dsf_rsp[:,:-1], axis=1)
        z_hat = T.argmax(yh_dsf_rsp, axis=1)
        # z_hat = theano.printing.Print('Yap')(z_hat)
        # z_hat = Yimumu_Decode()(y_hat, wlh)
        z_hat_hat = CTC_Decode()(y_hat)

        y39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[y_dsf_rsp])
        y_hat39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[z_hat])
        y_hat_hat39 = y_hat39
        # y_hat_hat39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[z_hat_hat])
        # trm = TrimOp()(y_hat_hat39)
        # trm = trm[1:1+trm[0]]
        # trm = theano.printing.Print('Trm')(trm)

        lost01 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(config.floatX)
        lost01.name = '0/1 loss'
        lost23 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(config.floatX)
        lost23.name = '2/3 loss'

        edit01 = EditDistance()(y39, y_hat_hat39).astype(config.floatX) #+ T.sum(trm) * 1E-10
        # edit01 = edit01.astype(config.floatX)
        edit01.name = '0/1 edit'
        edit23 = EditDistance()(y39, y_hat_hat39).astype(config.floatX)
        edit23.name = '2/3 edit'


        Ws = cg.parameters
        # Ws = Ws + [wlh]
        print(list(Ws))
        norms = sum(w.norm(2) for w in Ws)
        norms = norms.astype(config.floatX)
        norms.name = 'norms'
        path = pjoin(PATH['fuel'], 'train_train.hdf5')
        data = H5PYDataset(path, which_set='train', load_in_memory=True, subset=slice(0, 100000))
        # data = H5PYDataset(path, which_set='train', load_in_memory=True)
        data_v = H5PYDataset(pjoin(PATH['fuel'], 'train_validate.hdf5'), which_set='validate', load_in_memory=True)
        num = data.num_examples
        data_stream = DataStream(data, iteration_scheme=ShuffledScheme(
                        num, batch_size=SLEN*BNUM))
        data_stream_v = DataStream(data_v, iteration_scheme=SequentialScheme(
                        data_v.num_examples, batch_size=SLEN*BNUM))
        algo = GradientDescent(cost=cost, params=Ws, step_rule=CompositeRule([
            Momentum(0.005, 0.9)
            # AdaDelta()
        ]))
        # algo_p = GradientDescent(cost=cost_p, params=cg_p.parameters, step_rule=CompositeRule([
            # Momentum(0.01, 0.9)
            # # AdaDelta()
        # ]))
        monitor = DataStreamMonitoring( variables=[cost, lost01, edit01, norms],
                data_stream=data_stream)
        monitor_v = DataStreamMonitoring( variables=[lost23, edit23],
                data_stream=data_stream_v)
        plt = Plot('AlpYap', channels=[['0/1 loss', '2/3 loss'], ['0/1 edit', '2/3 edit']], after_epoch=True)

        # main_loop_p = MainLoop(data_stream = data_stream, 
                # algorithm=algo_p, 
                # extensions=[monitor, monitor_v, FinishAfter(after_n_epochs=10), Printing(), plt])
        # main_loop_p.run()

        main_loop = MainLoop(data_stream = data_stream, 
                algorithm=algo, 
                extensions=[monitor, monitor_v, FinishAfter(after_n_epochs=2000), Printing(), plt])

        main_loop.run()

        pfile = open('zzz.pkl', 'wb')
        pickle.dump(orig_cg, pfile)
        # pickle.dump(wlh, pfile)
        pfile.close()
        
        ################

        test_feat = np.load(pjoin(PATH['numpy'], 'train_test_features.npy')).astype(config.floatX)
        func = theano.function([xx], y_hat.astype(config.floatX))
        test_hat = []
        for i in range(19):
            tmp = func(test_feat[i*10000:(i+1)*10000])
            tmp = tmp.transpose((1, 0, 2)).reshape((tmp.shape[0]*tmp.shape[1], tmp.shape[2]))
            test_hat.append(tmp)
        test_hat = np.concatenate(test_hat, axis=0)
        test_hat = np.concatenate((test_hat, np.zeros((2, LABEL))), axis=0)

        alpha = T.tensor3(config.floatX)
        beta = alpha.argmax(axis=2)
        # beta = alpha[:,:,:-1].argmax(axis=2)
        # beta = Yimumu_Decode()(alpha, wlh)
        # beta = CTC_Decode()(alpha)
        func2 = theano.function([alpha], beta)

        lens = []
        tags = []
        with shelve.open(SHELVE['test']) as f:
            names = f['names']
            for n in names:
                lens.append(len(f[n]))
                for i in range(lens[-1]):
                    tags.append(n+'_'+str(i+1))

        seq = []
        seq2 = []
        nowcnt = 0
        for i in lens:
            nxt = nowcnt + i
            cur_hat = test_hat[nowcnt:nxt].reshape((i, 1, LABEL)).astype(config.floatX)
            nowcnt = nxt
            fc2 = func2(cur_hat).flatten()
            fc3 = []
            fc4 = []
            for j in fc2:
                fc3.append(ph48239(id2ph(j)))
                fc4.append(ph2c(ph48239(id2ph(j))))
            seq.append(fc3)
            seq2.append(''.join(trim(fc4)))

        seq_flat = np.concatenate(seq)
        with open('hw1_outz.txt', 'w') as f:
            f.write('id,prediction\n')
            for t, i in zip(tags, seq_flat):
                f.write(t+','+i+'\n')

        with open('hw2_outz.txt', 'w') as f:
            f.write('id,phone_sequence\n')
            for n, i in zip(names, seq2):
                f.write(n+','+i+'\n')
Example #25
0
    Convolutional(filter_size=(1, 1), num_filters=1024, name='Convx3'),
    Rectifier(),
    MaxPooling((2, 2), name='MaxPol2'),
    Convolutional(filter_size=(1, 1), num_filters=2, name='Convx4'),
    Rectifier(),
])
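# First auxiliary classifier: run the 1x1-conv/pooling stack above on the 512-channel 10x10 feature map, flatten it, and attach a softmax with its own cross-entropy cost.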
conv_sequence1 = ConvolutionalSequence(conv_layers1,
                                       num_channels=512,
                                       image_size=(10, 10),
                                       weights_init=Orthogonal(),
                                       use_bias=False,
                                       name='ConvSeq3')
conv_sequence1.initialize()
out_soft1 = Flattener(name='Flatt1').apply(conv_sequence1.apply(out5))
predict1 = NDimensionalSoftmax(name='Soft1').apply(out_soft1)
cost1 = CategoricalCrossEntropy(name='Cross1').apply(
    y.flatten(), predict1).copy(name='cost1')

#SECOND SOFTMAX
conv_layers2 = list([
    MaxPooling((2, 2), name='MaxPol2'),
    Convolutional(filter_size=(1, 1), num_filters=128, name='Convx21'),
    Rectifier(),
    MaxPooling((2, 2), name='MaxPol11'),
    Convolutional(filter_size=(1, 1), num_filters=1024, name='Convx31'),
    Rectifier(),
    MaxPooling((2, 2), name='MaxPol21'),
    Convolutional(filter_size=(1, 1), num_filters=2, name='Convx41'),
    Rectifier(),
])
conv_sequence2 = ConvolutionalSequence(conv_layers2,
                                       num_channels=832,
Example #26
0
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs,
          test_cost, experiment_path, initialization, init_width, weight_noise,
          z_prob, z_prob_states, z_prob_cells, drop_prob_igates,
          ogates_zoneout, batch_size, stoch_depth, share_mask, gaussian_drop,
          rnn_type, num_layers, norm_cost_coeff, penalty, testing, seq_len,
          decrease_lr_after_epoch, lr_decay, **kwargs):

    print '.. PTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    ###########################################
    #
    # LOAD DATA
    #
    ###########################################

    def onehot(x, numclasses=None):
        """ Convert integer encoding for class-labels (starting with 0 !)
            to one-hot encoding.
            The output is an array whose shape is the shape of the input array
            plus an extra dimension, containing the 'one-hot'-encoded labels.
        """
        if x.shape == ():
            x = x[None]
        if numclasses is None:
            numclasses = x.max() + 1
        result = numpy.zeros(list(x.shape) + [numclasses], dtype="int")
        z = numpy.zeros(x.shape, dtype="int")
        for c in range(numclasses):
            z *= 0
            z[numpy.where(x == c)] = 1
            result[..., c] += z
        return result.astype(theano.config.floatX)

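    # Word-level Penn Treebank data: integer word ids with a 10k-word vocabulary.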
    alphabetsize = 10000
    data = np.load('penntree_char_and_word.npz')
    trainset = data['train_words']
    validset = data['valid_words']
    testset = data['test_words']

    if testing:
        trainset = trainset[:3000]
        validset = validset[:3000]

    if share_mask:
        if not z_prob:
            raise ValueError('z_prob must be provided when using share_mask')
        if z_prob_cells or z_prob_states:
            raise ValueError(
                'z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)'
            )
        z_prob_cells = z_prob
        # we don't want to actually use these masks, so this is to debug
        z_prob_states = None
    else:
        if z_prob:
            raise ValueError('z_prob is only used with share_mask')
        z_prob_cells = z_prob_cells or '1'
        z_prob_states = z_prob_states or '1'


#    rng = np.random.RandomState(seed)

###########################################
#
# MAKE STREAMS
#
###########################################

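    # Cut each split into (time, batch, seq_len) blocks and attach the sampled zoneout masks as extra stream sources.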
    def prep_dataset(dataset):
        dataset = dataset[:(len(dataset) - (len(dataset) %
                                            (seq_len * batch_size)))]
        dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))

        stream = DataStream(
            IndexableDataset(indexables=OrderedDict([('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells,
                                   drop_prob_igates, layer_size, num_layers,
                                   False, stoch_depth, share_mask,
                                   gaussian_drop, alphabetsize)
        stream.sources = ('data', ) * 3 + stream.sources + (
            'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream, )

    train_stream, = prep_dataset(trainset)
    valid_stream, = prep_dataset(validset)
    test_stream, = prep_dataset(testset)

    ####################

    data = train_stream.get_epoch_iterator(as_dict=True).next()

    ####################

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print '.. building model'

    x = T.tensor3('data')
    y = x
    zoneouts_states = T.tensor3('zoneouts_states')
    zoneouts_cells = T.tensor3('zoneouts_cells')
    zoneouts_igates = T.tensor3('zoneouts_igates')

    x.tag.test_value = data['data']
    zoneouts_states.tag.test_value = data['zoneouts_states']
    zoneouts_cells.tag.test_value = data['zoneouts_cells']
    zoneouts_igates.tag.test_value = data['zoneouts_igates']

    if init_width and not initialization == 'uniform':
        raise ValueError('Width is only for uniform init, whassup?')

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=init_width)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    if rnn_type.lower() == 'lstm':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size * 4,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropLSTM(dim=layer_size,
                     weights_init=weights_init,
                     activation=Tanh(),
                     model_type=6,
                     name='rnn%d' % l,
                     ogates_zoneout=ogates_zoneout) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'gru':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size * 3,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropGRU(dim=layer_size,
                    weights_init=weights_init,
                    activation=Tanh(),
                    name='rnn%d' % l) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'srnn':  # FIXME!!! make ReLU
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropSimpleRecurrent(dim=layer_size,
                                weights_init=weights_init,
                                activation=Rectifier(),
                                name='rnn%d' % l) for l in range(num_layers)
        ]
    else:
        raise NotImplementedError

    hid_to_out = Linear(layer_size,
                        alphabetsize,
                        name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    for layer in in_to_hids:
        layer.initialize()
    for layer in recurrent_layers:
        layer.initialize()
    hid_to_out.initialize()

    layer_input = x  #in_to_hid.apply(x)

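    # Stack the recurrent layers; the final states/cells of each batch are kept in shared variables and carried over to the next batch (hooked up below via algorithm.add_updates).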
    init_updates = OrderedDict()
    for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)):
        rnn_embedding = in_to_hid.apply(layer_input)
        if rnn_type.lower() == 'lstm':
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            cells_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name, cells_init.name = "states_init", "cells_init"
            states, cells = layer.apply(
                rnn_embedding,
                zoneouts_states[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_cells[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_igates[:, :, l * layer_size:(l + 1) * layer_size],
                states_init, cells_init)
            init_updates.update([(states_init, states[-1]),
                                 (cells_init, cells[-1])])
        elif rnn_type.lower() in ['gru', 'srnn']:
            # untested!
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name = "states_init"
            states = layer.apply(rnn_embedding, zoneouts_states,
                                 zoneouts_igates, states_init)
            init_updates.update([(states_init, states[-1])])
        else:
            raise NotImplementedError
        layer_input = states

    y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1]))
    shape_ = y_hat_pre_softmax.shape
    y_hat = Softmax().apply(y_hat_pre_softmax.reshape((-1, alphabetsize)))

    ####################

    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################

    cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)),
                                           y_hat).copy('cost')

    bpc = (cost / np.log(2.0)).copy(name='bpc')
    perp = T.exp(cost).copy(name='perp')

    cost_train = cost.copy(name='train_cost')
    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # NORM STABILIZER
    #
    ###########################################
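    # Norm stabilizer: penalize step-to-step changes in the L2 norm of the memory cells or hidden states.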
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        return T.sqrt(
            T.maximum(T.sqr(x).sum(axis=axis),
                      numpy.finfo(x.dtype).tiny))

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(
                T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
    elif penalty == 'hids':
        for l in range(num_layers):
            assert 'rnn%d_apply_states' % l in [
                o.name
                for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)
            ]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            for l in range(num_layers):
                if output.name == 'rnn%d_apply_states' % l:
                    norms = _magnitude(output)
                    norm_cost += T.mean(
                        T.sum((norms[1:] - norms[:-1])**2, axis=0) /
                        (seq_len - 1))

    norm_cost.name = 'norm_cost'
    #cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy(
        'cost_train')  #should this be cost_train.outputs[0]? no.

    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # WEIGHT NOISE
    #
    ###########################################

    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')

    model = Model(cost_train)

    learning_rate = float(learning_rate)
    clipping = StepClipping(threshold=np.cast[floatX](clipping))
    if algorithm == 'adam':
        adam = Adam(learning_rate=learning_rate)
        learning_rate = adam.learning_rate
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        learning_rate = rms_prop.learning_rate
        step_rule = CompositeRule([clipping, rms_prop])
    elif algorithm == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        learning_rate = sgd_momentum.learning_rate
        step_rule = CompositeRule([clipping, sgd_momentum])
    elif algorithm == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        learning_rate = sgd.learning_rate
        step_rule = CompositeRule([clipping, sgd])
    else:
        raise NotImplementedError
    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
    # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)})

    algorithm.add_updates(init_updates)

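    # Optional diagnostics: condition numbers and RMS of parameter matrices, useful when tracking down exploding updates.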
    def cond_number(x):
        _, sing_vals, _ = T.nlinalg.svd(x, True, True)
        sing_mags = abs(sing_vals)
        return T.max(sing_mags) / T.min(sing_mags)

    def rms(x):
        return (x * x).mean().sqrt()

    whysplode_cond = []
    whysplode_rms = []
    for i, p in enumerate(init_updates):
        v = p.get_value()
        if v.ndim == 2:
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str,
                                             p.get_value().shape)))))
        whysplode_rms.append(
            rms(p).copy('ini%d:%s_rms(%s)' %
                        (i, p.name, "x".join(map(str,
                                                 p.get_value().shape)))))
    for i, p in enumerate(cg_train.parameters):
        v = p.get_value()
        if v.ndim == 2:
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str,
                                             p.get_value().shape)))))
        whysplode_rms.append(
            rms(p).copy('ini%d:%s_rms(%s)' %
                        (i, p.name, "x".join(map(str,
                                                 p.get_value().shape)))))

    observed_vars = [
        cost_train, cost, bpc, perp, learning_rate,
        aggregation.mean(
            algorithm.total_gradient_norm).copy("gradient_norm_mean")
    ]  # + whysplode_rms

    parameters = model.get_parameter_dict()
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name=name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm"))

    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

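    # Give the validation pass its own copies of the carried-over recurrent state so it does not disturb the training state.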
    dev_inits = [p.clone() for p in init_updates]
    cg_dev = ComputationGraph([cost, bpc, perp] +
                              init_updates.values()).replace(
                                  zip(init_updates.keys(), dev_inits))
    dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3]
    dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:]))

    dev_monitor = DataStreamMonitoring(variables=[dev_cost, dev_bpc, dev_perp],
                                       data_stream=valid_stream,
                                       prefix="dev",
                                       updates=dev_init_updates)

    # no one does this
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

    extensions = []
    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])
    if test_cost:
        test_inits = [p.clone() for p in init_updates]
        cg_test = ComputationGraph([cost, bpc, perp] +
                                   init_updates.values()).replace(
                                       zip(init_updates.keys(), test_inits))
        test_cost, test_bpc, test_perp = cg_test.outputs[:3]
        test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:]))

        test_monitor = DataStreamMonitoring(
            variables=[test_cost, test_bpc, test_perp],
            data_stream=test_stream,
            prefix="test",
            updates=test_init_updates)
        extensions.extend([test_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(
        SaveParams('dev_cost', model, experiment_path, every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    class RollsExtension(TrainingExtension):
        """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """
        def __init__(self, shvars):
            self.shvars = shvars

        def before_epoch(self):
            for v in self.shvars:
                v.set_value(np.roll(v.get_value(), 1, 0))

    extensions.append(
        RollsExtension(init_updates.keys() + dev_init_updates.keys() +
                       (test_init_updates.keys() if test_cost else [])))

    class LearningRateSchedule(TrainingExtension):
        """ Lets you set a number to divide learning rate by each epoch + when to start doing that """
        def __init__(self):
            self.epoch_number = 0

        def after_epoch(self):
            self.epoch_number += 1
            if self.epoch_number > decrease_lr_after_epoch:
                learning_rate.set_value(learning_rate.get_value() / lr_decay)

    if bool(lr_decay) != bool(decrease_lr_after_epoch):
        raise ValueError(
            'Need to define both lr_decay and decrease_lr_after_epoch')
    if lr_decay and decrease_lr_after_epoch:
        extensions.append(LearningRateSchedule())

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)

    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
Example #27
0
o = l.apply(o)

o = Rectifier().apply(o)

l = Linear(input_dim=l.get_dim("output"),
           output_dim=10,
           weights_init=IsotropicGaussian(std=0.01),
           biases_init=IsotropicGaussian(std=0.01))
l.initialize()
o = l.apply(o)

o = Softmax().apply(o)

Y = T.imatrix(name="targets")

cost = CategoricalCrossEntropy().apply(Y.flatten(), o)
cost.name = "cost"

accuracy = 1.0 - MisclassificationRate().apply(Y.flatten(), o)
accuracy.name = "accuracy"

cg = ComputationGraph(cost)
print cg.shared_variables

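# Append an index to every brick name so that parameter names stay unique.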
bricks = [get_brick(var) for var in cg.variables if get_brick(var)]
for i, b in enumerate(bricks):
    b.name += str(i)

step_rule = Adam()
algorithm = GradientDescent(cost=cost, step_rule=step_rule)
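# A minimal sketch of wiring the algorithm above into a Blocks MainLoop.
# It assumes `train_stream` and `test_stream` are Fuel DataStreams that
# provide the 'features'/'targets' sources this graph expects; adjust the
# names and the epoch count to the surrounding project.
from blocks.main_loop import MainLoop
from blocks.model import Model
from blocks.extensions import FinishAfter, Printing, Timing
from blocks.extensions.monitoring import (DataStreamMonitoring,
                                          TrainingDataMonitoring)

train_monitor = TrainingDataMonitoring([cost], prefix="train", after_epoch=True)
test_monitor = DataStreamMonitoring([cost, accuracy], data_stream=test_stream,
                                    prefix="test")
main_loop = MainLoop(algorithm=algorithm, data_stream=train_stream,
                     model=Model(cost),
                     extensions=[Timing(), train_monitor, test_monitor,
                                 FinishAfter(after_n_epochs=10), Printing()])
main_loop.run()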
Example #28
0
    def _create_main_loop(self):
        # hyper parameters
        hp = self.params
        batch_size = hp['batch_size']
        biases_init = Constant(0)
        batch_normalize = hp['batch_normalize']

        ### Build fprop
        tensor5 = T.TensorType(config.floatX, (False, ) * 5)
        X = tensor5("images")
        #X = T.tensor4("images")
        y = T.lvector('targets')

        gnet_params = OrderedDict()
        #X_shuffled = X[:, :, :, :, [2, 1, 0]]
        #X_shuffled = gpu_contiguous(X.dimshuffle(0, 1, 4, 2, 3)) * 255

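        # Reorder channels RGB -> BGR, rescale to 0-255, fold the frame axis into the channel axis, and subtract the BGR mean pixel.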
        X = X[:, :, :, :, [2, 1, 0]]
        X_shuffled = X.dimshuffle((0, 1, 4, 2, 3)) * 255
        X_r = X_shuffled.reshape(
            (X_shuffled.shape[0], X_shuffled.shape[1] * X_shuffled.shape[2],
             X_shuffled.shape[3], X_shuffled.shape[4]))
        X_r = X_r - (np.array([104, 117, 123])[None, :, None,
                                               None]).astype('float32')

        expressions, input_data, param = stream_layer_exp(inputs=('data', X_r),
                                                          mode='rgb')
        res = expressions['outloss']
        y_hat = res.flatten(ndim=2)

        import pdb
        pdb.set_trace()

        ### Build Cost
        cost = CategoricalCrossEntropy().apply(y, y_hat)
        cost = T.cast(cost, theano.config.floatX)
        cost.name = 'cross_entropy'

        y_pred = T.argmax(y_hat, axis=1)
        misclass = T.cast(T.mean(T.neq(y_pred, y)), theano.config.floatX)
        misclass.name = 'misclass'

        monitored_channels = []
        monitored_quantities = [cost, misclass, y_hat, y_pred]
        model = Model(cost)

        training_cg = ComputationGraph(monitored_quantities)
        inference_cg = ComputationGraph(monitored_quantities)

        ### Get evaluation function
        #training_eval = training_cg.get_theano_function(additional_updates=bn_updates)
        training_eval = training_cg.get_theano_function()
        #inference_eval = inference_cg.get_theano_function()

        # Dataset
        test = JpegHDF5Dataset(
            'test',
            #name='jpeg_data_flows.hdf5',
            load_in_memory=True)
        #mean = np.load(os.path.join(os.environ['UCF101'], 'mean.npy'))
        import pdb
        pdb.set_trace()

        ### Eval
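        # Per-video score accumulators for the unflipped and flipped passes (101 UCF-101 classes).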
        labels = np.zeros(test.num_video_examples)
        y_hat = np.zeros((test.num_video_examples, 101))
        labels_flip = np.zeros(test.num_video_examples)
        y_hat_flip = np.zeros((test.num_video_examples, 101))

        ### Important to shuffle list for batch normalization statistic
        #rng = np.random.RandomState()
        #examples_list = range(test.num_video_examples)
        #import pdb; pdb.set_trace()
        #rng.shuffle(examples_list)

        nb_frames = 1

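        # Accumulate scores over 24 temporal subsamples and the selected crop(s), with and without horizontal flipping, printing the running misclassification each time.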
        for i in xrange(24):
            scheme = HDF5SeqScheme(test.video_indexes,
                                   examples=test.num_video_examples,
                                   batch_size=batch_size,
                                   f_subsample=i,
                                   nb_subsample=25,
                                   frames_per_video=nb_frames)
            #for crop in ['upleft', 'upright', 'downleft', 'downright', 'center']:
            for crop in ['center']:
                stream = JpegHDF5Transformer(
                    input_size=(240, 320),
                    crop_size=(224, 224),
                    #input_size=(256, 342), crop_size=(224, 224),
                    crop_type=crop,
                    translate_labels=True,
                    flip='noflip',
                    nb_frames=nb_frames,
                    data_stream=ForceFloatX(
                        DataStream(dataset=test, iteration_scheme=scheme)))
                stream_flip = JpegHDF5Transformer(
                    input_size=(240, 320),
                    crop_size=(224, 224),
                    #input_size=(256, 342), crop_size=(224, 224),
                    crop_type=crop,
                    translate_labels=True,
                    flip='flip',
                    nb_frames=nb_frames,
                    data_stream=ForceFloatX(
                        DataStream(dataset=test, iteration_scheme=scheme)))

                ## Do the evaluation
                epoch = stream_flip.get_epoch_iterator()
                for j, batch in enumerate(epoch):
                    output = training_eval(batch[0], batch[1])
                    # import cv2
                    # cv2.imshow('img', batch[0][0, 0, :, :, :])
                    # cv2.waitKey(160)
                    # cv2.destroyAllWindows()
                    #import pdb; pdb.set_trace()
                    labels_flip[batch_size * j:batch_size * (j + 1)] = batch[1]
                    y_hat_flip[batch_size * j:batch_size *
                               (j + 1), :] += output[2]
                preds = y_hat_flip.argmax(axis=1)
                misclass = np.sum(labels_flip != preds) / float(len(preds))
                print i, crop, "flip Misclass:", misclass

                epoch = stream.get_epoch_iterator()
                for j, batch in enumerate(epoch):
                    output = training_eval(batch[0], batch[1])
                    labels[batch_size * j:batch_size * (j + 1)] = batch[1]
                    y_hat[batch_size * j:batch_size * (j + 1), :] += output[2]
                preds = y_hat.argmax(axis=1)
                misclass = np.sum(labels != preds) / float(len(preds))
                print i, crop, "noflip Misclass:", misclass

                y_merge = y_hat + y_hat_flip
                preds = y_merge.argmax(axis=1)
                misclass = np.sum(labels != preds) / float(len(preds))
                print i, crop, "avg Misclass:", misclass

        ### Compute misclass
        y_hat += y_hat_flip
        preds = y_hat.argmax(axis=1)
        misclass = np.sum(labels != preds) / float(len(preds))
        print "Misclass:", misclass
Example #29
0
def main(feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, batch_size=None,
         num_batches=None):
    if feature_maps is None:
        feature_maps = [32, 48, 64, 96, 96, 128]
    if mlp_hiddens is None:
        mlp_hiddens = [1000]
    if conv_sizes is None:
        conv_sizes = [9, 7, 5, 3, 2, 1]
    if pool_sizes is None:
        pool_sizes = [2, 2, 2, 2, 1, 1]
    if batch_size is None:
        batch_size = 64
    conv_steps=[2, 1, 1, 1, 1, 1] #same as stride
    image_size = (128, 128)
    output_size = 2
    learningRate = 0.001
    drop_prob = 0.4
    weight_noise = 0.75
    num_epochs = 150
    num_batches = None
    host_plot = 'http://*****:*****'
    # ... (cost, algorithm and extensions setup elided) ...
        try:
            extensions.append(Plot('%s %s @ %s' % (graph_name, datetime.datetime.now(), socket.gethostname()),
                                   channels=[['train_error_rate', 'valid_error_rate'],
                                             ['train_total_gradient_norm']],
                                   after_epoch=True, server_url=host_plot))
            PLOT_AVAILABLE = True
        except ImportError:
            PLOT_AVAILABLE = False
        extensions.append(Checkpoint(save_to, after_epoch=True, after_training=True, save_separately=['log']))


    logger.info("Building the model")

    model = Model(cost)

    ########### Loading images #####################
    main_loop = MainLoop(
        algorithm,
        stream_data_train,
        model=model,
        extensions=extensions)

    main_loop.run()
Example #30
0
    def apply(self, input_labeled, target_labeled, input_unlabeled):
        self.layer_counter = 0
        input_dim = self.p.encoder_layers[0]

        # Store the dimension tuples in the same order as layers.
        layers = self.layers
        self.layer_dims = {0: input_dim}

        self.lr = self.shared(self.default_lr, 'learning_rate', role=None)

        self.costs = costs = AttributeDict()
        self.costs.denois = AttributeDict()

        self.act = AttributeDict()
        self.error = AttributeDict()

        top = len(layers) - 1

        N = input_labeled.shape[0]
        self.join = lambda l, u: T.concatenate([l, u], axis=0)
        self.labeled = lambda x: x[:N] if x is not None else x
        self.unlabeled = lambda x: x[N:] if x is not None else x
        self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x))

        input_concat = self.join(input_labeled, input_unlabeled)

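        # Encoder pass: optionally corrupt the input, then apply each layer in turn while recording pre-activations (z) and, for the unlabeled part, batch means (m) and stds (s).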
        def encoder(input_, path_name, input_noise_std=0, noise_std=[]):
            h = input_

            logger.info('  0: noise %g' % input_noise_std)
            if input_noise_std > 0.:
                h = h + self.noise_like(h) * input_noise_std

            d = AttributeDict()
            d.unlabeled = self.new_activation_dict()
            d.labeled = self.new_activation_dict()
            d.labeled.z[0] = self.labeled(h)
            d.unlabeled.z[0] = self.unlabeled(h)
            prev_dim = input_dim
            for i, (spec, _, act_f) in layers[1:]:
                d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h)
                noise = noise_std[i] if i < len(noise_std) else 0.
                curr_dim, z, m, s, h = self.f(h,
                                              prev_dim,
                                              spec,
                                              i,
                                              act_f,
                                              path_name=path_name,
                                              noise_std=noise)
                assert self.layer_dims.get(i) in (None, curr_dim)
                self.layer_dims[i] = curr_dim
                d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z)
                d.unlabeled.s[i] = s
                d.unlabeled.m[i] = m
                prev_dim = curr_dim
            d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h)
            return d

        # Clean, supervised
        logger.info('Encoder: clean, labeled')
        clean = self.act.clean = encoder(input_concat, 'clean')

        # Corrupted, supervised
        logger.info('Encoder: corr, labeled')
        corr = self.act.corr = encoder(input_concat,
                                       'corr',
                                       input_noise_std=self.p.super_noise_std,
                                       noise_std=self.p.f_local_noise_std)
        est = self.act.est = self.new_activation_dict()

        # Decoder path in opposite order
        logger.info('Decoder: z_corr -> z_est')
        for i, ((_, spec), l_type, act_f) in layers[::-1]:
            z_corr = corr.unlabeled.z[i]
            z_clean = clean.unlabeled.z[i]
            z_clean_s = clean.unlabeled.s.get(i)
            z_clean_m = clean.unlabeled.m.get(i)
            fspec = layers[i + 1][1][0] if len(layers) > i + 1 else (None,
                                                                     None)

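            # The top layer is denoised from the corrupted encoder output; every other layer from the estimate of the layer above.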
            if i == top:
                ver = corr.unlabeled.h[i]
                ver_dim = self.layer_dims[i]
                top_g = True
            else:
                ver = est.z.get(i + 1)
                ver_dim = self.layer_dims.get(i + 1)
                top_g = False

            z_est = self.g(z_lat=z_corr,
                           z_ver=ver,
                           in_dims=ver_dim,
                           out_dims=self.layer_dims[i],
                           l_type=l_type,
                           num=i,
                           fspec=fspec,
                           top_g=top_g)

            if z_est is not None:
                # Denoising cost
                if z_clean_s:
                    z_est_norm = (z_est - z_clean_m) / z_clean_s
                else:
                    z_est_norm = z_est

                se = SquaredError('denois' + str(i))
                costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                           z_clean.flatten(2)) \
                    / np.prod(self.layer_dims[i], dtype=floatX)
                costs.denois[i].name = 'denois' + str(i)
                denois_print = 'denois %.2f' % self.p.denoising_cost_x[i]
            else:
                denois_print = ''

            # Store references for later use
            est.h[i] = self.apply_act(z_est, act_f)
            est.z[i] = z_est
            est.s[i] = None
            est.m[i] = None
            logger.info('  g%d: %10s, %s, dim %s -> %s' %
                        (i, l_type, denois_print, self.layer_dims.get(i + 1),
                         self.layer_dims.get(i)))

        # Costs
        y = target_labeled.flatten()

        costs.class_clean = CategoricalCrossEntropy().apply(
            y, clean.labeled.h[top])
        costs.class_clean.name = 'cost_class_clean'

        costs.class_corr = CategoricalCrossEntropy().apply(
            y, corr.labeled.h[top])
        costs.class_corr.name = 'cost_class_corr'

        # This will be used for training
        costs.total = costs.class_corr * 1.0
        for i in range(top + 1):
            if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0:
                costs.total += costs.denois[i] * self.p.denoising_cost_x[i]
        costs.total.name = 'cost_total'

        # Classification error
        mr = MisclassificationRate()
        self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.)
        self.error.clean.name = 'error_rate_clean'