Example #1
    def train(self, X, Y, idx_folds, hyper_params, model_prefix, verbose=False):

        import os
        from collections import OrderedDict
        from fuel.datasets import IndexableDataset
        from blocks.model import Model
        from blocks.bricks import Linear, Softmax
        from blocks.bricks.conv import MaxPooling
        from blocks.initialization import Uniform
        from deepthought.bricks.cost import HingeLoss
        import numpy as np
        import theano
        from theano import tensor

        assert model_prefix is not None

        fold_weights_filename = '{}_weights.npy'.format(model_prefix)

        # convert Y to one-hot encoding
        n_classes = len(set(Y))
        Y = np.eye(n_classes, dtype=int)[Y]

        features = tensor.matrix('features', dtype=theano.config.floatX)
        targets = tensor.lmatrix('targets')

        input_ = features

        dim = X.shape[-1]
        
        # optional additional layers
        if self.pipeline_factory is not None:
            # need to re-shape flattened input to restore bc01 format
            input_shape = (input_.shape[0],) + hyper_params['classifier_input_shape']  # tuple, uses actual batch size
            input_ = input_.reshape(input_shape)

            pipeline = self.pipeline_factory.build_pipeline(input_shape, hyper_params)
            input_ = pipeline.apply(input_)                        
            input_ = input_.flatten(ndim=2)
            
            # this is very hacky, but there seems to be no elegant way to obtain a value for dim
            dummy_fn = theano.function(inputs=[features], outputs=input_)
            dummy_out = dummy_fn(X[:1])
            dim = dummy_out.shape[-1]
            
            
        if hyper_params['classifier_pool_width'] > 1:
            # FIXME: this is probably broken!
            
    #        c = hyper_params['num_components']
    #        input_ = input_.reshape((input_.shape[0], c, input_.shape[-1] // c, 1))  # restore bc01
            # need to re-shape flattened input to restore bc01 format
            input_shape = hyper_params['classifier_pool_input_shape']  # tuple
            input_ = input_.reshape(input_shape)

            pool = MaxPooling(name='pool',
                              input_dim=input_shape[1:],  # (c, X.shape[-1] // c, 1),
                              pooling_size=(hyper_params['classifier_pool_width'], 1),
                              step=(hyper_params['classifier_pool_stride'], 1))
            input_ = pool.apply(input_)
            input_ = input_.reshape((input_.shape[0], tensor.prod(input_.shape[1:])))

            dim = np.prod(pool.get_dim('output'))


        linear = Linear(name='linear',
                        input_dim=dim,
                        output_dim=n_classes,
                        weights_init=Uniform(mean=0, std=0.01),
                        use_bias=False)
        linear.initialize()

        softmax = Softmax('softmax')

        probs = softmax.apply(linear.apply(input_))
        prediction = tensor.argmax(probs, axis=1)

        model = Model(probs)  # classifier with raw probability outputs
        predict = theano.function([features], prediction)  # ready-to-use predict function

        if os.path.isfile(fold_weights_filename):
            # load filter weights from existing file
            fold_weights = np.load(fold_weights_filename)
            print 'loaded filter weights from', fold_weights_filename
        else:
            # train model

            from blocks.bricks.cost import MisclassificationRate
            from blocks.filter import VariableFilter
            from blocks.graph import ComputationGraph
            from blocks.roles import WEIGHT
            from blocks.bricks import Softmax
            from blocks.model import Model
            from blocks.algorithms import GradientDescent, Adam
            from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar
            from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring
            from blocks.extensions.predicates import OnLogRecord
            from fuel.streams import DataStream
            from fuel.schemes import SequentialScheme, ShuffledScheme
            from blocks.monitoring import aggregation
            from blocks.main_loop import MainLoop
            from blocks.extensions.training import TrackTheBest
            from deepthought.extensions.parameters import BestParams
            # from deepthought.datasets.selection import DatasetMetaDB

            init_param_values = model.get_parameter_values()

            cost = HingeLoss().apply(targets, probs)
            # Note: MisclassificationRate expects integer class labels, not one-hot targets, hence the argmax
            error_rate = MisclassificationRate().apply(targets.argmax(axis=1), probs)
            error_rate.name = 'error_rate'

            cg = ComputationGraph([cost])

            # L1 regularization
            if hyper_params['classifier_l1wdecay'] > 0:
                weights = VariableFilter(roles=[WEIGHT])(cg.variables)
                cost = cost + hyper_params['classifier_l1wdecay'] * sum([abs(W).sum() for W in weights])

            cost.name = 'cost'

            # iterate over trial folds
            fold_weights = []
            fold_errors = []

            # for ifi, ifold in fold_generator.get_inner_cv_folds(outer_fold):
            #
            #     train_selectors = fold_generator.get_fold_selectors(outer_fold=outer_fold, inner_fold=ifold['train'])
            #     valid_selectors = fold_generator.get_fold_selectors(outer_fold=outer_fold, inner_fold=ifold['valid'])
            #
            #     metadb = DatasetMetaDB(meta, train_selectors.keys())
            #
            #     # get selected trial IDs
            #     train_idx = metadb.select(train_selectors)
            #     valid_idx = metadb.select(valid_selectors)

            for train_idx, valid_idx in idx_folds:

                # print train_idx
                # print valid_idx

                trainset = IndexableDataset(indexables=OrderedDict(
                    [('features', X[train_idx]), ('targets', Y[train_idx])]))

                validset = IndexableDataset(indexables=OrderedDict(
                    [('features', X[valid_idx]), ('targets', Y[valid_idx])]))

                model.set_parameter_values(init_param_values)

                best_params = BestParams()
                best_params.add_condition(['after_epoch'],
                                          predicate=OnLogRecord('error_rate_valid_best_so_far'))

                algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam())

                extensions = [Timing(),
                              FinishAfter(after_n_epochs=hyper_params['classifier_max_epochs']),
                              DataStreamMonitoring(
                                  [cost, error_rate],
                                  DataStream.default_stream(
                                      validset,
                                      iteration_scheme=SequentialScheme(
                                          validset.num_examples, hyper_params['classifier_batch_size'])),
                                  suffix="valid"),
                              TrainingDataMonitoring(
                                  [cost, error_rate,
                                   aggregation.mean(algorithm.total_gradient_norm)],
                                  suffix="train",
                                  after_epoch=True),
                              TrackTheBest('error_rate_valid'),
                              best_params  # after TrackTheBest!
                              ]

                if verbose:
                    extensions.append(Printing())  # optional
                    extensions.append(ProgressBar())

                main_loop = MainLoop(
                    algorithm,
                    DataStream.default_stream(
                        trainset,
                        iteration_scheme=ShuffledScheme(trainset.num_examples, hyper_params['classifier_batch_size'])),
                    model=model,
                    extensions=extensions)

                main_loop.run()

                fold_weights.append(best_params.values['/linear.W'])
                fold_errors.append(main_loop.status['best_error_rate_valid'])
                # break # FIXME

            fold_errors = np.asarray(fold_errors).squeeze()
            print 'simple NN fold classification errors:', fold_errors

            fold_weights = np.asarray(fold_weights)

            # store filter weights for later analysis
            np.save(fold_weights_filename, fold_weights)

        weights = fold_weights.mean(axis=0)

        linear.parameters[0].set_value(weights)

        return model, predict
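
# --- Usage sketch (assumed, not part of the original project): `clf` stands
# for whatever object exposes the train() method above; X, Y, idx_folds and
# hyper_params are placeholders with the keys and dtypes the method expects,
# and floatX is assumed to be float32.
import numpy as np

def evaluate_fold_classifier(clf, X, Y, idx_folds, hyper_params, model_prefix):
    # train (or reload cached fold weights) and get a ready-to-use predict function
    model, predict = clf.train(X, Y, idx_folds, hyper_params,
                               model_prefix=model_prefix, verbose=False)
    # predict() maps a float32 feature matrix to integer class labels
    y_hat = predict(X.astype(np.float32))
    accuracy = np.mean(y_hat == Y)
    print('training-set accuracy: {:.3f}'.format(accuracy))
    return model, accuracy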
Example #2
def main(mode, save_path, steps, num_batches):
    num_states = MarkovChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition",
                                    activation=Tanh(),
                                    dim=dim)
        generator = SequenceGenerator(LinearReadout(
            readout_dim=num_states,
            source_names=["states"],
            emitter=SoftmaxEmitter(name="emitter"),
            feedbacker=LookupFeedback(num_states,
                                      feedback_dim,
                                      name='feedback'),
            name="readout"),
                                      transition,
                                      weights_init=IsotropicGaussian(0.01),
                                      biases_init=Constant(0),
                                      name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Give an idea of what's going on.
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape)
             for key, value in Selector(generator).get_params().items()],
            width=120))
        logger.info("Markov chain entropy: {}".format(
            MarkovChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -MarkovChainDataset.entropy * seq_len))

        # Build the cost computation graph.
        x = tensor.lmatrix('data')
        cost = aggregation.mean(generator.cost(x[:, :]).sum(), x.shape[1])
        cost.name = "sequence_log_likelihood"

        algorithm = GradientDescent(
            cost=cost,
            params=list(Selector(generator).get_params().values()),
            step_rule=Scale(0.001))
        main_loop = MainLoop(algorithm=algorithm,
                             data_stream=DataStream(
                                 MarkovChainDataset(rng, seq_len),
                                 iteration_scheme=ConstantScheme(batch_size)),
                             model=Model(cost),
                             extensions=[
                                 FinishAfter(after_n_batches=num_batches),
                                 TrainingDataMonitoring(
                                     [cost],
                                     prefix="this_step",
                                     after_every_batch=True),
                                 TrainingDataMonitoring([cost],
                                                        prefix="average",
                                                        every_n_batches=100),
                                 SerializeMainLoop(save_path,
                                                   every_n_batches=500),
                                 Printing(every_n_batches=100)
                             ])
        main_loop.run()
    elif mode == "sample":
        main_loop = cPickle.load(open(save_path, "rb"))
        generator = main_loop.model

        sample = ComputationGraph(
            generator.generate(n_steps=steps, batch_size=1,
                               iterate=True)).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               MarkovChainDataset.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, MarkovChainDataset.trans_prob))
    else:
        assert False
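
# --- Entry-point sketch (assumed): upstream, main(mode, save_path, steps,
# num_batches) is driven by a small command-line wrapper roughly like the one
# below; the exact flag names here are illustrative.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Train or sample from the Markov-chain sequence generator.")
    parser.add_argument("mode", choices=["train", "sample"])
    parser.add_argument("save_path", help="where the main loop is pickled")
    parser.add_argument("--steps", type=int, default=1000,
                        help="number of symbols to generate in sample mode")
    parser.add_argument("--num-batches", type=int, default=10000,
                        help="number of training batches in train mode")
    args = parser.parse_args()
    main(args.mode, args.save_path, args.steps, args.num_batches)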
Example #3
def visualize_generate(cost, hidden_states, updates, train_stream,
                       valid_stream, args):

    use_indices = has_indices(args.dataset)
    output_size = get_output_size(args.dataset)

    # Get presoft and its computation graph
    filter_presoft = VariableFilter(theano_name="presoft")
    presoft = filter_presoft(ComputationGraph(cost).variables)[0]
    cg = ComputationGraph(presoft)

    # Handle the theano shared variables that allow carrying the hidden
    # state
    givens, f_updates = carry_hidden_state(updates, 1, reset=not (use_indices))

    if args.hide_all_except is not None:
        pass

    # Compile the theano function
    compiled = theano.function(inputs=cg.inputs,
                               outputs=presoft,
                               givens=givens,
                               updates=f_updates)

    epoch_iterator = train_stream.get_epoch_iterator()
    for num in range(10):
        all_ = next(epoch_iterator)
        all_sequence = all_[0][:, 0:1]
        targets = all_[1][:, 0:1]

        # In the case of characters and text
        if use_indices:
            init_ = all_sequence[:args.initial_text_length]

            # Time X Features
            probability_array = np.zeros((0, output_size))
            generated_text = init_

            for i in range(args.generated_text_lenght):
                presoft = compiled(generated_text)
                # Get the last value of presoft
                last_presoft = presoft[-1:, 0, :]

                # Compute the probability distribution
                probabilities = softmax(last_presoft)
                # Store it in the list
                probability_array = np.vstack(
                    [probability_array, probabilities])

                # Sample a character out of the probability distribution
                argmax = (args.softmax_sampling == 'argmax')
                last_output_sample = sample(probabilities, argmax)[:, None, :]

                # Concatenate the new value to the text
                generated_text = np.vstack(
                    [generated_text, last_output_sample])

                ploting_path = None
                if args.save_path is not None:
                    ploting_path = os.path.join(args.save_path,
                                                'prob_plot.png')

                # Convert with real characters
                whole_sentence = conv_into_char(generated_text[:, 0],
                                                args.dataset)
                initial_sentence = whole_sentence[:init_.shape[0]]
                selected_sentence = whole_sentence[init_.shape[0]:]

                logger.info(''.join(initial_sentence) + '...')
                logger.info(''.join(whole_sentence))

                if ploting_path is not None:
                    probability_plot(probability_array, selected_sentence,
                                     args.dataset, ploting_path)

        # In the case of sine wave dataset for example
        else:
            presoft = compiled(all_sequence)

            time_plot = presoft.shape[0] - 1

            plt.plot(np.arange(time_plot),
                     targets[:time_plot, 0, 0],
                     label="target")
            plt.plot(np.arange(time_plot),
                     presoft[:time_plot, 0, 0],
                     label="predicted")
            plt.legend()
            plt.grid(True)
            plt.show()
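
# --- Helper sketch (assumed): `softmax` and `sample` are imported from the
# project's utilities and are not shown above.  A plausible minimal version,
# assuming sample() returns a one-hot row per probability row, could be:
import numpy as np

def softmax(x):
    # numerically stable softmax over the last axis
    e = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def sample(probabilities, argmax=False):
    # pick one class per row, greedily or from the distribution,
    # and return it one-hot encoded
    probabilities = np.atleast_2d(probabilities)
    n_rows, n_classes = probabilities.shape
    one_hot = np.zeros_like(probabilities)
    for i in range(n_rows):
        if argmax:
            idx = int(np.argmax(probabilities[i]))
        else:
            idx = int(np.random.choice(n_classes, p=probabilities[i]))
        one_hot[i, idx] = 1.0
    return one_hot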
Example #4
##############
# Test with first batch
##############

x_tr, x_mask_tr = next(data_stream.get_epoch_iterator())
f1 = function([x, x_mask], cost)
#print f1(x_tr, x_mask_tr)

#ipdb.set_trace()

################
# Optimization Algorithm
################

cg = ComputationGraph(cost)
model = Model(cost)

algorithm = GradientDescent(
    cost=cost, parameters=cg.parameters,
    step_rule=CompositeRule([StepClipping(10.0), Adam(lr)]),
    on_unused_sources='warn')

train_monitor = TrainingDataMonitoring(
    variables=[cost],
    after_epoch=True,
    prefix="train")

extensions = [
    train_monitor,
    TrackTheBest('train_sequence_log_likelihood'),
Example #5
def test_variable_filter():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name='linear1')
    brick2 = Bias(2, name='bias1')
    activation = Logistic(name='sigm')

    x = tensor.vector()
    h1 = brick1.apply(x)
    h2 = activation.apply(h1)
    h2.name = "h2act"
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.parameters[0]]
    bias = [brick1.b, brick2.parameters[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by bricks classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by bricks instances
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name='W_norm')
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex='W_no.?m')
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by theano name
    theano_name_filter = VariableFilter(theano_name='h2act')
    assert [cg.variables[11]] == theano_name_filter(cg.variables)

    # Testing filtering by theano name regex
    theano_name_filter_regex = VariableFilter(theano_name_regex='h2a.?t')
    assert [cg.variables[11]] == theano_name_filter_regex(cg.variables)

    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    variables = [cg.variables[1], cg.variables[8]]
    assert variables == appli_filter(cg.variables)

    # Testing filtering by application
    appli_filter_list = VariableFilter(applications=[brick1.apply])
    assert variables == appli_filter_list(cg.variables)

    input1 = tensor.matrix('input1')
    input2 = tensor.matrix('input2')
    merge = Merge(['input1', 'input2'], [5, 6], 2)
    merged = merge.apply(input1, input2)
    merge_cg = ComputationGraph(merged)
    outputs = VariableFilter(roles=[OUTPUT],
                             bricks=[merge])(merge_cg.variables)
    assert merged in outputs
    assert len(outputs) == 3

    outputs_application = VariableFilter(roles=[OUTPUT],
                                         applications=[merge.apply
                                                       ])(merge_cg.variables)
    assert outputs_application == [merged]
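
# --- Usage sketch: outside of tests, VariableFilter is typically used to pull
# parameters out of a ComputationGraph, e.g. to add L2 weight decay to a cost.
# This is a minimal, self-contained illustration, not part of the test above.
from blocks.bricks import Linear
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph
from blocks.roles import WEIGHT
from theano import tensor

x = tensor.matrix('x')
y = Linear(input_dim=10, output_dim=2, name='linear').apply(x)
cost = tensor.sqr(y).sum()

cg = ComputationGraph([cost])
weights = VariableFilter(roles=[WEIGHT])(cg.variables)
l2_penalty = sum((W ** 2).sum() for W in weights)
regularized_cost = cost + 1e-4 * l2_penalty
regularized_cost.name = 'regularized_cost'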
Example #6
File: run.py  Project: zero76114/ladder
def train(cli_params):
    cli_params['save_dir'] = prepare_dir(cli_params['save_to'])
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data, whiten, cnorm = setup_data(p, test_set=False)
    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim,) + p.encoder_layers

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    training_algorithm = GradientDescent(
        cost=ladder.costs.total, params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    short_prints = {
        "train": {
            'T_C_class': ladder.costs.class_corr,
            'T_C_de': ladder.costs.denois.values(),
        },
        "valid_approx": OrderedDict([
            ('V_C_class', ladder.costs.class_clean),
            ('V_E', ladder.error.clean),
            ('V_C_de', ladder.costs.denois.values()),
        ]),
        "valid_final": OrderedDict([
            ('VF_C_class', ladder.costs.class_clean),
            ('VF_E', ladder.error.clean),
            ('VF_C_de', ladder.costs.denois.values()),
        ]),
    }

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(data.train, data.train_ind,
                        p.batch_size,
                        n_labeled=p.labeled_samples,
                        n_unlabeled=p.unlabeled_samples,
                        whiten=whiten,
                        cnorm=cnorm),
        model=Model(ladder.costs.total),
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean]
                + ladder.costs.denois.values(),
                make_datastream(data.valid, data.valid_ind,
                                p.valid_batch_size, whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                prefix="valid_approx"),

            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean]
                + ladder.costs.denois.values(),
                make_datastream(data.train, data.train_ind,
                                p.batch_size,
                                n_labeled=p.labeled_samples,
                                whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                make_datastream(data.valid, data.valid_ind,
                                p.valid_batch_size,
                                n_labeled=len(data.valid_ind),
                                whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                prefix="valid_final",
                after_n_epochs=p.num_epochs),

            TrainingDataMonitoring(
                [ladder.costs.total, ladder.costs.class_corr,
                 training_algorithm.total_gradient_norm]
                + ladder.costs.denois.values(),
                prefix="train", after_epoch=True),

            SaveParams(None, all_params, p.save_dir, after_epoch=True),
            SaveExpParams(p, p.save_dir, before_training=True),
            SaveLog(p.save_dir, after_training=True),
            ShortPrinting(short_prints),
            LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs,
                    after_epoch=True),
        ])
    main_loop.run()

    # Get results
    df = main_loop.log.to_dataframe()
    col = 'valid_final_error_rate_clean'
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return df
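
# --- Invocation sketch (assumed): in the ladder code base train() is driven by
# a command-line front end that assembles cli_params; only 'save_to' is read
# directly above, the remaining keys are filled in by load_and_log_params().
if __name__ == '__main__':
    cli_params = {
        'save_to': 'results/mnist_100_full',  # placeholder experiment name
        # ... other hyper-parameters (encoder_layers, lr, num_epochs, ...)
    }
    df = train(cli_params)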
Example #7
    x_synth = mgcf02wav(mgc_reconstruct, f0_tr[this_sample])
    x_synth = .95 * x_synth / max(abs(x_synth)) * 2**15
    wavfile.write(
        save_dir + "samples/new/data" + num_sample + str(this_sample) + ".wav",
        16000, x_synth.astype('int16'))

main_loop = load(save_dir + "pkl/best_" + experiment_name + ".pkl")

lookup, generator = main_loop.model.get_top_bricks()

from theano import tensor, function
phonemes = tensor.imatrix('phonemes')

sample = ComputationGraph(
    generator.generate(attended=lookup.apply(phonemes),
                       n_steps=phonemes.shape[0],
                       batch_size=phonemes.shape[1],
                       iterate=True))
sample_fn = sample.get_theano_function()

outputs_bp = sample_fn(phonemes_tr)[3]

for this_sample in range(n_samples):
    print "Iteration: ", this_sample
    outputs = outputs_bp

    sampled_f0 = outputs[:, :, -2]
    sampled_voiced = outputs[:, :, -1]

    print sampled_voiced.mean()
    print sampled_f0.max(), sampled_f0.min()
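
    # --- Post-processing sketch (assumed, not part of the original fragment):
    # a common way to use the sampled voicing flag is to threshold it and zero
    # out f0 in unvoiced frames before handing the contour to the vocoder.
    voiced_mask = (sampled_voiced > 0.5).astype(sampled_f0.dtype)
    f0_for_synthesis = sampled_f0 * voiced_mask  # (time, batch) f0 contour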
Example #8
def train_model(cost,
                cross_entropy,
                updates,
                train_stream,
                valid_stream,
                args,
                gate_values=None):

    step_rule = learning_algorithm(args)
    cg = ComputationGraph(cost)

    # ADD REGULARIZATION
    # WEIGHT NOISE
    weight_noise = args.weight_noise
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cg_train = apply_noise(cg, weights, weight_noise)
        cost = cg_train.outputs[0]
    cost.name = "cost_with_weight_noise"
    cg = ComputationGraph(cost)

    logger.info(cg.parameters)

    algorithm = GradientDescent(cost=cost,
                                step_rule=step_rule,
                                params=cg.parameters)
    algorithm.add_updates(updates)

    # extensions to be added
    extensions = []
    if args.load_path is not None:
        extensions.append(Load(args.load_path))

    outputs = [
        variable for variable in cg.variables if variable.name == "presoft"
    ]

    if args.generate:
        extensions.append(
            TextGenerationExtension(
                outputs=outputs,
                generation_length=args.generated_text_lenght,
                initial_text_length=args.initial_text_length,
                every_n_batches=args.monitoring_freq,
                ploting_path=os.path.join(args.save_path, 'prob_plot.png'),
                softmax_sampling=args.softmax_sampling,
                dataset=args.dataset,
                updates=updates,
                interactive_mode=args.interactive_mode))
    extensions.extend([
        TrainingDataMonitoring([cost],
                               prefix='train',
                               every_n_batches=args.monitoring_freq,
                               after_epoch=True),
        DataStreamMonitoring([cost, cross_entropy],
                             valid_stream,
                             args.mini_batch_size_valid,
                             state_updates=updates,
                             prefix='valid',
                             before_first_epoch=not (args.visualize_gates),
                             every_n_batches=args.monitoring_freq),
        ResetStates([v for v, _ in updates], every_n_batches=100),
        ProgressBar()
    ])
    # Creating directory for saving model.
    if not args.interactive_mode:
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        else:
            raise Exception('Directory already exists')
    early_stopping = EarlyStopping('valid_cross_entropy',
                                   args.patience,
                                   args.save_path,
                                   every_n_batches=args.monitoring_freq)

    # Visualizing extensions
    if args.interactive_mode:
        extensions.append(InteractiveMode())
    if args.visualize_gates and (gate_values is not None):
        if args.rnn_type == "lstm":
            extensions.append(
                VisualizeGateLSTM(gate_values,
                                  updates,
                                  args.dataset,
                                  ploting_path=None))
        elif args.rnn_type == "soft":
            extensions.append(
                VisualizeGateSoft(gate_values,
                                  updates,
                                  args.dataset,
                                  ploting_path=None))
        else:
            assert (False)

    extensions.append(early_stopping)
    extensions.append(Printing(every_n_batches=args.monitoring_freq))

    main_loop = MainLoop(model=Model(cost),
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
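
# --- Helper sketch (assumed): learning_algorithm(args) is imported from the
# project and not shown here.  A plausible minimal version returns a Blocks
# step rule built from the CLI arguments, e.g. gradient clipping composed with
# an optimizer chosen by name; the attribute names below are illustrative.
from blocks.algorithms import Adam, CompositeRule, RMSProp, Scale, StepClipping

def learning_algorithm(args):
    name = getattr(args, 'algorithm', 'adam')          # assumed flag
    lr = getattr(args, 'learning_rate', 1e-3)          # assumed flag
    clipping = StepClipping(threshold=getattr(args, 'clipping', 10.0))
    if name == 'adam':
        optimizer = Adam(learning_rate=lr)
    elif name == 'rms_prop':
        optimizer = RMSProp(learning_rate=lr)
    else:
        optimizer = Scale(learning_rate=lr)
    return CompositeRule([clipping, optimizer])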
Example #9
def run_experiment():

    np.random.seed(42)

    X = tensor.tensor4('features')
    nbr_channels = 3
    image_shape = (5, 5)

    conv_layers = [
        ConvolutionalLayer(
            filter_size=(2, 2),
            num_filters=10,
            activation=Rectifier().apply,
            border_mode='valid',
            pooling_size=(1, 1),
            weights_init=Uniform(width=0.1),
            #biases_init=Uniform(width=0.01),
            biases_init=Constant(0.0),
            name='conv0')
    ]
    conv_sequence = ConvolutionalSequence(conv_layers,
                                          num_channels=nbr_channels,
                                          image_size=image_shape)
    #conv_sequence.push_allocation_config()
    conv_sequence.initialize()

    flattener = Flattener()
    conv_output = conv_sequence.apply(X)
    y_hat = flattener.apply(conv_output)
    # Whatever. Not important since we're not going to actually train anything.
    cost = tensor.sqr(y_hat).sum()

    #L_grads_method_02 = [tensor.grad(cost, v) for v in VariableFilter(roles=[FILTER, BIAS])(ComputationGraph([y_hat]).variables)]
    L_grads_method_02 = [
        tensor.grad(cost, v) for v in VariableFilter(
            roles=[BIAS])(ComputationGraph([y_hat]).variables)
    ]
    # works on the sum of the gradients in a mini-batch
    sum_square_norm_gradients_method_02 = sum(
        [tensor.sqr(g).sum() for g in L_grads_method_02])

    D_by_layer = get_conv_layers_transformation_roles(
        ComputationGraph(conv_output))
    individual_sum_square_norm_gradients_method_00 = get_sum_square_norm_gradients_conv_transformations(
        D_by_layer, cost)

    # why does this thing depend on N again ?
    # I don't think I've used a cost that divides by N.

    N = 2
    Xtrain = np.random.randn(N, nbr_channels, image_shape[0],
                             image_shape[1]).astype(np.float32)
    #Xtrain[1:,:,:,:] = 0.0
    Xtrain[:, :, :, :] = 1.0

    convolution_filter_variable = VariableFilter(roles=[FILTER])(
        ComputationGraph([y_hat]).variables)[0]
    convolution_filter_variable_value = convolution_filter_variable.get_value()
    convolution_filter_variable_value[:, :, :, :] = 1.0
    #convolution_filter_variable_value[0,0,:,:] = 1.0
    convolution_filter_variable.set_value(convolution_filter_variable_value)

    f = theano.function([X], [
        cost, individual_sum_square_norm_gradients_method_00,
        sum_square_norm_gradients_method_02
    ])

    [c, v0, gs2] = f(Xtrain)

    #print "[c, v0, gs2]"
    L_c, L_v0, L_gs2 = ([], [], [])
    for n in range(N):
        [nc, nv0, ngs2] = f(Xtrain[n, :, :, :].reshape(
            (1, Xtrain.shape[1], Xtrain.shape[2], Xtrain.shape[3])))
        L_c.append(nc)
        L_v0.append(nv0)
        L_gs2.append(ngs2)

    print "Cost for whole mini-batch in single shot : %f." % c
    print "Cost for whole mini-batch accumulated    : %f." % sum(L_c)
    print ""
    print "Square-norm of all gradients for each data point in single shot :"
    print v0.reshape((1, -1))
    print "Square-norm of all gradients for each data point iteratively :"
    print np.array(L_gs2).reshape((1, -1))
    print ""
    print "Difference max abs : %f." % np.max(np.abs(v0 - np.array(L_gs2)))
    print ""
    print "Ratios : "
    print np.array(L_gs2).reshape((1, -1)) / v0.reshape((1, -1))
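
# --- Illustration (not the project's conv helper): for a single dense layer
# y = x.dot(W), the per-example squared gradient norm w.r.t. W factorises as
# ||x_n||^2 * ||g_n||^2, where g_n is the gradient of the summed cost w.r.t.
# the n-th row of y.  This is the fully-connected analogue of what
# get_sum_square_norm_gradients_conv_transformations computes for convolutions.
import numpy as np
import theano
import theano.tensor as tensor

x = tensor.matrix('x')                      # (batch, in_dim)
W = theano.shared(np.random.randn(4, 3).astype(theano.config.floatX), name='W')
y = tensor.dot(x, W)                        # (batch, out_dim)
cost = tensor.sqr(y).sum()                  # any cost that sums over examples

g = tensor.grad(cost, y)                    # row n holds d cost_n / d y_n
per_example_sqnorm = (x ** 2).sum(axis=1) * (g ** 2).sum(axis=1)

f = theano.function([x], per_example_sqnorm)
x_value = np.random.randn(2, 4).astype(theano.config.floatX)
print(f(x_value))  # one squared gradient norm per example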
Example #10
def train(step_rule, input_dim, state_dim, label_dim, layers, epochs, seed,
          pretrain_alignment, uniform_alignment, dropout, beam_search,
          test_cost, experiment_path, window_features, features, pool_size,
          maximum_frames, initialization, weight_noise, to_watch, patience,
          plot, write_predictions, static_mask, drop_prob, drop_prob_states,
          drop_prob_cells, drop_prob_igates, ogates_zoneout, batch_size,
          stoch_depth, share_mask, gaussian_drop, rnn_type, num_layers,
          norm_cost_coeff, penalty, seq_len, input_drop, augment, **kwargs):

    print '.. PTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    ###########################################
    #
    # LOAD DATA
    #
    ###########################################

    def numpy_rng(random_seed=None):
        if random_seed is None:
            random_seed = 1223
        return numpy.random.RandomState(random_seed)

    #from utilities import onehot, unhot, vec2chars
    # from http://www.iro.umontreal.ca/~memisevr/code/logreg.py
    #def onehot(x,numclasses=None):
    #""" Convert integer encoding for class-labels (starting with 0 !)
    #to one-hot encoding.
    #The output is an array who's shape is the shape of the input array plus
    #an extra dimension, containing the 'one-hot'-encoded labels.
    #"""
    #if x.shape==():
    #x = x[None]
    #if numclasses is None:
    #numclasses = x.max() + 1
    #result = numpy.zeros(list(x.shape) + [numclasses], dtype="int")
    #z = numpy.zeros(x.shape, dtype="int")
    #for c in range(numclasses):
    #z *= 0
    #z[numpy.where(x==c)] = 1
    #result[...,c] += z
    #return result.astype(theano.config.floatX)

    #framelen = 1
    #50 = 50
    ##data = np.load(os.path.join(os.environ['FUEL_DATA_PATH'], 'PennTreebankCorpus/char_level_penntree.npz'))#pentree_char_and_word.npz')
    #data = np.load('char_level_penntree.npz')
    #trainset = data['train']
    #validset = data['valid']

    #allletters = " etanoisrhludcmfpkgybw<>\nvN.'xj$-qz&0193#285\\764/*"
    #dictionary = dict(zip(list(set(allletters)), range(50)))
    #invdict = {v: k for k, v in dictionary.items()}

    #numtrain = len(trainset) / seq_len * seq_len
    #numvalid = len(validset) / seq_len * seq_len
    #trainset = trainset[:numtrain]
    #validset = validset[:numvalid]
    ##if testing:
    ##    train_features_numpy = train_features_numpy[:32 * 5]
    ##    valid_features_numpy = valid_features_numpy[:100]
    #train_targets = trainset.reshape(-1, seq_len*framelen)[:,1:]
    #valid_targets = validset.reshape(-1, seq_len*framelen)[:,1:]
    ## still only 2d (b, t*n)
    #train_features_numpy = onehot(trainset).reshape(-1, 50*seq_len*framelen)[:,:-50]
    #valid_features_numpy = onehot(validset).reshape(-1, 50*seq_len*framelen)[:,:-50]
    #del trainset, validset
    #data_loaded = True
    #print '... done'
    #test_value = train_features_numpy[:32]

    ####################

    ###########################################
    #
    # MAKE STREAMS
    #
    ###########################################
    rng = np.random.RandomState(seed)
    stream_args = dict(rng=rng,
                       pool_size=pool_size,
                       maximum_frames=maximum_frames,
                       pretrain_alignment=pretrain_alignment,
                       uniform_alignment=uniform_alignment,
                       window_features=window_features)
    if share_mask:
        drop_prob_cells = drop_prob
        # we don't want to actually use these masks, so this is to debug
        drop_prob_states = None

    # the threes in here are because the number of layers is hardcoded to 3 atm. NIPS!
    print '.. initializing iterators'

    # train_stream, valid_stream = get_seq_mnist_streams(
    #    h_dim, batch_size, update_prob)
    if static_mask:
        train_stream = get_static_mask_ptb_stream('train',
                                                  batch_size,
                                                  seq_len,
                                                  drop_prob_states,
                                                  drop_prob_cells,
                                                  drop_prob_igates,
                                                  state_dim,
                                                  False,
                                                  augment=augment)
        train_stream_evaluation = get_static_mask_ptb_stream('train',
                                                             batch_size,
                                                             seq_len,
                                                             drop_prob_states,
                                                             drop_prob_cells,
                                                             drop_prob_igates,
                                                             state_dim,
                                                             True,
                                                             augment=augment)
        dev_stream = get_static_mask_ptb_stream('valid',
                                                batch_size,
                                                seq_len,
                                                drop_prob_states,
                                                drop_prob_cells,
                                                drop_prob_igates,
                                                state_dim,
                                                True,
                                                augment=augment)
    else:
        train_stream = get_ptb_stream('train',
                                      batch_size,
                                      seq_len,
                                      drop_prob_states,
                                      drop_prob_cells,
                                      drop_prob_igates,
                                      state_dim,
                                      False,
                                      augment=augment)
        train_stream_evaluation = get_ptb_stream('train',
                                                 batch_size,
                                                 seq_len,
                                                 drop_prob_states,
                                                 drop_prob_cells,
                                                 drop_prob_igates,
                                                 state_dim,
                                                 True,
                                                 augment=augment)
        dev_stream = get_ptb_stream('valid',
                                    batch_size,
                                    seq_len,
                                    drop_prob_states,
                                    drop_prob_cells,
                                    drop_prob_igates,
                                    state_dim,
                                    True,
                                    augment=augment)

    #train_dataset = Timit('train', features=features)
    # assert (train_features_numpy[:,-50:].sum(axis=-2)==1).all()
    #train_features_numpy = train_features_numpy.reshape(-1, seq_len-1, 50)#BTN for shuffled dataset?
    #train_dataset = IndexableDataset(indexables=OrderedDict(
    #[('features', train_features_numpy),
    #('outputs', train_targets)]))

    #train_stream = construct_stream_np(train_dataset, state_dim, batch_size, len(train_targets),
    #drop_prob_states, drop_prob_cells, drop_prob_igates,
    #num_layers=num_layers,
    #is_for_test=False, stoch_depth=stoch_depth, share_mask=share_mask,
    #gaussian_drop=gaussian_drop, input_drop=input_drop, **stream_args)
    ##dev_dataset = Timit('dev', features=features)
    #valid_features_numpy = valid_features_numpy.reshape(-1, seq_len-1,  50)
    #dev_dataset = IndexableDataset(indexables=OrderedDict(
    #[('features', valid_features_numpy),
    #('outputs', valid_targets)]))
    #dev_stream = construct_stream_np(dev_dataset, state_dim, batch_size, len(valid_targets),
    #drop_prob_states, drop_prob_cells, drop_prob_igates,
    #num_layers=num_layers,
    #is_for_test=True, stoch_depth=stoch_depth, share_mask=share_mask,
    #gaussian_drop=gaussian_drop, input_drop=input_drop, **stream_args)
    ##test_dataset = Timit('test', features=features)
    ##test_stream = construct_stream(test_dataset, state_dim, drop_prob_states, drop_prob_cells, drop_prob_igates,  3,
    ##                               is_for_test=True, stoch_depth=stoch_depth, share_mask=share_mask,
    ##                               gaussian_drop=gaussian_drop, **stream_args)
    data = train_stream.get_epoch_iterator(as_dict=True).next()
    #import ipdb; ipdb.set_trace()

    #phone_dict = train_dataset.get_phoneme_dict()
    #phoneme_dict = {k: phone_to_phoneme_dict[v]
    #                if v in phone_to_phoneme_dict else v
    #                for k, v in phone_dict.iteritems()}
    #ind_to_phoneme = {v: k for k, v in phoneme_dict.iteritems()}
    #eol_symbol = ind_to_phoneme['<STOP>']

    ####################

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################

    print '.. building model'

    x = T.tensor3('features', dtype=floatX)
    x, y = x[:-1], x[1:]  #T.lmatrix('outputs')# phonemes')
    drops_states = T.tensor3('drops_states')
    drops_cells = T.tensor3('drops_cells')
    drops_igates = T.tensor3('drops_igates')

    x.tag.test_value = data['features']
    #y.tag.test_value = data['outputs']
    drops_states.tag.test_value = data['drops_states']
    drops_cells.tag.test_value = data['drops_cells']
    drops_igates.tag.test_value = data['drops_igates']

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=.2)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    if rnn_type.lower() == 'lstm':
        in_to_hid = Linear(50,
                           state_dim * 4,
                           name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = DropLSTM(dim=state_dim,
                                   weights_init=weights_init,
                                   activation=Tanh(),
                                   model_type=6,
                                   name='rnn',
                                   ogates_zoneout=ogates_zoneout)
    elif rnn_type.lower() == 'gru':
        in_to_hid = Linear(50,
                           state_dim * 3,
                           name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = DropGRU(dim=state_dim,
                                  weights_init=weights_init,
                                  activation=Tanh(),
                                  name='rnn')
    elif rnn_type.lower() == 'srnn':  #FIXME!!! make ReLU
        in_to_hid = Linear(50,
                           state_dim,
                           name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = DropSimpleRecurrent(dim=state_dim,
                                              weights_init=weights_init,
                                              activation=Rectifier(),
                                              name='rnn')
    else:
        raise NotImplementedError

    #lstm2 = DropLSTM(dim=state_dim, activation=Tanh(), model_type=6)

    #lstm3 = DropLSTM(dim=state_dim, activation=Tanh(), model_type=6)

    #encoder = DropMultiLayerEncoder(weights_init=weights_init,
    #biases_init=Constant(.0),
    #networks=[lstm1, lstm2, bidir3],
    #dims=[input_dim * window_features,
    #state_dim,
    #state_dim,
    #state_dim,
    #label_dim + 1])
    #encoder.initialize()
    #drops_states = [drops_forw_states, drops_back_states]
    #drops_cells = [drops_forw_cells, drops_back_cells]
    #drops_igates = [drops_forw_igates, drops_back_igates]
    hid_to_out = Linear(state_dim,
                        50,
                        name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    in_to_hid.initialize()
    recurrent_layer.initialize()
    hid_to_out.initialize()

    h = in_to_hid.apply(x)

    if rnn_type.lower() == 'lstm':
        yh = recurrent_layer.apply(h, drops_states, drops_cells,
                                   drops_igates)[0]
    else:
        yh = recurrent_layer.apply(h, drops_states, drops_cells, drops_igates)

    y_hat_pre_softmax = hid_to_out.apply(yh)
    shape_ = y_hat_pre_softmax.shape

    # y_hat = Softmax().apply(
    #     y_hat_pre_softmax.reshape((-1, shape_[-1])))# .reshape(shape_)

    ####################

    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################

    # cost = CategoricalCrossEntropy().apply(y.flatten().astype('int64'), y_hat)

    def crossentropy_lastaxes(yhat, y):
        # for sequence of distributions/targets
        return -(y * T.log(yhat)).sum(axis=yhat.ndim - 1)

    def softmax_lastaxis(x):
        # for sequence of distributions
        return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape)

    yhat = softmax_lastaxis(y_hat_pre_softmax)
    cross_entropies = crossentropy_lastaxes(yhat, y)
    cross_entropy = cross_entropies.mean().copy(name="cross_entropy")
    cost = cross_entropy.copy(name="cost")

    batch_cost = cost.copy(name='batch_cost')
    nll_cost = cost.copy(name='nll_cost')
    bpc = (nll_cost / np.log(2.0)).copy(name='bpc')

    #nll_cost = aggregation.mean(batch_cost, batch_size).copy(name='nll_cost')

    cost_monitor = aggregation.mean(
        batch_cost, batch_size).copy(name='sequence_cost_monitor')
    cost_per_character = aggregation.mean(
        batch_cost, (seq_len - 1) * batch_size).copy(name='character_cost')
    cost_train = cost.copy(name='train_batch_cost')
    cost_train_monitor = cost_monitor.copy('train_batch_cost_monitor')
    cg_train = ComputationGraph([cost_train, cost_train_monitor])

    ##################### DK ADD COST ########################
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        return T.sqrt(
            T.maximum(T.sqr(x).sum(axis=axis),
                      numpy.finfo(x.dtype).tiny))

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(
                T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
            ## debugging nans stuff
            #gr = T.grad(norm_cost, cg_train.parameters, disconnected_inputs='ignore')
            #grf = theano.function([x, input_mask], gr)
            #grz = grf(x.tag.test_value, input_mask.tag.test_value)
            #params = cg_train.parameters
            #mynanz = [(pp, np.sum(gg)) for pp,gg in zip(params, grz) if np.isnan(np.sum(gg))]
            #for mm in mynanz: print mm
            ##import ipdb; ipdb.set_trace()
    elif penalty == 'hids':
        assert 'rnn_apply_states' in [
            o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)
        ]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            if output.name == 'rnn_apply_states':
                norms = _magnitude(output)
                norm_cost += T.mean(
                    T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
                ## debugging nans stuff
                #gr = T.grad(norm_cost, cg_train.parameters, disconnected_inputs='ignore')
                #grf = theano.function([x, input_mask], gr)
                #grz = grf(x.tag.test_value, input_mask.tag.test_value)
                #params = cg_train.parameters
                #mynanz = [(pp, np.sum(gg)) for pp,gg in zip(params, grz) if np.isnan(np.sum(gg))]
                #for mm in mynanz: print mm
                ##import ipdb; ipdb.set_trace()

    norm_cost.name = 'norm_cost'
    #cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy(
        'cost_train')  #should this be cost_train.outputs[0]?

    cg_train = ComputationGraph([cost_train,
                                 cost_train_monitor])  #, norm_cost])

    ##################### DK ADD COST ########################

    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')
        cost_train_monitor = cg_train.outputs[1].copy(
            'train_batch_cost_monitor')

    # if 'l2regularization' in kwargs:
    #     weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
    #     cost_train += kwargs['l2regularization'] * sum([
    #         (weight ** 2).sum() for weight in weights])
    #     cost_train.name = 'cost_train'
    #     cg_train = ComputationGraph(cost_train)

    model = Model(cost_train)
    train_cost_per_character = aggregation.mean(
        cost_train_monitor,
        (seq_len - 1) * batch_size).copy(name='train_character_cost')

    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)

    observed_vars = [
        cost_train, cost_train_monitor, train_cost_per_character,
        aggregation.mean(algorithm.total_gradient_norm)
    ]
    # parameters = model.get_parameter_dict()
    # for name, param in parameters.iteritems():
    #     observed_vars.append(param.norm(2).copy(name=name + "_norm"))
    #     observed_vars.append(
    #         algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm"))
    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

    dev_monitor = DataStreamMonitoring(variables=[nll_cost, bpc],
                                       data_stream=dev_stream,
                                       prefix="dev")
    #train_ctc_monitor = CTCMonitoring(
    #x, input_mask,
    #drops_forw_states, drops_forw_cells, drops_forw_igates,
    #drops_back_states, drops_back_cells, drops_back_igates,
    #y_hat, eol_symbol, train_stream,
    #prefix='train', every_n_epochs=1,
    #before_training=True,
    #phoneme_dict=phoneme_dict,
    #black_list=black_list, train=True)
    #dev_ctc_monitor = CTCMonitoring(
    #x, input_mask,
    #drops_forw_states, drops_forw_cells, drops_forw_igates,
    #drops_back_states, drops_back_cells, drops_back_igates,
    #y_hat, eol_symbol, dev_stream,
    #prefix='dev', every_n_epochs=1,
    #phoneme_dict=phoneme_dict,
    #black_list=black_list)

    extensions = []
    # /u/pezeshki/speech_project/five_layer_timit/trained_params_best.npz
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

        #_evaluator = CTCEvaluator(eol_symbol, x, input_mask, y_hat,
        #phoneme_dict=phoneme_dict,
        #black_list=black_list)

        #logger.info("CTC monitoring on TEST data started")
        #value_dict = _evaluator.evaluate(test_stream, False)
        #print value_dict.items()
        #logger.info("CTC monitoring on TEST data finished")

        #logger.info("CTC monitoring on TRAIN data started")
        #value_dict = _evaluator.evaluate(train_stream, True)
        #print value_dict.items()
        #logger.info("CTC monitoring on TRAIN data finished")

        #logger.info("CTC monitoring on DEV data started")
        #value_dict = _evaluator.evaluate(dev_stream, False)
        #print value_dict.items()
        #logger.info("CTC monitoring on DEV data finished")

    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])
    #train_ctc_monitor,
    #dev_ctc_monitor])

    if test_cost:
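        # NOTE (assumption): test_stream is expected to be defined elsewhere;
        # it is not constructed in the code shown above (only a commented-out
        # construct_stream call for the test set appears earlier).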
        test_monitor = DataStreamMonitoring(
            variables=[cost_monitor, cost_per_character],
            data_stream=test_stream,
            prefix="test")
        extensions.append(test_monitor)

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(
        SaveParams('dev_nll_cost', model, experiment_path, every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)
    # if write_predictions:
    #     with open('predicted.txt', 'w') as f_pred:
    #         with open('targets.txt', 'w') as f_targets:
    #             evaluator = CTCEvaluator(
    #                 eol_symbol, x, input_mask, y_hat, phoneme_dict, black_list)
    #             evaluator.evaluate(dev_stream, file_pred=f_pred,
    #                                file_targets=f_targets)
    #     return
    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
Example #11
# load training data using Fuel
mnist_train = MNIST("train")
train_stream = Flatten(DataStream.default_stream(
                        dataset=mnist_train,
                        iteration_scheme=SequentialScheme(mnist_train.num_examples, 128)),
                       )

# load testing data
mnist_test = MNIST("test")
test_stream = Flatten(DataStream.default_stream(
                        dataset=mnist_test,
                        iteration_scheme=SequentialScheme(mnist_test.num_examples, 1024)),
                      )

# train the model
from blocks.model import Model
main_loop = MainLoop(
    model=Model(cost),
    data_stream=train_stream,
    algorithm=GradientDescent(
    cost=cost, params=ComputationGraph(cost).parameters,
    step_rule=Scale(learning_rate=0.1)),
    extensions=[FinishAfter(after_n_epochs=5),
        DataStreamMonitoring(
            variables=[cost, error_rate],
            data_stream=test_stream,
            prefix="test"),
        Printing()])

main_loop.run()
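
The MainLoop above assumes `cost` and `error_rate` were already defined earlier in the script. A minimal sketch of such a graph, assuming a single-hidden-layer Blocks MLP on flattened 28x28 MNIST digits (names here are illustrative, not from the original source):

from theano import tensor
from blocks.bricks import MLP, Rectifier, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate
from blocks.initialization import IsotropicGaussian, Constant

x = tensor.matrix('features')
y = tensor.lmatrix('targets')

mlp = MLP(activations=[Rectifier(), Softmax()], dims=[784, 100, 10],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.initialize()

probs = mlp.apply(x)
cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
cost.name = 'cost'
error_rate = MisclassificationRate().apply(y.flatten(), probs)
error_rate.name = 'error_rate'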
top_mlp_dims = [numpy.prod(conv_sequence4.get_dim('output'))
                ] + [mlp_hiddens] + [output_size]
top_mlp = MLP(mlp_activation,
              top_mlp_dims,
              weights_init=Uniform(width=0.2),
              biases_init=Constant(0.))
top_mlp.initialize()

probs = top_mlp.apply(out)
cost = CategoricalCrossEntropy(name='Cross1').apply(y.flatten(),
                                                    probs).copy(name='cost1')

error_rate = (MisclassificationRate().apply(y.flatten(),
                                            probs).copy(name='error_rate'))
error_rate2 = error_rate.copy(name='error_rate2')
cg = ComputationGraph([cost, error_rate])
weights = VariableFilter(roles=[FILTER, WEIGHT])(cg.variables)

########### Loading images #####################

from fuel.datasets.dogs_vs_cats import DogsVsCats
from fuel.streams import DataStream, ServerDataStream
from fuel.schemes import ShuffledScheme
from fuel.transformers.image import RandomFixedSizeCrop, MinimumImageDimensions, Random2DRotation
from fuel.transformers import Flatten, Cast, ScaleAndShift


def create_data(data):

    stream = DataStream(data,
                        iteration_scheme=ShuffledScheme(
Example #13
def train_ladder(cli_params, dataset=None, save_to='results/ova_all_full'):
    cli_params['save_dir'] = prepare_dir(save_to)
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    training_algorithm = GradientDescent(
        cost=ladder.costs.total,
        params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    short_prints = {
        "train": {
            'T_C_class': ladder.costs.class_corr,
            'T_C_de': ladder.costs.denois.values(),
        },
        "valid_approx":
        OrderedDict([
            ('V_C_class', ladder.costs.class_clean),
            ('V_E', ladder.error.clean),
            ('V_C_de', ladder.costs.denois.values()),
        ]),
        "valid_final":
        OrderedDict([
            ('VF_C_class', ladder.costs.class_clean),
            ('VF_E', ladder.error.clean),
            ('VF_C_de', ladder.costs.denois.values()),
        ]),
    }

    ovadataset = dataset['ovadataset']
    train_indexes = dataset['train_indexes']
    val_indexes = dataset['val_indexes']

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(ovadataset,
                        train_indexes,
                        p.batch_size,
                        scheme=ShuffledScheme),
        model=Model(ladder.costs.total),
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean] +
                ladder.costs.denois.values(),
                make_datastream(ovadataset, val_indexes, p.batch_size),
                prefix="valid_approx"),

            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean_mc] +
                ladder.costs.denois.values(),
                make_datastream(ovadataset, train_indexes, p.batch_size),
                make_datastream(ovadataset, val_indexes, p.batch_size),
                prefix="valid_final",
                after_n_epochs=p.num_epochs),
            TrainingDataMonitoring([
                ladder.costs.total, ladder.costs.class_corr,
                training_algorithm.total_gradient_norm
            ] + ladder.costs.denois.values(),
                                   prefix="train",
                                   after_epoch=True),
            ShortPrinting(short_prints),
            LRDecay(ladder.lr,
                    p.num_epochs * p.lrate_decay,
                    p.num_epochs,
                    after_epoch=True),
        ])
    main_loop.run()

    # Get results
    df = main_loop.log.to_dataframe()
    col = 'valid_final_error_matrix_cost'
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    ds = make_datastream(ovadataset, val_indexes, p.batch_size)
    outputs = ladder.act.clean.labeled.h[len(ladder.layers) - 1]
    outputreplacer = TestMonitoring()
    _, _, outputs = outputreplacer._get_bn_params(outputs)

    cg = ComputationGraph(outputs)
    f = cg.get_theano_function()

    it = ds.get_epoch_iterator(as_dict=True)
    res = []
    inputs = {
        'features_labeled': [],
        'targets_labeled': [],
        'features_unlabeled': []
    }
    # Loop over one epoch
    for d in it:
        # Store all inputs
        for k, v in d.iteritems():
            inputs[k] += [v]
        # Store outputs
        res += [f(*[d[str(inp)] for inp in cg.inputs])]

    # Concatenate all minibatches
    res = [numpy.vstack(minibatches) for minibatches in zip(*res)]
    inputs = {k: numpy.vstack(v) for k, v in inputs.iteritems()}

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return res[0], inputs
def test_convolutional_layer():
	batch_size=2
	x = T.tensor4();
	y = T.ivector()
	V = 200
	layer_conv = Convolutional(filter_size=(5,5),num_filters=V,
				name="toto",
				weights_init=IsotropicGaussian(0.01),
				biases_init=Constant(0.0))
	# try with no bias
	activation = Rectifier()
	pool = MaxPooling(pooling_size=(2,2))

	convnet = ConvolutionalSequence([layer_conv, activation, pool], num_channels=15,
					image_size=(10,10),
					name="conv_section")
	convnet.push_allocation_config()
	convnet.initialize()
	output=convnet.apply(x)
	batch_size=output.shape[0]
	output_dim=np.prod(convnet.get_dim('output'))
	result_conv = output.reshape((batch_size, output_dim))
	mlp=MLP(activations=[Rectifier().apply], dims=[output_dim, 10],
				weights_init=IsotropicGaussian(0.01),
				biases_init=Constant(0.0))
	mlp.initialize()
	output=mlp.apply(result_conv)
	cost = T.mean(Softmax().categorical_cross_entropy(y.flatten(), output))
	cg = ComputationGraph(cost)
	W = VariableFilter(roles=[WEIGHT])(cg.variables)
	B = VariableFilter(roles=[BIAS])(cg.variables)
	W = W[-1]; b = B[-1]
	
	print W.shape.eval()
	print b.shape.eval()
	import pdb
	pdb.set_trace()
	inputs_conv = VariableFilter(roles=[INPUT], bricks=[Convolutional])(cg)
	outputs_conv = VariableFilter(roles=[OUTPUT], bricks=[Convolutional])(cg)
	var_input=inputs_conv[0]
	var_output=outputs_conv[0]
	
	[d_W,d_S,d_b] = T.grad(cost, [W, var_output, b])

	import pdb
	pdb.set_trace()
	w_shape = W.shape.eval()
	d_W = d_W.reshape((w_shape[0], w_shape[1]*w_shape[2]*w_shape[3]))

	d_b = T.zeros((w_shape[0],6*6))
	#d_b = d_b.reshape((w_shape[0], 8*8))
	d_p = T.concatenate([d_W, d_b], axis=1)
	d_S = d_S.dimshuffle((1, 0, 2, 3)).reshape((w_shape[0], batch_size, 6*6)).reshape((w_shape[0], batch_size*6*6))
	#d_S = d_S.reshape((2,200, 64))
	#x_value=1e3*np.random.ranf((1,15,10,10))
	x_value = 1e3*np.random.ranf((2,15, 10, 10))
	f = theano.function([x,y], [var_input, d_S, d_W], allow_input_downcast=True, on_unused_input='ignore')
	A, B, C= f(x_value, [5, 5])
	print np.mean(B)
	return
	
	E_A = expansion_op(A, (2, 15, 10, 10), (5,5))
	print E_A.shape
	E_A = E_A.reshape((2*36, C.shape[1]))
	print E_A.shape
	tmp = C - np.dot(B, E_A)
	print lin.norm(tmp, 'fro')
Example #15
# Convert RGB to BGR
texture_image_nn_input = texture_image_nn_input[::-1, :, :] - MEAN_VALUES
texture_image_nn_input = texture_image_nn_input.astype('float32')

# print texture_image_nn_input
print texture_image_nn_input.shape

f_features_gram = theano.function(
    inputs=[X], outputs=[gram_matrix(f) for f in texture_features(X)])
target_image_features = f_features_gram(texture_image_nn_input)
# print target_image_features
print [t.shape for t in target_image_features]

from blocks.graph import ComputationGraph, apply_batch_normalization, get_batch_normalization_updates

cg = ComputationGraph(generated_image_graph)
cg_bn = apply_batch_normalization(cg)
pop_updates = get_batch_normalization_updates(cg_bn)

text_generated = texture_features(cg.outputs[0])
gram_generated = [gram_matrix(f) for f in text_generated]

loss = 0
for i in range(len(target_image_features)):
    N = text_generated[i].shape[1]
    M = text_generated[i].shape[2] * text_generated[i].shape[3]
    loss += 1. / (4 * 16 * N**2 * M**2) * (
        (gram_generated[i] - tensor.addbroadcast(
            theano.shared(target_image_features[i]), 0))**2).sum()

alpha = 0.1
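
Neither `gram_matrix` nor `texture_features` is shown in this fragment. A minimal sketch of a Gram-matrix helper that is consistent with the loss above, assuming bc01 feature maps of shape (batch, channels, height, width) — an illustration, not the original helper:

from theano import tensor

def gram_matrix(features):
    # Flatten the spatial dimensions, then take inner products between
    # channel responses; the result has shape (batch, channels, channels).
    flat = features.reshape((features.shape[0], features.shape[1], -1))
    return tensor.batched_dot(flat, flat.dimshuffle(0, 2, 1))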
Example #16
        if not key.startswith('__') and isinstance(getattr(config, key),
                                                   (int, str, list, tuple)):
            logger.info('    %20s %s' % (key, str(getattr(config, key))))

    model = config.Model(config)
    model.initialize()

    stream = config.Stream(config)
    inputs = stream.inputs()
    req_vars = model.cost.inputs

    train_stream = stream.train(req_vars)
    valid_stream = stream.valid(req_vars)

    cost = model.cost(**inputs)
    cg = ComputationGraph(cost)
    monitored = set([cost] + VariableFilter(roles=[roles.COST])(cg.variables))

    valid_monitored = monitored
    if hasattr(model, 'valid_cost'):
        valid_cost = model.valid_cost(**inputs)
        valid_cg = ComputationGraph(valid_cost)
        valid_monitored = set([valid_cost] + VariableFilter(
            roles=[roles.COST])(valid_cg.variables))

    if hasattr(config, 'dropout') and config.dropout < 1.0:
        cg = apply_dropout(cg, config.dropout_inputs(cg), config.dropout)
    if hasattr(config, 'noise') and config.noise > 0.0:
        cg = apply_noise(cg, config.noise_inputs(cg), config.noise)
    cost = cg.outputs[0]
    cg = Model(cost)
Example #17
File: run.py  Project: zero76114/ladder
def analyze(cli_params):
    p, _ = load_and_log_params(cli_params)
    _, data, whiten, cnorm = setup_data(p, test_set=True)
    ladder = setup_model(p)

    # Analyze activations
    dset, indices, calc_batchnorm = {
        'train': (data.train, data.train_ind, False),
        'valid': (data.valid, data.valid_ind, True),
        'test':  (data.test, data.test_ind, True),
    }[p.data_type]

    if calc_batchnorm:
        logger.info('Calculating batch normalization for clean.labeled path')
        main_loop = DummyLoop(
            extensions=[
                FinalTestMonitoring(
                    [ladder.costs.class_clean, ladder.error.clean]
                    + ladder.costs.denois.values(),
                    make_datastream(data.train, data.train_ind,
                                    # These need to match with the training
                                    p.batch_size,
                                    n_labeled=p.labeled_samples,
                                    n_unlabeled=len(data.train_ind),
                                    cnorm=cnorm,
                                    whiten=whiten, scheme=ShuffledScheme),
                    make_datastream(data.valid, data.valid_ind,
                                    p.valid_batch_size,
                                    n_labeled=len(data.valid_ind),
                                    n_unlabeled=len(data.valid_ind),
                                    cnorm=cnorm,
                                    whiten=whiten, scheme=ShuffledScheme),
                    prefix="valid_final", before_training=True),
                ShortPrinting({
                    "valid_final": OrderedDict([
                        ('VF_C_class', ladder.costs.class_clean),
                        ('VF_E', ladder.error.clean),
                        ('VF_C_de', [ladder.costs.denois.get(0),
                                     ladder.costs.denois.get(1),
                                     ladder.costs.denois.get(2),
                                     ladder.costs.denois.get(3)]),
                    ]),
                }, after_training=True, use_log=False),
            ])
        main_loop.run()

    # Make a datastream that has all the indices in the labeled pathway
    ds = make_datastream(dset, indices,
                         batch_size=p.get('batch_size'),
                         n_labeled=len(indices),
                         n_unlabeled=len(indices),
                         balanced_classes=False,
                         whiten=whiten,
                         cnorm=cnorm,
                         scheme=SequentialScheme)

    # We want out the values after softmax
    outputs = ladder.act.clean.labeled.h[len(ladder.layers) - 1]

    # Replace the batch normalization parameters with the shared variables
    if calc_batchnorm:
        outputreplacer = TestMonitoring()
        _, _,  outputs = outputreplacer._get_bn_params(outputs)

    cg = ComputationGraph(outputs)
    f = cg.get_theano_function()

    it = ds.get_epoch_iterator(as_dict=True)
    res = []
    inputs = {'features_labeled': [],
              'targets_labeled': [],
              'features_unlabeled': []}
    # Loop over one epoch
    for d in it:
        # Store all inputs
        for k, v in d.iteritems():
            inputs[k] += [v]
        # Store outputs
        res += [f(*[d[str(inp)] for inp in cg.inputs])]

    # Concatenate all minibatches
    res = [numpy.vstack(minibatches) for minibatches in zip(*res)]
    inputs = {k: numpy.vstack(v) for k, v in inputs.iteritems()}

    return inputs['targets_labeled'], res[0]
    
    # scale is applied before shift
    train_stream = ScaleAndShift(train_stream, scl, shft)
    test_stream = ScaleAndShift(test_stream, scl, shft)
    baseline_uniform_noise = 1./255. # appropriate for MNIST and CIFAR10 Fuel datasets, which are scaled [0,1]
    uniform_noise = baseline_uniform_noise/scl

    ## initialize the model
    dpm = model.DiffusionModel(spatial_width, n_colors, uniform_noise=uniform_noise, **model_args)
    dpm.initialize()

    ## set up optimization
    features = T.matrix('features', dtype=theano.config.floatX)
    cost = dpm.cost(features)
    blocks_model = blocks.model.Model(cost)
    cg_nodropout = ComputationGraph(cost)
    if args.dropout_rate > 0:
        # DEBUG this triggers an error on my machine
        # apply dropout to all the input variables
        inputs = VariableFilter(roles=[INPUT])(cg_nodropout.variables)
        # dropconnect
        # inputs = VariableFilter(roles=[PARAMETER])(cg_nodropout.variables)
        cg = apply_dropout(cg_nodropout, inputs, args.dropout_rate)
    else:
        cg = cg_nodropout
    step_compute = RMSProp(learning_rate=args.lr, max_scaling=1e10)
    algorithm = GradientDescent(step_rule=CompositeRule([RemoveNotFinite(),
        step_compute]),
        parameters=cg.parameters, cost=cost)
    extension_list = []
    extension_list.append(
Example #19
def main(name, epochs, batch_size, learning_rate):
    if name is None:
        name = "att-rw"

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate)
    print()

    #------------------------------------------------------------------------

    img_height, img_width = 28, 28

    read_N = 12
    write_N = 14

    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }

    x_dim = img_height * img_width

    reader = ZoomableAttentionWindow(img_height, img_width, read_N)
    writer = ZoomableAttentionWindow(img_height, img_width, write_N)

    # Parameterize the attention reader and writer
    mlpr = MLP(activations=[Tanh(), Identity()],
               dims=[x_dim, 50, 5],
               name="RMLP",
               **inits)
    mlpw = MLP(activations=[Tanh(), Identity()],
               dims=[x_dim, 50, 5],
               name="WMLP",
               **inits)

    # MLP between the reader and writer
    mlp = MLP(activations=[Tanh(), Identity()],
              dims=[read_N**2, 300, write_N**2],
              name="MLP",
              **inits)

    for brick in [mlpr, mlpw, mlp]:
        brick.allocate()
        brick.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    hr = mlpr.apply(x)
    hw = mlpw.apply(x)

    center_y, center_x, delta, sigma, gamma = reader.nn2att(hr)
    r = reader.read(x, center_y, center_x, delta, sigma)

    h = mlp.apply(r)

    center_y, center_x, delta, sigma, gamma = writer.nn2att(hw)
    c = writer.write(h, center_y, center_x, delta, sigma) / gamma
    x_recons = T.nnet.sigmoid(c)

    cost = BinaryCrossEntropy().apply(x, x_recons)
    cost.name = "cost"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        parameters=params,
        step_rule=CompositeRule([
            RemoveNotFinite(),
            Adam(learning_rate),
            StepClipping(3.),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    #for v in [center_y, center_x, log_delta, log_sigma, log_gamma]:
    #    v_mean = v.mean()
    #    v_mean.name = v.name
    #    monitors += [v_mean]
    #    monitors += [aggregation.mean(v)]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["cost"],
    ]

    #------------------------------------------------------------

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])
    #mnist_train = MNIST("train", binary=True, sources=['features'])
    #mnist_test = MNIST("test", binary=True, sources=['features'])

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=ForceFloatX(
            DataStream(mnist_train,
                       iteration_scheme=SequentialScheme(
                           mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(
                    DataStream(mnist_test,
                               iteration_scheme=SequentialScheme(
                                   mnist_test.num_examples, batch_size))),
                prefix="test"),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            #Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()
Example #20
    def __init__(self,
                 input1_size,
                 input2_size,
                 lookup1_dim=200,
                 lookup2_dim=200,
                 hidden_size=512):
        self.hidden_size = hidden_size
        self.input1_size = input1_size
        self.input2_size = input2_size
        self.lookup1_dim = lookup1_dim
        self.lookup2_dim = lookup2_dim

        x1 = tensor.lmatrix('durations')
        x2 = tensor.lmatrix('syllables')
        y = tensor.lmatrix('pitches')

        lookup1 = LookupTable(dim=self.lookup1_dim,
                              length=self.input1_size,
                              name='lookup1',
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        lookup1.initialize()
        lookup2 = LookupTable(dim=self.lookup2_dim,
                              length=self.input2_size,
                              name='lookup2',
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        lookup2.initialize()
        merge = Merge(['lookup1', 'lookup2'],
                      [self.lookup1_dim, self.lookup2_dim],
                      self.hidden_size,
                      weights_init=initialization.Uniform(width=0.01),
                      biases_init=Constant(0))
        merge.initialize()
        recurrent_block = LSTM(
            dim=self.hidden_size,
            activation=Tanh(),
            weights_init=initialization.Uniform(width=0.01)
        )  #RecurrentStack([LSTM(dim=self.hidden_size, activation=Tanh())] * 3)
        recurrent_block.initialize()
        linear = Linear(input_dim=self.hidden_size,
                        output_dim=self.input1_size,
                        weights_init=initialization.Uniform(width=0.01),
                        biases_init=Constant(0))
        linear.initialize()
        softmax = NDimensionalSoftmax()

        l1 = lookup1.apply(x1)
        l2 = lookup2.apply(x2)
        m = merge.apply(l1, l2)
        h = recurrent_block.apply(m)
        a = linear.apply(h)

        y_hat = softmax.apply(a, extra_ndim=1)
        # ValueError: x must be 1-d or 2-d tensor of floats. Got TensorType(float64, 3D)

        self.Cost = softmax.categorical_cross_entropy(y, a,
                                                      extra_ndim=1).mean()

        self.ComputationGraph = ComputationGraph(self.Cost)

        self.Model = Model(y_hat)
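
The ValueError quoted in the comment above is what a plain Softmax raises on a 3-D readout; NDimensionalSoftmax with extra_ndim=1 folds the extra leading dimension into the batch dimension before applying the usual 2-D operation. A small self-contained sketch of that call (variable names and shapes are illustrative, not taken from this project):

import numpy
import theano
from theano import tensor
from blocks.bricks import NDimensionalSoftmax

a = tensor.tensor3('a')   # (batch, sequence, classes) pre-softmax activations
y = tensor.lmatrix('y')   # (batch, sequence) integer targets

softmax = NDimensionalSoftmax()
cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean()

f = theano.function([a, y], cost)
print(f(numpy.random.rand(2, 4, 6).astype(theano.config.floatX),
        numpy.random.randint(0, 6, size=(2, 4)).astype('int64')))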
Example #21
def main(config, tr_stream, dev_stream, use_bokeh=False):

    logger.info('Building RNN encoder-decoder')
    cost, samples, search_model = create_model(config)
    #cost, samples, search_model = create_multitask_model(config)

    logger.info("Building model")
    cg = ComputationGraph(cost)
    training_model = Model(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model,
                    data_stream=tr_stream,
                    src_vocab=config['src_vocab'],
                    trg_vocab=config['trg_vocab'],
                    phones_vocab=config['phones'],
                    hook_samples=config['hook_samples'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on f1
    if config['f1_validation'] is not None:
        logger.info("Building f1 validator")
        extensions.append(
            F1Validator(samples=samples,
                        config=config,
                        model=search_model,
                        data_stream=dev_stream,
                        normalize=config['normalized_f1'],
                        every_n_batches=config['f1_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])(),
                                    RemoveNotFinite()
                                ]),
                                on_unused_sources='warn')

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
Example #22
def main(config): 
	vocab_src, _ = text_to_dict([config['train_src'],
		config['dev_src'], config['test_src']])
	vocab_tgt, cabvo = text_to_dict([config['train_tgt'],
		config['dev_tgt']])

	# Create Theano variables
	logger.info('Creating theano variables')
	source_sentence = tensor.lmatrix('source')
	source_sentence_mask = tensor.matrix('source_mask')
	target_sentence = tensor.lmatrix('target')
	target_sentence_mask = tensor.matrix('target_mask')
	source_sentence.tag.test_value = [[13, 20, 0, 20, 0, 20, 0],
										[1, 4, 8, 4, 8, 4, 8],]
	source_sentence_mask.tag.test_value = [[0, 1, 0, 1, 0, 1, 0],
											[1, 0, 1, 0, 1, 0, 1],]
	target_sentence.tag.test_value = [[0,1,1,5],
										[2,0,1,0],]
	target_sentence_mask.tag.test_value = [[0,1,1,0],
											[1,1,1,0],]


	logger.info('Building RNN encoder-decoder')
	### Building Encoder 
	embedder = LookupTable(
		length=len(vocab_src), 
		dim=config['embed_src'], 
		weights_init=IsotropicGaussian(),
		biases_init=Constant(0.0), 
		name='embedder')
	transformer = Linear(
		config['embed_src'], 
		config['hidden_src']*4, 
		weights_init=IsotropicGaussian(),
		biases_init=Constant(0.0), 
		name='transformer')

	lstminit = np.asarray([0.0,]*config['hidden_src']+[0.0,]*config['hidden_src']+[1.0,]*config['hidden_src']+[0.0,]*config['hidden_src'])
	encoder = Bidirectional(
		LSTM(
			dim=config['hidden_src'], 
			weights_init=IsotropicGaussian(0.01),
			biases_init=Constant(lstminit)),
		name='encoderBiLSTM'
		)
	encoder.prototype.weights_init = Orthogonal()
	
	### Building Decoder 
	lstminit = np.asarray([0.0,]*config['hidden_tgt']+[0.0,]*config['hidden_tgt']+[1.0,]*config['hidden_tgt']+[0.0,]*config['hidden_tgt'])
	transition = LSTM2GO(
		attended_dim=config['hidden_tgt'], 
		dim=config['hidden_tgt'], 
		weights_init=IsotropicGaussian(0.01),
		biases_init=Constant(lstminit), 
		name='decoderLSTM')

	attention = SequenceContentAttention( 
		state_names=transition.apply.states, # default activation is Tanh
		state_dims=[config['hidden_tgt']],
		attended_dim=config['hidden_src']*2,
		match_dim=config['hidden_tgt'], 
		name="attention")

	readout = Readout(
		source_names=['states', 
			'feedback', 
			attention.take_glimpses.outputs[0]],
		readout_dim=len(vocab_tgt),
		emitter = SoftmaxEmitter(
			name='emitter'), 
		feedback_brick = LookupFeedback(
			num_outputs=len(vocab_tgt), 
			feedback_dim=config['embed_tgt'], 
			name='feedback'), 
		post_merge=InitializableFeedforwardSequence([
			Bias(dim=config['hidden_tgt'], 
				name='softmax_bias').apply,
			Linear(input_dim=config['hidden_tgt'], 
				output_dim=config['embed_tgt'],
				use_bias=False, 
				name='softmax0').apply,
			Linear(input_dim=config['embed_tgt'], 
				name='softmax1').apply]),
		merged_dim=config['hidden_tgt'])

	decoder = SequenceGenerator(
		readout=readout, 
		transition=transition, 
		attention=attention, 
		weights_init=IsotropicGaussian(0.01), 
		biases_init=Constant(0),
		name="generator",
		fork=Fork(
			[name for name in transition.apply.sequences if name != 'mask'], 
			prototype=Linear()),
		add_contexts=True)
	decoder.transition.weights_init = Orthogonal()

	#printchildren(encoder, 1)
	# Initialize model
	logger.info('Initializing model')
	embedder.initialize()
	transformer.initialize()
	encoder.initialize()
	decoder.initialize()
	
	# Apply model 
	embedded = embedder.apply(source_sentence)
	transformed = transformer.apply(embedded)
	encoded = encoder.apply(transformed)[0]
	generated = decoder.generate(
		n_steps=2*source_sentence.shape[1], 
		batch_size=source_sentence.shape[0], 
		attended = encoded.dimshuffle(1,0,2), 
		attended_mask=tensor.ones(source_sentence.shape).T
		)
	print 'Generated: ', generated
	# generator_generate_outputs
	#samples = generated[1] # For GRU 
	samples = generated[2] # For LSTM
	samples.name = 'samples'
	#samples_cost = generated[4] # For GRU 
	samples_cost = generated[5] # For LSTM
	samples_cost.name = 'sampling_cost'
	cost = decoder.cost(
		mask = target_sentence_mask.T, 
		outputs = target_sentence.T, 
		attended = encoded.dimshuffle(1,0,2), 
		attended_mask = source_sentence_mask.T)
	cost.name = 'target_cost'
	cost.tag.aggregation_scheme = TakeLast(cost)
	model = Model(cost)
	
	logger.info('Creating computational graph')
	cg = ComputationGraph(cost)
	
	# apply dropout for regularization
	if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog
		logger.info('Applying dropout')
		dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output']
		cg = apply_dropout(cg, dropout_inputs, config['dropout'])

	######## 
	# Print shapes
	shapes = [param.get_value().shape for param in cg.parameters]
	logger.info("Parameter shapes: ")
	for shape, count in Counter(shapes).most_common():
		logger.info('	{:15}: {}'.format(shape, count))
	logger.info("Total number of parameters: {}".format(len(shapes)))

	printchildren(embedder, 1)
	printchildren(transformer, 1)
	printchildren(encoder, 1)
	printchildren(decoder, 1)
	# Print parameter names
	# enc_dec_param_dict = merge(Selector(embedder).get_parameters(), Selector(encoder).get_parameters(), Selector(decoder).get_parameters())
	# enc_dec_param_dict = merge(Selector(decoder).get_parameters())
	# logger.info("Parameter names: ")
	# for name, value in enc_dec_param_dict.items():
	# 	logger.info('	{:15}: {}'.format(value.get_value().shape, name))
	# logger.info("Total number of parameters: {}".format(len(enc_dec_param_dict)))
	##########

	# Training data 
	train_stream = get_train_stream(config, 
		[config['train_src'],], [config['train_tgt'],], 
		vocab_src, vocab_tgt)
	dev_stream = get_dev_stream(
		[config['dev_src'],], [config['dev_tgt'],], 
		vocab_src, vocab_tgt)
	test_stream = get_test_stream([config['test_src'],], vocab_src)

	# Set extensions
	logger.info("Initializing extensions")
	extensions = [
		FinishAfter(after_n_batches=config['finish_after']),
		ProgressBar(),
		TrainingDataMonitoring([cost], 
			prefix="tra", 
			after_batch=True),
		DataStreamMonitoring(variables=[cost], 
			data_stream=dev_stream, 
			prefix="dev", 
			after_batch=True), 
		Sampler(
			model=Model(samples), 
			data_stream=dev_stream,
			vocab=cabvo,
			saveto=config['saveto']+'dev',
			every_n_batches=config['save_freq']), 
		Sampler(
			model=Model(samples), 
			data_stream=test_stream,
			vocab=cabvo,
			saveto=config['saveto']+'test',
			after_n_batches=1, 
			on_resumption=True,
			before_training=True), 
		Plotter(saveto=config['saveto'], after_batch=True),
		Printing(after_batch=True),
		Checkpoint(
			path=config['saveto'], 
			parameters = cg.parameters,
			save_main_loop=False,
			every_n_batches=config['save_freq'])]
	if BOKEH_AVAILABLE:
		extensions.append(Plot('Training cost', channels=[['target_cost']], after_batch=True))
	if config['reload']: 
		extensions.append(Load(path=config['saveto'], 
			load_iteration_state=False, 
			load_log=False))
	else: 
		with open(config['saveto']+'.txt', 'w') as f: 
			pass 

	# Set up training algorithm
	logger.info("Initializing training algorithm")
	algorithm = GradientDescent(cost=cost, 
		parameters=cg.parameters,
		step_rule=CompositeRule([StepClipping(config['step_clipping']), 
			eval(config['step_rule'])()])
    )

	# Initialize main loop
	logger.info("Initializing main loop")
	main_loop = MainLoop(
		model=model,
		algorithm=algorithm,
		data_stream=train_stream,
		extensions=extensions)
	main_loop.run()
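
Both this example and the previous one turn config['step_rule'] into a Blocks step rule with eval(). A hedged alternative sketch (not from either project) that avoids evaluating arbitrary configuration strings is a plain name-to-class mapping:

from blocks.algorithms import AdaDelta, Adam, Momentum, RMSProp, Scale

STEP_RULES = {
    'AdaDelta': AdaDelta,
    'Adam': Adam,
    'Momentum': Momentum,
    'RMSProp': RMSProp,
    'Scale': Scale,
}

def build_step_rule(name, **kwargs):
    # Raises KeyError for unknown names instead of executing arbitrary code.
    return STEP_RULES[name](**kwargs)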
Example #23
def main(num_epochs=50, batch_normalized=True, alpha=0.1):
    """Run the example.

    Parameters
    ----------
    num_epochs : int, optional
        Number of epochs for which to train.

    batch_normalized : bool, optional
        Batch-normalize the training graph. Defaults to `True`.

    alpha : float, optional
        Weight to apply to a new sample when calculating running
        averages for population statistics (1 - alpha weight is
        given to the existing average).

    """
    if batch_normalized:
        # Add an extra keyword argument that only BatchNormalizedMLP takes,
        # in order to speed things up at the cost of a bit of extra memory.
        mlp_class = BatchNormalizedMLP
        extra_kwargs = {'conserve_memory': False}
    else:
        mlp_class = MLP
        extra_kwargs = {}
    mlp = mlp_class([Logistic(), Logistic(),
                     Logistic(), Softmax()], [2, 5, 5, 5, 3],
                    weights_init=IsotropicGaussian(0.2),
                    biases_init=Constant(0.),
                    **extra_kwargs)
    mlp.initialize()

    # Generate a dataset with 3 spiral arms, using 8000 examples for
    # training and 2000 for testing.
    dataset = Spiral(num_examples=10000,
                     classes=3,
                     sources=['features', 'label'],
                     noise=0.05)
    train_stream = DataStream(dataset,
                              iteration_scheme=ShuffledScheme(examples=8000,
                                                              batch_size=20))
    test_stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(
                                 examples=list(range(8000, 10000)),
                                 batch_size=2000))

    # Build a cost graph; this contains BatchNormalization bricks that will
    # by default run in inference mode.
    features = tensor.matrix('features')
    label = tensor.lvector('label')
    prediction = mlp.apply(features)
    cost = CategoricalCrossEntropy().apply(label, prediction)
    misclass = MisclassificationRate().apply(label, prediction)
    misclass.name = 'misclass'  # The default name for this is annoyingly long
    original_cg = ComputationGraph([cost, misclass])

    if batch_normalized:
        cg = apply_batch_normalization(original_cg)
        # Add updates for population parameters
        pop_updates = get_batch_normalization_updates(cg)
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in pop_updates]
    else:
        cg = original_cg
        extra_updates = []

    algorithm = GradientDescent(step_rule=Adam(0.001),
                                cost=cg.outputs[0],
                                parameters=cg.parameters)
    algorithm.add_updates(extra_updates)

    main_loop = MainLoop(
        algorithm=algorithm,
        data_stream=train_stream,
        # Use the original cost and misclass variables so
        # that we monitor the (original) inference-mode graph.
        extensions=[
            DataStreamMonitoring([cost, misclass],
                                 train_stream,
                                 prefix='train'),
            DataStreamMonitoring([cost, misclass], test_stream, prefix='test'),
            Printing(),
            FinishAfter(after_n_epochs=num_epochs)
        ])
    main_loop.run()
    return main_loop
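
The extra_updates built above implement the running average described in the docstring: pop_stat <- alpha * batch_stat + (1 - alpha) * pop_stat. A tiny numeric illustration (made-up values, not project output):

alpha = 0.1
pop_mean, batch_mean = 0.0, 2.0
for _ in range(3):                # three minibatches with the same batch mean
    pop_mean = alpha * batch_mean + (1 - alpha) * pop_mean
print(pop_mean)                   # 0.542, drifting towards the batch mean 2.0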
Example #24
def main(name, epochs, batch_size, learning_rate, 
         attention, n_iter, enc_dim, dec_dim, z_dim):

     # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation. E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e"%value)[0]
        return "%s%d" % (leading, -exp)

    if name is None:
        tag = "watt" if attention else "woatt"
        lr_str = lr_tag(learning_rate)
        name = "%s-t%d-enc%d-dec%d-z%d-lr%s" % (tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate) 
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print()


    #------------------------------------------------------------------------

    x_dim = 28*28
    img_height, img_width = (28, 28)
    
    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    
    if attention:
        read_N = 4
        write_N = 7
        read_dim = 2*read_N**2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
    else:
        read_dim = 2*x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Tanh()], [(read_dim+dec_dim), 4*enc_dim], name="MLP_enc", **inits)
    decoder_mlp = MLP([Tanh()], [             z_dim, 4*dec_dim], name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
                n_iter, 
                reader=reader,
                encoder_mlp=encoder_mlp,
                encoder_rnn=encoder_rnn,
                sampler=q_sampler,
                decoder_mlp=decoder_mlp,
                decoder_rnn=decoder_rnn,
                writer=writer)
    draw.initialize()


    #------------------------------------------------------------------------
    x = tensor.matrix('features')
    
    #x_recons = 1. + x
    x_recons, kl_terms = draw.reconstruct(x)
    #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100)
    #x_recons = x_recons[-1,:,:]

    #samples = draw.sample(100) 
    #x_recons = samples[-1, :, :]
    #x_recons = samples[-1, :, :]

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost, 
        params=params,
        step_rule=CompositeRule([
            StepClipping(3.), 
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    #algorithm.add_updates(scan_updates)


    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    """
    for t in range(n_iter):
        kl_term_t = kl_terms[t,:].mean()
        kl_term_t.name = "kl_term_%d" % t

        x_recons_t = T.nnet.sigmoid(c[t,:,:])
        recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        recons_term_t = recons_term_t.mean()
        recons_term_t.name = "recons_term_%d" % t

        monitors +=[kl_term_t, recons_term_t]
    """
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=ForceFloatX(DataStream(mnist_train,
                        iteration_scheme=SequentialScheme(
                        mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(DataStream(mnist_test,
                    iteration_scheme=SequentialScheme(
                    mnist_test.num_examples, batch_size))),
##                updates=scan_updates, 
                prefix="test"),
            TrainingDataMonitoring(
                train_monitors, 
                prefix="train",
                after_every_epoch=True),
            SerializeMainLoop(name+".pkl"),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()])
    main_loop.run()
Example #25
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code,
                               level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream,
                              reverse_words,
                              add_sources=("targets", ))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(chars, chars_mask, targets,
                                   targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat([(key, value.get_value().shape)
                                    for key, value in parameters.items()],
                                   width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule(
                                        [StepClipping(10.0),
                                         Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies, ) = VariableFilter(applications=[generator.readout.readout],
                                      name_regex="output")(cg.variables)
        (activations, ) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(
            abs(activations).mean(), "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation, batch_size,
            max_length, cost_per_character, algorithm.total_step_norm,
            algorithm.total_gradient_norm
        ]
        for name, parameter in parameters.items():
            observables.append(named_copy(parameter.norm(2), name + "_norm"))
            observables.append(
                named_copy(algorithm.gradients[parameter].norm(2),
                           name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(observables,
                                                    prefix="average",
                                                    every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path,
                           every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)
            ])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(bricks=[reverser.generator],
                                          name="outputs")(ComputationGraph(
                                              generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search({chars: input_},
                                                    char2code['</S>'],
                                                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n"
                       if mode == "sample" else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [
                char2code.get(char, char2code["<UNK>"])
                for char in line.lower().strip()
            ]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size,
                             axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
Example #26
def main(config, test_stream, testing_model):
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    target_bos_idx = test_stream.trg_bos
    target_space_idx = test_stream.space_idx['target']

    # Construct model
    logger.info('Building RNN encoder-decoder')

    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'],
                                   config['src_dgru_nhids'],
                                   config['enc_nhids'],
                                   config['src_dgru_depth'],
                                   config['bidir_encoder_depth'])

    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['transition_depth'], config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx,
                      target_bos_idx)

    representation = encoder.apply(source_char_seq, source_sample_matrix,
                                   source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq,
                        target_sample_matrix, target_resample_matrix,
                        target_char_aux, target_char_mask, target_word_mask,
                        target_prev_char_seq, target_prev_char_aux)

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    # Extensions
    extensions = []
    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(testing_model))

    # Set up beam search and sampling computation graphs if necessary
    if config['bleu_script'] is not None:
        logger.info("Building sampling model")
        generated = decoder.generate(representation, source_word_mask)
        search_model = Model(generated)
        _, samples = VariableFilter(bricks=[decoder.sequence_generator],
                                    name="outputs")(ComputationGraph(
                                        generated[config['transition_depth']]))
        # generated[config['transition_depth']] is next_outputs

        logger.info("Building bleu tester")
        extensions.append(
            BleuTester(source_char_seq,
                       source_sample_matrix,
                       source_char_aux,
                       source_word_mask,
                       samples=samples,
                       config=config,
                       model=search_model,
                       data_stream=test_stream,
                       testing_model=testing_model,
                       normalize=config['normalized_bleu']))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=None,
                         data_stream=None,
                         extensions=extensions)

    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')
Example #27
f0_mean = data_stats['f0_mean']
f0_std = data_stats['f0_std']

save_dir = os.environ['RESULTS_DIR']
save_dir = os.path.join(save_dir, 'blizzard/')

experiment_name = "f0_only_1"

main_loop = load(save_dir + "pkl/best_" + experiment_name + ".pkl")

generator = main_loop.model.get_top_bricks()[0]

steps = 2048
n_samples = 1

sample = ComputationGraph(
    generator.generate(n_steps=steps, batch_size=n_samples, iterate=True))
sample_fn = sample.get_theano_function()

outputs = sample_fn()[-2]

voiced = outputs[:, :, 1]
outputs = outputs[:, :, 0]
outputs = outputs * f0_std + f0_mean
outputs = outputs * voiced
outputs = outputs.swapaxes(0, 1)

outputs = outputs[0]
pyplot.figure(figsize=(100, 15))
pyplot.plot(outputs, linewidth=3)
pyplot.gca().set_xlim(0, 2048)
pyplot.savefig(save_dir + "samples/best_" + experiment_name + "3.png")
Example #28
    def __init__(self, costs, tparams, step_rule, drop_input=None,
                 learning_rate=None, clip_c=0., step_rule_kwargs=None,
                 **kwargs):
        """
        costs : dict, mapping cg_name to cost
        tparams : dict, mapping cg_name to shared parameters
        step_rule : str, optimizer
        drop_input : dict, mapping cg_name to drop_input ratio (float)
        learning_rate : theano tensor variable
        clip_c : float, gradient clipping threshold
        step_rule_kwargs : dict, additional arguments to the step rule
        """
        self.costs = costs
        self.tparams = tparams
        self.step_rule = step_rule
        self.learning_rate = learning_rate
        self.clip_c = clip_c
        if step_rule_kwargs is None:
            step_rule_kwargs = {}  # avoid unpacking **None when no extra optimizer args are given
        self.step_rule_kwargs = step_rule_kwargs

        self.num_cgs = len(costs)
        self.cg_names = costs.keys()
        if any([is_multiSource(cg) for cg in self.cg_names]):
            self.enc_ids, self.dec_ids = get_enc_dec_ids_mSrc(self.cg_names)
        else:
            self.enc_ids, self.dec_ids = get_enc_dec_ids(self.cg_names)

        self.f_grads = OrderedDict()
        self.f_grad_shareds = OrderedDict()
        self.f_updates = OrderedDict()
        self.drop_input = drop_input
        self._cost = None
        self.algorithms = OrderedDict()  # blocks legacy

        if drop_input is None:
            self.drop_input = {name: 0.0 for name in costs.keys()}

        for cg_name in self.cg_names:
            cost = self.costs[cg_name]
            inps = ComputationGraph(cost).inputs
            params = make_ordered_dict(self.tparams[cg_name])
            logger.info(
                "Initializing the training algorithm [{}]".format(cg_name))

            logger.info("...computing gradient")
            grads = theano.tensor.grad(
                cost=cost, wrt=self.tparams[cg_name])

            if self.clip_c > 0.:
                logger.info("...clipping gradients")
                g2 = 0.
                for g in grads:
                    g2 += (g**2).sum()
                notfinite = tensor.isnan(g2) + tensor.isinf(g2)
                new_grads = []
                for g in grads:
                    p = self._get_p_from_g(cg_name, g, params)
                    tmpg = tensor.switch(
                        g2 > (self.clip_c**2),
                        g / tensor.sqrt(g2) * self.clip_c, g)
                    new_grads.append(
                        tensor.switch(notfinite, numpy.float32(.1) * p, tmpg))
                grads = new_grads

            start_time = time.time()
            logger.info("...building optimizer",)
            lr = tensor.scalar(name='lr')
            # the optimizer is selected by name: eval() resolves self.step_rule
            # to the corresponding update-builder function
            self.f_grad_shareds[cg_name], self.f_updates[cg_name], \
                step_rule_updates = eval(
                    self.step_rule)(lr, params, grads, inps, cost,
                                    **self.step_rule_kwargs)
            logger.info(" took: {} seconds".format(time.time() - start_time))

            # blocks legacy, just a helper
            self.algorithms[cg_name] = Algorithm(cost, inps, params, grads,
                                                 step_rule_updates)
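The clipping block in this example implements global-norm gradient clipping: when the summed squared norm of all gradients exceeds clip_c**2, every gradient is rescaled by clip_c / sqrt(g2), and gradients that are NaN or infinite are replaced by 0.1 times the corresponding parameter. A NumPy sketch of the rescaling rule with toy gradients:

import numpy as np

clip_c = 5.0
grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]  # toy gradients

g2 = sum((g ** 2).sum() for g in grads)                # global squared norm: 169.0
if g2 > clip_c ** 2:
    # rescale so that the global gradient norm becomes exactly clip_c
    grads = [g * clip_c / np.sqrt(g2) for g in grads]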
Example #29
0
def main_rnn(config):

    x = tensor.tensor3('features')
    y = tensor.matrix('targets')

#    if 'LSTM' in config['model'] :
#        from models import getLSTMstack
#        y_hat = getLSTMstack(input_dim=13, input_var=x, depth=int(config['model'][-1]))
#    else :
#        raise Exception("These are not the LSTM we are looking for")

#    y_hat = model.apply(x)
    

    emitter = TestEmitter()
#    emitter = TrivialEmitter(readout_dim=config['lstm_hidden_size'])

#    cost_func = SquaredError()

 #   @application
 #   def qwe(self, readouts, outputs=None):
 #       print(type(self), type(readouts))
 #       x = cost_func.apply(readouts,outputs)
 #       return x
    print(type(emitter.cost))
 #   emitter.cost = qwe
  #  print(type(qwe))

    steps = 2
    n_samples = config['target_size']

    transition = [LSTM(config['lstm_hidden_size']) for _ in range(4)]
    transition = RecurrentStack(transition,
            name="transition", skip_connections=False)

    source_names = [name for name in transition.apply.states if 'states' in name]

    readout = Readout(emitter,
                      readout_dim=config['lstm_hidden_size'],
                      source_names=source_names,
                      feedback_brick=None,
                      merge=None,
                      merge_prototype=None,
                      post_merge=None,
                      merged_dim=None)

    seqgen = SequenceGenerator(readout, transition, attention=None, add_contexts=False)
    seqgen.weights_init = IsotropicGaussian(0.01)
    seqgen.biases_init = Constant(0.)
    seqgen.push_initialization_config()

    seqgen.transition.biases_init = IsotropicGaussian(0.01, 1)
    seqgen.transition.push_initialization_config()
    seqgen.initialize()

    states = seqgen.transition.apply.outputs
    print('states', states)
    states = {name: shared_floatx_zeros((n_samples, config['lstm_hidden_size']))
        for name in states}

    cost_matrix = seqgen.cost_matrix(x, **states)
    cost = cost_matrix.mean()
    cost.name = "nll"

    cg = ComputationGraph(cost)
    model = Model(cost)
    #Cost
#    cost = SquaredError().apply(y_hat ,y)
    #cost = CategoricalCrossEntropy().apply(T.flatten(),Y)
 #   

        #for sampling
    #cg = ComputationGraph(seqgen.generate(n_steps=steps,batch_size=n_samples, iterate=True))
  

    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=config['learning_rate']))

    # Getting the stream
    train_stream = MFCC.get_stream(config['batch_size'],
                                   config['source_size'],
                                   config['target_size'],
                                   config['num_examples'])

    #Monitoring stuff
    extensions = [Timing(),
                  FinishAfter(after_n_batches=config['num_batches']),
                  #DataStreamMonitoring([cost, error_rate],test_stream,prefix="test"),
                  TrainingDataMonitoring([cost], prefix="train", every_n_batches=1),
                  #Checkpoint(save_to),
                  ProgressBar(),
                  Printing(every_n_batches=1)]
   

    main_loop = MainLoop(
        algorithm,
        train_stream,
 #       model=model,
        extensions=extensions)

    main_loop.run()
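The states dict above gives every recurrent state exposed by the RecurrentStack a zero-initialized buffer and forwards them to cost_matrix as keyword arguments, while source_names keeps only the hidden-state outputs for the readout. A small sketch of both patterns with hypothetical state names (the real names come from transition.apply.outputs / transition.apply.states and depend on the stack configuration):

import numpy as np

state_names = ['states#0', 'cells#0', 'states#1', 'cells#1']  # hypothetical names
n_samples, hidden_size = 4, 8

# only hidden states feed the readout, mirroring the source_names filter above
source_names = [name for name in state_names if 'states' in name]

# one zero-initialized buffer per recurrent state, passed on via **initial_states
initial_states = {name: np.zeros((n_samples, hidden_size), dtype='float32')
                  for name in state_names}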
Example #30
0
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        answer = tensor.ivector('answer')
        candidates = tensor.imatrix('candidates')
        candidates_mask = tensor.imatrix('candidates_mask')

        bricks = []

        # set time as first dimension
        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)

        # Embed questions
        embed = LookupTable(vocab_size,
                            config.embed_size,
                            name='question_embed')
        bricks.append(embed)
        qembed = embed.apply(question)

        # Create and apply LSTM stack
        curr_dim = [config.embed_size]
        curr_hidden = [qembed]

        hidden_list = []
        for k, dim in enumerate(config.lstm_size):
            fwd_lstm_ins = [
                Linear(input_dim=d,
                       output_dim=4 * dim,
                       name='fwd_lstm_in_%d_%d' % (k, l))
                for l, d in enumerate(curr_dim)
            ]
            fwd_lstm = LSTM(dim=dim, activation=Tanh(), name='fwd_lstm_%d' % k)

            bwd_lstm_ins = [
                Linear(input_dim=d,
                       output_dim=4 * dim,
                       name='bwd_lstm_in_%d_%d' % (k, l))
                for l, d in enumerate(curr_dim)
            ]
            bwd_lstm = LSTM(dim=dim, activation=Tanh(), name='bwd_lstm_%d' % k)

            bricks = bricks + [fwd_lstm, bwd_lstm
                               ] + fwd_lstm_ins + bwd_lstm_ins

            fwd_tmp = sum(
                x.apply(v) for x, v in zip(fwd_lstm_ins, curr_hidden))
            bwd_tmp = sum(
                x.apply(v) for x, v in zip(bwd_lstm_ins, curr_hidden))
            fwd_hidden, _ = fwd_lstm.apply(fwd_tmp,
                                           mask=question_mask.astype(
                                               theano.config.floatX))
            bwd_hidden, _ = bwd_lstm.apply(bwd_tmp[::-1],
                                           mask=question_mask.astype(
                                               theano.config.floatX)[::-1])
            hidden_list = hidden_list + [fwd_hidden, bwd_hidden]
            if config.skip_connections:
                curr_hidden = [qembed, fwd_hidden, bwd_hidden[::-1]]
                curr_dim = [config.embed_size, dim, dim]
            else:
                curr_hidden = [fwd_hidden, bwd_hidden[::-1]]
                curr_dim = [dim, dim]

        # Create and apply output MLP
        if config.skip_connections:
            out_mlp = MLP(dims=[2 * sum(config.lstm_size)] +
                          config.out_mlp_hidden + [config.n_entities],
                          activations=config.out_mlp_activations +
                          [Identity()],
                          name='out_mlp')
            bricks.append(out_mlp)

            probs = out_mlp.apply(
                tensor.concatenate([h[-1, :, :] for h in hidden_list], axis=1))
        else:
            out_mlp = MLP(dims=[2 * config.lstm_size[-1]] +
                          config.out_mlp_hidden + [config.n_entities],
                          activations=config.out_mlp_activations +
                          [Identity()],
                          name='out_mlp')
            bricks.append(out_mlp)

            probs = out_mlp.apply(
                tensor.concatenate([h[-1, :, :] for h in hidden_list[-2:]],
                                   axis=1))

        is_candidate = tensor.eq(
            tensor.arange(config.n_entities, dtype='int32')[None, None, :],
            tensor.switch(candidates_mask, candidates,
                          -tensor.ones_like(candidates))[:, :,
                                                         None]).sum(axis=1)
        probs = tensor.switch(is_candidate, probs,
                              -1000 * tensor.ones_like(probs))

        # Calculate prediction, cost and error rate
        pred = probs.argmax(axis=1)
        cost = Softmax().categorical_cross_entropy(answer, probs).mean()
        error_rate = tensor.neq(answer, pred).mean()

        # Apply dropout
        cg = ComputationGraph([cost, error_rate])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, hidden_list, config.dropout)
        [cost_reg, error_rate_reg] = cg.outputs

        # Other stuff
        cost_reg.name = cost.name = 'cost'
        error_rate_reg.name = error_rate.name = 'error_rate'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg], [error_rate_reg]]
        self.monitor_vars_valid = [[cost], [error_rate]]

        # Initialize bricks
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
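The candidate-restriction trick near the end of this example (is_candidate followed by the tensor.switch on probs) ensures the model can only predict an entity id that actually appears among the unpadded candidates of each example. A NumPy sketch of the same computation with a tiny hypothetical batch:

import numpy as np

n_entities = 4
candidates = np.array([[0, 2, 2], [1, 3, 0]], dtype='int32')       # candidate entity ids
candidates_mask = np.array([[1, 1, 0], [1, 1, 1]], dtype='int32')  # 0 marks padding
logits = np.random.randn(2, n_entities).astype('float32')          # stand-in for probs

# replace padded slots by -1 so they can never match a valid entity id
masked = np.where(candidates_mask.astype(bool), candidates,
                  -np.ones_like(candidates))

# count, per example, how often each entity id occurs among its candidates
is_candidate = (np.arange(n_entities, dtype='int32')[None, None, :]
                == masked[:, :, None]).sum(axis=1)

# push non-candidate scores very low so argmax can only pick a real candidate
restricted = np.where(is_candidate > 0, logits,
                      -1000 * np.ones_like(logits))
pred = restricted.argmax(axis=1)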