        dims = (10, 13, 5, 4)

        config_dict = dict()
        config_dict['batch_size'] = dims[2]
        config_dict['num_subwords'] = dims[1]
        config_dict['num_words'] = dims[0]
        config_dict['subword_embedding_size'] = dims[3]
        config_dict['input_vocab_size'] = 42
        config_dict['output_vocab_size'] = 42
        config_dict['subword_RNN_hidden_state_size'] = 6  # 2 more than subword_embedding_size for testing
        config_dict['LM_RNN_hidden_state_size'] = 8  # 2 more than subword_RNN_hidden_state_size
        config_dict['table_width'] = 0.08
        config_dict['max_out_dim'] = 11  # 3 more than LM_RNN_hidden_state_size
        config_dict['max_out_K'] = 3

        baseline_model = BaselineModel(config_dict)
        baseline_model.initialize()
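        # The snippet below uses x, x_mask, y, y_mask without showing their
        # definitions. A minimal sketch mirroring the variables created in
        # run_training below; the dtypes are assumptions, not taken from the
        # original test.
        x = T.tensor3('features', dtype='int32')
        x_mask = T.tensor3('features_mask', dtype='float32')
        y = T.matrix('targets', dtype='int32')
        y_mask = T.matrix('targets_mask', dtype='float32')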
        y_hat_CE = baseline_model.cost(subword_id_input_=x, subword_id_input_mask_=x_mask,
                                       subword_id_target_=y, subword_id_target_mask_=y_mask)

        cg = ComputationGraph(y_hat_CE)

        f_cross_entropy = theano.function([x, x_mask, y, y_mask], [y_hat_CE])

        print('Graph inputs')
        print(cg.inputs)

        # Smoke-test the cost on a few batches; `stream` is assumed to be a
        # Fuel data stream whose source names match the variable names above.
        num_times = 5
        for data in stream.get_epoch_iterator(as_dict=True):
            print(f_cross_entropy(data['features'], data['features_mask'],
                                  data['targets'], data['targets_mask']))
            num_times -= 1
            if num_times <= 0:
                break

def run_training(config, tr_stream, dev_stream=None, use_bokeh=True):
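    """Build the baseline model from `config` and train it on `tr_stream`.

    `dev_stream` is currently unused; `use_bokeh` enables live cost plotting
    when the optional blocks_extras package is available.
    """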

    # Monitoring extensions
    try:
        from blocks_extras.extensions.plot import Plot
        BOKEH_AVAILABLE = True
    except ImportError:
        BOKEH_AVAILABLE = False
    print('Bokeh availability: ' + str(BOKEH_AVAILABLE))

    logger = logging.getLogger(__name__)

    # Create Theano variables
    logger.info('Creating theano variables')
    x = T.tensor3('features', dtype=config.params['data_dtype'])
    x_mask = T.tensor3('features_mask', dtype=config.params['mask_dtype'])
    y = T.matrix('targets', dtype=config.params['data_dtype'])
    y_mask = T.matrix('targets_mask', dtype=config.params['mask_dtype'])
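    # x carries batches of subword ids as a 3-D tensor and y the target ids as
    # a 2-D matrix; the *_mask variables mark real positions versus padding.
    # The exact axis order is fixed by BaselineModel, not asserted here.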


    # Construct model
    logger.info('Building baseline model')
    baseline_model = BaselineModel(config.params)
    baseline_model.initialize()

    cost = baseline_model.cost(subword_id_input_=x, subword_id_input_mask_=x_mask,
                               subword_id_target_=y, subword_id_target_mask_=y_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)


    # Apply dropout for regularization
    if config.params['dropout'] < 1.0:
        # Dropout is applied to the output of maxout, as in GroundHog
        logger.info('Applying dropout')
        # Use `var`, not `x`, to avoid shadowing the Theano input variable
        dropout_inputs = [var for var in cg.intermediary_variables
                          if var.name == 'maxout_apply_output']
        logger.info('Found %d dropout input(s)', len(dropout_inputs))
        cg = apply_dropout(cg, dropout_inputs, config.params['dropout'])
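        # Note: blocks.graph.apply_dropout replaces each listed variable with a
        # dropped-out copy; the third argument is the drop probability, which
        # is why a value of 1.0 is guarded against above.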

    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config.params['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        # CheckpointNMT(config.params['saveto'], every_n_batches=config.params['save_freq'])
    ]

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('Baseline model', channels=[['baselinemodel_cost_cost']],
                 after_batch=True))
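    # The channel name must match the monitored cost variable's name; Blocks
    # composes it as <brick>_<application>_<output>, giving
    # 'baselinemodel_cost_cost' here.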

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config.params['step_clipping']),
                                 eval(config.params['step_rule'])()])
    )
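    # config.params['step_rule'] is expected to hold a step-rule class name
    # such as 'AdaDelta', which eval() resolves in the current namespace. An
    # eval-free sketch, assuming the class lives in blocks.algorithms:
    #
    #   from blocks import algorithms
    #   step_rule = getattr(algorithms, config.params['step_rule'])()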

    # Initialize main loop
    logger.info("Initializing main loop")
    from blocks.model import Model  # move up with the other imports if preferred
    main_loop = MainLoop(
        model=Model(cost),  # MainLoop expects a Model wrapper, not the brick itself
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )

    # Train
    main_loop.run()

    logger.info('Done training')
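
# A hypothetical invocation, assuming a config object that exposes a .params
# dict (as used throughout run_training) and a Fuel training stream; the
# names below are illustrative, not part of this repository:
#
#   config = BaselineConfig()
#   tr_stream = get_tr_stream(config)
#   run_training(config, tr_stream, use_bokeh=True)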