def test_stream():

    # Dummy vocabulary
    vocab = {"<S>": 0, "</S>": 1, "<UNK>": 2}
    with tempfile.NamedTemporaryFile() as src_data:
        with tempfile.NamedTemporaryFile() as trg_data:
            get_tr_stream(src_vocab=vocab, trg_vocab=vocab,
                          src_data=src_data.name, trg_data=trg_data.name)
    with tempfile.NamedTemporaryFile() as val_set:
        get_dev_stream(val_set=val_set.name, src_vocab=vocab)
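
# A slightly extended variant of the test above (a sketch, not part of the
# original): it writes one dummy sentence pair into the temporary files
# before building the stream, assuming the stream builders read plain text
# with one tokenized sentence per line.
def test_stream_with_data():
    vocab = {"<S>": 0, "</S>": 1, "<UNK>": 2}
    with tempfile.NamedTemporaryFile() as src_data:
        with tempfile.NamedTemporaryFile() as trg_data:
            src_data.write(b"a b c\n")
            src_data.flush()
            trg_data.write(b"d e f\n")
            trg_data.flush()
            get_tr_stream(src_vocab=vocab, trg_vocab=vocab,
                          src_data=src_data.name, trg_data=trg_data.name)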
Example #2
    logger.info('Finished building sampling model: f_init, f_next')

    src_vocab = pickle.load(open(config['src_vocab'], 'rb'))
    trg_vocab = pickle.load(open(config['trg_vocab'], 'rb'))
    src_vocab = ensure_special_tokens(src_vocab,
                                      bos_idx=0, eos_idx=config['src_vocab_size'] - 1,
                                      unk_idx=config['unk_id'])
    trg_vocab = ensure_special_tokens(trg_vocab,
                                      bos_idx=0, eos_idx=config['trg_vocab_size'] - 1,
                                      unk_idx=config['unk_id'])
    trg_vocab_reverse = {index: word for word, index in trg_vocab.iteritems()}
    src_vocab_reverse = {index: word for word, index in src_vocab.iteritems()}
    logger.info('Loaded dictionaries; src vocab size: {}, trg vocab size: {}.'.format(len(src_vocab), len(trg_vocab)))

    tr_stream = get_tr_stream(**config)
    dev_stream = get_dev_stream(**config)
    logger.info('Start training')
    batch_count = 0

    val_time = 0
    best_score = 0.
    for epoch in range(config['max_epoch']):
        for tr_data in tr_stream.get_epoch_iterator():
            batch_count += 1
            tr_fn(*tr_data)

            # sample
            if batch_count % config['sampling_freq'] == 0:
                trans_sample(tr_data[0], tr_data[2], f_init, f_next, config['hook_samples'],
                             src_vocab_reverse, trg_vocab_reverse, batch_count)
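
# `ensure_special_tokens` is not defined in this snippet. A minimal sketch of
# what it presumably does (pinning the BOS/EOS/UNK tokens to fixed indices in
# the vocabulary) might look like this; the token spellings here are
# assumptions, and the real helper may also truncate the vocabulary:
def ensure_special_tokens(vocab, bos_idx=0, eos_idx=0, unk_idx=1):
    vocab['<S>'] = bos_idx    # beginning-of-sentence
    vocab['</S>'] = eos_idx   # end-of-sentence
    vocab['<UNK>'] = unk_idx  # unknown word
    return vocab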
Example #3
def init():
    parser = argparse.ArgumentParser()
    parser.add_argument('--proto', default='get_config_cs2en', help='Prototype config')
    args = parser.parse_args()
    config = getattr(configurations, args.proto)()
    main(config, get_tr_stream(**config), get_dev_stream(**config))
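
# The snippet does not show how `init` is invoked; presumably via the usual
# script entry point:
if __name__ == '__main__':
    init()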
Example #4
import argparse
import logging
import pprint

import config

from __init__ import main
from lexicon import create_dictionary_from_lexicon, create_dictionary_from_punctuation_marks
from stream import get_tr_stream, get_dev_stream

logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

# Get the arguments
parser = argparse.ArgumentParser()
parser.add_argument("--proto",  default="get_config", help="Prototype config to use for config")
parser.add_argument("--bokeh",  default=False, action="store_true", help="Use bokeh server for plotting")
args = parser.parse_args()


if __name__ == "__main__":
    config = getattr(config, args.proto)()
    #logger.info("Model options:\n{}".format(pprint.pformat(config)))

    data_path = "%s/data_global_cmvn_with_phones_alignment_pitch_features.h5" % config["data_dir"]
    tr_stream = get_tr_stream(data_path,
                              config["src_eos_idx"],
                              config["phones"]["sil"],
                              config["trg_eos_idx"],
                              seq_len=config["seq_len"],
                              batch_size=config["batch_size"],
                              sort_k_batches=config["sort_k_batches"])
    dev_stream = get_dev_stream(data_path)
    main(config, tr_stream, dev_stream, args.bokeh)
Example #5
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()


if __name__ == "__main__":
    assert sys.version_info >= (3, 4)
    # Get configurations for model
    configuration = configurations.get_config()
    logger.info("Model options:\n{}".format(pprint.pformat(configuration)))
    # Get data streams and call main
    main(configuration, get_tr_stream(**configuration),
         get_dev_stream(**configuration))
Example #6
def main(config): 
	vocab_src, _ = text_to_dict([config['train_src'],
		config['dev_src'], config['test_src']])
	vocab_tgt, cabvo = text_to_dict([config['train_tgt'],
		config['dev_tgt']])

	# Create Theano variables
	logger.info('Creating theano variables')
	source_sentence = tensor.lmatrix('source')
	source_sentence_mask = tensor.matrix('source_mask')
	target_sentence = tensor.lmatrix('target')
	target_sentence_mask = tensor.matrix('target_mask')
	source_sentence.tag.test_value = [[13, 20, 0, 20, 0, 20, 0],
										[1, 4, 8, 4, 8, 4, 8],]
	source_sentence_mask.tag.test_value = [[0, 1, 0, 1, 0, 1, 0],
											[1, 0, 1, 0, 1, 0, 1],]
	target_sentence.tag.test_value = [[0,1,1,5],
										[2,0,1,0],]
	target_sentence_mask.tag.test_value = [[0,1,1,0],
											[1,1,1,0],]


	logger.info('Building RNN encoder-decoder')
	### Building Encoder 
	embedder = LookupTable(
		length=len(vocab_src), 
		dim=config['embed_src'], 
		weights_init=IsotropicGaussian(),
		biases_init=Constant(0.0), 
		name='embedder')
	transformer = Linear(
		config['embed_src'], 
		config['hidden_src']*4, 
		weights_init=IsotropicGaussian(),
		biases_init=Constant(0.0), 
		name='transformer')

	# LSTM biases cover all four gate blocks in one vector; one block is
	# initialised to 1.0 (a common trick, usually aimed at the forget gate).
	lstminit = np.concatenate([np.zeros(2 * config['hidden_src']),
		np.ones(config['hidden_src']),
		np.zeros(config['hidden_src'])])
	encoder = Bidirectional(
		LSTM(
			dim=config['hidden_src'], 
			weights_init=IsotropicGaussian(0.01),
			biases_init=Constant(lstminit)),
		name='encoderBiLSTM'
		)
	encoder.prototype.weights_init = Orthogonal()
	
	### Building Decoder 
	# Same gate-bias initialisation for the decoder LSTM.
	lstminit = np.concatenate([np.zeros(2 * config['hidden_tgt']),
		np.ones(config['hidden_tgt']),
		np.zeros(config['hidden_tgt'])])
	transition = LSTM2GO(
		attended_dim=config['hidden_tgt'], 
		dim=config['hidden_tgt'], 
		weights_init=IsotropicGaussian(0.01),
		biases_init=Constant(lstminit), 
		name='decoderLSTM')

	attention = SequenceContentAttention( 
		state_names=transition.apply.states, # default activation is Tanh
		state_dims=[config['hidden_tgt']],
		attended_dim=config['hidden_src']*2,
		match_dim=config['hidden_tgt'], 
		name="attention")

	readout = Readout(
		source_names=['states', 
			'feedback', 
			attention.take_glimpses.outputs[0]],
		readout_dim=len(vocab_tgt),
		emitter = SoftmaxEmitter(
			name='emitter'), 
		feedback_brick = LookupFeedback(
			num_outputs=len(vocab_tgt), 
			feedback_dim=config['embed_tgt'], 
			name='feedback'), 
		post_merge=InitializableFeedforwardSequence([
			Bias(dim=config['hidden_tgt'], 
				name='softmax_bias').apply,
			Linear(input_dim=config['hidden_tgt'], 
				output_dim=config['embed_tgt'],
				use_bias=False, 
				name='softmax0').apply,
			Linear(input_dim=config['embed_tgt'], 
				name='softmax1').apply]),
		merged_dim=config['hidden_tgt'])

	decoder = SequenceGenerator(
		readout=readout, 
		transition=transition, 
		attention=attention, 
		weights_init=IsotropicGaussian(0.01), 
		biases_init=Constant(0),
		name="generator",
		fork=Fork(
			[name for name in transition.apply.sequences if name != 'mask'], 
			prototype=Linear()),
		add_contexts=True)
	decoder.transition.weights_init = Orthogonal()

	#printchildren(encoder, 1)
	# Initialize model
	logger.info('Initializing model')
	embedder.initialize()
	transformer.initialize()
	encoder.initialize()
	decoder.initialize()
	
	# Apply model 
	embedded = embedder.apply(source_sentence)
	transformed = transformer.apply(embedded)
	encoded = encoder.apply(transformed)[0]
	generated = decoder.generate(
		n_steps=2*source_sentence.shape[1], 
		batch_size=source_sentence.shape[0], 
		attended = encoded.dimshuffle(1,0,2), 
		attended_mask=tensor.ones(source_sentence.shape).T
		)
	print 'Generated: ', generated
	# generator_generate_outputs
	#samples = generated[1] # For GRU 
	samples = generated[2] # For LSTM
	samples.name = 'samples'
	#samples_cost = generated[4] # For GRU
	samples_cost = generated[5] # For LSTM
	samples_cost.name = 'sampling_cost'
	cost = decoder.cost(
		mask = target_sentence_mask.T, 
		outputs = target_sentence.T, 
		attended = encoded.dimshuffle(1,0,2), 
		attended_mask = source_sentence_mask.T)
	cost.name = 'target_cost'
	cost.tag.aggregation_scheme = TakeLast(cost)
	model = Model(cost)
	
	logger.info('Creating computational graph')
	cg = ComputationGraph(cost)
	
	# apply dropout for regularization
	if config['dropout'] < 1.0: # dropout is applied to the output of maxout in GroundHog
		logger.info('Applying dropout')
		dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output']
		cg = apply_dropout(cg, dropout_inputs, config['dropout'])

	######## 
	# Print shapes
	shapes = [param.get_value().shape for param in cg.parameters]
	logger.info("Parameter shapes: ")
	for shape, count in Counter(shapes).most_common():
		logger.info('	{:15}: {}'.format(shape, count))
	logger.info("Total number of parameters: {}".format(len(shapes)))

	printchildren(embedder, 1)
	printchildren(transformer, 1)
	printchildren(encoder, 1)
	printchildren(decoder, 1)
	# Print parameter names
	# enc_dec_param_dict = merge(Selector(embedder).get_parameters(), Selector(encoder).get_parameters(), Selector(decoder).get_parameters())
	# enc_dec_param_dict = merge(Selector(decoder).get_parameters())
	# logger.info("Parameter names: ")
	# for name, value in enc_dec_param_dict.items():
	# 	logger.info('	{:15}: {}'.format(value.get_value().shape, name))
	# logger.info("Total number of parameters: {}".format(len(enc_dec_param_dict)))
	##########

	# Training data 
	train_stream = get_train_stream(config, 
		[config['train_src'],], [config['train_tgt'],], 
		vocab_src, vocab_tgt)
	dev_stream = get_dev_stream(
		[config['dev_src'],], [config['dev_tgt'],], 
		vocab_src, vocab_tgt)
	test_stream = get_test_stream([config['test_src'],], vocab_src)

	# Set extensions
	logger.info("Initializing extensions")
	extensions = [
		FinishAfter(after_n_batches=config['finish_after']),
		ProgressBar(),
		TrainingDataMonitoring([cost], 
			prefix="tra", 
			after_batch=True),
		DataStreamMonitoring(variables=[cost], 
			data_stream=dev_stream, 
			prefix="dev", 
			after_batch=True), 
		Sampler(
			model=Model(samples), 
			data_stream=dev_stream,
			vocab=cabvo,
			saveto=config['saveto']+'dev',
			every_n_batches=config['save_freq']), 
		Sampler(
			model=Model(samples), 
			data_stream=test_stream,
			vocab=cabvo,
			saveto=config['saveto']+'test',
			after_n_batches=1, 
			on_resumption=True,
			before_training=True), 
		Plotter(saveto=config['saveto'], after_batch=True),
		Printing(after_batch=True),
		Checkpoint(
			path=config['saveto'], 
			parameters = cg.parameters,
			save_main_loop=False,
			every_n_batches=config['save_freq'])]
	if BOKEH_AVAILABLE: 
		extensions.append(Plot('Training cost', channels=[['target_cost']], after_batch=True))
	if config['reload']: 
		extensions.append(Load(path=config['saveto'], 
			load_iteration_state=False, 
			load_log=False))
	else: 
		with open(config['saveto']+'.txt', 'w') as f: 
			pass 

	# Set up training algorithm
	logger.info("Initializing training algorithm")
	algorithm = GradientDescent(cost=cost,
		parameters=cg.parameters,
		step_rule=CompositeRule([StepClipping(config['step_clipping']),
			eval(config['step_rule'])()]))

	# Initialize main loop
	logger.info("Initializing main loop")
	main_loop = MainLoop(
		model=model,
		algorithm=algorithm,
		data_stream=train_stream,
		extensions=extensions)
	main_loop.run()
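
# This snippet does not show how `main` is invoked. Mirroring the other
# examples on this page, a hypothetical driver would load a prototype
# config and call it directly:
if __name__ == '__main__':
	config = getattr(configurations, 'get_config')()
	main(config)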
Example #7
        i += 1
    return new_result

# Get the arguments
parser = argparse.ArgumentParser()
parser.add_argument("--proto",  default="get_config_cs2en",
                    help="Prototype config to use for config")
parser.add_argument("--bokeh",  default=False, action="store_true",
                    help="Use bokeh server for plotting")
args = parser.parse_args()

#get configuration
config = getattr(configurations, args.proto)()

tr_stream = get_tr_stream(**config)
validate_stream = get_dev_stream(**config)

for i in range(1):

    logger.info('Creating theano variables')
    print("create theano variables")
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')  # 0/1 mask marking which positions in the padded source batch are real tokens
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    #sampling_input = tensor.lmatrix('input')
    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
Example #8
import argparse
import logging
import pprint
import sys

import configurations

from __init__ import main
from stream import get_tr_stream, get_dev_stream

logger = logging.getLogger(__name__)

# Get the arguments
parser = argparse.ArgumentParser()
parser.add_argument("--proto",  default="get_config_cs2en",
                    help="Prototype config to use for config")
parser.add_argument("--bokeh",  default=False, action="store_true",
                    help="Use bokeh server for plotting")
args = parser.parse_args()


if __name__ == "__main__":
    # Get configurations for model
    configuration = getattr(configurations, args.proto)()
    logger.info("Model options:\n{}".format(pprint.pformat(configuration)))
    # Get data streams and call main

    #sys.exit(0)
    main(configuration, get_tr_stream(**configuration),
         get_dev_stream(**configuration), args.bokeh)