def test_stream():
    """Smoke-test stream construction: build the training and dev streams
    from a dummy 3-token vocabulary and empty temporary data files."""
    # Dummy vocabulary
    dummy_vocab = {"<S>": 0, "</S>": 1, "<UNK>": 2}
    # Empty temp files stand in for the parallel corpus.
    with tempfile.NamedTemporaryFile() as src_data, \
            tempfile.NamedTemporaryFile() as trg_data:
        get_tr_stream(src_vocab=dummy_vocab, trg_vocab=dummy_vocab,
                      src_data=src_data.name, trg_data=trg_data.name)
    # Same for the validation set.
    with tempfile.NamedTemporaryFile() as val_set:
        get_dev_stream(val_set=val_set.name, src_vocab=dummy_vocab)
# --- Vocabulary loading and the main training loop. ---
# NOTE(review): this chunk references tr_fn, f_init, f_next and config, which
# are defined earlier in the file, outside this view.
logger.info('end build sample model : f_init, f_next')

# FIX: the original used pickle.load(open(...)), which leaks the file handle;
# context managers close the vocabulary files deterministically.
with open(config['src_vocab']) as f:
    src_vocab = pickle.load(f)
with open(config['trg_vocab']) as f:
    trg_vocab = pickle.load(f)
src_vocab = ensure_special_tokens(
    src_vocab,
    bos_idx=0,
    eos_idx=config['src_vocab_size'] - 1,
    unk_idx=config['unk_id'])
# NOTE(review): the *target* eos index is also computed from src_vocab_size in
# the original; looks like it should be trg_vocab_size — confirm before changing.
trg_vocab = ensure_special_tokens(
    trg_vocab,
    bos_idx=0,
    eos_idx=config['src_vocab_size'] - 1,
    unk_idx=config['unk_id'])

# Reverse maps (index -> word) for human-readable sampling output.
trg_vocab_reverse = {index: word for word, index in trg_vocab.iteritems()}
src_vocab_reverse = {index: word for word, index in src_vocab.iteritems()}
logger.info('load dict finished ! src dic size : {} trg dic size : {}.'.format(
    len(src_vocab), len(trg_vocab)))

tr_stream = get_tr_stream(**config)
dev_stream = get_dev_stream(**config)

logger.info('start training!!!')
batch_count = 0
val_time = 0
best_score = 0.
for epoch in range(config['max_epoch']):
    for tr_data in tr_stream.get_epoch_iterator():
        batch_count += 1
        tr_fn(*tr_data)
        # sample
        if batch_count % config['sampling_freq'] == 0:
            trans_sample(tr_data[0], tr_data[2], f_init, f_next,
                         config['hook_samples'], src_vocab_reverse,
                         trg_vocab_reverse, batch_count)
def init():
    """Parse the --proto CLI option, instantiate the chosen prototype config
    and launch training via main() with freshly built train/dev streams."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--proto', default='get_config_cs2en',
                            help='Prototype config')
    cli_args = arg_parser.parse_args()
    cfg = getattr(configurations, cli_args.proto)()
    main(cfg, get_tr_stream(**cfg), get_dev_stream(**cfg))
import argparse
import logging
import pprint

import config
from __init__ import main
from lexicon import create_dictionary_from_lexicon, create_dictionary_from_punctuation_marks
from stream import get_tr_stream, get_dev_stream

logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

# Get the arguments
parser = argparse.ArgumentParser()
parser.add_argument("--proto", default="get_config",
                    help="Prototype config to use for config")
parser.add_argument("--bokeh", default=False, action="store_true",
                    help="Use bokeh server for plotting")
args = parser.parse_args()

if __name__ == "__main__":
    # NOTE: rebinds the name `config` from the imported module to the
    # configuration dict produced by the chosen prototype function.
    config = getattr(config, args.proto)()
    #logger.info("Model options:\n{}".format(pprint.pformat(config)))
    data_path = "%s/data_global_cmvn_with_phones_alignment_pitch_features.h5" % config["data_dir"]
    tr_stream = get_tr_stream(data_path,
                              config["src_eos_idx"],
                              config["phones"]["sil"],
                              config["trg_eos_idx"],
                              seq_len=config["seq_len"],
                              batch_size=config["batch_size"],
                              sort_k_batches=config["sort_k_batches"])
    dev_stream = get_dev_stream(data_path)
    main(config, tr_stream, dev_stream, args.bokeh)
# NOTE(review): this chunk begins mid-expression — these are the trailing
# keyword arguments of an extensions.append(...) call whose opening (and the
# extension's class name, presumably a BLEU validator given 'bleu_val_freq'
# and 'normalized_bleu' — confirm against the full file) lies outside this
# view, so it is kept byte-identical rather than reformatted.  The visible
# tail: optionally appends LoadNMT for reloading, builds the Blocks MainLoop
# from model/algorithm/stream/extensions and runs it; the __main__ guard
# asserts Python >= 3.4, fetches the config and calls main() with the
# train/dev streams.
samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run() if __name__ == "__main__": assert sys.version_info >= (3, 4) # Get configurations for model configuration = configurations.get_config() logger.info("Model options:\n{}".format(pprint.pformat(configuration))) # Get data streams and call main main(configuration, get_tr_stream(**configuration), get_dev_stream(**configuration))
def main(config): vocab_src, _ = text_to_dict([config['train_src'], config['dev_src'], config['test_src']]) vocab_tgt, cabvo = text_to_dict([config['train_tgt'], config['dev_tgt']]) # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') source_sentence.tag.test_value = [[13, 20, 0, 20, 0, 20, 0], [1, 4, 8, 4, 8, 4, 8],] source_sentence_mask.tag.test_value = [[0, 1, 0, 1, 0, 1, 0], [1, 0, 1, 0, 1, 0, 1],] target_sentence.tag.test_value = [[0,1,1,5], [2,0,1,0],] target_sentence_mask.tag.test_value = [[0,1,1,0], [1,1,1,0],] logger.info('Building RNN encoder-decoder') ### Building Encoder embedder = LookupTable( length=len(vocab_src), dim=config['embed_src'], weights_init=IsotropicGaussian(), biases_init=Constant(0.0), name='embedder') transformer = Linear( config['embed_src'], config['hidden_src']*4, weights_init=IsotropicGaussian(), biases_init=Constant(0.0), name='transformer') lstminit = np.asarray([0.0,]*config['hidden_src']+[0.0,]*config['hidden_src']+[1.0,]*config['hidden_src']+[0.0,]*config['hidden_src']) encoder = Bidirectional( LSTM( dim=config['hidden_src'], weights_init=IsotropicGaussian(0.01), biases_init=Constant(lstminit)), name='encoderBiLSTM' ) encoder.prototype.weights_init = Orthogonal() ### Building Decoder lstminit = np.asarray([0.0,]*config['hidden_tgt']+[0.0,]*config['hidden_tgt']+[1.0,]*config['hidden_tgt']+[0.0,]*config['hidden_tgt']) transition = LSTM2GO( attended_dim=config['hidden_tgt'], dim=config['hidden_tgt'], weights_init=IsotropicGaussian(0.01), biases_init=Constant(lstminit), name='decoderLSTM') attention = SequenceContentAttention( state_names=transition.apply.states, # default activation is Tanh state_dims=[config['hidden_tgt']], attended_dim=config['hidden_src']*2, match_dim=config['hidden_tgt'], name="attention") readout = Readout( 
source_names=['states', 'feedback', attention.take_glimpses.outputs[0]], readout_dim=len(vocab_tgt), emitter = SoftmaxEmitter( name='emitter'), feedback_brick = LookupFeedback( num_outputs=len(vocab_tgt), feedback_dim=config['embed_tgt'], name='feedback'), post_merge=InitializableFeedforwardSequence([ Bias(dim=config['hidden_tgt'], name='softmax_bias').apply, Linear(input_dim=config['hidden_tgt'], output_dim=config['embed_tgt'], use_bias=False, name='softmax0').apply, Linear(input_dim=config['embed_tgt'], name='softmax1').apply]), merged_dim=config['hidden_tgt']) decoder = SequenceGenerator( readout=readout, transition=transition, attention=attention, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="generator", fork=Fork( [name for name in transition.apply.sequences if name != 'mask'], prototype=Linear()), add_contexts=True) decoder.transition.weights_init = Orthogonal() #printchildren(encoder, 1) # Initialize model logger.info('Initializing model') embedder.initialize() transformer.initialize() encoder.initialize() decoder.initialize() # Apply model embedded = embedder.apply(source_sentence) tansformed = transformer.apply(embedded) encoded = encoder.apply(tansformed)[0] generated = decoder.generate( n_steps=2*source_sentence.shape[1], batch_size=source_sentence.shape[0], attended = encoded.dimshuffle(1,0,2), attended_mask=tensor.ones(source_sentence.shape).T ) print 'Generated: ', generated # generator_generate_outputs #samples = generated[1] # For GRU samples = generated[2] # For LSTM samples.name = 'samples' #samples_cost = generated[4] # For GRU samples_cost = generated[5] # For LSTM samples_cost = 'sampling_cost' cost = decoder.cost( mask = target_sentence_mask.T, outputs = target_sentence.T, attended = encoded.dimshuffle(1,0,2), attended_mask = source_sentence_mask.T) cost.name = 'target_cost' cost.tag.aggregation_scheme = TakeLast(cost) model = Model(cost) logger.info('Creating computational graph') cg = ComputationGraph(cost) # apply 
dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) ######## # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) printchildren(embedder, 1) printchildren(transformer, 1) printchildren(encoder, 1) printchildren(decoder, 1) # Print parameter names # enc_dec_param_dict = merge(Selector(embedder).get_parameters(), Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) # enc_dec_param_dict = merge(Selector(decoder).get_parameters()) # logger.info("Parameter names: ") # for name, value in enc_dec_param_dict.items(): # logger.info(' {:15}: {}'.format(value.get_value().shape, name)) # logger.info("Total number of parameters: {}".format(len(enc_dec_param_dict))) ########## # Training data train_stream = get_train_stream(config, [config['train_src'],], [config['train_tgt'],], vocab_src, vocab_tgt) dev_stream = get_dev_stream( [config['dev_src'],], [config['dev_tgt'],], vocab_src, vocab_tgt) test_stream = get_test_stream([config['test_src'],], vocab_src) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), ProgressBar(), TrainingDataMonitoring([cost], prefix="tra", after_batch=True), DataStreamMonitoring(variables=[cost], data_stream=dev_stream, prefix="dev", after_batch=True), Sampler( model=Model(samples), data_stream=dev_stream, vocab=cabvo, saveto=config['saveto']+'dev', every_n_batches=config['save_freq']), Sampler( model=Model(samples), data_stream=test_stream, vocab=cabvo, saveto=config['saveto']+'test', after_n_batches=1, 
on_resumption=True, before_training=True), Plotter(saveto=config['saveto'], after_batch=True), Printing(after_batch=True), Checkpoint( path=config['saveto'], parameters = cg.parameters, save_main_loop=False, every_n_batches=config['save_freq'])] if BOKEH_AVAILABLE: Plot('Training cost', channels=[['target_cost']], after_batch=True) if config['reload']: extensions.append(Load(path=config['saveto'], load_iteration_state=False, load_log=False)) else: with open(config['saveto']+'.txt', 'w') as f: pass # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop( model=model, algorithm=algorithm, data_stream=train_stream, extensions=extensions) main_loop.run()
# NOTE(review): fragment — begins inside an unseen function (`i +=1` /
# `return new_result` belong to a definition whose header is outside this
# view) and ends mid-call (`decoder = Decoder(` continues past this view), so
# it is kept byte-identical rather than reformatted.  Visible work: argparse
# setup (--proto/--bokeh), prototype-config lookup via `configurations`,
# train/validation stream construction, Theano variable declarations inside a
# single-iteration `for i in range(1)` loop, and the start of building the
# BidirectionalEncoder / Decoder pair.
i +=1 return new_result # Get the arguments parser = argparse.ArgumentParser() parser.add_argument("--proto", default="get_config_cs2en", help="Prototype config to use for config") parser.add_argument("--bokeh", default=False, action="store_true", help="Use bokeh server for plotting") args = parser.parse_args() #get configuration config = getattr(configurations, args.proto)() tr_stream = get_tr_stream(**config) validate_stream = get_dev_stream(**config) for i in range(1): logger.info('Creating theano variables') print("create theano variables") source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') # what is the source_mask target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') #sampling_input = tensor.lmatrix('input') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder( config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder(
import argparse
import logging
import pprint
import sys

import configurations
from __init__ import main
from stream import get_tr_stream, get_dev_stream

logger = logging.getLogger(__name__)

# Get the arguments
parser = argparse.ArgumentParser()
parser.add_argument("--proto", default="get_config_cs2en",
                    help="Prototype config to use for config")
parser.add_argument("--bokeh", default=False, action="store_true",
                    help="Use bokeh server for plotting")
args = parser.parse_args()

if __name__ == "__main__":
    # Get configurations for model
    configuration = getattr(configurations, args.proto)()
    logger.info("Model options:\n{}".format(pprint.pformat(configuration)))
    # Get data streams and call main
    #sys.exit(0)
    main(configuration,
         get_tr_stream(**configuration),
         get_dev_stream(**configuration),
         args.bokeh)