def reduce(
    fn,
    sequences,
    outputs_info,
    non_sequences=None,
    go_backwards=False,
    mode=None,
    name=None,
):
    """
    Similar behaviour to Python's ``reduce``.

    Parameters
    ----------
    fn
        The function that ``reduce`` applies at each iteration step
        (see ``scan`` for more info).
    sequences
        List of sequences over which ``reduce`` iterates
        (see ``scan`` for more info).
    outputs_info
        List of dictionaries describing the outputs of reduce
        (see ``scan`` for more info).
    non_sequences
        List of arguments passed to ``fn``. ``reduce`` will not iterate over
        these arguments (see ``scan`` for more info).
    go_backwards : bool
        Decides the direction of iteration. True means that sequences are
        parsed from the end towards the beginning, while False is the other
        way around.
    mode
        See ``scan``.
    name
        See ``scan``.

    """
    rval = scan(
        fn=fn,
        sequences=sequences,
        outputs_info=outputs_info,
        non_sequences=non_sequences,
        go_backwards=go_backwards,
        truncate_gradient=-1,
        mode=mode,
        name=name,
    )
    if isinstance(rval[0], (list, tuple)):
        return [x[-1] for x in rval[0]], rval[1]
    else:
        return rval[0][-1], rval[1]
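# A minimal usage sketch of the ``reduce`` wrapper above (the helper function
# ``_example_reduce_sum`` is illustrative only, not part of the original
# module): it accumulates a running sum over a vector and keeps only the
# final value.
def _example_reduce_sum():
    import numpy
    import theano
    import theano.tensor as TT
    v = TT.dvector('v')
    # ``fn`` receives the current sequence element first, then the previous
    # accumulator value, and returns the updated accumulator.
    total, updates = reduce(fn=lambda x, acc: acc + x,
                            sequences=v,
                            outputs_info=TT.as_tensor_variable(numpy.float64(0.0)))
    f = theano.function([v], total, updates=updates)
    return f([1.0, 2.0, 3.0])  # 6.0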
def linear_cg(compute_Gv, bs, rtol=1e-6, maxit=1000, damp=0, floatX=None,
              profile=0):
    """
    Conjugate gradient solver for ``G x = b``, where the (damped)
    matrix-vector product ``G v`` is given by ``compute_Gv``. All arguments
    and intermediate quantities are assumed to be lists of tensors.

    Reference: http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(bs)

    def loop(rsold, *args):
        ps = args[:n_params]
        rs = args[n_params:2 * n_params]
        xs = args[2 * n_params:]
        _Aps = compute_Gv(*ps)[0]
        Aps = [x + damp * y for x, y in zip(_Aps, ps)]
        alpha = rsold / sum((x * y).sum() for x, y in zip(Aps, ps))
        xs = [x + alpha * p for x, p in zip(xs, ps)]
        rs = [r - alpha * Ap for r, Ap in zip(rs, Aps)]
        rsnew = sum((r * r).sum() for r in rs)
        ps = [r + rsnew / rsold * p for r, p in zip(rs, ps)]
        return ([rsnew] + ps + rs + xs,
                theano.scan_module.until(abs(rsnew) < rtol))

    r0s = bs
    _p0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    _r0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    _x0s = [
        tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(x)), 0)
        for x in bs
    ]
    _rsold = sum((r * r).sum() for r in r0s)
    #_rsold = tensor.unbroadcast(tensor.shape_padleft(rsold), 0)
    outs, updates = scan(loop,
                         outputs_info=[_rsold] + _p0s + _r0s + _x0s,
                         n_steps=maxit,
                         mode=theano.Mode(linker='cvm'),
                         name='linear_conjugate_gradient',
                         profile=profile)
    fxs = outs[1 + 2 * n_params:]
    #return [x[0] for x in fxs]
    # 5vision hacks
    x = theano.gradient.disconnected_grad(fxs[0][-1].flatten())
    residual = outs[0][-1]
    return [x, residual]
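# For reference, a plain NumPy sketch of the update rule implemented by the
# symbolic ``loop`` above, applied to a single explicit SPD matrix.
# ``cg_reference`` is a hypothetical helper for illustration only, not part of
# the original module; it mirrors the alpha / residual / search-direction
# updates and the ``until`` stopping rule.
def cg_reference(A, b, rtol=1e-6, maxit=1000):
    import numpy as np
    x = np.zeros_like(b)
    r = b - A.dot(x)            # with x0 = 0 the initial residual is just b
    p = r.copy()
    rsold = r.dot(r)
    rsnew = rsold
    for _ in range(maxit):
        Ap = A.dot(p)
        alpha = rsold / p.dot(Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        rsnew = r.dot(r)
        if abs(rsnew) < rtol:   # same stopping rule as the ``until`` above
            break
        p = r + (rsnew / rsold) * p
        rsold = rsnew
    return x, rsnew
# Example: cg_reference(numpy.array([[4., 1.], [1., 3.]]),
#                       numpy.array([1., 2.]))
# converges to the same solution as numpy.linalg.solve on that system.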
def linear_cg_precond(compute_Gv, bs, Msz, rtol=1e-16, maxit=100000,
                      floatX=None):
    """
    Preconditioned conjugate gradient, with ``Msz`` used as a diagonal
    (Jacobi) preconditioner. All arguments and intermediate quantities are
    assumed to be lists of tensors.

    Reference: http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(bs)

    def loop(rsold, *args):
        ps = args[:n_params]
        rs = args[n_params:2 * n_params]
        xs = args[2 * n_params:]
        Aps = compute_Gv(*ps)
        alpha = rsold / sum((x * y).sum() for x, y in zip(Aps, ps))
        xs = [x + alpha * p for x, p in zip(xs, ps)]
        rs = [r - alpha * Ap for r, Ap in zip(rs, Aps)]
        zs = [r / z for r, z in zip(rs, Msz)]
        rsnew = sum((r * z).sum() for r, z in zip(rs, zs))
        ps = [z + rsnew / rsold * p for z, p in zip(zs, ps)]
        return [rsnew] + ps + rs + xs, theano.scan_module.until(abs(rsnew) < rtol)

    r0s = bs
    _p0s = [
        tensor.unbroadcast(tensor.shape_padleft(x / z), 0)
        for x, z in zip(r0s, Msz)
    ]
    _r0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    _x0s = [
        tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(x)), 0)
        for x in bs
    ]
    rsold = sum((r * r / z).sum() for r, z in zip(r0s, Msz))
    _rsold = tensor.unbroadcast(tensor.shape_padleft(rsold), 0)
    outs, updates = scan(loop,
                         states=[_rsold] + _p0s + _r0s + _x0s,
                         n_steps=maxit,
                         mode=theano.Mode(linker='c|py'),
                         name='linear_conjugate_gradient',
                         profile=0)
    fxs = outs[1 + 2 * n_params:]
    return [x[0] for x in fxs]
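# The only difference from ``linear_cg`` above is that the residual is
# rescaled elementwise by the diagonal preconditioner before building the new
# search direction. A minimal NumPy illustration of just that step
# (``_example_precond_step`` and ``diag_M`` are illustrative stand-ins;
# ``diag_M`` plays the role of ``Msz``):
def _example_precond_step(r, p, diag_M, rsold):
    z = r / diag_M               # z = M^{-1} r for a diagonal preconditioner
    rsnew = r.dot(z)             # preconditioned residual norm r^T M^{-1} r
    p_new = z + (rsnew / rsold) * p
    return p_new, rsnew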
def map(
    fn,
    sequences,
    non_sequences=None,
    truncate_gradient=-1,
    go_backwards=False,
    mode=None,
    name=None,
):
    """
    Similar behaviour to Python's ``map``.

    Parameters
    ----------
    fn
        The function that ``map`` applies at each iteration step
        (see ``scan`` for more info).
    sequences
        List of sequences over which ``map`` iterates
        (see ``scan`` for more info).
    non_sequences
        List of arguments passed to ``fn``. ``map`` will not iterate over
        these arguments (see ``scan`` for more info).
    truncate_gradient
        See ``scan``.
    go_backwards : bool
        Decides the direction of iteration. True means that sequences are
        parsed from the end towards the beginning, while False is the other
        way around.
    mode
        See ``scan``.
    name
        See ``scan``.

    """
    return scan(
        fn=fn,
        sequences=sequences,
        outputs_info=[],
        non_sequences=non_sequences,
        truncate_gradient=truncate_gradient,
        go_backwards=go_backwards,
        mode=mode,
        name=name,
    )
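# A minimal usage sketch of the ``map`` wrapper above (the helper function
# ``_example_map_square`` is illustrative only, not part of the original
# module): it squares every element of a vector.
def _example_map_square():
    import theano
    import theano.tensor as TT
    v = TT.dvector('v')
    # ``fn`` is applied once per element of ``v``; the results are stacked.
    squares, updates = map(fn=lambda x: x ** 2, sequences=v)
    f = theano.function([v], squares, updates=updates)
    return f([1.0, 2.0, 3.0])  # array([1., 4., 9.])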
def jobman(state, channel):
    # load dataset
    rng = numpy.random.RandomState(state['seed'])

    # declare the dimensionalities of the input and output
    if state['chunks'] == 'words':
        state['n_in'] = 10000
        state['n_out'] = 10000
    else:
        state['n_in'] = 50
        state['n_out'] = 50
    train_data, valid_data, test_data = get_text_data(state)

    ## BEGIN Tutorial
    ### Define Theano Input Variables
    x = TT.lvector('x')
    y = TT.lvector('y')
    h0 = theano.shared(
        numpy.zeros((eval(state['nhids'])[-1],), dtype='float32'))

    ### Neural Implementation of the Operators: \oplus
    #### Word Embedding
    emb_words = MultiLayer(
        rng,
        n_in=state['n_in'],
        n_hids=eval(state['inp_nhids']),
        activation=eval(state['inp_activ']),
        init_fn='sample_weights_classic',
        weight_noise=state['weight_noise'],
        rank_n_approx=state['rank_n_approx'],
        scale=state['inp_scale'],
        sparsity=state['inp_sparse'],
        learn_bias=True,
        bias_scale=eval(state['inp_bias']),
        name='emb_words')

    #### Deep Transition Recurrent Layer
    rec = eval(state['rec_layer'])(
        rng,
        eval(state['nhids']),
        activation=eval(state['rec_activ']),
        #activation='TT.nnet.sigmoid',
        bias_scale=eval(state['rec_bias']),
        scale=eval(state['rec_scale']),
        sparsity=eval(state['rec_sparse']),
        init_fn=eval(state['rec_init']),
        weight_noise=state['weight_noise'],
        name='rec')

    #### Stitching them together
    ##### (1) Get the embedding of a word
    x_emb = emb_words(x, no_noise_bias=state['no_noise_bias'])

    ##### (2) Embedding + Hidden State via DT Recurrent Layer
    reset = TT.scalar('reset')
    rec_layer = rec(x_emb,
                    n_steps=x.shape[0],
                    init_state=h0 * reset,
                    no_noise_bias=state['no_noise_bias'],
                    truncate_gradient=state['truncate_gradient'],
                    batch_size=1)

    ## BEGIN Exercise: DOT-RNN
    ### Neural Implementation of the Operators: \lhd

    #### Exercise (1)
    #### TODO: Define a layer from the hidden state to the intermediate layer

    #### Exercise (1)
    #### TODO: Define a layer from the input to the intermediate Layer

    #### Hidden State: Combine emb_state and emb_words_out
    #### Exercise (1)
    #### TODO: Define an activation layer

    #### Exercise (2)
    #### TODO: Define a dropout layer

    #### Softmax Layer
    output_layer = SoftmaxLayer(
        rng,
        eval(state['dout_nhid']),
        state['n_out'],
        scale=state['out_scale'],
        bias_scale=state['out_bias_scale'],
        init_fn="sample_weights_classic",
        weight_noise=state['weight_noise'],
        sparsity=state['out_sparse'],
        sum_over_time=True,
        name='out')

    ### Few Optional Things
    #### Direct shortcut from x to y
    if state['shortcut_inpout']:
        shortcut = MultiLayer(
            rng,
            n_in=state['n_in'],
            n_hids=eval(state['inpout_nhids']),
            activations=eval(state['inpout_activ']),
            init_fn='sample_weights_classic',
            weight_noise=state['weight_noise'],
            scale=eval(state['inpout_scale']),
            sparsity=eval(state['inpout_sparse']),
            learn_bias=eval(state['inpout_learn_bias']),
            bias_scale=eval(state['inpout_bias']),
            name='shortcut')

    #### Learning rate scheduling (1/(1+n/beta))
    state['clr'] = state['lr']

    def update_lr(obj, cost):
        stp = obj.step
        if isinstance(obj.state['lr_start'], int) and stp > obj.state['lr_start']:
            time = float(stp - obj.state['lr_start'])
            new_lr = obj.state['clr'] / (1 + time / obj.state['lr_beta'])
            obj.lr = new_lr

    if state['lr_adapt']:
        rec.add_schedule(update_lr)

    ### Neural Implementations of the Language Model
    #### Training
    if state['shortcut_inpout']:
        additional_inputs = [rec_layer, shortcut(x)]
    else:
        additional_inputs = [rec_layer]

    ##### Exercise (1): Compute the output intermediate layer
    ##### TODO: Compute the output intermediate layer

    ##### Exercise (2): Apply Dropout
    ##### TODO: Apply the dropout layer
    train_model = output_layer(outhid,
                               no_noise_bias=state['no_noise_bias'],
                               additional_inputs=additional_inputs).train(
                                   target=y,
                                   scale=numpy.float32(1. / state['seqlen']))

    nw_h0 = rec_layer.out[rec_layer.out.shape[0] - 1]
    if state['carry_h0']:
        train_model.updates += [(h0, nw_h0)]

    #### Validation
    h0val = theano.shared(
        numpy.zeros((eval(state['nhids'])[-1],), dtype='float32'))
    rec_layer = rec(emb_words(x, use_noise=False),
                    n_steps=x.shape[0],
                    batch_size=1,
                    init_state=h0val * reset,
                    use_noise=False)
    nw_h0 = rec_layer.out[rec_layer.out.shape[0] - 1]

    ##### Exercise (1):
    ##### TODO: Compute the output intermediate layer

    ##### Exercise (2): Apply Dropout
    ##### TODO: Apply the dropout layer without noise

    if state['shortcut_inpout']:
        additional_inputs = [rec_layer, shortcut(x, use_noise=False)]
    else:
        additional_inputs = [rec_layer]
    valid_model = output_layer(outhid,
                               additional_inputs=additional_inputs,
                               use_noise=False).validate(target=y,
                                                         sum_over_time=True)
    valid_updates = []
    if state['carry_h0']:
        valid_updates = [(h0val, nw_h0)]
    valid_fn = theano.function([x, y, reset],
                               valid_model.out,
                               name='valid_fn',
                               updates=valid_updates)

    #### Sampling
    ##### single-step sampling
    def sample_fn(word_tm1, h_tm1):
        x_emb = emb_words(word_tm1, use_noise=False, one_step=True)
        h0 = rec(x_emb, state_before=h_tm1, one_step=True, use_noise=False)[-1]
        outhid = outhid_dropout(
            outhid_activ(
                emb_state(h0, use_noise=False, one_step=True) +
                emb_words_out(word_tm1, use_noise=False, one_step=True),
                one_step=True),
            use_noise=False,
            one_step=True)
        word = output_layer.get_sample(state_below=outhid,
                                       additional_inputs=[h0],
                                       temp=1.)
        return word, h0

    ##### scan for iterating the single-step sampling multiple times
    [samples, summaries], updates = scan(
        sample_fn,
        states=[
            TT.alloc(numpy.int64(0), state['sample_steps']),
            TT.alloc(numpy.float32(0), 1, eval(state['nhids'])[-1])],
        n_steps=state['sample_steps'],
        name='sampler_scan')

    ##### build a Theano function for sampling
    sample_fn = theano.function([],
                                [samples],
                                updates=updates,
                                profile=False,
                                name='sample_fn')

    ##### Load a dictionary
    dictionary = numpy.load(state['dictionary'])
    if state['chunks'] == 'chars':
        dictionary = dictionary['unique_chars']
    else:
        dictionary = dictionary['unique_words']

    def hook_fn():
        sample = sample_fn()[0]
        print 'Sample:',
        if state['chunks'] == 'chars':
            print "".join(dictionary[sample])
        else:
            for si in sample:
                print dictionary[si],
            print

    ### Build and Train a Model
    #### Define a model
    model = LM_Model(
        cost_layer=train_model,
        weight_noise_amount=state['weight_noise_amount'],
        valid_fn=valid_fn,
        clean_before_noise_fn=False,
        noise_fn=None,
        rng=rng)

    if state['reload']:
        model.load(state['prefix'] + 'model.npz')

    #### Define a trainer
    ##### Training algorithm (SGD)
    if state['moment'] < 0:
        algo = SGD(model, state, train_data)
    else:
        algo = SGD_m(model, state, train_data)

    ##### Main loop of the trainer
    main = MainLoop(train_data,
                    valid_data,
                    test_data,
                    model,
                    algo,
                    state,
                    channel,
                    train_cost=False,
                    hooks=hook_fn,
                    validate_postprocess=eval(state['validate_postprocess']))

    ## Run!
    main.main()
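# The sampler above follows a common pattern: write a one-step function and
# let ``scan`` iterate it while carrying the hidden state forward. Below is a
# self-contained sketch of that pattern with a made-up recurrence
# (``_example_iterated_one_step``, ``W`` and ``one_step`` are illustrative
# only); it uses the standard ``theano.scan``/``outputs_info`` interface
# rather than the experimental ``states`` interface used in the tutorial.
def _example_iterated_one_step(n_steps=10, n_hid=3):
    import numpy
    import theano
    import theano.tensor as TT
    W = theano.shared(0.1 * numpy.eye(n_hid).astype('float32'), name='W')

    def one_step(h_tm1):
        # toy recurrence standing in for "embed previous sample, update state"
        return TT.tanh(TT.dot(h_tm1, W))

    h0 = TT.alloc(numpy.float32(1), n_hid)
    hs, updates = theano.scan(one_step,
                              outputs_info=h0,
                              n_steps=n_steps,
                              name='sampler_sketch')
    f = theano.function([], hs, updates=updates)
    return f()  # shape (n_steps, n_hid): one hidden state per generated step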