def create_model(dbn, input_shape, input_var, mask_shape, mask_var,
                 lstm_size=250, win=T.iscalar('theta')):
    dbn_layers = dbn.get_all_layers()
    weights = []
    biases = []
    weights.append(dbn_layers[1].W)
    weights.append(dbn_layers[2].W)
    weights.append(dbn_layers[3].W)
    weights.append(dbn_layers[4].W)
    biases.append(dbn_layers[1].b)
    biases.append(dbn_layers[2].b)
    biases.append(dbn_layers[3].b)
    biases.append(dbn_layers[4].b)

    gate_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_in = InputLayer(input_shape, input_var, 'input')
    l_mask = InputLayer(mask_shape, mask_var, 'mask')

    symbolic_batchsize = l_in.input_var.shape[0]
    symbolic_seqlen = l_in.input_var.shape[1]

    l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1')
    l_encoder = create_pretrained_encoder(weights, biases, l_reshape1)
    encoder_len = las.layers.get_output_shape(l_encoder)[-1]
    l_reshape2 = ReshapeLayer(
        l_encoder,
        (symbolic_batchsize, symbolic_seqlen, encoder_len),
        name='reshape2')
    l_delta = DeltaLayer(l_reshape2, win, name='delta')

    l_lstm, l_lstm_back = create_blstm(l_delta, l_mask, lstm_size,
                                       cell_parameters, gate_parameters,
                                       'lstm1')

    # We'll combine the forward and backward layer output by summing.
    l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1')

    # Take the last timestep of the sequence.
    l_forward_slice1 = SliceLayer(l_sum1, indices=-1, axis=1, name='slice1')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes as the output size.
    l_out = DenseLayer(l_forward_slice1, num_units=26,
                       nonlinearity=las.nonlinearities.softmax, name='output')

    return l_out

def create_model(dbn, input_shape, input_var, mask_shape, mask_var,
                 lstm_size=250, win=T.iscalar('theta'), output_classes=26,
                 w_init_fn=GlorotUniform(), use_peepholes=False,
                 use_blstm=True):
    weights, biases, shapes, nonlinearities = dbn

    gate_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn, b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn,
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_in = InputLayer(input_shape, input_var, 'input')
    l_mask = InputLayer(mask_shape, mask_var, 'mask')

    symbolic_batchsize = l_in.input_var.shape[0]
    symbolic_seqlen = l_in.input_var.shape[1]

    l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1')
    l_encoder = create_pretrained_encoder(l_reshape1, weights, biases, shapes,
                                          nonlinearities,
                                          ['fc1', 'fc2', 'fc3', 'bottleneck'])
    encoder_len = las.layers.get_output_shape(l_encoder)[-1]
    l_reshape2 = ReshapeLayer(
        l_encoder,
        (symbolic_batchsize, symbolic_seqlen, encoder_len),
        name='reshape2')
    l_delta = DeltaLayer(l_reshape2, win, name='delta')

    if use_blstm:
        l_lstm, l_lstm_back = create_blstm(l_delta, l_mask, lstm_size,
                                           cell_parameters, gate_parameters,
                                           'blstm1', use_peepholes)

        # We'll combine the forward and backward layer output by summing.
        # Merge layers take in lists of layers to merge as input.
        l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1')

        # reshape, flatten to 2 dimensions to run softmax on all timesteps
        l_reshape3 = ReshapeLayer(l_sum1, (-1, lstm_size), name='reshape3')
    else:
        l_lstm = create_lstm(l_delta, l_mask, lstm_size, cell_parameters,
                             gate_parameters, 'lstm', use_peepholes)
        l_reshape3 = ReshapeLayer(l_lstm, (-1, lstm_size), name='reshape3')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes as the output size.
    l_softmax = DenseLayer(
        l_reshape3, num_units=output_classes,
        nonlinearity=las.nonlinearities.softmax, name='softmax')
    l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes),
                         name='output')

    return l_out

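# Hypothetical sketch, not part of the original source: the `dbn` argument of
# the function above is unpacked as (weights, biases, shapes, nonlinearities).
# Assuming the 2000-1000-500-50 encoder with a linear bottleneck used elsewhere
# in this code, and the module-level imports (lasagne as las, rectify, linear),
# the tuple could be assembled from four trained dense layers like this:
def _example_dbn_tuple(trained_encoder_layers):
    """trained_encoder_layers: hypothetical list of the four trained
    DenseLayers (fc1, fc2, fc3, bottleneck) of a pretrained autoencoder."""
    weights = [l.W for l in trained_encoder_layers]
    biases = [l.b for l in trained_encoder_layers]
    shapes = [2000, 1000, 500, 50]
    nonlinearities = [rectify, rectify, rectify, linear]
    # This tuple can then be passed as the `dbn` argument of create_model above.
    return weights, biases, shapes, nonlinearities
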
def create_model(input_shape, input_var, mask_shape, mask_var, window,
                 lstm_size=250, output_classes=26,
                 w_init=las.init.GlorotUniform(),
                 use_peepholes=False, use_blstm=True):
    gate_parameters = Gate(
        W_in=w_init, W_hid=w_init, b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=w_init, W_hid=w_init,
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_in = InputLayer(input_shape, input_var, 'input')
    l_mask = InputLayer(mask_shape, mask_var, name='mask')

    symbolic_seqlen = l_in.input_var.shape[1]

    l_delta = DeltaLayer(l_in, window, name='delta')

    if use_blstm:
        f_lstm, b_lstm = create_blstm(l_delta, l_mask, lstm_size,
                                      cell_parameters, gate_parameters,
                                      'lstm', use_peepholes)
        l_sum = ElemwiseSumLayer([f_lstm, b_lstm], name='sum')

        # reshape to (num_examples * seq_len, lstm_size)
        l_reshape = ReshapeLayer(l_sum, (-1, lstm_size), name='reshape')
    else:
        l_lstm = create_lstm(l_delta, l_mask, lstm_size, cell_parameters,
                             gate_parameters, 'lstm', use_peepholes)
        l_reshape = ReshapeLayer(l_lstm, (-1, lstm_size), name='reshape')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes as the output size.
    l_softmax = DenseLayer(l_reshape, num_units=output_classes,
                           nonlinearity=las.nonlinearities.softmax,
                           name='softmax')
    l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes),
                         name='output')

    return l_out

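# Hypothetical usage sketch, not part of the original source: building the
# network defined above from symbolic Theano variables. The shapes and the
# window variable are illustrative assumptions, following the
# (batch, seq_len, features) convention used throughout this code.
def _example_build_model():
    import theano.tensor as T
    input_var = T.tensor3('input')   # (batch, seq_len, features)
    mask_var = T.matrix('mask')      # (batch, seq_len), 1 for real timesteps
    window = T.iscalar('theta')      # delta-feature window size
    network = create_model((None, None, 1500), input_var,
                           (None, None), mask_var, window,
                           lstm_size=250, output_classes=26)
    return network, input_var, mask_var, window
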
def create_model(s1_ae, s2_ae, s3_ae, s1_shape, s1_var, s2_shape, s2_var,
                 s3_shape, s3_var, mask_shape, mask_var,
                 lstm_size=250, lstm2_size=250, win=T.iscalar('theta'),
                 output_classes=26, fusiontype='concat',
                 w_init_fn=las.init.Orthogonal(), use_peepholes=True):
    s1_bn_weights, s1_bn_biases, s1_bn_shapes, s1_bn_nonlinearities = s1_ae
    s2_weights, s2_biases, s2_shapes, s2_nonlinearities = s2_ae
    s3_weights, s3_biases, s3_shapes, s3_nonlinearities = s3_ae

    gate_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn, b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn,
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_s1 = InputLayer(s1_shape, s1_var, 's1_im')
    l_mask = InputLayer(mask_shape, mask_var, 'mask')
    l_s2 = InputLayer(s2_shape, s2_var, 's2_im')
    l_s3 = InputLayer(s3_shape, s3_var, 's3_im')

    symbolic_batchsize_s1 = l_s1.input_var.shape[0]
    symbolic_seqlen_s1 = l_s1.input_var.shape[1]
    symbolic_batchsize_s2 = l_s2.input_var.shape[0]
    symbolic_seqlen_s2 = l_s2.input_var.shape[1]
    symbolic_batchsize_s3 = l_s3.input_var.shape[0]
    symbolic_seqlen_s3 = l_s3.input_var.shape[1]

    # s1 images
    l_reshape1_s1 = ReshapeLayer(l_s1, (-1, s1_shape[-1]), name='reshape1_s1')
    l_encoder_s1 = create_pretrained_encoder(
        l_reshape1_s1, s1_bn_weights, s1_bn_biases, s1_bn_shapes,
        s1_bn_nonlinearities,
        ['fc1_s1', 'fc2_s1', 'fc3_s1', 'bottleneck_s1'])
    s1_len = las.layers.get_output_shape(l_encoder_s1)[-1]
    l_reshape2_s1 = ReshapeLayer(
        l_encoder_s1,
        (symbolic_batchsize_s1, symbolic_seqlen_s1, s1_len),
        name='reshape2_s1')
    l_delta_s1 = DeltaLayer(l_reshape2_s1, win, name='delta_s1')
    l_delta_s1_dropout = DropoutLayer(l_delta_s1, name='dropout_s1')

    # s2 images
    l_reshape1_s2 = ReshapeLayer(l_s2, (-1, s2_shape[-1]), name='reshape1_s2')
    l_encoder_s2 = create_pretrained_encoder(
        l_reshape1_s2, s2_weights, s2_biases, s2_shapes, s2_nonlinearities,
        ['fc1_s2', 'fc2_s2', 'fc3_s2', 'bottleneck_s2'])
    s2_len = las.layers.get_output_shape(l_encoder_s2)[-1]
    l_reshape2_s2 = ReshapeLayer(
        l_encoder_s2,
        (symbolic_batchsize_s2, symbolic_seqlen_s2, s2_len),
        name='reshape2_s2')
    l_delta_s2 = DeltaLayer(l_reshape2_s2, win, name='delta_s2')
    l_delta_s2_dropout = DropoutLayer(l_delta_s2, name='dropout_s2')

    # s3 images
    l_reshape1_s3 = ReshapeLayer(l_s3, (-1, s3_shape[-1]), name='reshape1_s3')
    l_encoder_s3 = create_pretrained_encoder(
        l_reshape1_s3, s3_weights, s3_biases, s3_shapes, s3_nonlinearities,
        ['fc1_s3', 'fc2_s3', 'fc3_s3', 'bottleneck_s3'])
    s3_len = las.layers.get_output_shape(l_encoder_s3)[-1]
    l_reshape2_s3 = ReshapeLayer(
        l_encoder_s3,
        (symbolic_batchsize_s3, symbolic_seqlen_s3, s3_len),
        name='reshape2_s3')
    l_delta_s3 = DeltaLayer(l_reshape2_s3, win, name='delta_s3')
    l_delta_s3_dropout = DropoutLayer(l_delta_s3, name='dropout_s3')

    l_lstm_s1 = LSTMLayer(
        l_delta_s1_dropout, lstm_size * 2, peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='lstm_s1')

    l_lstm_s2 = LSTMLayer(
        l_delta_s2_dropout, lstm_size * 2, peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='lstm_s2')

    l_lstm_s3 = LSTMLayer(
        l_delta_s3_dropout, lstm_size * 2, peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='lstm_s3')

    # Fuse the per-stream LSTM outputs.
    # Merge layers take in lists of layers to merge as input.
    if fusiontype == 'adasum':
        l_fuse = AdaptiveElemwiseSumLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3],
                                          name='adasum1')
    elif fusiontype == 'sum':
        l_fuse = ElemwiseSumLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3],
                                  name='sum1')
    elif fusiontype == 'concat':
        l_fuse = ConcatLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3],
                             axis=-1, name='concat')
    l_fuse_dropout = DropoutLayer(l_fuse, name='concat_dropout')

    f_lstm_agg, b_lstm_agg = create_blstm(l_fuse_dropout, l_mask, lstm2_size,
                                          cell_parameters, gate_parameters,
                                          'lstm_agg')
    # We'll combine the forward and backward layer output by summing.
    l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2')

    # reshape to (num_examples * seq_len, lstm_size)
    l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size * 2), name='reshape3')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes as the output size.
    l_softmax = DenseLayer(l_reshape3, num_units=output_classes,
                           nonlinearity=las.nonlinearities.softmax,
                           name='softmax')
    l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_s1, output_classes),
                         name='output')

    return l_out, l_fuse

def create_model(ae, diff_ae, input_shape, input_var, mask_shape, mask_var,
                 diff_shape, diff_var, lstm_size=250, win=T.iscalar('theta'),
                 output_classes=26, fusiontype='concat',
                 w_init_fn=las.init.Orthogonal(), use_peepholes=True):
    bn_weights, bn_biases, bn_shapes, bn_nonlinearities = extract_weights(ae)
    diff_weights, diff_biases, diff_shapes, diff_nonlinearities = \
        extract_weights(diff_ae)

    gate_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn, b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn,
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_raw = InputLayer(input_shape, input_var, 'raw_im')
    l_mask = InputLayer(mask_shape, mask_var, 'mask')
    l_diff = InputLayer(diff_shape, diff_var, 'diff_im')

    symbolic_batchsize_raw = l_raw.input_var.shape[0]
    symbolic_seqlen_raw = l_raw.input_var.shape[1]
    symbolic_batchsize_diff = l_diff.input_var.shape[0]
    symbolic_seqlen_diff = l_diff.input_var.shape[1]

    # raw images
    l_reshape1_raw = ReshapeLayer(l_raw, (-1, input_shape[-1]),
                                  name='reshape1_raw')
    l_encoder_raw = create_pretrained_encoder(
        l_reshape1_raw, bn_weights, bn_biases, bn_shapes, bn_nonlinearities,
        ['fc1_raw', 'fc2_raw', 'fc3_raw', 'bottleneck_raw'])
    raw_len = las.layers.get_output_shape(l_encoder_raw)[-1]
    l_reshape2_raw = ReshapeLayer(
        l_encoder_raw,
        (symbolic_batchsize_raw, symbolic_seqlen_raw, raw_len),
        name='reshape2_raw')
    l_delta_raw = DeltaLayer(l_reshape2_raw, win, name='delta_raw')

    # diff images
    l_reshape1_diff = ReshapeLayer(l_diff, (-1, diff_shape[-1]),
                                   name='reshape1_diff')
    l_encoder_diff = create_pretrained_encoder(
        l_reshape1_diff, diff_weights, diff_biases, diff_shapes,
        diff_nonlinearities,
        ['fc1_diff', 'fc2_diff', 'fc3_diff', 'bottleneck_diff'])
    diff_len = las.layers.get_output_shape(l_encoder_diff)[-1]
    l_reshape2_diff = ReshapeLayer(
        l_encoder_diff,
        (symbolic_batchsize_diff, symbolic_seqlen_diff, diff_len),
        name='reshape2_diff')
    l_delta_diff = DeltaLayer(l_reshape2_diff, win, name='delta_diff')

    l_lstm_raw = LSTMLayer(
        l_delta_raw, int(lstm_size), peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='lstm_raw')

    l_lstm_diff = LSTMLayer(
        l_delta_diff, lstm_size, peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='lstm_diff')

    # Fuse the raw and diff stream outputs.
    # Merge layers take in lists of layers to merge as input.
    if fusiontype == 'adasum':
        l_fuse = AdaptiveElemwiseSumLayer([l_lstm_raw, l_lstm_diff],
                                          name='adasum1')
    elif fusiontype == 'sum':
        l_fuse = ElemwiseSumLayer([l_lstm_raw, l_lstm_diff], name='sum1')
    elif fusiontype == 'concat':
        l_fuse = ConcatLayer([l_lstm_raw, l_lstm_diff], axis=-1, name='concat')

    # l_drop_agg = DropoutLayer(l_sum1, name='dropout_agg')
    f_lstm_agg, b_lstm_agg = create_blstm(l_fuse, l_mask, lstm_size,
                                          cell_parameters, gate_parameters,
                                          'lstm_agg')
    l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2')

    '''
    l_lstm_agg = LSTMLayer(
        l_drop_agg, lstm_size * 2,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='lstm_agg')

    # implement drop-out regularization
    l_dropout = DropoutLayer(l_sum1, p=0.4, name='dropout1')

    l_lstm2, l_lstm2_back = create_blstm(l_dropout, l_mask, lstm_size,
                                         cell_parameters, gate_parameters,
                                         'lstm2')

    # We'll combine the forward and backward layer output by summing.
    # Merge layers take in lists of layers to merge as input.
    l_sum2 = ElemwiseSumLayer([l_lstm2, l_lstm2_back])
    '''
    # l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2')

    # Take the last timestep of the aggregated sequence.
    l_forward_slice1 = SliceLayer(l_sum2, indices=-1, axis=1, name='slice1')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes as the output size.
    l_out = DenseLayer(l_forward_slice1, num_units=output_classes,
                       nonlinearity=las.nonlinearities.softmax, name='output')

    return l_out, l_fuse

def create_model(dbn, input_shape, input_var, mask_shape, mask_var,
                 dct_shape, dct_var, lstm_size=250, win=T.iscalar('theta'),
                 output_classes=26, fusiontype='sum',
                 w_init_fn=las.init.GlorotUniform(),
                 use_peepholes=False, nonlinearities=rectify):
    weights, biases, shapes, nonlinearities = dbn
    names = ['fc1', 'fc2', 'fc3', 'bottleneck']

    gate_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn, b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn,
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_in = InputLayer(input_shape, input_var, 'input')
    l_mask = InputLayer(mask_shape, mask_var, 'mask')
    l_dct = InputLayer(dct_shape, dct_var, 'dct')

    symbolic_batchsize = l_in.input_var.shape[0]
    symbolic_seqlen = l_in.input_var.shape[1]

    l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1')
    l_encoder = create_pretrained_encoder(l_reshape1, weights, biases, shapes,
                                          nonlinearities, names)
    encoder_len = las.layers.get_output_shape(l_encoder)[-1]
    l_reshape2 = ReshapeLayer(
        l_encoder,
        (symbolic_batchsize, symbolic_seqlen, encoder_len),
        name='reshape2')
    l_delta = DeltaLayer(l_reshape2, win, name='delta')
    l_delta_dct = DeltaLayer(l_dct, win, name='delta_dct')

    l_lstm_bn = LSTMLayer(
        l_delta, lstm_size, peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='lstm_bn')

    l_lstm_dct = LSTMLayer(
        l_delta_dct, lstm_size, peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='lstm_dct')

    # Fuse the bottleneck and DCT stream outputs.
    # Merge layers take in lists of layers to merge as input.
    if fusiontype == 'sum':
        l_fuse = ElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='sum1')
    elif fusiontype == 'adasum':
        l_fuse = AdaptiveElemwiseSumLayer([l_lstm_bn, l_lstm_dct],
                                          name='adasum')
    elif fusiontype == 'concat':
        l_fuse = ConcatLayer([l_lstm_bn, l_lstm_dct], axis=2, name='concat')
    else:
        raise ValueError('Unsupported fusion type: {}'.format(fusiontype))

    f_lstm_agg, b_lstm_agg = create_blstm(l_fuse, l_mask, lstm_size,
                                          cell_parameters, gate_parameters,
                                          'lstm_agg')
    l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2')

    # reshape to (num_examples * seq_len, lstm_size)
    l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size), name='reshape3')
    # l_forward_slice1 = SliceLayer(l_sum2, -1, 1, name='slice1')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes as the output size.
    l_softmax = DenseLayer(l_reshape3, num_units=output_classes,
                           nonlinearity=las.nonlinearities.softmax,
                           name='softmax')
    l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes),
                         name='output')

    return l_out, l_fuse

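# Hypothetical training sketch, not part of the original source: compiling a
# masked, per-timestep cross-entropy objective for a network built by a
# create_model variant above that returns (batch, seq_len, output_classes)
# softmax outputs, such as the bottleneck+DCT model directly above. The
# arguments input_var, mask_var, dct_var and win are assumed to be the same
# Theano variables that were passed into create_model; the learning rate and
# optimizer choice (adam) are illustrative assumptions.
def _example_compile_train_fn(network, input_var, mask_var, dct_var, win,
                              output_classes=26, learning_rate=1e-4):
    import theano
    import theano.tensor as T
    from lasagne.layers import get_output, get_all_params
    from lasagne.objectives import categorical_crossentropy
    from lasagne.updates import adam

    # Integer class label per timestep: (batch, seq_len).
    target_var = T.imatrix('targets')

    predictions = get_output(network)                            # (batch, seq, classes)
    predictions = T.reshape(predictions, (-1, output_classes))   # flatten timesteps
    targets = T.reshape(target_var, (-1,))
    mask = T.reshape(mask_var, (-1,))

    # Average the cross-entropy only over unmasked (real) timesteps.
    ce = categorical_crossentropy(predictions, targets)
    loss = T.sum(ce * mask) / T.sum(mask)

    params = get_all_params(network, trainable=True)
    updates = adam(loss, params, learning_rate=learning_rate)

    return theano.function([input_var, mask_var, dct_var, target_var, win],
                           loss, updates=updates, allow_input_downcast=True)
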
def create_model(dbn, input_shape, input_var, mask_shape, mask_var,
                 dct_shape, dct_var, lstm_size=250, win=T.iscalar('theta'),
                 output_classes=26):
    dbn_layers = dbn.get_all_layers()
    weights = []
    biases = []
    weights.append(dbn_layers[1].W.astype('float32'))
    weights.append(dbn_layers[2].W.astype('float32'))
    weights.append(dbn_layers[3].W.astype('float32'))
    weights.append(dbn_layers[4].W.astype('float32'))
    biases.append(dbn_layers[1].b.astype('float32'))
    biases.append(dbn_layers[2].b.astype('float32'))
    biases.append(dbn_layers[3].b.astype('float32'))
    biases.append(dbn_layers[4].b.astype('float32'))

    gate_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_in = InputLayer(input_shape, input_var, 'input')
    l_mask = InputLayer(mask_shape, mask_var, 'mask')
    l_dct = InputLayer(dct_shape, dct_var, 'dct')

    symbolic_batchsize = l_in.input_var.shape[0]
    symbolic_seqlen = l_in.input_var.shape[1]

    l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1')
    l_encoder = create_pretrained_encoder(weights, biases, l_reshape1)
    encoder_len = las.layers.get_output_shape(l_encoder)[-1]
    l_reshape2 = ReshapeLayer(
        l_encoder,
        (symbolic_batchsize, symbolic_seqlen, encoder_len),
        name='reshape2')
    l_delta = DeltaLayer(l_reshape2, win, name='delta')
    l_delta_drop = DropoutLayer(l_delta, name='dropout_delta')
    l_dct_drop = DropoutLayer(l_dct, p=0.2, name='dropout_dct')

    l_lstm_bn = LSTMLayer(
        l_delta_drop, lstm_size * 2,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='lstm_bn')

    l_lstm_dct = LSTMLayer(
        l_dct_drop, lstm_size * 2,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='lstm_dct')

    # Fuse the bottleneck and DCT stream outputs by summing.
    # Merge layers take in lists of layers to merge as input.
    # l_sum1 = AdaptiveElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='adasum1')
    l_sum1 = ElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='sum1')
    l_sum1_drop = DropoutLayer(l_sum1, name='dropout_agg')

    # f_lstm_agg, b_lstm_agg = create_blstm(l_sum1, l_mask, lstm_size,
    #                                       cell_parameters, gate_parameters,
    #                                       'lstm_agg')
    l_lstm_agg = LSTMLayer(
        l_sum1_drop, lstm_size * 2,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='lstm_agg')

    '''
    # implement drop-out regularization
    l_dropout = DropoutLayer(l_sum1, p=0.4, name='dropout1')

    l_lstm2, l_lstm2_back = create_blstm(l_dropout, l_mask, lstm_size,
                                         cell_parameters, gate_parameters,
                                         'lstm2')

    # We'll combine the forward and backward layer output by summing.
    # Merge layers take in lists of layers to merge as input.
    l_sum2 = ElemwiseSumLayer([l_lstm2, l_lstm2_back])
    '''
    # l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2')

    # Take the last timestep of the aggregated sequence.
    l_forward_slice1 = SliceLayer(l_lstm_agg, indices=-1, axis=1,
                                  name='slice1')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes as the output size.
    l_out = DenseLayer(l_forward_slice1, num_units=output_classes,
                       nonlinearity=las.nonlinearities.softmax, name='output')

    return l_out, l_sum1

def create_model(dbn, input_shape, input_var, mask_shape, mask_var,
                 dct_shape, dct_var, lstm_size=250, win=T.iscalar('theta'),
                 output_classes=26):
    dbn_layers = dbn.get_all_layers()
    weights = []
    biases = []
    weights.append(dbn_layers[1].W.astype('float32'))
    weights.append(dbn_layers[2].W.astype('float32'))
    weights.append(dbn_layers[3].W.astype('float32'))
    weights.append(dbn_layers[4].W.astype('float32'))
    biases.append(dbn_layers[1].b.astype('float32'))
    biases.append(dbn_layers[2].b.astype('float32'))
    biases.append(dbn_layers[3].b.astype('float32'))
    biases.append(dbn_layers[4].b.astype('float32'))

    gate_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_in = InputLayer(input_shape, input_var, 'input')
    l_mask = InputLayer(mask_shape, mask_var, 'mask')
    l_dct = InputLayer(dct_shape, dct_var, 'dct')

    symbolic_batchsize = l_in.input_var.shape[0]
    symbolic_seqlen = l_in.input_var.shape[1]

    l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1')
    l_encoder = create_pretrained_encoder(weights, biases, l_reshape1)
    l_encoder_bn = BatchNormLayer(l_encoder, name='batchnorm1')
    encoder_len = las.layers.get_output_shape(l_encoder)[-1]
    l_reshape2 = ReshapeLayer(
        l_encoder_bn,
        (symbolic_batchsize, symbolic_seqlen, encoder_len),
        name='reshape2')
    l_delta = DeltaLayer(l_reshape2, win, name='delta')
    l_concat = ConcatLayer([l_delta, l_dct], axis=2, name='concat')
    l_dropout1 = DropoutLayer(l_concat, name='dropout1')

    l_lstm, l_lstm_back = create_blstm(l_dropout1, l_mask, lstm_size * 2,
                                       cell_parameters, gate_parameters,
                                       'lstm1')

    # We'll combine the forward and backward layer output by summing.
    # Merge layers take in lists of layers to merge as input.
    l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1')

    # implement drop-out regularization
    l_dropout2 = DropoutLayer(l_sum1, name='dropout2')

    l_lstm2, l_lstm2_back = create_blstm(l_dropout2, l_mask, lstm_size * 2,
                                         cell_parameters, gate_parameters,
                                         'lstm2')

    # We'll combine the forward and backward layer output by summing.
    # Merge layers take in lists of layers to merge as input.
    l_sum2 = ElemwiseSumLayer([l_lstm2, l_lstm2_back])

    # Take the last timestep of the sequence.
    l_forward_slice1 = SliceLayer(l_sum2, indices=-1, axis=1, name='slice1')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes as the output size.
    l_out = DenseLayer(l_forward_slice1, num_units=output_classes,
                       nonlinearity=las.nonlinearities.softmax, name='output')

    return l_out

def create_pretrained_model(s1_ae, s1_lstm, s2_ae, s2_lstm, s3_ae, s3_lstm,
                            s4_ae, s4_lstm, s5_ae, s5_lstm,
                            s1_shape, s1_var, s2_shape, s2_var,
                            s3_shape, s3_var, s4_shape, s4_var,
                            s5_shape, s5_var, mask_shape, mask_var,
                            lstm_size=250, win=T.iscalar('theta'),
                            output_classes=26, fusiontype='concat',
                            w_init_fn=las.init.Orthogonal(),
                            use_peepholes=True, use_blstm_substream=False):
    s1_bn_weights, s1_bn_biases, s1_bn_shapes, s1_bn_nonlinearities = s1_ae
    s2_weights, s2_biases, s2_shapes, s2_nonlinearities = s2_ae
    s3_weights, s3_biases, s3_shapes, s3_nonlinearities = s3_ae
    s4_weights, s4_biases, s4_shapes, s4_nonlinearities = s4_ae
    s5_weights, s5_biases, s5_shapes, s5_nonlinearities = s5_ae

    gate_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn, b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn,
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_s1 = InputLayer(s1_shape, s1_var, 's1_im')
    l_mask = InputLayer(mask_shape, mask_var, 'mask')
    l_s2 = InputLayer(s2_shape, s2_var, 's2_im')
    l_s3 = InputLayer(s3_shape, s3_var, 's3_im')
    l_s4 = InputLayer(s4_shape, s4_var, 's4_im')
    l_s5 = InputLayer(s5_shape, s5_var, 's5_im')

    symbolic_batchsize_s1 = l_s1.input_var.shape[0]
    symbolic_seqlen_s1 = l_s1.input_var.shape[1]
    symbolic_batchsize_s2 = l_s2.input_var.shape[0]
    symbolic_seqlen_s2 = l_s2.input_var.shape[1]
    symbolic_batchsize_s3 = l_s3.input_var.shape[0]
    symbolic_seqlen_s3 = l_s3.input_var.shape[1]
    symbolic_batchsize_s4 = l_s4.input_var.shape[0]
    symbolic_seqlen_s4 = l_s4.input_var.shape[1]
    symbolic_batchsize_s5 = l_s5.input_var.shape[0]
    symbolic_seqlen_s5 = l_s5.input_var.shape[1]

    # s1 images
    l_reshape1_s1 = ReshapeLayer(l_s1, (-1, s1_shape[-1]), name='reshape1_s1')
    l_encoder_s1 = create_pretrained_encoder(
        l_reshape1_s1, s1_bn_weights, s1_bn_biases, s1_bn_shapes,
        s1_bn_nonlinearities,
        ['fc1_s1', 'fc2_s1', 'fc3_s1', 'bottleneck_s1'])
    s1_len = las.layers.get_output_shape(l_encoder_s1)[-1]
    l_reshape2_s1 = ReshapeLayer(
        l_encoder_s1,
        (symbolic_batchsize_s1, symbolic_seqlen_s1, s1_len),
        name='reshape2_s1')
    l_delta_s1 = DeltaLayer(l_reshape2_s1, win, name='delta_s1')

    # s2 images
    l_reshape1_s2 = ReshapeLayer(l_s2, (-1, s2_shape[-1]), name='reshape1_s2')
    l_encoder_s2 = create_pretrained_encoder(
        l_reshape1_s2, s2_weights, s2_biases, s2_shapes, s2_nonlinearities,
        ['fc1_s2', 'fc2_s2', 'fc3_s2', 'bottleneck_s2'])
    s2_len = las.layers.get_output_shape(l_encoder_s2)[-1]
    l_reshape2_s2 = ReshapeLayer(
        l_encoder_s2,
        (symbolic_batchsize_s2, symbolic_seqlen_s2, s2_len),
        name='reshape2_s2')
    l_delta_s2 = DeltaLayer(l_reshape2_s2, win, name='delta_s2')

    # s3 images
    l_reshape1_s3 = ReshapeLayer(l_s3, (-1, s3_shape[-1]), name='reshape1_s3')
    l_encoder_s3 = create_pretrained_encoder(
        l_reshape1_s3, s3_weights, s3_biases, s3_shapes, s3_nonlinearities,
        ['fc1_s3', 'fc2_s3', 'fc3_s3', 'bottleneck_s3'])
    s3_len = las.layers.get_output_shape(l_encoder_s3)[-1]
    l_reshape2_s3 = ReshapeLayer(
        l_encoder_s3,
        (symbolic_batchsize_s3, symbolic_seqlen_s3, s3_len),
        name='reshape2_s3')
    l_delta_s3 = DeltaLayer(l_reshape2_s3, win, name='delta_s3')

    # s4 images
    l_reshape1_s4 = ReshapeLayer(l_s4, (-1, s4_shape[-1]), name='reshape1_s4')
    l_encoder_s4 = create_pretrained_encoder(
        l_reshape1_s4, s4_weights, s4_biases, s4_shapes, s4_nonlinearities,
        ['fc1_s4', 'fc2_s4', 'fc3_s4', 'bottleneck_s4'])
    s4_len = las.layers.get_output_shape(l_encoder_s4)[-1]
    l_reshape2_s4 = ReshapeLayer(
        l_encoder_s4,
        (symbolic_batchsize_s4, symbolic_seqlen_s4, s4_len),
        name='reshape2_s4')
    l_delta_s4 = DeltaLayer(l_reshape2_s4, win, name='delta_s4')

    # s5 images
    l_reshape1_s5 = ReshapeLayer(l_s5, (-1, s5_shape[-1]), name='reshape1_s5')
    l_encoder_s5 = create_pretrained_encoder(
        l_reshape1_s5, s5_weights, s5_biases, s5_shapes, s5_nonlinearities,
        ['fc1_s5', 'fc2_s5', 'fc3_s5', 'bottleneck_s5'])
    s5_len = las.layers.get_output_shape(l_encoder_s5)[-1]
    l_reshape2_s5 = ReshapeLayer(
        l_encoder_s5,
        (symbolic_batchsize_s5, symbolic_seqlen_s5, s5_len),
        name='reshape2_s5')
    l_delta_s5 = DeltaLayer(l_reshape2_s5, win, name='delta_s5')

    if not use_blstm_substream:
        l_lstm_s1 = create_pretrained_lstm(s1_lstm, 'f_lstm', l_delta_s1,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'f_lstm_s1',
                                           use_peepholes)
        l_lstm_s2 = create_pretrained_lstm(s2_lstm, 'f_lstm', l_delta_s2,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'f_lstm_s2',
                                           use_peepholes)
        l_lstm_s3 = create_pretrained_lstm(s3_lstm, 'f_lstm', l_delta_s3,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'f_lstm_s3',
                                           use_peepholes)
        l_lstm_s4 = create_pretrained_lstm(s4_lstm, 'f_lstm', l_delta_s4,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'f_lstm_s4',
                                           use_peepholes)
        l_lstm_s5 = create_pretrained_lstm(s5_lstm, 'f_lstm', l_delta_s5,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'f_lstm_s5',
                                           use_peepholes)
    else:
        # For each substream, combine the forward and backward LSTM outputs
        # by summing.
        f_lstm_s1 = create_pretrained_lstm(s1_lstm, 'f_lstm', l_delta_s1,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'f_lstm_s1',
                                           use_peepholes)
        b_lstm_s1 = create_pretrained_lstm(s1_lstm, 'b_lstm', l_delta_s1,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'b_lstm_s1',
                                           use_peepholes, backwards=True)
        l_lstm_s1 = ElemwiseSumLayer([f_lstm_s1, b_lstm_s1],
                                     name='sum_b_lstm_s1')

        f_lstm_s2 = create_pretrained_lstm(s2_lstm, 'f_lstm', l_delta_s2,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'f_lstm_s2',
                                           use_peepholes)
        b_lstm_s2 = create_pretrained_lstm(s2_lstm, 'b_lstm', l_delta_s2,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'b_lstm_s2',
                                           use_peepholes, backwards=True)
        l_lstm_s2 = ElemwiseSumLayer([f_lstm_s2, b_lstm_s2],
                                     name='sum_b_lstm_s2')

        f_lstm_s3 = create_pretrained_lstm(s3_lstm, 'f_lstm', l_delta_s3,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'f_lstm_s3',
                                           use_peepholes)
        b_lstm_s3 = create_pretrained_lstm(s3_lstm, 'b_lstm', l_delta_s3,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'b_lstm_s3',
                                           use_peepholes, backwards=True)
        l_lstm_s3 = ElemwiseSumLayer([f_lstm_s3, b_lstm_s3],
                                     name='sum_b_lstm_s3')

        f_lstm_s4 = create_pretrained_lstm(s4_lstm, 'f_lstm', l_delta_s4,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'f_lstm_s4',
                                           use_peepholes)
        b_lstm_s4 = create_pretrained_lstm(s4_lstm, 'b_lstm', l_delta_s4,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'b_lstm_s4',
                                           use_peepholes, backwards=True)
        l_lstm_s4 = ElemwiseSumLayer([f_lstm_s4, b_lstm_s4],
                                     name='sum_b_lstm_s4')

        f_lstm_s5 = create_pretrained_lstm(s5_lstm, 'f_lstm', l_delta_s5,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'f_lstm_s5',
                                           use_peepholes)
        b_lstm_s5 = create_pretrained_lstm(s5_lstm, 'b_lstm', l_delta_s5,
                                           l_mask, lstm_size, cell_parameters,
                                           gate_parameters, 'b_lstm_s5',
                                           use_peepholes, backwards=True)
        l_lstm_s5 = ElemwiseSumLayer([f_lstm_s5, b_lstm_s5],
                                     name='sum_b_lstm_s5')

    # Fuse the five substream outputs.
    # Merge layers take in lists of layers to merge as input.
    if fusiontype == 'adasum':
        l_fuse = AdaptiveElemwiseSumLayer(
            [l_lstm_s1, l_lstm_s2, l_lstm_s3, l_lstm_s4, l_lstm_s5],
            name='adasum1')
    elif fusiontype == 'sum':
        l_fuse = ElemwiseSumLayer(
            [l_lstm_s1, l_lstm_s2, l_lstm_s3, l_lstm_s4, l_lstm_s5],
            name='sum1')
    elif fusiontype == 'concat':
        l_fuse = ConcatLayer(
            [l_lstm_s1, l_lstm_s2, l_lstm_s3, l_lstm_s4, l_lstm_s5],
            axis=-1, name='concat')

    f_lstm_agg, b_lstm_agg = create_blstm(l_fuse, l_mask, lstm_size,
                                          cell_parameters, gate_parameters,
                                          'lstm_agg')
    l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2')

    # reshape to (num_examples * seq_len, lstm_size)
    l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size), name='reshape3')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes as the output size.
    l_softmax = DenseLayer(
        l_reshape3, num_units=output_classes,
        nonlinearity=las.nonlinearities.softmax, name='softmax')
    l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_s1, output_classes),
                         name='output')

    return l_out, l_fuse

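# Hypothetical inference sketch, not part of the original source: compiling a
# deterministic prediction function for the multi-stream model above and taking
# the argmax class per timestep. stream_vars is assumed to be the list of
# s1_var..s5_var Theano variables, and mask_var and win the same variables
# passed into create_pretrained_model.
def _example_compile_predict_fn(network, stream_vars, mask_var, win):
    import theano
    import theano.tensor as T
    from lasagne.layers import get_output

    # deterministic=True disables the dropout layers at inference time.
    predictions = get_output(network, deterministic=True)
    predicted_classes = T.argmax(predictions, axis=-1)  # (batch, seq_len)
    return theano.function(list(stream_vars) + [mask_var, win],
                           predicted_classes, allow_input_downcast=True)
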
def load_saved_model(model_path, stream_params, input_shape, input_var,
                     mask_shape, mask_var, lstm_size=250,
                     win=T.iscalar('theta'), output_classes=26,
                     w_init_fn=GlorotUniform(), use_peepholes=False,
                     use_blstm=True):
    """
    Loads a saved model.

    :param model_path: path to model parameters
    :param stream_params: stream parameters as a tuple of
        ([layer 1 dimension, ..., layer N dimension],
         [layer 1 nonlinearity, ..., layer N nonlinearity])
    :param input_shape: input shape, e.g. (None, None, 1500)
    :param input_var: input theano variable
    :param mask_shape: mask shape, e.g. (None, None) for variable lengths
    :param mask_var: mask theano variable
    :param lstm_size: number of lstm units per lstm layer
    :param win: window theano variable
    :param output_classes: number of output classes
    :param w_init_fn: weight initialization function used for initializing model
    :param use_peepholes: use peepholes for lstm layers
    :param use_blstm: use a bidirectional lstm instead of a unidirectional one
    :return: saved model
    """
    shapes, nonlinearities = stream_params

    gate_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn, b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn,
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_in = InputLayer(input_shape, input_var, 'input')
    l_mask = InputLayer(mask_shape, mask_var, 'mask')

    symbolic_batchsize = l_in.input_var.shape[0]
    symbolic_seqlen = l_in.input_var.shape[1]

    l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1')
    l_encoder = create_encoder(l_reshape1, shapes, nonlinearities,
                               ['fc1', 'fc2', 'fc3', 'bottleneck'])
    encoder_len = las.layers.get_output_shape(l_encoder)[-1]
    l_reshape2 = ReshapeLayer(
        l_encoder,
        (symbolic_batchsize, symbolic_seqlen, encoder_len),
        name='reshape2')
    l_delta = DeltaLayer(l_reshape2, win, name='delta')

    if use_blstm:
        l_lstm, l_lstm_back = create_blstm(l_delta, l_mask, lstm_size,
                                           cell_parameters, gate_parameters,
                                           'lstm', use_peepholes)

        # We'll combine the forward and backward layer output by summing.
        # Merge layers take in lists of layers to merge as input.
        l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1')

        # reshape, flatten to 2 dimensions to run softmax on all timesteps
        l_reshape3 = ReshapeLayer(l_sum1, (-1, lstm_size), name='reshape3')
    else:
        l_lstm = create_lstm(l_delta, l_mask, lstm_size, cell_parameters,
                             gate_parameters, 'lstm', use_peepholes)
        l_reshape3 = ReshapeLayer(l_lstm, (-1, lstm_size), name='reshape3')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes as the output size.
    l_softmax = DenseLayer(
        l_reshape3, num_units=output_classes,
        nonlinearity=las.nonlinearities.softmax, name='softmax')
    l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes),
                         name='output')

    load_model_params(l_out, model_path)
    return l_out

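# Hypothetical usage sketch, not part of the original source: restoring a
# trained single-stream model from disk with load_saved_model above. The file
# path, input dimensionality and stream nonlinearities are illustrative
# assumptions; the 2000-1000-500-50 layer sizes follow the encoder used
# elsewhere in this code.
def _example_restore_model():
    import theano.tensor as T
    input_var = T.tensor3('input')
    mask_var = T.matrix('mask')
    window = T.iscalar('theta')
    stream_params = ([2000, 1000, 500, 50],
                     [rectify, rectify, rectify, linear])
    network = load_saved_model('models/stream_model.pkl',  # hypothetical path
                               stream_params,
                               (None, None, 1500), input_var,
                               (None, None), mask_var,
                               lstm_size=250, win=window, output_classes=26)
    return network
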
def create_pretrained_substream(weights, biases, input_shape, input_var,
                                mask_shape, mask_var, name, lstm_size=250,
                                win=T.iscalar('theta'), nonlinearity=rectify,
                                w_init_fn=las.init.Orthogonal(),
                                use_peepholes=True):
    gate_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn, b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn,
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_input = InputLayer(input_shape, input_var, 'input_' + name)
    l_mask = InputLayer(mask_shape, mask_var, 'mask')

    symbolic_batchsize_raw = l_input.input_var.shape[0]
    symbolic_seqlen_raw = l_input.input_var.shape[1]

    l_reshape1_raw = ReshapeLayer(l_input, (-1, input_shape[-1]),
                                  name='reshape1_' + name)
    l_encoder_raw = create_pretrained_encoder(
        l_reshape1_raw, weights, biases,
        [2000, 1000, 500, 50],
        [nonlinearity, nonlinearity, nonlinearity, linear],
        ['fc1_' + name, 'fc2_' + name, 'fc3_' + name, 'bottleneck_' + name])
    input_len = las.layers.get_output_shape(l_encoder_raw)[-1]
    l_reshape2 = ReshapeLayer(
        l_encoder_raw,
        (symbolic_batchsize_raw, symbolic_seqlen_raw, input_len),
        name='reshape2_' + name)
    l_delta = DeltaLayer(l_reshape2, win, name='delta_' + name)

    l_lstm = LSTMLayer(
        l_delta, int(lstm_size), peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='lstm_' + name)

    return l_lstm

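# Hypothetical usage sketch, not part of the original source: building two
# substreams with create_pretrained_substream above and fusing them by
# concatenation, mirroring the fusion options used in the multi-stream models
# in this code. The weight/bias lists are assumed to match the 2000-1000-500-50
# encoder, and the shapes and variable names are illustrative; note that every
# substream shares the same mask variable.
def _example_two_substreams(s1_weights, s1_biases, s2_weights, s2_biases):
    import theano.tensor as T
    s1_var = T.tensor3('s1_im')
    s2_var = T.tensor3('s2_im')
    mask_var = T.matrix('mask')   # one mask shared by every substream
    win = T.iscalar('theta')

    l_s1 = create_pretrained_substream(s1_weights, s1_biases,
                                       (None, None, 1500), s1_var,
                                       (None, None), mask_var, 's1', win=win)
    l_s2 = create_pretrained_substream(s2_weights, s2_biases,
                                       (None, None, 1500), s2_var,
                                       (None, None), mask_var, 's2', win=win)
    return ConcatLayer([l_s1, l_s2], axis=-1, name='concat_fuse')
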