Пример #1
0
def create_model(input_dim):
    row = sequence.input_variable(shape=input_dim)
    col = sequence.input_variable(shape=input_dim)
    rowh = Sequential([Embedding(opt.embed), Stabilizer(), Dropout(opt.dropout)])(row)
    colh = Sequential([Embedding(opt.embed), Stabilizer(), Dropout(opt.dropout)])(col)

    x = C.splice(rowh, colh, axis=-1)
    x = lightlstm(opt.embed, opt.nhid)(x)
    x = For(range(opt.layer-1), lambda: lightlstm(opt.nhid, opt.nhid))(x)
    rowh = C.slice(x, -1, opt.nhid * 0, opt.nhid * 1)
    colh = C.slice(x, -1, opt.nhid * 1, opt.nhid * 2)

    row_predict = Sequential([Dropout(opt.dropout), Dense(input_dim)])(rowh)
    col_predict = Sequential([Dropout(opt.dropout), Dense(input_dim)])(colh)

    # variable : row label and col label
    row_label = sequence.input_variable(shape=input_dim)
    col_label = sequence.input_variable(shape=input_dim)
    model = C.combine([row_predict, col_predict])

    return {'row':       row,
            'col':       col,
            'row_label': row_label,
            'col_label': col_label,
            'model':     model}
Пример #2
0
def create_model(input_sequence, label_sequence, vocab_dim, hidden_dim):
    # Create the rnn that computes the latent representation for the next token.
    rnn_with_latent_output = Sequential([
        C.layers.Embedding(hidden_dim),
        For(
            range(num_layers), lambda: Sequential([
                Stabilizer(),
                Recurrence(LSTM(hidden_dim), go_backwards=False)
            ])),
    ])

    # Apply it to the input sequence.
    latent_vector = rnn_with_latent_output(input_sequence)

    # Connect the latent output to (sampled/full) softmax.
    if use_sampled_softmax:
        weights = load_sampling_weights(token_frequencies_file_path)
        smoothed_weights = np.float32(np.power(weights, alpha))
        sampling_weights = C.reshape(C.Constant(smoothed_weights),
                                     shape=(1, vocab_dim))
        z, ce, errs = cross_entropy_with_sampled_softmax(
            latent_vector, label_sequence, vocab_dim, hidden_dim,
            softmax_sample_size, sampling_weights)
    else:
        z, ce, errs = cross_entropy_with_full_softmax(latent_vector,
                                                      label_sequence,
                                                      vocab_dim, hidden_dim)

    return z, ce, errs
Пример #3
0
def create_model(output_dim):

    return Sequential([
        For(
            range(num_layers), lambda: Sequential([
                Stabilizer(),
                Recurrence(LSTM(hidden_dim), go_backwards=False)
            ])),
        Dense(output_dim)
    ])
Пример #4
0
def test_layers_stabilizer():
    y = input(4)
    p = Stabilizer()(y)

    dat = np.array([[1.0,2.0,3.0,4.0]], dtype=np.float32)
    res = p(y).eval({y: dat})

    # a stabilizer starts with having no effect, hence input=output
    np.testing.assert_array_almost_equal(res, dat, decimal=6, \
        err_msg="Error in layer normalization computation")
Пример #5
0
    X = np.eye(vocab_size, dtype=np.float32)[xi]
    Y = np.eye(vocab_size, dtype=np.float32)[yi]

    return [X], [Y]


get_sample(0)

input_sequence = sequence.input_variable(shape=vocab_size)
label_sequence = sequence.input_variable(shape=vocab_size)

model = Sequential([
    For(
        range(2), lambda: Sequential(
            [Stabilizer(),
             Recurrence(LSTM(256), go_backwards=False)])),
    Dense(vocab_size)
])

z = model(input_sequence)
z_sm = cntk.softmax(z)

ce = cross_entropy_with_softmax(z, label_sequence)
errs = classification_error(z, label_sequence)

lr_per_sample = learning_rate_schedule(0.001, UnitType.sample)
momentum_time_constant = momentum_as_time_constant_schedule(1100)
clipping_threshold_per_sample = 5.0
gradient_clipping_with_truncation = True
learner = momentum_sgd(
Пример #6
0
    X = np.eye(vocab_size, dtype=np.float32)[xi]
    Y = np.eye(vocab_size, dtype=np.float32)[yi]

    return [X], [Y]
sample(0)


input_seq_axis = Axis('inputAxis')
input_sequence = sequence.input_variable(shape=vocab_size, sequence_axis=input_seq_axis)
label_sequence = sequence.input_variable(shape=vocab_size, sequence_axis=input_seq_axis)

# model = Sequential([Dense(300),Dense(vocab_size)])

model = Sequential([
        For(range(2), lambda:
                   Sequential([Stabilizer(), Recurrence(LSTM(256), go_backwards=False)])),
        Dense(vocab_size)])

z = model(input_sequence)

ce = cross_entropy_with_softmax(z, label_sequence)
errs = classification_error(z, label_sequence)

lr_per_sample = learning_rate_schedule(0.001, UnitType.sample)
momentum_time_constant = momentum_as_time_constant_schedule(1100)
clipping_threshold_per_sample = 5.0
gradient_clipping_with_truncation = True
learner = momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant,
                    gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                    gradient_clipping_with_truncation=gradient_clipping_with_truncation)
progress_printer = ProgressPrinter(freq=100, tag='Training')
Пример #7
0
def _RecurrentBlock(type, shape, cell_shape, activation, use_peepholes,
                    init, init_bias,
                    enable_self_stabilization, dropout_rate, seed,
                    name=''):
    '''
    Helper to create a recurrent block of type 'WeightDroppedLSTM', 'GRU', or RNNStep.
    '''

    has_projection = cell_shape is not None

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError("%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)
        # otherwise we'd need to fix slicing and Param initializers

    stack_axis = -1  # for efficient computation, we stack multiple variables (along the fastest-changing one, to match BS)
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[stack_axis]
    cell_shape_list[stack_axis] = stacked_dim * {
        'IndRNN': 1,
        'IndyLSTM': 4,
        'WeightDroppedLSTM': 4
    }[type]
    cell_shape_stacked = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times
    cell_shape_list[stack_axis] = stacked_dim * {
        'IndRNN': 1,
        'IndyLSTM': 4,
        'WeightDroppedLSTM': 4
    }[type]
    cell_shape_stacked_H = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times

    # parameters
    b  = Parameter(            cell_shape_stacked,   init=init_bias, name='b')                              # bias
    W  = Parameter(_INFERRED + cell_shape_stacked,   init=init,      name='W')                              # input
    H  = Parameter(shape     + cell_shape_stacked_H, init=init,      name='H')                              # hidden-to-hidden
    H1 = Parameter(            cell_shape_stacked_H, init=init,      name='H1') if type == 'IndyLSTM' else None  # hidden-to-hidden
    H2 = Parameter(shape                 ,           init=init,      name='H2') if type == 'IndRNN' else None  # hidden-to-hidden
    Ci = Parameter(            cell_shape,           init=init,      name='Ci') if use_peepholes else None  # cell-to-hiddden {note: applied elementwise}
    Cf = Parameter(            cell_shape,           init=init,      name='Cf') if use_peepholes else None  # cell-to-hiddden {note: applied elementwise}
    Co = Parameter(            cell_shape,           init=init,      name='Co') if use_peepholes else None  # cell-to-hiddden {note: applied elementwise}

    Wmr = Parameter(cell_shape + shape, init=init, name='P') if has_projection else None  # final projection

    # each use of a stabilizer layer must get its own instance
    Sdh = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='dh_stabilizer')
    Sdc = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='dc_stabilizer')
    Sct = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='c_stabilizer')
    Sht = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='P_stabilizer')

    # DropConnect
    dropout = C.layers.Dropout(dropout_rate=dropout_rate, seed=seed, name='h_dropout')

    # define the model function itself
    # general interface for Recurrence():
    #   (all previous outputs delayed, input) --> (outputs and state)
    # where
    #  - the first output is the main output, e.g. 'h' for LSTM
    #  - the remaining outputs, if any, are additional state
    #  - if for some reason output != state, then output is still fed back and should just be ignored by the recurrent block

    # LSTM model function
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def weight_dropped_lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dhs, dropout(H))

        it_proj  = slice (proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
        bit_proj = slice (proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ft_proj  = slice (proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
        ot_proj  = slice (proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid(peep(it_proj, dcs, Ci))  # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)  # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))  # forget-me-not gate(t)
        bft = ft * dc  # applied to cell(t-1)

        ct = bft + bit  # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)  # applied to tanh(cell(t))

        c = ct  # cell value
        h = times(Sht(ht), Wmr) if has_projection else ht

        return h, c

    # LSTM model function
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def indy_lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + C.splice(dhs, dhs, dhs, dhs) * H1  # 4 is the number of stacked dim

        it_proj  = slice (proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
        bit_proj = slice (proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ft_proj  = slice (proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
        ot_proj  = slice (proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid(peep(it_proj, dcs, Ci))  # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)  # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))  # forget-me-not gate(t)
        bft = ft * dc  # applied to cell(t-1)

        ct = bft + bit  # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)  # applied to tanh(cell(t))

        c = ct  # cell value
        h = times(Sht(ht), Wmr) if has_projection else ht

        return h, c

    def ind_rnn(dh, x):
        dhs = Sdh(dh)  # previous value, stabilized
        ht = activation(times(x, W) + dhs * H2 + b)
        h = times(Sht(ht), Wmr) if has_projection else ht
        return h

    function = {
        'IndRNN': ind_rnn,
        'IndyLSTM': indy_lstm,
        'WeightDroppedLSTM':    weight_dropped_lstm
    }[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)