Exemplo n.º 1
0
    def gru(dh, x):

        dhs = Sdh(dh)  # previous value, stabilized
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        projx3 = b + times(x, W)
        projh2  = times(dhs, H)

        zt_proj = slice (projx3, stack_axis, 0*stacked_dim, 1*stacked_dim) + slice (projh2, stack_axis, 0*stacked_dim, 1*stacked_dim)
        rt_proj = slice (projx3, stack_axis, 1*stacked_dim, 2*stacked_dim) + slice (projh2, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ct_proj = slice (projx3, stack_axis, 2*stacked_dim, 3*stacked_dim)

        zt = sigmoid (zt_proj)        # update gate z(t)

        rt = sigmoid (rt_proj)        # reset gate r(t)

        rs = dhs * rt        # "cell" c
        ct = activation (ct_proj + times(rs, H1))

        ht = (1 - zt) * ct + zt * dhs # hidden state ht / output

        # for comparison: CUDNN_GRU
        # i(t) = sigmoid(W_i x(t) +          R_i h(t-1)  + b_Wi + b_Ru)
        # r(t) = sigmoid(W_r x(t) +          R_r h(t-1)  + b_Wr + b_Rr)   --same up to here
        # h'(t) =   tanh(W_h x(t) + r(t) .* (R_h h(t-1)) + b_Wh + b_Rh)   --r applied after projection? Would make life easier!
        # h(t) = (1 - i(t) .* h'(t)) + i(t) .* h(t-1)                     --TODO: need to confirm bracketing with NVIDIA

        h = times(Sht(ht), Wmr) if has_projection else \
            ht

        # returns the new state as a tuple with names but order matters
        return Function.NamedOutput(h=h)
Exemplo n.º 2
0
    def lstm(dh, dc, x):
        # projected contribution from input(s), hidden, and bias

        dropped_H = dropout(H) if weight_drop_rate is not None else H
        proj4 = b + times(x, W) + times(dh, dropped_H)

        # slicing layout different from cntk's implementation
        it_proj = slice(proj4, stack_axis, 0 * stacked_dim,
                        1 * stacked_dim)  # split along stack_axis
        ft_proj = slice(proj4, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        bit_proj = slice(proj4, stack_axis, 2 * stacked_dim,
                         3 * stacked_dim)  # g gate
        ot_proj = slice(proj4, stack_axis, 3 * stacked_dim, 4 * stacked_dim)

        it = sigmoid(it_proj)  # input gate(t)
        bit = it * activation(bit_proj)  # applied to tanh of input network

        ft = sigmoid(ft_proj)  # forget-me-not gate(t)
        bft = ft * dc  # applied to cell(t-1)

        ct = bft + bit  # c(t) is sum of both

        ot = sigmoid(ot_proj)  # output gate(t)
        ht = ot * activation(ct)  # applied to tanh(cell(t))
        return ht, ct
Exemplo n.º 3
0
    def weight_dropped_lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dhs, dropout(H))

        it_proj  = slice (proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
        bit_proj = slice (proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ft_proj  = slice (proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
        ot_proj  = slice (proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid(peep(it_proj, dcs, Ci))  # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)  # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))  # forget-me-not gate(t)
        bft = ft * dc  # applied to cell(t-1)

        ct = bft + bit  # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)  # applied to tanh(cell(t))

        c = ct  # cell value
        h = times(Sht(ht), Wmr) if has_projection else ht

        return h, c
Exemplo n.º 4
0
    def gru(dh, x):

        dhs = Sdh(dh)  # previous value, stabilized
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        projx3 = b + times(x, W)
        projh2  = times(dhs, H)

        zt_proj = slice (projx3, stack_axis, 0*stacked_dim, 1*stacked_dim) + slice (projh2, stack_axis, 0*stacked_dim, 1*stacked_dim)
        rt_proj = slice (projx3, stack_axis, 1*stacked_dim, 2*stacked_dim) + slice (projh2, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ct_proj = slice (projx3, stack_axis, 2*stacked_dim, 3*stacked_dim)

        zt = sigmoid (zt_proj)        # update gate z(t)

        rt = sigmoid (rt_proj)        # reset gate r(t)

        rs = dhs * rt        # "cell" c
        ct = activation (ct_proj + times(rs, H1))

        ht = (1 - zt) * ct + zt * dhs # hidden state ht / output

        # for comparison: CUDNN_GRU
        # i(t) = sigmoid(W_i x(t) +          R_i h(t-1)  + b_Wi + b_Ru)
        # r(t) = sigmoid(W_r x(t) +          R_r h(t-1)  + b_Wr + b_Rr)   --same up to here
        # h'(t) =   tanh(W_h x(t) + r(t) .* (R_h h(t-1)) + b_Wh + b_Rh)   --r applied after projection? Would make life easier!
        # h(t) = (1 - i(t) .* h'(t)) + i(t) .* h(t-1)                     --TODO: need to confirm bracketing with NVIDIA

        h = times(Sht(ht), Wmr) if has_projection else \
            ht

        # returns the new state as a tuple with names but order matters
        return Function.NamedOutput(h=h)
Exemplo n.º 5
0
    def lstm(dh, dc, sv, x):

        # projected contribution from input(s), hidden, and bias
        proj3 = b + times(x, W) + times(dh, H) + times(sv, Hsv)

        it_proj = slice(proj3, stack_axis, 0 * stacked_dim, 1 * stacked_dim)
        ft_proj = slice(proj3, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        ot_proj = slice(proj3, stack_axis, 2 * stacked_dim, 3 * stacked_dim)

        it = sigmoid(it_proj)  # input gate(t)
        ft = sigmoid(ft_proj)  # forget-me-not gate(t)
        ot = sigmoid(ot_proj)  # output gate(t)

        # the following is reading gate
        proj3rg = sigmoid(
            times(x, Wrg) + times(dh, Hrg) + times(sv, Hsvrg) + brg)
        v = proj3rg * sv

        cx_t = tanh(times(x, Wcx) + times(dh, Hcx))

        # need to do stablization ??
        # update memory cell
        c = it * cx_t + ft * dc + tanh(times(v, Wfc))

        h = ot * tanh(c)

        return (h, c, v)
Exemplo n.º 6
0
def predict(model, params):
    """
  Compute the prediction result of the given model
  """
    model_args = {arg.name: arg for arg in model.arguments}
    context = model_args['context']
    entity_ids_mask = model_args['entity_ids_mask']
    entity_condition = greater(entity_ids_mask, 0, name='condidion')
    # Get all the enities in the paragraph via gather operator, which will create a new dynamic sequence axis
    entities_all = sequence.gather(entity_condition,
                                   entity_condition,
                                   name='entities_all')
    # The generated dynamic axis has the same length as the input enity id sequence,
    # so we asign it as the entity id's dynamic axis.
    entity_ids = input(shape=(params.entity_dim),
                       is_sparse=True,
                       dynamic_axes=entities_all.dynamic_axes,
                       name='entity_ids')
    wordvocab_dim = params.vocab_dim
    answers = sequence.scatter(sequence.gather(model.outputs[-1],
                                               entity_condition),
                               entities_all,
                               name='Final_Ans')
    entity_id_matrix = ops.slice(ops.reshape(entity_ids, params.entity_dim),
                                 -1, 1, params.entity_dim)
    expand_pred = sequence.reduce_sum(element_times(answers, entity_id_matrix))
    pred_max = ops.hardmax(expand_pred, name='pred_max')
    return pred_max
Exemplo n.º 7
0
 def _convolution(x):
     if group == 1:
         apply_x = _conv_ops(w, x)
     else:
         groups_data = [ops.slice(x, axis=0, begin_index=i * sub_input_channels,
                                  end_index=(i + 1) * sub_input_channels) for i in range(0, group)]
         apply_sub = [_conv_ops(group_kernel, group_data)
                      for group_kernel, group_data in zip(groups_kernel, groups_data)]
         apply_x = ops.splice(*apply_sub, axis=0)
     if bias_init is not None:
         apply_x += b
     return apply_x
Exemplo n.º 8
0
    def lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dhs, H)

        it_proj  = slice (proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
        bit_proj = slice (proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ft_proj  = slice (proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
        ot_proj  = slice (proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid (peep (it_proj, dcs, Ci))        # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation (bit_proj)              # applied to tanh of input network

        ft = sigmoid (peep (ft_proj, dcs, Cf))        # forget-me-not gate(t)
        bft = ft * dc                                 # applied to cell(t-1)

        ct = bft + bit                                # c(t) is sum of both

        ot = sigmoid (peep (ot_proj, Sct(ct), Co))    # output gate(t)
        ht = ot * activation (ct)                     # applied to tanh(cell(t))

        c = ct                                        # cell value
        h = times(Sht(ht), Wmr) if has_projection else \
            ht

        # returns the new state as a tuple with names but order matters
        return (Function.NamedOutput(h=h), Function.NamedOutput(c=c))
Exemplo n.º 9
0
 def _convolution(x):
     if group == 1:
         apply_x = _conv_ops(w, x)
     else:
         groups_data = [
             ops.slice(x,
                       axis=0,
                       begin_index=i * sub_input_channels,
                       end_index=(i + 1) * sub_input_channels)
             for i in range(0, group)
         ]
         apply_sub = [
             _conv_ops(group_kernel, group_data)
             for group_kernel, group_data in zip(
                 groups_kernel, groups_data)
         ]
         apply_x = ops.splice(*apply_sub, axis=0)
     if bias_init is not None:
         apply_x += b
     return apply_x
Exemplo n.º 10
0
def sequence_to_sequence_translator(debug_output=False):

    input_vocab_dim = 69
    label_vocab_dim = 69

    hidden_dim = 512
    num_layers = 2

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis("inputAxis")
    label_seq_axis = Axis("labelAxis")

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes)

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes)

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1, 0)
    label_sentence_start = sequence.first(raw_labels)

    is_first_label = sequence.is_first(label_sequence)
    label_sentence_start_scattered = sequence.scatter(label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output(), hidden_dim, hidden_dim, future_value, future_value
        )

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC, label_sequence)

    # Decoder
    decoder_history_from_ground_truth = label_sequence
    decoder_input = element_select(
        is_first_label, label_sentence_start_scattered, past_value(decoder_history_from_ground_truth)
    )

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if i > 0:
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output(), hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC
        )

    decoder_output = decoder_outputH
    decoder_dim = hidden_dim

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # Instantiate the trainer object to drive the model training
    lr = 0.007
    momentum_time_constant = 1100
    momentum_per_sample = momentums_per_sample(math.exp(-1.0 / momentum_time_constant))
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True

    trainer = Trainer(
        z,
        ce,
        errs,
        [
            momentum_sgd(
                z.parameters(),
                lr,
                momentum_per_sample,
                clipping_threshold_per_sample,
                gradient_clipping_with_truncation,
            )
        ],
    )

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
    feature_stream_name = "features"
    labels_stream_name = "labels"

    mb_source = text_format_minibatch_source(
        path,
        [
            StreamConfiguration(feature_stream_name, input_vocab_dim, True, "S0"),
            StreamConfiguration(labels_stream_name, label_vocab_dim, True, "S1"),
        ],
        10000,
    )
    features_si = mb_source[feature_stream_name]
    labels_si = mb_source[labels_stream_name]

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 72
    training_progress_output_freq = 30
    if debug_output:
        training_progress_output_freq = training_progress_output_freq / 3

    while True:
        mb = mb_source.get_next_minibatch(minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be trained with
        arguments = {raw_input: mb[features_si], raw_labels: mb[labels_si]}
        trainer.train_minibatch(arguments)

        print_training_progress(trainer, i, training_progress_output_freq)
        i += 1

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.test.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

    test_mb_source = text_format_minibatch_source(
        path,
        [
            StreamConfiguration(feature_stream_name, input_vocab_dim, True, "S0"),
            StreamConfiguration(labels_stream_name, label_vocab_dim, True, "S1"),
        ],
        10000,
        False,
    )
    features_si = test_mb_source[feature_stream_name]
    labels_si = test_mb_source[labels_stream_name]

    # choose this to be big enough for the longest sentence
    train_minibatch_size = 1024

    # Get minibatches of sequences to test and perform testing
    i = 0
    total_error = 0.0
    while True:
        mb = test_mb_source.get_next_minibatch(train_minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be tested with
        arguments = {raw_input: mb[features_si], raw_labels: mb[labels_si]}
        mb_error = trainer.test_minibatch(arguments)

        total_error += mb_error

        if debug_output:
            print("Minibatch {}, Error {} ".format(i, mb_error))

        i += 1

    # Average of evaluation errors of all test minibatches
    return total_error / i
Exemplo n.º 11
0
def LSTM(shape, cell_shape=None, use_peepholes=use_peepholes_default_or_False,
         init=init_default_or_glorot_uniform, init_bias=init_bias_default_or_0,
         enable_self_stabilization=enable_self_stabilization_default_or_False): # (x, (h, c))

    use_peepholes             = use_peepholes             if _is_given(use_peepholes)             else _current_default_options.use_peepholes
    enable_self_stabilization = enable_self_stabilization if _is_given(enable_self_stabilization) else _current_default_options.enable_self_stabilization
    has_projection = cell_shape is not None
    has_aux = False

    if has_aux:
        UntestedBranchError("LSTM, has_aux option")

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError("LSTM: shape and cell_shape must be vectors (rank-1 tensors)")
        # otherwise we'd need to fix slicing and Param initializers

    stack_axis = -1  # stacking along the fastest-changing one, to match BS
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[0]
    cell_shape_list[stack_axis] = stacked_dim*4
    cell_shape_stacked = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times

    # parameters
    b  = Parameter(            cell_shape_stacked, init=init_bias, name='b')                              # a bias
    W  = Parameter(_INFERRED + cell_shape_stacked, init=init,      name='W')                              # input
    A  = Parameter(_INFERRED + cell_shape_stacked, init=init,      name='A') if has_aux else None         # aux input (optional)
    H  = Parameter(shape     + cell_shape_stacked, init=init,      name='H')                              # hidden-to-hidden
    Ci = Parameter(            cell_shape,         init=init,      name='Ci') if use_peepholes else None  # cell-to-hiddden {note: applied elementwise}
    Cf = Parameter(            cell_shape,         init=init,      name='Cf') if use_peepholes else None  # cell-to-hiddden {note: applied elementwise}
    Co = Parameter(            cell_shape,         init=init,      name='Co') if use_peepholes else None  # cell-to-hiddden {note: applied elementwise}

    Wmr = Parameter(cell_shape + shape, init=init) if has_projection else None  # final projection

    Sdh = Stabilizer() if enable_self_stabilization else identity
    Sdc = Stabilizer() if enable_self_stabilization else identity
    Sct = Stabilizer() if enable_self_stabilization else identity
    Sht = Stabilizer() if enable_self_stabilization else identity

    def create_hc_placeholder():
        # we pass the known dimensions here, which makes dimension inference easier
        return (Placeholder(shape=shape, name='hPh'), Placeholder(shape=cell_shape, name='cPh')) # (h, c)

    # parameters to model function
    x = Placeholder(name='lstm_block_arg')
    prev_state = create_hc_placeholder()

    # formula of model function
    dh, dc = prev_state

    dhs = Sdh(dh)  # previous values, stabilized
    dcs = Sdc(dc)
    # note: input does not get a stabilizer here, user is meant to do that outside

    # projected contribution from input(s), hidden, and bias
    proj4 = b + times(x, W) + times(dhs, H) + times(aux, A) if has_aux else \
            b + times(x, W) + times(dhs, H)

    it_proj  = slice (proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
    bit_proj = slice (proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
    ft_proj  = slice (proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
    ot_proj  = slice (proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

    # add peephole connection if requested
    def peep(x, c, C):
        return x + C * c if use_peepholes else x

    it = sigmoid (peep (it_proj, dcs, Ci))        # input gate(t)
    bit = it * tanh (bit_proj)                    # applied to tanh of input network

    ft = sigmoid (peep (ft_proj, dcs, Cf))        # forget-me-not gate(t)
    bft = ft * dc                                 # applied to cell(t-1)

    ct = bft + bit                                # c(t) is sum of both

    ot = sigmoid (peep (ot_proj, Sct(ct), Co))    # output gate(t)
    ht = ot * tanh (ct)                           # applied to tanh(cell(t))

    c = ct                                        # cell value
    h = times(Sht(ht), Wmr) if has_projection else \
        ht

    _name_node(h, 'h')
    if _trace_layers:
        _log_node(h)  # this looks right
    _name_node(c, 'c')

    # TODO: figure out how to do scoping, and also rename all the apply... to expression
    apply_x_h_c = combine ([h, c])
    # return to caller a helper function to create placeholders for recurrence
    # Note that this function will only exist in the object returned here, but not any cloned version of it.
    apply_x_h_c.create_placeholder = create_hc_placeholder
    #return Block(apply_x_h_c, 'LSTM') # BUGBUG: fails with "RuntimeError: A Function instance with more than one output cannot be implicitly converted to a Variable"
    return apply_x_h_c
Exemplo n.º 12
0
def train_sequence_to_sequence_translator():

    input_vocab_dim = 69
    label_vocab_dim = 69

    hidden_dim = 512
    num_layers = 2

    # Source and target inputs to the model
    input_dynamic_axes = [ Axis('inputAxis'), Axis.default_batch_axis() ]
    raw_input = input_variable(shape=(input_vocab_dim), dynamic_axes = input_dynamic_axes)

    label_dynamic_axes = [ Axis('labelAxis'), Axis.default_batch_axis() ]
    raw_labels = input_variable(shape=(label_vocab_dim), dynamic_axes = label_dynamic_axes)

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_dynamic_axes[0], 1, 0)
    label_sentence_start = sequence.first(raw_labels)

    is_first_label = sequence.is_first(label_sequence)
    label_sentence_start_scattered = sequence.scatter(label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(encoder_outputH, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC, label_sequence)
    
    # Decoder
    decoder_history_from_ground_truth = label_sequence
    decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(decoder_history_from_ground_truth))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i == 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(decoder_outputH, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH
    decoder_dim = hidden_dim

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    mb_source = text_format_minibatch_source(path, [ 
                    StreamConfiguration( feature_stream_name, input_vocab_dim, True, 'S0' ), 
                    StreamConfiguration( labels_stream_name, label_vocab_dim, True, 'S1') ], 10000)
    features_si = mb_source.stream_info(feature_stream_name)
    labels_si = mb_source.stream_info(labels_stream_name)

    # Instantiate the trainer object to drive the model training
    lr = learning_rates_per_sample(0.007)
    momentum_time_constant = 1100
    momentum_per_sample = momentums_per_sample(math.exp(-1.0 / momentum_time_constant))
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True

    trainer = Trainer(z, ce, errs, [momentum_sgd_learner(z.owner.parameters(), lr, momentum_per_sample, clipping_threshold_per_sample, gradient_clipping_with_truncation)])                   

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 72
    training_progress_output_freq = 10
    while True:
        mb = mb_source.get_next_minibatch(minibatch_size)
        if  len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual minibatch data to be trained with
        arguments = {raw_input : mb[features_si].m_data, raw_labels : mb[labels_si].m_data}
        trainer.train_minibatch(arguments)

        print_training_progress(trainer, i, training_progress_output_freq)

        i += 1
Exemplo n.º 13
0
def sequence_to_sequence_translator(debug_output=False, run_test=False):

    input_vocab_dim = 69
    label_vocab_dim = 69

    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim),
                               dynamic_axes=input_dynamic_axes,
                               name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(shape=(label_vocab_dim),
                                dynamic_axes=label_dynamic_axes,
                                name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1,
                           0)  # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)  # <s>

    is_first_label = sequence.is_first(label_sequence)  # <s> 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(label_sentence_start,
                                                      is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH,
         encoder_outputC) = LSTMP_component_with_self_stabilization(
             encoder_outputH.output, hidden_dim, hidden_dim, future_value,
             future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH,
                                                      label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC,
                                                      label_sequence)

    # Decoder
    decoder_history_hook = alias(
        label_sequence, name='decoder_history_hook')  # copy label_sequence

    decoder_input = element_select(is_first_label,
                                   label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH,
         encoder_outputC) = LSTMP_component_with_self_stabilization(
             decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH,
             recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(CloneMethod.share,
                 {decoder_history_hook.output: net_output.output})

    # Instantiate the trainer object to drive the model training
    lr = 0.007
    minibatch_size = 72
    momentum_time_constant = 1100
    m_schedule = momentum_schedule(momentum_time_constant)
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(z.parameters, lr, m_schedule,
                           clipping_threshold_per_sample,
                           gradient_clipping_with_truncation)
    trainer = Trainer(z, ce, errs, learner)

    # setup data
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    train_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              rel_path)
    valid_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              "tiny.ctf")

    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    # readers
    randomize_data = True
    if run_test:
        randomize_data = False  # because we want to get an exact error
    train_reader = text_format_minibatch_source(train_path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')
    ],
                                                randomize=randomize_data)
    features_si_tr = train_reader.stream_info(feature_stream_name)
    labels_si_tr = train_reader.stream_info(labels_stream_name)

    valid_reader = text_format_minibatch_source(valid_path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')
    ],
                                                randomize=False)
    features_si_va = valid_reader.stream_info(feature_stream_name)
    labels_si_va = valid_reader.stream_info(labels_stream_name)

    # get the vocab for printing output sequences in plaintext
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.mapping"
    vocab_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              rel_path)
    vocab = [w.strip() for w in open(vocab_path).readlines()]
    i2w = {i: ch for i, ch in enumerate(vocab)}

    # Get minibatches of sequences to train with and perform model training
    i = 0
    mbs = 0
    epoch_size = 908241
    max_epochs = 10
    training_progress_output_freq = 500

    # make things more basic for running a quicker test
    if run_test:
        epoch_size = 5000
        max_epochs = 1
        training_progress_output_freq = 30

    for epoch in range(max_epochs):
        loss_numer = 0
        metric_numer = 0
        denom = 0

        while i < (epoch + 1) * epoch_size:

            # get next minibatch of training data
            mb_train = train_reader.next_minibatch(minibatch_size)

            train_args = {
                'raw_input': mb_train[features_si_tr],
                'raw_labels': mb_train[labels_si_tr]
            }
            trainer.train_minibatch(train_args)

            # collect epoch-wide stats
            samples = trainer.previous_minibatch_sample_count
            loss_numer += trainer.previous_minibatch_loss_average * samples
            metric_numer += trainer.previous_minibatch_evaluation_average * samples
            denom += samples

            # every N MBs evaluate on a test sequence to visually show how we're doing
            if mbs % training_progress_output_freq == 0:
                mb_valid = valid_reader.next_minibatch(minibatch_size)
                valid_args = {
                    'raw_input': mb_valid[features_si_va],
                    'raw_labels': mb_valid[labels_si_va]
                }

                e = ng.eval(valid_args)
                print_sequences(e, i2w)

            print_training_progress(trainer, mbs,
                                    training_progress_output_freq)
            i += mb_train[labels_si_tr].num_samples
            mbs += 1

        print("--- EPOCH %d DONE: loss = %f, errs = %f ---" %
              (epoch, loss_numer / denom, 100.0 * (metric_numer / denom)))

    # now setup a test run
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.test.ctf"
    test_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             rel_path)

    test_reader = text_format_minibatch_source(test_path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')
    ],
                                               10000,
                                               randomize=False)
    features_si_te = test_reader.stream_info(feature_stream_name)
    labels_si_te = test_reader.stream_info(labels_stream_name)

    test_minibatch_size = 1024

    # Get minibatches of sequences to test and perform testing
    i = 0
    total_error = 0.0
    while True:
        mb = test_reader.next_minibatch(test_minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be tested with
        arguments = {
            raw_input: mb[features_si_te],
            raw_labels: mb[labels_si_te]
        }
        mb_error = trainer.test_minibatch(arguments)

        total_error += mb_error

        if debug_output:
            print("Minibatch {}, Error {} ".format(i, mb_error))

        i += 1

    # Average of evaluation errors of all test minibatches
    return total_error / i
Exemplo n.º 14
0
def sequence_to_sequence_translator(debug_output=False, run_test=False):

    input_vocab_dim = 69
    label_vocab_dim = 69

    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes, name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(
        shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes, name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1, 0) # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)        # <s>

    is_first_label = sequence.is_first(label_sequence)       # <s> 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(
        label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(
        thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(
        thought_vectorC, label_sequence)

    # Decoder
    decoder_history_hook = alias(label_sequence, name='decoder_history_hook') # copy label_sequence

    decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(
        decoder_history_hook))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(CloneMethod.share, {decoder_history_hook.output : net_output.output})

    # Instantiate the trainer object to drive the model training
    lr = 0.007
    minibatch_size = 72
    momentum_time_constant = 1100
    m_schedule = momentum_schedule(momentum_time_constant)
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(z.parameters, lr, m_schedule, clipping_threshold_per_sample, gradient_clipping_with_truncation)
    trainer = Trainer(z, ce, errs, learner)

    # setup data
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    train_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
    valid_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tiny.ctf")

    # readers
    randomize_data = True
    if run_test:
        randomize_data = False # because we want to get an exact error

    train_reader = create_reader(train_path, randomize_data, input_vocab_dim, label_vocab_dim)
    train_bind = {
        raw_input  : train_reader.streams.features,
        raw_labels : train_reader.streams.labels
    }

    # get the vocab for printing output sequences in plaintext
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.mapping"
    vocab_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
    vocab = [w.strip() for w in open(vocab_path).readlines()]
    i2w = { i:ch for i,ch in enumerate(vocab) }

    # Get minibatches of sequences to train with and perform model training
    i = 0
    mbs = 0
    epoch_size = 908241
    max_epochs = 10
    training_progress_output_freq = 500

    # make things more basic for running a quicker test
    if run_test:
        epoch_size = 5000
        max_epochs = 1
        training_progress_output_freq = 30

    valid_reader = create_reader(valid_path, False, input_vocab_dim, label_vocab_dim)
    valid_bind = {
            find_arg_by_name('raw_input',ng)  : valid_reader.streams.features,
            find_arg_by_name('raw_labels',ng) : valid_reader.streams.labels
        }

    for epoch in range(max_epochs):
        loss_numer = 0
        metric_numer = 0
        denom = 0

        while i < (epoch+1) * epoch_size:
            # get next minibatch of training data
            mb_train = train_reader.next_minibatch(minibatch_size, input_map=train_bind)
            trainer.train_minibatch(mb_train)

            # collect epoch-wide stats
            samples = trainer.previous_minibatch_sample_count
            loss_numer += trainer.previous_minibatch_loss_average * samples
            metric_numer += trainer.previous_minibatch_evaluation_average * samples
            denom += samples

            # every N MBs evaluate on a test sequence to visually show how we're doing
            if mbs % training_progress_output_freq == 0:
                mb_valid = valid_reader.next_minibatch(minibatch_size, input_map=valid_bind)
                e = ng.eval(mb_valid)
                print_sequences(e, i2w)

            print_training_progress(trainer, mbs, training_progress_output_freq)
            i += mb_train[raw_labels].num_samples
            mbs += 1

        print("--- EPOCH %d DONE: loss = %f, errs = %f ---" % (epoch, loss_numer/denom, 100.0*(metric_numer/denom)))


    error1 = translator_test_error(z, trainer, input_vocab_dim, label_vocab_dim)

    save_model(z, "seq2seq.dnn")
    z = load_model("seq2seq.dnn")

    label_seq_axis = Axis('labelAxis')
    label_sequence = slice(find_arg_by_name('raw_labels',z), label_seq_axis, 1, 0)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)
    trainer = Trainer(z, ce, errs, [momentum_sgd(
                    z.parameters, lr, m_schedule, clipping_threshold_per_sample, gradient_clipping_with_truncation)])

    error2 = translator_test_error(z, trainer, input_vocab_dim, label_vocab_dim)

    assert error1 == error2

    return error1
Exemplo n.º 15
0
def create_model(params: model_params):
    """
  Create ReasoNet model
  Args:
    params (class:`model_params`): The parameters used to create the model
  """
    logger.log(
        "Create model: dropout_rate: {0}, init:{1}, embedding_init: {2}".
        format(params.dropout_rate, params.init, params.embedding_init))
    # Query and Doc/Context/Paragraph inputs to the model
    query_seq_axis = Axis('sourceAxis')
    context_seq_axis = Axis('contextAxis')
    query_sequence = sequence.input(shape=(params.vocab_dim),
                                    is_sparse=True,
                                    sequence_axis=query_seq_axis,
                                    name='query')
    context_sequence = sequence.input(shape=(params.vocab_dim),
                                      is_sparse=True,
                                      sequence_axis=context_seq_axis,
                                      name='context')
    entity_ids_mask = sequence.input(shape=(1, ),
                                     is_sparse=False,
                                     sequence_axis=context_seq_axis,
                                     name='entity_ids_mask')
    # embedding
    if params.embedding_init is None:
        embedding_init = create_random_matrix(params.vocab_dim,
                                              params.embedding_dim)
    else:
        embedding_init = params.embedding_init
    embedding = parameter(shape=(params.vocab_dim, params.embedding_dim),
                          init=None)
    embedding.value = embedding_init
    embedding_matrix = constant(embedding_init,
                                shape=(params.vocab_dim, params.embedding_dim))

    if params.dropout_rate is not None:
        query_embedding = ops.dropout(times(query_sequence, embedding),
                                      params.dropout_rate,
                                      name='query_embedding')
        context_embedding = ops.dropout(times(context_sequence, embedding),
                                        params.dropout_rate,
                                        name='context_embedding')
    else:
        query_embedding = times(query_sequence,
                                embedding,
                                name='query_embedding')
        context_embedding = times(context_sequence,
                                  embedding,
                                  name='context_embedding')

    contextGruW = Parameter(_INFERRED + _as_tuple(params.hidden_dim),
                            init=glorot_uniform(),
                            name='gru_params')
    queryGruW = Parameter(_INFERRED + _as_tuple(params.hidden_dim),
                          init=glorot_uniform(),
                          name='gru_params')

    entity_embedding = ops.times(context_sequence,
                                 embedding_matrix,
                                 name='constant_entity_embedding')
    # Unlike other words in the context, we keep the entity vectors fixed as a random vector so that each vector just means an identifier of different entities in the context and it has no semantic meaning
    full_context_embedding = ops.element_select(entity_ids_mask,
                                                entity_embedding,
                                                context_embedding)
    context_memory = ops.optimized_rnnstack(full_context_embedding,
                                            contextGruW,
                                            params.hidden_dim,
                                            1,
                                            True,
                                            recurrent_op='gru',
                                            name='context_mem')

    query_memory = ops.optimized_rnnstack(query_embedding,
                                          queryGruW,
                                          params.hidden_dim,
                                          1,
                                          True,
                                          recurrent_op='gru',
                                          name='query_mem')
    qfwd = ops.slice(sequence.last(query_memory),
                     -1,
                     0,
                     params.hidden_dim,
                     name='fwd')
    qbwd = ops.slice(sequence.first(query_memory),
                     -1,
                     params.hidden_dim,
                     params.hidden_dim * 2,
                     name='bwd')
    init_status = ops.splice(
        qfwd, qbwd,
        name='Init_Status')  # get last fwd status and first bwd status
    return attention_model(context_memory,
                           query_memory,
                           init_status,
                           params.hidden_dim,
                           params.attention_dim,
                           max_steps=params.max_rl_steps)
Exemplo n.º 16
0
def sequence_to_sequence_translator(debug_output=False):

    input_vocab_dim = 69
    label_vocab_dim = 69

    hidden_dim = 512
    num_layers = 2

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes)

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(
        shape=(label_vocab_dim), dynamic_axes=label_dynamic_axes)

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1, 0)
    label_sentence_start = sequence.first(raw_labels)

    is_first_label = sequence.is_first(label_sequence)
    label_sentence_start_scattered = sequence.scatter(
        label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(
        thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(
        thought_vectorC, label_sequence)

    # Decoder
    decoder_history_from_ground_truth = label_sequence
    decoder_input = element_select(is_first_label, label_sentence_start_scattered, past_value(
        decoder_history_from_ground_truth))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH
    decoder_dim = hidden_dim

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # Instantiate the trainer object to drive the model training
    lr = 0.007
    momentum_time_constant = 1100
    m_schedule = momentum_schedule(momentum_time_constant)
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True

    trainer = Trainer(z, ce, errs, [momentum_sgd(z.parameters, lr, m_schedule, clipping_threshold_per_sample, gradient_clipping_with_truncation)])                   

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    mb_source = text_format_minibatch_source(path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')], 10000)
    features_si = mb_source[feature_stream_name]
    labels_si = mb_source[labels_stream_name]

    # Get minibatches of sequences to train with and perform model training
    minibatch_size = 72
    training_progress_output_freq = 30
    if debug_output:
        training_progress_output_freq = training_progress_output_freq/3

    while True:
        mb = mb_source.next_minibatch(minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be trained with
        arguments = {raw_input: mb[features_si],
                     raw_labels: mb[labels_si]}
        trainer.train_minibatch(arguments)

        print_training_progress(trainer, i, training_progress_output_freq)
        i += 1

    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.test.ctf"
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

    test_mb_source = text_format_minibatch_source(path, [
        StreamConfiguration(feature_stream_name, input_vocab_dim, True, 'S0'),
        StreamConfiguration(labels_stream_name, label_vocab_dim, True, 'S1')], 10000, False)
    features_si = test_mb_source[feature_stream_name]
    labels_si = test_mb_source[labels_stream_name]

    # choose this to be big enough for the longest sentence
    train_minibatch_size = 1024 

    # Get minibatches of sequences to test and perform testing
    i = 0
    total_error = 0.0
    while True:
        mb = test_mb_source.next_minibatch(train_minibatch_size)
        if len(mb) == 0:
            break

        # Specify the mapping of input variables in the model to actual
        # minibatch data to be tested with
        arguments = {raw_input: mb[features_si],
                     raw_labels: mb[labels_si]}
        mb_error = trainer.test_minibatch(arguments)

        total_error += mb_error

        if debug_output:
            print("Minibatch {}, Error {} ".format(i, mb_error))

        i += 1

    # Average of evaluation errors of all test minibatches
    return total_error / i
Exemplo n.º 17
0
def sequence_to_sequence_translator(debug_output=False, run_test=False):

    input_vocab_dim = 69
    label_vocab_dim = 69

    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim),
                               dynamic_axes=input_dynamic_axes,
                               name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(shape=(label_vocab_dim),
                                dynamic_axes=label_dynamic_axes,
                                name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1,
                           0)  # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)  # <s>

    is_first_label = sequence.is_first(label_sequence)  # <s> 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(label_sentence_start,
                                                      is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH,
         encoder_outputC) = LSTMP_component_with_self_stabilization(
             encoder_outputH.output, hidden_dim, hidden_dim, future_value,
             future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH,
                                                      label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC,
                                                      label_sequence)

    # Decoder
    decoder_history_hook = alias(
        label_sequence, name='decoder_history_hook')  # copy label_sequence

    decoder_input = element_select(is_first_label,
                                   label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(
                isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(
                isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH,
         encoder_outputC) = LSTMP_component_with_self_stabilization(
             decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH,
             recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(CloneMethod.share,
                 {decoder_history_hook.output: net_output.output})

    # Instantiate the trainer object to drive the model training
    lr_per_sample = 0.007
    minibatch_size = 72
    momentum_time_constant = momentum_as_time_constant_schedule(1100)
    clipping_threshold_per_sample = 2.3
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(
        z.parameters,
        lr_per_sample,
        momentum_time_constant,
        gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
        gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    trainer = Trainer(z, ce, errs, learner)

    # setup data
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.train-dev-20-21.ctf"
    train_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              rel_path)
    valid_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              "tiny.ctf")

    # readers
    randomize_data = True
    if run_test:
        randomize_data = False  # because we want to get an exact error

    train_reader = create_reader(train_path, randomize_data, input_vocab_dim,
                                 label_vocab_dim)
    train_bind = {
        raw_input: train_reader.streams.features,
        raw_labels: train_reader.streams.labels
    }

    # get the vocab for printing output sequences in plaintext
    rel_path = r"../../../../Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b.mapping"
    vocab_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              rel_path)
    vocab = [w.strip() for w in open(vocab_path).readlines()]
    i2w = {i: ch for i, ch in enumerate(vocab)}

    # Get minibatches of sequences to train with and perform model training
    i = 0
    mbs = 0
    epoch_size = 908241
    max_epochs = 10
    training_progress_output_freq = 500

    # make things more basic for running a quicker test
    if run_test:
        epoch_size = 5000
        max_epochs = 1
        training_progress_output_freq = 30

    valid_reader = create_reader(valid_path, False, input_vocab_dim,
                                 label_vocab_dim)
    valid_bind = {
        find_arg_by_name('raw_input', ng): valid_reader.streams.features,
        find_arg_by_name('raw_labels', ng): valid_reader.streams.labels
    }

    for epoch in range(max_epochs):
        loss_numer = 0
        metric_numer = 0
        denom = 0

        while i < (epoch + 1) * epoch_size:
            # get next minibatch of training data
            mb_train = train_reader.next_minibatch(minibatch_size,
                                                   input_map=train_bind)
            trainer.train_minibatch(mb_train)

            # collect epoch-wide stats
            samples = trainer.previous_minibatch_sample_count
            loss_numer += trainer.previous_minibatch_loss_average * samples
            metric_numer += trainer.previous_minibatch_evaluation_average * samples
            denom += samples

            # every N MBs evaluate on a test sequence to visually show how we're doing
            if mbs % training_progress_output_freq == 0:
                mb_valid = valid_reader.next_minibatch(minibatch_size,
                                                       input_map=valid_bind)
                e = ng.eval(mb_valid)
                print_sequences(e, i2w)

            print_training_progress(trainer, mbs,
                                    training_progress_output_freq)
            i += mb_train[raw_labels].num_samples
            mbs += 1

        print("--- EPOCH %d DONE: loss = %f, errs = %f ---" %
              (epoch, loss_numer / denom, 100.0 * (metric_numer / denom)))

    error1 = translator_test_error(z, trainer, input_vocab_dim,
                                   label_vocab_dim)

    save_model(z, "seq2seq.dnn")
    z = load_model("seq2seq.dnn")

    label_seq_axis = Axis('labelAxis')
    label_sequence = slice(find_arg_by_name('raw_labels', z), label_seq_axis,
                           1, 0)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)
    trainer = Trainer(z, ce, errs, [
        momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant,
                     clipping_threshold_per_sample,
                     gradient_clipping_with_truncation)
    ])

    error2 = translator_test_error(z, trainer, input_vocab_dim,
                                   label_vocab_dim)

    assert error1 == error2

    return error1