Exemplo n.º 1
0
def fmeasure(output, target, beta=1):
    """
    Build the f-measure loss, i.e. ``1 - f-measure``, between the output and the target.

    With ``beta`` equal to one the f-measure is the f1-score, also known as the dice similarity
    coefficient. The f1-score is monotonic in the jaccard distance.

    f-measure = (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall)

    This loss is frequently used in semantic segmentation of images. It copes with imbalanced
    classes; for balanced classes cross_entropy is usually the better choice.
    It works with both binary and multiclass classification.

    No smoothing term is added to any denominator, so an all-zero output or target puts a
    division by zero into the graph.

    Args:
        output: the output values from the network
        target: usually a one-hot vector where the hot bit corresponds to the label index;
            it must have the same rank as ``output``
        beta: values greater than one weight recall higher than precision, values less than
            one do the opposite. Commonly chosen values are 0.5, 1 or 2.

    Returns:
        :class:`~cntk.ops.functions.Function`

    """

    assert len(target.shape) == len(output.shape)

    # Rank-3 inputs are reduced over the two trailing axes (this assumes that the first axis
    # is the class axis); any other rank is reduced over all axes.
    axis = (1, 2) if len(output.shape) == 3 else None

    overlap = C.reduce_sum(output * target, axis=axis)
    precision = overlap / C.reduce_sum(output, axis=axis)
    recall = overlap / C.reduce_sum(target, axis=axis)

    beta_squared = beta ** 2
    score = (1 + beta_squared) * precision * recall / (beta_squared * precision + recall)
    return 1 - score
Exemplo n.º 2
0
def cross_entropy_with_full_softmax(
    output,  # Node providing the output of the lstm layers
    target_vector,  # Node providing the expected labels
    sv_dim,
    vocab_dim
    ):
    """
    Build the training loss and the error node from the outputs of ``output``.

    The loss is ``-log(z . target_vector)`` summed over the sequence, plus penalty terms that
    are computed from ``output.outputs[3]`` (called the sv vector below) and
    ``output.outputs[4]``.

    NOTE(review): no softmax is applied in this function, so ``output.outputs[0]`` is
    presumably already a probability distribution - confirm against the model definition.

    Args:
        output: function whose ``outputs[0]``, ``outputs[3]`` and ``outputs[4]`` are read
        target_vector: node providing the expected labels
        sv_dim: not used by this function
        vocab_dim: not used by this function

    Returns:
        tuple ``(ce, error)``: the total loss, and a node comparing the value selected by
        ``target_vector`` against the maximum of ``outputs[0]`` with ``C.less``
    """
    nodes = output.outputs
    sv_vector = nodes[3]
    z = nodes[0]

    # value of z picked out by the target vector
    target_score = C.times_transpose(z, target_vector)

    # cross entropy loss, accumulated along the sequence
    nll = sequence.reduce_sum(-C.log(target_score))

    # the error: the target's value is smaller than the largest value of z
    error = C.less(target_score, C.reduce_max(z))

    # discourages the network from turning more than one gate off in a single time step.
    step_change = C.abs(C.sequence.slice(sv_vector, 1, 0) - C.sequence.slice(sv_vector, 0, -1))
    penalty = sequence.reduce_sum(0.0001 * C.pow(100.0, step_change))

    # penalise generated utterances that failed to render all the required slots
    penalty = penalty + C.abs(C.sequence.last(sv_vector))
    penalty = penalty + C.abs(C.sequence.first(sv_vector) - nodes[4])

    total = C.reduce_sum(nll) + C.reduce_sum(penalty)
    return total, error
Exemplo n.º 3
0
def create_detection_losses(cls_score, label_targets, bbox_pred, rois, bbox_targets, bbox_inside_weights, cfg):
    """
    Build the combined detection loss: classification loss plus box regression loss.

    Each term is summed and multiplied by ``1 / cfg.NUM_ROI_PROPOSALS``, wrapped in its own
    block through ``cntk.as_block`` and finally added under the name ``detection_losses``.

    Args:
        cls_score: class scores fed to the softmax cross entropy
        label_targets: classification targets
        bbox_pred: predicted box regression values
        rois: not used by this function
        bbox_targets: box regression targets
        bbox_inside_weights: weights handed to ``SmoothL1Loss``
        cfg: configuration object providing ``NUM_ROI_PROPOSALS`` and ``SIGMA_DET_L1``

    Returns:
        the node named ``detection_losses``
    """
    # --- classification term -------------------------------------------------
    score_ph = placeholder()
    label_ph = placeholder()
    summed_cls = reduce_sum(cross_entropy_with_softmax(score_ph, label_ph, axis=1))
    cls_scale = 1.0 / cfg.NUM_ROI_PROPOSALS
    cls_term = cntk.as_block(
        summed_cls * cls_scale,
        [(score_ph, cls_score), (label_ph, label_targets)],
        'CrossEntropyWithSoftmax', 'norm_cls_loss')

    # --- regression term -----------------------------------------------------
    pred_ph = placeholder()
    target_ph = placeholder()
    weight_ph = placeholder()
    summed_bbox = reduce_sum(
        SmoothL1Loss(cfg.SIGMA_DET_L1, pred_ph, target_ph, weight_ph, 1.0))
    bbox_scale = 1.0 / cfg.NUM_ROI_PROPOSALS
    bbox_term = cntk.as_block(
        summed_bbox * bbox_scale,
        [(pred_ph, bbox_pred),
         (target_ph, bbox_targets),
         (weight_ph, bbox_inside_weights)],
        'SmoothL1Loss', 'norm_bbox_loss')

    return plus(cls_term, bbox_term, name="detection_losses")
Exemplo n.º 4
0
def create_detection_losses(cls_score, label_targets, bbox_pred, rois, bbox_targets, bbox_inside_weights, cfg):
    """
    Return the sum of the normalized classification and box regression losses.

    Both losses are reduced with ``reduce_sum`` and scaled by the reciprocal of
    ``cfg.NUM_ROI_PROPOSALS``. Each one is built on placeholders and bound to the real inputs
    with ``cntk.as_block``.

    Args:
        cls_score: class scores fed to the softmax cross entropy
        label_targets: classification targets
        bbox_pred: predicted box regression values
        rois: not used by this function
        bbox_targets: box regression targets
        bbox_inside_weights: weights handed to ``SmoothL1Loss``
        cfg: configuration object providing ``NUM_ROI_PROPOSALS`` and ``SIGMA_DET_L1``

    Returns:
        the node named ``detection_losses``
    """
    # classification loss, built on placeholders
    ph_scores = placeholder()
    ph_labels = placeholder()
    raw_cls = cross_entropy_with_softmax(ph_scores, ph_labels, axis=1)
    inverse_roi_count = 1.0 / cfg.NUM_ROI_PROPOSALS
    scaled_cls = reduce_sum(raw_cls) * inverse_roi_count

    cls_bindings = [(ph_scores, cls_score), (ph_labels, label_targets)]
    cls_block = cntk.as_block(scaled_cls, cls_bindings,
                              'CrossEntropyWithSoftmax', 'norm_cls_loss')

    # regression loss, built on placeholders
    ph_pred = placeholder()
    ph_targets = placeholder()
    ph_weights = placeholder()
    raw_bbox = SmoothL1Loss(cfg.SIGMA_DET_L1, ph_pred, ph_targets, ph_weights, 1.0)
    inverse_roi_count = 1.0 / cfg.NUM_ROI_PROPOSALS
    scaled_bbox = reduce_sum(raw_bbox) * inverse_roi_count

    bbox_bindings = [
        (ph_pred, bbox_pred),
        (ph_targets, bbox_targets),
        (ph_weights, bbox_inside_weights),
    ]
    bbox_block = cntk.as_block(scaled_bbox, bbox_bindings,
                               'SmoothL1Loss', 'norm_bbox_loss')

    total = plus(cls_block, bbox_block, name="detection_losses")
    return total
Exemplo n.º 5
0
def create_detection_losses(cls_score, label_targets, rois, bbox_pred, bbox_targets, bbox_inside_weights):
    """
    Build the combined detection loss: classification loss plus box regression loss.

    The classification loss is summed and divided by a count derived from ``rois``; the box
    regression loss is summed and divided by ``cfg["TRAIN"].BATCH_SIZE``. ``cfg`` is read from
    the enclosing module.

    Args:
        cls_score: class scores fed to the softmax cross entropy
        label_targets: classification targets
        rois: region proposals, used only to derive the classification normalizer
        bbox_pred: predicted box regression values
        bbox_targets: box regression targets
        bbox_inside_weights: weights handed to ``SmoothL1Loss``

    Returns:
        the node named ``detection_losses``
    """
    # --- classification term -------------------------------------------------
    raw_cls = cross_entropy_with_softmax(cls_score, label_targets, axis=1)

    cls_ph = placeholder()
    rois_ph = placeholder()
    # The intent is to normalize by the number of actual roi proposals and not to count
    # no-op (all-zero) rois.
    # NOTE(review): ``greater_equal(..., 0.0)`` is also true for an all-zero roi, so padded
    # rois appear to be counted unless padding uses negative values - confirm.
    per_roi_sum = reduce_sum(rois_ph, axis=1)
    counted_rois = reduce_sum(cntk.greater_equal(per_roi_sum, 0.0))
    scaled_cls = reduce_sum(cls_ph) * (1.0 / counted_rois)

    cls_term = cntk.as_block(
        scaled_cls,
        [(cls_ph, raw_cls), (rois_ph, rois)],
        'Normalize', 'norm_cls_loss')

    # --- regression term -----------------------------------------------------
    pred_ph = placeholder()
    target_ph = placeholder()
    weight_ph = placeholder()
    raw_bbox = SmoothL1Loss(cfg["CNTK"].SIGMA_DET_L1, pred_ph, target_ph, weight_ph, 1.0)
    # The bbox loss is normalized by the batch size
    scaled_bbox = reduce_sum(raw_bbox) * (1.0 / cfg["TRAIN"].BATCH_SIZE)

    bbox_term = cntk.as_block(
        scaled_bbox,
        [(pred_ph, bbox_pred),
         (target_ph, bbox_targets),
         (weight_ph, bbox_inside_weights)],
        'SmoothL1Loss', 'norm_bbox_loss')

    return plus(cls_term, bbox_term, name="detection_losses")
Exemplo n.º 6
0
def sample_gaussian_mdn(prediction_tensor, nmix: int, ndim: int):
    """ Build the sampling nodes on top of mixture density network outputs

    The prediction is split by ``gaussian_mdn_coeff`` into mixture weights, means and sigmas.
    A mixture selection is drawn with ``random.sample`` and used to pick one mean and one
    sigma, then the result is ``normal_like(sigma) * sigma + mu``.

    Example:
        ndim, nmix = 1, 3
        a = C.input_variable(ndim)
        prediction = Dense((ndim + 2) * nmix)(a)
        sampled = sample_gaussian_mdn(prediction, nmix, ndim)

        results = sampled.eval({a: x})  # different results every time you eval

    Arguments:
        prediction_tensor: input tensor
        nmix (int): number of mixture
        ndim (int): number of dimension of gaussian

    Returns:
        :class:`~cntk.ops.functions.Function`

    """
    alpha, mu, sigma = gaussian_mdn_coeff(prediction_tensor, nmix=nmix, ndim=ndim)

    # selection drawn from the mixture weights
    picked = random.sample(alpha)

    # keep only the selected component's mean and sigma by masking and summing over axis 0
    picked_for_mu = C.expand_dims(picked, axis=-1)
    mu_selected = C.reduce_sum(mu * picked_for_mu, axis=0)
    sigma_selected = C.reduce_sum(sigma * picked, axis=0)

    noise = C.random.normal_like(sigma_selected)
    return noise * sigma_selected + mu_selected
Exemplo n.º 7
0
def dice_coefficient(x, y):
    """
    Return the mean of ``2 * sum(x * y) / (sum(x) + sum(y) + 1)`` with sums taken over axes 1 and 2.

    This is the average of the per-channel dice coefficient. A global dice coefficient does
    not work because the class with the larger region dominates the metric.
    https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
    """
    reduce_axes = (1, 2)
    numerator = 2.0 * C.reduce_sum(x * y, axis=reduce_axes)
    x_total = C.reduce_sum(x, axis=reduce_axes)
    y_total = C.reduce_sum(y, axis=reduce_axes)
    # the constant 1.0 keeps the denominator away from zero for empty regions
    denominator = x_total + y_total + 1.0

    return C.reduce_mean(numerator / denominator)
Exemplo n.º 8
0
    def forward_network(cls, input_dim: int):
        """
        Build two chained affine coupling steps on top of a fresh placeholder.

        The placeholder (named ``place_holder``) is split at ``input_dim // 2``. The first step
        rescales and shifts the second part using networks applied to the first part; the
        second step rescales and shifts the first part using networks applied to the updated
        second part.

        Args:
            input_dim (int): size of the placeholder

        Returns:
            tuple ``(out, log_det_J, chunk)``: the spliced result, the sum of both log-scale
            outputs, and a dict holding ``input_dim`` plus the four networks created through
            ``cls.basic_network`` under the keys ``log_s_func``, ``t_func``, ``log_s_func2``
            and ``t_func2``
        """
        chunk = {'input_dim': input_dim}

        source = C.placeholder(input_dim, name='place_holder')
        half = input_dim // 2
        head, tail = source[:half], source[half:]

        # step 1: the first part drives the transform of the second part
        log_s_net = cls.basic_network(half, 'log_s_func')
        chunk['log_s_func'] = log_s_net
        t_net = cls.basic_network(half, 't_func')
        chunk['t_func'] = t_net

        log_scale = log_s_net(head)
        shift = t_net(head)
        tail = shift + tail * C.exp(log_scale)
        log_det_J = 0 + C.reduce_sum(log_scale)

        merged = C.splice(head, tail)

        # step 2: roles swapped, the updated second part drives the first part
        head, tail = merged[:half], merged[half:]

        log_s_net2 = cls.basic_network(half, 'log_s_func2')
        chunk['log_s_func2'] = log_s_net2
        t_net2 = cls.basic_network(half, 't_func2')
        chunk['t_func2'] = t_net2

        log_scale2 = log_s_net2(tail)
        shift2 = t_net2(tail)
        head = head * C.exp(log_scale2) + shift2
        log_det_J = log_det_J + C.reduce_sum(log_scale2)

        return C.splice(head, tail), log_det_J, chunk
def fmeasure(output, target, beta=1):
    """
    Return ``1 - f-measure`` of ``output`` against ``target``, usable as a loss.

    When beta is one the f-measure is called the f1-score or dice similarity coefficient.
    The f1-score is monotonic in the jaccard distance.

    f-measure = (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall)

    The loss is frequently used in semantic segmentation of images. It works with imbalanced
    classes; for balanced classes you should prefer cross_entropy instead.
    It works with both binary and multiclass classification.

    The denominators carry no smoothing term, so an all-zero output or target puts a division
    by zero into the graph.

    Args:
        output: the output values from the network
        target: usually a one-hot vector where the hot bit corresponds to the label index;
            it must have the same rank as ``output``
        beta: greater than one weights recall higher than precision, less than one does the
            opposite. Commonly chosen values are 0.5, 1 or 2.

    Returns:
        :class:`~cntk.ops.functions.Function`

    """

    assert len(target.shape) == len(output.shape)

    reduce_axes = None
    if len(output.shape) == 3:
        # assumes that the first axis is the class axis
        reduce_axes = (1, 2)

    hits = C.reduce_sum(output * target, axis=reduce_axes)
    predicted = C.reduce_sum(output, axis=reduce_axes)
    precision = hits / predicted
    expected = C.reduce_sum(target, axis=reduce_axes)
    recall = hits / expected

    weight = beta ** 2
    numerator = (1 + weight) * precision * recall
    return 1 - numerator / (weight * precision + recall)
Exemplo n.º 10
0
def create_detection_losses(cls_score, label_targets, rois, bbox_pred, bbox_targets, bbox_inside_weights):
    """
    Return the sum of the normalized classification and box regression losses.

    The classification loss is summed and scaled by the reciprocal of a count derived from
    ``rois``. The box regression loss is summed and scaled by the reciprocal of
    ``cfg["TRAIN"].BATCH_SIZE``. ``cfg`` is read from the enclosing module.

    Args:
        cls_score: class scores fed to the softmax cross entropy
        label_targets: classification targets
        rois: region proposals, used only to derive the classification normalizer
        bbox_pred: predicted box regression values
        bbox_targets: box regression targets
        bbox_inside_weights: weights handed to ``SmoothL1Loss``

    Returns:
        the node named ``detection_losses``
    """
    # classification loss on the real inputs; only its normalization lives inside the block
    cls_loss = cross_entropy_with_softmax(cls_score, label_targets, axis=1)

    ph_cls_loss = placeholder()
    ph_rois = placeholder()
    # Only terms belonging to an actual roi proposal are meant to be counted, i.e. no-op
    # (all-zero) rois should be left out.
    # NOTE(review): an all-zero roi has a sum of 0, which still satisfies ``>= 0.0``; unless
    # padded rois are negative this counts every roi - confirm the padding convention.
    roi_sums = reduce_sum(ph_rois, axis=1)
    roi_count = reduce_sum(cntk.greater_equal(roi_sums, 0.0))
    inverse_roi_count = 1.0 / roi_count
    normalized_cls = reduce_sum(ph_cls_loss) * inverse_roi_count

    cls_bindings = [(ph_cls_loss, cls_loss), (ph_rois, rois)]
    cls_block = cntk.as_block(normalized_cls, cls_bindings, 'Normalize', 'norm_cls_loss')

    # regression loss, built on placeholders
    ph_pred = placeholder()
    ph_targets = placeholder()
    ph_weights = placeholder()
    smooth_l1 = SmoothL1Loss(cfg["CNTK"].SIGMA_DET_L1, ph_pred, ph_targets, ph_weights, 1.0)
    # The bbox loss is normalized by the batch size
    inverse_batch_size = 1.0 / cfg["TRAIN"].BATCH_SIZE
    normalized_bbox = reduce_sum(smooth_l1) * inverse_batch_size

    bbox_bindings = [
        (ph_pred, bbox_pred),
        (ph_targets, bbox_targets),
        (ph_weights, bbox_inside_weights),
    ]
    bbox_block = cntk.as_block(normalized_bbox, bbox_bindings, 'SmoothL1Loss', 'norm_bbox_loss')

    total = plus(cls_block, bbox_block, name="detection_losses")
    return total
Exemplo n.º 11
0
def dice_coefficient(x, y):
    """
    Average of the per-channel dice coefficient.

    For each slice along axes 1 and 2 this computes
    ``2 * sum(x * y) / (sum(x) + sum(y) + 1)`` and then takes the mean of those values.
    """
    spatial = (1, 2)

    overlap = C.reduce_sum(x * y, axis=spatial)
    doubled_overlap = 2.0 * overlap
    combined_size = C.reduce_sum(x, axis=spatial) + C.reduce_sum(y, axis=spatial) + 1.0

    per_channel = doubled_overlap / combined_size
    return C.reduce_mean(per_channel)
Exemplo n.º 12
0
    def model(seq_image, decoded):
        """
        Read an ``n`` x ``n`` patch from ``seq_image`` with a grid of gaussian filters.

        The filter parameters are derived from ``decoded`` through ``dense`` and
        ``attention_parameters``. ``n``, ``image_height``, ``dense`` and
        ``attention_parameters`` come from the enclosing scope. The shape notes in the
        comments follow the author's ``[dynamic axes] [static axes]`` notation.
        """
        g_x, g_y, sigma2, delta, gamma = attention_parameters(dense(decoded))

        col_index = C.Constant(np.arange(n) + 1, )  # col of patch
        row_index = C.Constant(np.arange(n) + 1, )  # row of patch
        centre_x = C.expand_dims(g_x + (col_index - n / 2 - 0.5) * delta, axis=-1)
        centre_y = C.expand_dims(g_y + (row_index - n / 2 - 0.5) * delta, axis=-1)
        # centre_x: [#, *] [n, 1]
        # centre_y: [#, *] [n, 1]

        image = C.sequence.unpack(seq_image, padding_value=0, no_mask_output=True)
        # image: [#] [*image_width, filters, image_height]

        positions = Cx.sequence.position(seq_image)
        # positions: [#, *] [1]

        # padded steps get a far-away position so the gaussian weight there is negligible
        positions_unpacked = C.sequence.unpack(positions,
                                               padding_value=999_999,
                                               no_mask_output=True)
        # positions_unpacked: [#] [*image_width, 1]

        x_coords = C.sequence.broadcast_as(C.swapaxes(positions_unpacked), centre_x)
        # x_coords: [#, *] [1, *image_width]
        # x pos index of image (width)

        y_coords = C.Constant(np.arange(image_height).reshape((1, -1)))
        # y_coords: [] [1, image_height]
        # y pos index of image (height)

        # calculate which portion of the image is attended by the gaussian filter
        filter_x = C.exp(-0.5 * C.square(x_coords - centre_x) / sigma2)
        filter_y = C.exp(-0.5 * C.square(y_coords - centre_y) / sigma2)
        # filter_x: [#, *] [n, *image_width]
        # filter_y: [#, *] [n, image_height]

        norm_x = C.reduce_sum(filter_x, axis=1)
        norm_y = C.reduce_sum(filter_y, axis=1)
        # norm_x: [#, *] [n]
        # norm_y: [#, *] [n]

        filter_x = filter_x / norm_x
        filter_y = filter_y / norm_y
        # filter_x: [#, *] [n, *image_width]
        # filter_y: [#, *] [n, image_height]

        # combine filters from x and y
        image_per_step = C.sequence.broadcast_as(image, filter_y)
        filtered_rows = C.times_transpose(image_per_step, filter_y)
        patch = gamma * C.times(filter_x, filtered_rows, output_rank=2)
        # patch: [#, *] [n, filters, n]

        return C.swapaxes(patch)
        # returned: [#, *] [filters, n (x) , n (y)]
Exemplo n.º 13
0
def criteria(label, output, block_size, c_classes, weights):
    '''
    Define the loss function and metric.

    The loss is the mean of ``weights`` times the negated element-wise product of the
    log-softmax (over axis 0) and ``label``. The metric is the classification error over
    axis 0 minus the share of ``label`` mass found in the first slice along axis 0.
    ``block_size`` and ``c_classes`` are not used by this function.

    Returns:
        tuple ``(mean_ce, pe)``
    '''
    probs = cntk.softmax(output, axis=0)

    # weighted cross entropy, averaged over all elements
    negative_log_likelihood = -cntk.element_times(cntk.log(probs), label)
    weighted = cntk.times(weights, negative_log_likelihood, output_rank=2)
    mean_ce = cntk.reduce_mean(weighted)

    # the unpacking is kept because it requires ``label`` to have exactly three axes
    _, _w, _h = label.shape

    first_slice_share = cntk.reduce_sum(cntk.slice(label, 0, 0, 1)) / cntk.reduce_sum(label)
    pe = cntk.classification_error(probs, label, axis=0) - first_slice_share
    return (mean_ce, pe)
Exemplo n.º 14
0
def test_transpose_backward():
    """Gradient through ``C.transpose`` must equal the gradient of the pre-transposed input."""
    perm = (2, 0, 1)
    dims = (2, 3, 4)
    data = np.arange(np.prod(dims), dtype=np.float32).reshape(*dims)
    permuted_dims = tuple(dims[axis] for axis in perm)

    # graph that transposes inside CNTK
    x = C.input_variable(dims, needs_gradient=True)
    via_transpose = C.reduce_sum(C.cos(C.transpose(x, perm)))

    # reference graph fed with data transposed by numpy
    xt = C.input_variable(permuted_dims, needs_gradient=True)
    reference = C.reduce_sum(C.cos(xt))

    grad = np.squeeze(via_transpose.grad({x: data}))
    grad_reference = np.squeeze(reference.grad({xt: np.transpose(data, perm)}))

    assert np.allclose(np.transpose(grad, perm), grad_reference)
Exemplo n.º 15
0
def test_transpose_backward():
    """Check the backward pass of ``C.transpose`` against a numpy-transposed reference."""
    source_shape = (2, 3, 4)
    order = (2, 0, 1)
    values = np.arange(np.prod(source_shape), dtype=np.float32).reshape(*source_shape)
    target_shape = tuple(source_shape[k] for k in order)

    x = C.input_variable(source_shape, needs_gradient=True)
    y = C.reduce_sum(C.cos(C.transpose(x, order)))

    xt = C.input_variable(target_shape, needs_gradient=True)
    yt = C.reduce_sum(C.cos(xt))

    # gradient with respect to the untransposed input, then permuted for comparison
    actual = np.transpose(np.squeeze(y.grad({x: values})), order)
    expected = np.squeeze(yt.grad({xt: np.transpose(values, order)}))

    assert np.allclose(actual, expected)
Exemplo n.º 16
0
    def var(array,W=_W,B=None,square=0,sqrt=0,V=False,sizz=0):
        """
        Build a CNTK node that reduces the deviation of ``array*W`` from its mean.

        The mean is taken over the last two axes of ``array*W``. The deviation (optionally
        squared) plus the bias is summed over axes (-2, -1, -3), optionally square-rooted,
        reshaped to its first four axes and has axes 3 and 1 swapped.

        Args:
            array: input node; presumably laid out so that it broadcasts against ``W`` -
                TODO confirm the expected axis order.
            W: weight node, defaults to the externally defined ``_W``.
            B: optional bias; a zero constant of shape ``W.shape[0:2]`` is used when None.
            square: when truthy the deviation from the mean is squared.
            sqrt: when truthy the square root of the reduced result is returned.
            V: verbosity. Any truthy value prints evaluated intermediates; the value 2
                additionally prints ``i`` and ``di`` in full.
            sizz: when 1 the mean divides by the sum of ``W`` over its last two axes,
                otherwise by the number of elements in those two axes.

        Returns:
            node whose shape is asserted to be ``(array.shape[0], W.shape[0],
            array.shape[1], array.shape[2])``.

        Note:
            ``out.shape`` is printed twice on every call, regardless of ``V``.
        """
        #W=tf.transpose(W, [0,2,3,1])
        
        arrs=array.shape
        ashp=W.shape
        sb=(W.shape[1],1,1)  # shape of the constant divisor used when sizz != 1
        WV=W.shape[-2:]  # sizes of the last two axes of W
        xi=(-2,-1)  # axes reduced for the mean and for the weight sum
        x2=(-2,-1,-3)  # axes reduced for the final sum

        if V:
            print(W.eval())
            print(arrs,ashp)
        mul=(array*W)

        if V:
            print('Wsamp',W[-1,-1].eval())
            print('array*w',(mul.eval())[0,-1])

        size=C.reduce_sum(W,axis=xi)#shape=(outputs, channel)

        if V:
            print("sizesamp",size.shape,size.eval())
        if B is None:
            B=C.constant(0,shape=W.shape[0:2],dtype=np.float32)#channel
        # append singleton axes so that B has the same rank as W
        B=C.reshape(B,(*B.shape,*[1 for _ in range(len(ashp)-len(B.shape))]))
        if sizz==1:
            # mean normalized by the sum of the weights
            mean=C.reduce_sum(mul,axis=xi)/size
        else:
            # mean normalized by the element count of the last two axes of W
            mean=C.reduce_sum(mul,axis=xi)/C.constant(value=WV[0]*WV[1],shape=sb,dtype=np.float32)
        if V:
            print("meansamp",mean.eval()[0,-1])
        if square:
            i=(C.square(mul-mean)+B)
        else:
            i=(((mul)-mean)+B)
        # di is only used by the V==2 debug print below
        di=i/size
        if V==2:
            print("i",i.eval(),"i")
            print("di",di.eval(),"di")
        if V:
            print('isamp',i.shape,i.eval()[-1,-1,])
        # NOTE(review): B is already part of i, so it is added a second time here - confirm
        # that this is intended.
        out=C.reduce_sum(i+B,axis=x2)
        #out=np.rollaxis(np.sum(i+B,axis=x2),-1,1)
        print(out.shape)
        if sqrt:
            out=C.sqrt(out)
        out=C.swapaxes(C.reshape(out,out.shape[:4]), 3, 1)
        print(out.shape)
        assert out.shape==(arrs[0],ashp[0],arrs[1],arrs[2])
        return(out)
Exemplo n.º 17
0
    def attention_layer(self, context, query):
        """
        Build the attention block that combines ``context`` with ``query``.

        The graph is built on two placeholders of size ``2 * self.hidden_dim`` and bound to
        ``context`` and ``query`` with ``C.as_block``. The result splices the context, the
        context-to-query vector, their product and the query-to-context product.
        """
        dim = 2 * self.hidden_dim
        q_processed = C.placeholder(shape=(dim, ))
        c_processed = C.placeholder(shape=(dim, ))

        # convert query's sequence axis to static
        q_unpacked, q_valid = C.sequence.unpack(q_processed, padding_value=0).outputs

        # This part deserves some explanation
        # It is the attention layer
        # In the paper they use a 6 * dim dimensional vector
        # here we split it in three parts because the different parts
        # participate in very different operations
        # so W * [h; u; h.* u] becomes w1 * h + w2 * u + w3 * (h.*u)
        ws1 = C.parameter(shape=(dim, 1), init=C.glorot_uniform())
        ws2 = C.parameter(shape=(dim, 1), init=C.glorot_uniform())
        ws3 = C.parameter(shape=(1, dim), init=C.glorot_uniform())
        att_bias = C.parameter(shape=(), init=0)

        context_term = C.times(c_processed, ws1)
        query_term = C.reshape(C.times(q_unpacked, ws2), (-1, ))
        weighted_query = C.sequence.broadcast_as(q_unpacked * ws3, c_processed)
        product_term = C.reshape(
            C.reduce_sum(c_processed * weighted_query, axis=1), (-1, ))
        similarity = (context_term + product_term
                      + C.sequence.broadcast_as(query_term, c_processed) + att_bias)

        # mask out values outside of Query, and fill in gaps with -1e+30 as neutral value
        # for both reduce_log_sum_exp and reduce_max
        q_valid_per_step = C.sequence.broadcast_as(q_valid, c_processed)
        similarity = C.element_select(q_valid_per_step, similarity, C.constant(-1e+30))

        # context-to-query: attention over the query positions
        q_attn = C.reshape(C.softmax(similarity), (-1, 1))
        attended_query = C.reduce_sum(
            C.sequence.broadcast_as(q_unpacked, q_attn) * q_attn, axis=0)
        c2q = C.reshape(attended_query, -1)

        # query-to-context: attention over the context sequence
        c_attn = C.sequence.softmax(C.reduce_max(similarity))
        htilde = C.sequence.reduce_sum(c_processed * c_attn)
        q2c_out = c_processed * C.sequence.broadcast_as(htilde, c_processed)

        att_context = C.splice(c_processed, c2q, c_processed * c2q, q2c_out)

        bindings = [(c_processed, context), (q_processed, query)]
        return C.as_block(att_context, bindings, 'attention_layer', 'attention_layer')
Exemplo n.º 18
0
 def attention(h_enc, h_dec):
     """
     Compute the attended encoder state for every decoder step.

     ``attention_span``, ``attention_axis``, ``go_backwards`` and the ``attn_*`` layers come
     from the enclosing scope.
     """
     # h_dec is used wherever only its axis is needed, i.e. as the broadcast target
     decoder_axis = h_dec
     # TODO: pull this apart so that we can compute the encoder window only once and apply it to multiple decoders
     # --- encoder state window
     window_fn = PastValueWindow(attention_span,
                                 axis=attention_axis,
                                 go_backwards=go_backwards)
     (enc_window, enc_valid) = window_fn(h_enc).outputs
     enc_proj = attn_proj_enc(enc_window)
     # window must be broadcast to every decoder time step
     enc_proj = C.sequence.broadcast_as(enc_proj, decoder_axis)
     enc_valid = C.sequence.broadcast_as(enc_valid, decoder_axis)
     # --- decoder state
     # project decoder hidden state
     dec_proj = attn_proj_dec(h_dec)
     activated = C.tanh(dec_proj + enc_proj)  # (attention_span, attention_dim)
     scores = attn_proj_tanh(activated)  # (attention_span, 1)
     # logzero-out the unused elements for the softmax denominator
     # TODO: use a less arbitrary number than 50
     masked_scores = scores + (enc_valid - 1) * 50
     weights = C.softmax(masked_scores, axis=attention_axis)
     weights = Label('attention_weights')(weights)
     # now take weighted sum over the encoder state vectors
     weighted_states = C.element_times(enc_proj, weights)
     summed = C.reduce_sum(weighted_states, axis=attention_axis)
     return attn_final_stab(summed)
Exemplo n.º 19
0
 def new_attention(encoder_hidden_state, decoder_hidden_state):
     """
     Compute the attended encoder state for every decoder step using an unpacked encoder.

     The ``attn_*`` layers come from the enclosing scope. The shape notes in the comments
     follow the author's ``[dynamic axes] [static axes]`` notation.
     """
     # encoder_hidden_state: [#, e] [h]
     # decoder_hidden_state: [#, d] [H]
     enc_unpacked, enc_valid = C.sequence.unpack(encoder_hidden_state, padding_value=0).outputs
     # enc_unpacked: [#] [*=e, h]
     # enc_valid: [#] [*=e]
     enc_proj = C.sequence.broadcast_as(attn_proj_enc(enc_unpacked), decoder_hidden_state)
     # enc_proj: [#, d] [*=e, attention_dim]
     valid_per_step = C.sequence.broadcast_as(C.reshape(enc_valid, (1,), 1), decoder_hidden_state)
     # valid_per_step: [#, d] [*=e]
     dec_proj = attn_proj_dec(decoder_hidden_state)
     # dec_proj: [#, d] [attention_dim]
     activated = C.tanh(dec_proj + enc_proj)
     # activated: [#, d] [*=e, attention_dim]
     logits = attn_proj_tanh(activated)
     # logits = [#, d] [*=e, 1]

     # padded encoder positions get a very negative logit so that softmax ignores them
     masked_logits = C.element_select(valid_per_step, logits, C.constant(-1e+30))
     # masked_logits = [#, d] [*=e]
     weights = Label('attention_weights')(C.softmax(masked_logits, axis=0))
     # weights = [#, d] [*=e]
     enc_per_step = C.sequence.broadcast_as(enc_unpacked, weights)
     attended = C.reduce_sum(weights * enc_per_step, axis=0)
     # attended = [#, d] [1, h]
     squeezed = C.reshape(attended, (), 0, 1)
     # returned value = [#, d], [h]
     return attn_final_stab(squeezed)
Exemplo n.º 20
0
def build_graph(self_attention,
                self_penalty,
                embeded_dim=60,
                h_dim=150,
                d_a=350,
                r=30):
    """
    Build the classifier graph, optionally with self-attention and its penalty term.

    The module-level input ``x`` is embedded, stabilized and passed through ``create_birnn``
    with two GRU layers, giving ``H``. With ``self_attention`` ``H`` is replaced by ``A . H``
    where ``A = softmax(Ws2 . tanh(Ws1 . H^T))``. With ``self_penalty`` the sum of the squared
    entries of ``A . A^T - I`` is attached to the returned function as the attribute ``p``.

    Args:
        self_attention: whether to add the self-attention layer
        self_penalty: whether to compute the penalty term; requires ``self_attention``
            because the penalty is computed from the attention matrix
        embeded_dim: embedding size
        h_dim: size passed to each GRU layer
        d_a: number of rows of ``Ws1``
        r: number of rows of ``Ws2``

    Returns:
        the function ``selfAtt``; it carries the penalty node as ``selfAtt.p`` when
        ``self_penalty`` is set

    Raises:
        ValueError: if ``self_penalty`` is requested without ``self_attention``
    """
    # Fail before building anything: without the attention matrix there is no penalty to
    # compute, and the original code only failed at the very end with an unbound local.
    if self_penalty and not self_attention:
        raise ValueError("self_penalty requires self_attention to be enabled")

    with C.layers.default_options(init=C.xavier()):
        embeded = C.layers.Embedding(embeded_dim)(x)
        embeded = C.layers.Stabilizer()(embeded)

        H = create_birnn(C.layers.GRU(h_dim), C.layers.GRU(h_dim))(embeded)

        p = None
        if self_attention:
            Ws1 = C.parameter(shape=(d_a, 2 * h_dim), name="Ws1")
            Ws2 = C.parameter(shape=(r, d_a), name="Ws2")
            A = C.softmax(C.times(Ws2, C.tanh(C.times_transpose(Ws1, H))))
            H = C.times(A, H)  # the M in the paper

            if self_penalty:
                I = C.constant(np.eye(r), dtype=np.float32)
                P = C.times_transpose(A, A) - I  # r*r
                p = C.reduce_sum(C.abs(C.element_times(
                    P, P)))  # frobenius norm **2

        y_ = C.layers.Dense(200, activation=C.ops.relu)(H)

        # NOTE(review): the argument of selfAtt is ignored and a new output Dense layer is
        # created on every call - confirm that callers invoke it exactly once.
        def selfAtt(x):
            y_pre = C.layers.Dense(num_labels, activation=None)(y_)
            return y_pre

        if self_penalty:
            selfAtt.p = p
        return selfAtt
Exemplo n.º 21
0
def reduce_sum(x, axis=0, name=''):
    '''
    Sum the elements of the input tensor across one axis. If `axis==rank`, the sum is taken
    over all axes, that is, the output is a scalar holding the sum of all tensor elements.

    Examples:
        >>> # create 3x2 matrix in a sequence of length 1 in a batch of one sample
        >>> data = [[10, 20],[30, 40],[50, 60]]

        >>> # reduce over the first axis
        >>> C.eval(C.reduce_sum(data, 0))
        [array([[[  90.,  120.]]])]

        >>> # reduce over the second axis
        >>> C.eval(C.reduce_sum(data, 1))
        [array([[[  30.],
                 [  70.],
                 [ 110.]]])]

        >>> # reduce over the all axes
        >>> C.eval(C.reduce_sum(data, 2))
        [array([[ 210.]])]

    Args:
        x: input tensor
        axis (:class:`cntk.Axis`): axis along which the reduction will be performed
        name (str): the name of the node in the network

    Returns:
        :class:`cntk.Function`
    '''
    # imported under an alias so the wrapped operation is not confused with this wrapper
    from cntk import reduce_sum as _cntk_reduce_sum
    operand = sanitize_input(x)
    node = _cntk_reduce_sum(operand, axis, name)
    return node.output()
Exemplo n.º 22
0
 def create_model(self):
     """
     Create a sparse input multiplied by an all-ones parameter matrix and summed.

     Sets ``self.input_dim`` (1000), ``self.embed_dim`` (30), the parameter ``self.p`` and the
     reduced output ``self.z``.
     """
     self.input_dim = 1000
     self.embed_dim = 30

     sparse_input = C.input_variable((self.input_dim,), is_sparse=True)
     weight_shape = (self.input_dim, self.embed_dim)
     self.p = C.parameter(shape=weight_shape, init=1)

     projected = C.times(sparse_input, self.p)
     self.z = C.reduce_sum(projected)
Exemplo n.º 23
0
    def simi_attention(self, input, memory):
        '''
        Attend over ``memory`` for every step of ``input``.

        Both inputs are projected to ``2 * self.hidden_dim``, added together with a bias,
        passed through tanh and scored with a single-output dense layer. Padded memory
        positions are masked before the softmax.

        return:
        memory weighted vectors over input [#,c][d]
        weight
        '''
        input_ph = C.placeholder()  # [#,c][d]
        mem_ph = C.placeholder()  # [#,q][d]

        width = 2 * self.hidden_dim
        project_input = Dense(width, bias=False, input_rank=1)
        project_memory = Dense(width, bias=False, input_rank=1)
        shared_bias = C.Parameter(shape=(width, ), init=0.0)
        score_layer = Dense(1, bias=False, input_rank=1)

        inp_proj = project_input(input_ph)  # [#,c][d]
        mem_proj = project_memory(mem_ph)  # [#,q][d]

        # move the memory sequence axis into a static axis, then repeat it for every input step
        mem_unpacked, mem_valid = C.sequence.unpack(mem_proj, 0).outputs  # [#][*=q, d] [#][*=q]
        mem_per_step = C.sequence.broadcast_as(mem_unpacked, inp_proj)  # [#,c][*=q,d]
        valid_per_step = C.sequence.broadcast_as(mem_valid, inp_proj)  # [#,c][*=q]

        hidden = C.tanh(inp_proj + mem_per_step + shared_bias)
        scores = C.reshape(score_layer(hidden), (-1, ))  # [#,c][*=q]
        scores = C.element_select(valid_per_step, scores, -1e30)
        weights = C.softmax(scores, axis=0)  # [#,c][*=q]

        pooled = C.reduce_sum(C.reshape(weights, (-1, 1)) * mem_per_step, axis=0)  # [#,c][d]
        pooled = C.reshape(pooled, (-1, ))

        bindings = [(input_ph, input), (mem_ph, memory)]
        return C.as_block(C.combine(pooled, weights), bindings,
                          'simi_attention', 'simi_attention')
Exemplo n.º 24
0
    def attention(encoded, network):
        """
        Attend over ``encoded`` with a window whose coefficients are derived from ``network``.

        ``dense``, ``nb_mixtures``, ``gaussian_windows_attention_coefficients`` and
        ``window_weight`` come from the enclosing scope. The shape notes in the comments
        follow the author's ``[dynamic axes] [static axes]`` notation.
        """
        a, b, k = gaussian_windows_attention_coefficients(dense(network), nb_mixtures)
        # a, b, k: [#, n] [nb_mixture, 1]
        # encoded: [#, c] [char_ohe]

        encoded_static = C.sequence.unpack(encoded, padding_value=0, no_mask_output=True)
        # encoded_static: [#] [*=c, char_ohe]
        positions = Cx.sequence.position(encoded)  # position gives shape=(1, )
        # positions: [#, c], [1]
        pos_values, pos_valid = C.sequence.unpack(positions, padding_value=999_999).outputs
        # pos_values: [#] [*=c, 1]
        # pos_valid: [#] [*=c]
        pos_per_step = C.swapaxes(C.sequence.broadcast_as(pos_values, k))
        # pos_per_step: [#, n] [1, *=c]
        valid_per_step = C.sequence.broadcast_as(C.reshape(pos_valid, (1,), 1), k)
        # valid_per_step: [#, n] [*=c, 1] ~ shape verified correct at this point

        window = window_weight(a, b, k, pos_per_step)
        # window: [#, n] [*=c, 1]

        # padded positions contribute nothing to the weighted sum
        phi = C.element_select(valid_per_step, window, C.constant(0), name="phi")
        # phi: [#, n] [*=c, 1]
        encoded_per_step = C.sequence.broadcast_as(encoded_static, phi)
        weighted_sum = C.reduce_sum(phi * encoded_per_step, axis=0)
        # weighted_sum: [#, n] [1, char_ohe]

        return C.squeeze(weighted_sum, name="GaussianWindowAttention")
        # returned: [#, n] [char_ohe]
Exemplo n.º 25
0
    def criterion(self):
        """
        Build the training loss and the classification error.

        The loss is the margin loss on ``self.length`` against ``self.labels``; when
        ``self.use_reconstruction`` is set, the mean squared difference between
        ``self.training_model`` and ``self.features`` is added with weight ``0.0005 * 784``.

        Returns:
            tuple ``(total_loss, error)``
        """
        # hyperparameters
        lambda_val = 0.5
        batch_axis = ct.axis.Axis.default_batch_axis()

        # Margin loss
        present_term = ct.reshape(ct.square(ct.relu(0.9 - self.length)), -1)
        absent_term = ct.reshape(ct.square(ct.relu(self.length - 0.1)), -1)
        per_class = self.labels * present_term + lambda_val * (1 - self.labels) * absent_term
        margin_loss = ct.reduce_mean(ct.reduce_sum(per_class, axis=0), axis=batch_axis)

        # classification_error
        predict = ct.softmax(self.length, axis=0)
        error = ct.classification_error(ct.reshape(predict, 10), self.labels)

        if not self.use_reconstruction:
            return margin_loss, error

        # reconstruction term: mean squared difference on the flattened tensors
        flat_features = ct.reshape(self.features, shape=(-1,))
        flat_decoded = ct.reshape(self.training_model, shape=(-1,))
        squared_diff = ct.square(flat_decoded - flat_features)
        reconstruction_err = ct.reduce_mean(
            ct.reduce_mean(squared_diff, axis=0), axis=batch_axis)

        total_loss = margin_loss + (0.0005 * 784) * reconstruction_err
        return total_loss, error
Exemplo n.º 26
0
def test_sequence_reduce_over_reduced_scalar():
    """Check value and gradient of a sequence-wise sum applied on top of a static-axis sum."""
    seq_in = C.sequence.input_variable(shape=(1), needs_gradient=True)
    per_step_total = C.reduce_sum(seq_in)
    op = C.sequence.reduce_sum(per_step_total)

    # one sequence of three steps: -1 + 3 + 5 == 7
    steps = np.asarray([[-1], [3], [5]], dtype=np.float32)
    grad, result = op.grad({seq_in: steps}, outputs=[op])

    assert np.array_equal(result, [7.0])
    # every input element contributes to the sum with weight one
    assert np.array_equal(grad[0], [[1.0], [1.0], [1.0]])
Exemplo n.º 27
0
def pad(x, pattern, mode=C.CONSTANT_PAD, constant_value=0, name=''):
    """
    Pads a tensor in the sequence axis according to the specified patterns.
    Three padding modes are supported: CONSTANT / REFLECT / SYMMETRIC.

    Arguments:
        x: tensor to be padded.
        pattern (tuple with 2 integers): how many values to add before and after the contents in the sequence axis.
        mode (int): padding mode: C.ops.CONSTANT_PAD, C.ops.REFLECT_PAD and C.ops.SYMMETRIC_PAD
        constant_value: the value used to fill the padding cells, only meaningful under CONSTANT mode.
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Raises:
        ValueError: if `pattern` is not a tuple of exactly 2 integers.
    """
    # Check the container type and length before looking at the elements: iterating a
    # non-iterable (e.g. a bare int) would raise TypeError instead of the intended ValueError,
    # and a tuple of the wrong length would only fail later inside C.pad.
    if (not isinstance(pattern, tuple)
            or len(pattern) != 2
            or not all(isinstance(i, int) for i in pattern)):
        raise ValueError(f"pattern {pattern} must be a tuple with 2 integers")

    # The unpacked tensor has one extra leading axis; only that axis is padded,
    # every original static axis of x gets a (0, 0) pattern.
    ndim = len(x.shape)
    null_pattern = [(0, 0)] * ndim
    final_pattern = [pattern] + null_pattern

    b, valid = C.sequence.unpack(x, padding_value=0).outputs
    c = C.pad(b, final_pattern, mode=mode, constant_value=constant_value)
    # new sequence length = number of valid steps + total padding added
    seq_length = C.reduce_sum(valid, axis=0) + C.Constant(sum(pattern))
    d = C.to_sequence(c, seq_length, name=name)
    return d
Exemplo n.º 28
0
def test_trainer(tmpdir, no_eval_function):
    """Train one minibatch, round-trip a checkpoint and check the trainer exposes wrapped types."""
    features = input(shape=(1, ))
    labels = input(shape=(1, ))
    weights = parameter(shape=(2, ), init=10)
    z = plus(features, reduce_sum(weights), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    # the evaluation function is optional for a Trainer
    errs = None if no_eval_function else classification_error(z, labels)

    momentum = momentum_as_time_constant_schedule(1100)
    learning_rate = learning_rate_schedule(0.007, UnitType.sample)
    learner = momentum_sgd(z.parameters, learning_rate, momentum, True)
    trainer = Trainer(z, (ce, errs), [learner])

    feed = {features: [[1], [2]], labels: [[0], [1]]}
    _updated, _var_map = trainer.train_minibatch(feed, outputs=[z.output])

    checkpoint_path = str(tmpdir / 'checkpoint.dat')
    trainer.save_checkpoint(checkpoint_path)
    trainer.restore_from_checkpoint(checkpoint_path)

    assert trainer.model.name == 'z'

    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
    assert isinstance(trainer.parameter_learners[0], Learner)
Exemplo n.º 29
0
def test_sequence_reduce_over_reduced_scalar():
    """Sequence reduction over an already-reduced scalar: the sum is 7 and all gradients are 1."""
    source = C.sequence.input_variable(shape=(1), needs_gradient=True)
    op = C.sequence.reduce_sum(C.reduce_sum(source))

    feed = {source: np.asarray([[-1], [3], [5]], dtype=np.float32)}
    grad, result = op.grad(feed, outputs=[op])

    expected_grad = [[1.0], [1.0], [1.0]]
    assert np.array_equal(result, [7.0])
    assert np.array_equal(grad[0], expected_grad)
Exemplo n.º 30
0
def reduce_sum(x, axis=0, name=''):
    '''
    Sum the elements of the input tensor along one axis. If `axis==rank`, the sum
    is taken over all axes and the output is a scalar holding the sum of all of
    the tensor's elements.

    Examples:
        >>> # create 3x2 matrix in a sequence of length 1 in a batch of one sample
        >>> data = [[10, 20],[30, 40],[50, 60]]

        >>> # reduce over the first axis
        >>> C.eval(C.reduce_sum(data, 0))
        [array([[[  90.,  120.]]])]

        >>> # reduce over the second axis
        >>> C.eval(C.reduce_sum(data, 1))
        [array([[[  30.],
                 [  70.],
                 [ 110.]]])]

        >>> # reduce over the all axes
        >>> C.eval(C.reduce_sum(data, 2))
        [array([[ 210.]])]

    Args:
        x: input tensor
        axis (:class:`cntk.Axis`): axis along which the reduction will be performed
        name (str): the name of the node in the network

    Returns:
        :class:`cntk.Function`
    '''
    # Imported under an alias so the backend op is not confused with this wrapper.
    from cntk import reduce_sum as _backend_reduce_sum
    operand = sanitize_input(x)
    node = _backend_reduce_sum(operand, axis, name)
    return node.output()
Exemplo n.º 31
0
def test_trainer(tmpdir, no_eval_function):
    """Run a single training step, save/restore a checkpoint and verify the trainer's public types."""
    in1 = input_variable(shape=(1,))
    labels = input_variable(shape=(1,))
    bias = parameter(shape=(2,), init=10)
    z = plus(in1, reduce_sum(bias), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    if no_eval_function:
        errs = None
    else:
        errs = classification_error(z, labels)

    momentum_schedule = momentum_as_time_constant_schedule(1100)
    lr_schedule = learning_rate_schedule(0.007, UnitType.sample)
    learners = [momentum_sgd(z.parameters, lr_schedule, momentum_schedule, True)]
    trainer = Trainer(z, (ce, errs), learners)

    arguments = {
        in1: [[1], [2]],
        labels: [[0], [1]],
    }
    _updated, _var_map = trainer.train_minibatch(arguments, outputs=[z.output])

    checkpoint_file = str(tmpdir / 'checkpoint.dat')
    trainer.save_checkpoint(checkpoint_file)
    trainer.restore_from_checkpoint(checkpoint_file)

    assert trainer.model.name == 'z'

    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
    assert isinstance(trainer.parameter_learners[0], Learner)
Exemplo n.º 32
0
    def window_weight(a, b, k, u):
        """
        Compute phi, the window weight of the character sequence at position u for time t.
        The original author notes this was checked against a numpy equivalent (2018-25-02).

        math:
            phi = summation of mixtures { a * exp ( -b * (k - u) ^ 2 ) }

        Args:
            a: importance of window within the mixture. Not normalised and doesn't sum to one.
            b: width of attention window
            k: location of window
            u: integer position of each item in sequence. Value from 1 to seq_length. (rank 2 tensor) [-3, 1]

        Returns:
            :class:`~cntk.ops.functions.Function`

        """
        squared_offset = C.square(k - u)
        per_mixture = a * C.exp(-1 * b * squared_offset)
        # Sum over axis 0 (the mixture axis per the original comment), then apply
        # C.swapaxes with its default axes.
        summed = C.reduce_sum(per_mixture, axis=0)
        # result, per the original author's shape note: [#, n] [*=c, 1]
        return C.swapaxes(summed)
Exemplo n.º 33
0
def flow_reverse(chunk):
    """
    Build the reverse pass of one flow chunk on a fresh placeholder.

    Reads ``input_dim``, ``log_s_func``, ``t_func`` and ``W_rot_mat`` from ``chunk`` and
    stores the inverted rotation constant back under ``chunk['W_rot_mat_inv']``.

    Args:
        chunk (dict): description of the chunk (keys listed above).

    Returns:
        tuple: ``(output, log_det_J)`` -- the reversed output expression and the accumulated
        log-determinant term.
    """
    dim = chunk['input_dim']
    log_det_J = 0
    split_at = dim//2

    holder = C.placeholder(dim, name='place_holder')
    log_scale_fn = chunk['log_s_func']
    shift_fn = chunk['t_func']

    # Split the placeholder in two halves; the second half passes through unchanged
    # and parameterises the scale/shift undone on the first half.
    head, tail = holder[:split_at], holder[split_at:]
    log_scale = log_scale_fn(tail)
    shift = shift_fn(tail)
    scale = C.exp(log_scale)
    recombined = C.splice((head-shift)/scale, tail)

    log_det_J += C.reduce_sum(C.log(C.abs(scale)))

    # Undo the rotation with a constant holding the inverse of the current rotation values.
    rotation = chunk['W_rot_mat']
    rotation_inv = C.Constant(np.linalg.inv(rotation.value), name='inv_W')
    chunk['W_rot_mat_inv'] = rotation_inv
    reversed_out = recombined@rotation_inv
    log_det_J += dim*C.log(C.det(rotation_inv))

    # Disabled in the original (scale/bias handling), kept for reference:
    # if 'scale' in chunk:
    #     _out -= chunk['bias']
    #     _out /= chunk['scale']
    #     log_det_J += input_dim*C.reduce_sum(C.log(C.abs(chunk['scale'])))

    # _out -= chunk['b']
    # _out @= _inv_w

    return reversed_out, log_det_J
def test_trainer(tmpdir, no_eval_function):
    """Train one minibatch and verify a checkpoint round-trips user-supplied external state."""
    in1 = C.input_variable(shape=(1,))
    labels = C.input_variable(shape=(1,))
    bias = parameter(shape=(2,), init=10)
    z = plus(in1, reduce_sum(bias), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = None if no_eval_function else classification_error(z, labels)

    momentum_schedule = C.momentum_as_time_constant_schedule(1100)
    lr_schedule = C.learning_parameter_schedule(0.007, minibatch_size =1)
    learner = C.momentum_sgd(z.parameters, lr_schedule, momentum_schedule, True)
    trainer = C.Trainer(z, (ce, errs), [learner])

    arguments = {in1: [[1],[2]], labels: [[0], [1]]}
    _updated, _var_map = trainer.train_minibatch(arguments, outputs=[z.output])

    checkpoint_file = str(tmpdir / 'checkpoint.dat')
    # arbitrary nested payload that must come back unchanged from the checkpoint
    external_state = {
        "additional external state": math.pi,
        "nested dict": {"a": "b"},
        "list": [1, 2, 3],
    }
    trainer.save_checkpoint(checkpoint_file, external_state)
    restored_state = trainer.restore_from_checkpoint(checkpoint_file)

    assert external_state == restored_state

    assert trainer.model.name == 'z'

    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
    assert isinstance(trainer.parameter_learners[0], C.Learner)
Exemplo n.º 35
0
    def attention(query, key, value):
        """
        Attend over ``value`` using dot-product scores between ``query`` and ``key``.

        Uses ``obey_sequence_order`` and ``max_seq_len`` from the enclosing scope: when
        ``obey_sequence_order`` is truthy, scores outside the lower triangle are replaced by a
        large negative constant before the softmax.

        Shape remarks in the comments below are the original author's.

        Raises:
            ValueError: if ``obey_sequence_order`` is set but ``max_seq_len`` is not.
        """
        # NOTE(review): the scores are divided by dk itself; the usual scaled dot-product
        # formulation divides by sqrt(dk) -- confirm this is intended.
        dk = C.reduce_sum(C.ones_like(query))  # cannot use sequence.last, will conflict with recurrence
        # dk: [#, *] [1, ] and value = int(dim_of_query)

        unpacked_key = C.sequence.unpack(key, padding_value=0, no_mask_output=True)  # [#] [-3, key_dim]
        unpacked_value = C.sequence.unpack(value, padding_value=0, no_mask_output=True)  # [#] [-3, value_dim]

        key_per_query = C.sequence.broadcast_as(unpacked_key, query)  # [#, *] [-3, key_dim]
        # [#, *] [q_dim] @ [#, *] [key_dim, -3], assert q_dim == key_dim
        # scaled: [#, *] [-3, ] => for every key seq element, there is a corresponding score
        scaled = C.times_transpose(query, key_per_query) / dk

        if obey_sequence_order:
            if not max_seq_len:
                raise ValueError("max_seq_len must be defined when obey_sequence_order is True")

            # mask out invalid temporal connections so the sequence order is obeyed
            score_matrix, _score_mask = C.sequence.unpack(scaled, padding_value=0).outputs
            # score_matrix: [#] [-3, -3]  <== matrix will be top right diagonally zero-ed
            # _score_mask: [#] [-3,]

            minus_inf = C.constant(-1e+30)
            lower_triangle = np.tril(np.ones((max_seq_len, max_seq_len)), k=0)
            allowed = C.Constant(lower_triangle)  # [] [max_seq, max_seq]
            allowed = C.reconcile_dynamic_axes(allowed, score_matrix)  # [#] [max_seq, max_seq]
            allowed = C.crop_manual(allowed, score_matrix, 0, 0)  # [#] [-3, -3]
            score_matrix = C.element_select(allowed, score_matrix, minus_inf)  # [#] [-3, -3]
            scaled = C.to_sequence_like(score_matrix, query)  # [#, *] [-3]

        weights = C.softmax(scaled, axis=-1)
        value_per_query = C.sequence.broadcast_as(unpacked_value, query)
        return C.times(weights, value_per_query)  # [#, *] [value_dim,]
Exemplo n.º 36
0
 def multiFunc(self, arg1):
     """
     Build a multi-bit binarized approximation of an input shaped like ``arg1``.

     A fresh input variable with the shape and dynamic axes of ``arg1`` is created, and for
     each level ``i`` in ``range(self.bit_map.max())`` the elements whose ``self.bit_map``
     entry exceeds ``i`` contribute ``sign * mean`` to the approximation, where ``mean`` is
     computed per axis-0 entry (per kernel, per the original comments).

     Args:
         arg1: variable whose ``shape`` and ``dynamic_axes`` the new input copies.

     Returns:
         tuple: ``(approx, multiIn)`` -- the approximation expression and the input variable
         it is built on.
     """
     # load or create the inputs we need
     multiIn = C.input(shape=arg1.shape, dynamic_axes = arg1.dynamic_axes)
     bit_map = C.constant(self.bit_map)
     max_bits = self.bit_map.max()
     # carry over represents the remaining value that needs to binarized. For a single bit, this is just the input. For more bits,
     # it is the difference between the previous bits approximation and the true value.
     carry_over = multiIn
     approx = C.element_times(multiIn, 0)
     # iterate through the maximum number of bits specified by the bit maps, basically compute each level of binarization
     for i in range(max_bits):
         # determine which values of the input should be binarized to i bits or more
         hot_vals = C.greater(bit_map, i)
         # select only the values which we need to binarize
         valid_vals = C.element_select(hot_vals, carry_over, 0)
         # flatten everything but axis 0 so that reducing over axis 1 yields one
         # (sum of magnitudes / count of selected values) per axis-0 entry
         magnitudes = C.reshape(C.abs(valid_vals), (valid_vals.shape[0], -1))
         selected = C.reshape(hot_vals, (hot_vals.shape[0], -1))
         mean = C.element_divide(C.reduce_sum(magnitudes, axis=1), C.reduce_sum(selected, axis=1))
         # reshape the mean to match the dimensionality of the input
         mean = C.reshape(mean, (mean.shape[0], mean.shape[1], 1, 1))
         # binarize the carry over: +1 where positive, -1 elsewhere, 0 where not selected at this level
         bits = C.greater(carry_over, 0)
         bits = C.element_select(bits, bits, -1)
         bits = C.element_select(hot_vals, bits, 0)
         # add in the equivalent binary representation to the approximation
         approx = C.plus(approx, C.element_times(mean, bits))
         # compute the new carry over
         carry_over = C.plus(C.element_times(C.element_times(-1, bits), mean), carry_over)

     return approx, multiIn
 def create_model(self):
     """
     Create a sparse input multiplied by an (input_dim, embed_dim) parameter and summed.

     Stores ``input_dim``, ``embed_dim``, the parameter ``p`` and the output ``z`` on ``self``.
     """
     self.input_dim = 1000
     self.embed_dim = 30
     sparse_input = C.input_variable((self.input_dim, ), is_sparse=True)
     self.p = C.parameter(shape=(self.input_dim, self.embed_dim), init=1)
     projected = C.times(sparse_input, self.p)
     self.z = C.reduce_sum(projected)
Exemplo n.º 38
0
def test_ReduceSum(tmpdir, dtype):
    """Verify a reduce_sum over axis 0 of a constant 3x2x2 tensor via ``verify_no_input``."""
    with C.default_options(dtype=dtype):
        values = [
            [[5, 1], [20, 2]],
            [[30, 1], [40, 2]],
            [[55, 1], [60, 2]],
        ]
        model = C.reduce_sum(np.array(values, dtype=dtype), 0)
        verify_no_input(model, tmpdir, 'ReduceSum_0')
Exemplo n.º 39
0
def run_cntk(image_path, model_path):
    """
    Load a model, build a loss from two of its pooling nodes and run gradient ascent on an image.

    Args:
        image_path: path of the image read with ``cv2.imread``.
        model_path: path of the model loaded with ``cntk.load_model``.

    Returns:
        The processed image as a ``uint8`` array clipped to [0, 255], transposed back after
        gradient ascent.
    """
    import functools
    import cv2

    model = cntk.load_model(model_path)

    # Collect every plain Function node whose description mentions 'Pooling'.
    pool_nodes = []
    for node in cntk.logging.depth_first_search(model, lambda x: True, depth=0):
        if type(node) is cntk.ops.functions.Function and 'Pooling' in str(node):
            pool_nodes.append(node)
            print(node)
    print(pool_nodes)

    # node contributions to the loss metric
    layer_contributions = {
        pool_nodes[2]: 1,
        pool_nodes[3]: 3,
    }

    # Define the loss: weighted sum of squared activations, each normalised by its element count
    loss = None
    for layer, coeff in layer_contributions.items():
        activation = layer.output
        element_count = functools.reduce(lambda x, y: x * y, activation.shape)
        contribution = (coeff / element_count) * cntk.reduce_sum(cntk.square(activation))
        if loss is None:
            loss = contribution
        else:
            loss += contribution

    # Replace the model's first argument with an input that receives gradients.
    dream = cntk.input_variable(shape=model.arguments[0].shape,
                                needs_gradient=True,
                                name='features')
    model = cntk.ops.combine(loss).clone(
        cntk.ops.CloneMethod.freeze, substitutions={model.arguments[0]: dream})
    step = 0.1  # Gradient ascent step size
    iterations = 5  # Number of ascent steps per scale

    # Load the image into a Numpy array
    img = cv2.resize(cv2.imread(image_path), (224, 224))

    # cv2.imshow('Original Image', img.copy())

    # HWC uint8 in [0, 255] -> CHW float32 in [-1, 1]
    img = np.transpose(img.astype(np.float32), (2, 0, 1))
    img /= 127.5
    img -= 1
    img = gradient_ascent_cntk(model, img, iterations=iterations, step=step)
    # back to HWC and to the [0, 255] range
    img = np.transpose(img, (1, 2, 0))
    img /= 2.
    img += 0.5
    img *= 255.
    return np.clip(img, 0, 255).astype('uint8')
Exemplo n.º 40
0
def test_conv_cudnn_batch_size_change(device_id):
    """Run gradients of a shared convolution over two inputs whose sequence lengths vary per batch."""
    if device_id == -1:
        pytest.skip('Test only runs on GPU')

    np.random.seed(0)
    input_shape = (1, 16, 100)
    input1 = C.sequence.input_variable(input_shape, needs_gradient=True, sequence_axis=C.Axis.new_unique_dynamic_axis('c'))
    input2 = C.sequence.input_variable(input_shape, needs_gradient=True, sequence_axis=C.Axis.new_unique_dynamic_axis('q'))
    conv = C.layers.Convolution2D((5,8), 100, activation=C.relu, init=C.glorot_uniform(), bias=True, init_bias=0)
    total1 = C.reduce_sum(conv(input1), axis=C.Axis.all_axes())
    total2 = C.reduce_sum(conv(input2), axis=C.Axis.all_axes())
    output = total1 + total2
    num_batches = 100 # change to greater value for a more thorough test
    batch_size = 1
    max_seq_len = [100, 10]

    def random_sequences(lengths):
        # one random float32 sequence per requested length
        return [np.random.random((length,) + input_shape).astype(np.float32) for length in lengths]

    for _ in range(num_batches):
        # draw the sequence lengths first, then the data for input1 and input2 (same order of random draws)
        lens1, lens2 = [[int(r*limit+1) for r in np.random.random((batch_size))] for limit in max_seq_len]
        output.grad({input1: random_sequences(lens1),
                     input2: random_sequences(lens2)})
Exemplo n.º 41
0
 def multiFunc(self, arg1):
     """
     Build a multi-level binarized approximation of an input shaped like ``arg1``.

     For each level ``i`` in ``range(self.bit_map.max())`` the elements whose ``self.bit_map``
     entry exceeds ``i`` contribute ``sign * mean`` to the approximation, where ``mean`` is
     the sum of the selected magnitudes divided by the number of selected elements.

     Args:
         arg1: variable whose ``shape`` and ``dynamic_axes`` the new input copies.

     Returns:
         tuple: ``(approx, multiIn)`` -- the approximation expression and the input variable
         it is built on.
     """
     multiIn = C.input(shape=arg1.shape, dynamic_axes = arg1.dynamic_axes)
     bit_map = C.constant(self.bit_map)
     max_bits = self.bit_map.max()
     # carry_over is the residual still to be approximated; it starts as the input itself
     carry_over = multiIn
     approx = C.element_times(multiIn, 0)
     for i in range(max_bits):
         # elements that use more than i bits take part in this level
         hot_vals = C.greater(bit_map, i)
         valid_vals = C.element_select(hot_vals, carry_over, 0)
         # single mean over all selected elements
         mean = C.element_divide(C.reduce_sum(C.abs(valid_vals)), C.reduce_sum(hot_vals))
         # +1 where the residual is positive, -1 elsewhere, 0 where not selected at this level
         bits = C.greater(carry_over, 0)
         bits = C.element_select(bits, bits, -1)
         bits = C.element_select(hot_vals, bits, 0)
         approx = C.plus(approx, C.element_times(mean, bits))
         # subtract this level's contribution from the residual
         carry_over = C.plus(C.element_times(C.element_times(-1, bits), mean), carry_over)

     return approx, multiIn
Exemplo n.º 42
0
def create_sample_model(device, writer=None,
                        lr_per_sample=C.learning_parameter_schedule_per_sample([0.3, 0.2, 0.1, 0.0])):
    """
    Build a small sequence model with an SGD trainer.

    Args:
        device: device on which the parameter is created.
        writer: optional progress writer(s) handed to the trainer.
        lr_per_sample: learning-rate schedule for the SGD learner.

    Returns:
        tuple: ``(trainer, in1, labels)``.
    """
    features = sequence.input_variable(shape=(input_dim,))
    targets = sequence.input_variable(shape=(input_dim,))
    bias = parameter(shape=(input_dim,), init=10, device=device)
    z = plus(features, reduce_sum(bias), name='z')
    criterion = (cross_entropy_with_softmax(z, targets), classification_error(z, targets))

    sgd_learner = C.sgd(z.parameters, lr_per_sample)
    trainer = C.Trainer(z, criterion, [sgd_learner], writer)
    return (trainer, features, targets)
Exemplo n.º 43
0
def test_sequence_unpack_with_broadcast_as(device_id, precision):
    """Broadcast the unpack mask of one sequence along a second sequence axis and check its values."""
    x = C.sequence.input_variable(5)
    a = C.sequence.input_variable(4, sequence_axis=C.Axis('a'))
    unpacked, mask = C.sequence.unpack(x, 0).outputs
    # 0 * reduce_sum(unpacked) keeps the unpacked values in the graph without changing the mask
    bvm = C.sequence.broadcast_as(0 * C.reduce_sum(unpacked) + mask, a)

    def ramp(rows, cols):
        # float32 matrix filled with 0 .. rows*cols-1
        return np.arange(rows * cols).reshape(rows, cols).astype('f')

    x1 = [ramp(7, 5), ramp(3, 5)]
    a1 = [ramp(3, 4), ramp(6, 4)]

    # x sequences have lengths 7 and 3, so the second mask is zero from column 3 on
    expected = [np.ones((3, 7), dtype=np.float32), np.ones((6, 7), dtype=np.float32)]
    expected[1][:,3:] = 0

    actual = bvm.eval({x: x1, a: a1})
    for got, want in zip(actual, expected):
        assert np.allclose(got, want)
Exemplo n.º 44
0
def test_output_to_retain():
    """Check that an output requested from train_minibatch is returned with the forward values."""
    in1 = input_variable(shape=(1,))
    labels = input_variable(shape=(1,))
    bias = parameter(shape=(2,), init=10)
    z = plus(in1, reduce_sum(bias), name='z')
    criterion = (cross_entropy_with_softmax(z, labels), classification_error(z, labels))

    momentum_schedule = momentum_as_time_constant_schedule(1100)
    lr_schedule = learning_rate_schedule(0.007, UnitType.sample)
    learner = momentum_sgd(z.parameters, lr_schedule, momentum_schedule, True)
    trainer = Trainer(z, criterion, [learner])

    in1_value = [[[1]], [[2]]]
    arguments = {in1: in1_value, labels: [[0], [1]]}
    z_output = z.output
    _updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output])

    # the parameter holds two 10s, so z == in1 + 20
    assert np.allclose(var_map[z_output], np.asarray(in1_value)+20)
Exemplo n.º 45
0
def attention_pooling(inputs, inputs_mask, inputs_weights, decode, decode_weights, keys):
    """
    Pool ``inputs`` with attention weights derived from ``inputs`` and ``decode``.

    Shapes as stated by the original author (not verified here):
        inputs: shape=(n, dim)
        inputs_weight: shape=(dim, dim)
        decode: shape=(1, dec_dim)
        decode_weights: shape=(dec_dim, dim)
        keys: shape=(dim, 1)

    Returns:
        The sum over axis 0 of ``inputs`` weighted by the sequence softmax of the scores.
    """
    projected_inputs = C.times(inputs, inputs_weights)
    projected_decode = C.times(decode, decode_weights)
    hidden = C.tanh(projected_inputs + C.sequence.broadcast_as(projected_decode, projected_inputs))
    # positions where the mask is zero get a very large negative value
    hidden = C.element_select(inputs_mask, hidden, C.constant(-1e+30))
    scores = C.times(hidden, keys)
    weights = C.ops.sequence.softmax(scores, name="softmax")
    return C.reduce_sum(inputs * weights, axis=0)
Exemplo n.º 46
0
def test_nce_backward_indices(classes, xdim, batch, expected_value, device_id, precision):
    """
    Simple test that makes sure that the derivatives have the correct sparsity pattern

    A baseline count of which classes get sampled is built with ``C.random_sample`` and
    compared against the non-zero pattern of the bias gradient of ``C.nce_loss`` run with
    the same seed.
    """

    # ignore precision, only sparsity pattern matters for this test
    dt = np.float32

    from cntk.losses import nce_loss
    # Import the submodule explicitly: a bare `import scipy` does not guarantee that
    # `scipy.sparse` is loaded, which makes `scipy.sparse.csr_matrix` below fragile.
    import scipy.sparse
    trials = 10

    # Establish baseline
    expected_count = np.zeros(classes)
    I = C.constant(np.eye(classes, dtype=dt))
    q = np.arange(classes, dtype=dt) + 1
    z = C.reduce_sum(C.times(C.random_sample(q, 32, True, seed=98052), I), axis=0)
    for i in range(trials):
        expected_count[np.nonzero(z.eval().ravel())] += 1

    # Set things up to measure the same thing with nce_loss

    x = C.input_variable(xdim, needs_gradient=True)
    y = C.input_variable(classes, is_sparse=True)

    x0 = np.arange(batch * xdim, dtype=dt).reshape((batch, xdim))/(batch * xdim)
    # one-hot sparse labels: sample j is labelled with class 10 * (j + 1)
    data = np.ones(batch, dtype=dt)
    indices = list(range(10,10*batch+1,10))
    indptr = list(range(batch+1))
    y0 = scipy.sparse.csr_matrix((data, indices, indptr), shape=(batch, classes))

    b = C.parameter((classes, 1))
    W = C.parameter((classes, C.InferredDimension))

    # zb multiplies the bias gradient by a 1x1 identity so it can be evaluated to a dense array
    gb = np.zeros(classes)
    vb = C.input_variable((classes, 1), dtype=dt)
    Ib = C.constant(np.eye(1, dtype=dt))
    zb = C.times(vb, Ib)

    loss = C.nce_loss(W, b, x, y, q, seed=98052)
    for i in range(trials):
        v = loss.grad({x: x0, y: y0}, wrt=loss.parameters, as_numpy=False)
        gb[np.nonzero(zb.eval({vb: v[b]}).ravel())] += 1
    # each class is hit as often as in the baseline, or in every trial if it is a label class
    for i in range(classes):
        assert gb[i] == expected_count[i] or (i in indices and gb[i] == trials)
Exemplo n.º 47
0
def test_op_reduce_sum(input_data, axis, device_id, precision):
    """Check forward values and gradients of ``C.reduce_sum`` against a numpy reference."""
    # Forward pass test
    # ==================
    # We compute the expected output for the forward pass.
    # We need two surrounding brackets:
    # The first for sequences (length=1, since we have dynamic_axis='').
    # The second for batch of one sample.

    # keepdims = True as CNTK keeps them as well
    def numpy_reduce_sum(values, reduce_axis, keepdims=True):
        values = AA(values)
        if reduce_axis == len(values.shape):
            # axis == rank means "reduce over everything" and yields a 1x1 result
            return [np.reshape(np.add.reduce(np.ravel(values)), (1, 1))]
        reduced = np.add.reduce(values, reduce_axis, dtype=PRECISION_TO_TYPE[precision], keepdims=keepdims)
        return [[reduced]]

    expected_result = numpy_reduce_sum(input_data, axis)

    a = I([input_data])

    # reduce using the operator
    result = C.reduce_sum(a, axis)

    unittest_helper(
        result,
        None,
        expected_result,
        device_id=device_id,
        precision=precision,
        clean_up=True,
        backward_pass=False,
    )

    # Backward pass test
    # ==================
    # The gradient of the reduce_sum operator is all ones in the shape of the input
    ones_like_input = AA(np.ones_like(input_data, dtype=PRECISION_TO_TYPE[precision]))
    expected_gradient = [[ones_like_input]]

    unittest_helper(
        result,
        None,
        expected_gradient,
        device_id=device_id,
        precision=precision,
        clean_up=True,
        backward_pass=True,
        input_node=a,
    )
Exemplo n.º 48
0
def create_binary_convolution_model():
    """
    Build the binary-convolution classifier together with its loss and error metric.

    Returns:
        A combined Function exposing ``[z, ce, pe]``: the scaled network output, the
        cross-entropy loss (with the binary regularisation term added) and the
        classification error.
    """
    # Input variables denoting the features and label data
    feature_var = C.input((num_channels, image_height, image_width))
    label_var = C.input((num_classes))

    # apply model to input
    scaled_input = C.element_times(C.constant(0.00390625), feature_var)

    # first layer is ok to be full precision
    z = C.layers.Convolution((3, 3), 32, pad=True, activation=C.relu)(scaled_input)
    z = C.layers.MaxPooling((3,3), strides=(2,2))(z)

    # two identical binary stages, differing only in their input channel count
    for in_channels in (32, 128):
        z = C.layers.BatchNormalization(map_rank=1)(z)
        z = BinaryConvolution(z, (3,3), 128, channels=in_channels, pad=True)
        z = C.layers.MaxPooling((3,3), strides=(2,2))(z)

    # final 1x1 binary convolution down to the class count, averaged over the spatial axes
    z = C.layers.BatchNormalization(map_rank=1)(z)
    z = BinaryConvolution(z, (1,1), num_classes, channels=128, pad=True)
    z = C.layers.AveragePooling((z.shape[1], z.shape[2]))(z)
    z = C.reshape(z, (num_classes,))

    # Add binary regularization (ala Gang Hua): sum of (1 - w^2) over every "filter" parameter
    weight_sum = C.constant(0)
    for param in z.parameters:
        if (param.name == "filter"):
            penalty = C.reduce_sum(C.minus(1, C.square(param)))
            weight_sum = C.plus(weight_sum, penalty)
    bin_reg = C.element_times(.000005, weight_sum)

    # After the last layer, we need to apply a learnable scale
    scale = C.parameter(shape=z.shape, init=0.001)
    z = C.element_times(z, scale)

    # loss and metric
    ce = C.plus(C.cross_entropy_with_softmax(z, label_var), bin_reg)
    pe = C.classification_error(z, label_var)

    return C.combine([z, ce, pe])
Exemplo n.º 49
0
def test_restore_constants(tmpdir):
    """Check that saving/restoring a model or a checkpoint brings back constants and parameters."""
    C.device.try_set_default_device(C.device.cpu())

    def _setvalue(var, v):
        # non-scalar variables keep their shape; scalars get a fresh float32 array
        if len(var.shape) > 0:
            var.value = 0 * var.value + v
        else:
            var.value = np.array(v, dtype=np.float32)

    def _setall(func, v):
        for var in func.constants + func.parameters:
            _setvalue(var, v)

    def _checkall(func, v):
        for var in func.constants + func.parameters:
            assert (var.value == v).all()

    x = C.input_variable(10)
    f = C.layers.BatchNormalization()(x)
    learner = C.sgd(f.parameters, C.learning_rate_schedule(0.1, 'sample'))
    trainer = C.Trainer(f, C.reduce_sum(f), learner)

    model_filename = str(tmpdir / 'function.out')
    checkpoint_filename = str(tmpdir / 'checkpoint.out')

    # model file captures the value 1
    _setall(f, 1)
    f.save(model_filename)
    _checkall(f, 1)

    # checkpoint captures the value 2
    _setall(f, 2)
    trainer.save_checkpoint(checkpoint_filename)
    _checkall(f, 2)

    # restoring the checkpoint overwrites the 3s with the 2s
    _setall(f, 3)
    _checkall(f, 3)
    trainer.restore_from_checkpoint(checkpoint_filename)
    _checkall(f, 2)

    # loading the model file yields a function holding the 1s
    f2 = C.Function.load(model_filename)
    _checkall(f2, 1)

    # restoring in place overwrites the 4s with the 1s
    _setall(f, 4)
    _checkall(f, 4)
    f.restore(model_filename)
    _checkall(f, 1)

    _setall(f2, 5)
    _checkall(f2, 5)
Exemplo n.º 50
0
def run_distributed_training(tmpdir, create_func):
    """
    Train one minibatch with a distributed learner and round-trip a checkpoint.

    Args:
        tmpdir: directory (path-like supporting ``/``) where the checkpoint is written.
        create_func: callable wrapping a learner into a distributed learner.
    """
    in1 = sequence.input_variable(shape=1)
    labels = sequence.input_variable(shape=1)
    bias = parameter(shape=2, init=10)
    z = plus(in1, reduce_sum(bias), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    momentum_schedule = C.momentum_as_time_constant_schedule(1100)
    lr_schedule = C.learning_rate_schedule(0.007, C.UnitType.sample)
    local_learner = C.momentum_sgd(z.parameters, lr_schedule, momentum_schedule, True)
    dist_learner = create_func(local_learner)

    # the current worker must be listed among the communicator's workers
    communicator = dist_learner.communicator()
    workers = communicator.workers()
    current_worker = communicator.current_worker()
    assert any(current_worker.global_rank == wk.global_rank for wk in workers)

    trainer = C.Trainer(z, (ce, errs), [ dist_learner ])
    arguments = {in1: [[1],[2]], labels: [[0], [1]]}
    _updated, _var_map = trainer.train_minibatch(arguments, outputs=[z.output])

    checkpoint_file = str(tmpdir / 'checkpoint.dat')
    trainer.save_checkpoint(checkpoint_file)
    trainer.restore_from_checkpoint(checkpoint_file)

    communicator.barrier()

    assert trainer.model.name == 'z'

    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
Exemplo n.º 51
0
def train(data_path, model_path, log_file, config_file, restore=False, profiling=False, gen_heartbeat=False):
    """
    Train the PolyMath model described by ``config_file``.

    Args:
        data_path: directory containing the training and validation data files.
        model_path: directory in which the checkpoint (``model_name``) is stored.
        log_file: file handed to the progress printer.
        config_file: importable module name providing ``training_config``; also passed to PolyMath.
        restore: when true and a checkpoint exists, restore it and re-evaluate before training.
        profiling: when true, run the CNTK profiler around training.
        gen_heartbeat: forwarded to the progress printer.
    """
    polymath = PolyMath(config_file)
    z, loss = polymath.model()
    training_config = importlib.import_module(config_file).training_config

    max_epochs = training_config['max_epochs']
    log_freq = training_config['log_freq']

    # NOTE(review): progress_writers is built but never passed to the Trainer below
    # (only tensorboard_writer is) -- confirm whether that is intended.
    progress_writers = [C.logging.ProgressPrinter(
                            num_epochs = max_epochs,
                            freq = log_freq,
                            tag = 'Training',
                            log_to_file = log_file,
                            rank = C.Communicator.rank(),
                            gen_heartbeat = gen_heartbeat)]

    lr = C.learning_parameter_schedule(training_config['lr'], minibatch_size=None, epoch_size=None)

    # Exponential moving average of every parameter (decay 0.999), kept in constants keyed
    # by the parameter uid. Evaluating `dummy` performs one EMA update for all parameters.
    ema = {}
    dummies = []
    for p in z.parameters:
        ema_p = C.constant(0, shape=p.shape, dtype=p.dtype, name='ema_%s' % p.uid)
        ema[p.uid] = ema_p
        dummies.append(C.reduce_sum(C.assign(ema_p, 0.999 * ema_p + 0.001 * p)))
    dummy = C.combine(dummies)

    learner = C.adadelta(z.parameters, lr)

    # wrap the learner for data-parallel training when running with several workers
    if C.Communicator.num_workers() > 1:
        learner = C.data_parallel_distributed_learner(learner)

    tensorboard_writer = TensorBoardProgressWriter(freq=10, log_dir='log', model=z)
    trainer = C.Trainer(z, (loss, None), learner, tensorboard_writer)

    if profiling:
        C.debugging.start_profiler(sync_gpu=True)

    train_data_file = os.path.join(data_path, training_config['train_data'])
    train_data_ext = os.path.splitext(train_data_file)[-1].lower()

    # model_name is not defined in this function; presumably a module-level name -- verify.
    model_file = os.path.join(model_path, model_name)
    model = C.combine(list(z.outputs) + [loss.output])
    # NOTE(review): label_ab is not used anywhere below in this function.
    label_ab = argument_by_name(loss, 'ab')

    # mutable bookkeeping shared with post_epoch_work
    epoch_stat = {
        'best_val_err' : 100,
        'best_since'   : 0,
        'val_since'    : 0}

    if restore and os.path.isfile(model_file):
        trainer.restore_from_checkpoint(model_file)
        #after restore always re-evaluate
        epoch_stat['best_val_err'] = validate_model(os.path.join(data_path, training_config['val_data']), model, polymath)

    def post_epoch_work(epoch_stat):
        """Summarize the epoch, validate every `val_interval` epochs; return False to stop training."""
        trainer.summarize_training_progress()
        epoch_stat['val_since'] += 1

        if epoch_stat['val_since'] == training_config['val_interval']:
            epoch_stat['val_since'] = 0
            # validate with the EMA weights: stash the live values, then overwrite them
            temp = dict((p.uid, p.value) for p in z.parameters)
            for p in trainer.model.parameters:
                p.value = ema[p.uid].value
            val_err = validate_model(os.path.join(data_path, training_config['val_data']), model, polymath)
            if epoch_stat['best_val_err'] > val_err:
                epoch_stat['best_val_err'] = val_err
                epoch_stat['best_since'] = 0
                # the checkpoint is written while the EMA weights are in place
                trainer.save_checkpoint(model_file)
                # NOTE(review): the stashed live weights are only put back in this branch;
                # when validation does not improve the parameters keep the EMA values -- confirm intended.
                for p in trainer.model.parameters:
                    p.value = temp[p.uid]
            else:
                epoch_stat['best_since'] += 1
                # early stopping after `stop_after` validations without improvement
                if epoch_stat['best_since'] > training_config['stop_after']:
                    return False

        if profiling:
            C.debugging.enable_profiler()

        return True

    if train_data_ext == '.ctf':
        mb_source, input_map = create_mb_and_map(loss, train_data_file, polymath)

        minibatch_size = training_config['minibatch_size'] # number of samples
        epoch_size = training_config['epoch_size']

        for epoch in range(max_epochs):
            num_seq = 0
            while True:
                # once `distributed_after` samples have been seen, read a partitioned
                # minibatch scaled by the number of workers; before that every worker
                # reads a plain minibatch
                if trainer.total_number_of_samples_seen >= training_config['distributed_after']:
                    data = mb_source.next_minibatch(minibatch_size*C.Communicator.num_workers(), input_map=input_map, num_data_partitions=C.Communicator.num_workers(), partition_index=C.Communicator.rank())
                else:
                    data = mb_source.next_minibatch(minibatch_size, input_map=input_map)

                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                # update the EMA constants after every training step
                dummy.eval()
                if num_seq >= epoch_size:
                    break
            if not post_epoch_work(epoch_stat):
                break
    else:
        if train_data_ext != '.tsv':
            raise Exception("Unsupported format")

        minibatch_seqs = training_config['minibatch_seqs'] # number of sequences

        for epoch in range(max_epochs):       # loop over epochs
            tsv_reader = create_tsv_reader(loss, train_data_file, polymath, minibatch_seqs, C.Communicator.num_workers())
            minibatch_count = 0
            for data in tsv_reader:
                # minibatches are dealt round-robin: each worker trains on every num_workers-th one
                if (minibatch_count % C.Communicator.num_workers()) == C.Communicator.rank():
                    trainer.train_minibatch(data) # update model with it
                    dummy.eval()
                minibatch_count += 1
            if not post_epoch_work(epoch_stat):
                break

    if profiling:
        C.debugging.stop_profiler()
Exemplo n.º 52
0
def test_ReduceSum(tmpdir):
    """Build a reduce_sum over axis 0 of a constant 3x2x2 float32 array and pass it to verify_no_input."""
    values = np.array(
        [
            [[5, 1], [20, 2]],
            [[30, 1], [40, 2]],
            [[55, 1], [60, 2]],
        ],
        dtype=np.float32,
    )
    verify_no_input(C.reduce_sum(values, 0), tmpdir, 'ReduceSum_0')
Exemplo n.º 53
0
def validate_model(test_data, model, polymath):
    """
    Evaluate ``model`` on ``test_data`` and print averaged span statistics.

    The first three outputs of ``model`` are used as begin logits, end logits and
    loss. For every minibatch the model is evaluated, the gradient of
    ``symbolic_best_span`` with respect to its two inputs is computed at those
    logits, and that gradient is fed (in place of the begin/end prediction) into a
    small statistics graph together with the begin/end labels.

    Args:
        test_data: data location handed to ``create_mb_and_map``
        model: function whose ``outputs[0]``, ``outputs[1]`` and ``outputs[2]`` are
            the begin logits, end logits and loss
        polymath: passed through unchanged to ``create_mb_and_map``

    Returns:
        The summed loss divided by the number of sequences that were validated.
    """
    begin_logits = model.outputs[0]
    end_logits   = model.outputs[1]
    loss         = model.outputs[2]
    root = C.as_composite(loss.owner)
    # single, non-randomized pass over the data
    mb_source, input_map = create_mb_and_map(root, test_data, polymath, randomize=False, repeat=False)
    begin_label = argument_by_name(root, 'ab')
    end_label   = argument_by_name(root, 'ae')

    # Stand-in inputs for the predictions; they share the sequence axis of the labels
    # and need gradients because best_span_score.grad is taken w.r.t. them below.
    begin_prediction = C.sequence.input_variable(1, sequence_axis=begin_label.dynamic_axes[1], needs_gradient=True)
    end_prediction = C.sequence.input_variable(1, sequence_axis=end_label.dynamic_axes[1], needs_gradient=True)

    best_span_score = symbolic_best_span(begin_prediction, end_prediction)
    # Accumulate (begin - previous step's end) along the sequence with C.plus;
    # presumably this yields a per-position span mask -- TODO confirm.
    predicted_span = C.layers.Recurrence(C.plus)(begin_prediction - C.sequence.past_value(end_prediction))
    true_span = C.layers.Recurrence(C.plus)(begin_label - C.sequence.past_value(end_label))
    common_span = C.element_min(predicted_span, true_span)
    begin_match = C.sequence.reduce_sum(C.element_min(begin_prediction, begin_label))
    end_match = C.sequence.reduce_sum(C.element_min(end_prediction, end_label))

    predicted_len = C.sequence.reduce_sum(predicted_span)
    true_len = C.sequence.reduce_sum(true_span)
    common_len = C.sequence.reduce_sum(common_span)
    f1 = 2*common_len/(predicted_len+true_len)
    exact_match = C.element_min(begin_match, end_match)
    precision = common_len/predicted_len
    recall = common_len/true_len
    overlap = C.greater(common_len, 0)

    def s(x):
        # sum over all axes so every statistic becomes one number per minibatch
        return C.reduce_sum(x, axis=C.Axis.all_axes())

    # order here must match the stat_avg[...] indices used in the print below
    stats = C.splice(s(f1), s(exact_match), s(precision), s(recall), s(overlap), s(begin_match), s(end_match))

    # Evaluation parameters
    minibatch_size = 20000
    num_sequences = 0

    stat_sum = 0
    loss_sum = 0

    while True:
        data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
        # stop once the source returns nothing usable
        if not data or not (begin_label in data) or data[begin_label].num_sequences == 0:
            break
        out = model.eval(data, outputs=[begin_logits,end_logits,loss], as_numpy=False)
        testloss = out[loss]
        # gradient of the best-span score at the logits, w.r.t. both prediction inputs
        g = best_span_score.grad({begin_prediction:out[begin_logits], end_prediction:out[end_logits]}, wrt=[begin_prediction,end_prediction], as_numpy=False)
        other_input_map = {begin_prediction: g[begin_prediction], end_prediction: g[end_prediction], begin_label: data[begin_label], end_label: data[end_label]}
        stat_sum += stats.eval((other_input_map))
        loss_sum += np.sum(testloss.asarray())
        num_sequences += data[begin_label].num_sequences

    stat_avg = stat_sum / num_sequences
    loss_avg = loss_sum / num_sequences

    # every float field uses precision 4 ("{:.4f}"); "{:4f}" would only set a minimum width
    print("Validated {} sequences, loss {:.4f}, F1 {:.4f}, EM {:.4f}, precision {:.4f}, recall {:.4f} hasOverlap {:.4f}, start_match {:.4f}, end_match {:.4f}".format(
            num_sequences,
            loss_avg,
            stat_avg[0],
            stat_avg[1],
            stat_avg[2],
            stat_avg[3],
            stat_avg[4],
            stat_avg[5],
            stat_avg[6]))

    return loss_avg
Exemplo n.º 54
0
def test_sweep_based_schedule(tmpdir, device_id):
    """
    Check the learning rate reported by an sgd learner after successive minibatches.

    A two-sequence CTF file is written into ``tmpdir`` and read without
    randomization. After each training step the learner's learning rate is
    compared with the next entry of the schedule [0.3, 0.2, 0.1, 0.0].
    ``device_id`` is not used in the body.
    """
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk import cross_entropy_with_softmax, classification_error, plus, reduce_sum, sequence, Trainer

    input_dim = 69

    ctf_text = '''\
0   |S0 3:1   |S1 3:1 |# <s>
0   |S0 4:1 |# A    |S1 32:1 |# ~AH
0   |S0 5:1 |# B    |S1 36:1 |# ~B
0   |S0 4:1 |# A    |S1 31:1 |# ~AE
0   |S0 7:1 |# D    |S1 38:1 |# ~D
0   |S0 12:1 |# I   |S1 47:1 |# ~IY
0   |S0 1:1 |# </s> |S1 1:1 |# </s>
2   |S0 60:1 |# <s> |S1 3:1 |# <s>
2   |S0 61:1 |# A   |S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as handle:
        handle.write(ctf_text)

    stream_defs = StreamDefs(
        features  = StreamDef(field='S0', shape=input_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=input_dim,  is_sparse=True)
    )
    mbs = MinibatchSource(CTFDeserializer(ctf_file, stream_defs), randomize=False)

    features_var = sequence.input_variable(shape=(input_dim,))
    labels_var = sequence.input_variable(shape=(input_dim,))
    weights = parameter(shape=(input_dim,), init=10)
    z = plus(features_var, reduce_sum(weights), name='z')
    loss = cross_entropy_with_softmax(z, labels_var)
    metric = classification_error(z, labels_var)

    lr_per_sample = learning_rate_schedule([0.3, 0.2, 0.1, 0.0], UnitType.sample)
    learner = sgd(z.parameters, lr_per_sample)
    trainer = Trainer(z, (loss, metric), [learner])

    input_map = {
        features_var: mbs.streams.features,
        labels_var: mbs.streams.labels,
    }

    # (requested minibatch size, learning rate expected after training on it):
    #   1 -> first sequence
    #   1 -> second sequence, sweep ends at this point
    #   9 -> both sequences, entire sweep in one go
    steps = ((1, 0.3), (1, 0.2), (9, 0.1))
    for requested, expected_lr in steps:
        batch = mbs.next_minibatch(requested, input_map=input_map)
        trainer.train_minibatch(batch)
        assert learner.learning_rate() == expected_lr

    # final fetch spans multiple sweeps; this call additionally requests z's output
    batch = mbs.next_minibatch(30, input_map=input_map)
    trainer.train_minibatch(batch, outputs=[z.output])
    assert learner.learning_rate() == 0.0
Exemplo n.º 55
0
def test_ReduceSum(tmpdir, dtype):
    """Same reduce_sum-over-axis-0 check as above, built under C.default_options for the given dtype."""
    rows = [
        [[5, 1], [20, 2]],
        [[30, 1], [40, 2]],
        [[55, 1], [60, 2]],
    ]
    with C.default_options(dtype=dtype):
        values = np.array(rows, dtype=dtype)
        verify_no_input(C.reduce_sum(values, 0), tmpdir, 'ReduceSum_0')
Exemplo n.º 56
0
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True,
               proposal_layer_param_string=None):
    '''
    Builds the region proposal network (RPN) from the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Proposals are produced by applying estimated bounding-box transformations
    to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         (image_width, image_height, image_scale) as CNTK variable or constant
        add_loss_functions: When True the RPN losses are built and returned; otherwise None is returned in their place
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness), or None
    '''

    # One 3x3 conv feeding two sibling 1x1 convs.
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    shared_conv = Convolution((3, 3), 256, activation=relu, pad=True, strides=1,
                              init=normal(scale=0.01), init_bias=0.1)(conv_out)
    # 2 (bg/fg) * 9 (anchors) score channels
    rpn_cls_score = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                init=normal(scale=0.01), init_bias=0.1)(shared_conv)
    # 4 (coords) * 9 (anchors) regression channels
    rpn_bbox_pred = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                init=normal(scale=0.01), init_bias=0.1)(shared_conv)

    # Softmax over the (bg, fg) axis, then restore the original (18, H, W) layout.
    num_predictions = int(np.prod(rpn_cls_score.shape) / 2)
    scores_two_rows = reshape(rpn_cls_score, (2, num_predictions))
    rpn_cls_prob = softmax(scores_two_rows, axis=0, name="objness_softmax")
    prob_grid = reshape(rpn_cls_prob, rpn_cls_score.shape)

    # Proposal layer
    proposals = user_function(ProposalLayer(prob_grid, rpn_bbox_pred, im_info,
                                            param_str=proposal_layer_param_string))
    rpn_rois = alias(proposals, name='rpn_rois')

    if not add_loss_functions:
        return rpn_rois, None

    # RPN targets; rpn_cls_score is only passed to get width and height of the conv feature map
    anchor_targets = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info,
                                                     param_str=proposal_layer_param_string))
    rpn_labels = anchor_targets.outputs[0]
    rpn_bbox_targets = anchor_targets.outputs[1]
    rpn_bbox_inside_weights = anchor_targets.outputs[2]

    # Predictions for the 'ignore label' are excluded from the loss by setting
    # target and prediction to 0, which is why the softmaxed scores are passed in.
    labels_one_row = reshape(rpn_labels, (1, num_predictions))
    masked = user_function(IgnoreLabel(rpn_cls_prob, labels_one_row, ignore_label=-1))
    masked_cls_prob = masked.outputs[0]
    fg_targets = masked.outputs[1]
    bg_targets = 1 - fg_targets
    two_class_targets = splice(bg_targets, fg_targets, axis=0)

    # RPN losses: objectness cross entropy plus smooth-L1 box regression
    cls_loss = cross_entropy_with_softmax(masked_cls_prob, two_class_targets, axis=0)
    bbox_loss = user_function(SmoothL1Loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights))
    rpn_losses = plus(reduce_sum(cls_loss), reduce_sum(bbox_loss), name="rpn_losses")

    return rpn_rois, rpn_losses
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True,
               proposal_layer_param_string=None, conv_bias_init=0.0):
    '''
    Builds the region proposal network (RPN) from the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Proposals are produced by applying estimated bounding-box transformations
    to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         A CNTK variable or constant containing
                         (pad_width, pad_height, scaled_image_width, scaled_image_height, orig_img_width, orig_img_height)
                         e.g. (1000, 1000, 1000, 600, 500, 300) for an original image of 600x300 that is scaled and padded to 1000x1000
        add_loss_functions: When True the RPN losses are built and returned; otherwise None is returned in their place
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.
        conv_bias_init: Initial bias value used for all three RPN convolutions.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness), or None
    '''

    # One 3x3 conv (channel count from the config) feeding two sibling 1x1 convs.
    num_channels = cfg["CNTK"].RPN_NUM_CHANNELS
    shared_conv = Convolution((3, 3), num_channels, activation=relu, pad=True, strides=1,
                              init=normal(scale=0.01), init_bias=conv_bias_init)(conv_out)
    # 2 (bg/fg) * 9 (anchors) score channels
    rpn_cls_score = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                init=normal(scale=0.01), init_bias=conv_bias_init)(shared_conv)
    # 4 (coords) * 9 (anchors) regression channels
    rpn_bbox_pred = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                init=normal(scale=0.01), init_bias=conv_bias_init)(shared_conv)

    # Softmax over the (bg, fg) axis, wrapped as a block, then restore the (18, H, W) layout.
    num_predictions = int(rpn_cls_score.shape[0] / 2)
    split_shape = (2, num_predictions, rpn_cls_score.shape[1], rpn_cls_score.shape[2])
    rpn_cls_score_rshp = reshape(rpn_cls_score, split_shape, name="rpn_cls_score_rshp")
    p_softmax_in = cntk.placeholder()
    softmax_body = softmax(p_softmax_in, axis=0)
    rpn_cls_prob = cntk.as_block(softmax_body, [(p_softmax_in, rpn_cls_score_rshp)], 'Softmax', 'rpn_cls_prob')
    prob_grid = reshape(rpn_cls_prob, rpn_cls_score.shape, name="rpn_cls_prob_reshape")

    # Proposal layer
    proposals = user_function(ProposalLayer(prob_grid, rpn_bbox_pred, im_info,
                                            param_str=proposal_layer_param_string))
    rpn_rois = alias(proposals, name='rpn_rois')

    if not add_loss_functions:
        return rpn_rois, None

    # RPN targets; rpn_cls_score is only passed to get width and height of the conv feature map
    anchor_targets = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info,
                                                     param_str=proposal_layer_param_string))
    rpn_labels = anchor_targets.outputs[0]
    rpn_bbox_targets = anchor_targets.outputs[1]
    rpn_bbox_inside_weights = anchor_targets.outputs[2]

    # --- classification loss, built on placeholders and wrapped as a block ---
    p_labels = cntk.placeholder()
    p_scores = cntk.placeholder()

    # mask of the entries whose label is >= 0; the others are left out of the loss
    keeps = cntk.greater_equal(p_labels, 0.0)
    fg_labels = element_times(p_labels, keeps, name="fg_targets")
    bg_labels = minus(1, fg_labels, name="bg_targets")
    two_class_targets = splice(bg_labels, fg_labels, axis=0)
    cross_entropy = cross_entropy_with_softmax(p_scores, two_class_targets, axis=0)
    masked_cls_loss = element_times(cross_entropy, keeps)

    # normalize by the number of terms that actually contribute (label >= 0)
    cls_num_terms = reduce_sum(keeps)
    cls_scale = 1.0 / cls_num_terms
    normalized_cls_loss = reduce_sum(masked_cls_loss) * cls_scale

    cls_loss_block = cntk.as_block(
        normalized_cls_loss,
        [(p_labels, rpn_labels), (p_scores, rpn_cls_score_rshp)],
        'CE_with_ignore', 'norm_rpn_cls_loss')

    # --- regression loss, built on placeholders and wrapped as a block ---
    p_bbox_pred = cntk.placeholder()
    p_bbox_targets = cntk.placeholder()
    p_bbox_inside_weights = cntk.placeholder()
    bbox_loss = SmoothL1Loss(cfg["CNTK"].SIGMA_RPN_L1, p_bbox_pred, p_bbox_targets, p_bbox_inside_weights, 1.0)
    # the bbox loss is normalized by the rpn batch size
    bbox_scale = 1.0 / cfg["TRAIN"].RPN_BATCHSIZE
    normalized_bbox_loss = reduce_sum(bbox_loss) * bbox_scale

    bbox_loss_block = cntk.as_block(
        normalized_bbox_loss,
        [(p_bbox_pred, rpn_bbox_pred), (p_bbox_targets, rpn_bbox_targets),
         (p_bbox_inside_weights, rpn_bbox_inside_weights)],
        'SmoothL1Loss', 'norm_rpn_bbox_loss')

    rpn_losses = plus(cls_loss_block, bbox_loss_block, name="rpn_losses")

    return rpn_rois, rpn_losses