Example #1
def l2_normalize(vector):
    # clipped L2 norm: max(sqrt(sum(v_i^2)), machine epsilon) to avoid division by zero
    square_sum = dy.sqrt(
        dy.bmax(
            dy.sum_elems(dy.square(vector)),
            np.finfo(float).eps * dy.ones((1))[0],
        ))
    return dy.cdiv(vector, square_sum)
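A minimal usage sketch for the helper above, assuming DyNet is installed and imported as dy, NumPy as np, and that l2_normalize from Example #1 is in scope:

import dynet as dy
import numpy as np

dy.renew_cg()
v = dy.inputVector([3.0, 4.0])
normed = l2_normalize(v)                    # divide by max(||v||, machine epsilon)
print(normed.npvalue())                     # expected: approximately [0.6, 0.8]
print(np.linalg.norm(normed.npvalue()))     # expected: approximately 1.0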
Example #2
    def __call__(self, s1, s2):
        b_nli = dy.parameter(self.b_nli)
        W_nli_1 = dy.parameter(self.W_nli_1)
        W_nli_2 = dy.parameter(self.W_nli_2)
        W_nli_u = dy.parameter(self.W_nli_u)
        W_nli_v = dy.parameter(self.W_nli_v)
        # matching features between the two input encodings:
        # squared difference and elementwise product
        u = dy.square(s1 - s2)
        v = dy.cmult(s1, s2)
        relu = dy.rectify(dy.affine_transform([b_nli, W_nli_1, s1, W_nli_2, s2, W_nli_u, u, W_nli_v, v]))

        # linear scoring layer on top of the hidden representation
        b_s = dy.parameter(self.b_s)
        w_s = dy.parameter(self.w_s)
        return dy.affine_transform([b_s, w_s, relu])
Example #3
def calc_reinforce_loss(words, tags, delta):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    word_reps = LSTM.transduce([LOOKUP[x] for x in words])

    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)

    #calculate the probability distribution
    scores = [dy.affine_transform([b, W, x]) for x in word_reps]
    losses = [
        dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)
    ]
    probs = [dy.softmax(score).npvalue() for score in scores]

    # then draw a sample tag for each word from its probability distribution
    samples = [np.random.choice(len(p), p=p / p.sum()) for p in probs]

    #calculate accuracy=reward
    correct = [sample == tag for sample, tag in zip(samples, tags)]
    r_i = float(sum(correct)) / len(correct)
    r = dy.constant((1), r_i)
    # Reward baseline for each word
    W_bl = dy.parameter(W_bl_p)
    b_bl = dy.parameter(b_bl_p)
    r_b = [
        dy.affine_transform([b_bl, W_bl, dy.nobackprop(x)]) for x in word_reps
    ]

    # we need to detach the value in order to break the computation graph,
    # as the reward portion is trained separately and not backpropagated through in the overall score
    rewards_over_baseline = [(r - dy.nobackprop(x)) for x in r_b]
    #the scores for training the baseline
    baseline_scores = [dy.square(r - x) for x in r_b]

    # then calculate the REINFORCE scores
    reinforce_scores = [
        r_s * score for r_s, score in zip(rewards_over_baseline, scores)
    ]

    # for MIXER we want the first len(sent)-delta scores from xent,
    # then the last delta scores from REINFORCE
    if len(scores) > delta:
        mixer_scores = scores[:len(scores) - delta] + reinforce_scores[len(scores) - delta:]
    else:
        mixer_scores = reinforce_scores
    return dy.esum(mixer_scores), dy.esum(baseline_scores)
Example #4
    def loss(self, features, y):

        b1 = dy.parameter(self.b1)
        W1 = dy.parameter(self.W1)
        b2 = dy.parameter(self.b2)
        W2 = dy.parameter(self.W2)

        x = dy.inputVector(features)

        prediction = dy.tanh(
            dy.affine_transform(
                [b2, W2, dy.tanh(dy.affine_transform([b1, W1, x]))]))

        loss = dy.square(prediction - y)

        return prediction, loss
Example #5
def calc_reinforce_loss(words, tags, delta):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    word_reps = LSTM.transduce([LOOKUP[x] for x in words])

    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)

    #calculate the probability distribution 
    scores = [dy.affine_transform([b, W, x]) for x in word_reps]
    losses = [dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)]
    probs = [dy.softmax(score).npvalue() for score in scores]

    # then draw a sample tag for each word from its probability distribution
    samples = [np.random.choice(len(p), p=p / p.sum()) for p in probs]

    #calculate accuracy=reward
    correct = [sample == tag for sample, tag in zip(samples, tags)]
    r_i = float(sum(correct))/len(correct)
    r = dy.constant((1), r_i)
    # Reward baseline for each word
    W_bl = dy.parameter(W_bl_p)
    b_bl = dy.parameter(b_bl_p)
    r_b = [dy.affine_transform([b_bl, W_bl, dy.nobackprop(x)]) for x in word_reps]

    # we need to detach the value in order to break the computation graph,
    # as the reward portion is trained separately and not backpropagated through in the overall score
    rewards_over_baseline = [(r - dy.nobackprop(x)) for x in r_b]
    #the scores for training the baseline
    baseline_scores = [dy.square(r - x) for x in r_b]

    # then calculate the REINFORCE scores
    reinforce_scores = [r_s*score for r_s, score in zip(rewards_over_baseline, scores)]

    # for MIXER we want the first len(sent)-delta scores from xent,
    # then the last delta scores from REINFORCE
    if len(scores) > delta:
        mixer_scores = scores[:len(scores)-delta] + reinforce_scores[len(scores)-delta:]
    else:
        mixer_scores = reinforce_scores
    return dy.esum(mixer_scores), dy.esum(baseline_scores)
Example #6
    def _perform_calc_loss(self,
                           model: 'model_base.ConditionedModel',
                           src: Union[sent.Sentence, 'batchers.Batch'],
                           trg: Union[sent.Sentence, 'batchers.Batch']) -> losses.FactoredLossExpr:
        assert hasattr(model, "attender") and hasattr(model.attender, "attention_vecs"), \
               "Must be called after MLELoss with models that have attender."

        masked_attn = model.attender.attention_vecs
        if trg.mask is not None:
            trg_mask = 1 - (trg.mask.np_arr.transpose())
            masked_attn = [
                dy.cmult(attn, dy.inputTensor(mask, batched=True))
                for attn, mask in zip(masked_attn, trg_mask)
            ]
        loss = dy.sum_elems(dy.square(1 - dy.esum(masked_attn)))
        units = [t.len_unpadded() for t in trg]
        return losses.FactoredLossExpr(
            {"global_fertility": losses.LossExpr(loss, units)})
Example #7
    def global_fertility(self, a):
        return dy.sum_elems(dy.square(1 - dy.esum(a)))
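This penalty sums, over source positions, the squared deviation of the total attention mass each position receives from 1. A tiny sketch of the same expression on toy inputs, assuming DyNet is imported as dy:

import dynet as dy

dy.renew_cg()
# attention over 3 source words at 2 target steps; each source position receives total weight 1.0
a = [dy.inputVector([0.7, 0.2, 0.1]), dy.inputVector([0.3, 0.8, 0.9])]
penalty = dy.sum_elems(dy.square(1 - dy.esum(a)))
print(penalty.value())  # 0.0, because every source position's attention sums to exactly one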
Example #8
def mse_loss(predictions, target):
    diff = predictions - target
    square = dy.square(diff)
    mean = dy.mean_elems(square)

    return mean
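A minimal usage sketch, assuming DyNet is imported as dy and the mse_loss function above is in scope:

import dynet as dy

dy.renew_cg()
predictions = dy.inputVector([0.5, 1.5, 2.0])
target = dy.inputVector([1.0, 1.0, 1.0])
loss = mse_loss(predictions, target)
print(loss.value())  # (0.25 + 0.25 + 1.0) / 3 = 0.5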
Example #9
    def global_fertility(self, a: Sequence[dy.Expression]) -> dy.Expression:
        return dy.sum_elems(dy.square(1 - dy.esum(a)))
Example #10
    def predict(self, features, task_name, train=False):
        """
        Steps through the computation graph and obtains predictions for the
        provided input features.
        :param features: a list of word  embeddings for every word in the sequence
        :param task_name: the name of the task that should be predicted
        :param train: if the model is training; apply noise in this case
        :return output: the output predictions
                penalty: the summed subspace penalty (0 if no constraint)
        """
        if train:  # noise is added only at training time

            features = [dynet.noise(fe, self.noise_sigma) for fe in features]

        # only if we use cross-stitch we have a layer for each task;
        # otherwise we just have one layer for all tasks
        num_layers = self.h_layers
        num_task_layers = len(self.predictors['inner'][0])
        inputs = [features] * len(self.task_names)
        inputs_rev = [features] * len(self.task_names)

        target_task_id = self.task_names.index(
            task_name) if self.cross_stitch else 0

        # collect the forward and backward sequences for each task at every
        # layer for the layer connection units
        layer_forward_sequences = []
        layer_backward_sequences = []

        penalty = dynet.const_parameter(self.subspace_penalty)

        for i in range(0, num_layers):
            forward_sequences = []
            backward_sequences = []
            for j in range(num_task_layers):
                predictor = self.predictors['inner'][i][j]
                forward_sequence, backward_sequence = predictor.predict_sequence(
                    inputs[j], inputs_rev[j])
                if i > 0 and self.activation:
                    # activation between LSTM layers
                    forward_sequence = [
                        self.activation(s) for s in forward_sequence
                    ]
                    backward_sequence = [
                        self.activation(s) for s in backward_sequence
                    ]
                forward_sequences.append(forward_sequence)
                backward_sequences.append(backward_sequence)

                if self.num_subspaces == 2 and self.constraint_weight != 0:
                    # returns a list per layer, i.e. here a list with one item
                    lstm_parameters = \
                        predictor.builder.get_parameter_expressions()[0]

                    # lstm parameters consists of these weights:
                    # Wix,Wih,Wic,bi,Wox,Woh,Woc,bo,Wcx,Wch,bc
                    for param_idx in range(len(lstm_parameters)):
                        if param_idx in self.constrain_matrices:
                            W = lstm_parameters[param_idx]
                            W_shape = np.array(W.value()).shape

                            if (len(W_shape) < 2):
                                W_shape = [W_shape[0], 1]

                            # split matrix into its two subspaces
                            W_subspaces = dynet.reshape(
                                W, (self.num_subspaces,
                                    W_shape[0] // self.num_subspaces,
                                    W_shape[1]))
                            subspace_1, subspace_2 = W_subspaces[
                                0], W_subspaces[1]

                            # calculate the matrix product of the two matrices
                            matrix_product = dynet.transpose(
                                subspace_1) * subspace_2

                            # take the squared Frobenius norm by squaring
                            # every element and then summing them
                            squared_frobenius_norm = dynet.sum_elems(
                                dynet.square(matrix_product))
                            penalty += squared_frobenius_norm

            if self.cross_stitch:
                # takes as input a list of input lists and produces a list of
                # outputs where the index indicates the task
                forward_sequences = self.predictors['cross_stitch'][i].stitch(
                    forward_sequences)
                backward_sequences = self.predictors['cross_stitch'][i].stitch(
                    backward_sequences)

            inputs = forward_sequences
            inputs_rev = backward_sequences
            layer_forward_sequences.append(forward_sequences)
            layer_backward_sequences.append(backward_sequences)

            if i == num_layers - 1:
                output_predictor = \
                    self.predictors['output_layers_dict'][task_name]

                # get the forward/backward states of all task layers
                task_forward_sequences = [
                    layer_seq_list[target_task_id][-1]
                    for layer_seq_list in layer_forward_sequences
                ]

                task_backward_sequences = [
                    layer_seq_list[target_task_id][0]
                    for layer_seq_list in layer_backward_sequences
                ]

                if (num_layers > 1):
                    forward_input = \
                        self.predictors['layer_stitch'][
                            target_task_id].stitch(task_forward_sequences)
                    backward_input = \
                        self.predictors['layer_stitch'][
                            target_task_id].stitch(task_backward_sequences)

                else:
                    forward_input = task_forward_sequences[0]
                    backward_input = task_backward_sequences[0]

                concat_layer = dynet.concatenate(
                    [forward_input, backward_input])

                if train and self.noise_sigma > 0.0:
                    concat_layer = dynet.noise(concat_layer, self.noise_sigma)

                output = []

                if 'sentiment' in task_name:  # multi-label: one output predictor per label
                    for k in range(len(output_predictor)):
                        output.append(output_predictor[k](concat_layer))
                else:
                    output.append(output_predictor(concat_layer))

                #output = output_predictor.predict_sequence(concat_layer)

                return output, penalty
        raise Exception('Error: This place should not be reached.')
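The dynet.square call above implements a squared Frobenius norm that penalizes overlap between two parameter subspaces. A standalone sketch of just that penalty, using a hypothetical 4x3 toy matrix in place of an LSTM weight (the matrix and its size are assumptions, not taken from the example):

import dynet
import numpy as np

dynet.renew_cg()
num_subspaces = 2
W = dynet.inputTensor(np.arange(12, dtype=float).reshape(4, 3))  # toy weight matrix
W_subspaces = dynet.reshape(W, (num_subspaces, 4 // num_subspaces, 3))
subspace_1, subspace_2 = W_subspaces[0], W_subspaces[1]
matrix_product = dynet.transpose(subspace_1) * subspace_2        # cross-correlation of the two blocks
penalty = dynet.sum_elems(dynet.square(matrix_product))          # squared Frobenius norm
print(penalty.value())  # nonzero unless the two subspaces are orthogonal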
Example #11
    def predict(self, features, task_name, train=False):
        """
        Steps through the computation graph and obtains predictions for the
        provided input features.
        :param features: a list of concatenated word and character-based
                         embeddings for every word in the sequence
        :param task_name: the name of the task that should be predicted
        :param train: if the model is training; apply noise in this case
        :return output: the output predictions
                penalty: the summed subspace penalty (0 if no constraint)
        """
        if train:  # only do at training time
            features = [dynet.noise(fe, self.noise_sigma) for fe in features]

        output_expected_at_layer = self.predictors['task_expected_at'][
            task_name]
        output_expected_at_layer -= 1  # remove 1 as layers are 0-indexed

        # only if we use cross-stitch we have a layer for each task;
        # otherwise we just have one layer for all tasks
        num_layers = self.h_layers
        num_task_layers = len(self.predictors['inner'][0])
        inputs = [features] * num_task_layers
        inputs_rev = [features] * num_task_layers

        # similarly, with cross-stitching, we have multiple output layers
        target_task_id = self.task_names.index(
            task_name) if self.cross_stitch else 0

        # collect the forward and backward sequences for each task at every
        # layer for the layer connection units
        layer_forward_sequences = []
        layer_backward_sequences = []
        penalty = dynet.parameter(self.subspace_penalty, update=False)
        for i in range(0, num_layers):
            forward_sequences = []
            backward_sequences = []
            for j in range(num_task_layers):
                predictor = self.predictors['inner'][i][j]
                forward_sequence, backward_sequence = predictor.predict_sequence(
                    inputs[j], inputs_rev[j])
                if i > 0 and self.activation:
                    # activation between LSTM layers
                    forward_sequence = [
                        self.activation(s) for s in forward_sequence
                    ]
                    backward_sequence = [
                        self.activation(s) for s in backward_sequence
                    ]
                forward_sequences.append(forward_sequence)
                backward_sequences.append(backward_sequence)

                if self.num_subspaces == 2 and self.constraint_weight != 0:
                    # returns a list per layer, i.e. here a list with one item
                    lstm_parameters = \
                        predictor.builder.get_parameter_expressions()[0]

                    # lstm parameters consists of these weights:
                    # Wix,Wih,Wic,bi,Wox,Woh,Woc,bo,Wcx,Wch,bc
                    for param_idx in range(len(lstm_parameters)):
                        if param_idx in self.constrain_matrices:
                            W = lstm_parameters[param_idx]
                            W_shape = np.array(W.value()).shape

                            # split matrix into its two subspaces
                            W_subspaces = dynet.reshape(
                                W, (self.num_subspaces,
                                    W_shape[0] // self.num_subspaces,
                                    W_shape[1]))
                            subspace_1, subspace_2 = W_subspaces[
                                0], W_subspaces[1]

                            # calculate the matrix product of the two matrices
                            matrix_product = dynet.transpose(
                                subspace_1) * subspace_2

                            # take the squared Frobenius norm by squaring
                            # every element and then summing them
                            squared_frobenius_norm = dynet.sum_elems(
                                dynet.square(matrix_product))
                            penalty += squared_frobenius_norm

            if self.cross_stitch:
                # takes as input a list of input lists and produces a list of
                # outputs where the index indicates the task
                forward_sequences = self.predictors['cross_stitch'][i].stitch(
                    forward_sequences)
                backward_sequences = self.predictors['cross_stitch'][i].stitch(
                    backward_sequences)

            inputs = forward_sequences
            inputs_rev = backward_sequences
            layer_forward_sequences.append(forward_sequences)
            layer_backward_sequences.append(backward_sequences)

            if i == output_expected_at_layer:
                output_predictor = \
                    self.predictors['output_layers_dict'][task_name]

                # get the forward/backward states of all task layers
                task_forward_sequences = [
                    layer_seq_list[target_task_id]
                    for layer_seq_list in layer_forward_sequences
                ]
                task_backward_sequences = [
                    layer_seq_list[target_task_id]
                    for layer_seq_list in layer_backward_sequences
                ]

                if self.layer_connect == STITCH:
                    # stitch the forward and backward sequences together
                    forward_inputs = \
                        self.predictors['layer_stitch'][
                            target_task_id].stitch(task_forward_sequences)
                    backward_inputs = \
                        self.predictors['layer_stitch'][
                            target_task_id].stitch(task_backward_sequences)
                elif self.layer_connect == SKIP:
                    # use skip connections
                    forward_inputs = [
                        dynet.esum(list(layer_states))
                        for layer_states in zip(*task_forward_sequences)
                    ]
                    backward_inputs = [
                        dynet.esum(list(layer_states))
                        for layer_states in zip(*task_backward_sequences)
                    ]
                else:
                    # otherwise just use the sequences from the last layer
                    forward_inputs = forward_sequences[target_task_id]
                    backward_inputs = backward_sequences[target_task_id]

                if self.layer_connect == CONCAT:
                    layer_concatenated = []
                    # concatenate forward and backward states of layers
                    for fwd_seqs, bwd_seqs in zip(task_forward_sequences,
                                                  task_backward_sequences):
                        layer_concatenated.append([
                            dynet.concatenate([f, b])
                            for f, b in zip(fwd_seqs, reversed(bwd_seqs))
                        ])
                    # concatenate the states of all the task layers
                    concat_layer = [
                        dynet.concatenate(list(layer_states))
                        for layer_states in zip(*layer_concatenated)
                    ]
                else:
                    concat_layer = [
                        dynet.concatenate([f, b]) for f, b in zip(
                            forward_inputs, reversed(backward_inputs))
                    ]

                if train and self.noise_sigma > 0.0:
                    concat_layer = [
                        dynet.noise(fe, self.noise_sigma)
                        for fe in concat_layer
                    ]

                output = output_predictor.predict_sequence(concat_layer)
                return output, penalty
        raise Exception('Error: This place should not be reached.')
Example #12
def l2_normalize(x):
    square_sum = dynet.sqrt(dynet.bmax(dynet.sum_elems(dynet.square(x)),
                                       np.finfo(float).eps * dynet.ones((1))[0]))
    return dynet.cdiv(x, square_sum)
Example #13
def l2_normalize(x):
    # epsilon clip with the same shape as x, to avoid division by zero
    epsilon = np.finfo(float).eps * dy.ones(x.dim()[0])
    norm = dy.sqrt(dy.sum_elems(dy.square(x)))
    sign = dy.cdiv(x, dy.bmax(dy.abs(x), epsilon))
    return dy.cdiv(dy.cmult(sign, dy.bmax(dy.abs(x), epsilon)), dy.bmax(norm, epsilon[0]))
Example #14
    def __call__(self, x: dy.Expression, att_mask: np.ndarray,
                 batch_mask: np.ndarray, p: numbers.Real):
        """
    x: expression of dimensions (input_dim, time) x batch
    att_mask: numpy array of dimensions (time, time); pre-transposed
    batch_mask: numpy array of dimensions (batch, time)
    p: dropout prob
    """
        sent_len = x.dim()[0][1]
        batch_size = x[0].dim()[1]

        if self.downsample_factor > 1:
            if sent_len % self.downsample_factor != 0:
                raise ValueError(
                    "For 'reshape' downsampling, sequence lengths must be multiples of the downsampling factor. "
                    "Configure batcher accordingly.")
            if batch_mask is not None:
                batch_mask = batch_mask[:, ::self.downsample_factor]
            sent_len_out = sent_len // self.downsample_factor
            sent_len = sent_len_out
            out_mask = x.mask
            if self.downsample_factor > 1 and out_mask is not None:
                out_mask = out_mask.lin_subsampled(
                    reduce_factor=self.downsample_factor)

            x = ExpressionSequence(
                expr_tensor=dy.reshape(x.as_tensor(),
                                       (x.dim()[0][0] * self.downsample_factor,
                                        x.dim()[0][1] // self.downsample_factor),
                                       batch_size=batch_size),
                mask=out_mask)
            residual = SAAMTimeDistributed()(x)
        else:
            residual = SAAMTimeDistributed()(x)
            sent_len_out = sent_len
        if self.model_dim != self.input_dim * self.downsample_factor:
            residual = self.res_shortcut.transform(residual)

        # Concatenate all the words together for doing vectorized affine transform
        if self.kq_pos_encoding_type is None:
            kvq_lin = self.linear_kvq.transform(SAAMTimeDistributed()(x))
            key_up = self.shape_projection(
                dy.pick_range(kvq_lin, 0, self.head_count * self.dim_per_head),
                batch_size)
            value_up = self.shape_projection(
                dy.pick_range(kvq_lin, self.head_count * self.dim_per_head,
                              2 * self.head_count * self.dim_per_head),
                batch_size)
            query_up = self.shape_projection(
                dy.pick_range(kvq_lin, 2 * self.head_count * self.dim_per_head,
                              3 * self.head_count * self.dim_per_head),
                batch_size)
        else:
            assert self.kq_pos_encoding_type == "embedding"
            encoding = self.kq_positional_embedder.embed_sent(
                sent_len).as_tensor()
            kq_lin = self.linear_kq.transform(SAAMTimeDistributed()(
                ExpressionSequence(
                    expr_tensor=dy.concatenate([x.as_tensor(), encoding]))))
            key_up = self.shape_projection(
                dy.pick_range(kq_lin, 0, self.head_count * self.dim_per_head),
                batch_size)
            query_up = self.shape_projection(
                dy.pick_range(kq_lin, self.head_count * self.dim_per_head,
                              2 * self.head_count * self.dim_per_head),
                batch_size)
            v_lin = self.linear_v.transform(SAAMTimeDistributed()(x))
            value_up = self.shape_projection(v_lin, batch_size)

        if self.cross_pos_encoding_type:
            assert self.cross_pos_encoding_type == "embedding"
            emb1 = dy.pick_range(dy.parameter(self.cross_pos_emb_p1), 0,
                                 sent_len)
            emb2 = dy.pick_range(dy.parameter(self.cross_pos_emb_p2), 0,
                                 sent_len)
            key_up = dy.reshape(key_up,
                                (sent_len, self.dim_per_head, self.head_count),
                                batch_size=batch_size)
            key_up = dy.concatenate_cols(
                [dy.cmult(key_up, emb1),
                 dy.cmult(key_up, emb2)])
            key_up = dy.reshape(key_up, (sent_len, self.dim_per_head * 2),
                                batch_size=self.head_count * batch_size)
            query_up = dy.reshape(
                query_up, (sent_len, self.dim_per_head, self.head_count),
                batch_size=batch_size)
            query_up = dy.concatenate_cols(
                [dy.cmult(query_up, emb2),
                 dy.cmult(query_up, -emb1)])
            query_up = dy.reshape(query_up, (sent_len, self.dim_per_head * 2),
                                  batch_size=self.head_count * batch_size)

        scaled = query_up * dy.transpose(
            key_up / math.sqrt(self.dim_per_head)
        )  # scale before the matrix multiplication to save memory

        # Apply Mask here
        if not self.ignore_masks:
            if att_mask is not None:
                att_mask_inp = att_mask * -100.0
                if self.downsample_factor > 1:
                    att_mask_inp = att_mask_inp[::self.downsample_factor,
                                                ::self.downsample_factor]
                scaled += dy.inputTensor(att_mask_inp)
            if batch_mask is not None:
                # reshape (batch, time) -> (time, head_count*batch), then *-100
                inp = np.resize(np.broadcast_to(batch_mask.T[:, np.newaxis, :],
                                                (sent_len, self.head_count, batch_size)),
                                (1, sent_len, self.head_count * batch_size)) \
                      * -100
                mask_expr = dy.inputTensor(inp, batched=True)
                scaled += mask_expr
            if self.diag_gauss_mask:
                diag_growing = np.zeros((sent_len, sent_len, self.head_count))
                for i in range(sent_len):
                    for j in range(sent_len):
                        diag_growing[i, j, :] = -(i - j)**2 / 2.0
                e_diag_gauss_mask = dy.inputTensor(diag_growing)
                e_sigma = dy.parameter(self.diag_gauss_mask_sigma)
                if self.square_mask_std:
                    e_sigma = dy.square(e_sigma)
                e_sigma_sq_inv = dy.cdiv(
                    dy.ones(e_sigma.dim()[0], batch_size=batch_size),
                    dy.square(e_sigma))
                e_diag_gauss_mask_final = dy.cmult(e_diag_gauss_mask,
                                                   e_sigma_sq_inv)
                scaled += dy.reshape(e_diag_gauss_mask_final,
                                     (sent_len, sent_len),
                                     batch_size=batch_size * self.head_count)

        # Computing Softmax here.
        attn = dy.softmax(scaled, d=1)
        if LOG_ATTENTION:
            yaml_logger.info({
                "key": "selfatt_mat_ax0",
                "value": np.average(attn.value(), axis=0).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax1",
                "value": np.average(attn.value(), axis=1).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax0_ent",
                "value": entropy(attn.value()).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax1_ent",
                "value": entropy(attn.value().transpose()).dumps(),
                "desc": self.desc
            })

        self.select_att_head = 0
        if self.select_att_head is not None:
            attn = dy.reshape(attn, (sent_len, sent_len, self.head_count),
                              batch_size=batch_size)
            sel_mask = np.zeros((1, 1, self.head_count))
            sel_mask[0, 0, self.select_att_head] = 1.0
            attn = dy.cmult(attn, dy.inputTensor(sel_mask))
            attn = dy.reshape(attn, (sent_len, sent_len),
                              batch_size=self.head_count * batch_size)

        # Applying dropout to attention
        if p > 0.0:
            drop_attn = dy.dropout(attn, p)
        else:
            drop_attn = attn

        # Computing weighted attention score
        attn_prod = drop_attn * value_up

        # Reshaping the attn_prod to input query dimensions
        out = dy.reshape(attn_prod,
                         (sent_len_out, self.dim_per_head * self.head_count),
                         batch_size=batch_size)
        out = dy.transpose(out)
        out = dy.reshape(out, (self.model_dim, ),
                         batch_size=batch_size * sent_len_out)
        #     out = dy.reshape_transpose_reshape(attn_prod, (sent_len_out, self.dim_per_head * self.head_count), (self.model_dim,), pre_batch_size=batch_size, post_batch_size=batch_size*sent_len_out)

        if self.plot_attention:
            from sklearn.metrics.pairwise import cosine_similarity
            assert batch_size == 1
            mats = []
            for i in range(attn.dim()[1]):
                mats.append(dy.pick_batch_elem(attn, i).npvalue())
                self.plot_att_mat(
                    mats[-1], "{}.sent_{}.head_{}.png".format(
                        self.plot_attention, self.plot_attention_counter, i),
                    300)
            avg_mat = np.average(mats, axis=0)
            self.plot_att_mat(
                avg_mat,
                "{}.sent_{}.head_avg.png".format(self.plot_attention,
                                                 self.plot_attention_counter),
                300)
            cosim_before = cosine_similarity(x.as_tensor().npvalue().T)
            self.plot_att_mat(
                cosim_before, "{}.sent_{}.cosim_before.png".format(
                    self.plot_attention, self.plot_attention_counter), 600)
            cosim_after = cosine_similarity(out.npvalue().T)
            self.plot_att_mat(
                cosim_after, "{}.sent_{}.cosim_after.png".format(
                    self.plot_attention, self.plot_attention_counter), 600)
            self.plot_attention_counter += 1

        # Adding dropout and layer normalization
        if p > 0.0:
            res = dy.dropout(out, p) + residual
        else:
            res = out + residual
        ret = self.layer_norm.transform(res)
        return ret
Example #15
newSeason.append(newSeason[0])
newLevels.append(1 * dy.cdiv(y[0], newSeason[0]))
#perform smoothing
for i in range(1, len(df)):
    newLevels.append(levelSm * dy.cdiv(y[i], newSeason[i]) +
                     (1 - levelSm) * newLevels[i - 1])
    newSeason.append(seasonSm * dy.cdiv(y[i], newLevels[i]) +
                     (1 - seasonSm) * newSeason[i])
s = dy.concatenate(newSeason)
l = dy.concatenate(newLevels)

# penalize sudden level changes (should be scale-independent, but as written it is scale-dependent);
# we should penalize the 2nd derivative
l_log_diff = dy.log(dy.cdiv(l[1:], l[0:l.dim()[0][0] - 1]))
l_penalty = l_log_diff[1:] - l_log_diff[0:l_log_diff.dim()[0][0] - 1]
level_loss = dy.mean_elems(dy.square(l_penalty)) * 10
print(level_loss.value())

preds = []
outputs = []

# take y and remove the seasonality and level
for i in range(n, len(df) - h):
    inputs = y[i - n:i]  # n periods
    curr_season = s[i - n:i]
    inputs = dy.cdiv(inputs, l[i])
    inputs = dy.cdiv(inputs, curr_season)
    inputs = dy.log(inputs)
    reseasonalize = s[i + 1]  # previous period + 1 step
    preds.append(dy.exp(fcstr(inputs)) * l[i] * reseasonalize)
    outputs.append(y[i + 1])  # +1 step ahead target
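For reference, the same multiplicative level/season updates written with plain Python floats; the toy series, initial seasonal factors, and smoothing coefficients below are made-up stand-ins for y, newSeason, levelSm and seasonSm:

levelSm, seasonSm = 0.3, 0.2         # assumed smoothing coefficients
y = [110.0, 121.0, 99.0, 132.0]      # toy series
season = [1.1, 1.0, 0.9]             # assumed initial seasonal factors
season.append(season[0])
levels = [y[0] / season[0]]
for i in range(1, len(y)):
    levels.append(levelSm * y[i] / season[i] + (1 - levelSm) * levels[i - 1])
    season.append(seasonSm * y[i] / levels[i] + (1 - seasonSm) * season[i])
print(levels[-1], season[-1])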
Example #16
    def global_fertility(self, a: Sequence[tt.Tensor]) -> tt.Tensor:
        return dy.sum_elems(dy.square(1 - dy.esum(a)))